git.sesse.net Git - bcachefs-tools-debian/blob - libbcache/super.c

   1 /*
   2  * bcache setup/teardown code, and some metadata io - read a superblock and
   3  * figure out what to do with it.
   4  *
   5  * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
   6  * Copyright 2012 Google, Inc.
   7  */
   8
   9 #include "bcache.h"
  10 #include "blockdev.h"
  11 #include "alloc.h"
  12 #include "btree_cache.h"
  13 #include "btree_gc.h"
  14 #include "btree_update.h"
  15 #include "btree_io.h"
  16 #include "chardev.h"
  17 #include "checksum.h"
  18 #include "clock.h"
  19 #include "compress.h"
  20 #include "debug.h"
  21 #include "error.h"
  22 #include "fs.h"
  23 #include "fs-gc.h"
  24 #include "inode.h"
  25 #include "io.h"
  26 #include "journal.h"
  27 #include "keylist.h"
  28 #include "move.h"
  29 #include "migrate.h"
  30 #include "movinggc.h"
  31 #include "notify.h"
  32 #include "stats.h"
  33 #include "super.h"
  34 #include "tier.h"
  35 #include "writeback.h"
  36
  37 #include <linux/backing-dev.h>
  38 #include <linux/blkdev.h>
  39 #include <linux/debugfs.h>
  40 #include <linux/genhd.h>
  41 #include <linux/idr.h>
  42 #include <linux/kthread.h>
  43 #include <linux/module.h>
  44 #include <linux/percpu.h>
  45 #include <linux/random.h>
  46 #include <linux/reboot.h>
  47 #include <linux/sysfs.h>
  48 #include <crypto/hash.h>
  49
  50 #include <trace/events/bcache.h>
  51
  52 MODULE_LICENSE("GPL");
  53 MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
  54
  55 static const uuid_le invalid_uuid = {
  56         .b = {
  57                 0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
  58                 0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
  59         }
  60 };
  61
  62 static struct kset *bcache_kset;
  63 struct mutex bch_register_lock;
  64 LIST_HEAD(bch_cache_sets);
  65
  66 static int bch_chardev_major;
  67 static struct class *bch_chardev_class;
  68 static struct device *bch_chardev;
  69 static DEFINE_IDR(bch_chardev_minor);
  70 static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait);
  71 struct workqueue_struct *bcache_io_wq;
  72 struct crypto_shash *bch_sha1;
  73
  74 static void bch_cache_stop(struct cache *);
  75 static int bch_cache_online(struct cache *);
  76
  77 static bool bch_is_open_cache(struct block_device *bdev)
  78 {
  79         struct cache_set *c;
  80         struct cache *ca;
  81         unsigned i;
  82
  83         rcu_read_lock();
  84         list_for_each_entry(c, &bch_cache_sets, list)
  85                 for_each_cache_rcu(ca, c, i)
  86                         if (ca->disk_sb.bdev == bdev) {
  87                                 rcu_read_unlock();
  88                                 return true;
  89                         }
  90         rcu_read_unlock();
  91         return false;
  92 }
  93
  94 static bool bch_is_open(struct block_device *bdev)
  95 {
  96         lockdep_assert_held(&bch_register_lock);
  97
  98         return bch_is_open_cache(bdev) || bch_is_open_backing_dev(bdev);
  99 }
 100
 101 static const char *bch_blkdev_open(const char *path, void *holder,
 102                                    struct block_device **ret)
 103 {
 104         struct block_device *bdev;
 105         const char *err;
 106
 107         *ret = NULL;
 108         bdev = blkdev_get_by_path(path, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
 109                                   holder);
 110
 111         if (bdev == ERR_PTR(-EBUSY)) {
 112                 bdev = lookup_bdev(path);
 113                 if (IS_ERR(bdev))
 114                         return "device busy";
 115
 116                 err = bch_is_open(bdev)
 117                         ? "device already registered"
 118                         : "device busy";
 119
 120                 bdput(bdev);
 121                 return err;
 122         }
 123
 124         if (IS_ERR(bdev))
 125                 return "failed to open device";
 126
 127         bdev_get_queue(bdev)->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES;
 128
 129         *ret = bdev;
 130         return NULL;
 131 }
 132
 133 static int bch_congested_fn(void *data, int bdi_bits)
 134 {
 135         struct backing_dev_info *bdi;
 136         struct cache_set *c = data;
 137         struct cache *ca;
 138         unsigned i;
 139         int ret = 0;
 140
 141         rcu_read_lock();
 142         if (bdi_bits & (1 << WB_sync_congested)) {
 143                 /* Reads - check all devices: */
 144                 for_each_cache_rcu(ca, c, i) {
 145                         bdi = blk_get_backing_dev_info(ca->disk_sb.bdev);
 146
 147                         if (bdi_congested(bdi, bdi_bits)) {
 148                                 ret = 1;
 149                                 break;
 150                         }
 151                 }
 152         } else {
 153                 /* Writes only go to tier 0: */
 154                 group_for_each_cache_rcu(ca, &c->cache_tiers[0], i) {
 155                         bdi = blk_get_backing_dev_info(ca->disk_sb.bdev);
 156
 157                         if (bdi_congested(bdi, bdi_bits)) {
 158                                 ret = 1;
 159                                 break;
 160                         }
 161                 }
 162         }
 163         rcu_read_unlock();
 164
 165         return ret;
 166 }
 167
 168 /* Superblock */
 169
 170 static struct cache_member_cpu cache_mi_to_cpu_mi(struct cache_member *mi)
 171 {
 172         return (struct cache_member_cpu) {
 173                 .nbuckets       = le64_to_cpu(mi->nbuckets),
 174                 .first_bucket   = le16_to_cpu(mi->first_bucket),
 175                 .bucket_size    = le16_to_cpu(mi->bucket_size),
 176                 .state          = CACHE_STATE(mi),
 177                 .tier           = CACHE_TIER(mi),
 178                 .replication_set= CACHE_REPLICATION_SET(mi),
 179                 .has_metadata   = CACHE_HAS_METADATA(mi),
 180                 .has_data       = CACHE_HAS_DATA(mi),
 181                 .replacement    = CACHE_REPLACEMENT(mi),
 182                 .discard        = CACHE_DISCARD(mi),
 183                 .valid          = !bch_is_zero(mi->uuid.b, sizeof(uuid_le)),
 184         };
 185 }
 186
 187 static const char *validate_cache_super(struct bcache_superblock *disk_sb)
 188 {
 189         struct cache_sb *sb = disk_sb->sb;
 190         struct cache_member_cpu mi;
 191         u16 block_size;
 192         unsigned i;
 193
 194         switch (le64_to_cpu(sb->version)) {
 195         case BCACHE_SB_VERSION_CDEV_V0:
 196         case BCACHE_SB_VERSION_CDEV_WITH_UUID:
 197         case BCACHE_SB_VERSION_CDEV_V2:
 198         case BCACHE_SB_VERSION_CDEV_V3:
 199                 break;
 200         default:
 201                 return"Unsupported superblock version";
 202         }
 203
 204         if (CACHE_SET_SYNC(sb) &&
 205             le64_to_cpu(sb->version) != BCACHE_SB_VERSION_CDEV_V3)
 206                 return "Unsupported superblock version";
 207
 208         block_size = le16_to_cpu(sb->block_size);
 209
 210         if (!is_power_of_2(block_size) ||
 211             block_size > PAGE_SECTORS)
 212                 return "Bad block size";
 213
 214         if (bch_is_zero(sb->disk_uuid.b, sizeof(uuid_le)))
 215                 return "Bad disk UUID";
 216
 217         if (bch_is_zero(sb->user_uuid.b, sizeof(uuid_le)))
 218                 return "Bad user UUID";
 219
 220         if (bch_is_zero(sb->set_uuid.b, sizeof(uuid_le)))
 221                 return "Bad set UUID";
 222
 223         if (!sb->nr_in_set ||
 224             sb->nr_in_set <= sb->nr_this_dev ||
 225             sb->nr_in_set > MAX_CACHES_PER_SET)
 226                 return "Bad cache device number in set";
 227
 228         if (!CACHE_SET_META_REPLICAS_WANT(sb) ||
 229             CACHE_SET_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
 230                 return "Invalid number of metadata replicas";
 231
 232         if (!CACHE_SET_META_REPLICAS_HAVE(sb) ||
 233             CACHE_SET_META_REPLICAS_HAVE(sb) >
 234             CACHE_SET_META_REPLICAS_WANT(sb))
 235                 return "Invalid number of metadata replicas";
 236
 237         if (!CACHE_SET_DATA_REPLICAS_WANT(sb) ||
 238             CACHE_SET_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
 239                 return "Invalid number of data replicas";
 240
 241         if (!CACHE_SET_DATA_REPLICAS_HAVE(sb) ||
 242             CACHE_SET_DATA_REPLICAS_HAVE(sb) >
 243             CACHE_SET_DATA_REPLICAS_WANT(sb))
 244                 return "Invalid number of data replicas";
 245
 246         if (CACHE_SB_CSUM_TYPE(sb) >= BCH_CSUM_NR)
 247                 return "Invalid checksum type";
 248
 249         if (!CACHE_SET_BTREE_NODE_SIZE(sb))
 250                 return "Btree node size not set";
 251
 252         if (!is_power_of_2(CACHE_SET_BTREE_NODE_SIZE(sb)))
 253                 return "Btree node size not a power of two";
 254
 255         if (CACHE_SET_BTREE_NODE_SIZE(sb) > BTREE_NODE_SIZE_MAX)
 256                 return "Btree node size too large";
 257
 258         /* Default value, for old filesystems: */
 259         if (!CACHE_SET_GC_RESERVE(sb))
 260                 SET_CACHE_SET_GC_RESERVE(sb, 10);
 261
 262         if (CACHE_SET_GC_RESERVE(sb) < 5)
 263                 return "gc reserve percentage too small";
 264
 265         if (!CACHE_SET_JOURNAL_ENTRY_SIZE(sb))
 266                 SET_CACHE_SET_JOURNAL_ENTRY_SIZE(sb, 9);
 267
 268         /* 4 mb max: */
 269         if (512U << CACHE_SET_JOURNAL_ENTRY_SIZE(sb) > JOURNAL_ENTRY_SIZE_MAX)
 270                 return "max journal entry size too big";
 271
 272         if (le16_to_cpu(sb->u64s) < bch_journal_buckets_offset(sb))
 273                 return "Invalid superblock: member info area missing";
 274
 275         mi = cache_mi_to_cpu_mi(sb->members + sb->nr_this_dev);
 276
 277         if (mi.nbuckets > LONG_MAX)
 278                 return "Too many buckets";
 279
 280         if (mi.nbuckets < 1 << 8)
 281                 return "Not enough buckets";
 282
 283         if (!is_power_of_2(mi.bucket_size) ||
 284             mi.bucket_size < PAGE_SECTORS ||
 285             mi.bucket_size < block_size)
 286                 return "Bad bucket size";
 287
 288         if (get_capacity(disk_sb->bdev->bd_disk) <
 289             mi.bucket_size * mi.nbuckets)
 290                 return "Invalid superblock: device too small";
 291
 292         if (le64_to_cpu(sb->offset) +
 293             (__set_blocks(sb, le16_to_cpu(sb->u64s),
 294                           block_size << 9) * block_size) >
 295             mi.first_bucket * mi.bucket_size)
 296                 return "Invalid superblock: first bucket comes before end of super";
 297
 298         for (i = 0; i < bch_nr_journal_buckets(sb); i++)
 299                 if (journal_bucket(sb, i) <  mi.first_bucket ||
 300                     journal_bucket(sb, i) >= mi.nbuckets)
 301                         return "bad journal bucket";
 302
 303         return NULL;
 304 }
 305
 306 void free_super(struct bcache_superblock *sb)
 307 {
 308         if (sb->bio)
 309                 bio_put(sb->bio);
 310         if (!IS_ERR_OR_NULL(sb->bdev))
 311                 blkdev_put(sb->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 312
 313         free_pages((unsigned long) sb->sb, sb->page_order);
 314         memset(sb, 0, sizeof(*sb));
 315 }
 316
 317 static int __bch_super_realloc(struct bcache_superblock *sb, unsigned order)
 318 {
 319         struct cache_sb *new_sb;
 320         struct bio *bio;
 321
 322         if (sb->page_order >= order && sb->sb)
 323                 return 0;
 324
 325         new_sb = (void *) __get_free_pages(GFP_KERNEL, order);
 326         if (!new_sb)
 327                 return -ENOMEM;
 328
 329         bio = (dynamic_fault("bcache:add:super_realloc")
 330                ? NULL
 331                : bio_kmalloc(GFP_KERNEL, 1 << order));
 332         if (!bio) {
 333                 free_pages((unsigned long) new_sb, order);
 334                 return -ENOMEM;
 335         }
 336
 337         if (sb->sb)
 338                 memcpy(new_sb, sb->sb, PAGE_SIZE << sb->page_order);
 339
 340         free_pages((unsigned long) sb->sb, sb->page_order);
 341         sb->sb = new_sb;
 342
 343         if (sb->bio)
 344                 bio_put(sb->bio);
 345         sb->bio = bio;
 346
 347         sb->page_order = order;
 348
 349         return 0;
 350 }
 351
 352 int bch_super_realloc(struct bcache_superblock *sb, unsigned u64s)
 353 {
 354         struct cache_member *mi = sb->sb->members + sb->sb->nr_this_dev;
 355         char buf[BDEVNAME_SIZE];
 356         size_t bytes = __set_bytes((struct cache_sb *) NULL, u64s);
 357         u64 want = bytes + (SB_SECTOR << 9);
 358
 359         u64 first_bucket_offset = (u64) le16_to_cpu(mi->first_bucket) *
 360                 ((u64) le16_to_cpu(mi->bucket_size) << 9);
 361
 362         if (want > first_bucket_offset) {
 363                 pr_err("%s: superblock too big: want %llu but have %llu",
 364                        bdevname(sb->bdev, buf), want, first_bucket_offset);
 365                 return -ENOSPC;
 366         }
 367
 368         return __bch_super_realloc(sb, get_order(bytes));
 369 }
 370
 371 static const char *read_super(struct bcache_superblock *sb,
 372                               const char *path)
 373 {
 374         const char *err;
 375         unsigned order = 0;
 376
 377         lockdep_assert_held(&bch_register_lock);
 378
 379         memset(sb, 0, sizeof(*sb));
 380
 381         err = bch_blkdev_open(path, &sb, &sb->bdev);
 382         if (err)
 383                 return err;
 384 retry:
 385         err = "cannot allocate memory";
 386         if (__bch_super_realloc(sb, order))
 387                 goto err;
 388
 389         err = "dynamic fault";
 390         if (cache_set_init_fault("read_super"))
 391                 goto err;
 392
 393         bio_reset(sb->bio);
 394         sb->bio->bi_bdev = sb->bdev;
 395         sb->bio->bi_iter.bi_sector = SB_SECTOR;
 396         sb->bio->bi_iter.bi_size = PAGE_SIZE << sb->page_order;
 397         bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
 398         bch_bio_map(sb->bio, sb->sb);
 399
 400         err = "IO error";
 401         if (submit_bio_wait(sb->bio))
 402                 goto err;
 403
 404         err = "Not a bcache superblock";
 405         if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC))
 406                 goto err;
 407
 408         err = "Superblock has incorrect offset";
 409         if (le64_to_cpu(sb->sb->offset) != SB_SECTOR)
 410                 goto err;
 411
 412         pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
 413                  le64_to_cpu(sb->sb->version),
 414                  le64_to_cpu(sb->sb->flags),
 415                  le64_to_cpu(sb->sb->seq),
 416                  le16_to_cpu(sb->sb->u64s));
 417
 418         err = "Superblock block size smaller than device block size";
 419         if (le16_to_cpu(sb->sb->block_size) << 9 <
 420             bdev_logical_block_size(sb->bdev))
 421                 goto err;
 422
 423         order = get_order(__set_bytes(sb->sb, le16_to_cpu(sb->sb->u64s)));
 424         if (order > sb->page_order)
 425                 goto retry;
 426
 427         err = "bad checksum reading superblock";
 428         if (le64_to_cpu(sb->sb->csum) !=
 429             __csum_set(sb->sb, le16_to_cpu(sb->sb->u64s),
 430                        le64_to_cpu(sb->sb->version) <
 431                        BCACHE_SB_VERSION_CDEV_V3
 432                        ? BCH_CSUM_CRC64
 433                        : CACHE_SB_CSUM_TYPE(sb->sb)))
 434                 goto err;
 435
 436         return NULL;
 437 err:
 438         free_super(sb);
 439         return err;
 440 }
 441
 442 void __write_super(struct cache_set *c, struct bcache_superblock *disk_sb)
 443 {
 444         struct cache_sb *sb = disk_sb->sb;
 445         struct bio *bio = disk_sb->bio;
 446
 447         bio->bi_bdev            = disk_sb->bdev;
 448         bio->bi_iter.bi_sector  = SB_SECTOR;
 449         bio->bi_iter.bi_size    =
 450                 roundup(__set_bytes(sb, le16_to_cpu(sb->u64s)),
 451                         bdev_logical_block_size(disk_sb->bdev));
 452         bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
 453         bch_bio_map(bio, sb);
 454
 455         pr_debug("ver %llu, flags %llu, seq %llu",
 456                  le64_to_cpu(sb->version),
 457                  le64_to_cpu(sb->flags),
 458                  le64_to_cpu(sb->seq));
 459
 460         bch_generic_make_request(bio, c);
 461 }
 462
 463 static void write_super_endio(struct bio *bio)
 464 {
 465         struct cache *ca = bio->bi_private;
 466
 467         /* XXX: return errors directly */
 468
 469         cache_fatal_io_err_on(bio->bi_error, ca, "superblock write");
 470
 471         bch_account_io_completion(ca);
 472
 473         closure_put(&ca->set->sb_write);
 474         percpu_ref_put(&ca->ref);
 475 }
 476
 477 static void bcache_write_super_unlock(struct closure *cl)
 478 {
 479         struct cache_set *c = container_of(cl, struct cache_set, sb_write);
 480
 481         up(&c->sb_write_mutex);
 482 }
 483
 484 /* Update cached mi: */
 485 static int cache_set_mi_update(struct cache_set *c,
 486                                struct cache_member *mi,
 487                                unsigned nr_in_set)
 488 {
 489         struct cache_member_rcu *new, *old;
 490         struct cache *ca;
 491         unsigned i;
 492
 493         mutex_lock(&c->mi_lock);
 494
 495         new = kzalloc(sizeof(struct cache_member_rcu) +
 496                       sizeof(struct cache_member_cpu) * nr_in_set,
 497                       GFP_KERNEL);
 498         if (!new) {
 499                 mutex_unlock(&c->mi_lock);
 500                 return -ENOMEM;
 501         }
 502
 503         new->nr_in_set = nr_in_set;
 504
 505         for (i = 0; i < nr_in_set; i++)
 506                 new->m[i] = cache_mi_to_cpu_mi(&mi[i]);
 507
 508         rcu_read_lock();
 509         for_each_cache(ca, c, i)
 510                 ca->mi = new->m[i];
 511         rcu_read_unlock();
 512
 513         old = rcu_dereference_protected(c->members,
 514                                 lockdep_is_held(&c->mi_lock));
 515
 516         rcu_assign_pointer(c->members, new);
 517         if (old)
 518                 kfree_rcu(old, rcu);
 519
 520         mutex_unlock(&c->mi_lock);
 521         return 0;
 522 }
 523
 524 /* doesn't copy member info */
 525 static void __copy_super(struct cache_sb *dst, struct cache_sb *src)
 526 {
 527         dst->version            = src->version;
 528         dst->seq                = src->seq;
 529         dst->user_uuid          = src->user_uuid;
 530         dst->set_uuid           = src->set_uuid;
 531         memcpy(dst->label, src->label, SB_LABEL_SIZE);
 532         dst->flags              = src->flags;
 533         dst->flags2             = src->flags2;
 534         dst->nr_in_set          = src->nr_in_set;
 535         dst->block_size         = src->block_size;
 536 }
 537
 538 static int cache_sb_to_cache_set(struct cache_set *c, struct cache_sb *src)
 539 {
 540         struct cache_member *new;
 541
 542         lockdep_assert_held(&bch_register_lock);
 543
 544         new = kzalloc(sizeof(struct cache_member) * src->nr_in_set,
 545                       GFP_KERNEL);
 546         if (!new)
 547                 return -ENOMEM;
 548
 549         memcpy(new, src->members,
 550                src->nr_in_set * sizeof(struct cache_member));
 551
 552         if (cache_set_mi_update(c, new, src->nr_in_set)) {
 553                 kfree(new);
 554                 return -ENOMEM;
 555         }
 556
 557         kfree(c->disk_mi);
 558         c->disk_mi = new;
 559
 560         __copy_super(&c->disk_sb, src);
 561
 562         c->sb.block_size        = le16_to_cpu(src->block_size);
 563         c->sb.btree_node_size   = CACHE_SET_BTREE_NODE_SIZE(src);
 564         c->sb.nr_in_set         = src->nr_in_set;
 565         c->sb.clean             = CACHE_SET_CLEAN(src);
 566         c->sb.meta_replicas_have= CACHE_SET_META_REPLICAS_HAVE(src);
 567         c->sb.data_replicas_have= CACHE_SET_DATA_REPLICAS_HAVE(src);
 568         c->sb.str_hash_type     = CACHE_SET_STR_HASH_TYPE(src);
 569
 570         return 0;
 571 }
 572
 573 static int cache_sb_from_cache_set(struct cache_set *c, struct cache *ca)
 574 {
 575         struct cache_sb *src = &c->disk_sb, *dst = ca->disk_sb.sb;
 576
 577         if (src->nr_in_set != dst->nr_in_set) {
 578                 /*
 579                  * We have to preserve the list of journal buckets on the
 580                  * cache's superblock:
 581                  */
 582                 unsigned old_offset = bch_journal_buckets_offset(dst);
 583                 unsigned u64s = bch_journal_buckets_offset(src)
 584                         + bch_nr_journal_buckets(dst);
 585                 int ret = bch_super_realloc(&ca->disk_sb, u64s);
 586
 587                 if (ret)
 588                         return ret;
 589
 590                 dst->nr_in_set  = src->nr_in_set;
 591                 dst->u64s       = cpu_to_le16(u64s);
 592
 593                 memmove(dst->_data + bch_journal_buckets_offset(dst),
 594                         dst->_data + old_offset,
 595                         bch_nr_journal_buckets(dst) * sizeof(u64));
 596         }
 597
 598         memcpy(dst->_data,
 599                c->disk_mi,
 600                src->nr_in_set * sizeof(struct cache_member));
 601
 602         __copy_super(dst, src);
 603
 604         return 0;
 605 }
 606
 607 static void __bcache_write_super(struct cache_set *c)
 608 {
 609         struct closure *cl = &c->sb_write;
 610         struct cache *ca;
 611         unsigned i;
 612
 613         cache_set_mi_update(c, c->disk_mi, c->sb.nr_in_set);
 614
 615         closure_init(cl, &c->cl);
 616
 617         le64_add_cpu(&c->disk_sb.seq, 1);
 618
 619         for_each_cache(ca, c, i) {
 620                 struct cache_sb *sb = ca->disk_sb.sb;
 621                 struct bio *bio = ca->disk_sb.bio;
 622
 623                 cache_sb_from_cache_set(c, ca);
 624
 625                 SET_CACHE_SB_CSUM_TYPE(sb, c->opts.metadata_checksum);
 626                 sb->csum = cpu_to_le64(__csum_set(sb,
 627                                                   le16_to_cpu(sb->u64s),
 628                                                   CACHE_SB_CSUM_TYPE(sb)));
 629
 630                 bio_reset(bio);
 631                 bio->bi_bdev    = ca->disk_sb.bdev;
 632                 bio->bi_end_io  = write_super_endio;
 633                 bio->bi_private = ca;
 634
 635                 closure_get(cl);
 636                 percpu_ref_get(&ca->ref);
 637                 __write_super(c, &ca->disk_sb);
 638         }
 639
 640         closure_return_with_destructor(cl, bcache_write_super_unlock);
 641 }
 642
 643 void bcache_write_super(struct cache_set *c)
 644 {
 645         down(&c->sb_write_mutex);
 646         __bcache_write_super(c);
 647 }
 648
 649 void bch_check_mark_super_slowpath(struct cache_set *c, const struct bkey_i *k,
 650                                    bool meta)
 651 {
 652         struct cache_member *mi;
 653         struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
 654         const struct bch_extent_ptr *ptr;
 655
 656         if (!CACHE_SET_SYNC(&c->disk_sb))
 657                 return;
 658
 659         down(&c->sb_write_mutex);
 660
 661         /* recheck, might have raced */
 662         if (bch_check_super_marked(c, k, meta)) {
 663                 up(&c->sb_write_mutex);
 664                 return;
 665         }
 666
 667         mi = c->disk_mi;
 668
 669         extent_for_each_ptr(e, ptr)
 670                 if (bch_extent_ptr_is_dirty(c, e, ptr))
 671                         (meta
 672                          ? SET_CACHE_HAS_METADATA
 673                          : SET_CACHE_HAS_DATA)(mi + ptr->dev, true);
 674
 675         __bcache_write_super(c);
 676 }
 677
 678 /* Cache set RO/RW: */
 679
 680 /*
 681  * For startup/shutdown of RW stuff, the dependencies are:
 682  *
 683  * - foreground writes depend on copygc and tiering (to free up space)
 684  *
 685  * - copygc and tiering depend on mark and sweep gc (they actually probably
 686  *   don't because they either reserve ahead of time or don't block if
 687  *   allocations fail, but allocations can require mark and sweep gc to run
 688  *   because of generation number wraparound)
 689  *
 690  * - all of the above depends on the allocator threads
 691  *
 692  * - allocator depends on the journal (when it rewrites prios and gens)
 693  */
 694
 695 static void __bch_cache_set_read_only(struct cache_set *c)
 696 {
 697         struct cache *ca;
 698         unsigned i;
 699
 700         c->tiering_pd.rate.rate = UINT_MAX;
 701         bch_ratelimit_reset(&c->tiering_pd.rate);
 702         bch_tiering_read_stop(c);
 703
 704         for_each_cache(ca, c, i)
 705                 bch_moving_gc_stop(ca);
 706
 707         bch_gc_thread_stop(c);
 708
 709         bch_btree_flush(c);
 710
 711         for_each_cache(ca, c, i)
 712                 bch_cache_allocator_stop(ca);
 713
 714         /*
 715          * Write a journal entry after flushing the btree, so we don't end up
 716          * replaying everything we just flushed:
 717          */
 718         if (test_bit(JOURNAL_STARTED, &c->journal.flags)) {
 719                 int ret;
 720
 721                 bch_journal_flush_async(&c->journal, NULL);
 722                 ret = bch_journal_meta(&c->journal);
 723                 BUG_ON(ret && !bch_journal_error(&c->journal));
 724         }
 725
 726         cancel_delayed_work_sync(&c->journal.write_work);
 727         cancel_delayed_work_sync(&c->journal.reclaim_work);
 728 }
 729
 730 static void bch_writes_disabled(struct percpu_ref *writes)
 731 {
 732         struct cache_set *c = container_of(writes, struct cache_set, writes);
 733
 734         set_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags);
 735         wake_up(&bch_read_only_wait);
 736 }
 737
 738 static void bch_cache_set_read_only_work(struct work_struct *work)
 739 {
 740         struct cache_set *c =
 741                 container_of(work, struct cache_set, read_only_work);
 742
 743         percpu_ref_put(&c->writes);
 744
 745         del_timer(&c->foreground_write_wakeup);
 746         cancel_delayed_work(&c->pd_controllers_update);
 747
 748         c->foreground_write_pd.rate.rate = UINT_MAX;
 749         bch_wake_delayed_writes((unsigned long) c);
 750
 751         if (!test_bit(CACHE_SET_EMERGENCY_RO, &c->flags)) {
 752                 /*
 753                  * If we're not doing an emergency shutdown, we want to wait on
 754                  * outstanding writes to complete so they don't see spurious
 755                  * errors due to shutting down the allocator:
 756                  */
 757                 wait_event(bch_read_only_wait,
 758                            test_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags));
 759
 760                 __bch_cache_set_read_only(c);
 761
 762                 if (!bch_journal_error(&c->journal) &&
 763                     !test_bit(CACHE_SET_ERROR, &c->flags)) {
 764                         SET_CACHE_SET_CLEAN(&c->disk_sb, true);
 765                         bcache_write_super(c);
 766                 }
 767         } else {
 768                 /*
 769                  * If we are doing an emergency shutdown outstanding writes may
 770                  * hang until we shutdown the allocator so we don't want to wait
 771                  * on outstanding writes before shutting everything down - but
 772                  * we do need to wait on them before returning and signalling
 773                  * that going RO is complete:
 774                  */
 775                 __bch_cache_set_read_only(c);
 776
 777                 wait_event(bch_read_only_wait,
 778                            test_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags));
 779         }
 780
 781         bch_notify_cache_set_read_only(c);
 782         trace_bcache_cache_set_read_only_done(c);
 783
 784         set_bit(CACHE_SET_RO_COMPLETE, &c->flags);
 785         wake_up(&bch_read_only_wait);
 786 }
 787
 788 bool bch_cache_set_read_only(struct cache_set *c)
 789 {
 790         if (test_and_set_bit(CACHE_SET_RO, &c->flags))
 791                 return false;
 792
 793         trace_bcache_cache_set_read_only(c);
 794
 795         percpu_ref_get(&c->writes);
 796
 797         /*
 798          * Block new foreground-end write operations from starting - any new
 799          * writes will return -EROFS:
 800          *
 801          * (This is really blocking new _allocations_, writes to previously
 802          * allocated space can still happen until stopping the allocator in
 803          * bch_cache_allocator_stop()).
 804          */
 805         percpu_ref_kill(&c->writes);
 806
 807         queue_work(system_freezable_wq, &c->read_only_work);
 808         return true;
 809 }
 810
 811 bool bch_cache_set_emergency_read_only(struct cache_set *c)
 812 {
 813         bool ret = !test_and_set_bit(CACHE_SET_EMERGENCY_RO, &c->flags);
 814
 815         bch_cache_set_read_only(c);
 816         bch_journal_halt(&c->journal);
 817
 818         wake_up(&bch_read_only_wait);
 819         return ret;
 820 }
 821
 822 void bch_cache_set_read_only_sync(struct cache_set *c)
 823 {
 824         /* so we don't race with bch_cache_set_read_write() */
 825         lockdep_assert_held(&bch_register_lock);
 826
 827         bch_cache_set_read_only(c);
 828
 829         wait_event(bch_read_only_wait,
 830                    test_bit(CACHE_SET_RO_COMPLETE, &c->flags) &&
 831                    test_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags));
 832 }
 833
 834 static const char *__bch_cache_set_read_write(struct cache_set *c)
 835 {
 836         struct cache *ca;
 837         const char *err;
 838         unsigned i;
 839
 840         lockdep_assert_held(&bch_register_lock);
 841
 842         err = "error starting allocator thread";
 843         for_each_cache(ca, c, i)
 844                 if (ca->mi.state == CACHE_ACTIVE &&
 845                     bch_cache_allocator_start(ca)) {
 846                         percpu_ref_put(&ca->ref);
 847                         goto err;
 848                 }
 849
 850         err = "error starting btree GC thread";
 851         if (bch_gc_thread_start(c))
 852                 goto err;
 853
 854         for_each_cache(ca, c, i) {
 855                 if (ca->mi.state != CACHE_ACTIVE)
 856                         continue;
 857
 858                 err = "error starting moving GC thread";
 859                 if (bch_moving_gc_thread_start(ca)) {
 860                         percpu_ref_put(&ca->ref);
 861                         goto err;
 862                 }
 863         }
 864
 865         err = "error starting tiering thread";
 866         if (bch_tiering_read_start(c))
 867                 goto err;
 868
 869         schedule_delayed_work(&c->pd_controllers_update, 5 * HZ);
 870
 871         return NULL;
 872 err:
 873         __bch_cache_set_read_only(c);
 874         return err;
 875 }
 876
 877 const char *bch_cache_set_read_write(struct cache_set *c)
 878 {
 879         const char *err;
 880
 881         lockdep_assert_held(&bch_register_lock);
 882
 883         if (!test_bit(CACHE_SET_RO_COMPLETE, &c->flags))
 884                 return NULL;
 885
 886         err = __bch_cache_set_read_write(c);
 887         if (err)
 888                 return err;
 889
 890         percpu_ref_reinit(&c->writes);
 891
 892         clear_bit(CACHE_SET_WRITE_DISABLE_COMPLETE, &c->flags);
 893         clear_bit(CACHE_SET_EMERGENCY_RO, &c->flags);
 894         clear_bit(CACHE_SET_RO_COMPLETE, &c->flags);
 895         clear_bit(CACHE_SET_RO, &c->flags);
 896         return NULL;
 897 }
 898
 899 /* Cache set startup/shutdown: */
 900
 901 static void cache_set_free(struct cache_set *c)
 902 {
 903         del_timer_sync(&c->foreground_write_wakeup);
 904         cancel_delayed_work_sync(&c->pd_controllers_update);
 905         cancel_work_sync(&c->read_only_work);
 906         cancel_work_sync(&c->bio_submit_work);
 907         cancel_work_sync(&c->read_retry_work);
 908
 909         bch_btree_cache_free(c);
 910         bch_journal_free(&c->journal);
 911         bch_io_clock_exit(&c->io_clock[WRITE]);
 912         bch_io_clock_exit(&c->io_clock[READ]);
 913         bch_compress_free(c);
 914         bdi_destroy(&c->bdi);
 915         lg_lock_free(&c->bucket_stats_lock);
 916         free_percpu(c->bucket_stats_percpu);
 917         mempool_exit(&c->btree_bounce_pool);
 918         mempool_exit(&c->bio_bounce_pages);
 919         bioset_exit(&c->bio_write);
 920         bioset_exit(&c->bio_read_split);
 921         bioset_exit(&c->bio_read);
 922         bioset_exit(&c->btree_read_bio);
 923         mempool_exit(&c->btree_interior_update_pool);
 924         mempool_exit(&c->btree_reserve_pool);
 925         mempool_exit(&c->fill_iter);
 926         mempool_exit(&c->search);
 927         percpu_ref_exit(&c->writes);
 928
 929         if (c->copygc_wq)
 930                 destroy_workqueue(c->copygc_wq);
 931         if (c->wq)
 932                 destroy_workqueue(c->wq);
 933
 934         kfree_rcu(rcu_dereference_protected(c->members, 1), rcu); /* shutting down */
 935         kfree(c->disk_mi);
 936         kfree(c);
 937         module_put(THIS_MODULE);
 938 }
 939
 940 /*
 941  * should be __cache_set_stop4 - block devices are closed, now we can finally
 942  * free it
 943  */
 944 void bch_cache_set_release(struct kobject *kobj)
 945 {
 946         struct cache_set *c = container_of(kobj, struct cache_set, kobj);
 947         struct completion *stop_completion = c->stop_completion;
 948
 949         bch_notify_cache_set_stopped(c);
 950         bch_info(c, "stopped");
 951
 952         cache_set_free(c);
 953
 954         if (stop_completion)
 955                 complete(stop_completion);
 956 }
 957
 958 /*
 959  * All activity on the cache_set should have stopped now - close devices:
 960  */
 961 static void __cache_set_stop3(struct closure *cl)
 962 {
 963         struct cache_set *c = container_of(cl, struct cache_set, cl);
 964         struct cache *ca;
 965         unsigned i;
 966
 967         mutex_lock(&bch_register_lock);
 968         for_each_cache(ca, c, i)
 969                 bch_cache_stop(ca);
 970         mutex_unlock(&bch_register_lock);
 971
 972         mutex_lock(&bch_register_lock);
 973         list_del(&c->list);
 974         if (c->minor >= 0)
 975                 idr_remove(&bch_chardev_minor, c->minor);
 976         mutex_unlock(&bch_register_lock);
 977
 978         closure_debug_destroy(&c->cl);
 979         kobject_put(&c->kobj);
 980 }
 981
 982 /*
 983  * Openers (i.e. block devices) should have exited, shutdown all userspace
 984  * interfaces and wait for &c->cl to hit 0
 985  */
 986 static void __cache_set_stop2(struct closure *cl)
 987 {
 988         struct cache_set *c = container_of(cl, struct cache_set, caching);
 989
 990         bch_debug_exit_cache_set(c);
 991
 992         if (!IS_ERR_OR_NULL(c->chardev))
 993                 device_unregister(c->chardev);
 994
 995         if (c->kobj.state_in_sysfs)
 996                 kobject_del(&c->kobj);
 997
 998         bch_cache_accounting_destroy(&c->accounting);
 999
1000         kobject_put(&c->time_stats);
1001         kobject_put(&c->opts_dir);
1002         kobject_put(&c->internal);
1003
1004         mutex_lock(&bch_register_lock);
1005         bch_cache_set_read_only_sync(c);
1006         mutex_unlock(&bch_register_lock);
1007
1008         closure_return(cl);
1009 }
1010
1011 /*
1012  * First phase of the shutdown process that's kicked off by cache_set_stop(); we
1013  * haven't waited for anything to stop yet, we're just punting to process
1014  * context to shut down block devices:
1015  */
1016 static void __cache_set_stop1(struct closure *cl)
1017 {
1018         struct cache_set *c = container_of(cl, struct cache_set, caching);
1019
1020         bch_blockdevs_stop(c);
1021
1022         continue_at(cl, __cache_set_stop2, system_wq);
1023 }
1024
1025 void bch_cache_set_stop(struct cache_set *c)
1026 {
1027         if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
1028                 closure_queue(&c->caching);
1029 }
1030
1031 void bch_cache_set_unregister(struct cache_set *c)
1032 {
1033         if (!test_and_set_bit(CACHE_SET_UNREGISTERING, &c->flags))
1034                 bch_cache_set_stop(c);
1035 }
1036
1037 static unsigned cache_set_nr_devices(struct cache_set *c)
1038 {
1039         unsigned i, nr = 0;
1040         struct cache_member *mi = c->disk_mi;
1041
1042         lockdep_assert_held(&bch_register_lock);
1043
1044         for (i = 0; i < c->disk_sb.nr_in_set; i++)
1045                 if (!bch_is_zero(mi[i].uuid.b, sizeof(uuid_le)))
1046                         nr++;
1047
1048         return nr;
1049 }
1050
1051 static unsigned cache_set_nr_online_devices(struct cache_set *c)
1052 {
1053         unsigned i, nr = 0;
1054
1055         for (i = 0; i < c->sb.nr_in_set; i++)
1056                 if (c->cache[i])
1057                         nr++;
1058
1059         return nr;
1060 }
1061
1062 #define alloc_bucket_pages(gfp, ca)                     \
1063         ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(ca))))
1064
1065 static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb,
1066                                              struct cache_set_opts opts)
1067 {
1068         struct cache_set *c;
1069         unsigned iter_size, journal_entry_bytes;
1070
1071         c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
1072         if (!c)
1073                 return NULL;
1074
1075         __module_get(THIS_MODULE);
1076
1077         c->minor                = -1;
1078
1079         sema_init(&c->sb_write_mutex, 1);
1080         INIT_RADIX_TREE(&c->devices, GFP_KERNEL);
1081         mutex_init(&c->btree_cache_lock);
1082         mutex_init(&c->bucket_lock);
1083         mutex_init(&c->btree_root_lock);
1084         INIT_WORK(&c->read_only_work, bch_cache_set_read_only_work);
1085         mutex_init(&c->mi_lock);
1086
1087         init_rwsem(&c->gc_lock);
1088
1089 #define BCH_TIME_STAT(name, frequency_units, duration_units)            \
1090         spin_lock_init(&c->name##_time.lock);
1091         BCH_TIME_STATS()
1092 #undef BCH_TIME_STAT
1093
1094         bch_open_buckets_init(c);
1095         bch_tiering_init_cache_set(c);
1096
1097         INIT_LIST_HEAD(&c->list);
1098         INIT_LIST_HEAD(&c->cached_devs);
1099         INIT_LIST_HEAD(&c->btree_cache);
1100         INIT_LIST_HEAD(&c->btree_cache_freeable);
1101         INIT_LIST_HEAD(&c->btree_cache_freed);
1102
1103         INIT_LIST_HEAD(&c->btree_interior_update_list);
1104         mutex_init(&c->btree_reserve_cache_lock);
1105         mutex_init(&c->btree_interior_update_lock);
1106
1107         mutex_init(&c->bio_bounce_pages_lock);
1108         INIT_WORK(&c->bio_submit_work, bch_bio_submit_work);
1109         spin_lock_init(&c->bio_submit_lock);
1110         bio_list_init(&c->read_retry_list);
1111         spin_lock_init(&c->read_retry_lock);
1112         INIT_WORK(&c->read_retry_work, bch_read_retry_work);
1113         mutex_init(&c->zlib_workspace_lock);
1114
1115         seqcount_init(&c->gc_pos_lock);
1116
1117         c->prio_clock[READ].hand = 1;
1118         c->prio_clock[READ].min_prio = 0;
1119         c->prio_clock[WRITE].hand = 1;
1120         c->prio_clock[WRITE].min_prio = 0;
1121
1122         c->congested_read_threshold_us  = 2000;
1123         c->congested_write_threshold_us = 20000;
1124         c->error_limit  = 16 << IO_ERROR_SHIFT;
1125         init_waitqueue_head(&c->writeback_wait);
1126
1127         c->writeback_pages_max = (256 << 10) / PAGE_SIZE;
1128
1129         c->copy_gc_enabled = 1;
1130         c->tiering_enabled = 1;
1131         c->tiering_percent = 10;
1132
1133         c->foreground_target_percent = 20;
1134
1135         c->journal.write_time   = &c->journal_write_time;
1136         c->journal.delay_time   = &c->journal_delay_time;
1137         c->journal.blocked_time = &c->journal_blocked_time;
1138         c->journal.flush_seq_time = &c->journal_flush_seq_time;
1139
1140         mutex_init(&c->uevent_lock);
1141
1142         if (cache_sb_to_cache_set(c, sb))
1143                 goto err;
1144
1145         scnprintf(c->name, sizeof(c->name), "%pU", &c->disk_sb.user_uuid);
1146
1147         c->opts = cache_superblock_opts(sb);
1148         cache_set_opts_apply(&c->opts, opts);
1149
1150         c->block_bits           = ilog2(c->sb.block_size);
1151
1152         if (cache_set_init_fault("cache_set_alloc"))
1153                 goto err;
1154
1155         iter_size = (btree_blocks(c) + 1) * 2 *
1156                 sizeof(struct btree_node_iter_set);
1157
1158         journal_entry_bytes = 512U << CACHE_SET_JOURNAL_ENTRY_SIZE(sb);
1159
1160         if (!(c->wq = alloc_workqueue("bcache",
1161                                 WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
1162             !(c->copygc_wq = alloc_workqueue("bcache_copygc",
1163                                 WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
1164             percpu_ref_init(&c->writes, bch_writes_disabled, 0, GFP_KERNEL) ||
1165             mempool_init_slab_pool(&c->search, 1, bch_search_cache) ||
1166             mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1,
1167                                       sizeof(struct btree_reserve)) ||
1168             mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
1169                                       sizeof(struct btree_interior_update)) ||
1170             mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
1171             bioset_init(&c->btree_read_bio, 1, 0) ||
1172             bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio)) ||
1173             bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio)) ||
1174             bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio)) ||
1175             mempool_init_page_pool(&c->bio_bounce_pages,
1176                                    max_t(unsigned,
1177                                          c->sb.btree_node_size,
1178                                          CRC32_EXTENT_SIZE_MAX) /
1179                                    PAGE_SECTORS, 0) ||
1180             !(c->bucket_stats_percpu = alloc_percpu(struct bucket_stats_cache_set)) ||
1181             lg_lock_init(&c->bucket_stats_lock) ||
1182             mempool_init_page_pool(&c->btree_bounce_pool, 1,
1183                                    ilog2(btree_pages(c))) ||
1184             bdi_setup_and_register(&c->bdi, "bcache") ||
1185             bch_io_clock_init(&c->io_clock[READ]) ||
1186             bch_io_clock_init(&c->io_clock[WRITE]) ||
1187             bch_journal_alloc(&c->journal, journal_entry_bytes) ||
1188             bch_btree_cache_alloc(c) ||
1189             bch_compress_init(c))
1190                 goto err;
1191
1192         c->bdi.ra_pages         = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
1193         c->bdi.congested_fn     = bch_congested_fn;
1194         c->bdi.congested_data   = c;
1195
1196         /*
1197          * Now that all allocations have succeeded, init various refcounty
1198          * things that let us shutdown:
1199          */
1200         closure_init(&c->cl, NULL);
1201
1202         c->kobj.kset = bcache_kset;
1203         kobject_init(&c->kobj, &bch_cache_set_ktype);
1204         kobject_init(&c->internal, &bch_cache_set_internal_ktype);
1205         kobject_init(&c->opts_dir, &bch_cache_set_opts_dir_ktype);
1206         kobject_init(&c->time_stats, &bch_cache_set_time_stats_ktype);
1207
1208         bch_cache_accounting_init(&c->accounting, &c->cl);
1209
1210         closure_init(&c->caching, &c->cl);
1211         set_closure_fn(&c->caching, __cache_set_stop1, system_wq);
1212
1213         continue_at_noreturn(&c->cl, __cache_set_stop3, system_wq);
1214         return c;
1215 err:
1216         cache_set_free(c);
1217         return NULL;
1218 }
1219
1220 static int bch_cache_set_online(struct cache_set *c)
1221 {
1222         struct cache *ca;
1223         unsigned i;
1224
1225         lockdep_assert_held(&bch_register_lock);
1226
1227         if (c->kobj.state_in_sysfs)
1228                 return 0;
1229
1230         c->minor = idr_alloc(&bch_chardev_minor, c, 0, 0, GFP_KERNEL);
1231         if (c->minor < 0)
1232                 return c->minor;
1233
1234         c->chardev = device_create(bch_chardev_class, NULL,
1235                                    MKDEV(bch_chardev_major, c->minor), NULL,
1236                                    "bcache%u-ctl", c->minor);
1237         if (IS_ERR(c->chardev))
1238                 return PTR_ERR(c->chardev);
1239
1240         if (kobject_add(&c->kobj, NULL, "%pU", c->disk_sb.user_uuid.b) ||
1241             kobject_add(&c->internal, &c->kobj, "internal") ||
1242             kobject_add(&c->opts_dir, &c->kobj, "options") ||
1243             kobject_add(&c->time_stats, &c->kobj, "time_stats") ||
1244             bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
1245                 return -1;
1246
1247         for_each_cache(ca, c, i)
1248                 if (bch_cache_online(ca)) {
1249                         percpu_ref_put(&ca->ref);
1250                         return -1;
1251                 }
1252
1253         list_add(&c->list, &bch_cache_sets);
1254         return 0;
1255 }
1256
1257 static const char *run_cache_set(struct cache_set *c)
1258 {
1259         const char *err = "cannot allocate memory";
1260         struct cache *ca;
1261         unsigned i, id;
1262         time64_t now;
1263         LIST_HEAD(journal);
1264         struct jset *j;
1265         int ret = -EINVAL;
1266
1267         lockdep_assert_held(&bch_register_lock);
1268         BUG_ON(test_bit(CACHE_SET_RUNNING, &c->flags));
1269
1270         /* We don't want bch_fatal_error() to free underneath us */
1271         closure_get(&c->caching);
1272
1273         /*
1274          * Make sure that each cache object's mi is up to date before
1275          * we start testing it.
1276          */
1277         for_each_cache(ca, c, i)
1278                 cache_sb_from_cache_set(c, ca);
1279
1280         /*
1281          * CACHE_SET_SYNC is true if the cache set has already been run
1282          * and potentially has data.
1283          * It is false if it is the first time it is run.
1284          */
1285
1286         if (CACHE_SET_SYNC(&c->disk_sb)) {
1287                 ret = bch_journal_read(c, &journal);
1288                 if (ret)
1289                         goto err;
1290
1291                 pr_debug("btree_journal_read() done");
1292
1293                 j = &list_entry(journal.prev, struct journal_replay, list)->j;
1294
1295                 err = "error reading priorities";
1296                 for_each_cache(ca, c, i) {
1297                         ret = bch_prio_read(ca);
1298                         if (ret) {
1299                                 percpu_ref_put(&ca->ref);
1300                                 goto err;
1301                         }
1302                 }
1303
1304                 c->prio_clock[READ].hand = le16_to_cpu(j->read_clock);
1305                 c->prio_clock[WRITE].hand = le16_to_cpu(j->write_clock);
1306
1307                 for_each_cache(ca, c, i) {
1308                         bch_recalc_min_prio(ca, READ);
1309                         bch_recalc_min_prio(ca, WRITE);
1310                 }
1311
1312                 /*
1313                  * If bch_prio_read() fails it'll call cache_set_error and we'll
1314                  * tear everything down right away, but if we perhaps checked
1315                  * sooner we could avoid journal replay.
1316                  */
1317
1318                 for (id = 0; id < BTREE_ID_NR; id++) {
1319                         unsigned level;
1320                         struct bkey_i *k;
1321
1322                         err = "bad btree root";
1323                         k = bch_journal_find_btree_root(c, j, id, &level);
1324                         if (!k && id == BTREE_ID_EXTENTS)
1325                                 goto err;
1326                         if (!k) {
1327                                 pr_debug("missing btree root: %d", id);
1328                                 continue;
1329                         }
1330
1331                         err = "error reading btree root";
1332                         if (bch_btree_root_read(c, id, k, level))
1333                                 goto err;
1334                 }
1335
1336                 bch_verbose(c, "starting mark and sweep:");
1337
1338                 err = "error in recovery";
1339                 if (bch_initial_gc(c, &journal))
1340                         goto err;
1341
1342                 bch_verbose(c, "mark and sweep done");
1343
1344                 /*
1345                  * bch_journal_start() can't happen sooner, or btree_gc_finish()
1346                  * will give spurious errors about oldest_gen > bucket_gen -
1347                  * this is a hack but oh well.
1348                  */
1349                 bch_journal_start(c);
1350
1351                 err = "error starting allocator thread";
1352                 for_each_cache(ca, c, i)
1353                         if (ca->mi.state == CACHE_ACTIVE &&
1354                             bch_cache_allocator_start(ca)) {
1355                                 percpu_ref_put(&ca->ref);
1356                                 goto err;
1357                         }
1358
1359                 bch_verbose(c, "starting journal replay:");
1360
1361                 err = "journal replay failed";
1362                 ret = bch_journal_replay(c, &journal);
1363                 if (ret)
1364                         goto err;
1365
1366                 bch_verbose(c, "journal replay done");
1367
1368                 /*
1369                  * Write a new journal entry _before_ we start journalling new
1370                  * data - otherwise, we could end up with btree node bsets with
1371                  * journal seqs arbitrarily far in the future vs. the most
1372                  * recently written journal entry on disk, if we crash before
1373                  * writing the next journal entry:
1374                  */
1375                 err = "error writing journal entry";
1376                 if (bch_journal_meta(&c->journal))
1377                         goto err;
1378
1379                 bch_verbose(c, "starting fs gc:");
1380                 err = "error in fs gc";
1381                 ret = bch_gc_inode_nlinks(c);
1382                 if (ret)
1383                         goto err;
1384                 bch_verbose(c, "fs gc done");
1385
1386                 if (!c->opts.nofsck) {
1387                         bch_verbose(c, "starting fsck:");
1388                         err = "error in fsck";
1389                         ret = bch_fsck(c);
1390                         if (ret)
1391                                 goto err;
1392                         bch_verbose(c, "fsck done");
1393                 }
1394         } else {
1395                 struct bkey_i_inode inode;
1396                 struct closure cl;
1397
1398                 closure_init_stack(&cl);
1399
1400                 bch_notice(c, "initializing new filesystem");
1401
1402                 err = "unable to allocate journal buckets";
1403                 for_each_cache(ca, c, i)
1404                         if (bch_cache_journal_alloc(ca)) {
1405                                 percpu_ref_put(&ca->ref);
1406                                 goto err;
1407                         }
1408
1409                 bch_initial_gc(c, NULL);
1410
1411                 /*
1412                  * journal_res_get() will crash if called before this has
1413                  * set up the journal.pin FIFO and journal.cur pointer:
1414                  */
1415                 bch_journal_start(c);
1416                 bch_journal_set_replay_done(&c->journal);
1417
1418                 err = "error starting allocator thread";
1419                 for_each_cache(ca, c, i)
1420                         if (ca->mi.state == CACHE_ACTIVE &&
1421                             bch_cache_allocator_start(ca)) {
1422                                 percpu_ref_put(&ca->ref);
1423                                 goto err;
1424                         }
1425
1426                 err = "cannot allocate new btree root";
1427                 for (id = 0; id < BTREE_ID_NR; id++)
1428                         if (bch_btree_root_alloc(c, id, &cl)) {
1429                                 closure_sync(&cl);
1430                                 goto err;
1431                         }
1432
1433                 /* Wait for new btree roots to be written: */
1434                 closure_sync(&cl);
1435
1436                 bkey_inode_init(&inode.k_i);
1437                 inode.k.p.inode = BCACHE_ROOT_INO;
1438                 inode.v.i_mode = cpu_to_le16(S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO);
1439                 inode.v.i_nlink = cpu_to_le32(2);
1440                 get_random_bytes(&inode.v.i_hash_seed, sizeof(inode.v.i_hash_seed));
1441                 SET_INODE_STR_HASH_TYPE(&inode.v, c->sb.str_hash_type);
1442
1443                 err = "error creating root directory";
1444                 if (bch_btree_insert(c, BTREE_ID_INODES, &inode.k_i,
1445                                      NULL, NULL, NULL, 0))
1446                         goto err;
1447
1448                 err = "error writing first journal entry";
1449                 if (bch_journal_meta(&c->journal))
1450                         goto err;
1451         }
1452
1453         if (c->opts.read_only) {
1454                 bch_cache_set_read_only_sync(c);
1455         } else {
1456                 err = __bch_cache_set_read_write(c);
1457                 if (err)
1458                         goto err;
1459         }
1460
1461         now = ktime_get_seconds();
1462         rcu_read_lock();
1463         for_each_cache_rcu(ca, c, i)
1464                 c->disk_mi[ca->sb.nr_this_dev].last_mount = cpu_to_le64(now);
1465         rcu_read_unlock();
1466
1467         /* Mark cache set as initialized: */
1468         SET_CACHE_SET_SYNC(&c->disk_sb, true);
1469         SET_CACHE_SET_CLEAN(&c->disk_sb, false);
1470         bcache_write_super(c);
1471
1472         err = "dynamic fault";
1473         if (cache_set_init_fault("run_cache_set"))
1474                 goto err;
1475
1476         err = "error creating kobject";
1477         if (bch_cache_set_online(c))
1478                 goto err;
1479
1480         err = "can't bring up blockdev volumes";
1481         if (bch_blockdev_volumes_start(c))
1482                 goto err;
1483
1484         bch_debug_init_cache_set(c);
1485         set_bit(CACHE_SET_RUNNING, &c->flags);
1486         bch_attach_backing_devs(c);
1487
1488         closure_put(&c->caching);
1489
1490         bch_notify_cache_set_read_write(c);
1491
1492         BUG_ON(!list_empty(&journal));
1493         return NULL;
1494 err:
1495         switch (ret) {
1496         case BCH_FSCK_ERRORS_NOT_FIXED:
1497                 bch_err(c, "filesystem contains errors: please report this to the developers");
1498                 pr_cont("mount with -o fix_errors to repair");
1499                 err = "fsck error";
1500                 break;
1501         case BCH_FSCK_REPAIR_UNIMPLEMENTED:
1502                 bch_err(c, "filesystem contains errors: please report this to the developers");
1503                 pr_cont("repair unimplemented: inform the developers so that it can be added");
1504                 err = "fsck error";
1505                 break;
1506         case BCH_FSCK_REPAIR_IMPOSSIBLE:
1507                 bch_err(c, "filesystem contains errors, but repair impossible");
1508                 err = "fsck error";
1509                 break;
1510         case BCH_FSCK_UNKNOWN_VERSION:
1511                 err = "unknown metadata version";;
1512                 break;
1513         case -ENOMEM:
1514                 err = "cannot allocate memory";
1515                 break;
1516         case -EIO:
1517                 err = "IO error";
1518                 break;
1519         }
1520
1521         BUG_ON(!err);
1522
1523         bch_journal_entries_free(&journal);
1524         set_bit(CACHE_SET_ERROR, &c->flags);
1525         bch_cache_set_unregister(c);
1526         closure_put(&c->caching);
1527         return err;
1528 }
1529
1530 static const char *can_add_cache(struct cache_sb *sb,
1531                                  struct cache_set *c)
1532 {
1533         if (le16_to_cpu(sb->block_size) != c->sb.block_size)
1534                 return "mismatched block size";
1535
1536         if (le16_to_cpu(sb->members[sb->nr_this_dev].bucket_size) <
1537             CACHE_SET_BTREE_NODE_SIZE(&c->disk_sb))
1538                 return "new cache bucket_size is too small";
1539
1540         return NULL;
1541 }
1542
1543 static const char *can_attach_cache(struct cache_sb *sb, struct cache_set *c)
1544 {
1545         const char *err;
1546         bool match;
1547
1548         err = can_add_cache(sb, c);
1549         if (err)
1550                 return err;
1551
1552         /*
1553          * When attaching an existing device, the cache set superblock must
1554          * already contain member_info with a matching UUID
1555          */
1556         match = le64_to_cpu(sb->seq) <= le64_to_cpu(c->disk_sb.seq)
1557                 ? (sb->nr_this_dev < c->disk_sb.nr_in_set &&
1558                    !memcmp(&c->disk_mi[sb->nr_this_dev].uuid,
1559                            &sb->disk_uuid, sizeof(uuid_le)))
1560                 : (sb->nr_this_dev < sb->nr_in_set &&
1561                    !memcmp(&sb->members[sb->nr_this_dev].uuid,
1562                            &sb->disk_uuid, sizeof(uuid_le)));
1563
1564         if (!match)
1565                 return "cache sb does not match set";
1566
1567         return NULL;
1568 }
1569
1570 /* Cache device */
1571
1572 bool bch_cache_read_only(struct cache *ca)
1573 {
1574         struct cache_set *c = ca->set;
1575         char buf[BDEVNAME_SIZE];
1576
1577         bdevname(ca->disk_sb.bdev, buf);
1578
1579         lockdep_assert_held(&bch_register_lock);
1580
1581         if (ca->mi.state != CACHE_ACTIVE)
1582                 return false;
1583
1584         if (!bch_cache_may_remove(ca)) {
1585                 bch_err(c, "required member %s going RO, forcing fs RO", buf);
1586                 bch_cache_set_read_only_sync(c);
1587         }
1588
1589         trace_bcache_cache_read_only(ca);
1590
1591         bch_moving_gc_stop(ca);
1592
1593         /*
1594          * This stops new data writes (e.g. to existing open data
1595          * buckets) and then waits for all existing writes to
1596          * complete.
1597          */
1598         bch_cache_allocator_stop(ca);
1599
1600         bch_cache_group_remove_cache(&c->journal.devs, ca);
1601
1602         /*
1603          * Device data write barrier -- no non-meta-data writes should
1604          * occur after this point.  However, writes to btree buckets,
1605          * journal buckets, and the superblock can still occur.
1606          */
1607         trace_bcache_cache_read_only_done(ca);
1608
1609         bch_notice(c, "%s read only", bdevname(ca->disk_sb.bdev, buf));
1610         bch_notify_cache_read_only(ca);
1611
1612         SET_CACHE_STATE(&c->disk_mi[ca->sb.nr_this_dev], CACHE_RO);
1613         bcache_write_super(c);
1614         return true;
1615 }
1616
1617 static const char *__bch_cache_read_write(struct cache_set *c, struct cache *ca)
1618 {
1619         lockdep_assert_held(&bch_register_lock);
1620
1621         if (ca->mi.state == CACHE_ACTIVE)
1622                 return NULL;
1623
1624         if (test_bit(CACHE_DEV_REMOVING, &ca->flags))
1625                 return "removing";
1626
1627         trace_bcache_cache_read_write(ca);
1628
1629         if (bch_cache_allocator_start(ca))
1630                 return "error starting allocator thread";
1631
1632         if (bch_moving_gc_thread_start(ca))
1633                 return "error starting moving GC thread";
1634
1635         bch_cache_group_add_cache(&c->journal.devs, ca);
1636
1637         wake_up_process(c->tiering_read);
1638
1639         bch_notify_cache_read_write(ca);
1640         trace_bcache_cache_read_write_done(ca);
1641
1642         return NULL;
1643 }
1644
1645 const char *bch_cache_read_write(struct cache *ca)
1646 {
1647         struct cache_set *c = ca->set;
1648         const char *err;
1649
1650         err = __bch_cache_read_write(c, ca);
1651         if (err)
1652                 return err;
1653
1654         SET_CACHE_STATE(&c->disk_mi[ca->sb.nr_this_dev], CACHE_ACTIVE);
1655         bcache_write_super(c);
1656
1657         return NULL;
1658 }
1659
1660 /*
1661  * bch_cache_stop has already returned, so we no longer hold the register
1662  * lock at the point this is called.
1663  */
1664
1665 void bch_cache_release(struct kobject *kobj)
1666 {
1667         struct cache *ca = container_of(kobj, struct cache, kobj);
1668
1669         percpu_ref_exit(&ca->ref);
1670         kfree(ca);
1671 }
1672
1673 static void bch_cache_free_work(struct work_struct *work)
1674 {
1675         struct cache *ca = container_of(work, struct cache, free_work);
1676         struct cache_set *c = ca->set;
1677         unsigned i;
1678
1679         cancel_work_sync(&ca->io_error_work);
1680
1681         if (c && c->kobj.state_in_sysfs) {
1682                 char buf[12];
1683
1684                 sprintf(buf, "cache%u", ca->sb.nr_this_dev);
1685                 sysfs_remove_link(&c->kobj, buf);
1686         }
1687
1688         if (ca->kobj.state_in_sysfs)
1689                 kobject_del(&ca->kobj);
1690
1691         free_super(&ca->disk_sb);
1692
1693         /*
1694          * bch_cache_stop can be called in the middle of initialization
1695          * of the struct cache object.
1696          * As such, not all the sub-structures may be initialized.
1697          * However, they were zeroed when the object was allocated.
1698          */
1699
1700         free_percpu(ca->sectors_written);
1701         bioset_exit(&ca->replica_set);
1702         free_percpu(ca->bucket_stats_percpu);
1703         kfree(ca->journal.bucket_seq);
1704         free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
1705         kfree(ca->prio_buckets);
1706         kfree(ca->bio_prio);
1707         kfree(ca->journal.bio);
1708         vfree(ca->buckets);
1709         vfree(ca->oldest_gens);
1710         free_heap(&ca->heap);
1711         free_fifo(&ca->free_inc);
1712
1713         for (i = 0; i < RESERVE_NR; i++)
1714                 free_fifo(&ca->free[i]);
1715
1716         kobject_put(&ca->kobj);
1717
1718         if (c)
1719                 kobject_put(&c->kobj);
1720 }
1721
1722 static void bch_cache_percpu_ref_release(struct percpu_ref *ref)
1723 {
1724         struct cache *ca = container_of(ref, struct cache, ref);
1725
1726         schedule_work(&ca->free_work);
1727 }
1728
1729 static void bch_cache_free_rcu(struct rcu_head *rcu)
1730 {
1731         struct cache *ca = container_of(rcu, struct cache, free_rcu);
1732
1733         /*
1734          * This decrements the ref count to ca, and once the ref count
1735          * is 0 (outstanding bios to the ca also incremented it and
1736          * decrement it on completion/error), bch_cache_percpu_ref_release
1737          * is called, and that eventually results in bch_cache_free_work
1738          * being called, which in turn results in bch_cache_release being
1739          * called.
1740          *
1741          * In particular, these functions won't be called until there are no
1742          * bios outstanding (the per-cpu ref counts are all 0), so it
1743          * is safe to remove the actual sysfs device at that point,
1744          * and that can indicate success to the user.
1745          */
1746
1747         percpu_ref_kill(&ca->ref);
1748 }
1749
1750 static void bch_cache_stop(struct cache *ca)
1751 {
1752         struct cache_set *c = ca->set;
1753
1754         lockdep_assert_held(&bch_register_lock);
1755
1756         if (c) {
1757                 BUG_ON(rcu_access_pointer(c->cache[ca->sb.nr_this_dev]) != ca);
1758                 rcu_assign_pointer(c->cache[ca->sb.nr_this_dev], NULL);
1759         }
1760
1761         call_rcu(&ca->free_rcu, bch_cache_free_rcu);
1762 }
1763
1764 static void bch_cache_remove_work(struct work_struct *work)
1765 {
1766         struct cache *ca = container_of(work, struct cache, remove_work);
1767         struct cache_set *c = ca->set;
1768         char name[BDEVNAME_SIZE];
1769         bool force = test_bit(CACHE_DEV_FORCE_REMOVE, &ca->flags);
1770         unsigned dev = ca->sb.nr_this_dev;
1771
1772         bdevname(ca->disk_sb.bdev, name);
1773
1774         /*
1775          * Device should already be RO, now migrate data off:
1776          *
1777          * XXX: locking is sketchy, bch_cache_read_write() has to check
1778          * CACHE_DEV_REMOVING bit
1779          */
1780         if (!ca->mi.has_data) {
1781                 /* Nothing to do: */
1782         } else if (!bch_move_data_off_device(ca)) {
1783                 lockdep_assert_held(&bch_register_lock);
1784                 SET_CACHE_HAS_DATA(&c->disk_mi[ca->sb.nr_this_dev], false);
1785
1786                 bcache_write_super(c);
1787         } else if (force) {
1788                 bch_flag_data_bad(ca);
1789
1790                 lockdep_assert_held(&bch_register_lock);
1791                 SET_CACHE_HAS_DATA(&c->disk_mi[ca->sb.nr_this_dev], false);
1792
1793                 bcache_write_super(c);
1794         } else {
1795                 bch_err(c, "Remove of %s failed, unable to migrate data off",
1796                         name);
1797                 clear_bit(CACHE_DEV_REMOVING, &ca->flags);
1798                 return;
1799         }
1800
1801         /* Now metadata: */
1802
1803         if (!ca->mi.has_metadata) {
1804                 /* Nothing to do: */
1805         } else if (!bch_move_meta_data_off_device(ca)) {
1806                 lockdep_assert_held(&bch_register_lock);
1807                 SET_CACHE_HAS_METADATA(&c->disk_mi[ca->sb.nr_this_dev], false);
1808
1809                 bcache_write_super(c);
1810         } else {
1811                 bch_err(c, "Remove of %s failed, unable to migrate metadata off",
1812                         name);
1813                 clear_bit(CACHE_DEV_REMOVING, &ca->flags);
1814                 return;
1815         }
1816
1817         /*
1818          * Ok, really doing the remove:
1819          * Drop device's prio pointer before removing it from superblock:
1820          */
1821         bch_notify_cache_removed(ca);
1822
1823         spin_lock(&c->journal.lock);
1824         c->journal.prio_buckets[dev] = 0;
1825         spin_unlock(&c->journal.lock);
1826
1827         bch_journal_meta(&c->journal);
1828
1829         /*
1830          * Stop device before removing it from the cache set's list of devices -
1831          * and get our own ref on cache set since ca is going away:
1832          */
1833         closure_get(&c->cl);
1834
1835         mutex_lock(&bch_register_lock);
1836         bch_cache_stop(ca);
1837
1838         /*
1839          * RCU barrier between dropping between c->cache and dropping from
1840          * member info:
1841          */
1842         synchronize_rcu();
1843
1844         lockdep_assert_held(&bch_register_lock);
1845
1846         /*
1847          * Free this device's slot in the cache_member array - all pointers to
1848          * this device must be gone:
1849          */
1850         memset(&c->disk_mi[dev].uuid, 0, sizeof(c->disk_mi[dev].uuid));
1851
1852         bcache_write_super(c);
1853         mutex_unlock(&bch_register_lock);
1854
1855         closure_put(&c->cl);
1856 }
1857
1858 bool bch_cache_remove(struct cache *ca, bool force)
1859 {
1860         mutex_lock(&bch_register_lock);
1861
1862         if (test_bit(CACHE_DEV_REMOVING, &ca->flags))
1863                 return false;
1864
1865         if (!bch_cache_may_remove(ca)) {
1866                 bch_err(ca->set, "Can't remove last device in tier %u",
1867                         ca->mi.tier);
1868                 bch_notify_cache_remove_failed(ca);
1869                 return false;
1870         }
1871
1872         /* First, go RO before we try to migrate data off: */
1873         bch_cache_read_only(ca);
1874
1875         if (force)
1876                 set_bit(CACHE_DEV_FORCE_REMOVE, &ca->flags);
1877         set_bit(CACHE_DEV_REMOVING, &ca->flags);
1878         bch_notify_cache_removing(ca);
1879
1880         mutex_unlock(&bch_register_lock);
1881
1882         /* Migrate the data and finish removal asynchronously: */
1883
1884         queue_work(system_long_wq, &ca->remove_work);
1885         return true;
1886 }
1887
1888 static int bch_cache_online(struct cache *ca)
1889 {
1890         char buf[12];
1891
1892         lockdep_assert_held(&bch_register_lock);
1893
1894         sprintf(buf, "cache%u", ca->sb.nr_this_dev);
1895
1896         if (kobject_add(&ca->kobj,
1897                         &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj,
1898                         "bcache") ||
1899             sysfs_create_link(&ca->kobj, &ca->set->kobj, "set") ||
1900             sysfs_create_link(&ca->set->kobj, &ca->kobj, buf))
1901                 return -1;
1902
1903         return 0;
1904 }
1905
1906 static const char *cache_alloc(struct bcache_superblock *sb,
1907                                struct cache_set *c,
1908                                struct cache **ret)
1909 {
1910         size_t reserve_none, movinggc_reserve, free_inc_reserve, total_reserve;
1911         size_t heap_size;
1912         unsigned i, journal_entry_pages;
1913         const char *err = "cannot allocate memory";
1914         struct cache *ca;
1915
1916         if (c->sb.nr_in_set == 1)
1917                 bdevname(sb->bdev, c->name);
1918
1919         if (cache_set_init_fault("cache_alloc"))
1920                 return err;
1921
1922         ca = kzalloc(sizeof(*ca), GFP_KERNEL);
1923         if (!ca)
1924                 return err;
1925
1926         if (percpu_ref_init(&ca->ref, bch_cache_percpu_ref_release,
1927                             0, GFP_KERNEL)) {
1928                 kfree(ca);
1929                 return err;
1930         }
1931
1932         kobject_init(&ca->kobj, &bch_cache_ktype);
1933
1934         spin_lock_init(&ca->self.lock);
1935         ca->self.nr_devices = 1;
1936         rcu_assign_pointer(ca->self.d[0].dev, ca);
1937         ca->sb.nr_this_dev = sb->sb->nr_this_dev;
1938
1939         INIT_WORK(&ca->free_work, bch_cache_free_work);
1940         INIT_WORK(&ca->remove_work, bch_cache_remove_work);
1941         spin_lock_init(&ca->freelist_lock);
1942         spin_lock_init(&ca->prio_buckets_lock);
1943         mutex_init(&ca->heap_lock);
1944         bch_moving_init_cache(ca);
1945
1946         ca->disk_sb = *sb;
1947         ca->disk_sb.bdev->bd_holder = ca;
1948         memset(sb, 0, sizeof(*sb));
1949
1950         INIT_WORK(&ca->io_error_work, bch_nonfatal_io_error_work);
1951
1952         err = "dynamic fault";
1953         if (cache_set_init_fault("cache_alloc"))
1954                 goto err;
1955
1956         ca->mi = cache_mi_to_cpu_mi(ca->disk_sb.sb->members +
1957                                     ca->disk_sb.sb->nr_this_dev);
1958         ca->bucket_bits = ilog2(ca->mi.bucket_size);
1959
1960         /* XXX: tune these */
1961         movinggc_reserve = max_t(size_t, 16, ca->mi.nbuckets >> 7);
1962         reserve_none = max_t(size_t, 4, ca->mi.nbuckets >> 9);
1963         /*
1964          * free_inc must be smaller than the copygc reserve: if it was bigger,
1965          * one copygc iteration might not make enough buckets available to fill
1966          * up free_inc and allow the allocator to make forward progress
1967          */
1968         free_inc_reserve = movinggc_reserve / 2;
1969         heap_size = movinggc_reserve * 8;
1970
1971         journal_entry_pages =
1972                 DIV_ROUND_UP(1U << CACHE_SET_JOURNAL_ENTRY_SIZE(ca->disk_sb.sb),
1973                              PAGE_SECTORS);
1974
1975         if (!init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
1976             !init_fifo(&ca->free[RESERVE_BTREE], BTREE_NODE_RESERVE, GFP_KERNEL) ||
1977             !init_fifo(&ca->free[RESERVE_MOVINGGC],
1978                        movinggc_reserve, GFP_KERNEL) ||
1979             !init_fifo(&ca->free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
1980             !init_fifo(&ca->free_inc,   free_inc_reserve, GFP_KERNEL) ||
1981             !init_heap(&ca->heap,       heap_size, GFP_KERNEL) ||
1982             !(ca->oldest_gens   = vzalloc(sizeof(u8) *
1983                                           ca->mi.nbuckets)) ||
1984             !(ca->buckets       = vzalloc(sizeof(struct bucket) *
1985                                           ca->mi.nbuckets)) ||
1986             !(ca->prio_buckets  = kzalloc(sizeof(uint64_t) * prio_buckets(ca) *
1987                                           2, GFP_KERNEL)) ||
1988             !(ca->disk_buckets  = alloc_bucket_pages(GFP_KERNEL, ca)) ||
1989             !(ca->bucket_stats_percpu = alloc_percpu(struct bucket_stats_cache)) ||
1990             !(ca->journal.bucket_seq = kcalloc(bch_nr_journal_buckets(ca->disk_sb.sb),
1991                                                sizeof(u64), GFP_KERNEL)) ||
1992             !(ca->journal.bio = bio_kmalloc(GFP_KERNEL, journal_entry_pages)) ||
1993             !(ca->bio_prio = bio_kmalloc(GFP_KERNEL, bucket_pages(ca))) ||
1994             bioset_init(&ca->replica_set, 4,
1995                         offsetof(struct bch_write_bio, bio)) ||
1996             !(ca->sectors_written = alloc_percpu(*ca->sectors_written)))
1997                 goto err;
1998
1999         ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
2000
2001         total_reserve = ca->free_inc.size;
2002         for (i = 0; i < RESERVE_NR; i++)
2003                 total_reserve += ca->free[i].size;
2004         pr_debug("%zu buckets reserved", total_reserve);
2005
2006         ca->copygc_write_point.group = &ca->self;
2007         ca->tiering_write_point.group = &ca->self;
2008
2009         kobject_get(&c->kobj);
2010         ca->set = c;
2011
2012         kobject_get(&ca->kobj);
2013         rcu_assign_pointer(c->cache[ca->sb.nr_this_dev], ca);
2014
2015         if (le64_to_cpu(ca->disk_sb.sb->seq) > le64_to_cpu(c->disk_sb.seq))
2016                 cache_sb_to_cache_set(c, ca->disk_sb.sb);
2017
2018         /*
2019          * Increase journal write timeout if flushes to this device are
2020          * expensive:
2021          */
2022         if (!blk_queue_nonrot(bdev_get_queue(ca->disk_sb.bdev)) &&
2023             journal_flushes_device(ca))
2024                 c->journal.write_delay_ms =
2025                         max(c->journal.write_delay_ms, 1000U);
2026
2027         err = "error creating kobject";
2028         if (c->kobj.state_in_sysfs &&
2029             bch_cache_online(ca))
2030                 goto err;
2031
2032         if (ret)
2033                 *ret = ca;
2034         else
2035                 kobject_put(&ca->kobj);
2036         return NULL;
2037 err:
2038         bch_cache_stop(ca);
2039         return err;
2040 }
2041
2042 static struct cache_set *cache_set_lookup(uuid_le uuid)
2043 {
2044         struct cache_set *c;
2045
2046         lockdep_assert_held(&bch_register_lock);
2047
2048         list_for_each_entry(c, &bch_cache_sets, list)
2049                 if (!memcmp(&c->disk_sb.set_uuid, &uuid, sizeof(uuid_le)))
2050                         return c;
2051
2052         return NULL;
2053 }
2054
2055 static const char *register_cache(struct bcache_superblock *sb,
2056                                   struct cache_set_opts opts)
2057 {
2058         char name[BDEVNAME_SIZE];
2059         const char *err = "cannot allocate memory";
2060         struct cache_set *c;
2061
2062         err = validate_cache_super(sb);
2063         if (err)
2064                 return err;
2065
2066         bdevname(sb->bdev, name);
2067
2068         c = cache_set_lookup(sb->sb->set_uuid);
2069         if (c) {
2070                 if ((err = (can_attach_cache(sb->sb, c) ?:
2071                             cache_alloc(sb, c, NULL))))
2072                         return err;
2073
2074                 if (cache_set_nr_online_devices(c) == cache_set_nr_devices(c)) {
2075                         err = run_cache_set(c);
2076                         if (err)
2077                                 return err;
2078                 }
2079                 goto out;
2080         }
2081
2082         c = bch_cache_set_alloc(sb->sb, opts);
2083         if (!c)
2084                 return err;
2085
2086         err = cache_alloc(sb, c, NULL);
2087         if (err)
2088                 goto err_stop;
2089
2090         if (cache_set_nr_online_devices(c) == cache_set_nr_devices(c)) {
2091                 err = run_cache_set(c);
2092                 if (err)
2093                         goto err_stop;
2094         }
2095
2096         err = "error creating kobject";
2097         if (bch_cache_set_online(c))
2098                 goto err_stop;
2099 out:
2100
2101         bch_info(c, "started");
2102         return NULL;
2103 err_stop:
2104         bch_cache_set_stop(c);
2105         return err;
2106 }
2107
2108 int bch_cache_set_add_cache(struct cache_set *c, const char *path)
2109 {
2110         struct bcache_superblock sb;
2111         const char *err;
2112         struct cache *ca;
2113         struct cache_member *new_mi = NULL;
2114         struct cache_member mi;
2115         unsigned nr_this_dev, nr_in_set, u64s;
2116         int ret = -EINVAL;
2117
2118         mutex_lock(&bch_register_lock);
2119
2120         err = read_super(&sb, path);
2121         if (err)
2122                 goto err_unlock;
2123
2124         err = validate_cache_super(&sb);
2125         if (err)
2126                 goto err_unlock;
2127
2128         err = can_add_cache(sb.sb, c);
2129         if (err)
2130                 goto err_unlock;
2131
2132         /*
2133          * Preserve the old cache member information (esp. tier)
2134          * before we start bashing the disk stuff.
2135          */
2136         mi = sb.sb->members[sb.sb->nr_this_dev];
2137         mi.last_mount = cpu_to_le64(ktime_get_seconds());
2138
2139         down_read(&c->gc_lock);
2140
2141         if (dynamic_fault("bcache:add:no_slot"))
2142                 goto no_slot;
2143
2144         if (test_bit(CACHE_SET_GC_FAILURE, &c->flags))
2145                 goto no_slot;
2146
2147         for (nr_this_dev = 0; nr_this_dev < MAX_CACHES_PER_SET; nr_this_dev++)
2148                 if (nr_this_dev >= c->sb.nr_in_set ||
2149                     bch_is_zero(c->disk_mi[nr_this_dev].uuid.b,
2150                                  sizeof(uuid_le)))
2151                         goto have_slot;
2152 no_slot:
2153         up_read(&c->gc_lock);
2154
2155         err = "no slots available in superblock";
2156         ret = -ENOSPC;
2157         goto err_unlock;
2158
2159 have_slot:
2160         nr_in_set = max_t(unsigned, nr_this_dev + 1, c->sb.nr_in_set);
2161         up_read(&c->gc_lock);
2162
2163         u64s = nr_in_set * (sizeof(struct cache_member) / sizeof(u64));
2164         err = "no space in superblock for member info";
2165         if (bch_super_realloc(&sb, u64s))
2166                 goto err_unlock;
2167
2168         new_mi = dynamic_fault("bcache:add:member_info_realloc")
2169                 ? NULL
2170                 : kmalloc(sizeof(struct cache_member) * nr_in_set,
2171                           GFP_KERNEL);
2172         if (!new_mi) {
2173                 err = "cannot allocate memory";
2174                 ret = -ENOMEM;
2175                 goto err_unlock;
2176         }
2177
2178         memcpy(new_mi, c->disk_mi,
2179                sizeof(struct cache_member) * nr_in_set);
2180         new_mi[nr_this_dev] = mi;
2181
2182         sb.sb->nr_this_dev      = nr_this_dev;
2183         sb.sb->nr_in_set        = nr_in_set;
2184         sb.sb->u64s             = cpu_to_le16(u64s);
2185         memcpy(sb.sb->members, new_mi,
2186                sizeof(struct cache_member) * nr_in_set);
2187
2188         if (cache_set_mi_update(c, new_mi, nr_in_set)) {
2189                 err = "cannot allocate memory";
2190                 ret = -ENOMEM;
2191                 goto err_unlock;
2192         }
2193
2194         /* commit new member info */
2195         swap(c->disk_mi, new_mi);
2196         kfree(new_mi);
2197         new_mi = NULL;
2198         c->disk_sb.nr_in_set = nr_in_set;
2199         c->sb.nr_in_set = nr_in_set;
2200
2201         err = cache_alloc(&sb, c, &ca);
2202         if (err)
2203                 goto err_unlock;
2204
2205         bcache_write_super(c);
2206
2207         err = "journal alloc failed";
2208         if (bch_cache_journal_alloc(ca))
2209                 goto err_put;
2210
2211         bch_notify_cache_added(ca);
2212
2213         if (ca->mi.state == CACHE_ACTIVE) {
2214                 err = __bch_cache_read_write(c, ca);
2215                 if (err)
2216                         goto err_put;
2217         }
2218
2219         kobject_put(&ca->kobj);
2220         mutex_unlock(&bch_register_lock);
2221         return 0;
2222 err_put:
2223         bch_cache_stop(ca);
2224 err_unlock:
2225         kfree(new_mi);
2226         free_super(&sb);
2227         mutex_unlock(&bch_register_lock);
2228
2229         bch_err(c, "Unable to add device: %s", err);
2230         return ret ?: -EINVAL;
2231 }
2232
2233 const char *bch_register_cache_set(char * const *devices, unsigned nr_devices,
2234                                    struct cache_set_opts opts,
2235                                    struct cache_set **ret)
2236 {
2237         const char *err;
2238         struct cache_set *c = NULL;
2239         struct bcache_superblock *sb;
2240         uuid_le uuid;
2241         unsigned i;
2242
2243         memset(&uuid, 0, sizeof(uuid_le));
2244
2245         if (!nr_devices)
2246                 return "need at least one device";
2247
2248         if (!try_module_get(THIS_MODULE))
2249                 return "module unloading";
2250
2251         err = "cannot allocate memory";
2252         sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL);
2253         if (!sb)
2254                 goto err;
2255
2256         /*
2257          * read_super() needs to happen under register_lock, so that the
2258          * exclusive open is atomic with adding the new cache set to the list of
2259          * cache sets:
2260          */
2261         mutex_lock(&bch_register_lock);
2262
2263         for (i = 0; i < nr_devices; i++) {
2264                 err = read_super(&sb[i], devices[i]);
2265                 if (err)
2266                         goto err_unlock;
2267
2268                 err = "attempting to register backing device";
2269                 if (__SB_IS_BDEV(le64_to_cpu(sb[i].sb->version)))
2270                         goto err_unlock;
2271
2272                 err = validate_cache_super(&sb[i]);
2273                 if (err)
2274                         goto err_unlock;
2275         }
2276
2277         err = "cache set already registered";
2278         if (cache_set_lookup(sb->sb->set_uuid))
2279                 goto err_unlock;
2280
2281         err = "cannot allocate memory";
2282         c = bch_cache_set_alloc(sb[0].sb, opts);
2283         if (!c)
2284                 goto err_unlock;
2285
2286         for (i = 0; i < nr_devices; i++) {
2287                 err = cache_alloc(&sb[i], c, NULL);
2288                 if (err)
2289                         goto err_unlock;
2290         }
2291
2292         err = "insufficient devices";
2293         if (cache_set_nr_online_devices(c) != cache_set_nr_devices(c))
2294                 goto err_unlock;
2295
2296         err = run_cache_set(c);
2297         if (err)
2298                 goto err_unlock;
2299
2300         err = "error creating kobject";
2301         if (bch_cache_set_online(c))
2302                 goto err_unlock;
2303
2304         if (ret) {
2305                 closure_get(&c->cl);
2306                 *ret = c;
2307         }
2308
2309         mutex_unlock(&bch_register_lock);
2310
2311         err = NULL;
2312 out:
2313         kfree(sb);
2314         module_put(THIS_MODULE);
2315         return err;
2316 err_unlock:
2317         if (c)
2318                 bch_cache_set_stop(c);
2319         mutex_unlock(&bch_register_lock);
2320 err:
2321         for (i = 0; i < nr_devices; i++)
2322                 free_super(&sb[i]);
2323         goto out;
2324 }
2325
2326 const char *bch_register_one(const char *path)
2327 {
2328         struct bcache_superblock sb;
2329         const char *err;
2330
2331         mutex_lock(&bch_register_lock);
2332
2333         err = read_super(&sb, path);
2334         if (err)
2335                 goto err;
2336
2337         if (__SB_IS_BDEV(le64_to_cpu(sb.sb->version)))
2338                 err = bch_backing_dev_register(&sb);
2339         else
2340                 err = register_cache(&sb, cache_set_opts_empty());
2341
2342         free_super(&sb);
2343 err:
2344         mutex_unlock(&bch_register_lock);
2345         return err;
2346 }
2347
2348 /* Global interfaces/init */
2349
2350 #define kobj_attribute_write(n, fn)                                     \
2351         static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn)
2352
2353 #define kobj_attribute_rw(n, show, store)                               \
2354         static struct kobj_attribute ksysfs_##n =                       \
2355                 __ATTR(n, S_IWUSR|S_IRUSR, show, store)
2356
2357 static ssize_t register_bcache(struct kobject *, struct kobj_attribute *,
2358                                const char *, size_t);
2359
2360 kobj_attribute_write(register,          register_bcache);
2361 kobj_attribute_write(register_quiet,    register_bcache);
2362
2363 static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
2364                                const char *buffer, size_t size)
2365 {
2366         ssize_t ret = -EINVAL;
2367         const char *err = "cannot allocate memory";
2368         char *path = NULL;
2369
2370         if (!try_module_get(THIS_MODULE))
2371                 return -EBUSY;
2372
2373         if (!(path = kstrndup(skip_spaces(buffer), size, GFP_KERNEL)))
2374                 goto err;
2375
2376         err = bch_register_one(strim(path));
2377         if (err)
2378                 goto err;
2379
2380         ret = size;
2381 out:
2382         kfree(path);
2383         module_put(THIS_MODULE);
2384         return ret;
2385 err:
2386         pr_err("error opening %s: %s", path, err);
2387         goto out;
2388 }
2389
2390 static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
2391 {
2392         if (code == SYS_DOWN ||
2393             code == SYS_HALT ||
2394             code == SYS_POWER_OFF) {
2395                 struct cache_set *c;
2396
2397                 mutex_lock(&bch_register_lock);
2398
2399                 if (!list_empty(&bch_cache_sets))
2400                         pr_info("Setting all devices read only:");
2401
2402                 list_for_each_entry(c, &bch_cache_sets, list)
2403                         bch_cache_set_read_only(c);
2404
2405                 list_for_each_entry(c, &bch_cache_sets, list)
2406                         bch_cache_set_read_only_sync(c);
2407
2408                 mutex_unlock(&bch_register_lock);
2409         }
2410
2411         return NOTIFY_DONE;
2412 }
2413
2414 static struct notifier_block reboot = {
2415         .notifier_call  = bcache_reboot,
2416         .priority       = INT_MAX, /* before any real devices */
2417 };
2418
2419 static ssize_t reboot_test(struct kobject *k, struct kobj_attribute *attr,
2420                            const char *buffer, size_t size)
2421 {
2422         bcache_reboot(NULL, SYS_DOWN, NULL);
2423         return size;
2424 }
2425
2426 kobj_attribute_write(reboot,            reboot_test);
2427
2428 static void bcache_exit(void)
2429 {
2430         bch_debug_exit();
2431         bch_fs_exit();
2432         bch_blockdev_exit();
2433         if (bcache_kset)
2434                 kset_unregister(bcache_kset);
2435         if (bcache_io_wq)
2436                 destroy_workqueue(bcache_io_wq);
2437         if (!IS_ERR_OR_NULL(bch_chardev_class))
2438                 device_destroy(bch_chardev_class,
2439                                MKDEV(bch_chardev_major, 0));
2440         if (!IS_ERR_OR_NULL(bch_chardev_class))
2441                 class_destroy(bch_chardev_class);
2442         if (bch_chardev_major > 0)
2443                 unregister_chrdev(bch_chardev_major, "bcache");
2444         if (!IS_ERR_OR_NULL(bch_sha1))
2445                 crypto_free_shash(bch_sha1);
2446         unregister_reboot_notifier(&reboot);
2447 }
2448
2449 static int __init bcache_init(void)
2450 {
2451         static const struct attribute *files[] = {
2452                 &ksysfs_register.attr,
2453                 &ksysfs_register_quiet.attr,
2454                 &ksysfs_reboot.attr,
2455                 NULL
2456         };
2457
2458         mutex_init(&bch_register_lock);
2459         register_reboot_notifier(&reboot);
2460         closure_debug_init();
2461         bkey_pack_test();
2462
2463         bch_sha1 = crypto_alloc_shash("sha1", 0, 0);
2464         if (IS_ERR(bch_sha1))
2465                 goto err;
2466
2467         bch_chardev_major = register_chrdev(0, "bcache-ctl", &bch_chardev_fops);
2468         if (bch_chardev_major < 0)
2469                 goto err;
2470
2471         bch_chardev_class = class_create(THIS_MODULE, "bcache");
2472         if (IS_ERR(bch_chardev_class))
2473                 goto err;
2474
2475         bch_chardev = device_create(bch_chardev_class, NULL,
2476                                     MKDEV(bch_chardev_major, 255),
2477                                     NULL, "bcache-ctl");
2478         if (IS_ERR(bch_chardev))
2479                 goto err;
2480
2481         if (!(bcache_io_wq = create_freezable_workqueue("bcache_io")) ||
2482             !(bcache_kset = kset_create_and_add("bcache", NULL, fs_kobj)) ||
2483             sysfs_create_files(&bcache_kset->kobj, files) ||
2484             bch_blockdev_init() ||
2485             bch_fs_init() ||
2486             bch_debug_init())
2487                 goto err;
2488
2489         return 0;
2490 err:
2491         bcache_exit();
2492         return -ENOMEM;
2493 }
2494
2495 #define BCH_DEBUG_PARAM(name, description)                      \
2496         bool bch_##name;                                        \
2497         module_param_named(name, bch_##name, bool, 0644);       \
2498         MODULE_PARM_DESC(name, description);
2499 BCH_DEBUG_PARAMS()
2500 #undef BCH_DEBUG_PARAM
2501
2502 module_exit(bcache_exit);
2503 module_init(bcache_init);