New on disk format - encryption
author    Kent Overstreet <kent.overstreet@gmail.com>
          Tue, 4 Oct 2016 03:22:17 +0000 (19:22 -0800)
committer Kent Overstreet <kent.overstreet@gmail.com>
          Tue, 28 Feb 2017 12:05:38 +0000 (03:05 -0900)
99 files changed:
.bcache_revision
Makefile
bcache-userspace-shim.c
bcache.c
cmd_debug.c
cmd_device.c
cmd_format.c
cmd_key.c [new file with mode: 0644]
cmds.h
crypto.c [new file with mode: 0644]
crypto.h [new file with mode: 0644]
include/crypto/algapi.h
include/crypto/chacha20.h
include/crypto/hash.h
include/crypto/internal/hash.h
include/crypto/poly1305.h
include/crypto/sha.h [deleted file]
include/crypto/sha1_base.h [deleted file]
include/keys/user-type.h [new file with mode: 0644]
include/linux/bcache.h
include/linux/crypto.h
include/linux/cryptohash.h [deleted file]
include/linux/kernel.h
include/linux/key.h [new file with mode: 0644]
include/linux/mempool.h
include/linux/page.h
include/linux/scatterlist.h [new file with mode: 0644]
include/linux/time64.h
include/trace/events/bcache.h
libbcache.c
libbcache.h
libbcache/acl.c
libbcache/alloc.c
libbcache/alloc_types.h
libbcache/bcache.h
libbcache/bkey.c
libbcache/bkey.h
libbcache/blockdev.c
libbcache/bset.c
libbcache/bset.h
libbcache/btree_cache.c
libbcache/btree_gc.c
libbcache/btree_gc.h
libbcache/btree_io.c
libbcache/btree_types.h
libbcache/btree_update.c
libbcache/btree_update.h
libbcache/buckets.c
libbcache/buckets.h
libbcache/chardev.c
libbcache/checksum.c
libbcache/checksum.h
libbcache/compress.c
libbcache/compress.h
libbcache/debug.c
libbcache/dirent.c
libbcache/extents.c
libbcache/extents.h
libbcache/fs-gc.c
libbcache/fs-io.c
libbcache/fs.c
libbcache/fs.h
libbcache/inode.c
libbcache/inode.h
libbcache/io.c
libbcache/io.h
libbcache/io_types.h
libbcache/journal.c
libbcache/journal.h
libbcache/journal_types.h
libbcache/migrate.c
libbcache/move.c
libbcache/movinggc.c
libbcache/notify.c
libbcache/opts.c
libbcache/opts.h
libbcache/siphash.c
libbcache/str_hash.h
libbcache/super-io.c [new file with mode: 0644]
libbcache/super-io.h [new file with mode: 0644]
libbcache/super.c
libbcache/super.h
libbcache/super_types.h
libbcache/sysfs.c
libbcache/tier.c
libbcache/vstructs.h [new file with mode: 0644]
libbcache/xattr.c
linux/crypto/algapi.c [deleted file]
linux/crypto/api.c
linux/crypto/blkcipher.c [new file with mode: 0644]
linux/crypto/chacha20_generic.c [new file with mode: 0644]
linux/crypto/cipher.c [deleted file]
linux/crypto/internal.h
linux/crypto/poly1305_generic.c [new file with mode: 0644]
linux/crypto/sha1_generic.c [deleted file]
linux/crypto/sha256_generic.c [new file with mode: 0644]
linux/crypto/shash.c
linux/lz4hc_compress.c [deleted file]
linux/sha1.c [deleted file]

diff --git a/.bcache_revision b/.bcache_revision
index 5caaaba2efbcee68700eae8bebeefa7bd0567385..8fb728e417191525917f9e50be88870c0b84a6c0 100644 (file)
@@ -1 +1 @@
-BCACHE_REVISION=76e3b2312705df2cb5adb8834bc6df56a288932e
+BCACHE_REVISION=561f3067172cbfc63a680cfb670d558724441123
diff --git a/Makefile b/Makefile
index a3bf8d8e2e488807fd658653148cb42aa960a283..bc0402c383c11191a3ec70ba271a923575db1c47 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -20,9 +20,10 @@ else
        LDFLAGS+=-flto
 endif
 
-PKGCONFIG_LIBS="blkid uuid liburcu"
+PKGCONFIG_LIBS="blkid uuid liburcu libsodium"
 CFLAGS+=`pkg-config --cflags   ${PKGCONFIG_LIBS}`
-LDLIBS+=`pkg-config --libs     ${PKGCONFIG_LIBS}` -lm -lpthread -lrt
+LDLIBS+=`pkg-config --libs     ${PKGCONFIG_LIBS}`              \
+       -lm -lpthread -lrt -lscrypt -lkeyutils
 
 ifeq ($(PREFIX),/usr)
        ROOT_SBINDIR=/sbin
@@ -48,7 +49,9 @@ OBJS=bcache.o                 \
      cmd_fs.o                  \
      cmd_fsck.o                        \
      cmd_format.o              \
+     cmd_key.o                 \
      cmd_run.o                 \
+     crypto.o                  \
      libbcache.o               \
      qcow2.o                   \
      tools-util.o              \
diff --git a/bcache-userspace-shim.c b/bcache-userspace-shim.c
index 9be5b507566f0a2b6e02bd6aaf21ef810d1bc265..8634d8f730954e6b399c2ea557bbe8c716fea181 100644 (file)
@@ -144,6 +144,7 @@ enum fsck_err_opts fsck_err_opt;
 #include "six.c"
 //#include "stats.c"
 #include "super.c"
+#include "super-io.c"
 //#include "sysfs.c"
 #include "tier.c"
 #include "trace.c"
diff --git a/bcache.c b/bcache.c
index 1fb1a55ebbcefd48085b2b8cf23321192aa790f5..ac9eb07edf934187fafb004ee250e462c8adcfec 100644 (file)
--- a/bcache.c
+++ b/bcache.c
@@ -30,6 +30,7 @@ static void usage(void)
             "\n"
             "Commands for formatting, startup and shutdown:\n"
             "  format         Format a new filesystem\n"
+            "  unlock         Unlock an encrypted filesystem prior to running/mounting\n"
             "  assemble       Assemble an existing multi device filesystem\n"
             "  incremental    Incrementally assemble an existing multi device filesystem\n"
             "  run            Start a partially assembled filesystem\n"
@@ -46,6 +47,7 @@ static void usage(void)
             "\n"
             "Repair:\n"
             "  bcache fsck    Check an existing filesystem for errors\n"
+            "\n"
             "Debug:\n"
             "  bcache dump    Dump filesystem metadata to a qcow2 image\n"
             "  bcache list    List filesystem metadata in textual form\n");
@@ -94,6 +96,9 @@ int main(int argc, char *argv[])
        if (!strcmp(cmd, "fsck"))
                return cmd_fsck(argc, argv);
 
+       if (!strcmp(cmd, "unlock"))
+               return cmd_unlock(argc, argv);
+
        if (!strcmp(cmd, "dump"))
                return cmd_dump(argc, argv);
        if (!strcmp(cmd, "list"))
diff --git a/cmd_debug.c b/cmd_debug.c
index 0813d292c4c1e455946e2160f1df18a999599bb5..df23ae102bfc435913dd166a28650f008e52a51e 100644 (file)
@@ -27,21 +27,27 @@ static void dump_usage(void)
             "Report bugs to <linux-bcache@vger.kernel.org>");
 }
 
-void dump_one_device(struct cache_set *c, struct cache *ca, int fd)
+static void dump_one_device(struct cache_set *c, struct cache *ca, int fd)
 {
-       struct cache_sb *sb = ca->disk_sb.sb;
+       struct bch_sb *sb = ca->disk_sb.sb;
        sparse_data data;
        unsigned i;
 
        darray_init(data);
 
        /* Superblock: */
-       data_add(&data, SB_SECTOR << 9, __set_bytes(sb, le16_to_cpu(sb->u64s)));
+       data_add(&data, BCH_SB_LAYOUT_SECTOR << 9,
+                sizeof(struct bch_sb_layout));
+
+       for (i = 0; i < sb->layout.nr_superblocks; i++)
+               data_add(&data,
+                        le64_to_cpu(sb->layout.sb_offset[i]) << 9,
+                        vstruct_bytes(sb));
 
        /* Journal: */
-       for (i = 0; i < bch_nr_journal_buckets(ca->disk_sb.sb); i++)
+       for (i = 0; i < ca->journal.nr; i++)
                if (ca->journal.bucket_seq[i] >= c->journal.last_seq_ondisk) {
-                       u64 bucket = journal_bucket(ca->disk_sb.sb, i);
+                       u64 bucket = ca->journal.buckets[i];
 
                        data_add(&data,
                                 bucket_bytes(ca) * bucket,
@@ -64,7 +70,7 @@ void dump_one_device(struct cache_set *c, struct cache *ca, int fd)
                        struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
 
                        extent_for_each_ptr(e, ptr)
-                               if (ptr->dev == ca->sb.nr_this_dev)
+                               if (ptr->dev == ca->dev_idx)
                                        data_add(&data,
                                                 ptr->offset << 9,
                                                 b->written << 9);
@@ -120,13 +126,13 @@ int cmd_dump(int argc, char *argv[])
 
        down_read(&c->gc_lock);
 
-       for (i = 0; i < c->sb.nr_in_set; i++)
+       for (i = 0; i < c->sb.nr_devices; i++)
                if (c->cache[i])
                        nr_devices++;
 
        BUG_ON(!nr_devices);
 
-       for (i = 0; i < c->sb.nr_in_set; i++) {
+       for (i = 0; i < c->sb.nr_devices; i++) {
                int mode = O_WRONLY|O_CREAT|O_TRUNC;
 
                if (!force)
@@ -155,8 +161,8 @@ int cmd_dump(int argc, char *argv[])
        return 0;
 }
 
-void list_keys(struct cache_set *c, enum btree_id btree_id,
-              struct bpos start, struct bpos end, int mode)
+static void list_keys(struct cache_set *c, enum btree_id btree_id,
+                     struct bpos start, struct bpos end, int mode)
 {
        struct btree_iter iter;
        struct bkey_s_c k;
@@ -173,8 +179,8 @@ void list_keys(struct cache_set *c, enum btree_id btree_id,
        bch_btree_iter_unlock(&iter);
 }
 
-void list_btree_formats(struct cache_set *c, enum btree_id btree_id,
-                       struct bpos start, struct bpos end, int mode)
+static void list_btree_formats(struct cache_set *c, enum btree_id btree_id,
+                              struct bpos start, struct bpos end, int mode)
 {
        struct btree_iter iter;
        struct btree *b;
@@ -190,7 +196,7 @@ void list_btree_formats(struct cache_set *c, enum btree_id btree_id,
        bch_btree_iter_unlock(&iter);
 }
 
-struct bpos parse_pos(char *buf)
+static struct bpos parse_pos(char *buf)
 {
        char *s = buf;
        char *inode     = strsep(&s, ":");
diff --git a/cmd_device.c b/cmd_device.c
index ecb63bb461a4452d7dc30398e4f42abb7e806469..1c5208af103f32d1c6fc38baf01ddffdf4f106fb 100644 (file)
@@ -103,7 +103,7 @@ int cmd_device_show(int argc, char *argv[])
        struct bcache_dev devices[256];
        unsigned i, j, nr_devices = 0, nr_active_tiers = 0;
 
-       unsigned tiers[CACHE_TIERS]; /* number of devices in each tier */
+       unsigned tiers[BCH_TIER_MAX]; /* number of devices in each tier */
        memset(tiers, 0, sizeof(tiers));
 
        while ((entry = readdir(fs.sysfs))) {
@@ -133,14 +133,14 @@ int cmd_device_show(int argc, char *argv[])
                close(fd);
        }
 
-       for (i = 0; i < CACHE_TIERS; i++)
+       for (i = 0; i < BCH_TIER_MAX; i++)
                if (tiers[i])
                        nr_active_tiers++;
 
        /* Print out devices sorted by tier: */
        bool first = true;
 
-       for (i = 0; i < CACHE_TIERS; i++) {
+       for (i = 0; i < BCH_TIER_MAX; i++) {
                if (!tiers[i])
                        continue;
 
@@ -168,7 +168,7 @@ int cmd_device_show(int argc, char *argv[])
 
 int cmd_device_show(int argc, char *argv[])
 {
-       struct cache_sb *sb;
+       struct bch_sb *sb;
 
        if (argc != 2)
                die("please supply a single device");
diff --git a/cmd_format.c b/cmd_format.c
index b955b4164accfdd24fe7e3ab4b0d4a5c9137e04c..2b1453eeba9ed6d87b1476affc4da117a53f911d 100644 (file)
@@ -24,6 +24,7 @@
 
 #include "cmds.h"
 #include "libbcache.h"
+#include "crypto.h"
 #include "opts.h"
 #include "util.h"
 
@@ -80,6 +81,7 @@ static void usage(void)
             "      --metadata_checksum_type=(none|crc32c|crc64)\n"
             "      --data_checksum_type=(none|crc32c|crc64)\n"
             "      --compression_type=(none|lz4|gzip)\n"
+            "      --encrypted\n"
             "      --error_action=(continue|readonly|panic)\n"
             "                              Action to take on filesystem error\n"
             "      --max_journal_entry_size=size\n"
@@ -107,6 +109,7 @@ static void usage(void)
        OPT(0,          metadata_checksum_type, required_argument)      \
        OPT(0,          data_checksum_type,     required_argument)      \
        OPT(0,          compression_type,       required_argument)      \
+       OPT(0,          encrypted,              no_argument)            \
        OPT('e',        error_action,           required_argument)      \
        OPT(0,          max_journal_entry_size, required_argument)      \
        OPT('L',        label,                  required_argument)      \
@@ -164,6 +167,7 @@ int cmd_format(int argc, char *argv[])
        unsigned meta_csum_type = BCH_CSUM_CRC32C;
        unsigned data_csum_type = BCH_CSUM_CRC32C;
        unsigned compression_type = BCH_COMPRESSION_NONE;
+       bool encrypted = false;
        unsigned on_error_action = BCH_ON_ERROR_RO;
        char *label = NULL;
        uuid_le uuid;
@@ -208,6 +212,9 @@ int cmd_format(int argc, char *argv[])
                                                bch_compression_types,
                                                "compression type");
                        break;
+               case Opt_encrypted:
+                       encrypted = true;
+                       break;
                case Opt_error_action:
                case 'e':
                        on_error_action = read_string_list_or_die(optarg,
@@ -242,7 +249,7 @@ int cmd_format(int argc, char *argv[])
                case Opt_tier:
                case 't':
                        if (kstrtouint(optarg, 10, &tier) ||
-                           tier >= CACHE_TIERS)
+                           tier >= BCH_TIER_MAX)
                                die("invalid tier");
                        break;
                case Opt_discard:
@@ -270,6 +277,24 @@ int cmd_format(int argc, char *argv[])
        if (uuid_is_null(uuid.b))
                uuid_generate(uuid.b);
 
+       if (encrypted) {
+               passphrase = read_passphrase("Enter passphrase: ");
+
+               if (isatty(STDIN_FILENO)) {
+                       char *pass2 =
+                               read_passphrase("Enter same passphrase again: ");
+
+                       if (strcmp(passphrase, pass2)) {
+                               memzero_explicit(passphrase, strlen(passphrase));
+                               memzero_explicit(pass2, strlen(pass2));
+                               die("Passphrases do not match");
+                       }
+
+                       memzero_explicit(pass2, strlen(pass2));
+                       free(pass2);
+               }
+       }
+
        darray_foreach(dev, devices)
                dev->fd = open_for_format(dev->path, force);
 
@@ -279,6 +304,7 @@ int cmd_format(int argc, char *argv[])
                      meta_csum_type,
                      data_csum_type,
                      compression_type,
+                     passphrase,
                      1,
                      1,
                      on_error_action,
diff --git a/cmd_key.c b/cmd_key.c
new file mode 100644 (file)
index 0000000..587ecbe
--- /dev/null
+++ b/cmd_key.c
@@ -0,0 +1,62 @@
+#include <errno.h>
+#include <unistd.h>
+#include <keyutils.h>
+#include <uuid/uuid.h>
+
+#include "cmds.h"
+#include "checksum.h"
+#include "crypto.h"
+#include "libbcache.h"
+
+int cmd_unlock(int argc, char *argv[])
+{
+       struct bch_encrypted_key sb_key;
+       struct bch_key passphrase_key;
+       struct bch_sb *sb;
+       struct bch_sb_field_crypt *crypt;
+       char *passphrase;
+       char uuid[40];
+       char description[60];
+
+       if (argc != 2)
+               die("please supply a single device");
+
+       sb = bcache_super_read(argv[1]);
+
+       crypt = bch_sb_get_crypt(sb);
+       if (!crypt)
+               die("filesystem is not encrypted");
+
+       sb_key = crypt->key;
+
+       if (!bch_key_is_encrypted(&sb_key))
+               die("filesystem does not have encryption key");
+
+       passphrase = read_passphrase("Enter passphrase: ");
+       derive_passphrase(crypt, &passphrase_key, passphrase);
+
+       /* Check if the user supplied the correct passphrase: */
+       if (bch_chacha_encrypt_key(&passphrase_key, __bch_sb_key_nonce(sb),
+                                  &sb_key, sizeof(sb_key)))
+               die("error encrypting key");
+
+       if (bch_key_is_encrypted(&sb_key))
+               die("incorrect passphrase");
+
+       uuid_unparse_lower(sb->user_uuid.b, uuid);
+       sprintf(description, "bcache:%s", uuid);
+
+       if (add_key("logon", description,
+                   &passphrase_key, sizeof(passphrase_key),
+                   KEY_SPEC_USER_KEYRING) < 0 ||
+           add_key("user", description,
+                   &passphrase_key, sizeof(passphrase_key),
+                   KEY_SPEC_USER_KEYRING) < 0)
+               die("add_key error: %s", strerror(errno));
+
+       memzero_explicit(&sb_key, sizeof(sb_key));
+       memzero_explicit(&passphrase_key, sizeof(passphrase_key));
+       memzero_explicit(passphrase, strlen(passphrase));
+       free(passphrase);
+       return 0;
+}
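
In short, unlock never writes a decrypted key back to disk: the passphrase-derived key decrypts the superblock's encrypted key in memory, the magic field doubles as the correctness check, and the derived key is then handed to the kernel keyring for the running filesystem to use. A minimal sketch of that magic check, assuming it mirrors the bch_key_is_encrypted() helper from libbcache's checksum.h (not shown in this commit):

/* Sketch only: field layout assumed from crypto.c's use of
 * crypt->key.magic below; the real helper lives in checksum.h. */
static inline _Bool bch_key_is_encrypted_sketch(struct bch_encrypted_key *key)
{
	/* ChaCha20 is applied over the whole struct, magic included, so a
	 * wrong passphrase leaves the magic as unrecognizable ciphertext. */
	return key->magic != BCH_KEY_MAGIC;
}
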
diff --git a/cmds.h b/cmds.h
index c0c8aa56b59bc977db950e3b008265ac2372f822..946acfda0da327e18577b3f6170e22d3d17b68c9 100644 (file)
--- a/cmds.h
+++ b/cmds.h
@@ -11,6 +11,7 @@
 
 int cmd_format(int argc, char *argv[]);
 
+int cmd_unlock(int argc, char *argv[]);
 int cmd_assemble(int argc, char *argv[]);
 int cmd_incremental(int argc, char *argv[]);
 int cmd_run(int argc, char *argv[]);
diff --git a/crypto.c b/crypto.c
new file mode 100644 (file)
index 0000000..86da70a
--- /dev/null
+++ b/crypto.c
@@ -0,0 +1,103 @@
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <termios.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <linux/random.h>
+#include <libscrypt.h>
+
+#include "checksum.h"
+#include "crypto.h"
+
+char *read_passphrase(const char *prompt)
+{
+       char *buf = NULL;
+       size_t buflen = 0;
+       ssize_t len;
+
+       if (isatty(STDIN_FILENO)) {
+               struct termios old, new;
+
+               fprintf(stderr, "%s", prompt);
+               fflush(stderr);
+
+               if (tcgetattr(STDIN_FILENO, &old))
+                       die("error getting terminal attrs");
+
+               new = old;
+               new.c_lflag &= ~ECHO;
+               if (tcsetattr(STDIN_FILENO, TCSAFLUSH, &new))
+                       die("error setting terminal attrs");
+
+               len = getline(&buf, &buflen, stdin);
+
+               tcsetattr(STDIN_FILENO, TCSAFLUSH, &old);
+               fprintf(stderr, "\n");
+       } else {
+               len = getline(&buf, &buflen, stdin);
+       }
+
+       if (len < 0)
+               die("error reading passphrase");
+       if (len && buf[len - 1] == '\n')
+               buf[len - 1] = '\0';
+
+       return buf;
+}
+
+void derive_passphrase(struct bch_sb_field_crypt *crypt,
+                      struct bch_key *key,
+                      const char *passphrase)
+{
+       const unsigned char salt[] = "bcache";
+       int ret;
+
+       switch (BCH_CRYPT_KDF_TYPE(crypt)) {
+       case BCH_KDF_SCRYPT:
+               ret = libscrypt_scrypt((void *) passphrase, strlen(passphrase),
+                                      salt, sizeof(salt),
+                                      1ULL << BCH_KDF_SCRYPT_N(crypt),
+                                      1ULL << BCH_KDF_SCRYPT_R(crypt),
+                                      1ULL << BCH_KDF_SCRYPT_P(crypt),
+                                      (void *) key, sizeof(*key));
+               if (ret)
+                       die("scrypt error: %i", ret);
+               break;
+       default:
+               die("unknown kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt));
+       }
+}
+
+void bch_sb_crypt_init(struct bch_sb *sb,
+                      struct bch_sb_field_crypt *crypt,
+                      const char *passphrase)
+{
+       struct bch_key passphrase_key;
+
+       SET_BCH_CRYPT_KDF_TYPE(crypt, BCH_KDF_SCRYPT);
+       SET_BCH_KDF_SCRYPT_N(crypt, ilog2(SCRYPT_N));
+       SET_BCH_KDF_SCRYPT_R(crypt, ilog2(SCRYPT_r));
+       SET_BCH_KDF_SCRYPT_P(crypt, ilog2(SCRYPT_p));
+
+       derive_passphrase(crypt, &passphrase_key, passphrase);
+
+       crypt->key.magic = BCH_KEY_MAGIC;
+       get_random_bytes(&crypt->key.key, sizeof(crypt->key.key));
+
+       assert(!bch_key_is_encrypted(&crypt->key));
+
+       if (bch_chacha_encrypt_key(&passphrase_key, __bch_sb_key_nonce(sb),
+                                  &crypt->key, sizeof(crypt->key)))
+               die("error encrypting key");
+
+       assert(bch_key_is_encrypted(&crypt->key));
+
+       memzero_explicit(&passphrase_key, sizeof(passphrase_key));
+}
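
Note that cmd_unlock() calls the very same bch_chacha_encrypt_key() that bch_sb_crypt_init() uses to lock the key. That works because ChaCha20 is a stream cipher: encryption is an XOR against a keystream, so applying it a second time with the same key and nonce is the identity. A toy illustration of that symmetry (the keystream argument stands in for real ChaCha20 output):

#include <stddef.h>

static void xor_keystream(unsigned char *buf,
			  const unsigned char *keystream, size_t len)
{
	/* buf ^= keystream; running this twice restores the original buf */
	for (size_t i = 0; i < len; i++)
		buf[i] ^= keystream[i];
}
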
diff --git a/crypto.h b/crypto.h
new file mode 100644 (file)
index 0000000..643073e
--- /dev/null
+++ b/crypto.h
@@ -0,0 +1,13 @@
+#ifndef _CRYPTO_H
+#define _CRYPTO_H
+
+#include "super-io.h"
+#include "tools-util.h"
+
+char *read_passphrase(const char *);
+void derive_passphrase(struct bch_sb_field_crypt *,
+                      struct bch_key *, const char *);
+void bch_sb_crypt_init(struct bch_sb *sb, struct bch_sb_field_crypt *,
+                      const char *);
+
+#endif /* _CRYPTO_H */
diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h
index 31f453ee8f752a649e830da26e86b1931c917fa2..d8bfcc1f3524fab376759afd552800bf0f059d09 100644 (file)
 #define _CRYPTO_ALGAPI_H
 
 #include <linux/crypto.h>
-#include <linux/device.h>
-#include <linux/list.h>
-#include <linux/kernel.h>
-#include <linux/kthread.h>
-
-struct crypto_aead;
-struct crypto_instance;
-struct module;
-struct rtattr;
-struct seq_file;
-struct sk_buff;
 
 struct crypto_type {
        unsigned int (*ctxsize)(struct crypto_alg *alg, u32 type, u32 mask);
        unsigned int (*extsize)(struct crypto_alg *alg);
        int (*init)(struct crypto_tfm *tfm, u32 type, u32 mask);
        int (*init_tfm)(struct crypto_tfm *tfm);
-       void (*show)(struct seq_file *m, struct crypto_alg *alg);
-       struct crypto_alg *(*lookup)(const char *name, u32 type, u32 mask);
-       void (*free)(struct crypto_instance *inst);
-
-       unsigned int type;
-       unsigned int maskclear;
-       unsigned int maskset;
-       unsigned int tfmsize;
-};
-
-struct crypto_instance {
-       struct crypto_alg alg;
-
-       struct crypto_template *tmpl;
-       struct hlist_node list;
-
-       void *__ctx[] CRYPTO_MINALIGN_ATTR;
-};
-
-struct crypto_template {
-       struct list_head list;
-       struct hlist_head instances;
-       struct module *module;
-
-       struct crypto_instance *(*alloc)(struct rtattr **tb);
-       void (*free)(struct crypto_instance *inst);
-       int (*create)(struct crypto_template *tmpl, struct rtattr **tb);
-
-       char name[CRYPTO_MAX_ALG_NAME];
-};
-
-struct scatter_walk {
-       struct scatterlist *sg;
-       unsigned int offset;
-};
-
-struct blkcipher_walk {
-       union {
-               struct {
-                       struct page *page;
-                       unsigned long offset;
-               } phys;
-
-               struct {
-                       u8 *page;
-                       u8 *addr;
-               } virt;
-       } src, dst;
 
-       struct scatter_walk in;
-       unsigned int nbytes;
-
-       struct scatter_walk out;
-       unsigned int total;
-
-       void *page;
-       u8 *buffer;
-       u8 *iv;
-       unsigned int ivsize;
-
-       int flags;
-       unsigned int walk_blocksize;
-       unsigned int cipher_blocksize;
-       unsigned int alignmask;
+       unsigned        type;
+       unsigned        maskclear;
+       unsigned        maskset;
+       unsigned        tfmsize;
 };
 
 extern const struct crypto_type crypto_blkcipher_type;
 
-struct crypto_attr_type *crypto_get_attr_type(struct rtattr **tb);
-int crypto_check_attr_type(struct rtattr **tb, u32 type);
-const char *crypto_attr_alg_name(struct rtattr *rta);
-struct crypto_alg *crypto_attr_alg2(struct rtattr *rta,
-                                   const struct crypto_type *frontend,
-                                   u32 type, u32 mask);
-
-static inline struct crypto_alg *crypto_attr_alg(struct rtattr *rta,
-                                                u32 type, u32 mask)
-{
-       return crypto_attr_alg2(rta, NULL, type, mask);
-}
-
-int crypto_attr_u32(struct rtattr *rta, u32 *num);
-
-/* These functions require the input/output to be aligned as u32. */
-void crypto_inc(u8 *a, unsigned int size);
-void crypto_xor(u8 *dst, const u8 *src, unsigned int size);
-
-int blkcipher_walk_done(struct blkcipher_desc *desc,
-                       struct blkcipher_walk *walk, int err);
-int blkcipher_walk_virt(struct blkcipher_desc *desc,
-                       struct blkcipher_walk *walk);
-int blkcipher_walk_phys(struct blkcipher_desc *desc,
-                       struct blkcipher_walk *walk);
-int blkcipher_walk_virt_block(struct blkcipher_desc *desc,
-                             struct blkcipher_walk *walk,
-                             unsigned int blocksize);
-int blkcipher_aead_walk_virt_block(struct blkcipher_desc *desc,
-                                  struct blkcipher_walk *walk,
-                                  struct crypto_aead *tfm,
-                                  unsigned int blocksize);
-
-static inline void *crypto_tfm_ctx_aligned(struct crypto_tfm *tfm)
-{
-       return PTR_ALIGN(crypto_tfm_ctx(tfm),
-                        crypto_tfm_alg_alignmask(tfm) + 1);
-}
-
-static inline struct crypto_instance *crypto_tfm_alg_instance(
-       struct crypto_tfm *tfm)
-{
-       return container_of(tfm->__crt_alg, struct crypto_instance, alg);
-}
-
-static inline void *crypto_instance_ctx(struct crypto_instance *inst)
-{
-       return inst->__ctx;
-}
-
 static inline void *crypto_blkcipher_ctx(struct crypto_blkcipher *tfm)
 {
        return crypto_tfm_ctx(&tfm->base);
 }
 
-static inline void *crypto_blkcipher_ctx_aligned(struct crypto_blkcipher *tfm)
-{
-       return crypto_tfm_ctx_aligned(&tfm->base);
-}
-
-static inline struct cipher_alg *crypto_cipher_alg(struct crypto_cipher *tfm)
-{
-       return &crypto_cipher_tfm(tfm)->__crt_alg->cra_cipher;
-}
-
-static inline void blkcipher_walk_init(struct blkcipher_walk *walk,
-                                      struct scatterlist *dst,
-                                      struct scatterlist *src,
-                                      unsigned int nbytes)
-{
-       walk->in.sg = src;
-       walk->out.sg = dst;
-       walk->total = nbytes;
-}
-
-static inline struct crypto_alg *crypto_get_attr_alg(struct rtattr **tb,
-                                                    u32 type, u32 mask)
-{
-       return crypto_attr_alg(tb[1], type, mask);
-}
-
-static inline int crypto_requires_sync(u32 type, u32 mask)
-{
-       return (type ^ CRYPTO_ALG_ASYNC) & mask & CRYPTO_ALG_ASYNC;
-}
-
-noinline unsigned long __crypto_memneq(const void *a, const void *b, size_t size);
-
-/**
- * crypto_memneq - Compare two areas of memory without leaking
- *                timing information.
- *
- * @a: One area of memory
- * @b: Another area of memory
- * @size: The size of the area.
- *
- * Returns 0 when data is equal, 1 otherwise.
- */
-static inline int crypto_memneq(const void *a, const void *b, size_t size)
-{
-       return __crypto_memneq(a, b, size) != 0UL ? 1 : 0;
-}
-
-static inline void crypto_yield(u32 flags)
-{
-#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY)
-       if (flags & CRYPTO_TFM_REQ_MAY_SLEEP)
-               cond_resched();
-#endif
-}
-
 #endif /* _CRYPTO_ALGAPI_H */
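
What survives the purge is just enough of the kernel's classic blkcipher surface for the new shim in linux/crypto/blkcipher.c (whose body is not shown on this page). A hedged sketch of how a caller such as libbcache/checksum.c presumably drives it, assuming the shim keeps the stock kernel-4.x calling convention:

#include <linux/crypto.h>
#include <linux/scatterlist.h>
#include <crypto/chacha20.h>

/* Assumption: the alloc/setkey/encrypt helpers behave as in the kernel
 * proper; error handling elided for brevity. */
static int chacha20_crypt_buf(void *buf, unsigned len,
			      void *nonce, const u8 *key)
{
	struct crypto_blkcipher *tfm =
		crypto_alloc_blkcipher("chacha20", 0, CRYPTO_ALG_ASYNC);
	struct blkcipher_desc desc = { .tfm = tfm, .info = nonce };
	struct scatterlist sg;
	int ret;

	sg_init_one(&sg, buf, len);
	ret = crypto_blkcipher_setkey(tfm, key, CHACHA20_KEY_SIZE) ?:
	      crypto_blkcipher_encrypt_iv(&desc, &sg, &sg, len);
	crypto_free_blkcipher(tfm);
	return ret;
}
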
diff --git a/include/crypto/chacha20.h b/include/crypto/chacha20.h
index 20d20f681a72c5178f9254edbe3127b664bf157a..1cdc77babedead5b1333a02fb99232a44c7dcdaf 100644 (file)
 #define CHACHA20_KEY_SIZE      32
 #define CHACHA20_BLOCK_SIZE    64
 
-struct chacha20_ctx {
-       u32 key[8];
-};
-
-void chacha20_block(u32 *state, void *stream);
-void crypto_chacha20_init(u32 *state, struct chacha20_ctx *ctx, u8 *iv);
-int crypto_chacha20_setkey(struct crypto_tfm *tfm, const u8 *key,
-                          unsigned int keysize);
-int crypto_chacha20_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-                         struct scatterlist *src, unsigned int nbytes);
-
 #endif
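
Only the size constants remain here; the cipher itself now comes from libsodium by way of the new linux/crypto/chacha20_generic.c (not shown on this page). For reference, a sketch of the libsodium primitive it presumably wraps, which XORs a buffer in place under the DJB-variant 8-byte nonce:

#include <sodium/crypto_stream_chacha20.h>

/* Encrypting and decrypting are the same operation for a stream cipher. */
static int chacha20_xor_inplace(unsigned char *buf, unsigned long long len,
				const unsigned char nonce[8], /* crypto_stream_chacha20_NONCEBYTES */
				const unsigned char key[32])  /* crypto_stream_chacha20_KEYBYTES */
{
	return crypto_stream_chacha20_xor(buf, buf, len, nonce, key);
}
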
diff --git a/include/crypto/hash.h b/include/crypto/hash.h
index 00bd4e7ef3db8064013f2358fbcdf0256128f76d..97edaa885fb599da12662031ec8b9ea29ddb0bfe 100644 (file)
 #include <linux/crypto.h>
 #include <linux/string.h>
 
-struct hash_alg_common {
-       unsigned int digestsize;
-       unsigned int statesize;
-
-       struct crypto_alg base;
-};
-
 struct shash_desc {
        struct crypto_shash *tfm;
        u32 flags;
@@ -37,31 +30,21 @@ struct shash_desc {
 
 struct shash_alg {
        int (*init)(struct shash_desc *desc);
-       int (*update)(struct shash_desc *desc, const u8 *data,
-                     unsigned int len);
+       int (*update)(struct shash_desc *desc, const u8 *data, unsigned len);
        int (*final)(struct shash_desc *desc, u8 *out);
        int (*finup)(struct shash_desc *desc, const u8 *data,
-                    unsigned int len, u8 *out);
+                    unsigned len, u8 *out);
        int (*digest)(struct shash_desc *desc, const u8 *data,
-                     unsigned int len, u8 *out);
-       int (*export)(struct shash_desc *desc, void *out);
-       int (*import)(struct shash_desc *desc, const void *in);
-       int (*setkey)(struct crypto_shash *tfm, const u8 *key,
-                     unsigned int keylen);
-
-       unsigned int descsize;
-
-       /* These fields must match hash_alg_common. */
-       unsigned int digestsize
-               __attribute__ ((aligned(__alignof__(struct hash_alg_common))));
-       unsigned int statesize;
+                     unsigned len, u8 *out);
 
-       struct crypto_alg base;
+       unsigned                descsize;
+       unsigned                digestsize;
+       struct crypto_alg       base;
 };
 
 struct crypto_shash {
-       unsigned int descsize;
-       struct crypto_tfm base;
+       unsigned                descsize;
+       struct crypto_tfm       base;
 };
 
 struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type,
@@ -77,27 +60,6 @@ static inline void crypto_free_shash(struct crypto_shash *tfm)
        crypto_destroy_tfm(tfm, crypto_shash_tfm(tfm));
 }
 
-static inline const char *crypto_shash_alg_name(struct crypto_shash *tfm)
-{
-       return crypto_tfm_alg_name(crypto_shash_tfm(tfm));
-}
-
-static inline const char *crypto_shash_driver_name(struct crypto_shash *tfm)
-{
-       return crypto_tfm_alg_driver_name(crypto_shash_tfm(tfm));
-}
-
-static inline unsigned int crypto_shash_alignmask(
-       struct crypto_shash *tfm)
-{
-       return crypto_tfm_alg_alignmask(crypto_shash_tfm(tfm));
-}
-
-static inline unsigned int crypto_shash_blocksize(struct crypto_shash *tfm)
-{
-       return crypto_tfm_alg_blocksize(crypto_shash_tfm(tfm));
-}
-
 static inline struct shash_alg *__crypto_shash_alg(struct crypto_alg *alg)
 {
        return container_of(alg, struct shash_alg, base);
@@ -108,32 +70,12 @@ static inline struct shash_alg *crypto_shash_alg(struct crypto_shash *tfm)
        return __crypto_shash_alg(crypto_shash_tfm(tfm)->__crt_alg);
 }
 
-static inline unsigned int crypto_shash_digestsize(struct crypto_shash *tfm)
+static inline unsigned crypto_shash_digestsize(struct crypto_shash *tfm)
 {
        return crypto_shash_alg(tfm)->digestsize;
 }
 
-static inline unsigned int crypto_shash_statesize(struct crypto_shash *tfm)
-{
-       return crypto_shash_alg(tfm)->statesize;
-}
-
-static inline u32 crypto_shash_get_flags(struct crypto_shash *tfm)
-{
-       return crypto_tfm_get_flags(crypto_shash_tfm(tfm));
-}
-
-static inline void crypto_shash_set_flags(struct crypto_shash *tfm, u32 flags)
-{
-       crypto_tfm_set_flags(crypto_shash_tfm(tfm), flags);
-}
-
-static inline void crypto_shash_clear_flags(struct crypto_shash *tfm, u32 flags)
-{
-       crypto_tfm_clear_flags(crypto_shash_tfm(tfm), flags);
-}
-
-static inline unsigned int crypto_shash_descsize(struct crypto_shash *tfm)
+static inline unsigned crypto_shash_descsize(struct crypto_shash *tfm)
 {
        return tfm->descsize;
 }
@@ -143,39 +85,32 @@ static inline void *shash_desc_ctx(struct shash_desc *desc)
        return desc->__ctx;
 }
 
-int crypto_shash_setkey(struct crypto_shash *tfm, const u8 *key,
-                       unsigned int keylen);
-
-int crypto_shash_digest(struct shash_desc *desc, const u8 *data,
-                       unsigned int len, u8 *out);
-
-static inline int crypto_shash_export(struct shash_desc *desc, void *out)
+static inline int crypto_shash_init(struct shash_desc *desc)
 {
-       return crypto_shash_alg(desc->tfm)->export(desc, out);
+       return crypto_shash_alg(desc->tfm)->init(desc);
 }
 
-static inline int crypto_shash_import(struct shash_desc *desc, const void *in)
+static inline int crypto_shash_update(struct shash_desc *desc,
+                                     const u8 *data, unsigned len)
 {
-       return crypto_shash_alg(desc->tfm)->import(desc, in);
+       return crypto_shash_alg(desc->tfm)->update(desc, data, len);
 }
 
-static inline int crypto_shash_init(struct shash_desc *desc)
+static inline int crypto_shash_final(struct shash_desc *desc, u8 *out)
 {
-       return crypto_shash_alg(desc->tfm)->init(desc);
+       return crypto_shash_alg(desc->tfm)->final(desc, out);
 }
 
-int crypto_shash_update(struct shash_desc *desc, const u8 *data,
-                       unsigned int len);
-
-int crypto_shash_final(struct shash_desc *desc, u8 *out);
-
-int crypto_shash_finup(struct shash_desc *desc, const u8 *data,
-                      unsigned int len, u8 *out);
+static inline int crypto_shash_finup(struct shash_desc *desc, const u8 *data,
+                                    unsigned len, u8 *out)
+{
+       return crypto_shash_alg(desc->tfm)->finup(desc, data, len, out);
+}
 
-static inline void shash_desc_zero(struct shash_desc *desc)
+static inline int crypto_shash_digest(struct shash_desc *desc, const u8 *data,
+                                     unsigned len, u8 *out)
 {
-       memzero_explicit(desc,
-                        sizeof(*desc) + crypto_shash_descsize(desc->tfm));
+       return crypto_shash_alg(desc->tfm)->digest(desc, data, len, out);
 }
 
 #endif /* _CRYPTO_HASH_H */
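
With export/import, setkey and the flag accessors gone, the shash interface reduces to direct dispatch through struct shash_alg. A usage sketch under that reduced API (assuming a "sha256" algorithm registered by the new linux/crypto/sha256_generic.c; error handling elided):

static int sha256_digest_buf(const u8 *data, unsigned len, u8 *out)
{
	struct crypto_shash *tfm = crypto_alloc_shash("sha256", 0, 0);
	/* on-stack descriptor, sized for this algorithm's state */
	char _desc[sizeof(struct shash_desc) + crypto_shash_descsize(tfm)];
	struct shash_desc *desc = (void *) _desc;
	int ret;

	desc->tfm	= tfm;
	desc->flags	= 0;
	ret = crypto_shash_digest(desc, data, len, out);
	crypto_free_shash(tfm);
	return ret;
}
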
diff --git a/include/crypto/internal/hash.h b/include/crypto/internal/hash.h
index 2d85c8038c7cc765cc2982fed2a00f5e7d91ff56..3973047b857248714818433f565bbc0bccb94657 100644 (file)
@@ -5,9 +5,6 @@
 #include <crypto/hash.h>
 
 int crypto_register_shash(struct shash_alg *alg);
-int crypto_unregister_shash(struct shash_alg *alg);
-int crypto_register_shashes(struct shash_alg *algs, int count);
-int crypto_unregister_shashes(struct shash_alg *algs, int count);
 
 static inline struct crypto_shash *__crypto_shash_cast(struct crypto_tfm *tfm)
 {
diff --git a/include/crypto/poly1305.h b/include/crypto/poly1305.h
index 894df59b74e44a403947ca75e669688f5165640f..9fcfbfeb16ea9a537eb5f9f6df95f3b6c96defac 100644 (file)
@@ -5,37 +5,9 @@
 #ifndef _CRYPTO_POLY1305_H
 #define _CRYPTO_POLY1305_H
 
-#include <linux/types.h>
-#include <linux/crypto.h>
+#include <sodium/crypto_onetimeauth_poly1305.h>
 
-#define POLY1305_BLOCK_SIZE    16
-#define POLY1305_KEY_SIZE      32
-#define POLY1305_DIGEST_SIZE   16
-
-struct poly1305_desc_ctx {
-       /* key */
-       u32 r[5];
-       /* finalize key */
-       u32 s[4];
-       /* accumulator */
-       u32 h[5];
-       /* partial buffer */
-       u8 buf[POLY1305_BLOCK_SIZE];
-       /* bytes used in partial buffer */
-       unsigned int buflen;
-       /* r key has been set */
-       bool rset;
-       /* s key has been set */
-       bool sset;
-};
-
-int crypto_poly1305_init(struct shash_desc *desc);
-int crypto_poly1305_setkey(struct crypto_shash *tfm,
-                          const u8 *key, unsigned int keylen);
-unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
-                                       const u8 *src, unsigned int srclen);
-int crypto_poly1305_update(struct shash_desc *desc,
-                          const u8 *src, unsigned int srclen);
-int crypto_poly1305_final(struct shash_desc *desc, u8 *dst);
+#define POLY1305_KEY_SIZE      crypto_onetimeauth_poly1305_KEYBYTES
+#define POLY1305_DIGEST_SIZE   crypto_onetimeauth_poly1305_BYTES
 
 #endif
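
As with ChaCha20, the Poly1305 implementation is delegated to libsodium; this header keeps only the size constants, aliased to libsodium's. For reference, the one-shot libsodium call those constants describe (the new linux/crypto/poly1305_generic.c presumably adapts it to the shash interface):

/* <sodium/crypto_onetimeauth_poly1305.h> is already pulled in above. */
static int poly1305_mac(unsigned char tag[POLY1305_DIGEST_SIZE],
			const unsigned char *msg, unsigned long long len,
			const unsigned char key[POLY1305_KEY_SIZE])
{
	return crypto_onetimeauth_poly1305(tag, msg, len, key);
}
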
diff --git a/include/crypto/sha.h b/include/crypto/sha.h
deleted file mode 100644 (file)
index c94d3eb..0000000
--- a/include/crypto/sha.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Common values for SHA algorithms
- */
-
-#ifndef _CRYPTO_SHA_H
-#define _CRYPTO_SHA_H
-
-#include <linux/types.h>
-
-#define SHA1_DIGEST_SIZE        20
-#define SHA1_BLOCK_SIZE         64
-
-#define SHA224_DIGEST_SIZE     28
-#define SHA224_BLOCK_SIZE      64
-
-#define SHA256_DIGEST_SIZE      32
-#define SHA256_BLOCK_SIZE       64
-
-#define SHA384_DIGEST_SIZE      48
-#define SHA384_BLOCK_SIZE       128
-
-#define SHA512_DIGEST_SIZE      64
-#define SHA512_BLOCK_SIZE       128
-
-#define SHA1_H0                0x67452301UL
-#define SHA1_H1                0xefcdab89UL
-#define SHA1_H2                0x98badcfeUL
-#define SHA1_H3                0x10325476UL
-#define SHA1_H4                0xc3d2e1f0UL
-
-#define SHA224_H0      0xc1059ed8UL
-#define SHA224_H1      0x367cd507UL
-#define SHA224_H2      0x3070dd17UL
-#define SHA224_H3      0xf70e5939UL
-#define SHA224_H4      0xffc00b31UL
-#define SHA224_H5      0x68581511UL
-#define SHA224_H6      0x64f98fa7UL
-#define SHA224_H7      0xbefa4fa4UL
-
-#define SHA256_H0      0x6a09e667UL
-#define SHA256_H1      0xbb67ae85UL
-#define SHA256_H2      0x3c6ef372UL
-#define SHA256_H3      0xa54ff53aUL
-#define SHA256_H4      0x510e527fUL
-#define SHA256_H5      0x9b05688cUL
-#define SHA256_H6      0x1f83d9abUL
-#define SHA256_H7      0x5be0cd19UL
-
-#define SHA384_H0      0xcbbb9d5dc1059ed8ULL
-#define SHA384_H1      0x629a292a367cd507ULL
-#define SHA384_H2      0x9159015a3070dd17ULL
-#define SHA384_H3      0x152fecd8f70e5939ULL
-#define SHA384_H4      0x67332667ffc00b31ULL
-#define SHA384_H5      0x8eb44a8768581511ULL
-#define SHA384_H6      0xdb0c2e0d64f98fa7ULL
-#define SHA384_H7      0x47b5481dbefa4fa4ULL
-
-#define SHA512_H0      0x6a09e667f3bcc908ULL
-#define SHA512_H1      0xbb67ae8584caa73bULL
-#define SHA512_H2      0x3c6ef372fe94f82bULL
-#define SHA512_H3      0xa54ff53a5f1d36f1ULL
-#define SHA512_H4      0x510e527fade682d1ULL
-#define SHA512_H5      0x9b05688c2b3e6c1fULL
-#define SHA512_H6      0x1f83d9abfb41bd6bULL
-#define SHA512_H7      0x5be0cd19137e2179ULL
-
-extern const u8 sha1_zero_message_hash[SHA1_DIGEST_SIZE];
-
-extern const u8 sha224_zero_message_hash[SHA224_DIGEST_SIZE];
-
-extern const u8 sha256_zero_message_hash[SHA256_DIGEST_SIZE];
-
-struct sha1_state {
-       u32 state[SHA1_DIGEST_SIZE / 4];
-       u64 count;
-       u8 buffer[SHA1_BLOCK_SIZE];
-};
-
-struct sha256_state {
-       u32 state[SHA256_DIGEST_SIZE / 4];
-       u64 count;
-       u8 buf[SHA256_BLOCK_SIZE];
-};
-
-struct sha512_state {
-       u64 state[SHA512_DIGEST_SIZE / 8];
-       u64 count[2];
-       u8 buf[SHA512_BLOCK_SIZE];
-};
-
-struct shash_desc;
-
-extern int crypto_sha1_update(struct shash_desc *desc, const u8 *data,
-                             unsigned int len);
-
-extern int crypto_sha1_finup(struct shash_desc *desc, const u8 *data,
-                            unsigned int len, u8 *hash);
-
-extern int crypto_sha256_update(struct shash_desc *desc, const u8 *data,
-                             unsigned int len);
-
-extern int crypto_sha256_finup(struct shash_desc *desc, const u8 *data,
-                              unsigned int len, u8 *hash);
-
-extern int crypto_sha512_update(struct shash_desc *desc, const u8 *data,
-                             unsigned int len);
-
-extern int crypto_sha512_finup(struct shash_desc *desc, const u8 *data,
-                              unsigned int len, u8 *hash);
-#endif
diff --git a/include/crypto/sha1_base.h b/include/crypto/sha1_base.h
deleted file mode 100644 (file)
index 01b002d..0000000
--- a/include/crypto/sha1_base.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * sha1_base.h - core logic for SHA-1 implementations
- *
- * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <crypto/internal/hash.h>
-#include <crypto/sha.h>
-#include <linux/byteorder.h>
-#include <linux/crypto.h>
-#include <linux/module.h>
-
-#include <asm/unaligned.h>
-
-typedef void (sha1_block_fn)(struct sha1_state *sst, u8 const *src, int blocks);
-
-static inline int sha1_base_init(struct shash_desc *desc)
-{
-       struct sha1_state *sctx = shash_desc_ctx(desc);
-
-       sctx->state[0] = SHA1_H0;
-       sctx->state[1] = SHA1_H1;
-       sctx->state[2] = SHA1_H2;
-       sctx->state[3] = SHA1_H3;
-       sctx->state[4] = SHA1_H4;
-       sctx->count = 0;
-
-       return 0;
-}
-
-static inline int sha1_base_do_update(struct shash_desc *desc,
-                                     const u8 *data,
-                                     unsigned int len,
-                                     sha1_block_fn *block_fn)
-{
-       struct sha1_state *sctx = shash_desc_ctx(desc);
-       unsigned int partial = sctx->count % SHA1_BLOCK_SIZE;
-
-       sctx->count += len;
-
-       if (unlikely((partial + len) >= SHA1_BLOCK_SIZE)) {
-               int blocks;
-
-               if (partial) {
-                       int p = SHA1_BLOCK_SIZE - partial;
-
-                       memcpy(sctx->buffer + partial, data, p);
-                       data += p;
-                       len -= p;
-
-                       block_fn(sctx, sctx->buffer, 1);
-               }
-
-               blocks = len / SHA1_BLOCK_SIZE;
-               len %= SHA1_BLOCK_SIZE;
-
-               if (blocks) {
-                       block_fn(sctx, data, blocks);
-                       data += blocks * SHA1_BLOCK_SIZE;
-               }
-               partial = 0;
-       }
-       if (len)
-               memcpy(sctx->buffer + partial, data, len);
-
-       return 0;
-}
-
-static inline int sha1_base_do_finalize(struct shash_desc *desc,
-                                       sha1_block_fn *block_fn)
-{
-       const int bit_offset = SHA1_BLOCK_SIZE - sizeof(__be64);
-       struct sha1_state *sctx = shash_desc_ctx(desc);
-       __be64 *bits = (__be64 *)(sctx->buffer + bit_offset);
-       unsigned int partial = sctx->count % SHA1_BLOCK_SIZE;
-
-       sctx->buffer[partial++] = 0x80;
-       if (partial > bit_offset) {
-               memset(sctx->buffer + partial, 0x0, SHA1_BLOCK_SIZE - partial);
-               partial = 0;
-
-               block_fn(sctx, sctx->buffer, 1);
-       }
-
-       memset(sctx->buffer + partial, 0x0, bit_offset - partial);
-       *bits = cpu_to_be64(sctx->count << 3);
-       block_fn(sctx, sctx->buffer, 1);
-
-       return 0;
-}
-
-static inline int sha1_base_finish(struct shash_desc *desc, u8 *out)
-{
-       struct sha1_state *sctx = shash_desc_ctx(desc);
-       __be32 *digest = (__be32 *)out;
-       int i;
-
-       for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++)
-               put_unaligned_be32(sctx->state[i], digest++);
-
-       *sctx = (struct sha1_state){};
-       return 0;
-}
diff --git a/include/keys/user-type.h b/include/keys/user-type.h
new file mode 100644 (file)
index 0000000..a7a2ee4
--- /dev/null
+++ b/include/keys/user-type.h
@@ -0,0 +1,6 @@
+#ifndef _KEYS_USER_TYPE_H
+#define _KEYS_USER_TYPE_H
+
+#include <linux/key.h>
+
+#endif /* _KEYS_USER_TYPE_H */
diff --git a/include/linux/bcache.h b/include/linux/bcache.h
index f09a44a6c1b1c7233c854f4d1e16a684d492bde2..4179f8ddd85057841e25935f0dd845b8e98f48b7 100644 (file)
@@ -102,9 +102,17 @@ struct bch_val {
        __u64           __nothing[0];
 };
 
-struct bkey {
-       __u64           _data[0];
+struct bversion {
+#if defined(__LITTLE_ENDIAN)
+       __u64           lo;
+       __u32           hi;
+#elif defined(__BIG_ENDIAN)
+       __u32           hi;
+       __u64           lo;
+#endif
+} __attribute__((packed, aligned(4)));
 
+struct bkey {
        /* Size of combined key and value, in u64s */
        __u8            u64s;
 
@@ -125,13 +133,13 @@ struct bkey {
 #if defined(__LITTLE_ENDIAN)
        __u8            pad[1];
 
-       __u32           version;
+       struct bversion version;
        __u32           size;           /* extent size, in sectors */
        struct bpos     p;
 #elif defined(__BIG_ENDIAN)
        struct bpos     p;
        __u32           size;           /* extent size, in sectors */
-       __u32           version;
+       struct bversion version;
 
        __u8            pad[1];
 #endif
@@ -184,7 +192,8 @@ enum bch_bkey_fields {
        BKEY_FIELD_OFFSET,
        BKEY_FIELD_SNAPSHOT,
        BKEY_FIELD_SIZE,
-       BKEY_FIELD_VERSION,
+       BKEY_FIELD_VERSION_HI,
+       BKEY_FIELD_VERSION_LO,
        BKEY_NR_FIELDS,
 };
 
@@ -200,14 +209,25 @@ enum bch_bkey_fields {
                bkey_format_field(OFFSET,       p.offset),              \
                bkey_format_field(SNAPSHOT,     p.snapshot),            \
                bkey_format_field(SIZE,         size),                  \
-               bkey_format_field(VERSION,      version),               \
+               bkey_format_field(VERSION_HI,   version.hi),            \
+               bkey_format_field(VERSION_LO,   version.lo),            \
        },                                                              \
 })
 
 /* bkey with inline value */
 struct bkey_i {
-       struct bkey     k;
-       struct bch_val  v;
+       __u64                   _data[0];
+
+       union {
+       struct {
+               /* Size of combined key and value, in u64s */
+               __u8            u64s;
+       };
+       struct {
+               struct bkey     k;
+               struct bch_val  v;
+       };
+       };
 };
 
 #ifndef __cplusplus
@@ -358,20 +378,47 @@ BKEY_VAL_TYPE(cookie,             KEY_TYPE_COOKIE);
  * is neither checksummed nor compressed.
  */
 
+/* 128 bits, sufficient for cryptographic MACs: */
+struct bch_csum {
+       __le64                  lo;
+       __le64                  hi;
+} __attribute__((packed, aligned(8)));
+
+#define BCH_CSUM_NONE                  0U
+#define BCH_CSUM_CRC32C                        1U
+#define BCH_CSUM_CRC64                 2U
+#define BCH_CSUM_CHACHA20_POLY1305_80  3U
+#define BCH_CSUM_CHACHA20_POLY1305_128 4U
+#define BCH_CSUM_NR                    5U
+
+static inline _Bool bch_csum_type_is_encryption(unsigned type)
+{
+       switch (type) {
+       case BCH_CSUM_CHACHA20_POLY1305_80:
+       case BCH_CSUM_CHACHA20_POLY1305_128:
+               return true;
+       default:
+               return false;
+       }
+}
+
 enum bch_extent_entry_type {
-       BCH_EXTENT_ENTRY_crc32          = 0,
-       BCH_EXTENT_ENTRY_ptr            = 1,
+       BCH_EXTENT_ENTRY_ptr            = 0,
+       BCH_EXTENT_ENTRY_crc32          = 1,
        BCH_EXTENT_ENTRY_crc64          = 2,
+       BCH_EXTENT_ENTRY_crc128         = 3,
 };
 
-#define BCH_EXTENT_ENTRY_MAX           3
+#define BCH_EXTENT_ENTRY_MAX           4
 
+/* Compressed/uncompressed size are stored biased by 1: */
 struct bch_extent_crc32 {
 #if defined(__LITTLE_ENDIAN_BITFIELD)
-       __u32                   type:1,
+       __u32                   type:2,
+                               _compressed_size:7,
+                               _uncompressed_size:7,
                                offset:7,
-                               compressed_size:8,
-                               uncompressed_size:8,
+                               _unused:1,
                                csum_type:4,
                                compression_type:4;
        __u32                   csum;
@@ -379,45 +426,80 @@ struct bch_extent_crc32 {
        __u32                   csum;
        __u32                   compression_type:4,
                                csum_type:4,
-                               uncompressed_size:8,
-                               compressed_size:8,
+                               _unused:1,
                                offset:7,
-                               type:1;
+                               _uncompressed_size:7,
+                               _compressed_size:7,
+                               type:2;
 #endif
 } __attribute__((packed, aligned(8)));
 
-#define CRC32_EXTENT_SIZE_MAX  (1U << 7)
-
-/* 64k */
-#define BCH_COMPRESSED_EXTENT_MAX 128U
+#define CRC32_SIZE_MAX         (1U << 7)
+#define CRC32_NONCE_MAX                0
 
 struct bch_extent_crc64 {
 #if defined(__LITTLE_ENDIAN_BITFIELD)
        __u64                   type:3,
-                               offset:17,
-                               compressed_size:18,
-                               uncompressed_size:18,
+                               _compressed_size:9,
+                               _uncompressed_size:9,
+                               offset:9,
+                               nonce:10,
+                               csum_type:4,
+                               compression_type:4,
+                               csum_hi:16;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+       __u64                   csum_hi:16,
+                               compression_type:4,
+                               csum_type:4,
+                               nonce:10,
+                               offset:9,
+                               _uncompressed_size:9,
+                               _compressed_size:9,
+                               type:3;
+#endif
+       __u64                   csum_lo;
+} __attribute__((packed, aligned(8)));
+
+#define CRC64_SIZE_MAX         (1U << 9)
+#define CRC64_NONCE_MAX                ((1U << 10) - 1)
+
+struct bch_extent_crc128 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+       __u64                   type:4,
+                               _compressed_size:13,
+                               _uncompressed_size:13,
+                               offset:13,
+                               nonce:13,
                                csum_type:4,
                                compression_type:4;
 #elif defined (__BIG_ENDIAN_BITFIELD)
        __u64                   compression_type:4,
                                csum_type:4,
-                               uncompressed_size:18,
-                               compressed_size:18,
-                               offset:17,
+                               nonce:14,
+                               offset:13,
+                               _uncompressed_size:13,
+                               _compressed_size:13,
                                type:3;
 #endif
-       __u64                   csum;
+       struct bch_csum         csum;
 } __attribute__((packed, aligned(8)));
 
-#define CRC64_EXTENT_SIZE_MAX  (1U << 17)
+#define CRC128_SIZE_MAX                (1U << 13)
+#define CRC128_NONCE_MAX       ((1U << 13) - 1)
+
+/*
+ * Max size of an extent that may require bouncing to read or write
+ * (checksummed, compressed): 64k
+ */
+#define BCH_ENCODED_EXTENT_MAX 128U
 
 /*
  * @reservation - pointer hasn't been written to, just reserved
  */
 struct bch_extent_ptr {
 #if defined(__LITTLE_ENDIAN_BITFIELD)
-       __u64                   type:2,
+       __u64                   type:1,
+                               cached:1,
                                erasure_coded:1,
                                reservation:1,
                                offset:44, /* 8 petabytes */
@@ -429,10 +511,25 @@ struct bch_extent_ptr {
                                offset:44,
                                reservation:1,
                                erasure_coded:1,
-                               type:2;
+                               cached:1,
+                               type:1;
 #endif
 } __attribute__((packed, aligned(8)));
 
+struct bch_extent_reservation {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+       __u64                   type:5,
+                               unused:23,
+                               replicas:4,
+                               generation:32;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+       __u64                   generation:32,
+                               replicas:4,
+                               unused:23,
+                               type:5;
+#endif
+};
+
 union bch_extent_entry {
 #if defined(__LITTLE_ENDIAN) ||  __BITS_PER_LONG == 64
        unsigned long                   type;
@@ -446,6 +543,7 @@ union bch_extent_entry {
 #endif
        struct bch_extent_crc32         crc32;
        struct bch_extent_crc64         crc64;
+       struct bch_extent_crc128        crc128;
        struct bch_extent_ptr           ptr;
 };
 
@@ -473,9 +571,18 @@ struct bch_extent {
 } __attribute__((packed, aligned(8)));
 BKEY_VAL_TYPE(extent,          BCH_EXTENT);
 
+struct bch_reservation {
+       struct bch_val          v;
+
+       __le32                  generation;
+       __u8                    nr_replicas;
+       __u8                    pad[3];
+} __attribute__((packed, aligned(8)));
+BKEY_VAL_TYPE(reservation,     BCH_RESERVATION);
+
 /* Maximum size (in u64s) a single pointer could be: */
 #define BKEY_EXTENT_PTR_U64s_MAX\
-       ((sizeof(struct bch_extent_crc64) +                     \
+       ((sizeof(struct bch_extent_crc128) +                    \
          sizeof(struct bch_extent_ptr)) / sizeof(u64))
 
 /* Maximum possible size of an entire extent value: */
@@ -506,28 +613,26 @@ enum bch_inode_types {
 struct bch_inode {
        struct bch_val          v;
 
-       __le16                  i_mode;
-       __le16                  pad;
-       __le32                  i_flags;
-
-       /* Nanoseconds */
-       __le64                  i_atime;
-       __le64                  i_ctime;
-       __le64                  i_mtime;
-
-       __le64                  i_size;
-       __le64                  i_sectors;
-
-       __le32                  i_uid;
-       __le32                  i_gid;
-       __le32                  i_nlink;
-
-       __le32                  i_dev;
-
        __le64                  i_hash_seed;
+       __le32                  i_flags;
+       __le16                  i_mode;
+       __u8                    fields[0];
 } __attribute__((packed));
 BKEY_VAL_TYPE(inode,           BCH_INODE_FS);
 
+#define BCH_INODE_FIELDS()                             \
+       BCH_INODE_FIELD(i_atime,        64)             \
+       BCH_INODE_FIELD(i_ctime,        64)             \
+       BCH_INODE_FIELD(i_mtime,        64)             \
+       BCH_INODE_FIELD(i_otime,        64)             \
+       BCH_INODE_FIELD(i_size,         64)             \
+       BCH_INODE_FIELD(i_sectors,      64)             \
+       BCH_INODE_FIELD(i_uid,          32)             \
+       BCH_INODE_FIELD(i_gid,          32)             \
+       BCH_INODE_FIELD(i_nlink,        32)             \
+       BCH_INODE_FIELD(i_generation,   32)             \
+       BCH_INODE_FIELD(i_dev,          32)
+
 enum {
        /*
         * User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL
@@ -544,9 +649,9 @@ enum {
 
        /* not implemented yet: */
        __BCH_INODE_HAS_XATTRS  = 7, /* has xattrs in xattr btree */
-};
 
-LE32_BITMASK(INODE_STR_HASH_TYPE, struct bch_inode, i_flags, 28, 32);
+       /* bits 20+ reserved for packed fields below: */
+};
 
 #define BCH_INODE_SYNC         (1 << __BCH_INODE_SYNC)
 #define BCH_INODE_IMMUTABLE    (1 << __BCH_INODE_IMMUTABLE)
@@ -557,6 +662,9 @@ LE32_BITMASK(INODE_STR_HASH_TYPE, struct bch_inode, i_flags, 28, 32);
 #define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY)
 #define BCH_INODE_HAS_XATTRS   (1 << __BCH_INODE_HAS_XATTRS)
 
+LE32_BITMASK(INODE_STR_HASH,   struct bch_inode, i_flags, 20, 24);
+LE32_BITMASK(INODE_NR_FIELDS,  struct bch_inode, i_flags, 24, 32);
+
 struct bch_inode_blockdev {
        struct bch_val          v;
 
@@ -574,6 +682,7 @@ BKEY_VAL_TYPE(inode_blockdev,       BCH_INODE_BLOCKDEV);
 
 /* Thin provisioned volume, or cache for another block device? */
 LE64_BITMASK(CACHED_DEV,       struct bch_inode_blockdev, i_flags, 0,  1)
+
 /* Dirents */
 
 /*
@@ -639,6 +748,7 @@ BKEY_VAL_TYPE(xattr,                BCH_XATTR);
  * Version 4: Backing device with data offset
  * Version 5: All the incompat changes
  * Version 6: Cache device UUIDs all in superblock, another incompat bset change
+ * Version 7: Encryption (expanded checksum fields), other random things
  */
 #define BCACHE_SB_VERSION_CDEV_V0      0
 #define BCACHE_SB_VERSION_BDEV         1
@@ -646,16 +756,15 @@ BKEY_VAL_TYPE(xattr,              BCH_XATTR);
 #define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4
 #define BCACHE_SB_VERSION_CDEV_V2      5
 #define BCACHE_SB_VERSION_CDEV_V3      6
-#define BCACHE_SB_VERSION_CDEV         6
-#define BCACHE_SB_MAX_VERSION          6
+#define BCACHE_SB_VERSION_CDEV_V4      7
+#define BCACHE_SB_VERSION_CDEV         7
+#define BCACHE_SB_MAX_VERSION          7
 
-#define SB_SECTOR                      8
-#define SB_LABEL_SIZE                  32
-#define MAX_CACHES_PER_SET             64
-
-#define BDEV_DATA_START_DEFAULT                16      /* sectors */
+#define BCH_SB_SECTOR                  8
+#define BCH_SB_LABEL_SIZE              32
+#define BCH_SB_MEMBERS_MAX             64 /* XXX kill */
 
-struct cache_member {
+struct bch_member {
        uuid_le                 uuid;
        __le64                  nbuckets;       /* device size */
        __le16                  first_bucket;   /* index of first bucket used */
@@ -663,164 +772,257 @@ struct cache_member {
        __le32                  pad;
        __le64                  last_mount;     /* time_t */
 
-       __le64                  f1;
-       __le64                  f2;
+       __le64                  flags[2];
 };
 
-LE64_BITMASK(CACHE_STATE,      struct cache_member, f1, 0,  4)
-#define CACHE_ACTIVE                   0U
-#define CACHE_RO                       1U
-#define CACHE_FAILED                   2U
-#define CACHE_SPARE                    3U
-#define CACHE_STATE_NR                 4U
+LE64_BITMASK(BCH_MEMBER_STATE,         struct bch_member, flags[0],  0,  4)
+LE64_BITMASK(BCH_MEMBER_TIER,          struct bch_member, flags[0],  4,  8)
+LE64_BITMASK(BCH_MEMBER_HAS_METADATA,  struct bch_member, flags[0],  8,  9)
+LE64_BITMASK(BCH_MEMBER_HAS_DATA,      struct bch_member, flags[0],  9, 10)
+LE64_BITMASK(BCH_MEMBER_REPLACEMENT,   struct bch_member, flags[0], 10, 14)
+LE64_BITMASK(BCH_MEMBER_DISCARD,       struct bch_member, flags[0], 14, 15);
 
-LE64_BITMASK(CACHE_TIER,               struct cache_member, f1, 4,  8)
-#define CACHE_TIERS                    4U
+#if 0
+LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS,        struct bch_member, flags[1], 0,  20);
+LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40);
+#endif
 
-LE64_BITMASK(CACHE_REPLICATION_SET,    struct cache_member, f1, 8,  16)
+enum bch_member_state {
+       BCH_MEMBER_STATE_ACTIVE         = 0,
+       BCH_MEMBER_STATE_RO             = 1,
+       BCH_MEMBER_STATE_FAILED         = 2,
+       BCH_MEMBER_STATE_SPARE          = 3,
+       BCH_MEMBER_STATE_NR             = 4,
+};
 
-LE64_BITMASK(CACHE_HAS_METADATA,       struct cache_member, f1, 24, 25)
-LE64_BITMASK(CACHE_HAS_DATA,           struct cache_member, f1, 25, 26)
+#define BCH_TIER_MAX                   4U
 
-LE64_BITMASK(CACHE_REPLACEMENT,        struct cache_member, f1, 26, 30)
-#define CACHE_REPLACEMENT_LRU          0U
-#define CACHE_REPLACEMENT_FIFO         1U
-#define CACHE_REPLACEMENT_RANDOM       2U
-#define CACHE_REPLACEMENT_NR           3U
+enum cache_replacement {
+       CACHE_REPLACEMENT_LRU           = 0,
+       CACHE_REPLACEMENT_FIFO          = 1,
+       CACHE_REPLACEMENT_RANDOM        = 2,
+       CACHE_REPLACEMENT_NR            = 3,
+};
 
-LE64_BITMASK(CACHE_DISCARD,            struct cache_member, f1, 30, 31);
+struct bch_sb_layout {
+       uuid_le                 magic;  /* bcache superblock UUID */
+       __u8                    layout_type;
+       __u8                    sb_max_size_bits; /* log2 of max superblock size, in 512 byte sectors */
+       __u8                    nr_superblocks;
+       __u8                    pad[5];
+       __u64                   sb_offset[61];
+} __attribute__((packed));
 
-LE64_BITMASK(CACHE_NR_READ_ERRORS,     struct cache_member, f2, 0,  20);
-LE64_BITMASK(CACHE_NR_WRITE_ERRORS,    struct cache_member, f2, 20, 40);
+#define BCH_SB_LAYOUT_SECTOR   7
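Under that reading, sb_max_size_bits bounds how large a superblock (with its variable-length fields) may grow, and sb_offset[] records where each redundant copy lives. A sketch of the arithmetic (helper names hypothetical, sb_offset assumed to be in sectors):

#include <stdint.h>

static inline uint64_t sb_max_bytes(uint8_t sb_max_size_bits)
{
	return 512ULL << sb_max_size_bits;	/* sectors -> bytes */
}

static inline uint64_t sb_copy_byte_offset(const uint64_t sb_offset[61],
					   unsigned i)
{
	return sb_offset[i] << 9;		/* 512-byte sectors */
}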
 
-struct cache_sb {
-       __le64                  csum;
-       __le64                  offset; /* sector where this sb was written */
-       __le64                  version; /* of on disk format */
+struct bch_sb_field {
+       __u64                   _data[0];
+       __le32                  u64s;
+       __le32                  type;
+};
 
-       uuid_le                 magic;  /* bcache superblock UUID */
+enum bch_sb_field_types {
+       BCH_SB_FIELD_journal    = 0,
+       BCH_SB_FIELD_members    = 1,
+       BCH_SB_FIELD_crypt      = 2,
+       BCH_SB_FIELD_NR         = 3,
+};
 
-       /* Identifies this disk within the cache set: */
-       uuid_le                 disk_uuid;
+struct bch_sb_field_journal {
+       struct bch_sb_field     field;
+       __le64                  buckets[0];
+};
 
-       /*
-        * Internal cache set UUID - xored with various magic numbers and thus
-        * must never change:
-        */
-       union {
-               uuid_le         set_uuid;
-               __le64          set_magic;
-       };
+struct bch_sb_field_members {
+       struct bch_sb_field     field;
+       struct bch_member       members[0];
+};
+
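Superblock fields are now a self-describing type/length vector: each bch_sb_field carries its total size in u64s, so an old reader can step over field types it doesn't know. A sketch of the walk (stand-in struct with the header fields inline; assumes u64s counts the whole field, header included):

#include <stdint.h>

struct sb_field {		/* stand-in for struct bch_sb_field */
	uint32_t u64s;		/* total field size, in u64s */
	uint32_t type;		/* enum bch_sb_field_types */
};

/* Step to the next field; the caller bounds-checks against the
 * superblock's overall u64s count before dereferencing. */
static inline struct sb_field *sb_field_next(struct sb_field *f)
{
	return (struct sb_field *) ((uint64_t *) f + f->u64s);
}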
+/* Crypto: */
 
-       __u8                    label[SB_LABEL_SIZE];
+struct nonce {
+       __le32                  d[4];
+};
+
+struct bch_key {
+       __le64                  key[4];
+};
+
+#define BCH_KEY_MAGIC                                  \
+       (((u64) 'b' <<  0)|((u64) 'c' <<  8)|           \
+        ((u64) 'h' << 16)|((u64) '*' << 24)|           \
+        ((u64) '*' << 32)|((u64) 'k' << 40)|           \
+        ((u64) 'e' << 48)|((u64) 'y' << 56))
+
+struct bch_encrypted_key {
+       __le64                  magic;
+       struct bch_key          key;
+};
+
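BCH_KEY_MAGIC is the string "bch**key" laid out as a little-endian u64: after unwrapping a bch_encrypted_key with the user-derived key, a matching magic is what distinguishes the right passphrase from garbage. A self-contained demonstration of that byte layout:

#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
	uint64_t magic =
		(uint64_t) 'b' <<  0 | (uint64_t) 'c' <<  8 |
		(uint64_t) 'h' << 16 | (uint64_t) '*' << 24 |
		(uint64_t) '*' << 32 | (uint64_t) 'k' << 40 |
		(uint64_t) 'e' << 48 | (uint64_t) 'y' << 56;

	/* On a little-endian machine the constant is literally the
	 * eight bytes "bch**key": */
	assert(!memcmp(&magic, "bch**key", 8));
	return 0;
}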
+/*
+ * If this field is present in the superblock, it stores an encryption key which
+ * is used to encrypt all other data/metadata. The key will normally be encrypted
+ * with the key userspace provides, but if encryption has been turned off we'll
+ * just store the master key unencrypted in the superblock so we can access the
+ * previously encrypted data.
+ */
+struct bch_sb_field_crypt {
+       struct bch_sb_field     field;
 
        __le64                  flags;
+       __le64                  kdf_flags;
+       struct bch_encrypted_key key;
+};
 
-       /* Incremented each time superblock is written: */
-       __le64                  seq;
+LE64_BITMASK(BCH_CRYPT_KDF_TYPE,       struct bch_sb_field_crypt, flags, 0, 4);
 
-       /*
-        * User visible UUID for identifying the cache set the user is allowed
-        * to change:
-        */
-       uuid_le                 user_uuid;
+enum bch_kdf_types {
+       BCH_KDF_SCRYPT          = 0,
+       BCH_KDF_NR              = 1,
+};
 
-       __le64                  flags2;
-       __le64                  pad1[5];
+/* stored as base 2 log of scrypt params: */
+LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags,  0, 16);
+LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32);
+LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48);
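Since the scrypt cost parameters are stored as base-2 logs, recovering the working values is just a shift; a sketch (the extraction hard-codes the bit ranges declared above):

#include <stdint.h>

static inline void scrypt_params(uint64_t kdf_flags,
				 uint64_t *N, uint64_t *r, uint64_t *p)
{
	*N = 1ULL << ( kdf_flags        & 0xffff);	/* bits  0..16 */
	*r = 1ULL << ((kdf_flags >> 16) & 0xffff);	/* bits 16..32 */
	*p = 1ULL << ((kdf_flags >> 32) & 0xffff);	/* bits 32..48 */
}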
 
-       /* Number of cache_member entries: */
-       __u8                    nr_in_set;
+/*
+ * @offset     - sector where this sb was written
+ * @version    - on disk format version
+ * @magic      - identifies as a bcache superblock (BCACHE_MAGIC)
+ * @uuid       - used for generating various magic numbers and identifying
+ *                member devices, never changes
+ * @user_uuid  - user visible UUID, may be changed
+ * @label      - filesystem label
+ * @seq                - identifies most recent superblock, incremented each time
+ *               superblock is written
+ * @features   - enabled incompatible features
+ */
+struct bch_sb {
+       struct bch_csum         csum;
+       __le64                  version;
+       uuid_le                 magic;
+       uuid_le                 uuid;
+       uuid_le                 user_uuid;
+       __u8                    label[BCH_SB_LABEL_SIZE];
+       __le64                  offset;
+       __le64                  seq;
 
-       /*
-        * Index of this device - for PTR_DEV(), and also this device's
-        * slot in the cache_member array:
-        */
-       __u8                    nr_this_dev;
-       __le16                  pad2[3];
+       __le16                  block_size;
+       __u8                    dev_idx;
+       __u8                    nr_devices;
+       __le32                  u64s;
 
-       __le16                  block_size;     /* sectors */
-       __le16                  pad3[6];
+       __le64                  time_base_lo;
+       __le32                  time_base_hi;
+       __le32                  time_precision;
+
+       __le64                  flags[8];
+       __le64                  features[2];
+       __le64                  compat[2];
 
-       __le16                  u64s;   /* size of variable length portion */
+       struct bch_sb_layout    layout;
 
        union {
-               struct cache_member     members[0];
-               /*
-                * Journal buckets also in the variable length portion, after
-                * the member info:
-                */
-               __le64                  _data[0];
+               struct bch_sb_field start[0];
+               __le64          _data[0];
        };
-};
+} __attribute__((packed, aligned(8)));
 
-/* XXX: rename CACHE_SET -> BCH_FS or something? */
+/*
+ * Flags:
+ * BCH_SB_INITIALIZED  - set on first mount
+ * BCH_SB_CLEAN                - did we shut down cleanly? Just a hint, doesn't affect
+ *                       behaviour of mount/recovery path
+ * BCH_SB_INODE_32BIT  - limit inode numbers to 32 bits
+ * BCH_SB_128_BIT_MACS - use 128 bit MACs instead of 80 bit
+ * BCH_SB_ENCRYPTION_TYPE - if nonzero, encryption is enabled; overrides
+ *                        DATA/META_CSUM_TYPE. Also indicates encryption
+ *                        algorithm in use, if/when we get more than one
+ */
 
-LE64_BITMASK(CACHE_SET_SYNC,           struct cache_sb, flags, 0, 1);
+LE64_BITMASK(BCH_SB_INITIALIZED,       struct bch_sb, flags[0],  0,  1);
+LE64_BITMASK(BCH_SB_CLEAN,             struct bch_sb, flags[0],  1,  2);
+LE64_BITMASK(BCH_SB_CSUM_TYPE,         struct bch_sb, flags[0],  2,  8);
+LE64_BITMASK(BCH_SB_ERROR_ACTION,      struct bch_sb, flags[0],  8, 12);
 
-LE64_BITMASK(CACHE_SET_ERROR_ACTION,   struct cache_sb, flags, 1, 4);
-#define BCH_ON_ERROR_CONTINUE          0U
-#define BCH_ON_ERROR_RO                        1U
-#define BCH_ON_ERROR_PANIC             2U
-#define BCH_NR_ERROR_ACTIONS           3U
+LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE,   struct bch_sb, flags[0], 12, 28);
 
-LE64_BITMASK(CACHE_SET_META_REPLICAS_WANT,struct cache_sb, flags, 4, 8);
-LE64_BITMASK(CACHE_SET_DATA_REPLICAS_WANT,struct cache_sb, flags, 8, 12);
+LE64_BITMASK(BCH_SB_GC_RESERVE,                struct bch_sb, flags[0], 28, 33);
+LE64_BITMASK(BCH_SB_ROOT_RESERVE,      struct bch_sb, flags[0], 33, 40);
 
-#define BCH_REPLICAS_MAX               4U
+LE64_BITMASK(BCH_SB_META_CSUM_TYPE,    struct bch_sb, flags[0], 40, 44);
+LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE,    struct bch_sb, flags[0], 44, 48);
 
-LE64_BITMASK(CACHE_SB_CSUM_TYPE,       struct cache_sb, flags, 12, 16);
+LE64_BITMASK(BCH_SB_META_REPLICAS_WANT,        struct bch_sb, flags[0], 48, 52);
+LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT,        struct bch_sb, flags[0], 52, 56);
 
-LE64_BITMASK(CACHE_SET_META_PREFERRED_CSUM_TYPE,struct cache_sb, flags, 16, 20);
-#define BCH_CSUM_NONE                  0U
-#define BCH_CSUM_CRC32C                        1U
-#define BCH_CSUM_CRC64                 2U
-#define BCH_CSUM_NR                    3U
+LE64_BITMASK(BCH_SB_META_REPLICAS_HAVE,        struct bch_sb, flags[0], 56, 60);
+LE64_BITMASK(BCH_SB_DATA_REPLICAS_HAVE,        struct bch_sb, flags[0], 60, 64);
 
-LE64_BITMASK(CACHE_SET_BTREE_NODE_SIZE,        struct cache_sb, flags, 20, 36);
+LE64_BITMASK(BCH_SB_STR_HASH_TYPE,     struct bch_sb, flags[1],  0,  4);
+LE64_BITMASK(BCH_SB_COMPRESSION_TYPE,  struct bch_sb, flags[1],  4,  8);
+LE64_BITMASK(BCH_SB_INODE_32BIT,       struct bch_sb, flags[1],  8,  9);
 
-LE64_BITMASK(CACHE_SET_META_REPLICAS_HAVE,struct cache_sb, flags, 36, 40);
-LE64_BITMASK(CACHE_SET_DATA_REPLICAS_HAVE,struct cache_sb, flags, 40, 44);
+LE64_BITMASK(BCH_SB_128_BIT_MACS,      struct bch_sb, flags[1],  9, 10);
+LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE,   struct bch_sb, flags[1], 10, 14);
+LE64_BITMASK(BCH_SB_JOURNAL_ENTRY_SIZE,        struct bch_sb, flags[1], 14, 20);
 
-LE64_BITMASK(CACHE_SET_STR_HASH_TYPE,struct cache_sb, flags, 44, 48);
-enum bch_str_hash_type {
-       BCH_STR_HASH_CRC32C             = 0,
-       BCH_STR_HASH_CRC64              = 1,
-       BCH_STR_HASH_SIPHASH            = 2,
-       BCH_STR_HASH_SHA1               = 3,
+/* Features: */
+enum bch_sb_features {
+       BCH_FEATURE_LZ4                 = 0,
+       BCH_FEATURE_GZIP                = 1,
 };
 
-#define BCH_STR_HASH_NR                        4
+/* options: */
 
-LE64_BITMASK(CACHE_SET_DATA_PREFERRED_CSUM_TYPE, struct cache_sb, flags, 48, 52);
+#define BCH_REPLICAS_MAX               4U
 
-LE64_BITMASK(CACHE_SET_COMPRESSION_TYPE, struct cache_sb, flags, 52, 56);
-enum {
-       BCH_COMPRESSION_NONE            = 0,
-       BCH_COMPRESSION_LZ4             = 1,
-       BCH_COMPRESSION_GZIP            = 2,
+#if 0
+#define BCH_ERROR_ACTIONS()                                    \
+       x(BCH_ON_ERROR_CONTINUE,        0, "continue")          \
+       x(BCH_ON_ERROR_RO,              1, "remount-ro")        \
+       x(BCH_ON_ERROR_PANIC,           2, "panic")             \
+       x(BCH_NR_ERROR_ACTIONS,         3, NULL)
+
+enum bch_error_actions {
+#define x(_opt, _nr, _str)     _opt = _nr,
+       BCH_ERROR_ACTIONS()
+#undef x
 };
+#endif
 
-#define BCH_COMPRESSION_NR             3U
-
-/* Limit inode numbers to 32 bits: */
-LE64_BITMASK(CACHE_INODE_32BIT,                struct cache_sb, flags, 56, 57);
-
-LE64_BITMASK(CACHE_SET_GC_RESERVE,     struct cache_sb, flags, 57, 63);
-
-LE64_BITMASK(CACHE_SET_ROOT_RESERVE,   struct cache_sb, flags2, 0,  6);
+enum bch_error_actions {
+       BCH_ON_ERROR_CONTINUE           = 0,
+       BCH_ON_ERROR_RO                 = 1,
+       BCH_ON_ERROR_PANIC              = 2,
+       BCH_NR_ERROR_ACTIONS            = 3,
+};
 
-/*
- * Did we shut down cleanly? Just a hint, doesn't affect behaviour of
- * mount/recovery path:
- */
-LE64_BITMASK(CACHE_SET_CLEAN,          struct cache_sb, flags2, 6, 7);
+enum bch_csum_opts {
+       BCH_CSUM_OPT_NONE               = 0,
+       BCH_CSUM_OPT_CRC32C             = 1,
+       BCH_CSUM_OPT_CRC64              = 2,
+       BCH_CSUM_OPT_NR                 = 3,
+};
 
-LE64_BITMASK(CACHE_SET_JOURNAL_ENTRY_SIZE, struct cache_sb, flags2, 7, 15);
+enum bch_str_hash_opts {
+       BCH_STR_HASH_CRC32C             = 0,
+       BCH_STR_HASH_CRC64              = 1,
+       BCH_STR_HASH_SIPHASH            = 2,
+       BCH_STR_HASH_NR                 = 3,
+};
 
-/* options: */
+enum bch_compression_opts {
+       BCH_COMPRESSION_NONE            = 0,
+       BCH_COMPRESSION_LZ4             = 1,
+       BCH_COMPRESSION_GZIP            = 2,
+       BCH_COMPRESSION_NR              = 3,
+};
 
 /**
- * CACHE_SET_OPT(name, choices, min, max, sb_option, sysfs_writeable)
+ * BCH_OPT(name, choices, min, max, sb_option, sysfs_writeable)
  *
  * @name - name of mount option, sysfs attribute, and struct cache_set_opts
  *     member
@@ -838,56 +1040,60 @@ LE64_BITMASK(CACHE_SET_JOURNAL_ENTRY_SIZE, struct cache_sb, flags2, 7, 15);
  * @sysfs_writeable - if true, option will be modifiable at runtime via sysfs
  */
 
-#define CACHE_SET_SB_OPTS()                                    \
-       CACHE_SET_OPT(errors,                                   \
-                     bch_error_actions,                        \
-                     0, BCH_NR_ERROR_ACTIONS,                  \
-                     CACHE_SET_ERROR_ACTION,                   \
-                     true)                                     \
-       CACHE_SET_OPT(metadata_replicas,                        \
-                     bch_uint_opt,                             \
-                     0, BCH_REPLICAS_MAX,                      \
-                     CACHE_SET_META_REPLICAS_WANT,             \
-                     false)                                    \
-       CACHE_SET_OPT(data_replicas,                            \
-                     bch_uint_opt,                             \
-                     0, BCH_REPLICAS_MAX,                      \
-                     CACHE_SET_DATA_REPLICAS_WANT,             \
-                     false)                                    \
-       CACHE_SET_OPT(metadata_checksum,                        \
-                     bch_csum_types,                           \
-                     0, BCH_CSUM_NR,                           \
-                     CACHE_SET_META_PREFERRED_CSUM_TYPE,       \
-                     true)                                     \
-       CACHE_SET_OPT(data_checksum,                            \
-                     bch_csum_types,                           \
-                     0, BCH_CSUM_NR,                           \
-                     CACHE_SET_DATA_PREFERRED_CSUM_TYPE,       \
-                     true)                                     \
-       CACHE_SET_OPT(compression,                              \
-                     bch_compression_types,                    \
-                     0, BCH_COMPRESSION_NR,                    \
-                     CACHE_SET_COMPRESSION_TYPE,               \
-                     true)                                     \
-       CACHE_SET_OPT(str_hash,                                 \
-                     bch_str_hash_types,                       \
-                     0, BCH_STR_HASH_NR,                       \
-                     CACHE_SET_STR_HASH_TYPE,                  \
-                     true)                                     \
-       CACHE_SET_OPT(inodes_32bit,                             \
-                     bch_bool_opt, 0, 2,                       \
-                     CACHE_INODE_32BIT,                        \
-                     true)                                     \
-       CACHE_SET_OPT(gc_reserve_percent,                       \
-                     bch_uint_opt,                             \
-                     5, 21,                                    \
-                     CACHE_SET_GC_RESERVE,                     \
-                     false)                                    \
-       CACHE_SET_OPT(root_reserve_percent,                     \
-                     bch_uint_opt,                             \
-                     0, 21,                                    \
-                     CACHE_SET_ROOT_RESERVE,                   \
-                     false)
+#define BCH_SB_OPTS()                                          \
+       BCH_OPT(errors,                                         \
+               bch_error_actions,                              \
+               0, BCH_NR_ERROR_ACTIONS,                        \
+               BCH_SB_ERROR_ACTION,                            \
+               true)                                           \
+       BCH_OPT(metadata_replicas,                              \
+               bch_uint_opt,                                   \
+               0, BCH_REPLICAS_MAX,                            \
+               BCH_SB_META_REPLICAS_WANT,                      \
+               false)                                          \
+       BCH_OPT(data_replicas,                                  \
+               bch_uint_opt,                                   \
+               0, BCH_REPLICAS_MAX,                            \
+               BCH_SB_DATA_REPLICAS_WANT,                      \
+               false)                                          \
+       BCH_OPT(metadata_checksum,                              \
+               bch_csum_types,                                 \
+               0, BCH_CSUM_OPT_NR,                             \
+               BCH_SB_META_CSUM_TYPE,                          \
+               true)                                           \
+       BCH_OPT(data_checksum,                                  \
+               bch_csum_types,                                 \
+               0, BCH_CSUM_OPT_NR,                             \
+               BCH_SB_DATA_CSUM_TYPE,                          \
+               true)                                           \
+       BCH_OPT(compression,                                    \
+               bch_compression_types,                          \
+               0, BCH_COMPRESSION_NR,                          \
+               BCH_SB_COMPRESSION_TYPE,                        \
+               true)                                           \
+       BCH_OPT(str_hash,                                       \
+               bch_str_hash_types,                             \
+               0, BCH_STR_HASH_NR,                             \
+               BCH_SB_STR_HASH_TYPE,                           \
+               true)                                           \
+       BCH_OPT(inodes_32bit,                                   \
+               bch_bool_opt, 0, 2,                             \
+               BCH_SB_INODE_32BIT,                             \
+               true)                                           \
+       BCH_OPT(gc_reserve_percent,                             \
+               bch_uint_opt,                                   \
+               5, 21,                                          \
+               BCH_SB_GC_RESERVE,                              \
+               false)                                          \
+       BCH_OPT(root_reserve_percent,                           \
+               bch_uint_opt,                                   \
+               0, 100,                                         \
+               BCH_SB_ROOT_RESERVE,                            \
+               false)                                          \
+       BCH_OPT(wide_macs,                                      \
+               bch_bool_opt, 0, 2,                             \
+               BCH_SB_128_BIT_MACS,                            \
+               true)
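BCH_SB_OPTS() is consumed the same way as the inode field list: each use site redefines BCH_OPT() and expands the list once. A sketch generating a table of option names (array name illustrative):

/* One string per option, in declaration order. */
static const char * const sb_opt_names[] = {
#define BCH_OPT(_name, _choices, _min, _max, _sb_opt, _writeable)	\
	#_name,
	BCH_SB_OPTS()
#undef BCH_OPT
};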
 
 /* backing device specific stuff: */
 
@@ -908,7 +1114,7 @@ struct backingdev_sb {
                uuid_le         set_uuid;
                __le64          set_magic;
        };
-       __u8                    label[SB_LABEL_SIZE];
+       __u8                    label[BCH_SB_LABEL_SIZE];
 
        __le64                  flags;
 
@@ -947,15 +1153,7 @@ LE64_BITMASK(BDEV_STATE,          struct backingdev_sb, flags, 61, 63);
 #define BDEV_STATE_DIRTY               2U
 #define BDEV_STATE_STALE               3U
 
-static inline unsigned bch_journal_buckets_offset(struct cache_sb *sb)
-{
-       return sb->nr_in_set * (sizeof(struct cache_member) / sizeof(__u64));
-}
-
-static inline unsigned bch_nr_journal_buckets(struct cache_sb *sb)
-{
-       return __le16_to_cpu(sb->u64s) - bch_journal_buckets_offset(sb);
-}
+#define BDEV_DATA_START_DEFAULT                16      /* sectors */
 
 static inline _Bool __SB_IS_BDEV(__u64 version)
 {
@@ -963,7 +1161,7 @@ static inline _Bool __SB_IS_BDEV(__u64 version)
                || version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET;
 }
 
-static inline _Bool SB_IS_BDEV(const struct cache_sb *sb)
+static inline _Bool SB_IS_BDEV(const struct bch_sb *sb)
 {
        return __SB_IS_BDEV(sb->version);
 }
@@ -981,29 +1179,33 @@ static inline _Bool SB_IS_BDEV(const struct cache_sb *sb)
 
 #define BCACHE_STATFS_MAGIC            0xca451a4e
 
-#define BCACHE_SB_MAGIC                        0xca451a4ef67385c6ULL
-#define BCACHE_SB_MAGIC2               0x816dba487ff56582ULL
-#define JSET_MAGIC                     0x245235c1a3625032ULL
-#define PSET_MAGIC                     0x6750e15f87337f91ULL
-#define BSET_MAGIC                     0x90135c78b99e07f5ULL
+#define JSET_MAGIC             __cpu_to_le64(0x245235c1a3625032ULL)
+#define PSET_MAGIC             __cpu_to_le64(0x6750e15f87337f91ULL)
+#define BSET_MAGIC             __cpu_to_le64(0x90135c78b99e07f5ULL)
 
-static inline __u64 jset_magic(struct cache_sb *sb)
+static inline __le64 __bch_sb_magic(struct bch_sb *sb)
 {
-       return __le64_to_cpu(sb->set_magic) ^ JSET_MAGIC;
+       __le64 ret;
+       memcpy(&ret, &sb->uuid, sizeof(ret));
+       return ret;
 }
 
-static inline __u64 pset_magic(struct cache_sb *sb)
+static inline __u64 __jset_magic(struct bch_sb *sb)
 {
-       return __le64_to_cpu(sb->set_magic) ^ PSET_MAGIC;
+       return __le64_to_cpu(__bch_sb_magic(sb) ^ JSET_MAGIC);
 }
 
-static inline __u64 bset_magic(struct cache_sb *sb)
+static inline __u64 __pset_magic(struct bch_sb *sb)
 {
-       return __le64_to_cpu(sb->set_magic) ^ BSET_MAGIC;
+       return __le64_to_cpu(__bch_sb_magic(sb) ^ PSET_MAGIC);
 }
 
-/* Journal */
+static inline __u64 __bset_magic(struct bch_sb *sb)
+{
+       return __le64_to_cpu(__bch_sb_magic(sb) ^ BSET_MAGIC);
+}
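The jset/pset/bset magics are now derived per filesystem: the first eight bytes of the superblock UUID, XORed with a per-structure constant, so blocks from one filesystem can't be mistaken for another's. A standalone sketch of the derivation (little-endian math, as in __jset_magic() above; names are stand-ins):

#include <stdint.h>
#include <string.h>

#define JSET_MAGIC_CONST	0x245235c1a3625032ULL

static inline uint64_t jset_magic_of(const uint8_t uuid[16])
{
	uint64_t sb_magic;

	memcpy(&sb_magic, uuid, sizeof(sb_magic));	/* first 8 bytes */
	return sb_magic ^ JSET_MAGIC_CONST;
}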
 
+/* Journal */
 
 #define BCACHE_JSET_VERSION_UUIDv1     1
 #define BCACHE_JSET_VERSION_UUID       1       /* Always latest UUID format */
@@ -1054,24 +1256,29 @@ enum {
  * version is for on disk format changes.
  */
 struct jset {
-       __le64                  csum;
+       struct bch_csum         csum;
+
        __le64                  magic;
+       __le64                  seq;
        __le32                  version;
        __le32                  flags;
 
-       /* Sequence number of oldest dirty journal entry */
-       __le64                  seq;
-       __le64                  last_seq;
+       __le32                  u64s; /* size of d[] in u64s */
+
+       __u8                    encrypted_start[0];
 
        __le16                  read_clock;
        __le16                  write_clock;
-       __le32                  u64s; /* size of d[] in u64s */
+
+       /* Sequence number of oldest dirty journal entry */
+       __le64                  last_seq;
 
        union {
                struct jset_entry start[0];
                __u64           _data[0];
        };
-};
+} __attribute__((packed));
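Everything from encrypted_start onward is ciphertext on disk; csum, magic, seq, version, flags and u64s stay in the clear so a journal entry can be located, validated and sized before decryption. A sketch of computing the cipher length (stand-in unpacked struct mirroring the layout above; assumes bch_csum is two u64s and that the encrypted region runs through the end of the entries):

#include <stddef.h>
#include <stdint.h>

struct jset_hdr {
	uint64_t csum_lo, csum_hi;
	uint64_t magic;
	uint64_t seq;
	uint32_t version;
	uint32_t flags;
	uint32_t u64s;		/* size of entries, in u64s */
	/* encrypted from here on: */
	uint16_t read_clock;
	uint16_t write_clock;
	uint64_t last_seq;
	uint64_t _data[];
};

static inline size_t jset_encrypted_bytes(const struct jset_hdr *j)
{
	return offsetof(struct jset_hdr, _data)
		- offsetof(struct jset_hdr, read_clock)
		+ (size_t) j->u64s * sizeof(uint64_t);
}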
 
 LE32_BITMASK(JSET_CSUM_TYPE,   struct jset, flags, 0, 4);
 LE32_BITMASK(JSET_BIG_ENDIAN,  struct jset, flags, 4, 5);
@@ -1081,10 +1288,14 @@ LE32_BITMASK(JSET_BIG_ENDIAN,   struct jset, flags, 4, 5);
 /* Bucket prios/gens */
 
 struct prio_set {
-       __le64                  csum;
+       struct bch_csum         csum;
+
        __le64                  magic;
-       __le32                  version;
-       __le32                  flags;
+       __le32                  nonce[3];
+       __le16                  version;
+       __le16                  flags;
+
+       __u8                    encrypted_start[0];
 
        __le64                  next_bucket;
 
@@ -1093,7 +1304,7 @@ struct prio_set {
                __le16          write_prio;
                __u8            gen;
        } __attribute__((packed)) data[];
-};
+} __attribute__((packed));
 
 LE32_BITMASK(PSET_CSUM_TYPE,   struct prio_set, flags, 0, 4);
 
@@ -1155,28 +1366,49 @@ struct bset {
 
 LE32_BITMASK(BSET_CSUM_TYPE,   struct bset, flags, 0, 4);
 
-/* Only used in first bset */
-LE32_BITMASK(BSET_BTREE_LEVEL, struct bset, flags, 4, 8);
-
-LE32_BITMASK(BSET_BIG_ENDIAN,  struct bset, flags, 8, 9);
+LE32_BITMASK(BSET_BIG_ENDIAN,  struct bset, flags, 4, 5);
 LE32_BITMASK(BSET_SEPARATE_WHITEOUTS,
-                               struct bset, flags, 9, 10);
+                               struct bset, flags, 5, 6);
 
 struct btree_node {
-       __le64                  csum;
+       struct bch_csum         csum;
        __le64                  magic;
 
+       /* this flags field is encrypted, unlike bset->flags: */
+       __le64                  flags;
+
        /* Closed interval: */
        struct bpos             min_key;
        struct bpos             max_key;
+       struct bch_extent_ptr   ptr;
        struct bkey_format      format;
 
+       union {
        struct bset             keys;
+       struct {
+               __u8            pad[22];
+               __le16          u64s;
+               __u64           _data[0];
+       };
+       };
 } __attribute__((packed));
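The anonymous struct in the union is there so a node can be sized before its keys are decrypted: pad[22] is expected to cover everything in struct bset up to its u64s field, leaving u64s and _data readable through either view. A sketch of the invariant the overlay relies on (stand-in types):

#include <stddef.h>
#include <stdint.h>

struct bset_like {			/* stand-in for struct bset */
	uint8_t		hdr[22];	/* fields preceding u64s */
	uint16_t	u64s;
	uint64_t	_data[];
} __attribute__((packed));

struct overlay {			/* the union's second arm */
	uint8_t		pad[22];
	uint16_t	u64s;
	uint64_t	_data[];
} __attribute__((packed));

_Static_assert(offsetof(struct bset_like, u64s) ==
	       offsetof(struct overlay, u64s),
	       "both union arms must agree on where u64s lives");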
 
+LE64_BITMASK(BTREE_NODE_ID,    struct btree_node, flags, 0, 4);
+LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8);
+
 struct btree_node_entry {
-       __le64                  csum;
+       struct bch_csum         csum;
+
+       union {
        struct bset             keys;
+       struct {
+               __u8            pad[22];
+               __le16          u64s;
+               __u64           _data[0];
+       };
+       };
 } __attribute__((packed));
 
 /* OBSOLETE */
@@ -1237,7 +1469,7 @@ struct jset_v0 {
        __u16                   btree_level;
        __u16                   pad[3];
 
-       __u64                   prio_bucket[MAX_CACHES_PER_SET];
+       __u64                   prio_bucket[64];
 
        union {
                struct bkey     start[0];
index cb9ad24fef00f70f0af282670943aa14948ff772..0dbeaaedff33637739c1a5ae95b70adae38e63ce 100644 (file)
 #include <linux/slab.h>
 #include <linux/string.h>
 
-/*
- * Autoloaded crypto modules should only use a prefixed name to avoid allowing
- * arbitrary modules to be loaded. Loading from userspace may still need the
- * unprefixed names, so retains those aliases as well.
- * This uses __MODULE_INFO directly instead of MODULE_ALIAS because pre-4.3
- * gcc (e.g. avr32 toolchain) uses __LINE__ for uniqueness, and this macro
- * expands twice on the same line. Instead, use a separate base name for the
- * alias.
- */
-#define MODULE_ALIAS_CRYPTO(name)      \
-               __MODULE_INFO(alias, alias_userspace, name);    \
-               __MODULE_INFO(alias, alias_crypto, "crypto-" name)
-
-/*
- * Algorithm masks and types.
- */
 #define CRYPTO_ALG_TYPE_MASK           0x0000000f
-#define CRYPTO_ALG_TYPE_CIPHER         0x00000001
-#define CRYPTO_ALG_TYPE_AEAD           0x00000003
 #define CRYPTO_ALG_TYPE_BLKCIPHER      0x00000004
-#define CRYPTO_ALG_TYPE_ABLKCIPHER     0x00000005
-#define CRYPTO_ALG_TYPE_SKCIPHER       0x00000005
-#define CRYPTO_ALG_TYPE_GIVCIPHER      0x00000006
-#define CRYPTO_ALG_TYPE_KPP            0x00000008
-#define CRYPTO_ALG_TYPE_RNG            0x0000000c
-#define CRYPTO_ALG_TYPE_AKCIPHER       0x0000000d
-#define CRYPTO_ALG_TYPE_DIGEST         0x0000000e
-#define CRYPTO_ALG_TYPE_HASH           0x0000000e
 #define CRYPTO_ALG_TYPE_SHASH          0x0000000e
-#define CRYPTO_ALG_TYPE_AHASH          0x0000000f
-
-#define CRYPTO_ALG_TYPE_HASH_MASK      0x0000000e
-#define CRYPTO_ALG_TYPE_AHASH_MASK     0x0000000e
 #define CRYPTO_ALG_TYPE_BLKCIPHER_MASK 0x0000000c
-
 #define CRYPTO_ALG_ASYNC               0x00000080
 
-/*
- * Set this bit if and only if the algorithm requires another algorithm of
- * the same type to handle corner cases.
- */
-#define CRYPTO_ALG_NEED_FALLBACK       0x00000100
-
-/*
- * This bit is set for symmetric key ciphers that have already been wrapped
- * with a generic IV generator to prevent them from being wrapped again.
- */
-#define CRYPTO_ALG_GENIV               0x00000200
-
-/*
- * Set if the algorithm is an instance that is build from templates.
- */
-#define CRYPTO_ALG_INSTANCE            0x00000800
-
-/* Set this bit if the algorithm provided is hardware accelerated but
- * not available to userspace via instruction set or so.
- */
-#define CRYPTO_ALG_KERN_DRIVER_ONLY    0x00001000
-
-/*
- * Mark a cipher as a service implementation only usable by another
- * cipher and never by a normal user of the kernel crypto API
- */
-#define CRYPTO_ALG_INTERNAL            0x00002000
-
-/*
- * Transform masks and values (for crt_flags).
- */
-#define CRYPTO_TFM_REQ_MASK            0x000fff00
-#define CRYPTO_TFM_RES_MASK            0xfff00000
-
-#define CRYPTO_TFM_REQ_WEAK_KEY                0x00000100
-#define CRYPTO_TFM_REQ_MAY_SLEEP       0x00000200
-#define CRYPTO_TFM_REQ_MAY_BACKLOG     0x00000400
-#define CRYPTO_TFM_RES_WEAK_KEY                0x00100000
-#define CRYPTO_TFM_RES_BAD_KEY_LEN     0x00200000
-#define CRYPTO_TFM_RES_BAD_KEY_SCHED   0x00400000
-#define CRYPTO_TFM_RES_BAD_BLOCK_LEN   0x00800000
-#define CRYPTO_TFM_RES_BAD_FLAGS       0x01000000
-
-/*
- * Miscellaneous stuff.
- */
 #define CRYPTO_MAX_ALG_NAME            64
 
-/*
- * The macro CRYPTO_MINALIGN_ATTR (along with the void * type in the actual
- * declaration) is used to ensure that the crypto_tfm context structure is
- * aligned correctly for the given architecture so that there are no alignment
- * faults for C data types.  In particular, this is required on platforms such
- * as arm where pointers are 32-bit aligned but there are data types such as
- * u64 which require 64-bit alignment.
- */
 #define CRYPTO_MINALIGN ARCH_KMALLOC_MINALIGN
-
 #define CRYPTO_MINALIGN_ATTR __attribute__ ((__aligned__(CRYPTO_MINALIGN)))
 
 struct scatterlist;
 struct crypto_blkcipher;
 struct crypto_tfm;
 struct crypto_type;
-struct skcipher_givcrypt_request;
 
 struct blkcipher_desc {
-       struct crypto_blkcipher *tfm;
-       void *info;
-       u32 flags;
-};
-
-struct cipher_desc {
-       struct crypto_tfm *tfm;
-       void (*crfn)(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
-       unsigned int (*prfn)(const struct cipher_desc *desc, u8 *dst,
-                            const u8 *src, unsigned int nbytes);
-       void *info;
+       struct crypto_blkcipher *tfm;
+       void                    *info;
+       u32                     flags;
 };
 
 struct blkcipher_alg {
        int (*setkey)(struct crypto_tfm *tfm, const u8 *key,
-                     unsigned int keylen);
+                     unsigned keylen);
        int (*encrypt)(struct blkcipher_desc *desc,
                       struct scatterlist *dst, struct scatterlist *src,
-                      unsigned int nbytes);
+                      unsigned nbytes);
        int (*decrypt)(struct blkcipher_desc *desc,
                       struct scatterlist *dst, struct scatterlist *src,
-                      unsigned int nbytes);
-
-       const char *geniv;
-
-       unsigned int min_keysize;
-       unsigned int max_keysize;
-       unsigned int ivsize;
-};
-
-struct cipher_alg {
-       unsigned int cia_min_keysize;
-       unsigned int cia_max_keysize;
-       int (*cia_setkey)(struct crypto_tfm *tfm, const u8 *key,
-                         unsigned int keylen);
-       void (*cia_encrypt)(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
-       void (*cia_decrypt)(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
-};
-
-struct compress_alg {
-       int (*coa_compress)(struct crypto_tfm *tfm, const u8 *src,
-                           unsigned int slen, u8 *dst, unsigned int *dlen);
-       int (*coa_decompress)(struct crypto_tfm *tfm, const u8 *src,
-                             unsigned int slen, u8 *dst, unsigned int *dlen);
+                      unsigned nbytes);
 };
 
-
 #define cra_blkcipher  cra_u.blkcipher
-#define cra_cipher     cra_u.cipher
-#define cra_compress   cra_u.compress
 
 struct crypto_alg {
-       struct list_head cra_list;
-       struct list_head cra_users;
-
-       u32 cra_flags;
-       unsigned int cra_blocksize;
-       unsigned int cra_ctxsize;
-       unsigned int cra_alignmask;
-
-       int cra_priority;
-       atomic_t cra_refcnt;
+       struct list_head        cra_list;
+       struct list_head        cra_users;
 
-       char cra_name[CRYPTO_MAX_ALG_NAME];
-       char cra_driver_name[CRYPTO_MAX_ALG_NAME];
+       u32                     cra_flags;
+       unsigned                cra_ctxsize;
+       char                    cra_name[CRYPTO_MAX_ALG_NAME];
 
        const struct crypto_type *cra_type;
 
        union {
                struct blkcipher_alg blkcipher;
-               struct cipher_alg cipher;
-               struct compress_alg compress;
        } cra_u;
 
        int (*cra_init)(struct crypto_tfm *tfm);
        void (*cra_exit)(struct crypto_tfm *tfm);
-       void (*cra_destroy)(struct crypto_alg *alg);
-
-       struct module *cra_module;
 } CRYPTO_MINALIGN_ATTR;
 
-/*
- * Algorithm registration interface.
- */
 int crypto_register_alg(struct crypto_alg *alg);
-int crypto_unregister_alg(struct crypto_alg *alg);
-int crypto_register_algs(struct crypto_alg *algs, int count);
-int crypto_unregister_algs(struct crypto_alg *algs, int count);
-
-/*
- * Algorithm query interface.
- */
-int crypto_has_alg(const char *name, u32 type, u32 mask);
-
-/*
- * Transforms: user-instantiated objects which encapsulate algorithms
- * and core processing logic.  Managed via crypto_alloc_*() and
- * crypto_free_*(), as well as the various helpers below.
- */
 
 struct blkcipher_tfm {
-       void *iv;
        int (*setkey)(struct crypto_tfm *tfm, const u8 *key,
-                     unsigned int keylen);
+                     unsigned keylen);
        int (*encrypt)(struct blkcipher_desc *desc, struct scatterlist *dst,
-                      struct scatterlist *src, unsigned int nbytes);
+                      struct scatterlist *src, unsigned nbytes);
        int (*decrypt)(struct blkcipher_desc *desc, struct scatterlist *dst,
-                      struct scatterlist *src, unsigned int nbytes);
+                      struct scatterlist *src, unsigned nbytes);
 };
 
-struct cipher_tfm {
-       int (*cit_setkey)(struct crypto_tfm *tfm,
-                         const u8 *key, unsigned int keylen);
-       void (*cit_encrypt_one)(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
-       void (*cit_decrypt_one)(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
-};
-
-struct compress_tfm {
-       int (*cot_compress)(struct crypto_tfm *tfm,
-                           const u8 *src, unsigned int slen,
-                           u8 *dst, unsigned int *dlen);
-       int (*cot_decompress)(struct crypto_tfm *tfm,
-                             const u8 *src, unsigned int slen,
-                             u8 *dst, unsigned int *dlen);
-};
-
-#define crt_blkcipher  crt_u.blkcipher
-#define crt_cipher     crt_u.cipher
-#define crt_compress   crt_u.compress
-
 struct crypto_tfm {
+       u32                     crt_flags;
 
-       u32 crt_flags;
-
-       union {
-               struct blkcipher_tfm blkcipher;
-               struct cipher_tfm cipher;
-               struct compress_tfm compress;
-       } crt_u;
+       struct blkcipher_tfm    crt_blkcipher;
 
        void (*exit)(struct crypto_tfm *tfm);
 
-       struct crypto_alg *__crt_alg;
-
-       void *__crt_ctx[] CRYPTO_MINALIGN_ATTR;
-};
-
-struct crypto_blkcipher {
-       struct crypto_tfm base;
-};
-
-struct crypto_cipher {
-       struct crypto_tfm base;
-};
-
-struct crypto_comp {
-       struct crypto_tfm base;
+       struct crypto_alg       *__crt_alg;
+       void                    *__crt_ctx[] CRYPTO_MINALIGN_ATTR;
 };
 
-enum {
-       CRYPTOA_UNSPEC,
-       CRYPTOA_ALG,
-       CRYPTOA_TYPE,
-       CRYPTOA_U32,
-       __CRYPTOA_MAX,
-};
-
-#define CRYPTOA_MAX (__CRYPTOA_MAX - 1)
-
-/* Maximum number of (rtattr) parameters for each template. */
-#define CRYPTO_MAX_ATTRS 32
-
-struct crypto_attr_alg {
-       char name[CRYPTO_MAX_ALG_NAME];
-};
-
-struct crypto_attr_type {
-       u32 type;
-       u32 mask;
-};
-
-struct crypto_attr_u32 {
-       u32 num;
-};
-
-/* 
- * Transform user interface.
- */
-
 struct crypto_tfm *crypto_alloc_base(const char *alg_name, u32 type, u32 mask);
 void crypto_destroy_tfm(void *mem, struct crypto_tfm *tfm);
 
@@ -325,110 +107,19 @@ static inline void crypto_free_tfm(struct crypto_tfm *tfm)
        return crypto_destroy_tfm(tfm, tfm);
 }
 
-int alg_test(const char *driver, const char *alg, u32 type, u32 mask);
-
-/*
- * Transform helpers which query the underlying algorithm.
- */
-static inline const char *crypto_tfm_alg_name(struct crypto_tfm *tfm)
-{
-       return tfm->__crt_alg->cra_name;
-}
-
-static inline const char *crypto_tfm_alg_driver_name(struct crypto_tfm *tfm)
-{
-       return tfm->__crt_alg->cra_driver_name;
-}
-
-static inline int crypto_tfm_alg_priority(struct crypto_tfm *tfm)
-{
-       return tfm->__crt_alg->cra_priority;
-}
-
 static inline u32 crypto_tfm_alg_type(struct crypto_tfm *tfm)
 {
        return tfm->__crt_alg->cra_flags & CRYPTO_ALG_TYPE_MASK;
 }
 
-static inline unsigned int crypto_tfm_alg_blocksize(struct crypto_tfm *tfm)
-{
-       return tfm->__crt_alg->cra_blocksize;
-}
-
-static inline unsigned int crypto_tfm_alg_alignmask(struct crypto_tfm *tfm)
-{
-       return tfm->__crt_alg->cra_alignmask;
-}
-
-static inline u32 crypto_tfm_get_flags(struct crypto_tfm *tfm)
-{
-       return tfm->crt_flags;
-}
-
-static inline void crypto_tfm_set_flags(struct crypto_tfm *tfm, u32 flags)
-{
-       tfm->crt_flags |= flags;
-}
-
-static inline void crypto_tfm_clear_flags(struct crypto_tfm *tfm, u32 flags)
-{
-       tfm->crt_flags &= ~flags;
-}
-
 static inline void *crypto_tfm_ctx(struct crypto_tfm *tfm)
 {
        return tfm->__crt_ctx;
 }
 
-static inline unsigned int crypto_tfm_ctx_alignment(void)
-{
-       struct crypto_tfm *tfm;
-       return __alignof__(tfm->__crt_ctx);
-}
-
-static inline u32 crypto_skcipher_type(u32 type)
-{
-       type &= ~(CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_GENIV);
-       type |= CRYPTO_ALG_TYPE_BLKCIPHER;
-       return type;
-}
-
-static inline u32 crypto_skcipher_mask(u32 mask)
-{
-       mask &= ~(CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_GENIV);
-       mask |= CRYPTO_ALG_TYPE_BLKCIPHER_MASK;
-       return mask;
-}
-
-/**
- * DOC: Synchronous Block Cipher API
- *
- * The synchronous block cipher API is used with the ciphers of type
- * CRYPTO_ALG_TYPE_BLKCIPHER (listed as type "blkcipher" in /proc/crypto)
- *
- * Synchronous calls, have a context in the tfm. But since a single tfm can be
- * used in multiple calls and in parallel, this info should not be changeable
- * (unless a lock is used). This applies, for example, to the symmetric key.
- * However, the IV is changeable, so there is an iv field in blkcipher_tfm
- * structure for synchronous blkcipher api. So, its the only state info that can
- * be kept for synchronous calls without using a big lock across a tfm.
- *
- * The block cipher API allows the use of a complete cipher, i.e. a cipher
- * consisting of a template (a block chaining mode) and a single block cipher
- * primitive (e.g. AES).
- *
- * The plaintext data buffer and the ciphertext data buffer are pointed to
- * by using scatter/gather lists. The cipher operation is performed
- * on all segments of the provided scatter/gather lists.
- *
- * The kernel crypto API supports a cipher operation "in-place" which means that
- * the caller may provide the same scatter/gather list for the plaintext and
- * cipher text. After the completion of the cipher operation, the plaintext
- * data is replaced with the ciphertext data in case of an encryption and vice
- * versa for a decryption. The caller must ensure that the scatter/gather lists
- * for the output data point to sufficiently large buffers, i.e. multiples of
- * the block size of the cipher.
- */
+struct crypto_blkcipher {
+       struct crypto_tfm base;
+};
 
 static inline struct crypto_blkcipher *__crypto_blkcipher_cast(
        struct crypto_tfm *tfm)
@@ -443,20 +134,6 @@ static inline struct crypto_blkcipher *crypto_blkcipher_cast(
        return __crypto_blkcipher_cast(tfm);
 }
 
-/**
- * crypto_alloc_blkcipher() - allocate synchronous block cipher handle
- * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
- *           blkcipher cipher
- * @type: specifies the type of the cipher
- * @mask: specifies the mask for the cipher
- *
- * Allocate a cipher handle for a block cipher. The returned struct
- * crypto_blkcipher is the cipher handle that is required for any subsequent
- * API invocation for that block cipher.
- *
- * Return: allocated cipher handle in case of success; IS_ERR() is true in case
- *        of an error, PTR_ERR() returns the error code.
- */
 static inline struct crypto_blkcipher *crypto_alloc_blkcipher(
        const char *alg_name, u32 type, u32 mask)
 {
@@ -467,455 +144,30 @@ static inline struct crypto_blkcipher *crypto_alloc_blkcipher(
        return __crypto_blkcipher_cast(crypto_alloc_base(alg_name, type, mask));
 }
 
-static inline struct crypto_tfm *crypto_blkcipher_tfm(
-       struct crypto_blkcipher *tfm)
-{
-       return &tfm->base;
-}
-
-/**
- * crypto_free_blkcipher() - zeroize and free the block cipher handle
- * @tfm: cipher handle to be freed
- */
 static inline void crypto_free_blkcipher(struct crypto_blkcipher *tfm)
 {
-       crypto_free_tfm(crypto_blkcipher_tfm(tfm));
-}
-
-/**
- * crypto_has_blkcipher() - Search for the availability of a block cipher
- * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
- *           block cipher
- * @type: specifies the type of the cipher
- * @mask: specifies the mask for the cipher
- *
- * Return: true when the block cipher is known to the kernel crypto API; false
- *        otherwise
- */
-static inline int crypto_has_blkcipher(const char *alg_name, u32 type, u32 mask)
-{
-       type &= ~CRYPTO_ALG_TYPE_MASK;
-       type |= CRYPTO_ALG_TYPE_BLKCIPHER;
-       mask |= CRYPTO_ALG_TYPE_MASK;
-
-       return crypto_has_alg(alg_name, type, mask);
-}
-
-/**
- * crypto_blkcipher_name() - return the name / cra_name from the cipher handle
- * @tfm: cipher handle
- *
- * Return: The character string holding the name of the cipher
- */
-static inline const char *crypto_blkcipher_name(struct crypto_blkcipher *tfm)
-{
-       return crypto_tfm_alg_name(crypto_blkcipher_tfm(tfm));
+       crypto_free_tfm(&tfm->base);
 }
 
 static inline struct blkcipher_tfm *crypto_blkcipher_crt(
        struct crypto_blkcipher *tfm)
 {
-       return &crypto_blkcipher_tfm(tfm)->crt_blkcipher;
+       return &tfm->base.crt_blkcipher;
 }
 
-static inline struct blkcipher_alg *crypto_blkcipher_alg(
-       struct crypto_blkcipher *tfm)
-{
-       return &crypto_blkcipher_tfm(tfm)->__crt_alg->cra_blkcipher;
-}
-
-/**
- * crypto_blkcipher_ivsize() - obtain IV size
- * @tfm: cipher handle
- *
- * The size of the IV for the block cipher referenced by the cipher handle is
- * returned. This IV size may be zero if the cipher does not need an IV.
- *
- * Return: IV size in bytes
- */
-static inline unsigned int crypto_blkcipher_ivsize(struct crypto_blkcipher *tfm)
-{
-       return crypto_blkcipher_alg(tfm)->ivsize;
-}
-
-/**
- * crypto_blkcipher_blocksize() - obtain block size of cipher
- * @tfm: cipher handle
- *
- * The block size for the block cipher referenced with the cipher handle is
- * returned. The caller may use that information to allocate appropriate
- * memory for the data returned by the encryption or decryption operation.
- *
- * Return: block size of cipher
- */
-static inline unsigned int crypto_blkcipher_blocksize(
-       struct crypto_blkcipher *tfm)
-{
-       return crypto_tfm_alg_blocksize(crypto_blkcipher_tfm(tfm));
-}
-
-static inline unsigned int crypto_blkcipher_alignmask(
-       struct crypto_blkcipher *tfm)
-{
-       return crypto_tfm_alg_alignmask(crypto_blkcipher_tfm(tfm));
-}
-
-static inline u32 crypto_blkcipher_get_flags(struct crypto_blkcipher *tfm)
-{
-       return crypto_tfm_get_flags(crypto_blkcipher_tfm(tfm));
-}
-
-static inline void crypto_blkcipher_set_flags(struct crypto_blkcipher *tfm,
-                                             u32 flags)
-{
-       crypto_tfm_set_flags(crypto_blkcipher_tfm(tfm), flags);
-}
-
-static inline void crypto_blkcipher_clear_flags(struct crypto_blkcipher *tfm,
-                                               u32 flags)
-{
-       crypto_tfm_clear_flags(crypto_blkcipher_tfm(tfm), flags);
-}
-
-/**
- * crypto_blkcipher_setkey() - set key for cipher
- * @tfm: cipher handle
- * @key: buffer holding the key
- * @keylen: length of the key in bytes
- *
- * The caller provided key is set for the block cipher referenced by the cipher
- * handle.
- *
- * Note, the key length determines the cipher type. Many block ciphers implement
- * different cipher modes depending on the key size, such as AES-128 vs AES-192
- * vs. AES-256. When providing a 16 byte key for an AES cipher handle, AES-128
- * is performed.
- *
- * Return: 0 if the setting of the key was successful; < 0 if an error occurred
- */
 static inline int crypto_blkcipher_setkey(struct crypto_blkcipher *tfm,
-                                         const u8 *key, unsigned int keylen)
+                                         const u8 *key, unsigned keylen)
 {
-       return crypto_blkcipher_crt(tfm)->setkey(crypto_blkcipher_tfm(tfm),
-                                                key, keylen);
+       return crypto_blkcipher_crt(tfm)->setkey(&tfm->base, key, keylen);
 }
 
-/**
- * crypto_blkcipher_encrypt() - encrypt plaintext
- * @desc: reference to the block cipher handle with meta data
- * @dst: scatter/gather list that is filled by the cipher operation with the
- *     ciphertext
- * @src: scatter/gather list that holds the plaintext
- * @nbytes: number of bytes of the plaintext to encrypt.
- *
- * Encrypt plaintext data using the IV set by the caller with a preceding
- * call of crypto_blkcipher_set_iv.
- *
- * The blkcipher_desc data structure must be filled by the caller and can
- * reside on the stack. The caller must fill desc as follows: desc.tfm is filled
- * with the block cipher handle; desc.flags is filled with either
- * CRYPTO_TFM_REQ_MAY_SLEEP or 0.
- *
- * Return: 0 if the cipher operation was successful; < 0 if an error occurred
- */
-static inline int crypto_blkcipher_encrypt(struct blkcipher_desc *desc,
-                                          struct scatterlist *dst,
-                                          struct scatterlist *src,
-                                          unsigned int nbytes)
-{
-       desc->info = crypto_blkcipher_crt(desc->tfm)->iv;
-       return crypto_blkcipher_crt(desc->tfm)->encrypt(desc, dst, src, nbytes);
-}
-
-/**
- * crypto_blkcipher_encrypt_iv() - encrypt plaintext with dedicated IV
- * @desc: reference to the block cipher handle with meta data
- * @dst: scatter/gather list that is filled by the cipher operation with the
- *     ciphertext
- * @src: scatter/gather list that holds the plaintext
- * @nbytes: number of bytes of the plaintext to encrypt.
- *
- * Encrypt plaintext data with the use of an IV that is solely used for this
- * cipher operation. Any previously set IV is not used.
- *
- * The blkcipher_desc data structure must be filled by the caller and can
- * reside on the stack. The caller must fill desc as follows: desc.tfm is filled
- * with the block cipher handle; desc.info is filled with the IV to be used for
- * the current operation; desc.flags is filled with either
- * CRYPTO_TFM_REQ_MAY_SLEEP or 0.
- *
- * Return: 0 if the cipher operation was successful; < 0 if an error occurred
- */
 static inline int crypto_blkcipher_encrypt_iv(struct blkcipher_desc *desc,
                                              struct scatterlist *dst,
                                              struct scatterlist *src,
-                                             unsigned int nbytes)
+                                             unsigned nbytes)
 {
        return crypto_blkcipher_crt(desc->tfm)->encrypt(desc, dst, src, nbytes);
 }
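What survives of this header is just enough of the synchronous blkcipher interface for the userspace shim. A kernel-style sketch of driving it end to end, using only the calls retained above ("chacha20" as the algorithm name is an assumption, and error handling is trimmed):

static int encrypt_buf(void *buf, unsigned len,
		       const u8 *key, unsigned keylen, void *iv)
{
	struct crypto_blkcipher *tfm =
		crypto_alloc_blkcipher("chacha20", 0, CRYPTO_ALG_ASYNC);
	struct blkcipher_desc desc;
	struct scatterlist sg;
	int ret;

	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	ret = crypto_blkcipher_setkey(tfm, key, keylen);
	if (!ret) {
		sg_init_one(&sg, buf, len);	/* in-place: src == dst */
		desc.tfm   = tfm;
		desc.info  = iv;		/* caller-supplied nonce/IV */
		desc.flags = 0;
		ret = crypto_blkcipher_encrypt_iv(&desc, &sg, &sg, len);
	}

	crypto_free_blkcipher(tfm);
	return ret;
}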
 
-/**
- * crypto_blkcipher_decrypt() - decrypt ciphertext
- * @desc: reference to the block cipher handle with meta data
- * @dst: scatter/gather list that is filled by the cipher operation with the
- *     plaintext
- * @src: scatter/gather list that holds the ciphertext
- * @nbytes: number of bytes of the ciphertext to decrypt.
- *
- * Decrypt ciphertext data using the IV set by the caller with a preceding
- * call of crypto_blkcipher_set_iv.
- *
- * The blkcipher_desc data structure must be filled by the caller as documented
- * for the crypto_blkcipher_encrypt call above.
- *
- * Return: 0 if the cipher operation was successful; < 0 if an error occurred
- *
- */
-static inline int crypto_blkcipher_decrypt(struct blkcipher_desc *desc,
-                                          struct scatterlist *dst,
-                                          struct scatterlist *src,
-                                          unsigned int nbytes)
-{
-       desc->info = crypto_blkcipher_crt(desc->tfm)->iv;
-       return crypto_blkcipher_crt(desc->tfm)->decrypt(desc, dst, src, nbytes);
-}
-
-/**
- * crypto_blkcipher_decrypt_iv() - decrypt ciphertext with dedicated IV
- * @desc: reference to the block cipher handle with meta data
- * @dst: scatter/gather list that is filled by the cipher operation with the
- *     plaintext
- * @src: scatter/gather list that holds the ciphertext
- * @nbytes: number of bytes of the ciphertext to decrypt.
- *
- * Decrypt ciphertext data with the use of an IV that is solely used for this
- * cipher operation. Any previously set IV is not used.
- *
- * The blkcipher_desc data structure must be filled by the caller as documented
- * for the crypto_blkcipher_encrypt_iv call above.
- *
- * Return: 0 if the cipher operation was successful; < 0 if an error occurred
- */
-static inline int crypto_blkcipher_decrypt_iv(struct blkcipher_desc *desc,
-                                             struct scatterlist *dst,
-                                             struct scatterlist *src,
-                                             unsigned int nbytes)
-{
-       return crypto_blkcipher_crt(desc->tfm)->decrypt(desc, dst, src, nbytes);
-}
-
-/**
- * crypto_blkcipher_set_iv() - set IV for cipher
- * @tfm: cipher handle
- * @src: buffer holding the IV
- * @len: length of the IV in bytes
- *
- * The caller provided IV is set for the block cipher referenced by the cipher
- * handle.
- */
-static inline void crypto_blkcipher_set_iv(struct crypto_blkcipher *tfm,
-                                          const u8 *src, unsigned int len)
-{
-       memcpy(crypto_blkcipher_crt(tfm)->iv, src, len);
-}
-
-/**
- * crypto_blkcipher_get_iv() - obtain IV from cipher
- * @tfm: cipher handle
- * @dst: buffer filled with the IV
- * @len: length of the buffer dst
- *
- * The caller can obtain the IV set for the block cipher referenced by the
- * cipher handle and store it into the user-provided buffer. If the buffer
- * has an insufficient space, the IV is truncated to fit the buffer.
- */
-static inline void crypto_blkcipher_get_iv(struct crypto_blkcipher *tfm,
-                                          u8 *dst, unsigned int len)
-{
-       memcpy(dst, crypto_blkcipher_crt(tfm)->iv, len);
-}
-
-/**
- * DOC: Single Block Cipher API
- *
- * The single block cipher API is used with the ciphers of type
- * CRYPTO_ALG_TYPE_CIPHER (listed as type "cipher" in /proc/crypto).
- *
- * Using the single block cipher API calls, operations with the basic cipher
- * primitive can be implemented. These cipher primitives exclude any block
- * chaining operations including IV handling.
- *
- * The purpose of this single block cipher API is to support the implementation
- * of templates or other concepts that only need to perform the cipher operation
- * on one block at a time. Templates invoke the underlying cipher primitive
- * block-wise and process either the input or the output data of these cipher
- * operations.
- */
-
-static inline struct crypto_cipher *__crypto_cipher_cast(struct crypto_tfm *tfm)
-{
-       return (struct crypto_cipher *)tfm;
-}
-
-static inline struct crypto_cipher *crypto_cipher_cast(struct crypto_tfm *tfm)
-{
-       BUG_ON(crypto_tfm_alg_type(tfm) != CRYPTO_ALG_TYPE_CIPHER);
-       return __crypto_cipher_cast(tfm);
-}
-
-/**
- * crypto_alloc_cipher() - allocate single block cipher handle
- * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
- *          single block cipher
- * @type: specifies the type of the cipher
- * @mask: specifies the mask for the cipher
- *
- * Allocate a cipher handle for a single block cipher. The returned struct
- * crypto_cipher is the cipher handle that is required for any subsequent API
- * invocation for that single block cipher.
- *
- * Return: allocated cipher handle in case of success; IS_ERR() is true in case
- *        of an error, PTR_ERR() returns the error code.
- */
-static inline struct crypto_cipher *crypto_alloc_cipher(const char *alg_name,
-                                                       u32 type, u32 mask)
-{
-       type &= ~CRYPTO_ALG_TYPE_MASK;
-       type |= CRYPTO_ALG_TYPE_CIPHER;
-       mask |= CRYPTO_ALG_TYPE_MASK;
-
-       return __crypto_cipher_cast(crypto_alloc_base(alg_name, type, mask));
-}
-
-static inline struct crypto_tfm *crypto_cipher_tfm(struct crypto_cipher *tfm)
-{
-       return &tfm->base;
-}
-
-/**
- * crypto_free_cipher() - zeroize and free the single block cipher handle
- * @tfm: cipher handle to be freed
- */
-static inline void crypto_free_cipher(struct crypto_cipher *tfm)
-{
-       crypto_free_tfm(crypto_cipher_tfm(tfm));
-}
-
-/**
- * crypto_has_cipher() - Search for the availability of a single block cipher
- * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
- *          single block cipher
- * @type: specifies the type of the cipher
- * @mask: specifies the mask for the cipher
- *
- * Return: true when the single block cipher is known to the kernel crypto API;
- *        false otherwise
- */
-static inline int crypto_has_cipher(const char *alg_name, u32 type, u32 mask)
-{
-       type &= ~CRYPTO_ALG_TYPE_MASK;
-       type |= CRYPTO_ALG_TYPE_CIPHER;
-       mask |= CRYPTO_ALG_TYPE_MASK;
-
-       return crypto_has_alg(alg_name, type, mask);
-}
-
-static inline struct cipher_tfm *crypto_cipher_crt(struct crypto_cipher *tfm)
-{
-       return &crypto_cipher_tfm(tfm)->crt_cipher;
-}
-
-/**
- * crypto_cipher_blocksize() - obtain block size for cipher
- * @tfm: cipher handle
- *
- * The block size for the single block cipher referenced with the cipher handle
- * tfm is returned. The caller may use that information to allocate appropriate
- * memory for the data returned by the encryption or decryption operation
- *
- * Return: block size of cipher
- */
-static inline unsigned int crypto_cipher_blocksize(struct crypto_cipher *tfm)
-{
-       return crypto_tfm_alg_blocksize(crypto_cipher_tfm(tfm));
-}
-
-static inline unsigned int crypto_cipher_alignmask(struct crypto_cipher *tfm)
-{
-       return crypto_tfm_alg_alignmask(crypto_cipher_tfm(tfm));
-}
-
-static inline u32 crypto_cipher_get_flags(struct crypto_cipher *tfm)
-{
-       return crypto_tfm_get_flags(crypto_cipher_tfm(tfm));
-}
-
-static inline void crypto_cipher_set_flags(struct crypto_cipher *tfm,
-                                          u32 flags)
-{
-       crypto_tfm_set_flags(crypto_cipher_tfm(tfm), flags);
-}
-
-static inline void crypto_cipher_clear_flags(struct crypto_cipher *tfm,
-                                            u32 flags)
-{
-       crypto_tfm_clear_flags(crypto_cipher_tfm(tfm), flags);
-}
-
-/**
- * crypto_cipher_setkey() - set key for cipher
- * @tfm: cipher handle
- * @key: buffer holding the key
- * @keylen: length of the key in bytes
- *
- * The caller provided key is set for the single block cipher referenced by the
- * cipher handle.
- *
- * Note, the key length determines the cipher type. Many block ciphers implement
- * different cipher modes depending on the key size, such as AES-128 vs AES-192
- * vs. AES-256. When providing a 16 byte key for an AES cipher handle, AES-128
- * is performed.
- *
- * Return: 0 if the setting of the key was successful; < 0 if an error occurred
- */
-static inline int crypto_cipher_setkey(struct crypto_cipher *tfm,
-                                       const u8 *key, unsigned int keylen)
-{
-       return crypto_cipher_crt(tfm)->cit_setkey(crypto_cipher_tfm(tfm),
-                                                 key, keylen);
-}
-
-/**
- * crypto_cipher_encrypt_one() - encrypt one block of plaintext
- * @tfm: cipher handle
- * @dst: points to the buffer that will be filled with the ciphertext
- * @src: buffer holding the plaintext to be encrypted
- *
- * Invoke the encryption operation of one block. The caller must ensure that
- * the plaintext and ciphertext buffers are at least one block in size.
- */
-static inline void crypto_cipher_encrypt_one(struct crypto_cipher *tfm,
-                                            u8 *dst, const u8 *src)
-{
-       crypto_cipher_crt(tfm)->cit_encrypt_one(crypto_cipher_tfm(tfm),
-                                               dst, src);
-}
-
-/**
- * crypto_cipher_decrypt_one() - decrypt one block of ciphertext
- * @tfm: cipher handle
- * @dst: points to the buffer that will be filled with the plaintext
- * @src: buffer holding the ciphertext to be decrypted
- *
- * Invoke the decryption operation of one block. The caller must ensure that
- * the plaintext and ciphertext buffers are at least one block in size.
- */
-static inline void crypto_cipher_decrypt_one(struct crypto_cipher *tfm,
-                                            u8 *dst, const u8 *src)
-{
-       crypto_cipher_crt(tfm)->cit_decrypt_one(crypto_cipher_tfm(tfm),
-                                               dst, src);
-}
-
 #endif /* _LINUX_CRYPTO_H */
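The single block cipher helpers above go away along with the rest of the shim's block cipher support: the new encryption path only needs a stream cipher and a MAC, which is why the cache_set later in this diff grows crypto_blkcipher ("chacha20") and crypto_shash ("poly1305") handles instead. A minimal allocation sketch, assuming the shim's crypto_alloc_* entry points keep the kernel signatures:

    struct crypto_blkcipher *chacha20 =
            crypto_alloc_blkcipher("chacha20", 0, CRYPTO_ALG_ASYNC);
    struct crypto_shash *poly1305 =
            crypto_alloc_shash("poly1305", 0, 0);

    if (IS_ERR(chacha20) || IS_ERR(poly1305))
            /* error handling elided in this sketch */ ;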
 
diff --git a/include/linux/cryptohash.h b/include/linux/cryptohash.h
deleted file mode 100644 (file)
index 8dfcb83..0000000
--- a/include/linux/cryptohash.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef __CRYPTOHASH_H
-#define __CRYPTOHASH_H
-
-#include <linux/types.h>
-
-#define SHA_DIGEST_WORDS 5
-#define SHA_MESSAGE_BYTES (512 /*bits*/ / 8)
-#define SHA_WORKSPACE_WORDS 16
-
-void sha_init(__u32 *buf);
-void sha_transform(__u32 *digest, const char *data, __u32 *W);
-
-#define MD5_DIGEST_WORDS 4
-#define MD5_MESSAGE_BYTES 64
-
-void md5_transform(__u32 *hash, __u32 const *in);
-
-__u32 half_md4_transform(__u32 buf[4], __u32 const in[8]);
-
-#endif
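With the raw sha_init()/sha_transform() helpers gone, hashing in the shim goes through the crypto API's shash interface like everything else. A one-shot digest looks roughly like this (assuming the kernel-style shash API; "sha256" stands in for whatever hash the caller wants):

    struct crypto_shash *tfm = crypto_alloc_shash("sha256", 0, 0);
    SHASH_DESC_ON_STACK(desc, tfm);
    u8 digest[32];

    desc->tfm = tfm;
    crypto_shash_digest(desc, data, len, digest);   /* init + update + final */
    crypto_free_shash(tfm);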
index 2233350b405bbca289cacba88cd0a9b2a09f7f27..ac72858bbf700fb8a62511d210d64e21e3bd4b24 100644 (file)
@@ -207,6 +207,4 @@ int __must_check kstrtoint(const char *s, unsigned int base, int *res);
         BUILD_BUG_ON_ZERO((perms) & 2) +                                       \
         (perms))
 
-#define offset_in_page(p)      ((unsigned long)(p) & ~PAGE_MASK)
-
 #endif
diff --git a/include/linux/key.h b/include/linux/key.h
new file mode 100644 (file)
index 0000000..adc12a9
--- /dev/null
+++ b/include/linux/key.h
@@ -0,0 +1,50 @@
+#ifndef _LINUX_KEY_H
+#define _LINUX_KEY_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/rcupdate.h>
+#include <linux/sysctl.h>
+#include <linux/rwsem.h>
+#include <linux/atomic.h>
+
+#include <keyutils.h>
+
+struct key;
+
+struct user_key_payload {
+       size_t          datalen;        /* length of this data */
+       char            data[0];        /* actual data */
+};
+
+struct key {
+       atomic_t                usage;          /* number of references */
+       key_serial_t            serial;         /* key serial number */
+       struct rw_semaphore     sem;            /* change vs change sem */
+       struct user_key_payload payload;
+};
+
+static inline const struct user_key_payload *user_key_payload(const struct key *key)
+{
+       return &key->payload;
+}
+
+static inline void key_put(struct key *key)
+{
+       if (atomic_dec_and_test(&key->usage))
+               free(key);
+}
+
+static inline struct key *__key_get(struct key *key)
+{
+       atomic_inc(&key->usage);
+       return key;
+}
+
+static inline struct key *key_get(struct key *key)
+{
+       return key ? __key_get(key) : key;
+}
+
+#endif /* _LINUX_KEY_H */
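This is a userspace stand-in for the kernel's key API, carrying just enough state (refcount, serial, payload) for keyutils-backed code to work; note the <keyutils.h> include. A rough sketch of how a key might be materialized from a keyring — the key description and setup steps here are illustrative, not part of this header:

    /* Hypothetical: look up a user key and wrap its payload */
    key_serial_t id = request_key("user", "bcache:example", NULL,
                                  KEY_SPEC_USER_KEYRING);
    void *data;
    long len = keyctl_read_alloc(id, &data);        /* from libkeyutils */

    struct key *key = malloc(sizeof(*key) + len);
    atomic_set(&key->usage, 1);
    key->serial = id;
    init_rwsem(&key->sem);
    key->payload.datalen = len;
    memcpy(key->payload.data, data, len);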
index c2789f934b93612a2dcc5a8e057c168655a1b6ae..ddf6f94130afa11bf40819986d04f38ba11e4af4 100644 (file)
@@ -14,6 +14,11 @@ typedef struct mempool_s {
        size_t          elem_size;
 } mempool_t;
 
+static inline bool mempool_initialized(mempool_t *pool)
+{
+       return true;
+}
+
 extern int mempool_resize(mempool_t *pool, int new_min_nr);
 
 static inline void mempool_free(void *element, mempool_t *pool)
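In the shim a mempool is just a malloc wrapper, so mempool_initialized() can unconditionally report true; it exists only so kernel callers that guard teardown keep compiling, along the lines of (pool name hypothetical):

    if (mempool_initialized(&c->bio_bounce_pages))
            /* tear the pool down / return its reserve */ ;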
index c99d9de3c5b56632c4d4815ad86330530671e74a..8d6413ce60bd520fce7330f068b4e017c1f50491 100644 (file)
@@ -5,8 +5,11 @@
 
 struct page;
 
-#define virt_to_page(kaddr)            ((struct page *) (kaddr))
-#define page_address(kaddr)            ((void *) (kaddr))
+#define virt_to_page(p)                                                        \
+       ((struct page *) (((unsigned long) (p)) & PAGE_MASK))
+#define offset_in_page(p)              ((unsigned long) (p) & ~PAGE_MASK)
+
+#define page_address(p)                        ((void *) (p))
 
 #define kmap_atomic(page)              page_address(page)
 #define kunmap_atomic(addr)            do {} while (0)
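virt_to_page() now masks off the low bits so that a (page, offset) pair split out of an arbitrary pointer can be reassembled exactly — which is what the new scatterlist code below depends on (offset_in_page() moves here from kernel.h to sit next to its counterpart). A sanity sketch of the round trip under these definitions:

    static void check_round_trip(char *buf)
    {
            struct page *pg   = virt_to_page(buf);    /* buf & PAGE_MASK  */
            unsigned long off = offset_in_page(buf);  /* buf & ~PAGE_MASK */

            BUG_ON((char *) page_address(pg) + off != buf);
    }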
diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
new file mode 100644 (file)
index 0000000..04bf59d
--- /dev/null
+++ b/include/linux/scatterlist.h
@@ -0,0 +1,111 @@
+#ifndef _LINUX_SCATTERLIST_H
+#define _LINUX_SCATTERLIST_H
+
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/bug.h>
+#include <linux/mm.h>
+
+struct scatterlist {
+       unsigned long   page_link;
+       unsigned int    offset;
+       unsigned int    length;
+};
+
+#define sg_is_chain(sg)                ((sg)->page_link & 0x01)
+#define sg_is_last(sg)         ((sg)->page_link & 0x02)
+#define sg_chain_ptr(sg)       \
+       ((struct scatterlist *) ((sg)->page_link & ~0x03))
+
+static inline void sg_assign_page(struct scatterlist *sg, struct page *page)
+{
+       unsigned long page_link = sg->page_link & 0x3;
+
+       /*
+        * In order for the low bit stealing approach to work, pages
+        * must be aligned at a 32-bit boundary as a minimum.
+        */
+       BUG_ON((unsigned long) page & 0x03);
+       sg->page_link = page_link | (unsigned long) page;
+}
+
+static inline void sg_set_page(struct scatterlist *sg, struct page *page,
+                              unsigned int len, unsigned int offset)
+{
+       sg_assign_page(sg, page);
+       sg->offset = offset;
+       sg->length = len;
+}
+
+static inline struct page *sg_page(struct scatterlist *sg)
+{
+       return (struct page *)((sg)->page_link & ~0x3);
+}
+
+static inline void sg_set_buf(struct scatterlist *sg, const void *buf,
+                             unsigned int buflen)
+{
+       sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf));
+}
+
+static inline struct scatterlist *sg_next(struct scatterlist *sg)
+{
+       if (sg_is_last(sg))
+               return NULL;
+
+       sg++;
+       if (unlikely(sg_is_chain(sg)))
+               sg = sg_chain_ptr(sg);
+
+       return sg;
+}
+
+#define for_each_sg(sglist, sg, nr, __i)       \
+       for (__i = 0, sg = (sglist); __i < (nr); __i++, sg = sg_next(sg))
+
+static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents,
+                           struct scatterlist *sgl)
+{
+       /*
+        * offset and length are unused for chain entry.  Clear them.
+        */
+       prv[prv_nents - 1].offset = 0;
+       prv[prv_nents - 1].length = 0;
+
+       /*
+        * Set lowest bit to indicate a link pointer, and make sure to clear
+        * the termination bit if it happens to be set.
+        */
+       prv[prv_nents - 1].page_link = ((unsigned long) sgl | 0x01) & ~0x02;
+}
+
+static inline void sg_mark_end(struct scatterlist *sg)
+{
+       sg->page_link |= 0x02;
+       sg->page_link &= ~0x01;
+}
+
+static inline void sg_unmark_end(struct scatterlist *sg)
+{
+       sg->page_link &= ~0x02;
+}
+
+static inline void *sg_virt(struct scatterlist *sg)
+{
+       return page_address(sg_page(sg)) + sg->offset;
+}
+
+static inline void sg_init_table(struct scatterlist *sgl, unsigned int nents)
+{
+       memset(sgl, 0, sizeof(*sgl) * nents);
+       sg_mark_end(&sgl[nents - 1]);
+}
+
+static inline void sg_init_one(struct scatterlist *sg, const void *buf,
+                              unsigned int buflen)
+{
+       sg_init_table(sg, 1);
+       sg_set_buf(sg, buf, buflen);
+}
+
+#endif /* _LINUX_SCATTERLIST_H */
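The common case in this shim is a single-entry list over a flat buffer — enough for the crypto code to hand memory to the blkcipher/shash APIs. Minimal usage sketch:

    u8 buf[64];
    struct scatterlist sg, *s;
    int i;

    sg_init_one(&sg, buf, sizeof(buf));     /* one entry, marked last */
    for_each_sg(&sg, s, 1, i)
            memset(sg_virt(s), 0, s->length);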
index 2e1ad82e8eb2a32e18ec02d7ad508ee550244fd1..2d9f8291f581bcadcde4d35294757c17e94b5b67 100644 (file)
@@ -38,6 +38,19 @@ struct itimerspec64 {
 #define KTIME_MAX                      ((s64)~((u64)1 << 63))
 #define KTIME_SEC_MAX                  (KTIME_MAX / NSEC_PER_SEC)
 
+static inline struct timespec ns_to_timespec(const u64 nsec)
+{
+       return (struct timespec) {
+               .tv_sec = nsec / NSEC_PER_SEC,
+               .tv_nsec = nsec % NSEC_PER_SEC,
+       };
+}
+
+static inline s64 timespec_to_ns(const struct timespec *ts)
+{
+       return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec;
+}
+
 #if __BITS_PER_LONG == 64
 
 static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64)
@@ -61,11 +74,6 @@ static inline struct timespec64 timespec_to_timespec64(const struct timespec ts)
 # define ns_to_timespec64              ns_to_timespec
 # define timespec64_add_ns             timespec_add_ns
 
-static inline s64 timespec_to_ns(const struct timespec *ts)
-{
-       return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec;
-}
-
 #else
 
 static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64)
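timespec_to_ns() moves out of the 64-bit-only branch (and picks up an ns_to_timespec() counterpart) because it is now needed unconditionally — the superblock carries a nanosecond time base, see sb->time_base_lo below. The two conversions are inverses up to truncation:

    struct timespec ts = { .tv_sec = 1, .tv_nsec = 500000000 };
    s64 ns = timespec_to_ns(&ts);   /* 1500000000 */
    ts = ns_to_timespec(ns);        /* { 1, 500000000 } again */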
index d4968c54284995d951334bc78042911b93cf1bb6..01e4b79d408bea1701da274098a6bfe94dd61546 100644 (file)
@@ -185,7 +185,7 @@ TRACE_EVENT(bcache_write,
        ),
 
        TP_fast_assign(
-               memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16);
+               memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
                __entry->inode          = inode;
                __entry->sector         = bio->bi_iter.bi_sector;
                __entry->nr_sector      = bio->bi_iter.bi_size >> 9;
@@ -215,7 +215,7 @@ TRACE_EVENT(bcache_write_throttle,
        ),
 
        TP_fast_assign(
-               memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16);
+               memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
                __entry->inode          = inode;
                __entry->sector         = bio->bi_iter.bi_sector;
                __entry->nr_sector      = bio->bi_iter.bi_size >> 9;
@@ -245,7 +245,7 @@ DECLARE_EVENT_CLASS(page_alloc_fail,
        ),
 
        TP_fast_assign(
-               memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16);
+               memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
                __entry->size = size;
        ),
 
@@ -263,7 +263,7 @@ DECLARE_EVENT_CLASS(cache_set,
        ),
 
        TP_fast_assign(
-               memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16);
+               memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
        ),
 
        TP_printk("%pU", __entry->uuid)
@@ -285,7 +285,7 @@ TRACE_EVENT(bcache_journal_next_bucket,
        ),
 
        TP_fast_assign(
-               memcpy(__entry->uuid, ca->disk_sb.sb->disk_uuid.b, 16);
+               memcpy(__entry->uuid, ca->uuid.b, 16);
                __entry->cur_idx        = cur_idx;
                __entry->last_idx       = last_idx;
        ),
@@ -304,7 +304,7 @@ TRACE_EVENT(bcache_journal_write_oldest,
        ),
 
        TP_fast_assign(
-               memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16);
+               memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
                __entry->seq            = seq;
        ),
 
@@ -322,7 +322,7 @@ TRACE_EVENT(bcache_journal_write_oldest_done,
        ),
 
        TP_fast_assign(
-               memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16);
+               memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
                __entry->seq            = seq;
                __entry->written        = written;
        ),
@@ -368,7 +368,7 @@ DECLARE_EVENT_CLASS(cache,
        ),
 
        TP_fast_assign(
-               memcpy(__entry->uuid, ca->disk_sb.sb->disk_uuid.b, 16);
+               memcpy(__entry->uuid, ca->uuid.b, 16);
                __entry->tier = ca->mi.tier;
        ),
 
@@ -418,7 +418,7 @@ DECLARE_EVENT_CLASS(btree_node,
        ),
 
        TP_fast_assign(
-               memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16);
+               memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
                __entry->bucket         = PTR_BUCKET_NR_TRACE(c, &b->key, 0);
                __entry->level          = b->level;
                __entry->id             = b->btree_id;
@@ -471,7 +471,7 @@ TRACE_EVENT(bcache_btree_node_alloc_fail,
        ),
 
        TP_fast_assign(
-               memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16);
+               memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
                __entry->id = id;
        ),
 
@@ -514,7 +514,7 @@ TRACE_EVENT(bcache_mca_scan,
        ),
 
        TP_fast_assign(
-               memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16);
+               memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
                __entry->touched        = touched;
                __entry->freed          = freed;
                __entry->can_free       = can_free;
@@ -535,7 +535,7 @@ DECLARE_EVENT_CLASS(mca_cannibalize_lock,
        ),
 
        TP_fast_assign(
-               memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16);
+               memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
        ),
 
        TP_printk("%pU", __entry->uuid)
@@ -675,7 +675,7 @@ TRACE_EVENT(bcache_btree_gc_coalesce_fail,
 
        TP_fast_assign(
                __entry->reason         = reason;
-               memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16);
+               memcpy(__entry->uuid, c->disk_sb->user_uuid.b, 16);
        ),
 
        TP_printk("%pU: %u", __entry->uuid, __entry->reason)
@@ -696,7 +696,7 @@ TRACE_EVENT(bcache_btree_node_alloc_replacement,
        ),
 
        TP_fast_assign(
-               memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16);
+               memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
                __entry->old_bucket     = PTR_BUCKET_NR_TRACE(c,
                                                              &old->key, 0);
                __entry->bucket         = PTR_BUCKET_NR_TRACE(c, &b->key, 0);
@@ -778,7 +778,7 @@ TRACE_EVENT(bcache_mark_bucket,
        ),
 
        TP_fast_assign(
-               memcpy(__entry->uuid, ca->disk_sb.sb->disk_uuid.b, 16);
+               memcpy(__entry->uuid, ca->uuid.b, 16);
                __entry->inode          = k->p.inode;
                __entry->offset         = k->p.offset;
                __entry->sectors        = sectors;
@@ -804,7 +804,7 @@ TRACE_EVENT(bcache_alloc_batch,
        ),
 
        TP_fast_assign(
-               memcpy(__entry->uuid, ca->disk_sb.sb->disk_uuid.b, 16);
+               memcpy(__entry->uuid, ca->uuid.b, 16);
                __entry->free = free;
                __entry->total = total;
        ),
@@ -824,7 +824,7 @@ TRACE_EVENT(bcache_btree_reserve_get_fail,
        ),
 
        TP_fast_assign(
-               memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16);
+               memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
                __entry->required = required;
                __entry->cl = cl;
        ),
@@ -879,7 +879,7 @@ DECLARE_EVENT_CLASS(cache_bucket_alloc,
        ),
 
        TP_fast_assign(
-               memcpy(__entry->uuid, ca->disk_sb.sb->disk_uuid.b, 16);
+               memcpy(__entry->uuid, ca->uuid.b, 16);
                __entry->reserve = reserve;
        ),
 
@@ -908,7 +908,7 @@ DECLARE_EVENT_CLASS(cache_set_bucket_alloc,
        ),
 
        TP_fast_assign(
-               memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16);
+               memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
                __entry->reserve = reserve;
                __entry->cl = cl;
        ),
@@ -933,7 +933,7 @@ DECLARE_EVENT_CLASS(open_bucket_alloc,
        ),
 
        TP_fast_assign(
-               memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16);
+               memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
                __entry->cl = cl;
        ),
 
@@ -1054,7 +1054,7 @@ TRACE_EVENT(bcache_moving_gc_end,
        ),
 
        TP_fast_assign(
-               memcpy(__entry->uuid, ca->disk_sb.sb->disk_uuid.b, 16);
+               memcpy(__entry->uuid, ca->uuid.b, 16);
                __entry->sectors_moved = sectors_moved;
                __entry->keys_moved = keys_moved;
                __entry->buckets_moved = buckets_moved;
@@ -1114,7 +1114,7 @@ TRACE_EVENT(bcache_tiering_end,
        ),
 
        TP_fast_assign(
-               memcpy(__entry->uuid, c->disk_sb.user_uuid.b, 16);
+               memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
                __entry->sectors_moved = sectors_moved;
                __entry->keys_moved = keys_moved;
        ),
index 802d3b4c1a9e30546c65133a238ced3b4868125e..cc294bd4ac39fd434db95dabe33e7fb3eb838ea2 100644 (file)
@@ -7,6 +7,7 @@
 #include <string.h>
 #include <sys/stat.h>
 #include <sys/types.h>
+#include <time.h>
 #include <unistd.h>
 
 #include <uuid/uuid.h>
 #include "linux/bcache.h"
 #include "libbcache.h"
 #include "checksum.h"
+#include "crypto.h"
 #include "opts.h"
+#include "super-io.h"
+
+#define NSEC_PER_SEC   1000000000L
 
 #define BCH_MIN_NR_NBUCKETS    (1 << 10)
 
 /* first bucket should start 1 mb in, in sectors: */
 #define FIRST_BUCKET_OFFSET    (1 << 11)
 
-void __do_write_sb(int fd, void *sb, size_t bytes)
-{
-       char zeroes[SB_SECTOR << 9] = {0};
-
-       /* Zero start of disk */
-       xpwrite(fd, zeroes, SB_SECTOR << 9, 0);
-
-       /* Write superblock */
-       xpwrite(fd, sb, bytes, SB_SECTOR << 9);
-
-       fsync(fd);
-       close(fd);
-}
-
-#define do_write_sb(_fd, _sb)                  \
-       __do_write_sb(_fd, _sb, ((void *) __bset_bkey_last(_sb)) - (void *) _sb);
-
 /* minimum size filesystem we can create, given a bucket size: */
 static u64 min_size(unsigned bucket_size)
 {
@@ -45,12 +33,26 @@ static u64 min_size(unsigned bucket_size)
                BCH_MIN_NR_NBUCKETS) * bucket_size;
 }
 
+static void init_layout(struct bch_sb_layout *l)
+{
+       memset(l, 0, sizeof(*l));
+
+       l->magic                = BCACHE_MAGIC;
+       l->layout_type          = 0;
+       l->nr_superblocks       = 2;
+       l->sb_max_size_bits     = 7;
+       l->sb_offset[0]         = cpu_to_le64(BCH_SB_SECTOR);
+       l->sb_offset[1]         = cpu_to_le64(BCH_SB_SECTOR +
+                                             (1 << l->sb_max_size_bits));
+}
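Two superblock slots, each capped at 1 << sb_max_size_bits = 128 sectors (64 KiB), laid out back to back. Assuming BCH_SB_SECTOR is 8 (as in later bcachefs), that works out to:

    sb_offset[0] = 8              /* sector 8,   byte 4096  */
    sb_offset[1] = 8 + (1 << 7)   /* sector 136, byte 69632 */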
+
 void bcache_format(struct dev_opts *devs, size_t nr_devs,
                   unsigned block_size,
                   unsigned btree_node_size,
                   unsigned meta_csum_type,
                   unsigned data_csum_type,
                   unsigned compression_type,
+                  const char *passphrase,
                   unsigned meta_replicas,
                   unsigned data_replicas,
                   unsigned on_error_action,
@@ -58,8 +60,10 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs,
                   char *label,
                   uuid_le uuid)
 {
-       struct cache_sb *sb;
+       struct bch_sb *sb;
        struct dev_opts *i;
+       struct bch_sb_field_members *mi;
+       unsigned u64s, j;
 
        /* calculate block size: */
        if (!block_size)
@@ -124,16 +128,20 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs,
 
        max_journal_entry_size = roundup_pow_of_two(max_journal_entry_size);
 
-       sb = calloc(1, sizeof(*sb) + sizeof(struct cache_member) * nr_devs);
+       sb = calloc(1, sizeof(*sb) +
+                   sizeof(struct bch_sb_field_members) +
+                   sizeof(struct bch_member) * nr_devs +
+                   sizeof(struct bch_sb_field_crypt));
 
-       sb->offset      = __cpu_to_le64(SB_SECTOR);
-       sb->version     = __cpu_to_le64(BCACHE_SB_VERSION_CDEV_V3);
+       sb->version     = cpu_to_le64(BCACHE_SB_VERSION_CDEV_V4);
        sb->magic       = BCACHE_MAGIC;
-       sb->block_size  = __cpu_to_le16(block_size);
+       sb->block_size  = cpu_to_le16(block_size);
        sb->user_uuid   = uuid;
-       sb->nr_in_set   = nr_devs;
+       sb->nr_devices  = nr_devs;
+
+       init_layout(&sb->layout);
 
-       uuid_generate(sb->set_uuid.b);
+       uuid_generate(sb->uuid.b);
 
        if (label)
                strncpy((char *) sb->label, label, sizeof(sb->label));
@@ -142,44 +150,85 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs,
         * don't have a userspace crc32c implementation handy, just always use
         * crc64
         */
-       SET_CACHE_SB_CSUM_TYPE(sb,              BCH_CSUM_CRC64);
-       SET_CACHE_SET_META_PREFERRED_CSUM_TYPE(sb,      meta_csum_type);
-       SET_CACHE_SET_DATA_PREFERRED_CSUM_TYPE(sb,      data_csum_type);
-       SET_CACHE_SET_COMPRESSION_TYPE(sb,      compression_type);
-
-       SET_CACHE_SET_BTREE_NODE_SIZE(sb,       btree_node_size);
-       SET_CACHE_SET_META_REPLICAS_WANT(sb,    meta_replicas);
-       SET_CACHE_SET_META_REPLICAS_HAVE(sb,    meta_replicas);
-       SET_CACHE_SET_DATA_REPLICAS_WANT(sb,    data_replicas);
-       SET_CACHE_SET_DATA_REPLICAS_HAVE(sb,    data_replicas);
-       SET_CACHE_SET_ERROR_ACTION(sb,          on_error_action);
-       SET_CACHE_SET_STR_HASH_TYPE(sb,         BCH_STR_HASH_SIPHASH);
-       SET_CACHE_SET_JOURNAL_ENTRY_SIZE(sb,    ilog2(max_journal_entry_size));
+       SET_BCH_SB_CSUM_TYPE(sb,                BCH_CSUM_CRC64);
+       SET_BCH_SB_META_CSUM_TYPE(sb,           meta_csum_type);
+       SET_BCH_SB_DATA_CSUM_TYPE(sb,           data_csum_type);
+       SET_BCH_SB_COMPRESSION_TYPE(sb,         compression_type);
+
+       SET_BCH_SB_BTREE_NODE_SIZE(sb,          btree_node_size);
+       SET_BCH_SB_GC_RESERVE(sb,               8);
+       SET_BCH_SB_META_REPLICAS_WANT(sb,       meta_replicas);
+       SET_BCH_SB_META_REPLICAS_HAVE(sb,       meta_replicas);
+       SET_BCH_SB_DATA_REPLICAS_WANT(sb,       data_replicas);
+       SET_BCH_SB_DATA_REPLICAS_HAVE(sb,       data_replicas);
+       SET_BCH_SB_ERROR_ACTION(sb,             on_error_action);
+       SET_BCH_SB_STR_HASH_TYPE(sb,            BCH_STR_HASH_SIPHASH);
+       SET_BCH_SB_JOURNAL_ENTRY_SIZE(sb,       ilog2(max_journal_entry_size));
+
+       struct timespec now;
+       if (clock_gettime(CLOCK_REALTIME, &now))
+               die("error getting current time: %s", strerror(errno));
+
+       sb->time_base_lo        = cpu_to_le64(now.tv_sec * NSEC_PER_SEC + now.tv_nsec);
+       sb->time_precision      = cpu_to_le32(1);
+
+       if (passphrase) {
+               struct bch_sb_field_crypt *crypt = vstruct_end(sb);
+
+               u64s = sizeof(struct bch_sb_field_crypt) / sizeof(u64);
+
+               le32_add_cpu(&sb->u64s, u64s);
+               crypt->field.u64s = cpu_to_le32(u64s);
+               crypt->field.type = BCH_SB_FIELD_crypt;
+
+               bch_sb_crypt_init(sb, crypt, passphrase);
+               SET_BCH_SB_ENCRYPTION_TYPE(sb, 1);
+       }
+
+       mi = vstruct_end(sb);
+       u64s = (sizeof(struct bch_sb_field_members) +
+               sizeof(struct bch_member) * nr_devs) / sizeof(u64);
+
+       le32_add_cpu(&sb->u64s, u64s);
+       mi->field.u64s = cpu_to_le32(u64s);
+       mi->field.type = BCH_SB_FIELD_members;
 
        for (i = devs; i < devs + nr_devs; i++) {
-               struct cache_member *m = sb->members + (i - devs);
+               struct bch_member *m = mi->members + (i - devs);
 
                uuid_generate(m->uuid.b);
-               m->nbuckets     = __cpu_to_le64(i->nbuckets);
-               m->first_bucket = __cpu_to_le16(i->first_bucket);
-               m->bucket_size  = __cpu_to_le16(i->bucket_size);
+               m->nbuckets     = cpu_to_le64(i->nbuckets);
+               m->first_bucket = cpu_to_le16(i->first_bucket);
+               m->bucket_size  = cpu_to_le16(i->bucket_size);
 
-               SET_CACHE_TIER(m,               i->tier);
-               SET_CACHE_REPLACEMENT(m,        CACHE_REPLACEMENT_LRU);
-               SET_CACHE_DISCARD(m,            i->discard);
+               SET_BCH_MEMBER_TIER(m,          i->tier);
+               SET_BCH_MEMBER_REPLACEMENT(m,   CACHE_REPLACEMENT_LRU);
+               SET_BCH_MEMBER_DISCARD(m,       i->discard);
        }
 
-       sb->u64s = __cpu_to_le16(bch_journal_buckets_offset(sb));
-
        for (i = devs; i < devs + nr_devs; i++) {
-               struct cache_member *m = sb->members + (i - devs);
+               sb->dev_idx = i - devs;
+
+               static const char zeroes[BCH_SB_SECTOR << 9];
+               struct nonce nonce = { 0 };
+
+               /* Zero start of disk */
+               xpwrite(i->fd, zeroes, BCH_SB_SECTOR << 9, 0);
+
+               xpwrite(i->fd, &sb->layout, sizeof(sb->layout),
+                       BCH_SB_LAYOUT_SECTOR << 9);
 
-               sb->disk_uuid   = m->uuid;
-               sb->nr_this_dev = i - devs;
-               sb->csum        = __cpu_to_le64(__csum_set(sb, __le16_to_cpu(sb->u64s),
-                                                          CACHE_SB_CSUM_TYPE(sb)));
+               for (j = 0; j < sb->layout.nr_superblocks; j++) {
+                       sb->offset = sb->layout.sb_offset[j];
 
-               do_write_sb(i->fd, sb);
+                       sb->csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb),
+                                                  nonce, sb);
+                       xpwrite(i->fd, sb, vstruct_bytes(sb),
+                               le64_to_cpu(sb->offset) << 9);
+               }
+
+               fsync(i->fd);
+               close(i->fd);
        }
 
        bcache_super_print(sb, HUMAN_READABLE);
@@ -187,16 +236,39 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs,
        free(sb);
 }
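Both variable-length superblock fields above (crypt, members) are appended with the same three steps: take vstruct_end(sb) as the new field, grow sb->u64s, then stamp the field's own u64s/type. As a generic sketch — assuming the field header type is struct bch_sb_field and that vstruct_end() returns the first byte past the current fields:

    static struct bch_sb_field *sb_field_append(struct bch_sb *sb,
                                                unsigned type, size_t bytes)
    {
            struct bch_sb_field *f = vstruct_end(sb);
            unsigned u64s = bytes / sizeof(u64);

            le32_add_cpu(&sb->u64s, u64s);  /* grow the superblock */
            f->u64s = cpu_to_le32(u64s);    /* size of this field */
            f->type = type;                 /* e.g. BCH_SB_FIELD_crypt */
            return f;
    }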
 
-void bcache_super_print(struct cache_sb *sb, int units)
+struct bch_sb *bcache_super_read(const char *path)
 {
-       unsigned i;
+       struct bch_sb sb, *ret;
+
+       int fd = open(path, O_RDONLY);
+       if (fd < 0)
+               die("couldn't open %s", path);
+
+       xpread(fd, &sb, sizeof(sb), BCH_SB_SECTOR << 9);
+
+       if (memcmp(&sb.magic, &BCACHE_MAGIC, sizeof(sb.magic)))
+               die("not a bcache superblock");
+
+       size_t bytes = vstruct_bytes(&sb);
+
+       ret = malloc(bytes);
+
+       xpread(fd, ret, bytes, BCH_SB_SECTOR << 9);
+
+       return ret;
+}
+
+void bcache_super_print(struct bch_sb *sb, int units)
+{
+       struct bch_sb_field_members *mi;
        char user_uuid_str[40], internal_uuid_str[40], member_uuid_str[40];
-       char label[SB_LABEL_SIZE + 1];
+       char label[BCH_SB_LABEL_SIZE + 1];
+       unsigned i;
 
        memset(label, 0, sizeof(label));
        memcpy(label, sb->label, sizeof(sb->label));
        uuid_unparse(sb->user_uuid.b, user_uuid_str);
-       uuid_unparse(sb->set_uuid.b, internal_uuid_str);
+       uuid_unparse(sb->uuid.b, internal_uuid_str);
 
        printf("External UUID:                  %s\n"
               "Internal UUID:                  %s\n"
@@ -226,44 +298,50 @@ void bcache_super_print(struct cache_sb *sb, int units)
               label,
               le64_to_cpu(sb->version),
               pr_units(le16_to_cpu(sb->block_size), units),
-              pr_units(CACHE_SET_BTREE_NODE_SIZE(sb), units),
-              pr_units(1U << CACHE_SET_JOURNAL_ENTRY_SIZE(sb), units),
+              pr_units(BCH_SB_BTREE_NODE_SIZE(sb), units),
+              pr_units(1U << BCH_SB_JOURNAL_ENTRY_SIZE(sb), units),
 
-              CACHE_SET_ERROR_ACTION(sb) < BCH_NR_ERROR_ACTIONS
-              ? bch_error_actions[CACHE_SET_ERROR_ACTION(sb)]
+              BCH_SB_ERROR_ACTION(sb) < BCH_NR_ERROR_ACTIONS
+              ? bch_error_actions[BCH_SB_ERROR_ACTION(sb)]
               : "unknown",
 
-              CACHE_SET_CLEAN(sb),
+              BCH_SB_CLEAN(sb),
 
-              CACHE_SET_META_REPLICAS_HAVE(sb),
-              CACHE_SET_META_REPLICAS_WANT(sb),
-              CACHE_SET_DATA_REPLICAS_HAVE(sb),
-              CACHE_SET_DATA_REPLICAS_WANT(sb),
+              BCH_SB_META_REPLICAS_HAVE(sb),
+              BCH_SB_META_REPLICAS_WANT(sb),
+              BCH_SB_DATA_REPLICAS_HAVE(sb),
+              BCH_SB_DATA_REPLICAS_WANT(sb),
 
-              CACHE_SET_META_PREFERRED_CSUM_TYPE(sb) < BCH_CSUM_NR
-              ? bch_csum_types[CACHE_SET_META_PREFERRED_CSUM_TYPE(sb)]
+              BCH_SB_META_CSUM_TYPE(sb) < BCH_CSUM_NR
+              ? bch_csum_types[BCH_SB_META_CSUM_TYPE(sb)]
               : "unknown",
 
-              CACHE_SET_DATA_PREFERRED_CSUM_TYPE(sb) < BCH_CSUM_NR
-              ? bch_csum_types[CACHE_SET_DATA_PREFERRED_CSUM_TYPE(sb)]
+              BCH_SB_DATA_CSUM_TYPE(sb) < BCH_CSUM_NR
+              ? bch_csum_types[BCH_SB_DATA_CSUM_TYPE(sb)]
               : "unknown",
 
-              CACHE_SET_COMPRESSION_TYPE(sb) < BCH_COMPRESSION_NR
-              ? bch_compression_types[CACHE_SET_COMPRESSION_TYPE(sb)]
+              BCH_SB_COMPRESSION_TYPE(sb) < BCH_COMPRESSION_NR
+              ? bch_compression_types[BCH_SB_COMPRESSION_TYPE(sb)]
               : "unknown",
 
-              CACHE_SET_STR_HASH_TYPE(sb) < BCH_STR_HASH_NR
-              ? bch_str_hash_types[CACHE_SET_STR_HASH_TYPE(sb)]
+              BCH_SB_STR_HASH_TYPE(sb) < BCH_STR_HASH_NR
+              ? bch_str_hash_types[BCH_SB_STR_HASH_TYPE(sb)]
               : "unknown",
 
-              CACHE_INODE_32BIT(sb),
-              CACHE_SET_GC_RESERVE(sb),
-              CACHE_SET_ROOT_RESERVE(sb),
+              BCH_SB_INODE_32BIT(sb),
+              BCH_SB_GC_RESERVE(sb),
+              BCH_SB_ROOT_RESERVE(sb),
 
-              sb->nr_in_set);
+              sb->nr_devices);
 
-       for (i = 0; i < sb->nr_in_set; i++) {
-               struct cache_member *m = sb->members + i;
+       mi = bch_sb_get_members(sb);
+       if (!mi) {
+               printf("Member info section missing\n");
+               return;
+       }
+
+       for (i = 0; i < sb->nr_devices; i++) {
+               struct bch_member *m = mi->members + i;
                time_t last_mount = le64_to_cpu(m->last_mount);
 
                uuid_unparse(m->uuid.b, member_uuid_str);
@@ -290,41 +368,18 @@ void bcache_super_print(struct cache_sb *sb, int units)
                       le64_to_cpu(m->nbuckets),
                       last_mount ? ctime(&last_mount) : "(never)",
 
-                      CACHE_STATE(m) < CACHE_STATE_NR
-                      ? bch_cache_state[CACHE_STATE(m)]
+                      BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR
+                      ? bch_cache_state[BCH_MEMBER_STATE(m)]
                       : "unknown",
 
-                      CACHE_TIER(m),
-                      CACHE_HAS_METADATA(m),
-                      CACHE_HAS_DATA(m),
+                      BCH_MEMBER_TIER(m),
+                      BCH_MEMBER_HAS_METADATA(m),
+                      BCH_MEMBER_HAS_DATA(m),
 
-                      CACHE_REPLACEMENT(m) < CACHE_REPLACEMENT_NR
-                      ? bch_cache_replacement_policies[CACHE_REPLACEMENT(m)]
+                      BCH_MEMBER_REPLACEMENT(m) < CACHE_REPLACEMENT_NR
+                      ? bch_cache_replacement_policies[BCH_MEMBER_REPLACEMENT(m)]
                       : "unknown",
 
-                      CACHE_DISCARD(m));
+                      BCH_MEMBER_DISCARD(m));
        }
 }
-
-struct cache_sb *bcache_super_read(const char *path)
-{
-       struct cache_sb sb, *ret;
-       size_t bytes;
-
-       int fd = open(path, O_RDONLY);
-       if (fd < 0)
-               die("couldn't open %s", path);
-
-       xpread(fd, &sb, sizeof(sb), SB_SECTOR << 9);
-
-       if (memcmp(&sb.magic, &BCACHE_MAGIC, sizeof(sb.magic)))
-               die("not a bcache superblock");
-
-       bytes = sizeof(sb) + le16_to_cpu(sb.u64s) * sizeof(u64);
-
-       ret = calloc(1, bytes);
-
-       xpread(fd, ret, bytes, SB_SECTOR << 9);
-
-       return ret;
-}
index 07329cd1f5cb092eaaa13792f5fe186f344e039e..6ec3f42dd1c091f77e93a963123fbab73e58ecaf 100644 (file)
@@ -2,6 +2,8 @@
 #define _LIBBCACHE_H
 
 #include <linux/uuid.h>
+#include "tools-util.h"
+#include "vstructs.h"
 #include "stdbool.h"
 
 #include "tools-util.h"
@@ -34,6 +36,7 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs,
                   unsigned meta_csum_type,
                   unsigned data_csum_type,
                   unsigned compression_type,
+                  const char *passphrase,
                   unsigned meta_replicas,
                   unsigned data_replicas,
                   unsigned on_error_action,
@@ -41,8 +44,8 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs,
                   char *label,
                   uuid_le uuid);
 
-void bcache_super_print(struct cache_sb *, int);
+struct bch_sb *bcache_super_read(const char *);
 
-struct cache_sb *bcache_super_read(const char *);
+void bcache_super_print(struct bch_sb *, int);
 
 #endif /* _LIBBCACHE_H */
index 64d56165fea3402f5ebcb5a348e268ac115ba820..468d98dada76ec269e3a6876cc47aa2cdfc1d8b6 100644 (file)
@@ -187,7 +187,7 @@ int bch_set_acl(struct inode *inode, struct posix_acl *acl, int type)
                        if (ret < 0)
                                return ret;
                        else {
-                               inode->i_ctime = CURRENT_TIME_SEC;
+                               inode->i_ctime = current_fs_time(inode->i_sb);
                                mark_inode_dirty(inode);
                                if (ret == 0)
                                        acl = NULL;
index 4fe08b5718937ca7f85849f359fe5dad92f67b97..cd22c3812d5687c6e107fbf958c63e666fd994d9 100644 (file)
@@ -64,7 +64,7 @@
 #include "extents.h"
 #include "io.h"
 #include "journal.h"
-#include "super.h"
+#include "super-io.h"
 
 #include <linux/blkdev.h>
 #include <linux/kthread.h>
@@ -105,7 +105,7 @@ void bch_cache_group_add_cache(struct cache_group *grp, struct cache *ca)
                if (rcu_access_pointer(grp->d[i].dev) == ca)
                        goto out;
 
-       BUG_ON(grp->nr_devices >= MAX_CACHES_PER_SET);
+       BUG_ON(grp->nr_devices >= BCH_SB_MEMBERS_MAX);
 
        rcu_assign_pointer(grp->d[grp->nr_devices++].dev, ca);
 out:
@@ -124,9 +124,9 @@ static void pd_controllers_update(struct work_struct *work)
        int i;
 
        /* All units are in bytes */
-       u64 tier_size[CACHE_TIERS];
-       u64 tier_free[CACHE_TIERS];
-       u64 tier_dirty[CACHE_TIERS];
+       u64 tier_size[BCH_TIER_MAX];
+       u64 tier_free[BCH_TIER_MAX];
+       u64 tier_dirty[BCH_TIER_MAX];
        u64 tier0_can_free = 0;
 
        memset(tier_size, 0, sizeof(tier_size));
@@ -134,7 +134,7 @@ static void pd_controllers_update(struct work_struct *work)
        memset(tier_dirty, 0, sizeof(tier_dirty));
 
        rcu_read_lock();
-       for (i = CACHE_TIERS - 1; i >= 0; --i)
+       for (i = BCH_TIER_MAX - 1; i >= 0; --i)
                group_for_each_cache_rcu(ca, &c->cache_tiers[i], iter) {
                        struct bucket_stats_cache stats = bch_bucket_stats_read_cache(ca);
                        unsigned bucket_bits = ca->bucket_bits + 9;
@@ -246,6 +246,16 @@ static int prio_io(struct cache *ca, uint64_t bucket, int op)
        return submit_bio_wait(ca->bio_prio);
 }
 
+static struct nonce prio_nonce(struct prio_set *p)
+{
+       return (struct nonce) {{
+               [0] = 0,
+               [1] = p->nonce[0],
+               [2] = p->nonce[1],
+               [3] = p->nonce[2] ^ BCH_NONCE_PRIO,
+       }};
+}
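Each prio_set gets 96 random bits (see get_random_bytes() below) occupying words 1..3 of the 128-bit ChaCha20 nonce, with the BCH_NONCE_PRIO constant folded into the last word so prio buckets can never share a key stream with other metadata encrypted under the same key. The write path below is encrypt-then-MAC:

    /* ordering in bch_prio_write():
     *   1. bch_encrypt() the region from p->encrypted_start onward
     *   2. checksum everything after p->csum, ciphertext included
     * so the read path can verify the csum before touching the data. */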
+
 static int bch_prio_write(struct cache *ca)
 {
        struct cache_set *c = ca->set;
@@ -279,12 +289,8 @@ static int bch_prio_write(struct cache *ca)
                }
 
                p->next_bucket  = cpu_to_le64(ca->prio_buckets[i + 1]);
-               p->magic        = cpu_to_le64(pset_magic(&c->disk_sb));
-
-               SET_PSET_CSUM_TYPE(p, c->opts.metadata_checksum);
-               p->csum         = cpu_to_le64(bch_checksum(PSET_CSUM_TYPE(p),
-                                                          &p->magic,
-                                                          bucket_bytes(ca) - 8));
+               p->magic        = cpu_to_le64(pset_magic(c));
+               get_random_bytes(&p->nonce, sizeof(p->nonce));
 
                spin_lock(&ca->prio_buckets_lock);
                r = bch_bucket_alloc(ca, RESERVE_PRIO);
@@ -298,6 +304,19 @@ static int bch_prio_write(struct cache *ca)
                bch_mark_metadata_bucket(ca, ca->buckets + r, false);
                spin_unlock(&ca->prio_buckets_lock);
 
+               SET_PSET_CSUM_TYPE(p, bch_meta_checksum_type(c));
+
+               bch_encrypt(c, PSET_CSUM_TYPE(p),
+                           prio_nonce(p),
+                           p->encrypted_start,
+                           bucket_bytes(ca) -
+                           offsetof(struct prio_set, encrypted_start));
+
+               p->csum  = bch_checksum(c, PSET_CSUM_TYPE(p),
+                                       prio_nonce(p),
+                                       (void *) p + sizeof(p->csum),
+                                       bucket_bytes(ca) - sizeof(p->csum));
+
                ret = prio_io(ca, r, REQ_OP_WRITE);
                if (cache_fatal_io_err_on(ret, ca,
                                          "prio write to bucket %zu", r) ||
@@ -306,9 +325,9 @@ static int bch_prio_write(struct cache *ca)
        }
 
        spin_lock(&j->lock);
-       j->prio_buckets[ca->sb.nr_this_dev] = cpu_to_le64(ca->prio_buckets[0]);
+       j->prio_buckets[ca->dev_idx] = cpu_to_le64(ca->prio_buckets[0]);
        j->nr_prio_buckets = max_t(unsigned,
-                                  ca->sb.nr_this_dev + 1,
+                                  ca->dev_idx + 1,
                                   j->nr_prio_buckets);
        spin_unlock(&j->lock);
 
@@ -320,7 +339,7 @@ static int bch_prio_write(struct cache *ca)
                        return ret;
 
                need_new_journal_entry = j->buf[res.idx].nr_prio_buckets <
-                       ca->sb.nr_this_dev + 1;
+                       ca->dev_idx + 1;
                bch_journal_res_put(j, &res);
 
                ret = bch_journal_flush_seq(j, res.seq);
@@ -355,13 +374,14 @@ int bch_prio_read(struct cache *ca)
        struct prio_set *p = ca->disk_buckets;
        struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
        struct bucket_mark new;
+       struct bch_csum csum;
        unsigned bucket_nr = 0;
        u64 bucket, expect, got;
        size_t b;
        int ret = 0;
 
        spin_lock(&c->journal.lock);
-       bucket = le64_to_cpu(c->journal.prio_buckets[ca->sb.nr_this_dev]);
+       bucket = le64_to_cpu(c->journal.prio_buckets[ca->dev_idx]);
        spin_unlock(&c->journal.lock);
 
        /*
@@ -387,18 +407,28 @@ int bch_prio_read(struct cache *ca)
                                return -EIO;
 
                        got = le64_to_cpu(p->magic);
-                       expect = pset_magic(&c->disk_sb);
+                       expect = pset_magic(c);
                        unfixable_fsck_err_on(got != expect, c,
                                "bad magic (got %llu expect %llu) while reading prios from bucket %llu",
                                got, expect, bucket);
 
-                       got = le64_to_cpu(p->csum);
-                       expect = bch_checksum(PSET_CSUM_TYPE(p),
-                                             &p->magic,
-                                             bucket_bytes(ca) - 8);
-                       unfixable_fsck_err_on(got != expect, c,
-                               "bad checksum (got %llu expect %llu) while reading prios from bucket %llu",
-                               got, expect, bucket);
+                       unfixable_fsck_err_on(PSET_CSUM_TYPE(p) >= BCH_CSUM_NR, c,
+                               "prio bucket with unknown csum type %llu bucket %lluu",
+                               PSET_CSUM_TYPE(p), bucket);
+
+                       csum = bch_checksum(c, PSET_CSUM_TYPE(p),
+                                           prio_nonce(p),
+                                           (void *) p + sizeof(p->csum),
+                                           bucket_bytes(ca) - sizeof(p->csum));
+                       unfixable_fsck_err_on(bch_crc_cmp(csum, p->csum), c,
+                               "bad checksum reading prios from bucket %llu",
+                               bucket);
+
+                       bch_encrypt(c, PSET_CSUM_TYPE(p),
+                                   prio_nonce(p),
+                                   p->encrypted_start,
+                                   bucket_bytes(ca) -
+                                   offsetof(struct prio_set, encrypted_start));
 
                        bucket = le64_to_cpu(p->next_bucket);
                        d = p->data;
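Note that the read path calls bch_encrypt() to decrypt: ChaCha20 is an XOR key stream, so with the same key and nonce the call is its own inverse.

    bch_encrypt(c, type, nonce, buf, len);  /* plaintext  -> ciphertext */
    bch_encrypt(c, type, nonce, buf, len);  /* ciphertext -> plaintext  */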
@@ -1029,7 +1059,7 @@ static enum bucket_alloc_ret bch_bucket_alloc_group(struct cache_set *c,
        spin_lock(&devs->lock);
 
        for (i = 0; i < devs->nr_devices; i++)
-               available += !test_bit(devs->d[i].dev->sb.nr_this_dev,
+               available += !test_bit(devs->d[i].dev->dev_idx,
                                       caches_used);
 
        recalc_alloc_group_weights(c, devs);
@@ -1054,7 +1084,7 @@ static enum bucket_alloc_ret bch_bucket_alloc_group(struct cache_set *c,
 
                ca = devs->d[i].dev;
 
-               if (test_bit(ca->sb.nr_this_dev, caches_used))
+               if (test_bit(ca->dev_idx, caches_used))
                        continue;
 
                if (fail_idx == -1 &&
@@ -1082,11 +1112,11 @@ static enum bucket_alloc_ret bch_bucket_alloc_group(struct cache_set *c,
                ob->ptrs[0] = (struct bch_extent_ptr) {
                        .gen    = ca->buckets[bucket].mark.gen,
                        .offset = bucket_to_sector(ca, bucket),
-                       .dev    = ca->sb.nr_this_dev,
+                       .dev    = ca->dev_idx,
                };
                ob->ptr_offset[0] = 0;
 
-               __set_bit(ca->sb.nr_this_dev, caches_used);
+               __set_bit(ca->dev_idx, caches_used);
                available--;
                devs->cur_device = i;
        }
@@ -1334,7 +1364,7 @@ static int open_bucket_add_buckets(struct cache_set *c,
                                   enum alloc_reserve reserve,
                                   struct closure *cl)
 {
-       long caches_used[BITS_TO_LONGS(MAX_CACHES_PER_SET)];
+       long caches_used[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)];
        int i, dst;
 
        /*
@@ -1475,6 +1505,7 @@ void bch_alloc_sectors_append_ptrs(struct cache_set *c, struct bkey_i_extent *e,
                EBUG_ON(bch_extent_has_device(extent_i_to_s_c(e), ob->ptrs[i].dev));
 
                tmp = ob->ptrs[i];
+               tmp.cached = bkey_extent_is_cached(&e->k);
                tmp.offset += ob->ptr_offset[i];
                extent_ptr_append(e, tmp);
 
@@ -1657,7 +1688,7 @@ static void bch_stop_write_point(struct cache *ca,
                return;
 
        for (ptr = ob->ptrs; ptr < ob->ptrs + ob->nr_ptrs; ptr++)
-               if (ptr->dev == ca->sb.nr_this_dev)
+               if (ptr->dev == ca->dev_idx)
                        goto found;
 
        mutex_unlock(&ob->lock);
@@ -1682,7 +1713,7 @@ static bool bch_dev_has_open_write_point(struct cache *ca)
                if (atomic_read(&ob->pin)) {
                        mutex_lock(&ob->lock);
                        for (ptr = ob->ptrs; ptr < ob->ptrs + ob->nr_ptrs; ptr++)
-                               if (ptr->dev == ca->sb.nr_this_dev) {
+                               if (ptr->dev == ca->dev_idx) {
                                        mutex_unlock(&ob->lock);
                                        return true;
                                }
index 337b6e46517a88f60077a5f82ed012e5a040c6db..fbe8b75c8251750ab96137fc20e0b57f6cd830fa 100644 (file)
@@ -56,7 +56,7 @@ struct cache_group {
        struct {
                u64             weight;
                struct cache    *dev;
-       }                       d[MAX_CACHES_PER_SET];
+       }                       d[BCH_SB_MEMBERS_MAX];
 };
 
 /* Enough for 16 cache devices, 2 tiers and some left over for pipelining */
index 309d37286a6dce76c3026ce25d6f47d10e949bdb..8a0262fbc1a7a960d37b0d2d33df5a83f00e0634 100644 (file)
@@ -314,6 +314,8 @@ do {                                                                        \
 
 struct btree;
 struct cache;
+struct crypto_blkcipher;
+struct crypto_ahash;
 
 enum gc_phase {
        GC_PHASE_PENDING_DELETE         = BTREE_ID_NR + 1,
@@ -332,7 +334,6 @@ struct cache_member_cpu {
        u16                     bucket_size;    /* sectors */
        u8                      state;
        u8                      tier;
-       u8                      replication_set;
        u8                      has_metadata;
        u8                      has_data;
        u8                      replacement;
@@ -342,7 +343,7 @@ struct cache_member_cpu {
 
 struct cache_member_rcu {
        struct rcu_head         rcu;
-       unsigned                nr_in_set;
+       unsigned                nr_devices;
        struct cache_member_cpu m[];
 };
 
@@ -363,14 +364,13 @@ struct cache {
 
        struct cache_group      self;
 
+       u8                      dev_idx;
        /*
         * Cached version of this device's member info from superblock
-        * Committed by write_super()
+        * Committed by bch_write_super() -> bch_cache_set_mi_update()
         */
-       struct {
-               u8              nr_this_dev;
-       }                       sb;
        struct cache_member_cpu mi;
+       uuid_le                 uuid;
 
        struct bcache_superblock disk_sb;
 
@@ -518,36 +518,45 @@ struct cache_set {
        struct percpu_ref       writes;
        struct work_struct      read_only_work;
 
-       struct cache __rcu      *cache[MAX_CACHES_PER_SET];
-
-       struct mutex            mi_lock;
-       struct cache_member_rcu __rcu *members;
-       struct cache_member     *disk_mi; /* protected by register_lock */
+       struct cache __rcu      *cache[BCH_SB_MEMBERS_MAX];
 
        struct cache_set_opts   opts;
 
        /*
         * Cached copy in native endianness:
-        * Set by cache_sb_to_cache_set:
+        * Set by bch_cache_set_mi_update():
         */
+       struct cache_member_rcu __rcu *members;
+
+       /* Updated by bch_sb_update(): */
        struct {
+               uuid_le         uuid;
+               uuid_le         user_uuid;
+
                u16             block_size;
                u16             btree_node_size;
 
-               u8              nr_in_set;
+               u8              nr_devices;
                u8              clean;
 
                u8              meta_replicas_have;
                u8              data_replicas_have;
 
                u8              str_hash_type;
+               u8              encryption_type;
+
+               u64             time_base_lo;
+               u32             time_base_hi;
+               u32             time_precision;
        }                       sb;
 
-       struct cache_sb         disk_sb;
+       struct bch_sb           *disk_sb;
+       unsigned                disk_sb_order;
+
        unsigned short          block_bits;     /* ilog2(block_size) */
 
        struct closure          sb_write;
-       struct semaphore        sb_write_mutex;
+       struct mutex            sb_lock;
 
        struct backing_dev_info bdi;
 
@@ -631,7 +640,7 @@ struct cache_set {
         * allocate from:
         */
        struct cache_group      cache_all;
-       struct cache_group      cache_tiers[CACHE_TIERS];
+       struct cache_group      cache_tiers[BCH_TIER_MAX];
 
        u64                     capacity; /* sectors */
 
@@ -724,6 +733,11 @@ struct cache_set {
        struct bio_decompress_worker __percpu
                                *bio_decompress_worker;
 
+       struct crypto_blkcipher *chacha20;
+       struct crypto_shash     *poly1305;
+
+       atomic64_t              key_version;
+
        /* For punting bio submissions to workqueue, io.c */
        struct bio_list         bio_submit_list;
        struct work_struct      bio_submit_work;
index 64d2c845ce228c25cd2c783f5a670ada7b13f7ce..374237e2ea2b2203aaa2bfbe6f477af0545bf4fd 100644 (file)
@@ -81,9 +81,9 @@ int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k)
 
 #define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
 
-       p("u64s %u type %u %llu:%llu snap %u len %u ver %u",
+       p("u64s %u type %u %llu:%llu snap %u len %u ver %llu",
          k->u64s, k->type, k->p.inode, k->p.offset,
-         k->p.snapshot, k->size, k->version);
+         k->p.snapshot, k->size, k->version.lo);
 
        BUG_ON(bkey_packed(k));
 
@@ -258,13 +258,21 @@ bool bch_bkey_transform(const struct bkey_format *out_f,
        return true;
 }
 
+#define bkey_fields()                                                  \
+       x(BKEY_FIELD_INODE,             p.inode)                        \
+       x(BKEY_FIELD_OFFSET,            p.offset)                       \
+       x(BKEY_FIELD_SNAPSHOT,          p.snapshot)                     \
+       x(BKEY_FIELD_SIZE,              size)                           \
+       x(BKEY_FIELD_VERSION_HI,        version.hi)                     \
+       x(BKEY_FIELD_VERSION_LO,        version.lo)
+
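bkey_fields() is an x-macro: each site defines x(id, field) for its own purpose, instantiates the list, and undefines it again, so adding a field (here, splitting version into VERSION_HI/VERSION_LO) means editing one list rather than five hand-rolled field sequences. The unpack use just below, for example, expands to:

    out.p.inode     = get_inc_field(&state, BKEY_FIELD_INODE);
    out.p.offset    = get_inc_field(&state, BKEY_FIELD_OFFSET);
    out.p.snapshot  = get_inc_field(&state, BKEY_FIELD_SNAPSHOT);
    out.size        = get_inc_field(&state, BKEY_FIELD_SIZE);
    out.version.hi  = get_inc_field(&state, BKEY_FIELD_VERSION_HI);
    out.version.lo  = get_inc_field(&state, BKEY_FIELD_VERSION_LO);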
 struct bkey __bkey_unpack_key(const struct bkey_format *format,
                              const struct bkey_packed *in)
 {
        struct unpack_state state = unpack_state_init(format, in);
        struct bkey out;
 
-       EBUG_ON(format->nr_fields != 5);
+       EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
        EBUG_ON(in->u64s < format->key_u64s);
        EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
        EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX);
@@ -274,11 +282,10 @@ struct bkey __bkey_unpack_key(const struct bkey_format *format,
        out.needs_whiteout = in->needs_whiteout;
        out.type        = in->type;
        out.pad[0]      = 0;
-       out.p.inode     = get_inc_field(&state, BKEY_FIELD_INODE);
-       out.p.offset    = get_inc_field(&state, BKEY_FIELD_OFFSET);
-       out.p.snapshot  = get_inc_field(&state, BKEY_FIELD_SNAPSHOT);
-       out.size        = get_inc_field(&state, BKEY_FIELD_SIZE);
-       out.version     = get_inc_field(&state, BKEY_FIELD_VERSION);
+
+#define x(id, field)   out.field = get_inc_field(&state, id);
+       bkey_fields()
+#undef x
 
        return out;
 }
@@ -290,7 +297,7 @@ struct bpos __bkey_unpack_pos(const struct bkey_format *format,
        struct unpack_state state = unpack_state_init(format, in);
        struct bpos out;
 
-       EBUG_ON(format->nr_fields != 5);
+       EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
        EBUG_ON(in->u64s < format->key_u64s);
        EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
 
@@ -311,17 +318,14 @@ bool bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
        struct pack_state state = pack_state_init(format, out);
 
        EBUG_ON((void *) in == (void *) out);
-       EBUG_ON(format->nr_fields != 5);
+       EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
        EBUG_ON(in->format != KEY_FORMAT_CURRENT);
 
        out->_data[0] = 0;
 
-       if (!set_inc_field(&state, BKEY_FIELD_INODE,    in->p.inode) ||
-           !set_inc_field(&state, BKEY_FIELD_OFFSET,   in->p.offset) ||
-           !set_inc_field(&state, BKEY_FIELD_SNAPSHOT, in->p.snapshot) ||
-           !set_inc_field(&state, BKEY_FIELD_SIZE,     in->size) ||
-           !set_inc_field(&state, BKEY_FIELD_VERSION,  in->version))
-               return false;
+#define x(id, field)   if (!set_inc_field(&state, id, in->field)) return false;
+       bkey_fields()
+#undef x
 
        /*
         * Extents - we have to guarantee that if an extent is packed, a trimmed
@@ -340,47 +344,6 @@ bool bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
        return true;
 }
 
-/*
- * Alternate implementations using bch_bkey_transform_key() - unfortunately, too
- * slow
- */
-#if 0
-struct bkey __bkey_unpack_key(const struct bkey_format *format,
-                             const struct bkey_packed *in)
-{
-       struct bkey out;
-       bool s;
-
-       EBUG_ON(format->nr_fields != 5);
-       EBUG_ON(in->u64s < format->key_u64s);
-       EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
-
-       s = bch_bkey_transform_key(&bch_bkey_format_current, (void *) &out,
-                                  format, in);
-       EBUG_ON(!s);
-
-       out.format = KEY_FORMAT_CURRENT;
-
-       return out;
-}
-
-bool bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
-                  const struct bkey_format *format)
-{
-       EBUG_ON(format->nr_fields != 5);
-       EBUG_ON(in->format != KEY_FORMAT_CURRENT);
-
-       if (!bch_bkey_transform_key(format, out,
-                                   &bch_bkey_format_current, (void *) in))
-               return false;
-
-       out->format = KEY_FORMAT_LOCAL_BTREE;
-
-       bch_bkey_pack_verify(out, in, format);
-       return true;
-}
-#endif
-
 /**
  * bkey_unpack -- unpack the key and the value
  */
@@ -588,12 +551,10 @@ static void __bkey_format_add(struct bkey_format_state *s,
  */
 void bch_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k)
 {
-       __bkey_format_add(s, BKEY_FIELD_INODE, k->p.inode);
-       __bkey_format_add(s, BKEY_FIELD_OFFSET, k->p.offset);
+#define x(id, field) __bkey_format_add(s, id, k->field);
+       bkey_fields()
+#undef x
        __bkey_format_add(s, BKEY_FIELD_OFFSET, bkey_start_offset(k));
-       __bkey_format_add(s, BKEY_FIELD_SNAPSHOT, k->p.snapshot);
-       __bkey_format_add(s, BKEY_FIELD_SIZE, k->size);
-       __bkey_format_add(s, BKEY_FIELD_VERSION, k->version);
 }
 
 void bch_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p)
@@ -636,6 +597,12 @@ struct bkey_format bch_bkey_format_done(struct bkey_format_state *s)
                bits += ret.bits_per_field[i];
        }
 
+       /* allow for extent merging: */
+       if (ret.bits_per_field[BKEY_FIELD_SIZE]) {
+               ret.bits_per_field[BKEY_FIELD_SIZE] += 4;
+               bits += 4;
+       }
+
        ret.key_u64s = DIV_ROUND_UP(bits, 64);
 
        /* if we have enough spare bits, round fields up to nearest byte */
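The extra 4 bits added to BKEY_FIELD_SIZE above give the packed size field 16x headroom, so extents merged after the format was computed can still be packed. For example, if the largest extent seen while building the format needed 7 bits:

    7 bits      -> max size 127 sectors
    7 + 4 = 11  -> max size 2047 sectors  (16x headroom)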
@@ -1014,25 +981,13 @@ int bch_compile_bkey_format(const struct bkey_format *format, void *_out)
        /* mov [rdi], eax */
        I2(0x89, 0x07);
 
-       out = compile_bkey_field(format, out,   BKEY_FIELD_INODE,
-                                offsetof(struct bkey, p.inode), 8,
-                                &eax_zeroed);
-
-       out = compile_bkey_field(format, out,   BKEY_FIELD_OFFSET,
-                                offsetof(struct bkey, p.offset), 8,
-                                &eax_zeroed);
-
-       out = compile_bkey_field(format, out,   BKEY_FIELD_SNAPSHOT,
-                                offsetof(struct bkey, p.snapshot), 4,
-                                &eax_zeroed);
-
-       out = compile_bkey_field(format, out,   BKEY_FIELD_SIZE,
-                                offsetof(struct bkey, size), 4,
-                                &eax_zeroed);
-
-       out = compile_bkey_field(format, out,   BKEY_FIELD_VERSION,
-                                offsetof(struct bkey, version), 4,
+#define x(id, field)                                                   \
+       out = compile_bkey_field(format, out, id,                       \
+                                offsetof(struct bkey, field),          \
+                                sizeof(((struct bkey *) NULL)->field), \
                                 &eax_zeroed);
+       bkey_fields()
+#undef x
 
        /* retq */
        I1(0xc3);
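
The compiled accessor now takes each field's width from sizeof(((struct bkey *) NULL)->field). This is legal because sizeof is an unevaluated context, so the null pointer is never dereferenced; the kernel wraps the same trick as FIELD_SIZEOF() (later sizeof_field()). A tiny standalone demo:

    #include <stdio.h>

    struct bkey_demo {
            unsigned long long offset;      /* 8 bytes */
            unsigned int       size;        /* 4 bytes */
    };

    /* sizeof never evaluates its operand, so the NULL deref is fine */
    #define member_size(type, member)  sizeof(((type *) NULL)->member)

    int main(void)
    {
            printf("%zu %zu\n",
                   member_size(struct bkey_demo, offset),   /* 8 */
                   member_size(struct bkey_demo, size));    /* 4 */
            return 0;
    }
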
@@ -1078,43 +1033,6 @@ static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
 }
 #endif
 
-/*
- * Would like to use this if we can make __bkey_cmp_bits() fast enough, it'll be
- * a decent reduction in code size
- */
-#if 0
-static int bkey_cmp_verify(const struct bkey *l, const struct bkey *r)
-{
-       if (l->p.inode != r->p.inode)
-               return l->p.inode < r->p.inode ? -1 : 1;
-
-       if (l->p.offset != r->p.offset)
-               return l->p.offset < r->p.offset ? -1 : 1;
-
-       if (l->p.snapshot != r->p.snapshot)
-               return l->p.snapshot < r->p.snapshot ? -1 : 1;
-
-       return 0;
-}
-
-int bkey_cmp(const struct bkey *l, const struct bkey *r)
-{
-       int ret;
-
-       EBUG_ON(bkey_packed(l) || bkey_packed(r));
-
-       ret = __bkey_cmp_bits((sizeof(l->inode) +
-                              sizeof(l->offset) +
-                              sizeof(l->snapshot)) * BITS_PER_BYTE,
-                             __high_word(BKEY_U64s, l),
-                             __high_word(BKEY_U64s, r));
-
-       BUG_ON(ret != bkey_cmp_verify(l, r));
-
-       return ret;
-}
-#endif
-
 __pure
 int __bkey_cmp_packed_format_checked(const struct bkey_packed *l,
                                     const struct bkey_packed *r,
@@ -1214,7 +1132,7 @@ void bkey_pack_test(void)
 
        struct bkey_format test_format = {
                .key_u64s       = 2,
-               .nr_fields      = 5,
+               .nr_fields      = BKEY_NR_FIELDS,
                .bits_per_field = {
                        13,
                        64,
@@ -1230,21 +1148,9 @@ void bkey_pack_test(void)
                u64 a, v = get_inc_field(&in_s, i);
 
                switch (i) {
-               case 0:
-                       a = t.p.inode;
-                       break;
-               case 1:
-                       a = t.p.offset;
-                       break;
-               case 2:
-                       a = t.p.snapshot;
-                       break;
-               case 3:
-                       a = t.size;
-                       break;
-               case 4:
-                       a = t.version;
-                       break;
+#define x(id, field)   case id: a = t.field; break;
+       bkey_fields()
+#undef x
                default:
                        BUG();
                }
index 3e29cdde93b4cccb4e375c518c91ea41f2861093..0893134f6673377e8db3e5585befe42369507b2d 100644 (file)
@@ -5,6 +5,7 @@
 #include <linux/bcache.h>
 
 #include "util.h"
+#include "vstructs.h"
 
 void bch_to_binary(char *, const u64 *, unsigned);
 int bch_bkey_to_text(char *, size_t, const struct bkey *);
@@ -28,15 +29,7 @@ struct bkey_s {
        };
 };
 
-#define bkey_next(_k)                                                  \
-({                                                                     \
-       BUILD_BUG_ON(!type_is(_k, struct bkey *) &&                     \
-                    !type_is(_k, struct bkey_i *) &&                   \
-                    !type_is(_k, struct bkey_packed *));               \
-                                                                       \
-       ((typeof(_k)) __bkey_idx(((struct bkey *) (_k)),                \
-                                ((struct bkey *) (_k))->u64s));        \
-})
+#define bkey_next(_k)          vstruct_next(_k)
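
bkey_next() becomes an alias for the generic vstruct_next() from the new vstructs.h (added by this commit, not shown in this section). Many of the on-disk structures here (bsets, journal entries, the superblock) share one shape: a fixed header whose u64s field counts a trailing array of 64-bit words, so a single macro family can compute their extent. A simplified sketch, assuming a plain u64s count (the real header also handles __le16/__le32/__le64 counts):

    #include <stddef.h>
    #include <stdint.h>

    /* a "vstruct": header with a u64s count followed by that many words */
    struct vstruct_demo { uint16_t u64s; uint64_t _data[]; };

    #define vstruct_bytes(_s)                                           \
            (offsetof(__typeof__(*(_s)), _data) +                       \
             (size_t) (_s)->u64s * sizeof(uint64_t))
    #define vstruct_end(_s)    ((void *) (_s) + vstruct_bytes(_s))
    #define vstruct_next(_s)   ((__typeof__(_s)) vstruct_end(_s))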
 
 static inline unsigned bkey_val_u64s(const struct bkey *k)
 {
@@ -218,6 +211,22 @@ static inline struct bpos bpos_min(struct bpos l, struct bpos r)
 void bch_bpos_swab(struct bpos *);
 void bch_bkey_swab_key(const struct bkey_format *, struct bkey_packed *);
 
+static __always_inline int bversion_cmp(struct bversion l, struct bversion r)
+{
+       if (l.hi != r.hi)
+               return l.hi < r.hi ? -1 : 1;
+       if (l.lo != r.lo)
+               return l.lo < r.lo ? -1 : 1;
+       return 0;
+}
+
+#define ZERO_VERSION   ((struct bversion) { .hi = 0, .lo = 0 })
+
+static __always_inline int bversion_zero(struct bversion v)
+{
+       return !bversion_cmp(v, ZERO_VERSION);
+}
+
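bversion_cmp() orders the new two-word version numbers lexicographically, comparing the high word first. A usage sketch against the definitions above:

    struct bversion a = { .hi = 0, .lo = 5 };
    struct bversion b = { .hi = 1, .lo = 0 };

    BUG_ON(bversion_cmp(a, b) != -1);       /* hi dominates lo */
    BUG_ON(bversion_cmp(a, a) != 0);
    BUG_ON(!bversion_zero(ZERO_VERSION));
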
 #ifdef CONFIG_BCACHE_DEBUG
 /* statement expressions confusing unlikely()? */
 #define bkey_packed(_k)                                                        \
@@ -555,6 +564,7 @@ static inline void __bch_extent_assert(u8 type, u8 nr)
 }
 
 __BKEY_VAL_ACCESSORS(extent,           BCH_EXTENT, __bch_extent_assert);
+BKEY_VAL_ACCESSORS(reservation,                BCH_RESERVATION);
 
 BKEY_VAL_ACCESSORS(inode,              BCH_INODE_FS);
 BKEY_VAL_ACCESSORS(inode_blockdev,     BCH_INODE_BLOCKDEV);
index cd231f5ee7372558550212fb9fd2be7d91038695..d3a373c2a61612c64ac67a8a80084a63b9204dad 100644 (file)
@@ -2,11 +2,12 @@
 #include "bcache.h"
 #include "blockdev.h"
 #include "btree_iter.h"
+#include "btree_update.h"
 #include "checksum.h"
 #include "error.h"
 #include "inode.h"
 #include "request.h"
-#include "super.h"
+#include "super-io.h"
 #include "writeback.h"
 
 #include <linux/kthread.h>
@@ -42,15 +43,22 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
        down(&dc->sb_write_mutex);
        closure_init(cl, parent);
 
+       sb->csum = csum_vstruct(NULL, BCH_CSUM_CRC64,
+                               (struct nonce) { 0 }, sb).lo;
+
        bio_reset(bio);
-       bio->bi_end_io  = write_bdev_super_endio;
-       bio->bi_private = dc;
+       bio->bi_bdev            = dc->disk_sb.bdev;
+       bio->bi_iter.bi_sector  = le64_to_cpu(sb->offset);
+       bio->bi_iter.bi_size    =
+               roundup(vstruct_bytes(sb),
+                       bdev_logical_block_size(dc->disk_sb.bdev));
+       bio->bi_end_io          = write_bdev_super_endio;
+       bio->bi_private         = dc;
+       bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FUA|REQ_META);
+       bch_bio_map(bio, sb);
 
        closure_get(cl);
 
-       sb->csum = cpu_to_le64(__csum_set(sb, 0, BCH_CSUM_CRC64));
-       __write_super(dc->disk.c, (void *) &dc->disk_sb);
-
        closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
 }
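
Note the ordering in the rewritten bch_write_bdev_super(): the checksum is taken over the superblock's variable length first, then the bio is sized by rounding that length up to the device's logical block size. With a hypothetical 1900-byte superblock on a 512-byte-sector device, 2048 bytes get written:

    #include <stdio.h>

    /* power-of-two round-up, as the kernel's round_up(); the kernel's
     * roundup() used above is the division form for any multiple */
    #define round_up(n, sz)  (((n) + (sz) - 1) & ~((sz) - 1))

    int main(void)
    {
            printf("%u\n", round_up(1900u, 512u));  /* 2048 */
            return 0;
    }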
 
@@ -263,7 +271,7 @@ static void calc_cached_dev_sectors(struct cache_set *c)
 void bch_cached_dev_run(struct cached_dev *dc)
 {
        struct bcache_device *d = &dc->disk;
-       char buf[SB_LABEL_SIZE + 1];
+       char buf[BCH_SB_LABEL_SIZE + 1];
        char *env[] = {
                "DRIVER=bcache",
                kasprintf(GFP_KERNEL, "CACHED_UUID=%pU",
@@ -272,8 +280,8 @@ void bch_cached_dev_run(struct cached_dev *dc)
                NULL,
        };
 
-       memcpy(buf, dc->disk_sb.sb->label, SB_LABEL_SIZE);
-       buf[SB_LABEL_SIZE] = '\0';
+       memcpy(buf, dc->disk_sb.sb->label, BCH_SB_LABEL_SIZE);
+       buf[BCH_SB_LABEL_SIZE] = '\0';
        env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);
 
        if (atomic_xchg(&dc->running, 1)) {
@@ -370,8 +378,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
        bdevname(dc->disk_sb.bdev, buf);
 
        if (memcmp(&dc->disk_sb.sb->set_uuid,
-                  &c->disk_sb.set_uuid,
-                  sizeof(c->disk_sb.set_uuid)))
+                  &c->sb.uuid,
+                  sizeof(c->sb.uuid)))
                return -ENOENT;
 
        if (dc->disk.c) {
@@ -424,7 +432,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
                SET_CACHED_DEV(&dc->disk.inode.v, true);
                dc->disk.inode.v.i_uuid = dc->disk_sb.sb->disk_uuid;
                memcpy(dc->disk.inode.v.i_label,
-                      dc->disk_sb.sb->label, SB_LABEL_SIZE);
+                      dc->disk_sb.sb->label, BCH_SB_LABEL_SIZE);
                dc->disk.inode.v.i_ctime = rtime;
                dc->disk.inode.v.i_mtime = rtime;
 
@@ -438,14 +446,15 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
 
                pr_info("attached inode %llu", bcache_dev_inum(&dc->disk));
 
-               dc->disk_sb.sb->set_uuid = c->disk_sb.set_uuid;
+               dc->disk_sb.sb->set_uuid = c->sb.uuid;
                SET_BDEV_STATE(dc->disk_sb.sb, BDEV_STATE_CLEAN);
 
                bch_write_bdev_super(dc, &cl);
                closure_sync(&cl);
        } else {
                dc->disk.inode.v.i_mtime = rtime;
-               bch_inode_update(c, &dc->disk.inode.k_i, NULL);
+               bch_btree_update(c, BTREE_ID_INODES,
+                                &dc->disk.inode.k_i, NULL);
        }
 
        /* Count dirty sectors before attaching */
@@ -479,7 +488,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
 
        pr_info("Caching %s as %s on set %pU",
                bdevname(dc->disk_sb.bdev, buf), dc->disk.disk->disk_name,
-               dc->disk.c->disk_sb.set_uuid.b);
+               dc->disk.c->sb.uuid.b);
        return 0;
 }
 
@@ -517,7 +526,7 @@ static void cached_dev_free(struct closure *cl)
 
        mutex_unlock(&bch_register_lock);
 
-       free_super((void *) &dc->disk_sb);
+       bch_free_super((void *) &dc->disk_sb);
 
        kobject_put(&dc->disk.kobj);
 }
index 34880952ea41fef2f88cc4e09ad78b8fc38ee1f3..a88d8017b6edb8c53d975e44e802c2464ae26d8d 100644 (file)
@@ -59,7 +59,7 @@ void bch_dump_bset(struct btree *b, struct bset *i, unsigned set)
                return;
 
        for (_k = i->start, k = bkey_unpack_key(b, _k);
-            _k < bset_bkey_last(i);
+            _k < vstruct_last(i);
             _k = _n, k = n) {
                _n = bkey_next(_k);
 
@@ -67,7 +67,7 @@ void bch_dump_bset(struct btree *b, struct bset *i, unsigned set)
                printk(KERN_ERR "block %u key %zi/%u: %s\n", set,
                       _k->_data - i->_data, i->u64s, buf);
 
-               if (_n == bset_bkey_last(i))
+               if (_n == vstruct_last(i))
                        continue;
 
                n = bkey_unpack_key(b, _n);
index f03e6b868257b38d7830eccc38d332d40c3ca086..70868c51b0635e38e34a54aba0573639d11afe26 100644 (file)
@@ -9,6 +9,7 @@
 #include "bkey_methods.h"
 #include "btree_types.h"
 #include "util.h" /* for time_stats */
+#include "vstructs.h"
 
 /*
  * BKEYS:
@@ -302,15 +303,6 @@ static inline void btree_node_set_format(struct btree *b,
        bch_bset_set_no_aux_tree(b, b->set);
 }
 
-#define __set_bytes(_i, _u64s) (sizeof(*(_i)) + (_u64s) * sizeof(u64))
-#define set_bytes(_i)          __set_bytes(_i, (_i)->u64s)
-
-#define __set_blocks(_i, _u64s, _block_bytes)                          \
-       DIV_ROUND_UP((size_t) __set_bytes((_i), (_u64s)), (_block_bytes))
-
-#define set_blocks(_i, _block_bytes)                                   \
-       __set_blocks((_i), (_i)->u64s, (_block_bytes))
-
 static inline struct bset *bset_next_set(struct btree *b,
                                         unsigned block_bytes)
 {
@@ -318,7 +310,7 @@ static inline struct bset *bset_next_set(struct btree *b,
 
        EBUG_ON(!is_power_of_2(block_bytes));
 
-       return ((void *) i) + round_up(set_bytes(i), block_bytes);
+       return ((void *) i) + round_up(vstruct_bytes(i), block_bytes);
 }
 
 void bch_btree_keys_free(struct btree *);
@@ -387,11 +379,6 @@ static inline bool btree_iter_pos_cmp_p_or_unp(const struct btree *b,
                (cmp == 0 && !strictly_greater && !bkey_deleted(k));
 }
 
-static inline struct bkey_packed *bset_bkey_idx(struct bset *i, unsigned idx)
-{
-       return bkey_idx(i, idx);
-}
-
 struct bset_tree *bch_bkey_to_bset(struct btree *, struct bkey_packed *);
 struct bkey_packed *bkey_prev_all(struct btree *, struct bset_tree *,
                                  struct bkey_packed *);
index ca6064af27866b3bbca90d517c9185f8d9f1df6a..4d5efdbd3970dfe4faac855a16b739a6e11ddc14 100644 (file)
@@ -695,7 +695,7 @@ retry:
 
        EBUG_ON(!b->written);
        EBUG_ON(b->btree_id != iter->btree_id ||
-               BSET_BTREE_LEVEL(&b->data->keys) != level ||
+               BTREE_NODE_LEVEL(b->data) != level ||
                bkey_cmp(b->data->max_key, k->k.p));
 
        return b;
index 8417187561f4d9b8573cf57eec5d4ffb3656fdb1..5c77b267ad5df9787bb6286143a2a8d9c1b73449 100644 (file)
@@ -18,6 +18,7 @@
 #include "journal.h"
 #include "keylist.h"
 #include "move.h"
+#include "super-io.h"
 #include "writeback.h"
 
 #include <linux/slab.h>
@@ -118,8 +119,8 @@ u8 bch_btree_key_recalc_oldest_gen(struct cache_set *c, struct bkey_s_c k)
 /*
  * For runtime mark and sweep:
  */
-u8 __bch_btree_mark_key(struct cache_set *c, enum bkey_type type,
-                       struct bkey_s_c k)
+static u8 bch_btree_mark_key(struct cache_set *c, enum bkey_type type,
+                            struct bkey_s_c k)
 {
        switch (type) {
        case BKEY_TYPE_BTREE:
@@ -133,10 +134,14 @@ u8 __bch_btree_mark_key(struct cache_set *c, enum bkey_type type,
        }
 }
 
-static u8 btree_mark_key(struct cache_set *c, struct btree *b,
-                        struct bkey_s_c k)
+u8 bch_btree_mark_key_initial(struct cache_set *c, enum bkey_type type,
+                             struct bkey_s_c k)
 {
-       return __bch_btree_mark_key(c, btree_node_type(b), k);
+       atomic64_set(&c->key_version,
+                    max_t(u64, k.k->version.lo,
+                          atomic64_read(&c->key_version)));
+
+       return bch_btree_mark_key(c, type, k);
 }
 
 static bool btree_gc_mark_node(struct cache_set *c, struct btree *b)
@@ -151,7 +156,8 @@ static bool btree_gc_mark_node(struct cache_set *c, struct btree *b)
                                               btree_node_is_extents(b),
                                               &unpacked) {
                        bkey_debugcheck(c, b, k);
-                       stale = max(stale, btree_mark_key(c, b, k));
+                       stale = max(stale, bch_btree_mark_key(c,
+                                                       btree_node_type(b), k));
                }
 
                if (btree_gc_rewrite_disabled(c))
@@ -218,7 +224,7 @@ static int bch_gc_btree(struct cache_set *c, enum btree_id btree_id)
        mutex_lock(&c->btree_root_lock);
 
        b = c->btree_roots[btree_id].b;
-       __bch_btree_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key));
+       bch_btree_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key));
        gc_pos_set(c, gc_pos_btree_root(b->btree_id));
 
        mutex_unlock(&c->btree_root_lock);
@@ -265,22 +271,21 @@ static void bch_mark_allocator_buckets(struct cache_set *c)
 static void bch_mark_metadata(struct cache_set *c)
 {
        struct cache *ca;
-       unsigned i;
+       unsigned i, j;
+       u64 b;
 
        for_each_cache(ca, c, i) {
-               unsigned j;
-               u64 *i;
-
-               for (j = 0; j < bch_nr_journal_buckets(ca->disk_sb.sb); j++)
-                       bch_mark_metadata_bucket(ca,
-                               &ca->buckets[journal_bucket(ca->disk_sb.sb, j)],
-                               true);
+               for (j = 0; j < ca->journal.nr; j++) {
+                       b = ca->journal.buckets[j];
+                       bch_mark_metadata_bucket(ca, ca->buckets + b, true);
+               }
 
                spin_lock(&ca->prio_buckets_lock);
 
-               for (i = ca->prio_buckets;
-                    i < ca->prio_buckets + prio_buckets(ca) * 2; i++)
-                       bch_mark_metadata_bucket(ca, &ca->buckets[*i], true);
+               for (j = 0; j < prio_buckets(ca) * 2; j++) {
+                       b = ca->prio_buckets[j];
+                       bch_mark_metadata_bucket(ca, ca->buckets + b, true);
+               }
 
                spin_unlock(&ca->prio_buckets_lock);
        }
@@ -476,9 +481,8 @@ static void bch_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES],
 
        /* Check if all keys in @old_nodes could fit in one fewer node */
        if (nr_old_nodes <= 1 ||
-           __set_blocks(old_nodes[0]->data,
-                        DIV_ROUND_UP(u64s, nr_old_nodes - 1),
-                        block_bytes(c)) > blocks)
+           __vstruct_blocks(struct btree_node, c->block_bits,
+                            DIV_ROUND_UP(u64s, nr_old_nodes - 1)) > blocks)
                return;
 
        res = bch_btree_reserve_get(c, parent, nr_old_nodes,
@@ -542,9 +546,9 @@ static void bch_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES],
                u64s = 0;
 
                for (k = s2->start;
-                    k < bset_bkey_last(s2) &&
-                    __set_blocks(n1->data, le16_to_cpu(s1->u64s) + u64s + k->u64s,
-                                 block_bytes(c)) <= blocks;
+                    k < vstruct_last(s2) &&
+                    vstruct_blocks_plus(n1->data, c->block_bits,
+                                        u64s + k->u64s) <= blocks;
                     k = bkey_next(k)) {
                        last = k;
                        u64s += k->u64s;
@@ -554,7 +558,7 @@ static void bch_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES],
                        /* n2 fits entirely in n1 */
                        n1->key.k.p = n1->data->max_key = n2->data->max_key;
 
-                       memcpy_u64s(bset_bkey_last(s1),
+                       memcpy_u64s(vstruct_last(s1),
                                    s2->start,
                                    le16_to_cpu(s2->u64s));
                        le16_add_cpu(&s1->u64s, le16_to_cpu(s2->u64s));
@@ -578,12 +582,12 @@ static void bch_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES],
                                btree_type_successor(iter->btree_id,
                                                     n1->data->max_key);
 
-                       memcpy_u64s(bset_bkey_last(s1),
+                       memcpy_u64s(vstruct_last(s1),
                                    s2->start, u64s);
                        le16_add_cpu(&s1->u64s, u64s);
 
                        memmove(s2->start,
-                               bset_bkey_idx(s2, u64s),
+                               vstruct_idx(s2, u64s),
                                (le16_to_cpu(s2->u64s) - u64s) * sizeof(u64));
                        s2->u64s = cpu_to_le16(le16_to_cpu(s2->u64s) - u64s);
 
@@ -866,7 +870,7 @@ static void bch_initial_gc_btree(struct cache_set *c, enum btree_id id)
                        for_each_btree_node_key_unpack(b, k, &node_iter,
                                                       btree_node_is_extents(b),
                                                       &unpacked)
-                               btree_mark_key(c, b, k);
+                               bch_btree_mark_key_initial(c, btree_node_type(b), k);
                }
 
                bch_btree_iter_cond_resched(&iter);
@@ -874,8 +878,8 @@ static void bch_initial_gc_btree(struct cache_set *c, enum btree_id id)
 
        bch_btree_iter_unlock(&iter);
 
-       __bch_btree_mark_key(c, BKEY_TYPE_BTREE,
-                            bkey_i_to_s_c(&c->btree_roots[id].b->key));
+       bch_btree_mark_key(c, BKEY_TYPE_BTREE,
+                          bkey_i_to_s_c(&c->btree_roots[id].b->key));
 }
 
 int bch_initial_gc(struct cache_set *c, struct list_head *journal)
@@ -889,6 +893,13 @@ int bch_initial_gc(struct cache_set *c, struct list_head *journal)
                bch_journal_mark(c, journal);
        }
 
+       /*
+        * Skip past versions that may already have been used as nonces,
+        * but whose pointers were never written:
+        */
+       if (c->sb.encryption_type)
+               atomic64_add(1 << 16, &c->key_version);
+
        bch_mark_metadata(c);
 
        gc_pos_set(c, gc_phase(GC_PHASE_DONE));
index 91d31c05c4b91dd75b83c31de02f36339c5f0e67..0607187f6081dfe457b2d6fa9e816bc7a3e67671 100644 (file)
@@ -11,7 +11,7 @@ void bch_gc_thread_stop(struct cache_set *);
 int bch_gc_thread_start(struct cache_set *);
 int bch_initial_gc(struct cache_set *, struct list_head *);
 u8 bch_btree_key_recalc_oldest_gen(struct cache_set *, struct bkey_s_c);
-u8 __bch_btree_mark_key(struct cache_set *, enum bkey_type,
+u8 bch_btree_mark_key_initial(struct cache_set *, enum bkey_type,
                                struct bkey_s_c);
 
 /*
index 4c295af1803e80e5971f5a1ae1abdbb605b03927..e772c6adf389f5b4e6b61ddf4cde250051c24b56 100644 (file)
@@ -13,6 +13,7 @@
 #include "extents.h"
 #include "io.h"
 #include "journal.h"
+#include "super-io.h"
 
 #include <trace/events/bcache.h>
 
@@ -39,7 +40,7 @@ static void clear_needs_whiteout(struct bset *i)
 {
        struct bkey_packed *k;
 
-       for (k = i->start; k != bset_bkey_last(i); k = bkey_next(k))
+       for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
                k->needs_whiteout = false;
 }
 
@@ -47,7 +48,7 @@ static void set_needs_whiteout(struct bset *i)
 {
        struct bkey_packed *k;
 
-       for (k = i->start; k != bset_bkey_last(i); k = bkey_next(k))
+       for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
                k->needs_whiteout = true;
 }
 
@@ -341,7 +342,7 @@ bool __bch_compact_whiteouts(struct cache_set *c, struct btree *b,
                compacting = true;
                u_start = u_pos;
                start = i->start;
-               end = bset_bkey_last(i);
+               end = vstruct_last(i);
 
                if (src != dst) {
                        memmove(dst, src, sizeof(*src));
@@ -574,7 +575,7 @@ static void btree_node_sort(struct cache_set *c, struct btree *b,
 
        order = sorting_entire_node
                ? btree_page_order(c)
-               : get_order(__set_bytes(b->data, u64s));
+               : get_order(__vstruct_bytes(struct btree_node, u64s));
 
        out = btree_bounce_alloc(c, order, &used_mempool);
 
@@ -589,8 +590,7 @@ static void btree_node_sort(struct cache_set *c, struct btree *b,
 
        out->keys.u64s = cpu_to_le16(u64s);
 
-       BUG_ON((void *) bset_bkey_last(&out->keys) >
-              (void *) out + (PAGE_SIZE << order));
+       BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order));
 
        if (sorting_entire_node)
                bch_time_stats_update(&c->btree_sort_time, start_time);
@@ -654,7 +654,7 @@ static struct btree_nr_keys sort_repack(struct bset *dst,
                                        bool filter_whiteouts)
 {
        struct bkey_format *in_f = &src->format;
-       struct bkey_packed *in, *out = bset_bkey_last(dst);
+       struct bkey_packed *in, *out = vstruct_last(dst);
        struct btree_nr_keys nr;
 
        memset(&nr, 0, sizeof(nr));
@@ -723,7 +723,7 @@ static struct btree_nr_keys sort_repack_merge(struct cache_set *c,
                        btree_keys_account_key_add(&nr, 0, prev);
                        prev = bkey_next(prev);
                } else {
-                       prev = bset_bkey_last(dst);
+                       prev = vstruct_last(dst);
                }
 
                bkey_copy(prev, &tmp.k);
@@ -734,7 +734,7 @@ static struct btree_nr_keys sort_repack_merge(struct cache_set *c,
                btree_keys_account_key_add(&nr, 0, prev);
                out = bkey_next(prev);
        } else {
-               out = bset_bkey_last(dst);
+               out = vstruct_last(dst);
        }
 
        dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
@@ -854,22 +854,23 @@ void bch_btree_init_next(struct cache_set *c, struct btree *b,
                bch_btree_iter_reinit_node(iter, b);
 }
 
-/*
- * We seed the checksum with the entire first pointer (dev, gen and offset),
- * since for btree nodes we have to store the checksum with the data instead of
- * the pointer - this helps guard against reading a valid btree node that is not
- * the node we actually wanted:
- */
-#define btree_csum_set(_b, _i)                                         \
-({                                                                     \
-       void *_data = (void *) (_i) + 8;                                \
-       void *_end = bset_bkey_last(&(_i)->keys);                       \
-                                                                       \
-       bch_checksum_update(BSET_CSUM_TYPE(&(_i)->keys),                \
-                           bkey_i_to_extent_c(&(_b)->key)->v._data[0], \
-                           _data,                                      \
-                           _end - _data) ^ 0xffffffffffffffffULL;      \
-})
+static struct nonce btree_nonce(struct btree *b,
+                               struct bset *i,
+                               unsigned offset)
+{
+       return (struct nonce) {{
+               [0] = cpu_to_le32(offset),
+               [1] = ((__le32 *) &i->seq)[0],
+               [2] = ((__le32 *) &i->seq)[1],
+               [3] = ((__le32 *) &i->journal_seq)[0] ^ BCH_NONCE_BTREE,
+       }};
+}
+
+static void bset_encrypt(struct cache_set *c, struct bset *i, struct nonce nonce)
+{
+       bch_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
+                   vstruct_end(i) - (void *) i->_data);
+}
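
btree_nonce() packs the bset's byte offset within the node, its 64-bit seq, and the low half of journal_seq into ChaCha20's 128-bit IV, xoring in BCH_NONCE_BTREE so that btree, journal and data keystreams stay domain-separated even when the counters coincide. A standalone sketch of that layout (endianness conversions omitted; the constant's value here is made up):

    #include <stdint.h>
    #include <string.h>

    struct nonce { uint32_t d[4]; };        /* 16-byte ChaCha20 IV */

    #define BCH_NONCE_BTREE  0x10000000u    /* hypothetical value */

    static struct nonce btree_nonce_sketch(uint32_t offset, uint64_t seq,
                                           uint64_t journal_seq)
    {
            struct nonce n;

            n.d[0] = offset;                        /* where the bset lives */
            memcpy(&n.d[1], &seq, sizeof(seq));     /* d[1], d[2]: bset seq */
            n.d[3] = (uint32_t) journal_seq ^ BCH_NONCE_BTREE;
            return n;
    }

    int main(void)
    {
            struct nonce n = btree_nonce_sketch(4096, 1, 2);

            return n.d[0] == 4096 ? 0 : 1;
    }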
 
 #define btree_node_error(b, c, ptr, fmt, ...)                          \
        cache_set_inconsistent(c,                                       \
@@ -877,7 +878,7 @@ void bch_btree_init_next(struct cache_set *c, struct btree *b,
                (b)->btree_id, (b)->level, btree_node_root(c, b)        \
                            ? btree_node_root(c, b)->level : -1,        \
                PTR_BUCKET_NR(ca, ptr), (b)->written,                   \
-               (i)->u64s, ##__VA_ARGS__)
+               le16_to_cpu((i)->u64s), ##__VA_ARGS__)
 
 static const char *validate_bset(struct cache_set *c, struct btree *b,
                                 struct cache *ca,
@@ -886,6 +887,7 @@ static const char *validate_bset(struct cache_set *c, struct btree *b,
                                 unsigned *whiteout_u64s)
 {
        struct bkey_packed *k, *prev = NULL;
+       struct bpos prev_pos = POS_MIN;
        bool seen_non_whiteout = false;
 
        if (le16_to_cpu(i->version) != BCACHE_BSET_VERSION)
@@ -903,7 +905,7 @@ static const char *validate_bset(struct cache_set *c, struct btree *b,
        }
 
        for (k = i->start;
-            k != bset_bkey_last(i);) {
+            k != vstruct_last(i);) {
                struct bkey_s_c u;
                struct bkey tmp;
                const char *invalid;
@@ -911,13 +913,13 @@ static const char *validate_bset(struct cache_set *c, struct btree *b,
                if (!k->u64s) {
                        btree_node_error(b, c, ptr,
                                "KEY_U64s 0: %zu bytes of metadata lost",
-                               (void *) bset_bkey_last(i) - (void *) k);
+                               vstruct_end(i) - (void *) k);
 
                        i->u64s = cpu_to_le16((u64 *) k - i->_data);
                        break;
                }
 
-               if (bkey_next(k) > bset_bkey_last(i)) {
+               if (bkey_next(k) > vstruct_last(i)) {
                        btree_node_error(b, c, ptr,
                                         "key extends past end of bset");
 
@@ -931,7 +933,7 @@ static const char *validate_bset(struct cache_set *c, struct btree *b,
 
                        i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
                        memmove_u64s_down(k, bkey_next(k),
-                                         (u64 *) bset_bkey_last(i) - (u64 *) k);
+                                         (u64 *) vstruct_end(i) - (u64 *) k);
                        continue;
                }
 
@@ -951,7 +953,7 @@ static const char *validate_bset(struct cache_set *c, struct btree *b,
 
                        i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
                        memmove_u64s_down(k, bkey_next(k),
-                                         (u64 *) bset_bkey_last(i) - (u64 *) k);
+                                         (u64 *) vstruct_end(i) - (u64 *) k);
                        continue;
                }
 
@@ -963,22 +965,40 @@ static const char *validate_bset(struct cache_set *c, struct btree *b,
 
                if (!seen_non_whiteout &&
                    (!bkey_whiteout(k) ||
-                    (prev && bkey_cmp_left_packed_byval(b, prev,
-                                       bkey_start_pos(u.k)) > 0))) {
+                    (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0))) {
                        *whiteout_u64s = k->_data - i->_data;
                        seen_non_whiteout = true;
+               } else if (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0) {
+                       btree_node_error(b, c, ptr,
+                                        "keys out of order: %llu:%llu > %llu:%llu",
+                                        prev_pos.inode,
+                                        prev_pos.offset,
+                                        u.k->p.inode,
+                                        bkey_start_offset(u.k));
+                       /* XXX: repair this */
                }
 
+               prev_pos = u.k->p;
                prev = k;
                k = bkey_next(k);
        }
 
        SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
-
-       b->written += sectors;
        return NULL;
 }
 
+static bool extent_contains_ptr(struct bkey_s_c_extent e,
+                               struct bch_extent_ptr match)
+{
+       const struct bch_extent_ptr *ptr;
+
+       extent_for_each_ptr(e, ptr)
+               if (!memcmp(ptr, &match, sizeof(*ptr)))
+                       return true;
+
+       return false;
+}
+
 void bch_btree_node_read_done(struct cache_set *c, struct btree *b,
                              struct cache *ca,
                              const struct bch_extent_ptr *ptr)
@@ -990,6 +1010,8 @@ void bch_btree_node_read_done(struct cache_set *c, struct btree *b,
        bool used_mempool;
        unsigned u64s;
        const char *err;
+       struct bch_csum csum;
+       struct nonce nonce;
        int ret;
 
        iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
@@ -1005,40 +1027,62 @@ void bch_btree_node_read_done(struct cache_set *c, struct btree *b,
                if (!b->written) {
                        i = &b->data->keys;
 
+                       err = "bad magic";
+                       if (le64_to_cpu(b->data->magic) != bset_magic(c))
+                               goto err;
+
+                       err = "bad btree header";
+                       if (!b->data->keys.seq)
+                               goto err;
+
                        err = "unknown checksum type";
-                       if (BSET_CSUM_TYPE(i) >= BCH_CSUM_NR)
+                       if (!bch_checksum_type_valid(c, BSET_CSUM_TYPE(i)))
                                goto err;
 
                        /* XXX: retry checksum errors */
 
+                       nonce = btree_nonce(b, i, b->written << 9);
+                       csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
+
                        err = "bad checksum";
-                       if (le64_to_cpu(b->data->csum) !=
-                           btree_csum_set(b, b->data))
+                       if (bch_crc_cmp(csum, b->data->csum))
                                goto err;
 
-                       sectors = __set_blocks(b->data,
-                                              le16_to_cpu(b->data->keys.u64s),
-                                              block_bytes(c)) << c->block_bits;
+                       bch_encrypt(c, BSET_CSUM_TYPE(i), nonce,
+                                   &b->data->flags,
+                                   (void *) &b->data->keys -
+                                   (void *) &b->data->flags);
+                       nonce = nonce_add(nonce,
+                                         round_up((void *) &b->data->keys -
+                                                  (void *) &b->data->flags,
+                                                  CHACHA20_BLOCK_SIZE));
+                       bset_encrypt(c, i, nonce);
 
-                       err = "bad magic";
-                       if (le64_to_cpu(b->data->magic) != bset_magic(&c->disk_sb))
-                               goto err;
-
-                       err = "bad btree header";
-                       if (!b->data->keys.seq)
-                               goto err;
+                       sectors = vstruct_sectors(b->data, c->block_bits);
 
                        if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) {
+                               u64 *p = (u64 *) &b->data->ptr;
+
+                               *p = swab64(*p);
                                bch_bpos_swab(&b->data->min_key);
                                bch_bpos_swab(&b->data->max_key);
                        }
 
+                       err = "incorrect btree id";
+                       if (BTREE_NODE_ID(b->data) != b->btree_id)
+                               goto err;
+
+                       err = "incorrect level";
+                       if (BTREE_NODE_LEVEL(b->data) != b->level)
+                               goto err;
+
                        err = "incorrect max key";
                        if (bkey_cmp(b->data->max_key, b->key.k.p))
                                goto err;
 
-                       err = "incorrect level";
-                       if (BSET_BTREE_LEVEL(i) != b->level)
+                       err = "incorrect backpointer";
+                       if (!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key),
+                                                b->data->ptr))
                                goto err;
 
                        err = bch_bkey_format_validate(&b->data->format);
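
The read path here decrypts in the same two stages the write path (further down) encrypts in: first the header region from flags up to keys, then the bset itself, with the nonce advanced past the first region rounded up to CHACHA20_BLOCK_SIZE. ChaCha20 is a stream cipher, so decryption is the same keystream xor and bch_encrypt() serves both directions. nonce_add() presumably steps the counter word by whole 64-byte keystream blocks, along these lines (struct nonce as in the earlier sketch):

    /* assumed semantics of nonce_add(): skip whole keystream blocks so
     * the second encrypt call never reuses keystream from the first;
     * bytes must already be CHACHA20_BLOCK_SIZE (64) aligned */
    static struct nonce nonce_add(struct nonce nonce, unsigned bytes)
    {
            nonce.d[0] += bytes / CHACHA20_BLOCK_SIZE;
            return nonce;
    }
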
@@ -1056,23 +1100,27 @@ void bch_btree_node_read_done(struct cache_set *c, struct btree *b,
                                break;
 
                        err = "unknown checksum type";
-                       if (BSET_CSUM_TYPE(i) >= BCH_CSUM_NR)
+                       if (!bch_checksum_type_valid(c, BSET_CSUM_TYPE(i)))
                                goto err;
 
+                       nonce = btree_nonce(b, i, b->written << 9);
+                       csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
+
                        err = "bad checksum";
-                       if (le64_to_cpu(bne->csum) !=
-                           btree_csum_set(b, bne))
+                       if (bch_crc_cmp(csum, bne->csum))
                                goto err;
 
-                       sectors = __set_blocks(bne,
-                                              le16_to_cpu(bne->keys.u64s),
-                                              block_bytes(c)) << c->block_bits;
+                       bset_encrypt(c, i, nonce);
+
+                       sectors = vstruct_sectors(bne, c->block_bits);
                }
 
                err = validate_bset(c, b, ca, ptr, i, sectors, &whiteout_u64s);
                if (err)
                        goto err;
 
+               b->written += sectors;
+
                err = "insufficient memory";
                ret = bch_journal_seq_should_ignore(c, le64_to_cpu(i->journal_seq), b);
                if (ret < 0)
@@ -1083,11 +1131,11 @@ void bch_btree_node_read_done(struct cache_set *c, struct btree *b,
 
                __bch_btree_node_iter_push(iter, b,
                                           i->start,
-                                          bkey_idx(i, whiteout_u64s));
+                                          vstruct_idx(i, whiteout_u64s));
 
                __bch_btree_node_iter_push(iter, b,
-                                          bkey_idx(i, whiteout_u64s),
-                                          bset_bkey_last(i));
+                                          vstruct_idx(i, whiteout_u64s),
+                                          vstruct_last(i));
        }
 
        err = "corrupted btree";
@@ -1290,6 +1338,7 @@ void __bch_btree_node_write(struct cache_set *c, struct btree *b,
        struct bch_extent_ptr *ptr;
        struct cache *ca;
        struct sort_iter sort_iter;
+       struct nonce nonce;
        unsigned bytes_to_write, sectors_to_write, order, bytes, u64s;
        u64 seq = 0;
        bool used_mempool;
@@ -1330,7 +1379,7 @@ void __bch_btree_node_write(struct cache_set *c, struct btree *b,
 
        BUG_ON(b->written >= c->sb.btree_node_size);
        BUG_ON(bset_written(b, btree_bset_last(b)));
-       BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(&c->disk_sb));
+       BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
        BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));
 
        if (lock_type_held == SIX_LOCK_intent) {
@@ -1396,7 +1445,7 @@ void __bch_btree_node_write(struct cache_set *c, struct btree *b,
        b->whiteout_u64s = 0;
 
        u64s = btree_node_is_extents(b)
-               ? sort_extents(bset_bkey_last(i), &sort_iter, false)
+               ? sort_extents(vstruct_last(i), &sort_iter, false)
                : sort_keys(i->start, &sort_iter, false);
        le16_add_cpu(&i->u64s, u64s);
 
@@ -1413,14 +1462,30 @@ void __bch_btree_node_write(struct cache_set *c, struct btree *b,
        BUG_ON(i->seq != b->data->keys.seq);
 
        i->version = cpu_to_le16(BCACHE_BSET_VERSION);
-       SET_BSET_CSUM_TYPE(i, c->opts.metadata_checksum);
+       SET_BSET_CSUM_TYPE(i, bch_meta_checksum_type(c));
+
+       nonce = btree_nonce(b, i, b->written << 9);
+
+       if (bn) {
+               bch_encrypt(c, BSET_CSUM_TYPE(i), nonce,
+                           &bn->flags,
+                           (void *) &b->data->keys -
+                           (void *) &b->data->flags);
+               nonce = nonce_add(nonce,
+                                 round_up((void *) &b->data->keys -
+                                          (void *) &b->data->flags,
+                                          CHACHA20_BLOCK_SIZE));
+               bset_encrypt(c, i, nonce);
+
+               nonce = btree_nonce(b, i, b->written << 9);
+               bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn);
+       } else {
+               bset_encrypt(c, i, nonce);
 
-       if (bn)
-               bn->csum = cpu_to_le64(btree_csum_set(b, bn));
-       else
-               bne->csum = cpu_to_le64(btree_csum_set(b, bne));
+               bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
+       }
 
-       bytes_to_write = (void *) bset_bkey_last(i) - data;
+       bytes_to_write = vstruct_end(i) - data;
        sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
 
        memset(data + bytes_to_write, 0,
@@ -1548,7 +1613,7 @@ bool bch_btree_post_write_cleanup(struct cache_set *c, struct btree *b)
         * If later we don't unconditionally sort down to a single bset, we have
         * to ensure this is still true:
         */
-       BUG_ON((void *) bset_bkey_last(btree_bset_last(b)) > write_block(b));
+       BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b));
 
        bne = want_new_bset(c, b);
        if (bne)
index 176d42a7a434878116eebbc354a1d0ac3c6c798c..4cbec7fe97776014a70ed1e8f1908aa563c00e8d 100644 (file)
@@ -202,24 +202,12 @@ __btree_node_offset_to_key(const struct btree *b, u16 k)
        return (void *) ((u64 *) b->data + k + 1);
 }
 
-#define __bkey_idx(_set, _offset)                              \
-       ((_set)->_data + (_offset))
-
-#define bkey_idx(_set, _offset)                                        \
-       ((typeof(&(_set)->start[0])) __bkey_idx((_set), (_offset)))
-
-#define __bset_bkey_last(_set)                                 \
-        __bkey_idx((_set), (_set)->u64s)
-
-#define bset_bkey_last(_set)                                   \
-        bkey_idx((_set), le16_to_cpu((_set)->u64s))
-
 #define btree_bkey_first(_b, _t)       (bset(_b, _t)->start)
 
 #define btree_bkey_last(_b, _t)                                                \
 ({                                                                     \
        EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) !=     \
-               bset_bkey_last(bset(_b, _t)));                          \
+               vstruct_last(bset(_b, _t)));                            \
                                                                        \
        __btree_node_offset_to_key(_b, (_t)->end_offset);               \
 })
@@ -227,7 +215,7 @@ __btree_node_offset_to_key(const struct btree *b, u16 k)
 static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t)
 {
        t->end_offset =
-               __btree_node_key_to_offset(b, bset_bkey_last(bset(b, t)));
+               __btree_node_key_to_offset(b, vstruct_last(bset(b, t)));
        btree_bkey_last(b, t);
 }
 
index 95406a4443936024e0c620827ca2ac4f58b67f47..c3bb2092a551078f156089c99115ec02258c13c5 100644 (file)
@@ -12,7 +12,7 @@
 #include "extents.h"
 #include "journal.h"
 #include "keylist.h"
-#include "super.h"
+#include "super-io.h"
 
 #include <linux/random.h>
 #include <linux/sort.h>
@@ -80,7 +80,7 @@ bool bch_btree_node_format_fits(struct cache_set *c, struct btree *b,
 {
        size_t u64s = btree_node_u64s_with_format(b, new_f);
 
-       return __set_bytes(b->data, u64s) < btree_bytes(c);
+       return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c);
 }
 
 /* Btree node freeing/allocation: */
@@ -298,8 +298,11 @@ static struct btree *bch_btree_node_alloc(struct cache_set *c,
 
        bch_bset_init_first(b, &b->data->keys);
        memset(&b->nr, 0, sizeof(b->nr));
-       b->data->magic = cpu_to_le64(bset_magic(&c->disk_sb));
-       SET_BSET_BTREE_LEVEL(&b->data->keys, level);
+       b->data->magic = cpu_to_le64(bset_magic(c));
+       b->data->flags = 0;
+       SET_BTREE_NODE_ID(b->data, id);
+       SET_BTREE_NODE_LEVEL(b->data, level);
+       b->data->ptr = bkey_i_to_extent(&b->key)->v.start->ptr;
 
        bch_btree_build_aux_trees(b);
 
@@ -1292,7 +1295,7 @@ static struct btree *__btree_split_node(struct btree_iter *iter, struct btree *n
         */
        k = set1->start;
        while (1) {
-               if (bkey_next(k) == bset_bkey_last(set1))
+               if (bkey_next(k) == vstruct_last(set1))
                        break;
                if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5)
                        break;
@@ -1313,7 +1316,7 @@ static struct btree *__btree_split_node(struct btree_iter *iter, struct btree *n
        n2->data->min_key =
                btree_type_successor(n1->btree_id, n1->key.k.p);
 
-       set2->u64s = cpu_to_le16((u64 *) bset_bkey_last(set1) - (u64 *) k);
+       set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k);
        set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s));
 
        set_btree_bset_end(n1, n1->set);
@@ -1333,7 +1336,7 @@ static struct btree *__btree_split_node(struct btree_iter *iter, struct btree *n
        BUG_ON(!set2->u64s);
 
        memcpy_u64s(set2->start,
-                   bset_bkey_last(set1),
+                   vstruct_end(set1),
                    le16_to_cpu(set2->u64s));
 
        btree_node_reset_sib_u64s(n1);
@@ -1393,12 +1396,12 @@ static void btree_split_insert_keys(struct btree_iter *iter, struct btree *b,
         */
        i = btree_bset_first(b);
        p = i->start;
-       while (p != bset_bkey_last(i))
+       while (p != vstruct_last(i))
                if (bkey_deleted(p)) {
                        le16_add_cpu(&i->u64s, -p->u64s);
                        set_btree_bset_end(b, b->set);
                        memmove_u64s_down(p, bkey_next(p),
-                                         (u64 *) bset_bkey_last(i) -
+                                         (u64 *) vstruct_last(i) -
                                          (u64 *) p);
                } else
                        p = bkey_next(p);
@@ -1428,9 +1431,7 @@ static void btree_split(struct btree *b, struct btree_iter *iter,
        if (b->level)
                btree_split_insert_keys(iter, n1, insert_keys, reserve);
 
-       if (__set_blocks(n1->data,
-                        le16_to_cpu(n1->data->keys.u64s),
-                        block_bytes(c)) > BTREE_SPLIT_THRESHOLD(c)) {
+       if (vstruct_blocks(n1->data, c->block_bits) > BTREE_SPLIT_THRESHOLD(c)) {
                trace_bcache_btree_node_split(c, b, b->nr.live_u64s);
 
                n2 = __btree_split_node(iter, n1, reserve);
@@ -1939,7 +1940,7 @@ retry:
        u64s = 0;
        trans_for_each_entry(trans, i)
                if (!i->done)
-                       u64s += jset_u64s(i->k->k.u64s);
+                       u64s += jset_u64s(i->k->k.u64s + i->extra_res);
 
        memset(&trans->journal_res, 0, sizeof(trans->journal_res));
 
@@ -1966,7 +1967,7 @@ retry:
                 * written one
                 */
                if (!i->done) {
-                       u64s += i->k->k.u64s;
+                       u64s += i->k->k.u64s + i->extra_res;
                        if (!bch_btree_node_insert_fits(c,
                                        i->iter->nodes[0], u64s)) {
                                split = i->iter;
@@ -2217,7 +2218,7 @@ int bch_btree_update(struct cache_set *c, enum btree_id id,
 int bch_btree_delete_range(struct cache_set *c, enum btree_id id,
                           struct bpos start,
                           struct bpos end,
-                          u64 version,
+                          struct bversion version,
                           struct disk_reservation *disk_res,
                           struct extent_insert_hook *hook,
                           u64 *journal_seq)
index 5fc1b1aaf069b82360c00306a5ecaae0f12769d9..8ff089da9d542fddd3eb97e512d49d2c6f90a100 100644 (file)
@@ -5,6 +5,7 @@
 #include "btree_iter.h"
 #include "buckets.h"
 #include "journal.h"
+#include "vstructs.h"
 
 struct cache_set;
 struct bkey_format_state;
@@ -200,7 +201,7 @@ static inline bool bset_unwritten(struct btree *b, struct bset *i)
 static inline unsigned bset_end_sector(struct cache_set *c, struct btree *b,
                                       struct bset *i)
 {
-       return round_up(bset_byte_offset(b, bset_bkey_last(i)),
+       return round_up(bset_byte_offset(b, vstruct_end(i)),
                        block_bytes(c)) >> 9;
 }
 
@@ -208,7 +209,7 @@ static inline size_t bch_btree_keys_u64s_remaining(struct cache_set *c,
                                                   struct btree *b)
 {
        struct bset *i = btree_bset_last(b);
-       unsigned used = bset_byte_offset(b, bset_bkey_last(i)) / sizeof(u64) +
+       unsigned used = bset_byte_offset(b, vstruct_end(i)) / sizeof(u64) +
                b->whiteout_u64s +
                b->uncompacted_whiteout_u64s;
        unsigned total = c->sb.btree_node_size << 6;
@@ -235,7 +236,7 @@ static inline struct btree_node_entry *want_new_bset(struct cache_set *c,
 {
        struct bset *i = btree_bset_last(b);
        unsigned offset = max_t(unsigned, b->written << 9,
-                               bset_byte_offset(b, bset_bkey_last(i)));
+                               bset_byte_offset(b, vstruct_end(i)));
        ssize_t n = (ssize_t) btree_bytes(c) - (ssize_t)
                (offset + sizeof(struct btree_node_entry) +
                 b->whiteout_u64s * sizeof(u64) +
@@ -244,8 +245,8 @@ static inline struct btree_node_entry *want_new_bset(struct cache_set *c,
        EBUG_ON(offset > btree_bytes(c));
 
        if ((unlikely(bset_written(b, i)) && n > 0) ||
-           (unlikely(__set_bytes(i, le16_to_cpu(i->u64s)) >
-                     btree_write_set_buffer(b)) && n > btree_write_set_buffer(b)))
+           (unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) &&
+            n > btree_write_set_buffer(b)))
                return (void *) b->data + offset;
 
        return NULL;
@@ -308,6 +309,7 @@ struct btree_insert {
        struct btree_insert_entry {
                struct btree_iter *iter;
                struct bkey_i   *k;
+               unsigned        extra_res;
                /*
                 * true if entire key was inserted - can only be false for
                 * extents
@@ -329,6 +331,14 @@ int __bch_btree_insert_at(struct btree_insert *);
                .done           = false,                                \
        })
 
+#define BTREE_INSERT_ENTRY_EXTRA_RES(_iter, _k, _extra)                        \
+       ((struct btree_insert_entry) {                                  \
+               .iter           = (_iter),                              \
+               .k              = (_k),                                 \
+               .extra_res      = (_extra),                             \
+               .done           = false,                                \
+       })
+
 /**
  * bch_btree_insert_at - insert one or more keys at iterator positions
  * @iter:              btree iterator
@@ -391,7 +401,7 @@ static inline bool journal_res_insert_fits(struct btree_insert *trans,
                return true;
 
        for (i = insert; i < trans->entries + trans->nr; i++)
-               u64s += jset_u64s(i->k->k.u64s);
+               u64s += jset_u64s(i->k->k.u64s + i->extra_res);
 
        return u64s <= trans->journal_res.u64s;
 }
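
extra_res pads an entry's journal reservation beyond the key's current size, for call sites that know the key may grow before it is journalled. A hypothetical construction (the pad of 8 u64s is made up):

    /* hypothetical call site: pad the journal reservation by 8 u64s in
     * case the key grows before it is written to the journal */
    struct btree_insert_entry entry =
            BTREE_INSERT_ENTRY_EXTRA_RES(&iter, &new_key->k_i, 8);
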
@@ -404,7 +414,7 @@ int bch_btree_update(struct cache_set *, enum btree_id,
                     struct bkey_i *, u64 *);
 
 int bch_btree_delete_range(struct cache_set *, enum btree_id,
-                          struct bpos, struct bpos, u64,
+                          struct bpos, struct bpos, struct bversion,
                           struct disk_reservation *,
                           struct extent_insert_hook *, u64 *);
 
index 3398b255053a281d9939cd6ff81649db417f8177..757bc0355d733f1e6e8b5d04e652dd1950944b33 100644 (file)
@@ -534,12 +534,10 @@ static void bch_mark_extent(struct cache_set *c, struct bkey_s_c_extent e,
 
        rcu_read_lock();
        extent_for_each_online_device_crc(c, e, crc, ptr, ca) {
-               bool dirty = bch_extent_ptr_is_dirty(c, e, ptr);
-
-               trace_bcache_mark_bucket(ca, e.k, ptr, sectors, dirty);
+               trace_bcache_mark_bucket(ca, e.k, ptr, sectors, !ptr->cached);
 
                bch_mark_pointer(c, e, ca, crc, ptr, sectors,
-                                dirty ? type : S_CACHED,
+                                ptr->cached ? S_CACHED : type,
                                 may_make_unavailable,
                                 stats, gc_will_visit, journal_seq);
        }
@@ -559,10 +557,13 @@ static void __bch_mark_key(struct cache_set *c, struct bkey_s_c k,
                                may_make_unavailable, stats,
                                gc_will_visit, journal_seq);
                break;
-       case BCH_RESERVATION:
-               stats->persistent_reserved += sectors;
+       case BCH_RESERVATION: {
+               struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
+
+               stats->persistent_reserved += r.v->nr_replicas * sectors;
                break;
        }
+       }
 }
 
 void __bch_gc_mark_key(struct cache_set *c, struct bkey_s_c k,
index 35100eba351e9381f23bbd367025e7ab9891cffc..8194dd9b6baf8101a1a066c89677ee65eaf4d93f 100644 (file)
@@ -42,7 +42,7 @@ static inline u8 bucket_gc_gen(struct cache *ca, struct bucket *g)
 static inline struct cache *PTR_CACHE(const struct cache_set *c,
                                      const struct bch_extent_ptr *ptr)
 {
-       EBUG_ON(ptr->dev > rcu_dereference(c->members)->nr_in_set);
+       EBUG_ON(ptr->dev > rcu_dereference(c->members)->nr_devices);
 
        return rcu_dereference(c->cache[ptr->dev]);
 }
index 0b020c84a050ba2e2cc02d3431871e2f2ffd3a53..b361b0928416afebd32399952c6008856108165d 100644 (file)
@@ -9,6 +9,7 @@
 
 #include "bcache.h"
 #include "super.h"
+#include "super-io.h"
 
 #include <linux/module.h>
 #include <linux/fs.h>
@@ -202,16 +203,16 @@ static long bch_ioctl_disk_fail(struct cache_set *c,
        return ret;
 }
 
-static struct cache_member *bch_uuid_lookup(struct cache_set *c, uuid_le uuid)
+static struct bch_member *bch_uuid_lookup(struct cache_set *c, uuid_le uuid)
 {
-       struct cache_member *mi = c->disk_mi;
+       struct bch_sb_field_members *mi = bch_sb_get_members(c->disk_sb);
        unsigned i;
 
-       lockdep_assert_held(&bch_register_lock);
+       lockdep_assert_held(&c->sb_lock);
 
-       for (i = 0; i < c->disk_sb.nr_in_set; i++)
-               if (!memcmp(&mi[i].uuid, &uuid, sizeof(uuid)))
-                       return &mi[i];
+       for (i = 0; i < c->disk_sb->nr_devices; i++)
+               if (!memcmp(&mi->members[i].uuid, &uuid, sizeof(uuid)))
+                       return &mi->members[i];
 
        return NULL;
 }
@@ -220,20 +221,20 @@ static long bch_ioctl_disk_remove_by_uuid(struct cache_set *c,
                        struct bch_ioctl_disk_remove_by_uuid __user *user_arg)
 {
        struct bch_ioctl_disk_fail_by_uuid arg;
-       struct cache_member *m;
+       struct bch_member *m;
        int ret = -ENOENT;
 
        if (copy_from_user(&arg, user_arg, sizeof(arg)))
                return -EFAULT;
 
-       mutex_lock(&bch_register_lock);
+       mutex_lock(&c->sb_lock);
        if ((m = bch_uuid_lookup(c, arg.dev))) {
                /* XXX: */
-               SET_CACHE_STATE(m, CACHE_FAILED);
-               bcache_write_super(c);
+               SET_BCH_MEMBER_STATE(m, BCH_MEMBER_STATE_FAILED);
+               bch_write_super(c);
                ret = 0;
        }
-       mutex_unlock(&bch_register_lock);
+       mutex_unlock(&c->sb_lock);
 
        return ret;
 }
@@ -242,19 +243,19 @@ static long bch_ioctl_disk_fail_by_uuid(struct cache_set *c,
                        struct bch_ioctl_disk_fail_by_uuid __user *user_arg)
 {
        struct bch_ioctl_disk_fail_by_uuid arg;
-       struct cache_member *m;
+       struct bch_member *m;
        int ret = -ENOENT;
 
        if (copy_from_user(&arg, user_arg, sizeof(arg)))
                return -EFAULT;
 
-       mutex_lock(&bch_register_lock);
+       mutex_lock(&c->sb_lock);
        if ((m = bch_uuid_lookup(c, arg.dev))) {
-               SET_CACHE_STATE(m, CACHE_FAILED);
-               bcache_write_super(c);
+               SET_BCH_MEMBER_STATE(m, BCH_MEMBER_STATE_FAILED);
+               bch_write_super(c);
                ret = 0;
        }
-       mutex_unlock(&bch_register_lock);
+       mutex_unlock(&c->sb_lock);
 
        return ret;
 }
@@ -263,8 +264,8 @@ static long bch_ioctl_query_uuid(struct cache_set *c,
                        struct bch_ioctl_query_uuid __user *user_arg)
 {
        return copy_to_user(&user_arg->uuid,
-                           &c->disk_sb.user_uuid,
-                           sizeof(c->disk_sb.user_uuid));
+                           &c->sb.user_uuid,
+                           sizeof(c->sb.user_uuid));
 }
 
 long bch_cache_set_ioctl(struct cache_set *c, unsigned cmd, void __user *arg)
index beae0b26e5702c6fe179265e06800f4d7f0c23fd..eb41f2eaaee46e3cd8d3fac65d4abb9873634002 100644 (file)
@@ -1,11 +1,19 @@
 
 #include "bcache.h"
 #include "checksum.h"
+#include "super.h"
+#include "super-io.h"
 
 #include <linux/crc32c.h>
+#include <linux/crypto.h>
+#include <linux/key.h>
+#include <linux/random.h>
+#include <linux/scatterlist.h>
+#include <crypto/algapi.h>
 #include <crypto/chacha20.h>
 #include <crypto/hash.h>
 #include <crypto/poly1305.h>
+#include <keys/user-type.h>
 
 /*
  * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any
@@ -129,7 +137,35 @@ u64 bch_crc64_update(u64 crc, const void *_data, size_t len)
        return crc;
 }
 
-u64 bch_checksum_update(unsigned type, u64 crc, const void *data, size_t len)
+static u64 bch_checksum_init(unsigned type)
+{
+       switch (type) {
+       case BCH_CSUM_NONE:
+               return 0;
+       case BCH_CSUM_CRC32C:
+               return U32_MAX;
+       case BCH_CSUM_CRC64:
+               return U64_MAX;
+       default:
+               BUG();
+       }
+}
+
+static u64 bch_checksum_final(unsigned type, u64 crc)
+{
+       switch (type) {
+       case BCH_CSUM_NONE:
+               return 0;
+       case BCH_CSUM_CRC32C:
+               return crc ^ U32_MAX;
+       case BCH_CSUM_CRC64:
+               return crc ^ U64_MAX;
+       default:
+               BUG();
+       }
+}
+
+static u64 bch_checksum_update(unsigned type, u64 crc, const void *data, size_t len)
 {
        switch (type) {
        case BCH_CSUM_NONE:
@@ -143,32 +179,416 @@ u64 bch_checksum_update(unsigned type, u64 crc, const void *data, size_t len)
        }
 }
 
-u64 bch_checksum(unsigned type, const void *data, size_t len)
+static inline void do_encrypt_sg(struct crypto_blkcipher *tfm,
+                                struct nonce nonce,
+                                struct scatterlist *sg, size_t len)
+{
+       struct blkcipher_desc desc = { .tfm = tfm, .info = nonce.d };
+       int ret;
+
+       ret = crypto_blkcipher_encrypt_iv(&desc, sg, sg, len);
+       BUG_ON(ret);
+}
+
+static inline void do_encrypt(struct crypto_blkcipher *tfm,
+                             struct nonce nonce,
+                             void *buf, size_t len)
+{
+       struct scatterlist sg;
+
+       sg_init_one(&sg, buf, len);
+       do_encrypt_sg(tfm, nonce, &sg, len);
+}
+
+int bch_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
+                          void *buf, size_t len)
+{
+       struct crypto_blkcipher *chacha20 =
+               crypto_alloc_blkcipher("chacha20", 0, CRYPTO_ALG_ASYNC);
+       int ret;
+
+       if (IS_ERR(chacha20))
+               return PTR_ERR(chacha20);
+
+       ret = crypto_blkcipher_setkey(chacha20, (void *) key, sizeof(*key));
+       if (ret)
+               goto err;
+
+       do_encrypt(chacha20, nonce, buf, len);
+err:
+       crypto_free_blkcipher(chacha20);
+       return ret;
+}
+
+static void gen_poly_key(struct cache_set *c, struct shash_desc *desc,
+                        struct nonce nonce)
+{
+       u8 key[POLY1305_KEY_SIZE];
+
+       nonce.d[3] ^= BCH_NONCE_POLY;
+
+       memset(key, 0, sizeof(key));
+       do_encrypt(c->chacha20, nonce, key, sizeof(key));
+
+       desc->tfm = c->poly1305;
+       desc->flags = 0;
+       crypto_shash_init(desc);
+       crypto_shash_update(desc, key, sizeof(key));
+}
+
+struct bch_csum bch_checksum(struct cache_set *c, unsigned type,
+                            struct nonce nonce, const void *data, size_t len)
 {
-       u64 crc = 0xffffffffffffffffULL;
+       switch (type) {
+       case BCH_CSUM_NONE:
+       case BCH_CSUM_CRC32C:
+       case BCH_CSUM_CRC64: {
+               u64 crc = bch_checksum_init(type);
+
+               crc = bch_checksum_update(type, crc, data, len);
+               crc = bch_checksum_final(type, crc);
+
+               return (struct bch_csum) { .lo = crc };
+       }
+
+       case BCH_CSUM_CHACHA20_POLY1305_80:
+       case BCH_CSUM_CHACHA20_POLY1305_128: {
+               SHASH_DESC_ON_STACK(desc, c->poly1305);
+               u8 digest[POLY1305_DIGEST_SIZE];
+               struct bch_csum ret = { 0 };
+
+               gen_poly_key(c, desc, nonce);
+
+               crypto_shash_update(desc, data, len);
+               crypto_shash_final(desc, digest);
+
+               memcpy(&ret, digest, bch_crc_bytes[type]);
+               return ret;
+       }
+       default:
+               BUG();
+       }
+}
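
The two CHACHA20_POLY1305 cases above are the standard ChaCha20-Poly1305 construction: gen_poly_key() derives a one-time Poly1305 key by encrypting 32 zero bytes (i.e. taking the first 32 bytes of keystream) under the data nonce with the POLY bit set, and the 16-byte tag is then truncated to bch_crc_bytes[type], 10 bytes for the _80 variant. A minimal userspace sketch of the same construction with libsodium; note libsodium's crypto_stream_chacha20 takes an 8-byte nonce rather than the kernel cipher's 16-byte IV, so this is illustrative, not bit-compatible:

        #include <sodium.h>
        #include <string.h>

        /* tag_len would be 10 for ..._POLY1305_80, 16 for ..._POLY1305_128 */
        static void chacha_poly_mac(unsigned char *tag, size_t tag_len,
                                    const unsigned char *data, size_t len,
                                    const unsigned char key[crypto_stream_chacha20_KEYBYTES],
                                    const unsigned char nonce[crypto_stream_chacha20_NONCEBYTES])
        {
                unsigned char poly_key[crypto_onetimeauth_KEYBYTES] = { 0 };
                unsigned char digest[crypto_onetimeauth_BYTES];

                /* keystream XOR zeroes == keystream: its first 32 bytes
                 * become the one-time Poly1305 key, as in gen_poly_key() */
                crypto_stream_chacha20_xor(poly_key, poly_key, sizeof(poly_key),
                                           nonce, key);

                crypto_onetimeauth(digest, data, len, poly_key);
                memcpy(tag, digest, tag_len);   /* truncate like bch_crc_bytes[] */
        }

The 80-bit variant trades tag size for space in the extent; since Poly1305 is a keyed MAC, a forgery still has to beat the key, not merely collision-search the truncated tag.
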
 
-       crc = bch_checksum_update(type, crc, data, len);
+void bch_encrypt(struct cache_set *c, unsigned type,
+                struct nonce nonce, void *data, size_t len)
+{
+       if (!bch_csum_type_is_encryption(type))
+               return;
 
-       return crc ^ 0xffffffffffffffffULL;
+       do_encrypt(c->chacha20, nonce, data, len);
 }
 
-u32 bch_checksum_bio(struct bio *bio, unsigned type)
+struct bch_csum bch_checksum_bio(struct cache_set *c, unsigned type,
+                                struct nonce nonce, struct bio *bio)
 {
        struct bio_vec bv;
        struct bvec_iter iter;
-       u32 csum = U32_MAX;
 
-       if (type == BCH_CSUM_NONE)
-               return 0;
+       switch (type) {
+       case BCH_CSUM_NONE:
+               return (struct bch_csum) { 0 };
+       case BCH_CSUM_CRC32C:
+       case BCH_CSUM_CRC64: {
+               u64 crc = bch_checksum_init(type);
+
+               bio_for_each_segment(bv, bio, iter) {
+                       void *p = kmap_atomic(bv.bv_page) + bv.bv_offset;
+
+                       crc = bch_checksum_update(type,
+                               crc, p, bv.bv_len);
+                       kunmap_atomic(p);
+               }
+
+               crc = bch_checksum_final(type, crc);
+               return (struct bch_csum) { .lo = crc };
+       }
+
+       case BCH_CSUM_CHACHA20_POLY1305_80:
+       case BCH_CSUM_CHACHA20_POLY1305_128: {
+               SHASH_DESC_ON_STACK(desc, c->poly1305);
+               u8 digest[POLY1305_DIGEST_SIZE];
+               struct bch_csum ret = { 0 };
+
+               gen_poly_key(c, desc, nonce);
+
+               bio_for_each_segment(bv, bio, iter) {
+                       void *p = kmap_atomic(bv.bv_page) + bv.bv_offset;
+
+                       crypto_shash_update(desc, p, bv.bv_len);
+                       kunmap_atomic(p);
+               }
+
+               crypto_shash_final(desc, digest);
+
+               memcpy(&ret, digest, bch_crc_bytes[type]);
+               return ret;
+       }
+       default:
+               BUG();
+       }
+}
+
+void bch_encrypt_bio(struct cache_set *c, unsigned type,
+                    struct nonce nonce, struct bio *bio)
+{
+       struct bio_vec bv;
+       struct bvec_iter iter;
+       struct scatterlist sgl[16], *sg = sgl;
+       size_t bytes = 0;
+
+       if (!bch_csum_type_is_encryption(type))
+               return;
+
+       sg_init_table(sgl, ARRAY_SIZE(sgl));
 
        bio_for_each_segment(bv, bio, iter) {
-               void *p = kmap_atomic(bv.bv_page);
+               if (sg == sgl + ARRAY_SIZE(sgl)) {
+                       sg_mark_end(sg - 1);
+                       do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+
+                       le32_add_cpu(nonce.d, bytes / CHACHA20_BLOCK_SIZE);
+                       bytes = 0;
+
+                       sg_init_table(sgl, ARRAY_SIZE(sgl));
+                       sg = sgl;
+               }
+
+               sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset);
+               bytes += bv.bv_len;
+       }
+
+       sg_mark_end(sg - 1);
+       do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
+}
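
bch_encrypt_bio() above batches bio segments sixteen scatterlist entries at a time, and between batches advances the first nonce word by the number of 64-byte ChaCha20 blocks consumed, so each batch resumes the same keystream. A userspace sketch of that bookkeeping using libsodium's explicit-counter call (it assumes every chunk except the last is a multiple of the 64-byte block, which holds when the chunks are whole 512-byte sectors):

        #include <sodium.h>
        #include <stdint.h>

        static void encrypt_chunked(unsigned char *buf, size_t len, size_t chunk,
                                    const unsigned char key[crypto_stream_chacha20_KEYBYTES],
                                    const unsigned char nonce[crypto_stream_chacha20_NONCEBYTES])
        {
                uint64_t block = 0;     /* keystream position, in 64-byte blocks */
                size_t off = 0;

                while (off < len) {
                        size_t n = len - off < chunk ? len - off : chunk;

                        /* _ic = initial counter: resume the keystream where the
                         * previous chunk stopped, like le32_add_cpu(nonce.d, ...) */
                        crypto_stream_chacha20_xor_ic(buf + off, buf + off, n,
                                                      nonce, block, key);
                        off += n;
                        block += n / 64;
                }
        }
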
+
+#ifdef __KERNEL__
+int bch_request_key(struct bch_sb *sb, struct bch_key *key)
+{
+       char key_description[60];
+       struct key *keyring_key;
+       const struct user_key_payload *ukp;
+       int ret;
+
+       snprintf(key_description, sizeof(key_description),
+                "bcache:%pUb", &sb->user_uuid);
+
+       keyring_key = request_key(&key_type_logon, key_description, NULL);
+       if (IS_ERR(keyring_key))
+               return PTR_ERR(keyring_key);
+
+       down_read(&keyring_key->sem);
+       ukp = user_key_payload(keyring_key);
+       if (ukp->datalen == sizeof(*key)) {
+               memcpy(key, ukp->data, ukp->datalen);
+               ret = 0;
+       } else {
+               ret = -EINVAL;
+       }
+       up_read(&keyring_key->sem);
+       key_put(keyring_key);
+
+       return ret;
+}
+#else
+#include <keyutils.h>
+#include <uuid/uuid.h>
+
+int bch_request_key(struct bch_sb *sb, struct bch_key *key)
+{
+       key_serial_t key_id;
+       char key_description[60];
+       char uuid[40];
+
+       uuid_unparse_lower(sb->user_uuid.b, uuid);
+       sprintf(key_description, "bcache:%s", uuid);
+
+       key_id = request_key("user", key_description, NULL,
+                            KEY_SPEC_USER_KEYRING);
+       if (key_id < 0)
+               return -errno;
+
+       if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key))
+               return -1;
+
+       return 0;
+}
+#endif
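
The userspace bch_request_key() above expects a 32-byte key to already be sitting in the user keyring under a "bcache:<uuid>" description. The producing side, sketched with the same keyutils API (the helper name is hypothetical; in practice the key would first be derived from a passphrase rather than passed in raw):

        #include <keyutils.h>
        #include <stdio.h>

        /* hypothetical helper: make a key visible to bch_request_key() */
        static int store_bcache_key(const char *uuid, const void *key, size_t len)
        {
                char description[60];
                key_serial_t id;

                snprintf(description, sizeof(description), "bcache:%s", uuid);

                id = add_key("user", description, key, len,
                             KEY_SPEC_USER_KEYRING);
                return id < 0 ? -1 : 0;
        }
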
 
-               csum = bch_checksum_update(type, csum,
-                                          p + bv.bv_offset,
-                                          bv.bv_len);
-               kunmap_atomic(p);
+static int bch_decrypt_sb_key(struct cache_set *c,
+                             struct bch_sb_field_crypt *crypt,
+                             struct bch_key *key)
+{
+       struct bch_encrypted_key sb_key = crypt->key;
+       struct bch_key user_key;
+       int ret = 0;
+
+       /* is key encrypted? */
+       if (!bch_key_is_encrypted(&sb_key))
+               goto out;
+
+       ret = bch_request_key(c->disk_sb, &user_key);
+       if (ret) {
+               bch_err(c, "error requesting encryption key");
+               goto err;
        }
 
-       return csum ^= U32_MAX;
+       /* decrypt real key: */
+       ret = bch_chacha_encrypt_key(&user_key, bch_sb_key_nonce(c),
+                            &sb_key, sizeof(sb_key));
+       if (ret)
+               goto err;
+
+       if (bch_key_is_encrypted(&sb_key)) {
+               bch_err(c, "incorrect encryption key");
+               ret = -EINVAL;
+               goto err;
+       }
+out:
+       *key = sb_key.key;
+err:
+       memzero_explicit(&sb_key, sizeof(sb_key));
+       memzero_explicit(&user_key, sizeof(user_key));
+       return ret;
+}
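
bch_decrypt_sb_key() works because ChaCha20 "encryption" of the superblock key is an XOR with keystream: unwrapping is literally the same call, and the magic field distinguishes a successful unwrap from keystream garbage, which is how an incorrect passphrase gets detected. A userspace sketch of that shape (the struct layout and magic constant are stand-ins, not the on-disk bch_encrypted_key format):

        #include <sodium.h>
        #include <stdint.h>

        #define EXAMPLE_KEY_MAGIC 0x6b63616263ULL      /* stand-in, not BCH_KEY_MAGIC */

        struct wrapped_key {
                uint64_t        magic;
                unsigned char   key[32];
        };

        static int unwrap_key(struct wrapped_key *k,
                              const unsigned char user_key[crypto_stream_chacha20_KEYBYTES],
                              const unsigned char nonce[crypto_stream_chacha20_NONCEBYTES])
        {
                /* encrypt == decrypt: XOR with the same keystream */
                crypto_stream_chacha20_xor((unsigned char *) k, (unsigned char *) k,
                                           sizeof(*k), nonce, user_key);

                /* wrong user key -> magic comes out as garbage */
                return k->magic == EXAMPLE_KEY_MAGIC ? 0 : -1;
        }
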
+
+static int bch_alloc_ciphers(struct cache_set *c)
+{
+       if (!c->chacha20)
+               c->chacha20 = crypto_alloc_blkcipher("chacha20", 0,
+                                                    CRYPTO_ALG_ASYNC);
+       if (IS_ERR(c->chacha20))
+               return PTR_ERR(c->chacha20);
+
+       if (!c->poly1305)
+               c->poly1305 = crypto_alloc_shash("poly1305", 0, 0);
+       if (IS_ERR(c->poly1305))
+               return PTR_ERR(c->poly1305);
+
+       return 0;
+}
+
+int bch_disable_encryption(struct cache_set *c)
+{
+       struct bch_sb_field_crypt *crypt;
+       struct bch_key key;
+       int ret = -EINVAL;
+
+       mutex_lock(&c->sb_lock);
+
+       crypt = bch_sb_get_crypt(c->disk_sb);
+       if (!crypt)
+               goto out;
+
+       /* is key encrypted? */
+       ret = 0;
+       if (bch_key_is_encrypted(&crypt->key))
+               goto out;
+
+       ret = bch_decrypt_sb_key(c, crypt, &key);
+       if (ret)
+               goto out;
+
+       crypt->key.magic        = BCH_KEY_MAGIC;
+       crypt->key.key          = key;
+
+       SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb, 0);
+       bch_write_super(c);
+out:
+       mutex_unlock(&c->sb_lock);
+
+       return ret;
+}
+
+int bch_enable_encryption(struct cache_set *c, bool keyed)
+{
+       struct bch_encrypted_key key;
+       struct bch_key user_key;
+       struct bch_sb_field_crypt *crypt;
+       int ret = -EINVAL;
+
+       mutex_lock(&c->sb_lock);
+
+       /* Do we already have an encryption key? */
+       if (bch_sb_get_crypt(c->disk_sb))
+               goto err;
+
+       ret = bch_alloc_ciphers(c);
+       if (ret)
+               goto err;
+
+       key.magic = BCH_KEY_MAGIC;
+       get_random_bytes(&key.key, sizeof(key.key));
+
+       if (keyed) {
+               ret = bch_request_key(c->disk_sb, &user_key);
+               if (ret) {
+                       bch_err(c, "error requesting encryption key");
+                       goto err;
+               }
+
+               ret = bch_chacha_encrypt_key(&user_key, bch_sb_key_nonce(c),
+                                            &key, sizeof(key));
+               if (ret)
+                       goto err;
+       }
+
+       ret = crypto_blkcipher_setkey(c->chacha20,
+                       (void *) &key.key, sizeof(key.key));
+       if (ret)
+               goto err;
+
+       crypt = container_of_or_null(bch_fs_sb_field_resize(c, NULL,
+                                               sizeof(*crypt) / sizeof(u64)),
+                                    struct bch_sb_field_crypt, field);
+       if (!crypt) {
+               ret = -ENOMEM; /* XXX this technically could be -ENOSPC */
+               goto err;
+       }
+
+       crypt->field.type = BCH_SB_FIELD_crypt;
+       crypt->key = key;
+
+       /* write superblock */
+       SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb, 1);
+       bch_write_super(c);
+err:
+       mutex_unlock(&c->sb_lock);
+       memzero_explicit(&user_key, sizeof(user_key));
+       memzero_explicit(&key, sizeof(key));
+       return ret;
+}
+
+void bch_cache_set_encryption_free(struct cache_set *c)
+{
+       if (!IS_ERR_OR_NULL(c->poly1305))
+               crypto_free_shash(c->poly1305);
+       if (!IS_ERR_OR_NULL(c->chacha20))
+               crypto_free_blkcipher(c->chacha20);
+}
+
+int bch_cache_set_encryption_init(struct cache_set *c)
+{
+       struct bch_sb_field_crypt *crypt;
+       struct bch_key key;
+       int ret;
+
+       crypt = bch_sb_get_crypt(c->disk_sb);
+       if (!crypt)
+               return 0;
+
+       ret = bch_alloc_ciphers(c);
+       if (ret)
+               return ret;
+
+       ret = bch_decrypt_sb_key(c, crypt, &key);
+       if (ret)
+               goto err;
+
+       ret = crypto_blkcipher_setkey(c->chacha20,
+                       (void *) &key.key, sizeof(key.key));
+err:
+       memzero_explicit(&key, sizeof(key));
+       return ret;
 }
index 196b7e8c52ca1e3376bf65fb80cb58a8115d7cab..a9a1758791a92ab332ae94c28f6c5fa29daaa09d 100644 (file)
 #ifndef _BCACHE_CHECKSUM_H
 #define _BCACHE_CHECKSUM_H
 
-#include "btree_types.h"
+#include "bcache.h"
+#include "super-io.h"
+
+#include <crypto/chacha20.h>
 
 u64 bch_crc64_update(u64, const void *, size_t);
 
-u64 bch_checksum_update(unsigned, u64, const void *, size_t);
-u64 bch_checksum(unsigned, const void *, size_t);
-u32 bch_checksum_bio(struct bio *, unsigned);
+#define BCH_NONCE_EXTENT       cpu_to_le32(1 << 28)
+#define BCH_NONCE_BTREE                cpu_to_le32(2 << 28)
+#define BCH_NONCE_JOURNAL      cpu_to_le32(3 << 28)
+#define BCH_NONCE_PRIO         cpu_to_le32(4 << 28)
+#define BCH_NONCE_POLY         cpu_to_le32(1 << 31)
+
+struct bch_csum bch_checksum(struct cache_set *, unsigned, struct nonce,
+                            const void *, size_t);
 
 /*
- * This is used for various on disk data structures - cache_sb, prio_set, bset,
- * jset: The checksum is _always_ the first 8 bytes of these structs
+ * This is used for various on disk data structures - bch_sb, prio_set, bset,
+ * jset: The checksum is _always_ the first field of these structs
  */
-#define __csum_set(i, u64s, type)                                      \
+#define csum_vstruct(_c, _type, _nonce, _i)                            \
 ({                                                                     \
-       const void *start = ((const void *) (i)) + sizeof(u64);         \
-       const void *end = __bkey_idx(i, u64s);                          \
+       const void *start = ((const void *) (_i)) + sizeof((_i)->csum); \
+       const void *end = vstruct_end(_i);                              \
                                                                        \
-       bch_checksum(type, start, end - start);                         \
+       bch_checksum(_c, _type, _nonce, start, end - start);            \
 })
 
+int bch_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
+int bch_request_key(struct bch_sb *, struct bch_key *);
+
+void bch_encrypt(struct cache_set *, unsigned, struct nonce,
+                void *data, size_t);
+
+struct bch_csum bch_checksum_bio(struct cache_set *, unsigned,
+                                struct nonce, struct bio *);
+void bch_encrypt_bio(struct cache_set *, unsigned,
+                   struct nonce, struct bio *);
+
+int bch_disable_encryption(struct cache_set *);
+int bch_enable_encryption(struct cache_set *, bool);
+
+void bch_cache_set_encryption_free(struct cache_set *);
+int bch_cache_set_encryption_init(struct cache_set *);
+
+static inline unsigned bch_data_checksum_type(struct cache_set *c)
+{
+       if (c->sb.encryption_type)
+               return c->opts.wide_macs
+                       ? BCH_CSUM_CHACHA20_POLY1305_128
+                       : BCH_CSUM_CHACHA20_POLY1305_80;
+
+       return c->opts.data_checksum;
+}
+
+static inline unsigned bch_meta_checksum_type(struct cache_set *c)
+{
+       return c->sb.encryption_type
+               ? BCH_CSUM_CHACHA20_POLY1305_128
+               : c->opts.metadata_checksum;
+}
+
+static inline bool bch_checksum_type_valid(const struct cache_set *c,
+                                          unsigned type)
+{
+       if (type >= BCH_CSUM_NR)
+               return false;
+
+       if (bch_csum_type_is_encryption(type) && !c->chacha20)
+               return false;
+
+       return true;
+}
+
+static const unsigned bch_crc_bytes[] = {
+       [BCH_CSUM_NONE]                         = 0,
+       [BCH_CSUM_CRC32C]                       = 4,
+       [BCH_CSUM_CRC64]                        = 8,
+       [BCH_CSUM_CHACHA20_POLY1305_80]         = 10,
+       [BCH_CSUM_CHACHA20_POLY1305_128]        = 16,
+};
+
+static inline bool bch_crc_cmp(struct bch_csum l, struct bch_csum r)
+{
+       /*
+        * XXX: need some way of preventing the compiler from optimizing this
+        * into a form that isn't constant time..
+        */
+       return ((l.lo ^ r.lo) | (l.hi ^ r.hi)) != 0;
+}
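
One conventional answer to the XXX in bch_crc_cmp() (which, despite the name, returns true when the checksums differ) is to accumulate the byte-wise XOR through a volatile, so the compiler cannot short-circuit or branch on the data; a sketch of that approach, not what the code above does yet:

        #include <stdbool.h>
        #include <stddef.h>

        static inline bool const_time_neq(const void *a, const void *b, size_t len)
        {
                const unsigned char *pa = a, *pb = b;
                volatile unsigned char diff = 0;
                size_t i;

                /* every byte is always visited; diff ends up nonzero
                 * iff any byte pair differs */
                for (i = 0; i < len; i++)
                        diff |= pa[i] ^ pb[i];

                return diff != 0;
        }
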
+
+/* for skipping ahead and encrypting/decrypting at an offset: */
+static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
+{
+       EBUG_ON(offset & (CHACHA20_BLOCK_SIZE - 1));
+
+       le32_add_cpu(&nonce.d[0], offset / CHACHA20_BLOCK_SIZE);
+       return nonce;
+}
+
+static inline bool bch_key_is_encrypted(struct bch_encrypted_key *key)
+{
+       return le64_to_cpu(key->magic) != BCH_KEY_MAGIC;
+}
+
+static inline struct nonce __bch_sb_key_nonce(struct bch_sb *sb)
+{
+       __le64 magic = __bch_sb_magic(sb);
+
+       return (struct nonce) {{
+               [0] = 0,
+               [1] = 0,
+               [2] = ((__le32 *) &magic)[0],
+               [3] = ((__le32 *) &magic)[1],
+       }};
+}
+
+static inline struct nonce bch_sb_key_nonce(struct cache_set *c)
+{
+       __le64 magic = bch_sb_magic(c);
+
+       return (struct nonce) {{
+               [0] = 0,
+               [1] = 0,
+               [2] = ((__le32 *) &magic)[0],
+               [3] = ((__le32 *) &magic)[1],
+       }};
+}
+
 #endif /* _BCACHE_CHECKSUM_H */
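
The BCH_NONCE_* constants tag the high bits of the last nonce word with the class of data being encrypted, so an extent and, say, a journal entry can never end up under the same (key, nonce) pair even if their low words collide. Roughly how a journal nonce would be assembled under that scheme (illustrative, using plain uint32_t where the real code keeps everything __le32):

        #include <stdint.h>

        struct nonce_ex { uint32_t d[4]; };     /* mirrors struct nonce */

        static struct nonce_ex journal_nonce_ex(uint64_t seq)
        {
                return (struct nonce_ex) {{
                        0,
                        (uint32_t) seq,         /* low 32 bits of journal seq */
                        (uint32_t) (seq >> 32), /* high 32 bits */
                        3U << 28,               /* BCH_NONCE_JOURNAL tag */
                }};
        }
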
index f7bfd57f9578ebbc1cdc62563eb55a328b487c1b..e76850be4e29d8b3cb8c37530fb14de412b38d4f 100644 (file)
@@ -1,6 +1,8 @@
 #include "bcache.h"
 #include "compress.h"
+#include "extents.h"
 #include "io.h"
+#include "super-io.h"
 
 #include <linux/lz4.h>
 #include <linux/zlib.h>
@@ -50,7 +52,7 @@ static void *__bio_map_or_bounce(struct cache_set *c,
        unsigned prev_end = PAGE_SIZE;
        void *data;
 
-       BUG_ON(bvec_iter_sectors(start) > BCH_COMPRESSED_EXTENT_MAX);
+       BUG_ON(bvec_iter_sectors(start) > BCH_ENCODED_EXTENT_MAX);
 
        *bounced = BOUNCED_MAPPED;
 
@@ -118,12 +120,12 @@ static void bio_unmap_or_unbounce(struct cache_set *c, void *data,
 }
 
 static int __bio_uncompress(struct cache_set *c, struct bio *src,
-                           void *dst_data, struct bch_extent_crc64 crc)
+                           void *dst_data, struct bch_extent_crc128 crc)
 {
        void *src_data = NULL;
        unsigned src_bounced;
        size_t src_len = src->bi_iter.bi_size;
-       size_t dst_len = crc.uncompressed_size << 9;
+       size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9;
        int ret;
 
        src_data = bio_map_or_bounce(c, src, &src_bounced, READ);
@@ -179,10 +181,10 @@ err:
 
 int bch_bio_uncompress_inplace(struct cache_set *c, struct bio *bio,
                               unsigned live_data_sectors,
-                              struct bch_extent_crc64 crc)
+                              struct bch_extent_crc128 crc)
 {
        void *dst_data = NULL;
-       size_t dst_len = crc.uncompressed_size << 9;
+       size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9;
        int ret = -ENOMEM;
 
        BUG_ON(DIV_ROUND_UP(live_data_sectors, PAGE_SECTORS) > bio->bi_max_vecs);
@@ -231,11 +233,11 @@ use_mempool:
 
 int bch_bio_uncompress(struct cache_set *c, struct bio *src,
                       struct bio *dst, struct bvec_iter dst_iter,
-                      struct bch_extent_crc64 crc)
+                      struct bch_extent_crc128 crc)
 {
        void *dst_data = NULL;
        unsigned dst_bounced;
-       size_t dst_len = crc.uncompressed_size << 9;
+       size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9;
        int ret = -ENOMEM;
 
        dst_data = dst_len == dst_iter.bi_size
@@ -273,28 +275,23 @@ static int __bio_compress(struct cache_set *c,
                *src_len = src->bi_iter.bi_size;
 
                workspace = mempool_alloc(&c->lz4_workspace_pool, GFP_NOIO);
-retry_compress:
-               ret = lz4_compress(src_data, *src_len,
-                                  dst_data, dst_len,
-                                  workspace);
-               /*
-                * On error, the compressed data was bigger than dst_len, and
-                * -ret is the amount of data we were able to compress - round
-                * down to nearest block and try again:
-                */
-               if (ret && round_down(-ret, block_bytes(c)) > *dst_len) {
-                       BUG_ON(ret > 0);
 
-                       /* not supposed to happen */
-                       if (WARN_ON(-ret >= *src_len))
-                               goto err;
+               while (*src_len > block_bytes(c) &&
+                      (ret = lz4_compress(src_data, *src_len,
+                                          dst_data, dst_len,
+                                          workspace))) {
+                       /*
+                        * On error, the compressed data was bigger than
+                        * dst_len, and -ret is the amount of data we were able
+                        * to compress - round down to nearest block and try
+                        * again:
+                        */
+                       BUG_ON(ret > 0);
+                       BUG_ON(-ret >= *src_len);
 
                        *src_len = round_down(-ret, block_bytes(c));
-                       if (!*src_len)
-                               goto err;
-
-                       goto retry_compress;
                }
+
                mempool_free(workspace, &c->lz4_workspace_pool);
 
                if (ret)
@@ -354,6 +351,10 @@ zlib_err:
        }
 
        BUG_ON(!*dst_len);
+       BUG_ON(*dst_len > dst->bi_iter.bi_size);
+
+       BUG_ON(*src_len & (block_bytes(c) - 1));
+       BUG_ON(*src_len > src->bi_iter.bi_size);
 
        /* Didn't get smaller: */
        if (round_up(*dst_len, block_bytes(c)) >= *src_len) {
@@ -382,9 +383,9 @@ void bch_bio_compress(struct cache_set *c,
        unsigned orig_dst = dst->bi_iter.bi_size;
        unsigned orig_src = src->bi_iter.bi_size;
 
-       /* Don't consume more than BCH_COMPRESSED_EXTENT_MAX from @src: */
+       /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */
        src->bi_iter.bi_size =
-               min(src->bi_iter.bi_size, BCH_COMPRESSED_EXTENT_MAX << 9);
+               min(src->bi_iter.bi_size, BCH_ENCODED_EXTENT_MAX << 9);
 
        /* Don't generate a bigger output than input: */
        dst->bi_iter.bi_size =
@@ -405,6 +406,30 @@ out:
        src->bi_iter.bi_size = orig_src;
 }
 
+/* doesn't write superblock: */
+int bch_check_set_has_compressed_data(struct cache_set *c,
+                                     unsigned compression_type)
+{
+       switch (compression_type) {
+       case BCH_COMPRESSION_NONE:
+               return 0;
+       case BCH_COMPRESSION_LZ4:
+               if (bch_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4))
+                       return 0;
+
+               bch_sb_set_feature(c->disk_sb, BCH_FEATURE_LZ4);
+               break;
+       case BCH_COMPRESSION_GZIP:
+               if (bch_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP))
+                       return 0;
+
+               bch_sb_set_feature(c->disk_sb, BCH_FEATURE_GZIP);
+               break;
+       }
+
+       return bch_compress_init(c);
+}
+
 void bch_compress_free(struct cache_set *c)
 {
        vfree(c->zlib_workspace);
@@ -420,39 +445,56 @@ void bch_compress_free(struct cache_set *c)
 
 int bch_compress_init(struct cache_set *c)
 {
+       unsigned order = get_order(BCH_ENCODED_EXTENT_MAX << 9);
        int ret, cpu;
 
-       c->bio_decompress_worker = alloc_percpu(*c->bio_decompress_worker);
-       if (!c->bio_decompress_worker)
-               return -ENOMEM;
+       if (!bch_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4) &&
+           !bch_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP))
+               return 0;
 
-       for_each_possible_cpu(cpu) {
-               struct bio_decompress_worker *d =
-                       per_cpu_ptr(c->bio_decompress_worker, cpu);
+       if (!c->bio_decompress_worker) {
+               c->bio_decompress_worker = alloc_percpu(*c->bio_decompress_worker);
+               if (!c->bio_decompress_worker)
+                       return -ENOMEM;
 
-               d->c = c;
-               INIT_WORK(&d->work, bch_bio_decompress_work);
-               init_llist_head(&d->bio_list);
+               for_each_possible_cpu(cpu) {
+                       struct bio_decompress_worker *d =
+                               per_cpu_ptr(c->bio_decompress_worker, cpu);
+
+                       d->c = c;
+                       INIT_WORK(&d->work, bch_bio_decompress_work);
+                       init_llist_head(&d->bio_list);
+               }
        }
 
-       ret = mempool_init_page_pool(&c->compression_bounce[READ], 1,
-                                    get_order(BCH_COMPRESSED_EXTENT_MAX << 9));
-       if (ret)
-               return ret;
+       if (!mempool_initialized(&c->compression_bounce[READ])) {
+               ret = mempool_init_page_pool(&c->compression_bounce[READ],
+                                            1, order);
+               if (ret)
+                       return ret;
+       }
 
-       ret = mempool_init_page_pool(&c->compression_bounce[WRITE], 1,
-                                    get_order(BCH_COMPRESSED_EXTENT_MAX << 9));
-       if (ret)
-               return ret;
+       if (!mempool_initialized(&c->compression_bounce[WRITE])) {
+               ret = mempool_init_page_pool(&c->compression_bounce[WRITE],
+                                            1, order);
+               if (ret)
+                       return ret;
+       }
 
-       ret = mempool_init_kmalloc_pool(&c->lz4_workspace_pool, 1,
-                                       LZ4_MEM_COMPRESS);
-       if (ret)
-               return ret;
+       if (!mempool_initialized(&c->lz4_workspace_pool) &&
+           bch_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4)) {
+               ret = mempool_init_kmalloc_pool(&c->lz4_workspace_pool,
+                                               1, LZ4_MEM_COMPRESS);
+               if (ret)
+                       return ret;
+       }
 
-       c->zlib_workspace = vmalloc(COMPRESSION_WORKSPACE_SIZE);
-       if (!c->zlib_workspace)
-               return -ENOMEM;
+       if (!c->zlib_workspace &&
+           bch_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP)) {
+               c->zlib_workspace = vmalloc(COMPRESSION_WORKSPACE_SIZE);
+               if (!c->zlib_workspace)
+                       return -ENOMEM;
+       }
 
        return 0;
 }
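
The LZ4 retry loop in __bio_compress() above exists because the old kernel lz4_compress() only reports, via its negative return, how much input it managed to fit; the loop rounds that down to a block boundary and recompresses so the uncompressed tail stays block-aligned. Modern userspace liblz4 exposes the same "fill the destination" semantics directly; a rough equivalent (hypothetical helper, block assumed to be a power of two):

        #include <lz4.h>

        static int compress_block_aligned(const char *src, int *src_len,
                                          char *dst, int dst_len, int block)
        {
                int consumed = *src_len;
                int out;

                /* compress as much of src as fits into dst_len bytes: */
                out = LZ4_compress_destSize(src, dst, &consumed, dst_len);
                if (out <= 0 || consumed < block)
                        return -1;

                /* round the consumed input down to a block boundary and redo
                 * that prefix, mirroring the round_down()-and-retry above */
                consumed &= ~(block - 1);
                out = LZ4_compress_default(src, dst, consumed, dst_len);
                if (out <= 0)
                        return -1;

                *src_len = consumed;
                return out;             /* compressed bytes written to dst */
        }
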
index 02578ef73c9a50ded79d1580613f6cc448d358e0..485acd95249e7a81d0996e759a7ddd06a0bee0c0 100644 (file)
@@ -2,12 +2,13 @@
 #define _BCACHE_COMPRESS_H
 
 int bch_bio_uncompress_inplace(struct cache_set *, struct bio *,
-                              unsigned, struct bch_extent_crc64);
+                              unsigned, struct bch_extent_crc128);
 int bch_bio_uncompress(struct cache_set *, struct bio *, struct bio *,
-                      struct bvec_iter, struct bch_extent_crc64);
+                      struct bvec_iter, struct bch_extent_crc128);
 void bch_bio_compress(struct cache_set *, struct bio *, size_t *,
                      struct bio *, size_t *, unsigned *);
 
+int bch_check_set_has_compressed_data(struct cache_set *, unsigned);
 void bch_compress_free(struct cache_set *);
 int bch_compress_init(struct cache_set *);
 
index 39f5550e887c583e4d37891f58e952d4ea425d17..d25c32aea29aba99b3cebd833b2a581b8d9ab06e 100644 (file)
@@ -96,7 +96,7 @@ void __bch_btree_verify(struct cache_set *c, struct btree *b)
        if (inmemory->u64s != sorted->u64s ||
            memcmp(inmemory->start,
                   sorted->start,
-                  (void *) bset_bkey_last(inmemory) - (void *) inmemory->start)) {
+                  vstruct_end(inmemory) - (void *) inmemory->start)) {
                unsigned offset = 0, sectors;
                struct bset *i;
                unsigned j;
@@ -112,18 +112,14 @@ void __bch_btree_verify(struct cache_set *c, struct btree *b)
                while (offset < b->written) {
                        if (!offset) {
                                i = &n_ondisk->keys;
-                               sectors = __set_blocks(n_ondisk,
-                                                      le16_to_cpu(n_ondisk->keys.u64s),
-                                                      block_bytes(c)) <<
+                               sectors = vstruct_blocks(n_ondisk, c->block_bits) <<
                                        c->block_bits;
                        } else {
                                struct btree_node_entry *bne =
                                        (void *) n_ondisk + (offset << 9);
                                i = &bne->keys;
 
-                               sectors = __set_blocks(bne,
-                                                      le16_to_cpu(bne->keys.u64s),
-                                                      block_bytes(c)) <<
+                               sectors = vstruct_blocks(bne, c->block_bits) <<
                                        c->block_bits;
                        }
 
@@ -427,7 +423,7 @@ void bch_debug_init_cache_set(struct cache_set *c)
        if (IS_ERR_OR_NULL(bch_debug))
                return;
 
-       snprintf(name, sizeof(name), "%pU", c->disk_sb.user_uuid.b);
+       snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b);
        c->debug = debugfs_create_dir(name, bch_debug);
        if (IS_ERR_OR_NULL(c->debug))
                return;
index d97c3b22c3617deac8d18d21e1733f41e8565017..ebf0f101a73fd57c6070da573341e821b8700d96 100644 (file)
@@ -23,34 +23,13 @@ unsigned bch_dirent_name_bytes(struct bkey_s_c_dirent d)
 static u64 bch_dirent_hash(const struct bch_hash_info *info,
                           const struct qstr *name)
 {
-       switch (info->type) {
-       case BCH_STR_HASH_SHA1: {
-               SHASH_DESC_ON_STACK(desc, bch_sha1);
-               u8 digest[SHA1_DIGEST_SIZE];
-               u64 ret;
-               desc->tfm = bch_sha1;
-               desc->flags = 0;
-               crypto_shash_init(desc);
-
-               crypto_shash_update(desc, (void *) &info->seed, sizeof(info->seed));
-
-               crypto_shash_update(desc, (void *) name->name, name->len);
-               crypto_shash_final(desc, digest);
-               memcpy(&ret, &digest, sizeof(ret));
-               return max_t(u64, ret >> 1, 2);
-       }
-       default: {
-               struct bch_str_hash_ctx ctx;
-
-               bch_str_hash_init(&ctx, info->type);
-               bch_str_hash_update(&ctx, info->type, &info->seed, sizeof(info->seed));
+       struct bch_str_hash_ctx ctx;
 
-               bch_str_hash_update(&ctx, info->type, name->name, name->len);
+       bch_str_hash_init(&ctx, info);
+       bch_str_hash_update(&ctx, info, name->name, name->len);
 
-               /* [0,2) reserved for dots */
-               return max_t(u64, bch_str_hash_end(&ctx, info->type), 2);
-       }
-       }
+       /* [0,2) reserved for dots */
+       return max_t(u64, bch_str_hash_end(&ctx, info), 2);
 }
 
 static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key)
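
The rewrite above folds the old SHA1 special case into the uniform bch_str_hash_* API: the hash is seeded from the inode's hash info and the result clamped so it never lands in [0,2), the range reserved for "." and "..". The same shape in userspace, using libsodium's SipHash-2-4 as the keyed hash (illustrative; not necessarily the hash the filesystem picks):

        #include <sodium.h>
        #include <stdint.h>
        #include <string.h>

        static uint64_t name_hash(const char *name, size_t len,
                                  const unsigned char seed[crypto_shorthash_KEYBYTES])
        {
                unsigned char out[crypto_shorthash_BYTES];      /* 8 bytes */
                uint64_t h;

                crypto_shorthash(out, (const unsigned char *) name, len, seed);
                memcpy(&h, out, sizeof(h));

                return h < 2 ? 2 : h;   /* [0,2) reserved for dots */
        }
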
index c026d59150f8badb3dfb30ce84eb86b4ab56c889..4b8a26658e0f1b382a19c803ed42fd0b0a0d4d46 100644 (file)
@@ -9,19 +9,19 @@
 #include "bkey_methods.h"
 #include "btree_gc.h"
 #include "btree_update.h"
+#include "checksum.h"
 #include "debug.h"
 #include "dirent.h"
 #include "error.h"
 #include "extents.h"
 #include "inode.h"
 #include "journal.h"
-#include "super.h"
+#include "super-io.h"
 #include "writeback.h"
 #include "xattr.h"
 
 #include <trace/events/bcache.h>
 
-static bool __bch_extent_normalize(struct cache_set *, struct bkey_s, bool);
 static enum merge_result bch_extent_merge(struct cache_set *, struct btree *,
                                          struct bkey_i *, struct bkey_i *);
 
@@ -120,21 +120,38 @@ bch_extent_has_device(struct bkey_s_c_extent e, unsigned dev)
        return NULL;
 }
 
-unsigned bch_extent_nr_ptrs_from(struct bkey_s_c_extent e,
-                                const struct bch_extent_ptr *start)
+unsigned bch_extent_nr_ptrs(struct bkey_s_c_extent e)
 {
        const struct bch_extent_ptr *ptr;
        unsigned nr_ptrs = 0;
 
-       extent_for_each_ptr_from(e, ptr, start)
+       extent_for_each_ptr(e, ptr)
                nr_ptrs++;
 
        return nr_ptrs;
 }
 
-unsigned bch_extent_nr_ptrs(struct bkey_s_c_extent e)
+unsigned bch_extent_nr_dirty_ptrs(struct bkey_s_c k)
 {
-       return bch_extent_nr_ptrs_from(e, &e.v->start->ptr);
+       struct bkey_s_c_extent e;
+       const struct bch_extent_ptr *ptr;
+       unsigned nr_ptrs = 0;
+
+       switch (k.k->type) {
+       case BCH_EXTENT:
+       case BCH_EXTENT_CACHED:
+               e = bkey_s_c_to_extent(k);
+
+               extent_for_each_ptr(e, ptr)
+                       nr_ptrs += !ptr->cached;
+               break;
+
+       case BCH_RESERVATION:
+               nr_ptrs = bkey_s_c_to_reservation(k).v->nr_replicas;
+               break;
+       }
+
+       return nr_ptrs;
 }
 
 /* returns true if equal */
@@ -177,16 +194,19 @@ void bch_extent_crc_narrow_pointers(struct bkey_s_extent e, union bch_extent_crc
  *
  * and then verify that crc_dead1 + crc_live + crc_dead2 == orig_crc, and then
  * use crc_live here (that we verified was correct earlier)
+ *
+ * note: doesn't work with encryption
  */
 void bch_extent_narrow_crcs(struct bkey_s_extent e)
 {
        union bch_extent_crc *crc;
        bool have_wide = false, have_narrow = false;
-       u64 csum = 0;
+       struct bch_csum csum = { 0 };
        unsigned csum_type = 0;
 
        extent_for_each_crc(e, crc) {
-               if (crc_compression_type(crc))
+               if (crc_compression_type(crc) ||
+                   bch_csum_type_is_encryption(crc_csum_type(crc)))
                        continue;
 
                if (crc_uncompressed_size(e.k, crc) != e.k->size) {
@@ -210,26 +230,38 @@ void bch_extent_narrow_crcs(struct bkey_s_extent e)
                        case BCH_EXTENT_CRC_NONE:
                                BUG();
                        case BCH_EXTENT_CRC32:
-                               if (bch_crc_size[csum_type] > sizeof(crc->crc32.csum))
+                               if (bch_crc_bytes[csum_type] > 4)
                                        continue;
 
                                bch_extent_crc_narrow_pointers(e, crc);
-                               crc->crc32.compressed_size      = e.k->size;
-                               crc->crc32.uncompressed_size    = e.k->size;
+                               crc->crc32._compressed_size     = e.k->size - 1;
+                               crc->crc32._uncompressed_size   = e.k->size - 1;
                                crc->crc32.offset               = 0;
                                crc->crc32.csum_type            = csum_type;
-                               crc->crc32.csum                 = csum;
+                               crc->crc32.csum                 = csum.lo;
                                break;
                        case BCH_EXTENT_CRC64:
-                               if (bch_crc_size[csum_type] > sizeof(crc->crc64.csum))
+                               if (bch_crc_bytes[csum_type] > 10)
                                        continue;
 
                                bch_extent_crc_narrow_pointers(e, crc);
-                               crc->crc64.compressed_size      = e.k->size;
-                               crc->crc64.uncompressed_size    = e.k->size;
+                               crc->crc64._compressed_size     = e.k->size - 1;
+                               crc->crc64._uncompressed_size   = e.k->size - 1;
                                crc->crc64.offset               = 0;
                                crc->crc64.csum_type            = csum_type;
-                               crc->crc64.csum                 = csum;
+                               crc->crc64.csum_lo              = csum.lo;
+                               crc->crc64.csum_hi              = csum.hi;
+                               break;
+                       case BCH_EXTENT_CRC128:
+                               if (bch_crc_bytes[csum_type] > 16)
+                                       continue;
+
+                               bch_extent_crc_narrow_pointers(e, crc);
+                               crc->crc128._compressed_size    = e.k->size - 1;
+                               crc->crc128._uncompressed_size  = e.k->size - 1;
+                               crc->crc128.offset              = 0;
+                               crc->crc128.csum_type           = csum_type;
+                               crc->crc128.csum                = csum;
                                break;
                        }
                }
@@ -300,13 +332,8 @@ static void bch_extent_drop_stale(struct cache_set *c, struct bkey_s_extent e)
        struct bch_extent_ptr *ptr = &e.v->start->ptr;
        bool dropped = false;
 
-       /*
-        * We don't want to change which pointers are considered cached/dirty,
-        * so don't remove pointers that are considered dirty:
-        */
        rcu_read_lock();
-       while ((ptr = extent_ptr_next(e, ptr)) &&
-              !bch_extent_ptr_is_dirty(c, e.c, ptr))
+       while ((ptr = extent_ptr_next(e, ptr)))
                if (should_drop_ptr(c, e.c, ptr)) {
                        __bch_extent_drop_ptr(e, ptr);
                        dropped = true;
@@ -321,16 +348,43 @@ static void bch_extent_drop_stale(struct cache_set *c, struct bkey_s_extent e)
 static bool bch_ptr_normalize(struct cache_set *c, struct btree *bk,
                              struct bkey_s k)
 {
-       return __bch_extent_normalize(c, k, false);
+       return bch_extent_normalize(c, k);
 }
 
 static void bch_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
 {
-       u64 *d = (u64 *) bkeyp_val(f, k);
-       unsigned i;
+       switch (k->type) {
+       case BCH_EXTENT:
+       case BCH_EXTENT_CACHED: {
+               union bch_extent_entry *entry;
+               u64 *d = (u64 *) bkeyp_val(f, k);
+               unsigned i;
 
-       for (i = 0; i < bkeyp_val_u64s(f, k); i++)
-               d[i] = swab64(d[i]);
+               for (i = 0; i < bkeyp_val_u64s(f, k); i++)
+                       d[i] = swab64(d[i]);
+
+               for (entry = (union bch_extent_entry *) d;
+                    entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k));
+                    entry = extent_entry_next(entry)) {
+                       switch (extent_entry_type(entry)) {
+                       case BCH_EXTENT_ENTRY_crc32:
+                               entry->crc32.csum = swab32(entry->crc32.csum);
+                               break;
+                       case BCH_EXTENT_ENTRY_crc64:
+                               entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
+                               entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
+                               break;
+                       case BCH_EXTENT_ENTRY_crc128:
+                               entry->crc128.csum.hi = swab64(entry->crc128.csum.hi);
+                               entry->crc128.csum.lo = swab64(entry->crc128.csum.lo);
+                               break;
+                       case BCH_EXTENT_ENTRY_ptr:
+                               break;
+                       }
+               }
+               break;
+       }
+       }
 }
 
 static const char *extent_ptr_invalid(struct bkey_s_c_extent e,
@@ -341,7 +395,7 @@ static const char *extent_ptr_invalid(struct bkey_s_c_extent e,
        const struct bch_extent_ptr *ptr2;
        const struct cache_member_cpu *m = mi->m + ptr->dev;
 
-       if (ptr->dev > mi->nr_in_set || !m->valid)
+       if (ptr->dev >= mi->nr_devices || !m->valid)
                return "pointer to invalid device";
 
        extent_for_each_ptr(e, ptr2)
@@ -380,7 +434,9 @@ static size_t extent_print_ptrs(struct cache_set *c, char *buf,
                switch (__extent_entry_type(entry)) {
                case BCH_EXTENT_ENTRY_crc32:
                case BCH_EXTENT_ENTRY_crc64:
+               case BCH_EXTENT_ENTRY_crc128:
                        crc = entry_to_crc(entry);
+
                        p("crc: c_size %u size %u offset %u csum %u compress %u",
                          crc_compressed_size(e.k, crc),
                          crc_uncompressed_size(e.k, crc),
@@ -388,7 +444,8 @@ static size_t extent_print_ptrs(struct cache_set *c, char *buf,
                          crc_compression_type(crc));
                        break;
                case BCH_EXTENT_ENTRY_ptr:
-                       ptr = &entry->ptr;
+                       ptr = entry_to_ptr(entry);
+
                        p("ptr: %u:%llu gen %u%s", ptr->dev,
                          (u64) ptr->offset, ptr->gen,
                          (ca = PTR_CACHE(c, ptr)) && ptr_stale(ca, ptr)
@@ -621,6 +678,10 @@ static bool __bch_cut_front(struct bpos where, struct bkey_s k)
                                if (prev_crc != crc)
                                        crc->crc64.offset += e.k->size - len;
                                break;
+                       case BCH_EXTENT_CRC128:
+                               if (prev_crc != crc)
+                                       crc->crc128.offset += e.k->size - len;
+                               break;
                        }
                        prev_crc = crc;
                }
@@ -948,7 +1009,7 @@ static bool bch_extent_cmpxchg_cmp(struct bkey_s_c l, struct bkey_s_c r)
        BUG_ON(!l.k->size || !r.k->size);
 
        if (l.k->type != r.k->type ||
-           l.k->version != r.k->version)
+           bversion_cmp(l.k->version, r.k->version))
                return false;
 
        switch (l.k->type) {
@@ -985,7 +1046,7 @@ static bool bch_extent_cmpxchg_cmp(struct bkey_s_c l, struct bkey_s_c r)
 
                extent_for_each_ptr(le, lp) {
                        const union bch_extent_entry *entry =
-                               bkey_idx(re.v, (u64 *) lp - le.v->_data);
+                               vstruct_idx(re.v, (u64 *) lp - le.v->_data);
 
                        if (!extent_entry_is_ptr(entry))
                                return false;
@@ -1142,7 +1203,7 @@ static void extent_insert_committed(struct extent_insert_state *s)
 
        if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY) &&
            bkey_cmp(s->committed, insert->k.p) &&
-           bkey_extent_is_compressed(c, bkey_i_to_s_c(insert))) {
+           bkey_extent_is_compressed(bkey_i_to_s_c(insert))) {
                /* XXX: possibly need to increase our reservation? */
                bch_cut_subtract_back(s, s->committed,
                                      bkey_i_to_s(&split.k));
@@ -1178,12 +1239,19 @@ __extent_insert_advance_pos(struct extent_insert_state *s,
 {
        struct extent_insert_hook *hook = s->trans->hook;
        enum extent_insert_hook_ret ret;
-
+#if 0
+       /*
+        * Currently disabled for encryption - broken with fcollapse. Will have
+        * to reenable when versions are exposed for send/receive - versions
+        * will have to be monotonic then:
+        */
        if (k.k && k.k->size &&
-           s->insert->k->k.version &&
-           k.k->version > s->insert->k->k.version)
+           !bversion_zero(s->insert->k->k.version) &&
+           bversion_cmp(k.k->version, s->insert->k->k.version) > 0) {
                ret = BTREE_HOOK_NO_INSERT;
-       else if (hook)
+       } else
+#endif
+       if (hook)
                ret = hook->fn(hook, s->committed, next_pos, k, s->insert->k);
        else
                ret = BTREE_HOOK_DO_INSERT;
@@ -1257,7 +1325,7 @@ extent_insert_check_split_compressed(struct extent_insert_state *s,
        unsigned sectors;
 
        if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
-           (sectors = bkey_extent_is_compressed(c, k))) {
+           (sectors = bkey_extent_is_compressed(k))) {
                int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD;
 
                if (s->trans->flags & BTREE_INSERT_NOFAIL)
@@ -1680,6 +1748,7 @@ static const char *bch_extent_invalid(const struct cache_set *c,
                struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
                const union bch_extent_entry *entry;
                const union bch_extent_crc *crc;
+               const struct bch_extent_ptr *ptr;
                struct cache_member_rcu *mi = cache_member_info_get(c);
                unsigned size_ondisk = e.k->size;
                const char *reason;
@@ -1689,9 +1758,7 @@ static const char *bch_extent_invalid(const struct cache_set *c,
                        if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
                                goto invalid;
 
-                       switch (extent_entry_type(entry)) {
-                       case BCH_EXTENT_ENTRY_crc32:
-                       case BCH_EXTENT_ENTRY_crc64:
+                       if (extent_entry_is_crc(entry)) {
                                crc = entry_to_crc(entry);
 
                                reason = "checksum offset + key size > uncompressed size";
@@ -1702,19 +1769,19 @@ static const char *bch_extent_invalid(const struct cache_set *c,
                                size_ondisk = crc_compressed_size(e.k, crc);
 
                                reason = "invalid checksum type";
-                               if (crc_csum_type(crc) >= BCH_CSUM_NR)
+                               if (!bch_checksum_type_valid(c, crc_csum_type(crc)))
                                        goto invalid;
 
                                reason = "invalid compression type";
                                if (crc_compression_type(crc) >= BCH_COMPRESSION_NR)
                                        goto invalid;
-                               break;
-                       case BCH_EXTENT_ENTRY_ptr:
+                       } else {
+                               ptr = entry_to_ptr(entry);
+
                                reason = extent_ptr_invalid(e, mi,
                                                &entry->ptr, size_ondisk);
                                if (reason)
                                        goto invalid;
-                               break;
                        }
                }
 
@@ -1725,8 +1792,17 @@ invalid:
                return reason;
        }
 
-       case BCH_RESERVATION:
+       case BCH_RESERVATION: {
+               struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
+
+               if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation))
+                       return "incorrect value size";
+
+               if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX)
+                       return "invalid nr_replicas";
+
                return NULL;
+       }
 
        default:
                return "invalid value type";
@@ -1743,7 +1819,7 @@ static void bch_extent_debugcheck_extent(struct cache_set *c, struct btree *b,
        unsigned seq, stale;
        char buf[160];
        bool bad;
-       unsigned ptrs_per_tier[CACHE_TIERS];
+       unsigned ptrs_per_tier[BCH_TIER_MAX];
        unsigned tier, replicas = 0;
 
        /*
@@ -1760,11 +1836,9 @@ static void bch_extent_debugcheck_extent(struct cache_set *c, struct btree *b,
        mi = cache_member_info_get(c);
 
        extent_for_each_ptr(e, ptr) {
-               bool dirty = bch_extent_ptr_is_dirty(c, e, ptr);
-
                replicas++;
 
-               if (ptr->dev >= mi->nr_in_set)
+               if (ptr->dev >= mi->nr_devices)
                        goto bad_device;
 
                /*
@@ -1796,7 +1870,7 @@ static void bch_extent_debugcheck_extent(struct cache_set *c, struct btree *b,
 
                                stale = ptr_stale(ca, ptr);
 
-                               cache_set_bug_on(stale && dirty, c,
+                               cache_set_bug_on(stale && !ptr->cached, c,
                                                 "stale dirty pointer");
 
                                cache_set_bug_on(stale > 96, c,
@@ -1809,9 +1883,9 @@ static void bch_extent_debugcheck_extent(struct cache_set *c, struct btree *b,
                                bad = (mark.is_metadata ||
                                       (gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
                                        !mark.owned_by_allocator &&
-                                       !(dirty
-                                         ? mark.dirty_sectors
-                                         : mark.cached_sectors)));
+                                       !(ptr->cached
+                                         ? mark.cached_sectors
+                                         : mark.dirty_sectors)));
                        } while (read_seqcount_retry(&c->gc_pos_lock, seq));
 
                        if (bad)
@@ -1869,6 +1943,7 @@ static void bch_extent_debugcheck(struct cache_set *c, struct btree *b,
        case BCH_EXTENT:
        case BCH_EXTENT_CACHED:
                bch_extent_debugcheck_extent(c, b, bkey_s_c_to_extent(k));
+               break;
        case BCH_RESERVATION:
                break;
        default:
@@ -1896,69 +1971,77 @@ static void bch_extent_to_text(struct cache_set *c, char *buf,
 static unsigned PTR_TIER(struct cache_member_rcu *mi,
                         const struct bch_extent_ptr *ptr)
 {
-       return ptr->dev < mi->nr_in_set
+       return ptr->dev < mi->nr_devices
                ? mi->m[ptr->dev].tier
                : UINT_MAX;
 }
 
-void bch_extent_entry_append(struct bkey_i_extent *e,
-                            union bch_extent_entry *entry)
-{
-       BUG_ON(bkey_val_u64s(&e->k) + extent_entry_u64s(entry) >
-              BKEY_EXTENT_VAL_U64s_MAX);
-
-       memcpy_u64s(extent_entry_last(extent_i_to_s(e)),
-                   entry,
-                   extent_entry_u64s(entry));
-       e->k.u64s += extent_entry_u64s(entry);
-}
-
-const unsigned bch_crc_size[] = {
-       [BCH_CSUM_NONE]                 = 0,
-       [BCH_CSUM_CRC32C]               = 4,
-       [BCH_CSUM_CRC64]                = 8,
-};
-
 static void bch_extent_crc_init(union bch_extent_crc *crc,
                                unsigned compressed_size,
                                unsigned uncompressed_size,
                                unsigned compression_type,
-                               u64 csum, unsigned csum_type)
+                               unsigned nonce,
+                               struct bch_csum csum, unsigned csum_type)
 {
-       if (bch_crc_size[csum_type] <= 4 &&
-           uncompressed_size <= CRC32_EXTENT_SIZE_MAX) {
+       if (bch_crc_bytes[csum_type]    <= 4 &&
+           uncompressed_size           <= CRC32_SIZE_MAX &&
+           nonce                       <= CRC32_NONCE_MAX) {
                crc->crc32 = (struct bch_extent_crc32) {
                        .type = 1 << BCH_EXTENT_ENTRY_crc32,
-                       .compressed_size        = compressed_size,
-                       .uncompressed_size      = uncompressed_size,
+                       ._compressed_size       = compressed_size - 1,
+                       ._uncompressed_size     = uncompressed_size - 1,
                        .offset                 = 0,
                        .compression_type       = compression_type,
                        .csum_type              = csum_type,
-                       .csum                   = csum,
+                       .csum                   = *((__le32 *) &csum.lo),
                };
-       } else {
-               BUG_ON(uncompressed_size > CRC64_EXTENT_SIZE_MAX);
+               return;
+       }
 
+       if (bch_crc_bytes[csum_type]    <= 10 &&
+           uncompressed_size           <= CRC64_SIZE_MAX &&
+           nonce                       <= CRC64_NONCE_MAX) {
                crc->crc64 = (struct bch_extent_crc64) {
                        .type = 1 << BCH_EXTENT_ENTRY_crc64,
-                       .compressed_size        = compressed_size,
-                       .uncompressed_size      = uncompressed_size,
+                       ._compressed_size       = compressed_size - 1,
+                       ._uncompressed_size     = uncompressed_size - 1,
+                       .offset                 = 0,
+                       .nonce                  = nonce,
+                       .compression_type       = compression_type,
+                       .csum_type              = csum_type,
+                       .csum_lo                = csum.lo,
+                       .csum_hi                = *((__le16 *) &csum.hi),
+               };
+               return;
+       }
+
+       if (bch_crc_bytes[csum_type]    <= 16 &&
+           uncompressed_size           <= CRC128_SIZE_MAX &&
+           nonce                       <= CRC128_NONCE_MAX) {
+               crc->crc128 = (struct bch_extent_crc128) {
+                       .type = 1 << BCH_EXTENT_ENTRY_crc128,
+                       ._compressed_size       = compressed_size - 1,
+                       ._uncompressed_size     = uncompressed_size - 1,
                        .offset                 = 0,
+                       .nonce                  = nonce,
                        .compression_type       = compression_type,
                        .csum_type              = csum_type,
                        .csum                   = csum,
                };
+               return;
        }
+
+       BUG();
 }
 
 void bch_extent_crc_append(struct bkey_i_extent *e,
                           unsigned compressed_size,
                           unsigned uncompressed_size,
                           unsigned compression_type,
-                          u64 csum, unsigned csum_type)
+                          unsigned nonce,
+                          struct bch_csum csum, unsigned csum_type)
 {
        union bch_extent_crc *crc;
-       union bch_extent_crc new;
 
        BUG_ON(compressed_size > uncompressed_size);
        BUG_ON(uncompressed_size != e->k.size);
@@ -1971,123 +2054,26 @@ void bch_extent_crc_append(struct bkey_i_extent *e,
        extent_for_each_crc(extent_i_to_s(e), crc)
                ;
 
-       switch (extent_crc_type(crc)) {
-       case BCH_EXTENT_CRC_NONE:
-               if (!csum_type && !compression_type)
-                       return;
-               break;
-       case BCH_EXTENT_CRC32:
-       case BCH_EXTENT_CRC64:
-               if (crc_compressed_size(&e->k, crc)     == compressed_size &&
-                   crc_uncompressed_size(&e->k, crc)   == uncompressed_size &&
-                   crc_offset(crc)                     == 0 &&
-                   crc_compression_type(crc)           == compression_type &&
-                   crc_csum_type(crc)                  == csum_type &&
-                   crc_csum(crc)                       == csum)
-                       return;
-               break;
-       }
+       if (!crc && !csum_type && !compression_type)
+               return;
+
+       if (crc &&
+           crc_compressed_size(&e->k, crc)     == compressed_size &&
+           crc_uncompressed_size(&e->k, crc)   == uncompressed_size &&
+           crc_offset(crc)                     == 0 &&
+           crc_nonce(crc)                      == nonce &&
+           crc_csum_type(crc)                  == csum_type &&
+           crc_compression_type(crc)           == compression_type &&
+           crc_csum(crc).lo                    == csum.lo &&
+           crc_csum(crc).hi                    == csum.hi)
+               return;
 
-       bch_extent_crc_init(&new,
+       bch_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)),
                            compressed_size,
                            uncompressed_size,
                            compression_type,
-                           csum, csum_type);
-       bch_extent_entry_append(e, to_entry(&new));
-}
-
-static void __extent_sort_ptrs(struct cache_member_rcu *mi,
-                              struct bkey_s_extent src)
-{
-       struct bch_extent_ptr *src_ptr, *dst_ptr;
-       union bch_extent_crc *src_crc, *dst_crc;
-       union bch_extent_crc _src;
-       BKEY_PADDED(k) tmp;
-       struct bkey_s_extent dst;
-       size_t u64s, crc_u64s;
-       u64 *p;
-
-       /*
-        * Insertion sort:
-        *
-        * Note: this sort needs to be stable, because pointer order determines
-        * pointer dirtyness.
-        */
-
-       tmp.k.k = *src.k;
-       dst = bkey_i_to_s_extent(&tmp.k);
-       set_bkey_val_u64s(dst.k, 0);
-
-       extent_for_each_ptr_crc(src, src_ptr, src_crc) {
-               extent_for_each_ptr_crc(dst, dst_ptr, dst_crc)
-                       if (PTR_TIER(mi, src_ptr) < PTR_TIER(mi, dst_ptr))
-                               goto found;
-
-               dst_ptr = &extent_entry_last(dst)->ptr;
-               dst_crc = NULL;
-found:
-               /* found insert position: */
-
-               /*
-                * we're making sure everything has a crc at this point, if
-                * dst_ptr points to a pointer it better have a crc:
-                */
-               BUG_ON(dst_ptr != &extent_entry_last(dst)->ptr && !dst_crc);
-               BUG_ON(dst_crc &&
-                      (extent_entry_next(to_entry(dst_crc)) !=
-                       to_entry(dst_ptr)));
-
-               if (!src_crc) {
-                       bch_extent_crc_init(&_src, src.k->size,
-                                           src.k->size, 0, 0, 0);
-                       src_crc = &_src;
-               }
-
-               p = dst_ptr != &extent_entry_last(dst)->ptr
-                       ? (void *) dst_crc
-                       : (void *) dst_ptr;
-
-               crc_u64s = extent_entry_u64s(to_entry(src_crc));
-               u64s = crc_u64s + sizeof(*dst_ptr) / sizeof(u64);
-
-               memmove_u64s_up(p + u64s, p,
-                               (u64 *) extent_entry_last(dst) - (u64 *) p);
-               set_bkey_val_u64s(dst.k, bkey_val_u64s(dst.k) + u64s);
-
-               memcpy_u64s(p, src_crc, crc_u64s);
-               memcpy_u64s(p + crc_u64s, src_ptr,
-                           sizeof(*src_ptr) / sizeof(u64));
-       }
-
-       /* Sort done - now drop redundant crc entries: */
-       bch_extent_drop_redundant_crcs(dst);
-
-       memcpy_u64s(src.v, dst.v, bkey_val_u64s(dst.k));
-       set_bkey_val_u64s(src.k, bkey_val_u64s(dst.k));
-}
-
-static void extent_sort_ptrs(struct cache_set *c, struct bkey_s_extent e)
-{
-       struct cache_member_rcu *mi;
-       struct bch_extent_ptr *ptr, *prev = NULL;
-       union bch_extent_crc *crc;
-
-       /*
-        * First check if any pointers are out of order before doing the actual
-        * sort:
-        */
-       mi = cache_member_info_get(c);
-
-       extent_for_each_ptr_crc(e, ptr, crc) {
-               if (prev &&
-                   PTR_TIER(mi, ptr) < PTR_TIER(mi, prev)) {
-                       __extent_sort_ptrs(mi, e);
-                       break;
-               }
-               prev = ptr;
-       }
-
-       cache_member_info_put();
+                           nonce, csum, csum_type);
+       __extent_entry_push(e);
 }
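
[The rewritten bch_extent_crc_append() walks to the last crc entry and returns early when every field matches, so consecutive pointers appended with identical checksum state share a single entry. The same reuse-the-tail pattern on a toy structure, for illustration only:

    #include <string.h>

    struct crc_params {
            unsigned csum_type, compression_type, nonce;
    };

    struct toy_extent {
            struct crc_params crcs[8];
            unsigned          nr;
    };

    static void toy_crc_append(struct toy_extent *e, struct crc_params p)
    {
            /* reuse the most recent entry if it already matches: */
            if (e->nr && !memcmp(&e->crcs[e->nr - 1], &p, sizeof(p)))
                    return;
            e->crcs[e->nr++] = p;
    }
]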
 
 /*
@@ -2098,8 +2084,7 @@ static void extent_sort_ptrs(struct cache_set *c, struct bkey_s_extent e)
  * For existing keys, only called when btree nodes are being rewritten, not when
  * they're merely being compacted/resorted in memory.
  */
-static bool __bch_extent_normalize(struct cache_set *c, struct bkey_s k,
-                                  bool sort)
+bool bch_extent_normalize(struct cache_set *c, struct bkey_s k)
 {
        struct bkey_s_extent e;
 
@@ -2112,7 +2097,7 @@ static bool __bch_extent_normalize(struct cache_set *c, struct bkey_s k,
                return true;
 
        case KEY_TYPE_DISCARD:
-               return !k.k->version;
+               return bversion_zero(k.k->version);
 
        case BCH_EXTENT:
        case BCH_EXTENT_CACHED:
@@ -2120,13 +2105,10 @@ static bool __bch_extent_normalize(struct cache_set *c, struct bkey_s k,
 
                bch_extent_drop_stale(c, e);
 
-               if (sort)
-                       extent_sort_ptrs(c, e);
-
                if (!bkey_val_u64s(e.k)) {
                        if (bkey_extent_is_cached(e.k)) {
                                k.k->type = KEY_TYPE_DISCARD;
-                               if (!k.k->version)
+                               if (bversion_zero(k.k->version))
                                        return true;
                        } else {
                                k.k->type = KEY_TYPE_ERROR;
@@ -2141,9 +2123,40 @@ static bool __bch_extent_normalize(struct cache_set *c, struct bkey_s k,
        }
 }
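
[Key versions are no longer compared as plain integers — note bversion_zero() above and bversion_cmp() in the merge path below — so struct bversion is presumably a two-word value now. A sketch of what the helpers would look like, with field names and widths assumed:

    #include <stdbool.h>
    #include <stdint.h>

    struct bversion { uint32_t hi; uint64_t lo; };  /* layout assumed */

    static inline bool bversion_zero(struct bversion v)
    {
            return !v.hi && !v.lo;
    }

    static inline int bversion_cmp(struct bversion l, struct bversion r)
    {
            return  l.hi != r.hi ? (l.hi < r.hi ? -1 : 1) :
                    l.lo != r.lo ? (l.lo < r.lo ? -1 : 1) : 0;
    }
]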
 
-bool bch_extent_normalize(struct cache_set *c, struct bkey_s k)
+void bch_extent_mark_replicas_cached(struct cache_set *c,
+                                    struct bkey_s_extent e,
+                                    unsigned nr_cached)
 {
-       return __bch_extent_normalize(c, k, true);
+       struct bch_extent_ptr *ptr;
+       struct cache_member_rcu *mi;
+       bool have_higher_tier;
+       unsigned tier = 0;
+
+       if (!nr_cached)
+               return;
+
+       mi = cache_member_info_get(c);
+
+       do {
+               have_higher_tier = false;
+
+               extent_for_each_ptr(e, ptr) {
+                       if (!ptr->cached &&
+                           PTR_TIER(mi, ptr) == tier) {
+                               ptr->cached = true;
+                               nr_cached--;
+                               if (!nr_cached)
+                                       goto out;
+                       }
+
+                       if (PTR_TIER(mi, ptr) > tier)
+                               have_higher_tier = true;
+               }
+
+               tier++;
+       } while (have_higher_tier);
+out:
+       cache_member_info_put();
 }
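
[bch_extent_mark_replicas_cached() demotes nr_cached of an extent's dirty pointers to cached status, lowest tier first, so the copies that stay dirty are the ones on the higher (backing) tiers. The loop, restated on a toy representation where each pointer is just a tier number:

    #include <stdbool.h>

    static void mark_cached(const unsigned *tier, bool *cached,
                            unsigned nr_ptrs, unsigned nr_cached)
    {
            unsigned t, i;
            bool have_higher_tier;

            for (t = 0; nr_cached; t++) {
                    have_higher_tier = false;

                    for (i = 0; i < nr_ptrs; i++) {
                            if (!cached[i] && tier[i] == t) {
                                    cached[i] = true;
                                    if (!--nr_cached)
                                            return;
                            }
                            if (tier[i] > t)
                                    have_higher_tier = true;
                    }
                    if (!have_higher_tier)
                            return;
            }
    }
]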
 
 /*
@@ -2183,7 +2196,7 @@ void bch_extent_pick_ptr_avoiding(struct cache_set *c, struct bkey_s_c k,
                extent_for_each_online_device_crc(c, e, crc, ptr, ca)
                        if (!ptr_stale(ca, ptr)) {
                                *ret = (struct extent_pick_ptr) {
-                                       .crc = crc_to_64(e.k, crc),
+                                       .crc = crc_to_128(e.k, crc),
                                        .ptr = *ptr,
                                        .ca = ca,
                                };
@@ -2227,7 +2240,7 @@ static enum merge_result bch_extent_merge(struct cache_set *c,
 
        if (l->k.u64s           != r->k.u64s ||
            l->k.type           != r->k.type ||
-           l->k.version        != r->k.version ||
+           bversion_cmp(l->k.version, r->k.version) ||
            bkey_cmp(l->k.p, bkey_start_pos(&r->k)))
                return BCH_MERGE_NOMERGE;
 
@@ -2235,7 +2248,6 @@ static enum merge_result bch_extent_merge(struct cache_set *c,
        case KEY_TYPE_DELETED:
        case KEY_TYPE_DISCARD:
        case KEY_TYPE_ERROR:
-       case BCH_RESERVATION:
                /* These types are mergeable, and no val to check */
                break;
 
@@ -2248,7 +2260,7 @@ static enum merge_result bch_extent_merge(struct cache_set *c,
                        struct bch_extent_ptr *lp, *rp;
                        struct cache_member_cpu *m;
 
-                       en_r = bkey_idx(er.v, (u64 *) en_l - el.v->_data);
+                       en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data);
 
                        if ((extent_entry_type(en_l) !=
                             extent_entry_type(en_r)) ||
@@ -2276,6 +2288,15 @@ static enum merge_result bch_extent_merge(struct cache_set *c,
                }
 
                break;
+       case BCH_RESERVATION: {
+               struct bkey_i_reservation *li = bkey_i_to_reservation(l);
+               struct bkey_i_reservation *ri = bkey_i_to_reservation(r);
+
+               if (li->v.generation != ri->v.generation ||
+                   li->v.nr_replicas != ri->v.nr_replicas)
+                       return BCH_MERGE_NOMERGE;
+               break;
+       }
        default:
                return BCH_MERGE_NOMERGE;
        }
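
[BCH_RESERVATION keys get their own merge rule: a merged key can only record one generation and one replica count, so adjacent reservations merge only when both agree. As a standalone predicate, with field widths assumed:

    #include <stdbool.h>

    struct reservation_v { unsigned generation, nr_replicas; };

    static bool reservations_mergeable(struct reservation_v l,
                                       struct reservation_v r)
    {
            return l.generation  == r.generation &&
                   l.nr_replicas == r.nr_replicas;
    }
]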
index e1cb47ab1fe8c54a2538188030b48b9b09924728..b0a054226ade387c6cad69d33e368df6af058119 100644 (file)
--- a/libbcache/extents.h
+++ b/libbcache/extents.h
@@ -26,7 +26,7 @@ struct cache_set;
 struct journal_res;
 
 struct extent_pick_ptr {
-       struct bch_extent_crc64         crc;
+       struct bch_extent_crc128        crc;
        struct bch_extent_ptr           ptr;
        struct cache                    *ca;
 };
@@ -53,10 +53,11 @@ bch_insert_fixup_extent(struct btree_insert *,
                        struct btree_insert_entry *);
 
 bool bch_extent_normalize(struct cache_set *, struct bkey_s);
+void bch_extent_mark_replicas_cached(struct cache_set *,
+                                    struct bkey_s_extent, unsigned);
 
-unsigned bch_extent_nr_ptrs_from(struct bkey_s_c_extent,
-                                const struct bch_extent_ptr *);
 unsigned bch_extent_nr_ptrs(struct bkey_s_c_extent);
+unsigned bch_extent_nr_dirty_ptrs(struct bkey_s_c);
 
 static inline bool bkey_extent_is_data(const struct bkey *k)
 {
@@ -117,6 +118,8 @@ static inline size_t extent_entry_bytes(const union bch_extent_entry *entry)
                return sizeof(struct bch_extent_crc32);
        case BCH_EXTENT_ENTRY_crc64:
                return sizeof(struct bch_extent_crc64);
+       case BCH_EXTENT_ENTRY_crc128:
+               return sizeof(struct bch_extent_crc128);
        case BCH_EXTENT_ENTRY_ptr:
                return sizeof(struct bch_extent_ptr);
        default:
@@ -143,6 +146,7 @@ union bch_extent_crc {
        u8                              type;
        struct bch_extent_crc32         crc32;
        struct bch_extent_crc64         crc64;
+       struct bch_extent_crc128        crc128;
 };
 
 /* downcast, preserves const */
@@ -185,10 +189,11 @@ enum bch_extent_crc_type {
        BCH_EXTENT_CRC_NONE,
        BCH_EXTENT_CRC32,
        BCH_EXTENT_CRC64,
+       BCH_EXTENT_CRC128,
 };
 
 static inline enum bch_extent_crc_type
-extent_crc_type(const union bch_extent_crc *crc)
+__extent_crc_type(const union bch_extent_crc *crc)
 {
        if (!crc)
                return BCH_EXTENT_CRC_NONE;
@@ -198,16 +203,31 @@ extent_crc_type(const union bch_extent_crc *crc)
                return BCH_EXTENT_CRC32;
        case BCH_EXTENT_ENTRY_crc64:
                return BCH_EXTENT_CRC64;
+       case BCH_EXTENT_ENTRY_crc128:
+               return BCH_EXTENT_CRC128;
        default:
                BUG();
        }
 }
 
+#define extent_crc_type(_crc)                                          \
+({                                                                     \
+       BUILD_BUG_ON(!type_is(_crc, struct bch_extent_crc32 *) &&       \
+                    !type_is(_crc, struct bch_extent_crc64 *) &&       \
+                    !type_is(_crc, struct bch_extent_crc128 *) &&      \
+                    !type_is(_crc, union bch_extent_crc *));           \
+                                                                       \
+         type_is(_crc, struct bch_extent_crc32 *)  ? BCH_EXTENT_CRC32  \
+       : type_is(_crc, struct bch_extent_crc64 *)  ? BCH_EXTENT_CRC64  \
+       : type_is(_crc, struct bch_extent_crc128 *) ? BCH_EXTENT_CRC128 \
+       : __extent_crc_type((union bch_extent_crc *) _crc);             \
+})
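
[Turning extent_crc_type() into a macro lets it fold to a compile-time constant when handed a concrete crc struct pointer, reserving the runtime switch in __extent_crc_type() for the union type. type_is() is presumably built on the compiler's type-compatibility builtin, roughly:

    /* assumed definition, shown for illustration: */
    #define type_is_exact(_val, _type)                              \
            __builtin_types_compatible_p(typeof(_val), _type)

    #define type_is(_val, _type)                                    \
            (type_is_exact(_val, _type) ||                          \
             type_is_exact(_val, const _type))
]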
+
 #define extent_entry_next(_entry)                                      \
        ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
 
 #define extent_entry_last(_e)                                          \
-       bkey_idx((_e).v, bkey_val_u64s((_e).k))
+       vstruct_idx((_e).v, bkey_val_u64s((_e).k))
 
 /* Iterate over all entries: */
 
@@ -283,20 +303,16 @@ out:                                                                      \
 #define extent_ptr_next(_e, _ptr)                                      \
        extent_ptr_next_filter(_e, _ptr, true)
 
-#define extent_for_each_ptr_from_filter(_e, _ptr, _start, _filter)     \
-       for ((_ptr) = (_start);                         \
+#define extent_for_each_ptr_filter(_e, _ptr, _filter)                  \
+       for ((_ptr) = &(_e).v->start->ptr;                              \
             ((_ptr) = extent_ptr_next_filter(_e, _ptr, _filter));      \
             (_ptr)++)
 
-#define extent_for_each_ptr_from(_e, _ptr, _start)                     \
-       extent_for_each_ptr_from_filter(_e, _ptr, _start, true)
-
 #define extent_for_each_ptr(_e, _ptr)                                  \
-       extent_for_each_ptr_from_filter(_e, _ptr, &(_e).v->start->ptr, true)
+       extent_for_each_ptr_filter(_e, _ptr, true)
 
 #define extent_for_each_online_device(_c, _e, _ptr, _ca)               \
-       extent_for_each_ptr_from_filter(_e, _ptr, &(_e).v->start->ptr,  \
-                                       ((_ca) = PTR_CACHE(_c, _ptr)))
+       extent_for_each_ptr_filter(_e, _ptr, ((_ca) = PTR_CACHE(_c, _ptr)))
 
 #define extent_ptr_prev(_e, _ptr)                                      \
 ({                                                                     \
@@ -321,67 +337,114 @@ out:                                                                     \
             (_ptr);                                                    \
             (_ptr) = extent_ptr_prev(_e, _ptr))
 
-void bch_extent_entry_append(struct bkey_i_extent *, union bch_extent_entry *);
 void bch_extent_crc_append(struct bkey_i_extent *, unsigned, unsigned,
-                          unsigned, u64, unsigned);
+                          unsigned, unsigned, struct bch_csum, unsigned);
+
+static inline void __extent_entry_push(struct bkey_i_extent *e)
+{
+       union bch_extent_entry *entry = extent_entry_last(extent_i_to_s(e));
+
+       EBUG_ON(bkey_val_u64s(&e->k) + extent_entry_u64s(entry) >
+               BKEY_EXTENT_VAL_U64s_MAX);
+
+       e->k.u64s += extent_entry_u64s(entry);
+}
 
 static inline void extent_ptr_append(struct bkey_i_extent *e,
                                     struct bch_extent_ptr ptr)
 {
        ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
-       bch_extent_entry_append(e, to_entry(&ptr));
+       extent_entry_last(extent_i_to_s(e))->ptr = ptr;
+       __extent_entry_push(e);
 }
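
[bch_extent_entry_append() is gone: appending now writes the new entry directly into the slot past the current last entry, then grows the key with __extent_entry_push(), which EBUG_ONs if the value would exceed BKEY_EXTENT_VAL_U64s_MAX. The same write-past-the-end-then-bump pattern on a toy key:

    #include <assert.h>

    struct toy_key {
            unsigned           u64s;    /* current value size, in u64s */
            unsigned long long val[16];
    };

    static void toy_push(struct toy_key *k, unsigned long long entry)
    {
            assert(k->u64s < 16);       /* the real code EBUG_ONs here */
            k->val[k->u64s++] = entry;
    }
]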
 
-/* XXX: inefficient */
-static inline bool bch_extent_ptr_is_dirty(const struct cache_set *c,
-                                          struct bkey_s_c_extent e,
-                                          const struct bch_extent_ptr *ptr)
+static inline struct bch_extent_crc128 crc_to_128(const struct bkey *k,
+                                                 const union bch_extent_crc *crc)
 {
-       if (bkey_extent_is_cached(e.k))
-               return false;
-
-       /* Dirty pointers come last */
-       return bch_extent_nr_ptrs_from(e, ptr) <= c->opts.data_replicas;
-}
-
-extern const unsigned bch_crc_size[];
+       EBUG_ON(!k->size);
 
-static inline struct bch_extent_crc64 crc_to_64(const struct bkey *k,
-                                               const union bch_extent_crc *crc)
-{
        switch (extent_crc_type(crc)) {
        case BCH_EXTENT_CRC_NONE:
-               return (struct bch_extent_crc64) {
-                       .compressed_size        = k->size,
-                       .uncompressed_size      = k->size,
+               return (struct bch_extent_crc128) {
+                       ._compressed_size       = k->size - 1,
+                       ._uncompressed_size     = k->size - 1,
                };
        case BCH_EXTENT_CRC32:
-               return (struct bch_extent_crc64) {
-                       .compressed_size        = crc->crc32.compressed_size,
-                       .uncompressed_size      = crc->crc32.uncompressed_size,
+               return (struct bch_extent_crc128) {
+                       .type                   = 1 << BCH_EXTENT_ENTRY_crc128,
+                       ._compressed_size       = crc->crc32._compressed_size,
+                       ._uncompressed_size     = crc->crc32._uncompressed_size,
                        .offset                 = crc->crc32.offset,
                        .csum_type              = crc->crc32.csum_type,
                        .compression_type       = crc->crc32.compression_type,
-                       .csum                   = crc->crc32.csum,
+                       .csum.lo                = crc->crc32.csum,
                };
        case BCH_EXTENT_CRC64:
-               return crc->crc64;
+               return (struct bch_extent_crc128) {
+                       .type                   = 1 << BCH_EXTENT_ENTRY_crc128,
+                       ._compressed_size       = crc->crc64._compressed_size,
+                       ._uncompressed_size     = crc->crc64._uncompressed_size,
+                       .offset                 = crc->crc64.offset,
+                       .nonce                  = crc->crc64.nonce,
+                       .csum_type              = crc->crc64.csum_type,
+                       .compression_type       = crc->crc64.compression_type,
+                       .csum.lo                = crc->crc64.csum_lo,
+                       .csum.hi                = crc->crc64.csum_hi,
+               };
+       case BCH_EXTENT_CRC128:
+               return crc->crc128;
        default:
                BUG();
        }
 }
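
[Note the bias in the size fields: _compressed_size and _uncompressed_size store the size minus one, so a full-width value still fits in the bitfield, and a zero-sector extent has no representation at all — hence the EBUG_ON(!k->size) above. The round trip used by the accessors below:

    /* sectors -> on-disk field, and back: */
    static inline unsigned size_to_field(unsigned sectors) { return sectors - 1; }
    static inline unsigned field_to_size(unsigned field)   { return field + 1; }
]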
 
-static inline unsigned crc_compressed_size(const struct bkey *k,
-                                          const union bch_extent_crc *crc)
-{
-       return crc_to_64(k, crc).compressed_size;
-}
+#define crc_compressed_size(_k, _crc)                                  \
+({                                                                     \
+       unsigned _size = 0;                                             \
+                                                                       \
+       switch (extent_crc_type(_crc)) {                                \
+       case BCH_EXTENT_CRC_NONE:                                       \
+               _size = ((const struct bkey *) (_k))->size;             \
+               break;                                                  \
+       case BCH_EXTENT_CRC32:                                          \
+               _size = ((struct bch_extent_crc32 *) _crc)              \
+                       ->_compressed_size + 1;                         \
+               break;                                                  \
+       case BCH_EXTENT_CRC64:                                          \
+               _size = ((struct bch_extent_crc64 *) _crc)              \
+                       ->_compressed_size + 1;                         \
+               break;                                                  \
+       case BCH_EXTENT_CRC128:                                         \
+               _size = ((struct bch_extent_crc128 *) _crc)             \
+                       ->_compressed_size + 1;                         \
+               break;                                                  \
+       }                                                               \
+       _size;                                                          \
+})
 
-static inline unsigned crc_uncompressed_size(const struct bkey *k,
-                                            const union bch_extent_crc *crc)
-{
-       return crc_to_64(k, crc).uncompressed_size;
-}
+#define crc_uncompressed_size(_k, _crc)                                        \
+({                                                                     \
+       unsigned _size = 0;                                             \
+                                                                       \
+       switch (extent_crc_type(_crc)) {                                \
+       case BCH_EXTENT_CRC_NONE:                                       \
+               _size = ((const struct bkey *) (_k))->size;             \
+               break;                                                  \
+       case BCH_EXTENT_CRC32:                                          \
+               _size = ((struct bch_extent_crc32 *) _crc)              \
+                       ->_uncompressed_size + 1;                       \
+               break;                                                  \
+       case BCH_EXTENT_CRC64:                                          \
+               _size = ((struct bch_extent_crc64 *) _crc)              \
+                       ->_uncompressed_size + 1;                       \
+               break;                                                  \
+       case BCH_EXTENT_CRC128:                                         \
+               _size = ((struct bch_extent_crc128 *) _crc)             \
+                       ->_uncompressed_size + 1;                       \
+               break;                                                  \
+       }                                                               \
+       _size;                                                          \
+})
 
 static inline unsigned crc_offset(const union bch_extent_crc *crc)
 {
@@ -392,6 +455,23 @@ static inline unsigned crc_offset(const union bch_extent_crc *crc)
                return crc->crc32.offset;
        case BCH_EXTENT_CRC64:
                return crc->crc64.offset;
+       case BCH_EXTENT_CRC128:
+               return crc->crc128.offset;
+       default:
+               BUG();
+       }
+}
+
+static inline unsigned crc_nonce(const union bch_extent_crc *crc)
+{
+       switch (extent_crc_type(crc)) {
+       case BCH_EXTENT_CRC_NONE:
+       case BCH_EXTENT_CRC32:
+               return 0;
+       case BCH_EXTENT_CRC64:
+               return crc->crc64.nonce;
+       case BCH_EXTENT_CRC128:
+               return crc->crc128.nonce;
        default:
                BUG();
        }
@@ -406,6 +486,8 @@ static inline unsigned crc_csum_type(const union bch_extent_crc *crc)
                return crc->crc32.csum_type;
        case BCH_EXTENT_CRC64:
                return crc->crc64.csum_type;
+       case BCH_EXTENT_CRC128:
+               return crc->crc128.csum_type;
        default:
                BUG();
        }
@@ -420,27 +502,33 @@ static inline unsigned crc_compression_type(const union bch_extent_crc *crc)
                return crc->crc32.compression_type;
        case BCH_EXTENT_CRC64:
                return crc->crc64.compression_type;
+       case BCH_EXTENT_CRC128:
+               return crc->crc128.compression_type;
        default:
                BUG();
        }
 }
 
-static inline u64 crc_csum(const union bch_extent_crc *crc)
+static inline struct bch_csum crc_csum(const union bch_extent_crc *crc)
 {
        switch (extent_crc_type(crc)) {
        case BCH_EXTENT_CRC_NONE:
-               return 0;
+               return (struct bch_csum) { 0 };
        case BCH_EXTENT_CRC32:
-               return crc->crc32.csum;
+               return (struct bch_csum) { .lo = crc->crc32.csum };
        case BCH_EXTENT_CRC64:
-               return crc->crc64.csum;
+               return (struct bch_csum) {
+                       .lo = crc->crc64.csum_lo,
+                       .hi = crc->crc64.csum_hi,
+               };
+       case BCH_EXTENT_CRC128:
+               return crc->crc128.csum;
        default:
                BUG();
        }
 }
 
-static inline unsigned bkey_extent_is_compressed(struct cache_set *c,
-                                                struct bkey_s_c k)
+static inline unsigned bkey_extent_is_compressed(struct bkey_s_c k)
 {
        struct bkey_s_c_extent e;
        const struct bch_extent_ptr *ptr;
@@ -453,7 +541,7 @@ static inline unsigned bkey_extent_is_compressed(struct cache_set *c,
                e = bkey_s_c_to_extent(k);
 
                extent_for_each_ptr_crc(e, ptr, crc)
-                       if (bch_extent_ptr_is_dirty(c, e, ptr) &&
+                       if (!ptr->cached &&
                            crc_compression_type(crc) != BCH_COMPRESSION_NONE &&
                            crc_compressed_size(e.k, crc) < k.k->size)
                                ret = max_t(unsigned, ret,
@@ -463,6 +551,17 @@ static inline unsigned bkey_extent_is_compressed(struct cache_set *c,
        return ret;
 }
 
+static inline unsigned extent_current_nonce(struct bkey_s_c_extent e)
+{
+       const union bch_extent_crc *crc;
+
+       extent_for_each_crc(e, crc)
+               if (bch_csum_type_is_encryption(crc_csum_type(crc)))
+                       return crc_offset(crc) + crc_nonce(crc);
+
+       return 0;
+}
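
[extent_current_nonce() recovers the nonce at the extent's current position — the stored nonce plus the entry's offset — for encrypted checksum types, presumably so that further writes continue the keystream consistently. bch_csum_type_is_encryption() is not in this hunk; a sketch of its likely shape, with enum names and values assumed:

    #include <stdbool.h>

    enum {                                  /* names/values assumed */
            BCH_CSUM_CHACHA20_POLY1305_80  = 3,
            BCH_CSUM_CHACHA20_POLY1305_128 = 4,
    };

    static inline bool bch_csum_type_is_encryption(unsigned type)
    {
            switch (type) {
            case BCH_CSUM_CHACHA20_POLY1305_80:
            case BCH_CSUM_CHACHA20_POLY1305_128:
                    return true;
            default:
                    return false;
            }
    }
]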
+
 void bch_extent_narrow_crcs(struct bkey_s_extent);
 void bch_extent_drop_redundant_crcs(struct bkey_s_extent);
 
index 1dec230fdf6893d0fd48c07524d3d092ab705697..a758e895c3a9ae6f1a67e4451e41c9e5b501c206 100644 (file)
--- a/libbcache/fs-gc.c
+++ b/libbcache/fs-gc.c
@@ -17,7 +17,7 @@ static int remove_dirent(struct cache_set *c, struct btree_iter *iter,
                         struct bkey_s_c_dirent dirent)
 {
        struct qstr name;
-       struct bkey_i_inode dir_inode;
+       struct bch_inode_unpacked dir_inode;
        struct bch_hash_info dir_hash_info;
        u64 dir_inum = dirent.k->p.inode;
        int ret;
@@ -39,7 +39,7 @@ static int remove_dirent(struct cache_set *c, struct btree_iter *iter,
        if (ret)
                goto err;
 
-       dir_hash_info = bch_hash_info_init(&dir_inode.v);
+       dir_hash_info = bch_hash_info_init(&dir_inode);
 
        ret = bch_dirent_delete(c, dir_inum, &dir_hash_info, &name, NULL);
 err:
@@ -48,11 +48,12 @@ err:
 }
 
 static int reattach_inode(struct cache_set *c,
-                         struct bkey_i_inode *lostfound_inode,
+                         struct bch_inode_unpacked *lostfound_inode,
                          u64 inum)
 {
        struct bch_hash_info lostfound_hash_info =
-               bch_hash_info_init(&lostfound_inode->v);
+               bch_hash_info_init(lostfound_inode);
+       struct bkey_inode_buf packed;
        char name_buf[20];
        struct qstr name;
        int ret;
@@ -60,14 +61,16 @@ static int reattach_inode(struct cache_set *c,
        snprintf(name_buf, sizeof(name_buf), "%llu", inum);
        name = (struct qstr) QSTR(name_buf);
 
-       le32_add_cpu(&lostfound_inode->v.i_nlink, 1);
+       lostfound_inode->i_nlink++;
 
-       ret = bch_btree_insert(c, BTREE_ID_INODES, &lostfound_inode->k_i,
+       bch_inode_pack(&packed, lostfound_inode);
+
+       ret = bch_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i,
                               NULL, NULL, NULL, 0);
        if (ret)
                return ret;
 
-       return bch_dirent_create(c, lostfound_inode->k.p.inode,
+       return bch_dirent_create(c, lostfound_inode->inum,
                                 &lostfound_hash_info,
                                 DT_DIR, &name, inum, NULL, 0);
 }
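
[From here on, fsck manipulates struct bch_inode_unpacked and must re-encode through bch_inode_pack() into a bkey_inode_buf before touching the btree; reattach_inode() above is the canonical sequence. Condensed into a hypothetical helper, the update pattern is:

    /* unpack -> modify -> pack -> insert; bump_nlink() is illustrative: */
    static int bump_nlink(struct cache_set *c, struct bch_inode_unpacked *u)
    {
            struct bkey_inode_buf packed;

            u->i_nlink++;
            bch_inode_pack(&packed, u);
            return bch_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i,
                                    NULL, NULL, NULL, 0);
    }
]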
@@ -75,10 +78,8 @@ static int reattach_inode(struct cache_set *c,
 struct inode_walker {
        bool                    first_this_inode;
        bool                    have_inode;
-       u16                     i_mode;
-       u64                     i_size;
        u64                     cur_inum;
-       struct bkey_i_inode     inode;
+       struct bch_inode_unpacked inode;
 };
 
 static struct inode_walker inode_walker_init(void)
@@ -101,11 +102,6 @@ static int walk_inode(struct cache_set *c, struct inode_walker *w, u64 inum)
                        return ret;
 
                w->have_inode = !ret;
-
-               if (w->have_inode) {
-                       w->i_mode = le16_to_cpu(w->inode.v.i_mode);
-                       w->i_size = le64_to_cpu(w->inode.v.i_size);
-               }
        }
 
        return 0;
@@ -138,20 +134,20 @@ static int check_extents(struct cache_set *c)
                        k.k->type, k.k->p.inode);
 
                unfixable_fsck_err_on(w.first_this_inode && w.have_inode &&
-                       le64_to_cpu(w.inode.v.i_sectors) !=
+                       w.inode.i_sectors !=
                        (i_sectors = bch_count_inode_sectors(c, w.cur_inum)),
                        c, "i_sectors wrong: got %llu, should be %llu",
-                       le64_to_cpu(w.inode.v.i_sectors), i_sectors);
+                       w.inode.i_sectors, i_sectors);
 
                unfixable_fsck_err_on(w.have_inode &&
-                       !S_ISREG(w.i_mode) && !S_ISLNK(w.i_mode), c,
+                       !S_ISREG(w.inode.i_mode) && !S_ISLNK(w.inode.i_mode), c,
                        "extent type %u for non regular file, inode %llu mode %o",
-                       k.k->type, k.k->p.inode, w.i_mode);
+                       k.k->type, k.k->p.inode, w.inode.i_mode);
 
                unfixable_fsck_err_on(k.k->type != BCH_RESERVATION &&
-                       k.k->p.offset > round_up(w.i_size, PAGE_SIZE) >> 9, c,
+                       k.k->p.offset > round_up(w.inode.i_size, PAGE_SIZE) >> 9, c,
                        "extent type %u offset %llu past end of inode %llu, i_size %llu",
-                       k.k->type, k.k->p.offset, k.k->p.inode, w.i_size);
+                       k.k->type, k.k->p.offset, k.k->p.inode, w.inode.i_size);
        }
 fsck_err:
        return bch_btree_iter_unlock(&iter) ?: ret;
@@ -172,7 +168,7 @@ static int check_dirents(struct cache_set *c)
        for_each_btree_key(&iter, c, BTREE_ID_DIRENTS,
                           POS(BCACHE_ROOT_INO, 0), k) {
                struct bkey_s_c_dirent d;
-               struct bkey_i_inode target;
+               struct bch_inode_unpacked target;
                bool have_target;
                u64 d_inum;
 
@@ -184,9 +180,9 @@ static int check_dirents(struct cache_set *c)
                                      "dirent in nonexisting directory %llu",
                                      k.k->p.inode);
 
-               unfixable_fsck_err_on(!S_ISDIR(w.i_mode), c,
+               unfixable_fsck_err_on(!S_ISDIR(w.inode.i_mode), c,
                                      "dirent in non directory inode %llu, type %u",
-                                     k.k->p.inode, mode_to_type(w.i_mode));
+                                     k.k->p.inode, mode_to_type(w.inode.i_mode));
 
                if (k.k->type != BCH_DIRENT)
                        continue;
@@ -220,10 +216,10 @@ static int check_dirents(struct cache_set *c)
 
                if (fsck_err_on(have_target &&
                                d.v->d_type !=
-                               mode_to_type(le16_to_cpu(target.v.i_mode)), c,
+                               mode_to_type(target.i_mode), c,
                                "incorrect d_type: got %u should be %u, filename %s",
                                d.v->d_type,
-                               mode_to_type(le16_to_cpu(target.v.i_mode)),
+                               mode_to_type(target.i_mode),
                                d.v->d_name)) {
                        struct bkey_i_dirent *n;
 
@@ -234,7 +230,7 @@ static int check_dirents(struct cache_set *c)
                        }
 
                        bkey_reassemble(&n->k_i, d.s_c);
-                       n->v.d_type = mode_to_type(le16_to_cpu(target.v.i_mode));
+                       n->v.d_type = mode_to_type(target.i_mode);
 
                        ret = bch_btree_insert_at(c, NULL, NULL, NULL,
                                        BTREE_INSERT_NOFAIL,
@@ -276,8 +272,9 @@ fsck_err:
 }
 
 /* Get root directory, create if it doesn't exist: */
-static int check_root(struct cache_set *c, struct bkey_i_inode *root_inode)
+static int check_root(struct cache_set *c, struct bch_inode_unpacked *root_inode)
 {
+       struct bkey_inode_buf packed;
        int ret;
 
        ret = bch_inode_find_by_inum(c, BCACHE_ROOT_INO, root_inode);
@@ -287,7 +284,7 @@ static int check_root(struct cache_set *c, struct bkey_i_inode *root_inode)
        if (fsck_err_on(ret, c, "root directory missing"))
                goto create_root;
 
-       if (fsck_err_on(!S_ISDIR(le16_to_cpu(root_inode->v.i_mode)), c,
+       if (fsck_err_on(!S_ISDIR(root_inode->i_mode), c,
                        "root inode not a directory"))
                goto create_root;
 
@@ -296,19 +293,23 @@ fsck_err:
        return ret;
 create_root:
        bch_inode_init(c, root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
-       root_inode->k.p.inode = BCACHE_ROOT_INO;
+       root_inode->inum = BCACHE_ROOT_INO;
+
+       bch_inode_pack(&packed, root_inode);
 
-       return bch_btree_insert(c, BTREE_ID_INODES, &root_inode->k_i,
+       return bch_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i,
                                NULL, NULL, NULL, 0);
 }
 
 /* Get lost+found, create if it doesn't exist: */
 static int check_lostfound(struct cache_set *c,
-                          struct bkey_i_inode *root_inode,
-                          struct bkey_i_inode *lostfound_inode)
+                          struct bch_inode_unpacked *root_inode,
+                          struct bch_inode_unpacked *lostfound_inode)
 {
        struct qstr lostfound = QSTR("lost+found");
-       struct bch_hash_info root_hash_info = bch_hash_info_init(&root_inode->v);
+       struct bch_hash_info root_hash_info =
+               bch_hash_info_init(root_inode);
+       struct bkey_inode_buf packed;
        u64 inum;
        int ret;
 
@@ -326,7 +327,7 @@ static int check_lostfound(struct cache_set *c,
        if (fsck_err_on(ret, c, "lost+found missing"))
                goto create_lostfound;
 
-       if (fsck_err_on(!S_ISDIR(le16_to_cpu(lostfound_inode->v.i_mode)), c,
+       if (fsck_err_on(!S_ISDIR(lostfound_inode->i_mode), c,
                        "lost+found inode not a directory"))
                goto create_lostfound;
 
@@ -334,22 +335,27 @@ static int check_lostfound(struct cache_set *c,
 fsck_err:
        return ret;
 create_lostfound:
-       le32_add_cpu(&root_inode->v.i_nlink, 1);
+       root_inode->i_nlink++;
 
-       ret = bch_btree_insert(c, BTREE_ID_INODES, &root_inode->k_i,
+       bch_inode_pack(&packed, root_inode);
+
+       ret = bch_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i,
                               NULL, NULL, NULL, 0);
        if (ret)
                return ret;
 
        bch_inode_init(c, lostfound_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
+       bch_inode_pack(&packed, lostfound_inode);
 
-       ret = bch_inode_create(c, &lostfound_inode->k_i, BLOCKDEV_INODE_MAX, 0,
+       ret = bch_inode_create(c, &packed.inode.k_i, BLOCKDEV_INODE_MAX, 0,
                               &c->unused_inode_hint);
        if (ret)
                return ret;
 
+       lostfound_inode->inum = packed.inode.k.p.inode;
+
        ret = bch_dirent_create(c, BCACHE_ROOT_INO, &root_hash_info, DT_DIR,
-                               &lostfound, lostfound_inode->k.p.inode, NULL, 0);
+                               &lostfound, lostfound_inode->inum, NULL, 0);
        if (ret)
                return ret;
 
@@ -420,7 +426,7 @@ static int path_down(struct pathbuf *p, u64 inum)
 
 noinline_for_stack
 static int check_directory_structure(struct cache_set *c,
-                                    struct bkey_i_inode *lostfound_inode)
+                                    struct bch_inode_unpacked *lostfound_inode)
 {
        struct inode_bitmap dirs_done = { NULL, 0 };
        struct pathbuf path = { 0, 0, NULL };
@@ -618,25 +624,30 @@ s64 bch_count_inode_sectors(struct cache_set *c, u64 inum)
 }
 
 static int bch_gc_do_inode(struct cache_set *c,
-                          struct bkey_i_inode *lostfound_inode,
+                          struct bch_inode_unpacked *lostfound_inode,
                           struct btree_iter *iter,
                           struct bkey_s_c_inode inode, struct nlink link)
 {
-       u16 i_mode  = le16_to_cpu(inode.v->i_mode);
-       u32 i_flags = le32_to_cpu(inode.v->i_flags);
-       u32 i_nlink = le32_to_cpu(inode.v->i_nlink);
-       u64 i_size  = le64_to_cpu(inode.v->i_size);
-       s64 i_sectors = 0;
+       struct bch_inode_unpacked u;
        int ret = 0;
-       u32 real_i_nlink;
+       u32 i_nlink, real_i_nlink;
+       bool do_update = false;
+
+       ret = bch_inode_unpack(inode, &u);
+       if (cache_set_inconsistent_on(ret, c,
+                        "error unpacking inode %llu in fs-gc",
+                        inode.k->p.inode))
+               return ret;
+
+       i_nlink = u.i_nlink + nlink_bias(u.i_mode);
 
        fsck_err_on(i_nlink < link.count, c,
                    "inode %llu i_link too small (%u < %u, type %i)",
                    inode.k->p.inode, i_nlink,
-                   link.count, mode_to_type(i_mode));
+                   link.count, mode_to_type(u.i_mode));
 
        /* These should have been caught/fixed by earlier passes: */
-       if (S_ISDIR(i_mode)) {
+       if (S_ISDIR(u.i_mode)) {
                need_fsck_err_on(link.count > 1, c,
                        "directory %llu with multiple hardlinks: %u",
                        inode.k->p.inode, link.count);
@@ -656,7 +667,7 @@ static int bch_gc_do_inode(struct cache_set *c,
                            "but found orphaned inode %llu",
                            inode.k->p.inode);
 
-               if (fsck_err_on(S_ISDIR(i_mode) &&
+               if (fsck_err_on(S_ISDIR(u.i_mode) &&
                                bch_empty_dir(c, inode.k->p.inode), c,
                                "non empty directory with link count 0, "
                                "inode nlink %u, dir links found %u",
@@ -676,7 +687,7 @@ static int bch_gc_do_inode(struct cache_set *c,
                return ret;
        }
 
-       if (i_flags & BCH_INODE_I_SIZE_DIRTY) {
+       if (u.i_flags & BCH_INODE_I_SIZE_DIRTY) {
                fsck_err_on(c->sb.clean, c,
                            "filesystem marked clean, "
                            "but inode %llu has i_size dirty",
@@ -690,7 +701,7 @@ static int bch_gc_do_inode(struct cache_set *c,
                 */
 
                ret = bch_inode_truncate(c, inode.k->p.inode,
-                               round_up(i_size, PAGE_SIZE) >> 9,
+                               round_up(u.i_size, PAGE_SIZE) >> 9,
                                NULL, NULL);
                if (ret) {
                        bch_err(c, "error in fs gc: error %i "
@@ -702,10 +713,15 @@ static int bch_gc_do_inode(struct cache_set *c,
                 * We truncated without our normal sector accounting hook, just
                 * make sure we recalculate it:
                 */
-               i_flags |= BCH_INODE_I_SECTORS_DIRTY;
+               u.i_flags |= BCH_INODE_I_SECTORS_DIRTY;
+
+               u.i_flags &= ~BCH_INODE_I_SIZE_DIRTY;
+               do_update = true;
        }
 
-       if (i_flags & BCH_INODE_I_SECTORS_DIRTY) {
+       if (u.i_flags & BCH_INODE_I_SECTORS_DIRTY) {
+               s64 sectors;
+
                fsck_err_on(c->sb.clean, c,
                            "filesystem marked clean, "
                            "but inode %llu has i_sectors dirty",
@@ -714,13 +730,17 @@ static int bch_gc_do_inode(struct cache_set *c,
                bch_verbose(c, "recounting sectors for inode %llu",
                            inode.k->p.inode);
 
-               i_sectors = bch_count_inode_sectors(c, inode.k->p.inode);
-               if (i_sectors < 0) {
+               sectors = bch_count_inode_sectors(c, inode.k->p.inode);
+               if (sectors < 0) {
                        bch_err(c, "error in fs gc: error %i "
                                "recounting inode sectors",
-                               (int) i_sectors);
-                       return i_sectors;
+                               (int) sectors);
+                       return sectors;
                }
+
+               u.i_sectors = sectors;
+               u.i_flags &= ~BCH_INODE_I_SECTORS_DIRTY;
+               do_update = true;
        }
 
        if (i_nlink != real_i_nlink) {
@@ -728,30 +748,23 @@ static int bch_gc_do_inode(struct cache_set *c,
                            "filesystem marked clean, "
                            "but inode %llu has wrong i_nlink "
                            "(type %u i_nlink %u, should be %u)",
-                           inode.k->p.inode, mode_to_type(i_mode),
+                           inode.k->p.inode, mode_to_type(u.i_mode),
                            i_nlink, real_i_nlink);
 
                bch_verbose(c, "setting inode %llu nlinks from %u to %u",
                            inode.k->p.inode, i_nlink, real_i_nlink);
+               u.i_nlink = real_i_nlink - nlink_bias(u.i_mode);
+               do_update = true;
        }
 
-       if (i_nlink != real_i_nlink||
-           i_flags & BCH_INODE_I_SECTORS_DIRTY ||
-           i_flags & BCH_INODE_I_SIZE_DIRTY) {
-               struct bkey_i_inode update;
-
-               bkey_reassemble(&update.k_i, inode.s_c);
-               update.v.i_nlink = cpu_to_le32(real_i_nlink);
-               update.v.i_flags = cpu_to_le32(i_flags &
-                               ~(BCH_INODE_I_SIZE_DIRTY|
-                                 BCH_INODE_I_SECTORS_DIRTY));
+       if (do_update) {
+               struct bkey_inode_buf p;
 
-               if (i_flags & BCH_INODE_I_SECTORS_DIRTY)
-                       update.v.i_sectors = cpu_to_le64(i_sectors);
+               bch_inode_pack(&p, &u);
 
                ret = bch_btree_insert_at(c, NULL, NULL, NULL,
                                          BTREE_INSERT_NOFAIL,
-                                         BTREE_INSERT_ENTRY(iter, &update.k_i));
+                                         BTREE_INSERT_ENTRY(iter, &p.inode.k_i));
                if (ret && ret != -EINTR)
                        bch_err(c, "error in fs gc: error %i "
                                "updating inode", ret);
@@ -762,7 +775,7 @@ fsck_err:
 
 noinline_for_stack
 static int bch_gc_walk_inodes(struct cache_set *c,
-                             struct bkey_i_inode *lostfound_inode,
+                             struct bch_inode_unpacked *lostfound_inode,
                              struct nlinks *links,
                              u64 range_start, u64 range_end)
 {
@@ -835,7 +848,7 @@ fsck_err:
 
 noinline_for_stack
 static int check_inode_nlinks(struct cache_set *c,
-                             struct bkey_i_inode *lostfound_inode)
+                             struct bch_inode_unpacked *lostfound_inode)
 {
        struct nlinks links;
        u64 this_iter_range_start, next_iter_range_start = 0;
@@ -873,7 +886,7 @@ static int check_inode_nlinks(struct cache_set *c,
  */
 int bch_fsck(struct cache_set *c, bool full_fsck)
 {
-       struct bkey_i_inode root_inode, lostfound_inode;
+       struct bch_inode_unpacked root_inode, lostfound_inode;
        int ret;
 
        ret = check_root(c, &root_inode);
index 942baeb1235df9132b59fd32637d376a5da0ac66..ecf249c3e66482c726dbd7a289d61931454c453d 100644 (file)
--- a/libbcache/fs-io.c
+++ b/libbcache/fs-io.c
@@ -59,22 +59,20 @@ static int write_invalidate_inode_pages_range(struct address_space *mapping,
 
 /* i_size updates: */
 
-static int inode_set_size(struct bch_inode_info *ei, struct bch_inode *bi,
+static int inode_set_size(struct bch_inode_info *ei,
+                         struct bch_inode_unpacked *bi,
                          void *p)
 {
        loff_t *new_i_size = p;
-       unsigned i_flags = le32_to_cpu(bi->i_flags);
 
        lockdep_assert_held(&ei->update_lock);
 
-       bi->i_size = cpu_to_le64(*new_i_size);
+       bi->i_size = *new_i_size;
 
        if (atomic_long_read(&ei->i_size_dirty_count))
-               i_flags |= BCH_INODE_I_SIZE_DIRTY;
+               bi->i_flags |= BCH_INODE_I_SIZE_DIRTY;
        else
-               i_flags &= ~BCH_INODE_I_SIZE_DIRTY;
-
-       bi->i_flags = cpu_to_le32(i_flags);
+               bi->i_flags &= ~BCH_INODE_I_SIZE_DIRTY;
 
        return 0;
 }
@@ -122,23 +120,22 @@ i_sectors_hook_fn(struct extent_insert_hook *hook,
 }
 
 static int inode_set_i_sectors_dirty(struct bch_inode_info *ei,
-                                   struct bch_inode *bi, void *p)
+                                   struct bch_inode_unpacked *bi, void *p)
 {
-       BUG_ON(le32_to_cpu(bi->i_flags) & BCH_INODE_I_SECTORS_DIRTY);
+       BUG_ON(bi->i_flags & BCH_INODE_I_SECTORS_DIRTY);
 
-       bi->i_flags = cpu_to_le32(le32_to_cpu(bi->i_flags)|
-                                 BCH_INODE_I_SECTORS_DIRTY);
+       bi->i_flags |= BCH_INODE_I_SECTORS_DIRTY;
        return 0;
 }
 
 static int inode_clear_i_sectors_dirty(struct bch_inode_info *ei,
-                                      struct bch_inode *bi, void *p)
+                                      struct bch_inode_unpacked *bi,
+                                      void *p)
 {
-       BUG_ON(!(le32_to_cpu(bi->i_flags) & BCH_INODE_I_SECTORS_DIRTY));
+       BUG_ON(!(bi->i_flags & BCH_INODE_I_SECTORS_DIRTY));
 
-       bi->i_sectors   = cpu_to_le64(atomic64_read(&ei->i_sectors));
-       bi->i_flags     = cpu_to_le32(le32_to_cpu(bi->i_flags) &
-                                     ~BCH_INODE_I_SECTORS_DIRTY);
+       bi->i_sectors   = atomic64_read(&ei->i_sectors);
+       bi->i_flags     &= ~BCH_INODE_I_SECTORS_DIRTY;
        return 0;
 }
 
@@ -203,7 +200,10 @@ static int __must_check i_sectors_dirty_get(struct bch_inode_info *ei,
 struct bchfs_extent_trans_hook {
        struct bchfs_write_op           *op;
        struct extent_insert_hook       hook;
-       struct bkey_i_inode             new_inode;
+
+       struct bch_inode_unpacked       inode_u;
+       struct bkey_inode_buf           inode_p;
+
        bool                            need_inode_update;
 };
 
@@ -222,6 +222,7 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook,
                (k.k && bkey_extent_is_allocation(k.k));
        s64 sectors = (s64) (next_pos.offset - committed_pos.offset) * sign;
        u64 offset = min(next_pos.offset << 9, h->op->new_i_size);
+       bool do_pack = false;
 
        BUG_ON((next_pos.offset << 9) > round_up(offset, PAGE_SIZE));
 
@@ -234,7 +235,9 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook,
                        return BTREE_HOOK_RESTART_TRANS;
                }
 
-               h->new_inode.v.i_size = cpu_to_le64(offset);
+               h->inode_u.i_size = offset;
+               do_pack = true;
+
                ei->i_size = offset;
 
                if (h->op->is_dio)
@@ -247,7 +250,9 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook,
                        return BTREE_HOOK_RESTART_TRANS;
                }
 
-               le64_add_cpu(&h->new_inode.v.i_sectors, sectors);
+               h->inode_u.i_sectors += sectors;
+               do_pack = true;
+
                atomic64_add(sectors, &ei->i_sectors);
 
                h->op->sectors_added += sectors;
@@ -259,6 +264,9 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook,
                }
        }
 
+       if (do_pack)
+               bch_inode_pack(&h->inode_p, &h->inode_u);
+
        return BTREE_HOOK_DO_INSERT;
 }
 
@@ -310,13 +318,32 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
                                break;
                        }
 
-                       bkey_reassemble(&hook.new_inode.k_i, inode);
+                       if (WARN_ONCE(bkey_bytes(inode.k) >
+                                     sizeof(hook.inode_p),
+                                     "inode %llu too big (%zu bytes, buf %zu)",
+                                     extent_iter.pos.inode,
+                                     bkey_bytes(inode.k),
+                                     sizeof(hook.inode_p))) {
+                               ret = -ENOENT;
+                               break;
+                       }
+
+                       bkey_reassemble(&hook.inode_p.inode.k_i, inode);
+                       ret = bch_inode_unpack(bkey_s_c_to_inode(inode),
+                                              &hook.inode_u);
+                       if (WARN_ONCE(ret,
+                                     "error %i unpacking inode %llu",
+                                     ret, extent_iter.pos.inode)) {
+                               ret = -ENOENT;
+                               break;
+                       }
 
                        ret = bch_btree_insert_at(wop->c, &wop->res,
                                        &hook.hook, op_journal_seq(wop),
                                        BTREE_INSERT_NOFAIL|BTREE_INSERT_ATOMIC,
                                        BTREE_INSERT_ENTRY(&extent_iter, k),
-                                       BTREE_INSERT_ENTRY(&inode_iter, &hook.new_inode.k_i));
+                                       BTREE_INSERT_ENTRY_EXTRA_RES(&inode_iter,
+                                                       &hook.inode_p.inode.k_i, 2));
                } else {
                        ret = bch_btree_insert_at(wop->c, &wop->res,
                                        &hook.hook, op_journal_seq(wop),
@@ -350,25 +377,15 @@ err:
 struct bch_page_state {
 union { struct {
        /*
-        * BCH_PAGE_ALLOCATED: page is _fully_ written on disk, and not
-        * compressed - which means to write this page we don't have to reserve
-        * space (the new write will never take up more space on disk than what
-        * it's overwriting)
-        *
-        * BCH_PAGE_UNALLOCATED: page is not fully written on disk, or is
-        * compressed - before writing we have to reserve space with
-        * bch_reserve_sectors()
-        *
-        * BCH_PAGE_RESERVED: page has space reserved on disk (reservation will
-        * be consumed when the page is written).
+        * page is _fully_ written on disk, and not compressed - which means to
+        * write this page we don't have to reserve space (the new write will
+        * never take up more space on disk than what it's overwriting)
         */
-       enum {
-               BCH_PAGE_UNALLOCATED    = 0,
-               BCH_PAGE_ALLOCATED,
-       }                       alloc_state:2;
+       unsigned allocated:1;
 
        /* Owns PAGE_SECTORS sized reservation: */
        unsigned                reserved:1;
+       unsigned                nr_replicas:4;
 
        /*
         * Number of sectors on disk - for i_blocks
@@ -431,11 +448,9 @@ static int bch_get_page_reservation(struct cache_set *c, struct page *page,
        struct disk_reservation res;
        int ret = 0;
 
-       BUG_ON(s->alloc_state == BCH_PAGE_ALLOCATED &&
-              s->sectors != PAGE_SECTORS);
+       BUG_ON(s->allocated && s->sectors != PAGE_SECTORS);
 
-       if (s->reserved ||
-           s->alloc_state == BCH_PAGE_ALLOCATED)
+       if (s->allocated || s->reserved)
                return 0;
 
        ret = bch_disk_reservation_get(c, &res, PAGE_SECTORS, !check_enospc
@@ -448,7 +463,8 @@ static int bch_get_page_reservation(struct cache_set *c, struct page *page,
                        bch_disk_reservation_put(c, &res);
                        return 0;
                }
-               new.reserved = 1;
+               new.reserved    = 1;
+               new.nr_replicas = res.nr_replicas;
        });
 
        return 0;
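
[The page state loses its three-state alloc_state for a single allocated bit, and gains nr_replicas: the smallest number of dirty copies backing any sector of the page (bch_add_page_sectors() below takes the min across keys), which lets writeback take a correctly sized disk reservation per page. The fields from these hunks, gathered into one sketch:

    struct bch_page_state_sketch {
            unsigned allocated:1;    /* fully written, uncompressed on disk */
            unsigned reserved:1;     /* owns a PAGE_SECTORS disk reservation */
            unsigned nr_replicas:4;  /* dirty copies backing this page */
            unsigned sectors:8;      /* on-disk sectors, for i_blocks
                                      * (field width assumed) */
    };
]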
@@ -585,10 +601,10 @@ static void bch_mark_pages_unalloc(struct bio *bio)
        struct bio_vec bv;
 
        bio_for_each_segment(bv, bio, iter)
-               page_state(bv.bv_page)->alloc_state = BCH_PAGE_UNALLOCATED;
+               page_state(bv.bv_page)->allocated = 0;
 }
 
-static void bch_add_page_sectors(struct bio *bio, const struct bkey *k)
+static void bch_add_page_sectors(struct bio *bio, struct bkey_s_c k)
 {
        struct bvec_iter iter;
        struct bio_vec bv;
@@ -597,12 +613,17 @@ static void bch_add_page_sectors(struct bio *bio, const struct bkey *k)
                struct bch_page_state *s = page_state(bv.bv_page);
 
                /* sectors in @k from the start of this page: */
-               unsigned k_sectors = k->size - (iter.bi_sector - k->p.offset);
+               unsigned k_sectors = k.k->size - (iter.bi_sector - k.k->p.offset);
 
                unsigned page_sectors = min(bv.bv_len >> 9, k_sectors);
 
-               BUG_ON(s->sectors + page_sectors > PAGE_SECTORS);
+               if (!s->sectors)
+                       s->nr_replicas = bch_extent_nr_dirty_ptrs(k);
+               else
+                       s->nr_replicas = min_t(unsigned, s->nr_replicas,
+                                              bch_extent_nr_dirty_ptrs(k));
 
+               BUG_ON(s->sectors + page_sectors > PAGE_SECTORS);
                s->sectors += page_sectors;
        }
 }
@@ -634,7 +655,7 @@ static void bchfs_read(struct cache_set *c, struct bch_read_bio *rbio, u64 inode
 
                EBUG_ON(s->reserved);
 
-               s->alloc_state = BCH_PAGE_ALLOCATED;
+               s->allocated = 1;
                s->sectors = 0;
        }
 
@@ -650,7 +671,7 @@ static void bchfs_read(struct cache_set *c, struct bch_read_bio *rbio, u64 inode
                k = bkey_i_to_s_c(&tmp.k);
 
                if (!bkey_extent_is_allocation(k.k) ||
-                   bkey_extent_is_compressed(c, k))
+                   bkey_extent_is_compressed(k))
                        bch_mark_pages_unalloc(bio);
 
                bch_extent_pick_ptr(c, k, &pick);
@@ -667,7 +688,7 @@ static void bchfs_read(struct cache_set *c, struct bch_read_bio *rbio, u64 inode
                swap(bio->bi_iter.bi_size, bytes);
 
                if (bkey_extent_is_allocation(k.k))
-                       bch_add_page_sectors(bio, k.k);
+                       bch_add_page_sectors(bio, k);
 
                if (pick.ca) {
                        PTR_BUCKET(pick.ca, &pick.ptr)->read_prio =
@@ -859,6 +880,10 @@ static void bch_writepage_io_alloc(struct cache_set *c,
                                   struct page *page)
 {
        u64 inum = ei->vfs_inode.i_ino;
+       unsigned nr_replicas = page_state(page)->nr_replicas;
+
+       EBUG_ON(!nr_replicas);
+       /* XXX: disk_reservation->gen isn't plumbed through */
 
        if (!w->io) {
 alloc_io:
@@ -881,7 +906,8 @@ alloc_io:
                w->io->op.op.index_update_fn = bchfs_write_index_update;
        }
 
-       if (bio_add_page_contig(&w->io->bio.bio, page)) {
+       if (w->io->op.op.res.nr_replicas != nr_replicas ||
+           bio_add_page_contig(&w->io->bio.bio, page)) {
                bch_writepage_do_io(w);
                goto alloc_io;
        }
@@ -936,13 +962,13 @@ do_io:
 
        /* Before unlocking the page, transfer reservation to w->io: */
        old = page_state_cmpxchg(page_state(page), new, {
-               BUG_ON(!new.reserved &&
-                      (new.sectors != PAGE_SECTORS ||
-                       new.alloc_state != BCH_PAGE_ALLOCATED));
+               EBUG_ON(!new.reserved &&
+                       (new.sectors != PAGE_SECTORS ||
+                       !new.allocated));
 
-               if (new.alloc_state == BCH_PAGE_ALLOCATED &&
+               if (new.allocated &&
                    w->io->op.op.compression_type != BCH_COMPRESSION_NONE)
-                       new.alloc_state = BCH_PAGE_UNALLOCATED;
+                       new.allocated = 0;
                else if (!new.reserved)
                        goto out;
                new.reserved = 0;
@@ -1919,7 +1945,7 @@ int bch_truncate(struct inode *inode, struct iattr *iattr)
 
        mutex_lock(&ei->update_lock);
        setattr_copy(inode, iattr);
-       inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+       inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
 
        /* clear I_SIZE_DIRTY: */
        i_size_dirty_put(ei);
@@ -1981,7 +2007,7 @@ static long bch_fpunch(struct inode *inode, loff_t offset, loff_t len)
                ret = bch_discard(c,
                                  POS(ino, discard_start),
                                  POS(ino, discard_end),
-                                 0,
+                                 ZERO_VERSION,
                                  &disk_res,
                                  &i_sectors_hook.hook,
                                  &ei->journal_seq);
@@ -2132,12 +2158,11 @@ static long bch_fallocate(struct inode *inode, int mode,
        struct cache_set *c = inode->i_sb->s_fs_info;
        struct i_sectors_hook i_sectors_hook;
        struct btree_iter iter;
-       struct bkey_i reservation;
-       struct bkey_s_c k;
        struct bpos end;
        loff_t block_start, block_end;
        loff_t new_size = offset + len;
        unsigned sectors;
+       unsigned replicas = READ_ONCE(c->opts.data_replicas);
        int ret;
 
        bch_btree_iter_init_intent(&iter, c, BTREE_ID_EXTENTS, POS_MIN);
@@ -2186,13 +2211,16 @@ static long bch_fallocate(struct inode *inode, int mode,
 
        while (bkey_cmp(iter.pos, end) < 0) {
                struct disk_reservation disk_res = { 0 };
+               struct bkey_i_reservation reservation;
+               struct bkey_s_c k;
 
                k = bch_btree_iter_peek_with_holes(&iter);
                if ((ret = btree_iter_err(k)))
                        goto btree_iter_err;
 
                /* already reserved */
-               if (k.k->type == BCH_RESERVATION) {
+               if (k.k->type == BCH_RESERVATION &&
+                   bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) {
                        bch_btree_iter_advance_pos(&iter);
                        continue;
                }
@@ -2204,29 +2232,32 @@ static long bch_fallocate(struct inode *inode, int mode,
                        }
                }
 
-               bkey_init(&reservation.k);
+               bkey_reservation_init(&reservation.k_i);
                reservation.k.type      = BCH_RESERVATION;
                reservation.k.p         = k.k->p;
                reservation.k.size      = k.k->size;
 
-               bch_cut_front(iter.pos, &reservation);
+               bch_cut_front(iter.pos, &reservation.k_i);
                bch_cut_back(end, &reservation.k);
 
                sectors = reservation.k.size;
+               reservation.v.nr_replicas = bch_extent_nr_dirty_ptrs(k);
 
-               if (!bkey_extent_is_allocation(k.k) ||
-                   bkey_extent_is_compressed(c, k)) {
+               if (reservation.v.nr_replicas < replicas ||
+                   bkey_extent_is_compressed(k)) {
                        ret = bch_disk_reservation_get(c, &disk_res,
                                                       sectors, 0);
                        if (ret)
                                goto err_put_sectors_dirty;
+
+                       reservation.v.nr_replicas = disk_res.nr_replicas;
                }
 
                ret = bch_btree_insert_at(c, &disk_res, &i_sectors_hook.hook,
                                          &ei->journal_seq,
                                          BTREE_INSERT_ATOMIC|
                                          BTREE_INSERT_NOFAIL,
-                                         BTREE_INSERT_ENTRY(&iter, &reservation));
+                                         BTREE_INSERT_ENTRY(&iter, &reservation.k_i));
                bch_disk_reservation_put(c, &disk_res);
 btree_iter_err:
                if (ret < 0 && ret != -EINTR)
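
Reservation keys now record how many replicas they were charged for: an existing BCH_RESERVATION only short-circuits the loop if it covers at least the currently configured data_replicas, and compressed or under-replicated extents get a fresh disk reservation for the full range. A condensed sketch of that per-extent decision (the helper name is a stand-in, not the real API):

    #include <stdbool.h>

    extern int disk_reservation_get(unsigned sectors, unsigned replicas);

    static int reserve_extent(unsigned have_replicas, unsigned want_replicas,
                              bool compressed, unsigned sectors)
    {
            /* enough replicas already backed by data or reservation: done */
            if (have_replicas >= want_replicas && !compressed)
                    return 0;

            /* otherwise charge the allocator for the whole extent again */
            return disk_reservation_get(sectors, want_replicas);
    }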
index 884a950f1f51630403c6cf908e25227f216f8b91..76948e79d33598d326924726a2aa1ab734a63ce7 100644 (file)
--- a/libbcache/fs.c
+++ b/libbcache/fs.c
@@ -26,7 +26,9 @@
 
 static struct kmem_cache *bch_inode_cache;
 
-static void bch_vfs_inode_init(struct bch_inode_info *, struct bkey_s_c_inode);
+static void bch_vfs_inode_init(struct cache_set *,
+                              struct bch_inode_info *,
+                              struct bch_inode_unpacked *);
 
 /*
  * I_SIZE_DIRTY requires special handling:
@@ -63,11 +65,20 @@ int __must_check __bch_write_inode(struct cache_set *c,
 {
        struct btree_iter iter;
        struct inode *inode = &ei->vfs_inode;
-       struct bkey_i_inode new_inode;
-       struct bch_inode *bi;
+       struct bch_inode_unpacked inode_u;
+       struct bkey_inode_buf inode_p;
        u64 inum = inode->i_ino;
+       unsigned i_nlink = READ_ONCE(inode->i_nlink);
        int ret;
 
+       /*
+        * We can't write an inode with i_nlink == 0 because it's stored biased;
+        * however, we don't need to because if i_nlink is 0 the inode is
+        * getting deleted when it's evicted.
+        */
+       if (!i_nlink)
+               return 0;
+
        lockdep_assert_held(&ei->update_lock);
 
        bch_btree_iter_init_intent(&iter, c, BTREE_ID_INODES, POS(inum, 0));
@@ -84,33 +95,41 @@ int __must_check __bch_write_inode(struct cache_set *c,
                        return -ENOENT;
                }
 
-               bkey_reassemble(&new_inode.k_i, k);
-               bi = &new_inode.v;
+               ret = bch_inode_unpack(bkey_s_c_to_inode(k), &inode_u);
+               if (WARN_ONCE(ret,
+                             "error %i unpacking inode %llu", ret, inum)) {
+                       ret = -ENOENT;
+                       break;
+               }
 
                if (set) {
-                       ret = set(ei, bi, p);
+                       ret = set(ei, &inode_u, p);
                        if (ret)
                                goto out;
                }
 
-               bi->i_mode      = cpu_to_le16(inode->i_mode);
-               bi->i_uid       = cpu_to_le32(i_uid_read(inode));
-               bi->i_gid       = cpu_to_le32(i_gid_read(inode));
-               bi->i_nlink     = cpu_to_le32(inode->i_nlink);
-               bi->i_dev       = cpu_to_le32(inode->i_rdev);
-               bi->i_atime     = cpu_to_le64(timespec_to_ns(&inode->i_atime));
-               bi->i_mtime     = cpu_to_le64(timespec_to_ns(&inode->i_mtime));
-               bi->i_ctime     = cpu_to_le64(timespec_to_ns(&inode->i_ctime));
+               BUG_ON(i_nlink < nlink_bias(inode->i_mode));
+
+               inode_u.i_mode  = inode->i_mode;
+               inode_u.i_uid   = i_uid_read(inode);
+               inode_u.i_gid   = i_gid_read(inode);
+               inode_u.i_nlink = i_nlink - nlink_bias(inode->i_mode);
+               inode_u.i_dev   = inode->i_rdev;
+               inode_u.i_atime = timespec_to_bch_time(c, inode->i_atime);
+               inode_u.i_mtime = timespec_to_bch_time(c, inode->i_mtime);
+               inode_u.i_ctime = timespec_to_bch_time(c, inode->i_ctime);
+
+               bch_inode_pack(&inode_p, &inode_u);
 
                ret = bch_btree_insert_at(c, NULL, NULL, &ei->journal_seq,
                                BTREE_INSERT_ATOMIC|
                                BTREE_INSERT_NOFAIL,
-                               BTREE_INSERT_ENTRY(&iter, &new_inode.k_i));
+                               BTREE_INSERT_ENTRY(&iter, &inode_p.inode.k_i));
        } while (ret == -EINTR);
 
        if (!ret) {
-               ei->i_size      = le64_to_cpu(bi->i_size);
-               ei->i_flags     = le32_to_cpu(bi->i_flags);
+               ei->i_size      = inode_u.i_size;
+               ei->i_flags     = inode_u.i_flags;
        }
 out:
        bch_btree_iter_unlock(&iter);
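
__bch_write_inode() is the usual read-modify-write loop against the btree: BTREE_INSERT_ATOMIC makes bch_btree_insert_at() fail with -EINTR rather than proceed after the iterator had to drop its intent lock, and the caller simply re-reads and retries. The skeleton of the idiom, with stubs in place of the real btree API:

    #include <errno.h>

    extern int peek_current(void);      /* re-read under the iterator */
    extern void modify(void);           /* apply the update in memory */
    extern int insert_atomic(void);     /* may fail with -EINTR */

    static int read_modify_write(void)
    {
            int ret;

            do {
                    ret = peek_current();
                    if (ret)
                            break;
                    modify();
                    ret = insert_atomic();
            } while (ret == -EINTR);    /* lock was dropped: start over */

            return ret;
    }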
@@ -138,7 +157,7 @@ int bch_inc_nlink(struct cache_set *c, struct bch_inode_info *ei)
 
 int bch_dec_nlink(struct cache_set *c, struct bch_inode_info *ei)
 {
-       int ret;
+       int ret = 0;
 
        mutex_lock(&ei->update_lock);
        drop_nlink(&ei->vfs_inode);
@@ -152,9 +171,8 @@ static struct inode *bch_vfs_inode_get(struct super_block *sb, u64 inum)
 {
        struct cache_set *c = sb->s_fs_info;
        struct inode *inode;
+       struct bch_inode_unpacked inode_u;
        struct bch_inode_info *ei;
-       struct btree_iter iter;
-       struct bkey_s_c k;
        int ret;
 
        pr_debug("inum %llu", inum);
@@ -165,24 +183,19 @@ static struct inode *bch_vfs_inode_get(struct super_block *sb, u64 inum)
        if (!(inode->i_state & I_NEW))
                return inode;
 
-       bch_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(inum, 0));
-       k = bch_btree_iter_peek_with_holes(&iter);
-
-       if ((ret = btree_iter_err(k)) || k.k->type != BCH_INODE_FS) {
-               ret = bch_btree_iter_unlock(&iter);
+       ret = bch_inode_find_by_inum(c, inum, &inode_u);
+       if (ret) {
                iget_failed(inode);
-               return ERR_PTR(ret ?: -ENOENT);
+               return ERR_PTR(ret);
        }
 
        ei = to_bch_ei(inode);
-       bch_vfs_inode_init(ei, bkey_s_c_to_inode(k));
+       bch_vfs_inode_init(c, ei, &inode_u);
 
        ei->journal_seq = bch_inode_journal_seq(&c->journal, inum);
 
        unlock_new_inode(inode);
 
-       bch_btree_iter_unlock(&iter);
-
        return inode;
 }
 
@@ -193,7 +206,8 @@ static struct inode *bch_vfs_inode_create(struct cache_set *c,
        struct inode *inode;
        struct posix_acl *default_acl = NULL, *acl = NULL;
        struct bch_inode_info *ei;
-       struct bkey_i_inode bkey_inode;
+       struct bch_inode_unpacked inode_u;
+       struct bkey_inode_buf inode_p;
        int ret;
 
        inode = new_inode(parent->i_sb);
@@ -210,10 +224,11 @@ static struct inode *bch_vfs_inode_create(struct cache_set *c,
 
        ei = to_bch_ei(inode);
 
-       bch_inode_init(c, &bkey_inode, i_uid_read(inode),
+       bch_inode_init(c, &inode_u, i_uid_read(inode),
                       i_gid_read(inode), inode->i_mode, rdev);
+       bch_inode_pack(&inode_p, &inode_u);
 
-       ret = bch_inode_create(c, &bkey_inode.k_i,
+       ret = bch_inode_create(c, &inode_p.inode.k_i,
                               BLOCKDEV_INODE_MAX, 0,
                               &c->unused_inode_hint);
        if (unlikely(ret)) {
@@ -225,7 +240,8 @@ static struct inode *bch_vfs_inode_create(struct cache_set *c,
                goto err;
        }
 
-       bch_vfs_inode_init(ei, inode_i_to_s_c(&bkey_inode));
+       inode_u.inum = inode_p.inode.k.p.inode;
+       bch_vfs_inode_init(c, ei, &inode_u);
 
        if (default_acl) {
                ret = bch_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
@@ -266,7 +282,7 @@ static int bch_vfs_dirent_create(struct cache_set *c, struct inode *dir,
        if (unlikely(ret))
                return ret;
 
-       dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+       dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
        mark_inode_dirty_sync(dir);
        return 0;
 }
@@ -337,7 +353,7 @@ static int bch_link(struct dentry *old_dentry, struct inode *dir,
 
        lockdep_assert_held(&inode->i_rwsem);
 
-       inode->i_ctime = CURRENT_TIME;
+       inode->i_ctime = current_fs_time(dir->i_sb);
 
        ret = bch_inc_nlink(c, ei);
        if (ret)
@@ -382,12 +398,7 @@ static int bch_unlink(struct inode *dir, struct dentry *dentry)
                drop_nlink(inode);
        }
 
-       drop_nlink(inode);
-       if (inode->i_nlink) {
-               mutex_lock(&ei->update_lock);
-               ret = bch_write_inode(c, ei);
-               mutex_unlock(&ei->update_lock);
-       }
+       bch_dec_nlink(c, ei);
 
        return 0;
 }
@@ -473,7 +484,7 @@ static int bch_rename(struct inode *old_dir, struct dentry *old_dentry,
        struct inode *old_inode = old_dentry->d_inode;
        struct bch_inode_info *ei = to_bch_ei(old_inode);
        struct inode *new_inode = new_dentry->d_inode;
-       struct timespec now = CURRENT_TIME;
+       struct timespec now = current_fs_time(old_dir->i_sb);
        int ret;
 
        lockdep_assert_held(&old_dir->i_rwsem);
@@ -550,7 +561,7 @@ static int bch_rename_exchange(struct inode *old_dir, struct dentry *old_dentry,
        struct inode *old_inode = old_dentry->d_inode;
        struct inode *new_inode = new_dentry->d_inode;
        struct bch_inode_info *ei = to_bch_ei(old_inode);
-       struct timespec now = CURRENT_TIME;
+       struct timespec now = current_fs_time(old_dir->i_sb);
        int ret;
 
        ret = bch_dirent_rename(c,
@@ -783,14 +794,14 @@ static unsigned bch_inode_flags_to_user_flags(unsigned flags)
 }
 
 static int bch_inode_user_flags_set(struct bch_inode_info *ei,
-                                   struct bch_inode *bi,
+                                   struct bch_inode_unpacked *bi,
                                    void *p)
 {
        /*
         * We're relying on btree locking here for exclusion with other ioctl
         * calls - use the flags in the btree (@bi), not ei->i_flags:
         */
-       unsigned bch_flags = le32_to_cpu(bi->i_flags);
+       unsigned bch_flags = bi->i_flags;
        unsigned oldflags = bch_inode_flags_to_user_flags(bch_flags);
        unsigned newflags = *((unsigned *) p);
        unsigned i;
@@ -812,8 +823,8 @@ static int bch_inode_user_flags_set(struct bch_inode_info *ei,
        if (oldflags != newflags)
                return -EOPNOTSUPP;
 
-       bi->i_flags = cpu_to_le32(bch_flags);
-       ei->vfs_inode.i_ctime = CURRENT_TIME;
+       bi->i_flags = bch_flags;
+       ei->vfs_inode.i_ctime = current_fs_time(ei->vfs_inode.i_sb);
 
        return 0;
 }
@@ -1010,32 +1021,33 @@ static const struct address_space_operations bch_address_space_operations = {
        .error_remove_page = generic_error_remove_page,
 };
 
-static void bch_vfs_inode_init(struct bch_inode_info *ei,
-                              struct bkey_s_c_inode bkey_inode)
+static void bch_vfs_inode_init(struct cache_set *c,
+                              struct bch_inode_info *ei,
+                              struct bch_inode_unpacked *bi)
 {
        struct inode *inode = &ei->vfs_inode;
-       const struct bch_inode *bi = bkey_inode.v;
 
        pr_debug("init inode %llu with mode %o",
-                bkey_inode.k->p.inode, bi->i_mode);
-
-       ei->i_flags     = le32_to_cpu(bi->i_flags);
-       ei->i_size      = le64_to_cpu(bi->i_size);
-
-       inode->i_mode   = le16_to_cpu(bi->i_mode);
-       i_uid_write(inode, le32_to_cpu(bi->i_uid));
-       i_gid_write(inode, le32_to_cpu(bi->i_gid));
-
-       atomic64_set(&ei->i_sectors, le64_to_cpu(bi->i_sectors));
-       inode->i_blocks = atomic64_read(&ei->i_sectors);
-
-       inode->i_ino    = bkey_inode.k->p.inode;
-       set_nlink(inode, le32_to_cpu(bi->i_nlink));
-       inode->i_rdev   = le32_to_cpu(bi->i_dev);
-       inode->i_size   = le64_to_cpu(bi->i_size);
-       inode->i_atime  = ns_to_timespec(le64_to_cpu(bi->i_atime));
-       inode->i_mtime  = ns_to_timespec(le64_to_cpu(bi->i_mtime));
-       inode->i_ctime  = ns_to_timespec(le64_to_cpu(bi->i_ctime));
+                bi->inum, bi->i_mode);
+
+       ei->i_flags     = bi->i_flags;
+       ei->i_size      = bi->i_size;
+
+       inode->i_mode   = bi->i_mode;
+       i_uid_write(inode, bi->i_uid);
+       i_gid_write(inode, bi->i_gid);
+
+       atomic64_set(&ei->i_sectors, bi->i_sectors);
+       inode->i_blocks = bi->i_sectors;
+
+       inode->i_ino    = bi->inum;
+       set_nlink(inode, bi->i_nlink + nlink_bias(inode->i_mode));
+       inode->i_rdev   = bi->i_dev;
+       inode->i_generation = bi->i_generation;
+       inode->i_size   = bi->i_size;
+       inode->i_atime  = bch_time_to_timespec(c, bi->i_atime);
+       inode->i_mtime  = bch_time_to_timespec(c, bi->i_mtime);
+       inode->i_ctime  = bch_time_to_timespec(c, bi->i_ctime);
        bch_inode_flags_to_vfs(inode);
 
        ei->str_hash = bch_hash_info_init(bi);
@@ -1149,8 +1161,8 @@ static int bch_statfs(struct dentry *dentry, struct kstatfs *buf)
        buf->f_files    = atomic_long_read(&c->nr_inodes);
        buf->f_ffree    = U64_MAX;
 
-       fsid = le64_to_cpup((void *) c->disk_sb.user_uuid.b) ^
-              le64_to_cpup((void *) c->disk_sb.user_uuid.b + sizeof(u64));
+       fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
+              le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
        buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
        buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
        buf->f_namelen  = NAME_MAX;
@@ -1380,7 +1392,7 @@ static struct dentry *bch_mount(struct file_system_type *fs_type,
        sb->s_op                = &bch_super_operations;
        sb->s_xattr             = bch_xattr_handlers;
        sb->s_magic             = BCACHE_STATFS_MAGIC;
-       sb->s_time_gran         = 1;
+       sb->s_time_gran         = c->sb.time_precision;
        c->vfs_sb               = sb;
        sb->s_bdi               = &c->bdi;
 
index c98202417fdee592863918b8df419fd4248d95c6..aec6159b58ffdf158acd63b51af10de2da62a53a 100644 (file)
--- a/libbcache/fs.h
+++ b/libbcache/fs.h
@@ -34,9 +34,16 @@ static inline u8 mode_to_type(umode_t mode)
        return (mode >> 12) & 15;
 }
 
+static inline unsigned nlink_bias(umode_t mode)
+{
+       return S_ISDIR(mode) ? 2 : 1;
+}
+
+struct bch_inode_unpacked;
+
 /* returns 0 if we want to do the update, or error is passed up */
 typedef int (*inode_set_fn)(struct bch_inode_info *,
-                           struct bch_inode *, void *);
+                           struct bch_inode_unpacked *, void *);
 
 int __must_check __bch_write_inode(struct cache_set *, struct bch_inode_info *,
                                   inode_set_fn, void *);
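
The new on-disk i_nlink is biased: what gets stored is the link count above the minimum for the file type (2 for directories, for "." plus the entry in the parent; 1 for everything else). A freshly created inode therefore packs as zero, which the new variable-length encoding stores in a single byte, and bch_vfs_inode_init() adds the bias back on the way in. The round trip:

    #include <assert.h>
    #include <sys/stat.h>

    static unsigned nlink_bias(mode_t mode)
    {
            return S_ISDIR(mode) ? 2 : 1;
    }

    int main(void)
    {
            mode_t mode = S_IFDIR | 0755;
            unsigned vfs_nlink = 2;                          /* new empty dir */
            unsigned on_disk = vfs_nlink - nlink_bias(mode); /* stored as 0 */

            assert(on_disk + nlink_bias(mode) == vfs_nlink);
            return 0;
    }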
index 200deb0e60e13b33b434b05a82cf2ab877b6f797..b72a1c51ff9701f43635cfb864c2eb6a3e88ba53 100644 (file)
--- a/libbcache/inode.c
+++ b/libbcache/inode.c
 
 #include <linux/random.h>
 
-ssize_t bch_inode_status(char *buf, size_t len, const struct bkey *k)
+#include <asm/unaligned.h>
+
+#define FIELD_BYTES()                                          \
+
+static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
+static const u8 bits_table[8] = {
+       1  * 8 - 1,
+       2  * 8 - 2,
+       3  * 8 - 3,
+       4  * 8 - 4,
+       6  * 8 - 5,
+       8  * 8 - 6,
+       10 * 8 - 7,
+       13 * 8 - 8,
+};
+
+static int inode_encode_field(u8 *out, u8 *end, const u64 in[2])
 {
-       if (k->p.offset)
-               return scnprintf(buf, len, "offset nonzero: %llu", k->p.offset);
-
-       if (k->size)
-               return scnprintf(buf, len, "size nonzero: %u", k->size);
-
-       switch (k->type) {
-       case KEY_TYPE_DELETED:
-               return scnprintf(buf, len, "deleted");
-       case KEY_TYPE_DISCARD:
-               return scnprintf(buf, len, "discarded");
-       case KEY_TYPE_ERROR:
-               return scnprintf(buf, len, "error");
-       case KEY_TYPE_COOKIE:
-               return scnprintf(buf, len, "cookie");
+       unsigned bytes, bits, shift;
 
-       case BCH_INODE_FS:
-               if (bkey_val_bytes(k) != sizeof(struct bch_inode))
-                       return scnprintf(buf, len, "bad size: %zu",
-                                        bkey_val_bytes(k));
+       if (likely(!in[1]))
+               bits = fls64(in[0]);
+       else
+               bits = fls64(in[1]) + 64;
 
-               if (k->p.inode < BLOCKDEV_INODE_MAX)
-                       return scnprintf(buf, len,
-                                        "fs inode in blockdev range: %llu",
-                                        k->p.inode);
-               return 0;
+       for (shift = 1; shift <= 8; shift++)
+               if (bits < bits_table[shift - 1])
+                       goto got_shift;
 
-       case BCH_INODE_BLOCKDEV:
-               if (bkey_val_bytes(k) != sizeof(struct bch_inode_blockdev))
-                       return scnprintf(buf, len, "bad size: %zu",
-                                        bkey_val_bytes(k));
+       BUG();
+got_shift:
+       bytes = byte_table[shift - 1];
 
-               if (k->p.inode >= BLOCKDEV_INODE_MAX)
-                       return scnprintf(buf, len,
-                                        "blockdev inode in fs range: %llu",
-                                        k->p.inode);
-               return 0;
+       BUG_ON(out + bytes > end);
 
-       default:
-               return scnprintf(buf, len, "unknown inode type: %u", k->type);
+       if (likely(bytes <= 8)) {
+               u64 b = cpu_to_be64(in[0]);
+
+               memcpy(out, (void *) &b + 8 - bytes, bytes);
+       } else {
+               u64 b = cpu_to_be64(in[1]);
+
+               memcpy(out, (void *) &b + 16 - bytes, bytes);
+               put_unaligned_be64(in[0], out + bytes - 8);
+       }
+
+       *out |= (1 << 8) >> shift;
+
+       return bytes;
+}
+
+static int inode_decode_field(const u8 *in, const u8 *end,
+                             u64 out[2], unsigned *out_bits)
+{
+       unsigned bytes, bits, shift;
+
+       if (in >= end)
+               return -1;
+
+       if (!*in)
+               return -1;
+
+       /*
+        * position of highest set bit indicates number of bytes:
+        * shift = number of bits to remove in high byte:
+        */
+       shift   = 8 - __fls(*in); /* 1 <= shift <= 8 */
+       bytes   = byte_table[shift - 1];
+       bits    = bytes * 8 - shift;
+
+       if (in + bytes > end)
+               return -1;
+
+       /*
+        * we're assuming it's safe to deref up to 7 bytes < in; this will work
+        * because keys always start quite a bit more than 7 bytes after the
+        * start of the btree node header:
+        */
+       if (likely(bytes <= 8)) {
+               out[0] = get_unaligned_be64(in + bytes - 8);
+               out[0] <<= 64 - bits;
+               out[0] >>= 64 - bits;
+               out[1] = 0;
+       } else {
+               out[0] = get_unaligned_be64(in + bytes - 8);
+               out[1] = get_unaligned_be64(in + bytes - 16);
+               out[1] <<= 128 - bits;
+               out[1] >>= 128 - bits;
+       }
+
+       *out_bits = out[1] ? 64 + fls64(out[1]) : fls64(out[0]);
+       return bytes;
+}
+
+void bch_inode_pack(struct bkey_inode_buf *packed,
+                   const struct bch_inode_unpacked *inode)
+{
+       u8 *out = packed->inode.v.fields;
+       u8 *end = (void *) &packed[1];
+       u8 *last_nonzero_field = out;
+       u64 field[2];
+       unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
+
+       bkey_inode_init(&packed->inode.k_i);
+       packed->inode.k.p.inode         = inode->inum;
+       packed->inode.v.i_hash_seed     = inode->i_hash_seed;
+       packed->inode.v.i_flags         = cpu_to_le32(inode->i_flags);
+       packed->inode.v.i_mode          = cpu_to_le16(inode->i_mode);
+
+#define BCH_INODE_FIELD(_name, _bits)                                  \
+       field[0] = inode->_name;                                        \
+       field[1] = 0;                                                   \
+       out += inode_encode_field(out, end, field);                     \
+       nr_fields++;                                                    \
+                                                                       \
+       if (field[0] | field[1]) {                                      \
+               last_nonzero_field = out;                               \
+               last_nonzero_fieldnr = nr_fields;                       \
+       }
+
+       BCH_INODE_FIELDS()
+#undef  BCH_INODE_FIELD
+
+       out = last_nonzero_field;
+       nr_fields = last_nonzero_fieldnr;
+
+       set_bkey_val_bytes(&packed->inode.k, out - (u8 *) &packed->inode.v);
+       memset(out, 0,
+              (u8 *) &packed->inode.v +
+              bkey_val_bytes(&packed->inode.k) - out);
+
+       SET_INODE_NR_FIELDS(&packed->inode.v, nr_fields);
+
+       if (IS_ENABLED(CONFIG_BCACHE_DEBUG)) {
+               struct bch_inode_unpacked unpacked;
+
+               int ret = bch_inode_unpack(inode_i_to_s_c(&packed->inode),
+                                          &unpacked);
+               BUG_ON(ret);
+               BUG_ON(unpacked.inum            != inode->inum);
+               BUG_ON(unpacked.i_hash_seed     != inode->i_hash_seed);
+               BUG_ON(unpacked.i_mode          != inode->i_mode);
+
+#define BCH_INODE_FIELD(_name, _bits)  BUG_ON(unpacked._name != inode->_name);
+               BCH_INODE_FIELDS()
+#undef  BCH_INODE_FIELD
        }
 }
 
+int bch_inode_unpack(struct bkey_s_c_inode inode,
+                    struct bch_inode_unpacked *unpacked)
+{
+       const u8 *in = inode.v->fields;
+       const u8 *end = (void *) inode.v + bkey_val_bytes(inode.k);
+       u64 field[2];
+       unsigned fieldnr = 0, field_bits;
+       int ret;
+
+       unpacked->inum          = inode.k->p.inode;
+       unpacked->i_hash_seed   = inode.v->i_hash_seed;
+       unpacked->i_flags       = le32_to_cpu(inode.v->i_flags);
+       unpacked->i_mode        = le16_to_cpu(inode.v->i_mode);
+
+#define BCH_INODE_FIELD(_name, _bits)                                  \
+       if (fieldnr++ == INODE_NR_FIELDS(inode.v)) {                    \
+               memset(&unpacked->_name, 0,                             \
+                      sizeof(*unpacked) -                              \
+                      offsetof(struct bch_inode_unpacked, _name));     \
+               return 0;                                               \
+       }                                                               \
+                                                                       \
+       ret = inode_decode_field(in, end, field, &field_bits);          \
+       if (ret < 0)                                                    \
+               return ret;                                             \
+                                                                       \
+       if (field_bits > sizeof(unpacked->_name) * 8)                   \
+               return -1;                                              \
+                                                                       \
+       unpacked->_name = field[0];                                     \
+       in += ret;
+
+       BCH_INODE_FIELDS()
+#undef  BCH_INODE_FIELD
+
+       /* XXX: signal if there were more fields than expected? */
+
+       return 0;
+}
+
 static const char *bch_inode_invalid(const struct cache_set *c,
                                     struct bkey_s_c k)
 {
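
inode_encode_field() is a length-prefixed integer encoding: the position of the highest set bit in the first byte selects one of eight lengths from byte_table[], and the remaining bits are the big-endian payload. A standalone sketch of the same scheme, restricted to values that fit in 64 bits (the real code also carries a 128-bit high word):

    #include <assert.h>
    #include <stdint.h>

    static const uint8_t byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };

    static unsigned payload_bits(unsigned shift)
    {
            /* the length marker costs 'shift' bits in the high byte */
            return byte_table[shift - 1] * 8 - shift;
    }

    static int encode_field(uint8_t *out, uint64_t v)
    {
            unsigned bits = 64 - __builtin_clzll(v | 1);    /* ~ fls64() */
            unsigned shift, bytes, i;

            for (shift = 1; shift <= 8; shift++)
                    if (bits < payload_bits(shift))
                            break;
            bytes = byte_table[shift - 1];

            for (i = 0; i < bytes; i++)                     /* big endian */
                    out[i] = bytes - 1 - i < 8 ? v >> ((bytes - 1 - i) * 8) : 0;

            out[0] |= (1 << 8) >> shift;                    /* length marker */
            return bytes;
    }

    static int decode_field(const uint8_t *in, uint64_t *v)
    {
            unsigned shift = 8 - (31 - __builtin_clz(in[0])); /* 8 - __fls() */
            unsigned bytes = byte_table[shift - 1];
            unsigned bits = bytes * 8 - shift;
            uint64_t r = 0;
            unsigned i;

            for (i = 0; i < bytes; i++)
                    if (bytes - 1 - i < 8)          /* low 8 bytes only */
                            r = (r << 8) | in[i];
            if (bits < 64)
                    r &= ((uint64_t) 1 << bits) - 1;        /* strip marker */
            *v = r;
            return bytes;
    }

    int main(void)
    {
            uint64_t vals[] = { 0, 1, 127, 128, 123456789, UINT64_MAX }, v;
            uint8_t buf[13];

            for (unsigned i = 0; i < sizeof(vals) / sizeof(vals[0]); i++) {
                    int n = encode_field(buf, vals[i]);
                    assert(decode_field(buf, &v) == n && v == vals[i]);
            }
            return 0;
    }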
@@ -63,16 +207,20 @@ static const char *bch_inode_invalid(const struct cache_set *c,
        switch (k.k->type) {
        case BCH_INODE_FS: {
                struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
+               struct bch_inode_unpacked unpacked;
 
-               if (bkey_val_bytes(k.k) != sizeof(struct bch_inode))
+               if (bkey_val_bytes(k.k) < sizeof(struct bch_inode))
                        return "incorrect value size";
 
                if (k.k->p.inode < BLOCKDEV_INODE_MAX)
                        return "fs inode in blockdev range";
 
-               if (INODE_STR_HASH_TYPE(inode.v) >= BCH_STR_HASH_NR)
+               if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR)
                        return "invalid str hash type";
 
+               if (bch_inode_unpack(inode, &unpacked))
+                       return "invalid variable length fields";
+
                return NULL;
        }
        case BCH_INODE_BLOCKDEV:
@@ -92,12 +240,17 @@ static void bch_inode_to_text(struct cache_set *c, char *buf,
                              size_t size, struct bkey_s_c k)
 {
        struct bkey_s_c_inode inode;
+       struct bch_inode_unpacked unpacked;
 
        switch (k.k->type) {
        case BCH_INODE_FS:
                inode = bkey_s_c_to_inode(k);
+               if (bch_inode_unpack(inode, &unpacked)) {
+                       scnprintf(buf, size, "(unpack error)");
+                       break;
+               }
 
-               scnprintf(buf, size, "i_size %llu", inode.v->i_size);
+               scnprintf(buf, size, "i_size %llu", unpacked.i_size);
                break;
        }
 }
@@ -107,26 +260,25 @@ const struct bkey_ops bch_bkey_inode_ops = {
        .val_to_text    = bch_inode_to_text,
 };
 
-void bch_inode_init(struct cache_set *c, struct bkey_i_inode *inode,
+void bch_inode_init(struct cache_set *c, struct bch_inode_unpacked *inode_u,
                    uid_t uid, gid_t gid, umode_t mode, dev_t rdev)
 {
-       struct timespec ts = CURRENT_TIME;
-       s64 now = timespec_to_ns(&ts);
-       struct bch_inode *bi;
-
-       bi = &bkey_inode_init(&inode->k_i)->v;
-       bi->i_uid       = cpu_to_le32(uid);
-       bi->i_gid       = cpu_to_le32(gid);
-
-       bi->i_mode      = cpu_to_le16(mode);
-       bi->i_dev       = cpu_to_le32(rdev);
-       bi->i_atime     = cpu_to_le64(now);
-       bi->i_mtime     = cpu_to_le64(now);
-       bi->i_ctime     = cpu_to_le64(now);
-       bi->i_nlink     = cpu_to_le32(S_ISDIR(mode) ? 2 : 1);
-
-       get_random_bytes(&bi->i_hash_seed, sizeof(bi->i_hash_seed));
-       SET_INODE_STR_HASH_TYPE(bi, c->sb.str_hash_type);
+       s64 now = timespec_to_bch_time(c, CURRENT_TIME);
+
+       memset(inode_u, 0, sizeof(*inode_u));
+
+       /* ick */
+       inode_u->i_flags |= c->sb.str_hash_type << INODE_STR_HASH_OFFSET;
+       get_random_bytes(&inode_u->i_hash_seed, sizeof(inode_u->i_hash_seed));
+
+       inode_u->i_mode         = mode;
+       inode_u->i_uid          = uid;
+       inode_u->i_gid          = gid;
+       inode_u->i_dev          = rdev;
+       inode_u->i_atime        = now;
+       inode_u->i_mtime        = now;
+       inode_u->i_ctime        = now;
+       inode_u->i_otime        = now;
 }
 
 int bch_inode_create(struct cache_set *c, struct bkey_i *inode,
@@ -200,7 +352,7 @@ int bch_inode_truncate(struct cache_set *c, u64 inode_nr, u64 new_size,
                       struct extent_insert_hook *hook, u64 *journal_seq)
 {
        return bch_discard(c, POS(inode_nr, new_size), POS(inode_nr + 1, 0),
-                          0, NULL, hook, journal_seq);
+                          ZERO_VERSION, NULL, hook, journal_seq);
 }
 
 int bch_inode_rm(struct cache_set *c, u64 inode_nr)
@@ -215,7 +367,7 @@ int bch_inode_rm(struct cache_set *c, u64 inode_nr)
        ret = bch_btree_delete_range(c, BTREE_ID_XATTRS,
                                     POS(inode_nr, 0),
                                     POS(inode_nr + 1, 0),
-                                    0, NULL, NULL, NULL);
+                                    ZERO_VERSION, NULL, NULL, NULL);
        if (ret < 0)
                return ret;
 
@@ -230,7 +382,7 @@ int bch_inode_rm(struct cache_set *c, u64 inode_nr)
        ret = bch_btree_delete_range(c, BTREE_ID_DIRENTS,
                                     POS(inode_nr, 0),
                                     POS(inode_nr + 1, 0),
-                                    0, NULL, NULL, NULL);
+                                    ZERO_VERSION, NULL, NULL, NULL);
        if (ret < 0)
                return ret;
 
@@ -241,25 +393,19 @@ int bch_inode_rm(struct cache_set *c, u64 inode_nr)
                                NULL, NULL, BTREE_INSERT_NOFAIL);
 }
 
-int bch_inode_update(struct cache_set *c, struct bkey_i *inode,
-                    u64 *journal_seq)
-{
-       return bch_btree_update(c, BTREE_ID_INODES, inode, journal_seq);
-}
-
 int bch_inode_find_by_inum(struct cache_set *c, u64 inode_nr,
-                          struct bkey_i_inode *inode)
+                          struct bch_inode_unpacked *inode)
 {
        struct btree_iter iter;
        struct bkey_s_c k;
+       int ret = -ENOENT;
 
        for_each_btree_key_with_holes(&iter, c, BTREE_ID_INODES,
                                      POS(inode_nr, 0), k) {
                switch (k.k->type) {
                case BCH_INODE_FS:
-                       bkey_reassemble(&inode->k_i, k);
-                       bch_btree_iter_unlock(&iter);
-                       return 0;
+                       ret = bch_inode_unpack(bkey_s_c_to_inode(k), inode);
+                       break;
                default:
                        /* hole, not found */
                        break;
@@ -269,7 +415,7 @@ int bch_inode_find_by_inum(struct cache_set *c, u64 inode_nr,
 
        }
 
-       return bch_btree_iter_unlock(&iter) ?: -ENOENT;
+       return bch_btree_iter_unlock(&iter) ?: ret;
 }
 
 int bch_cached_dev_inode_find_by_uuid(struct cache_set *c, uuid_le *uuid,
index fa1a4cf9cb7be96b18b00462f5471f4073f96eb8..81dccf68357e6cf3a17b8e771a859e668f566f01 100644 (file)
--- a/libbcache/inode.h
+++ b/libbcache/inode.h
@@ -3,18 +3,53 @@
 
 extern const struct bkey_ops bch_bkey_inode_ops;
 
-ssize_t bch_inode_status(char *, size_t, const struct bkey *);
+struct bch_inode_unpacked {
+       u64                     inum;
+       __le64                  i_hash_seed;
+       u32                     i_flags;
+       u16                     i_mode;
 
-void bch_inode_init(struct cache_set *, struct bkey_i_inode *,
+#define BCH_INODE_FIELD(_name, _bits)  u##_bits _name;
+       BCH_INODE_FIELDS()
+#undef  BCH_INODE_FIELD
+};
+
+struct bkey_inode_buf {
+       struct bkey_i_inode     inode;
+
+#define BCH_INODE_FIELD(_name, _bits)          + 8 + _bits / 8
+       u8              _pad[0 + BCH_INODE_FIELDS()];
+#undef  BCH_INODE_FIELD
+} __packed;
+
+void bch_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *);
+int bch_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *);
+
+void bch_inode_init(struct cache_set *, struct bch_inode_unpacked *,
                    uid_t, gid_t, umode_t, dev_t);
 int bch_inode_create(struct cache_set *, struct bkey_i *, u64, u64, u64 *);
 int bch_inode_truncate(struct cache_set *, u64, u64,
                       struct extent_insert_hook *, u64 *);
 int bch_inode_rm(struct cache_set *, u64);
-int bch_inode_update(struct cache_set *, struct bkey_i *, u64 *);
 
-int bch_inode_find_by_inum(struct cache_set *, u64, struct bkey_i_inode *);
+int bch_inode_find_by_inum(struct cache_set *, u64,
+                          struct bch_inode_unpacked *);
 int bch_cached_dev_inode_find_by_uuid(struct cache_set *, uuid_le *,
                                      struct bkey_i_inode_blockdev *);
 
+static inline struct timespec bch_time_to_timespec(struct cache_set *c, u64 time)
+{
+       return ns_to_timespec(time * c->sb.time_precision + c->sb.time_base_lo);
+}
+
+static inline u64 timespec_to_bch_time(struct cache_set *c, struct timespec ts)
+{
+       s64 ns = timespec_to_ns(&ts) - c->sb.time_base_lo;
+
+       if (c->sb.time_precision == 1)
+               return ns;
+
+       return div_s64(ns, c->sb.time_precision);
+}
+
 #endif
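
BCH_INODE_FIELDS() is an X-macro: one list of (name, bits) pairs expanded three different ways, as the struct members above, as per-field pack/unpack code in inode.c, and as the worst-case size of the _pad[] buffer. The shape of the trick, with made-up fields:

    #include <stdint.h>

    #define MY_FIELDS()                 \
            MY_FIELD(i_size,  64)       \
            MY_FIELD(i_nlink, 32)

    /* expansion 1: struct members */
    struct my_unpacked {
    #define MY_FIELD(_name, _bits)      uint##_bits##_t _name;
            MY_FIELDS()
    #undef MY_FIELD
    };

    /* expansion 2: worst-case encoded size (cf. struct bkey_inode_buf) */
    enum {
    #define MY_FIELD(_name, _bits)      + 8 + _bits / 8
            MY_PACKED_MAX = 0 MY_FIELDS()
    #undef MY_FIELD
    };

    /* expansion 3: a statement per field (the real code encodes here) */
    static uint64_t my_sum(const struct my_unpacked *u)
    {
            uint64_t sum = 0;
    #define MY_FIELD(_name, _bits)      sum += u->_name;
            MY_FIELDS()
    #undef MY_FIELD
            return sum;
    }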
index 4112ea50c4723b0aabb56ccc7564ca5d2a3791f2..2f0e48a0c5cc951e62d1e5bb4a22aa9e3b1e869c 100644 (file)
--- a/libbcache/io.c
+++ b/libbcache/io.c
@@ -22,7 +22,7 @@
 #include "move.h"
 #include "notify.h"
 #include "stats.h"
-#include "super.h"
+#include "super-io.h"
 
 #include <linux/blkdev.h>
 #include <linux/random.h>
@@ -382,11 +382,27 @@ static void bch_write_endio(struct bio *bio)
                closure_put(cl);
 }
 
+static struct nonce extent_nonce(struct bversion version,
+                                unsigned nonce,
+                                unsigned uncompressed_size,
+                                unsigned compression_type)
+{
+       return (struct nonce) {{
+               [0] = cpu_to_le32((nonce                << 12) |
+                                 (uncompressed_size    << 22)),
+               [1] = cpu_to_le32(version.lo),
+               [2] = cpu_to_le32(version.lo >> 32),
+               [3] = cpu_to_le32(version.hi|
+                                 (compression_type << 24))^BCH_NONCE_EXTENT,
+       }};
+}
+
 static void init_append_extent(struct bch_write_op *op,
                               unsigned compressed_size,
                               unsigned uncompressed_size,
                               unsigned compression_type,
-                              u64 csum, unsigned csum_type,
+                              unsigned nonce,
+                              struct bch_csum csum, unsigned csum_type,
                               struct open_bucket *ob)
 {
        struct bkey_i_extent *e = bkey_extent_init(op->insert_keys.top);
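
extent_nonce() packs the key version, a per-extent nonce and the extent's size/compression parameters into the 128-bit cipher nonce, with BCH_NONCE_EXTENT as a domain tag. Uniqueness is the whole point: with a stream cipher, encrypting two extents under the same (key, nonce) pair hands an attacker the XOR of the plaintexts. A toy demonstration (the keystream below is a deliberately fake stand-in, not ChaCha20):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    static void toy_keystream(uint8_t *ks, size_t n, uint32_t nonce)
    {
            uint32_t x = 0x9e3779b9 ^ nonce;        /* NOT a real cipher */
            for (size_t i = 0; i < n; i++)
                    ks[i] = (uint8_t) (x = x * 1664525 + 1013904223);
    }

    int main(void)
    {
            uint8_t p1[8] = "SECRET1", p2[8] = "SECRET2";
            uint8_t ks[8], c1[8], c2[8];

            toy_keystream(ks, 8, 42);               /* same nonce twice: bad */
            for (int i = 0; i < 8; i++) {
                    c1[i] = p1[i] ^ ks[i];
                    c2[i] = p2[i] ^ ks[i];
                    /* XOR of ciphertexts: the keystream cancels out */
                    assert((c1[i] ^ c2[i]) == (p1[i] ^ p2[i]));
            }
            return 0;
    }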
@@ -394,11 +410,13 @@ static void init_append_extent(struct bch_write_op *op,
        op->pos.offset += uncompressed_size;
        e->k.p = op->pos;
        e->k.size = uncompressed_size;
+       e->k.version = op->version;
+       bkey_extent_set_cached(&e->k, op->flags & BCH_WRITE_CACHED);
 
        bch_extent_crc_append(e, compressed_size,
                              uncompressed_size,
                              compression_type,
-                             csum, csum_type);
+                             nonce, csum, csum_type);
 
        bch_alloc_sectors_append_ptrs(op->c, e, op->nr_replicas,
                                      ob, compressed_size);
@@ -417,7 +435,7 @@ static int bch_write_extent(struct bch_write_op *op,
        unsigned key_to_write_offset = op->insert_keys.top_p -
                op->insert_keys.keys_p;
        struct bkey_i *key_to_write;
-       unsigned csum_type = c->opts.data_checksum;
+       unsigned csum_type = op->csum_type;
        unsigned compression_type = op->compression_type;
        int ret;
 
@@ -426,8 +444,8 @@ static int bch_write_extent(struct bch_write_op *op,
 
        /* Need to decompress data? */
        if ((op->flags & BCH_WRITE_DATA_COMPRESSED) &&
-           (op->crc.uncompressed_size != op->size ||
-            op->crc.compressed_size > ob->sectors_free)) {
+           (crc_uncompressed_size(NULL, &op->crc) != op->size ||
+            crc_compressed_size(NULL, &op->crc) > ob->sectors_free)) {
                int ret;
 
                ret = bch_bio_uncompress_inplace(c, orig, op->size, op->crc);
@@ -439,9 +457,10 @@ static int bch_write_extent(struct bch_write_op *op,
 
        if (op->flags & BCH_WRITE_DATA_COMPRESSED) {
                init_append_extent(op,
-                                  op->crc.compressed_size,
-                                  op->crc.uncompressed_size,
+                                  crc_compressed_size(NULL, &op->crc),
+                                  crc_uncompressed_size(NULL, &op->crc),
                                   op->crc.compression_type,
+                                  op->crc.nonce,
                                   op->crc.csum,
                                   op->crc.csum_type,
                                   ob);
@@ -457,7 +476,10 @@ static int bch_write_extent(struct bch_write_op *op,
                /* all units here in bytes */
                unsigned total_output = 0, output_available =
                        min(ob->sectors_free << 9, orig->bi_iter.bi_size);
-               u64 csum;
+               unsigned crc_nonce = bch_csum_type_is_encryption(csum_type)
+                       ? op->nonce : 0;
+               struct bch_csum csum;
+               struct nonce nonce;
 
                bio = bio_alloc_bioset(GFP_NOIO,
                                       DIV_ROUND_UP(output_available, PAGE_SIZE),
@@ -489,13 +511,20 @@ static int bch_write_extent(struct bch_write_op *op,
                        BUG_ON(src_len & (block_bytes(c) - 1));
 
                        swap(bio->bi_iter.bi_size, dst_len);
-                       csum = bch_checksum_bio(bio, csum_type);
+                       nonce = extent_nonce(op->version,
+                                            crc_nonce,
+                                            src_len >> 9,
+                                            compression_type),
+
+                       bch_encrypt_bio(c, csum_type, nonce, bio);
+
+                       csum = bch_checksum_bio(c, csum_type, nonce, bio);
                        swap(bio->bi_iter.bi_size, dst_len);
 
                        init_append_extent(op,
                                           dst_len >> 9, src_len >> 9,
                                           fragment_compression_type,
-                                          csum, csum_type, ob);
+                                          crc_nonce, csum, csum_type, ob);
 
                        total_output += dst_len;
                        bio_advance(bio, dst_len);
@@ -531,7 +560,8 @@ static int bch_write_extent(struct bch_write_op *op,
                wbio->put_bio           = bio != orig;
 
                init_append_extent(op, bio_sectors(bio), bio_sectors(bio),
-                                  compression_type, 0, csum_type, ob);
+                                  compression_type, 0,
+                                  (struct bch_csum) { 0 }, csum_type, ob);
 
                ret = bio != orig;
        }
@@ -546,8 +576,7 @@ static int bch_write_extent(struct bch_write_op *op,
 
        key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
 
-       if (!(op->flags & BCH_WRITE_CACHED))
-               bch_check_mark_super(c, key_to_write, false);
+       bch_check_mark_super(c, key_to_write, false);
 
 #ifndef CONFIG_BCACHE_NO_IO
        bch_submit_wbio_replicas(to_wbio(bio), c, key_to_write, false);
@@ -748,6 +777,11 @@ void bch_write(struct closure *cl)
                closure_return(cl);
        }
 
+       if (bversion_zero(op->version) &&
+           bch_csum_type_is_encryption(op->csum_type))
+               op->version.lo =
+                       atomic64_inc_return(&c->key_version) + 1;
+
        if (!(op->flags & BCH_WRITE_DISCARD))
                bch_increment_clock(c, bio_sectors(bio), WRITE);
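
The version assignment above gives every encrypted write that doesn't already carry a version a fresh value from a filesystem-wide counter; since the version feeds extent_nonce(), this is what keeps nonces from repeating. The counter is a plain atomic increment, sketched here with C11 atomics standing in for the kernel's atomic64_inc_return():

    #include <stdatomic.h>
    #include <stdint.h>

    static _Atomic uint64_t key_version;

    static uint64_t alloc_version(void)
    {
            /* fetch_add returns the old value; +1 gives inc_return semantics */
            return atomic_fetch_add(&key_version, 1) + 1;
    }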
 
@@ -804,17 +838,21 @@ void bch_write_op_init(struct bch_write_op *op, struct cache_set *c,
                       struct write_point *wp, struct bpos pos,
                       u64 *journal_seq, unsigned flags)
 {
+       EBUG_ON(res.sectors && !res.nr_replicas);
+
        op->c           = c;
        op->io_wq       = index_update_wq(op);
        op->bio         = bio;
        op->written     = 0;
        op->error       = 0;
        op->flags       = flags;
+       op->csum_type   = bch_data_checksum_type(c);
        op->compression_type = c->opts.compression;
        op->nr_replicas = res.nr_replicas;
        op->alloc_reserve = RESERVE_NONE;
+       op->nonce       = 0;
        op->pos         = pos;
-       op->version     = 0;
+       op->version     = ZERO_VERSION;
        op->res         = res;
        op->wp          = wp;
 
@@ -853,7 +891,7 @@ void bch_write_op_init(struct bch_write_op *op, struct cache_set *c,
  *     appropriately inode_truncate should call this
  */
 int bch_discard(struct cache_set *c, struct bpos start,
-               struct bpos end, u64 version,
+               struct bpos end, struct bversion version,
                struct disk_reservation *disk_res,
                struct extent_insert_hook *hook,
                u64 *journal_seq)
@@ -878,7 +916,11 @@ static int bio_checksum_uncompress(struct cache_set *c,
        struct bio *src = &rbio->bio;
        struct bio *dst = &bch_rbio_parent(rbio)->bio;
        struct bvec_iter dst_iter = rbio->parent_iter;
-       u64 csum;
+       struct nonce nonce = extent_nonce(rbio->version,
+                               rbio->crc.nonce,
+                               crc_uncompressed_size(NULL, &rbio->crc),
+                               rbio->crc.compression_type);
+       struct bch_csum csum;
        int ret = 0;
 
        /*
@@ -888,18 +930,19 @@ static int bio_checksum_uncompress(struct cache_set *c,
         * in order to promote
         */
        if (rbio->bounce) {
-               src->bi_iter.bi_size            = rbio->crc.compressed_size << 9;
-               src->bi_iter.bi_idx             = 0;
-               src->bi_iter.bi_bvec_done       = 0;
+               src->bi_iter.bi_size    = crc_compressed_size(NULL, &rbio->crc) << 9;
+               src->bi_iter.bi_idx     = 0;
+               src->bi_iter.bi_bvec_done = 0;
        } else {
                src->bi_iter = rbio->parent_iter;
        }
 
-       csum = bch_checksum_bio(src, rbio->crc.csum_type);
-       if (cache_nonfatal_io_err_on(rbio->crc.csum != csum, rbio->ca,
-                       "data checksum error, inode %llu offset %llu: expected %0llx got %0llx (type %u)",
+       csum = bch_checksum_bio(c, rbio->crc.csum_type, nonce, src);
+       if (cache_nonfatal_io_err_on(bch_crc_cmp(rbio->crc.csum, csum), rbio->ca,
+                       "data checksum error, inode %llu offset %llu: expected %0llx%0llx got %0llx%0llx (type %u)",
                        rbio->inode, (u64) rbio->parent_iter.bi_sector << 9,
-                       rbio->crc.csum, csum, rbio->crc.csum_type))
+                       rbio->crc.csum.hi, rbio->crc.csum.lo, csum.hi, csum.lo,
+                       rbio->crc.csum_type))
                ret = -EIO;
 
        /*
@@ -908,6 +951,7 @@ static int bio_checksum_uncompress(struct cache_set *c,
         */
        if (rbio->crc.compression_type != BCH_COMPRESSION_NONE) {
                if (!ret) {
+                       bch_encrypt_bio(c, rbio->crc.csum_type, nonce, src);
                        ret = bch_bio_uncompress(c, src, dst,
                                                 dst_iter, rbio->crc);
                        if (ret)
@@ -915,8 +959,20 @@ static int bio_checksum_uncompress(struct cache_set *c,
                }
        } else if (rbio->bounce) {
                bio_advance(src, rbio->crc.offset << 9);
+
+               /* don't need to decrypt the entire bio: */
+               BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
+               src->bi_iter.bi_size = dst_iter.bi_size;
+
+               nonce = nonce_add(nonce, rbio->crc.offset << 9);
+
+               bch_encrypt_bio(c, rbio->crc.csum_type,
+                               nonce, src);
+
                bio_copy_data_iter(dst, dst_iter,
                                   src, src->bi_iter);
+       } else {
+               bch_encrypt_bio(c, rbio->crc.csum_type, nonce, src);
        }
 
        return ret;
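
bio_checksum_uncompress() keeps a strict order: the checksum covers the data exactly as it lies on disk (encrypted, possibly compressed), so it is verified first; only then is the bio decrypted (bch_encrypt_bio() is its own inverse, being a stream-cipher XOR) and finally decompressed. The shape of that logic, with stub helpers:

    extern int verify_csum(const void *buf, unsigned len);
    extern void decrypt(void *buf, unsigned len);
    extern int decompress(void *buf, unsigned len, void *out);

    static int read_finish(void *buf, unsigned len, void *out, int compressed)
    {
            if (verify_csum(buf, len))
                    return -1;      /* corrupt: don't decrypt garbage */
            decrypt(buf, len);      /* no-op when the csum type isn't encryption */
            return compressed ? decompress(buf, len, out) : 0;
    }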
@@ -1108,7 +1164,7 @@ void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig,
                 */
                unsigned sectors =
                        max_t(unsigned, k.k->size,
-                             pick->crc.uncompressed_size);
+                             crc_uncompressed_size(NULL, &pick->crc));
                unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
 
                promote_op = kmalloc(sizeof(*promote_op) +
@@ -1130,7 +1186,7 @@ void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig,
         */
        if (pick->crc.compression_type != BCH_COMPRESSION_NONE ||
            (pick->crc.csum_type != BCH_CSUM_NONE &&
-            (bvec_iter_sectors(iter) != pick->crc.uncompressed_size ||
+            (bvec_iter_sectors(iter) != crc_uncompressed_size(NULL, &pick->crc) ||
              (flags & BCH_READ_FORCE_BOUNCE)))) {
                read_full = true;
                bounce = true;
@@ -1138,7 +1194,7 @@ void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig,
 
        if (bounce) {
                unsigned sectors = read_full
-                       ? (pick->crc.compressed_size ?: k.k->size)
+                       ? (crc_compressed_size(NULL, &pick->crc) ?: k.k->size)
                        : bvec_iter_sectors(iter);
 
                rbio = container_of(bio_alloc_bioset(GFP_NOIO,
@@ -1183,6 +1239,7 @@ void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig,
        rbio->flags             = flags;
        rbio->bounce            = bounce;
        rbio->split             = split;
+       rbio->version           = k.k->version;
        rbio->crc               = pick->crc;
        /*
         * crc.compressed_size will be 0 if there wasn't any checksum
@@ -1190,7 +1247,7 @@ void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig,
         * bounced (which isn't necessarily the original key size, if we bounced
         * only for promoting)
         */
-       rbio->crc.compressed_size = bio_sectors(&rbio->bio);
+       rbio->crc._compressed_size = bio_sectors(&rbio->bio) - 1;
        rbio->ptr               = pick->ptr;
        rbio->ca                = pick->ca;
        rbio->promote           = promote_op;
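
The underscore and the "- 1" above suggest that sizes in bch_extent_crc128 are stored biased by one, so an N-bit field covers 1..2^N sectors rather than wasting a code point on zero, with the crc_compressed_size()/crc_uncompressed_size() accessors adding the bias back. A sketch of the idea (the field width here is made up):

    #include <assert.h>

    struct crc_sketch {
            unsigned _compressed_size:9;    /* stores size - 1 */
    };

    static unsigned crc_size(const struct crc_sketch *crc)
    {
            return crc->_compressed_size + 1;
    }

    int main(void)
    {
            struct crc_sketch crc = { ._compressed_size = 512 - 1 };

            assert(crc_size(&crc) == 512);  /* 9 bits reach 512, not just 511 */
            return 0;
    }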
@@ -1210,7 +1267,8 @@ void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig,
                bch_migrate_write_init(c, &promote_op->write,
                                       &c->promote_write_point,
                                       k, NULL,
-                                      BCH_WRITE_ALLOC_NOWAIT);
+                                      BCH_WRITE_ALLOC_NOWAIT|
+                                      BCH_WRITE_CACHED);
                promote_op->write.promote = true;
 
                if (rbio->crc.compression_type) {
index b7668b4e71947124ee521ec98615109f9da95cf7..99e51089580e01d67567f4c7ddc869e0af06befe 100644 (file)
--- a/libbcache/io.h
+++ b/libbcache/io.h
@@ -79,7 +79,7 @@ void bch_submit_wbio_replicas(struct bch_write_bio *, struct cache_set *,
                              const struct bkey_i *, bool);
 
 int bch_discard(struct cache_set *, struct bpos, struct bpos,
-               u64, struct disk_reservation *,
+               struct bversion, struct disk_reservation *,
                struct extent_insert_hook *, u64 *);
 
 void bch_read_retry_work(struct work_struct *);
index f7d99cdb7c6537d9a7620fd635409dcc78175c53..64269d9437b8a10cef715e3d18ac8ed35241bf2e 100644 (file)
--- a/libbcache/io_types.h
+++ b/libbcache/io_types.h
@@ -43,7 +43,8 @@ struct bch_read_bio {
        u8                      bounce:1,
                                split:1;
 
-       struct bch_extent_crc64 crc;
+       struct bversion         version;
+       struct bch_extent_crc128 crc;
        struct bch_extent_ptr   ptr;
        struct cache            *ca;
 
@@ -101,15 +102,17 @@ struct bch_write_op {
        short                   error;
 
        u16                     flags;
+       unsigned                csum_type:4;
        unsigned                compression_type:4;
        unsigned                nr_replicas:4;
        unsigned                alloc_reserve:4;
+       unsigned                nonce:14;
 
        struct bpos             pos;
-       unsigned                version;
+       struct bversion         version;
 
        /* For BCH_WRITE_DATA_COMPRESSED: */
-       struct bch_extent_crc64 crc;
+       struct bch_extent_crc128 crc;
        unsigned                size;
 
        struct disk_reservation res;
index 9e09b86df1df8cdf66d27deeb3950f7c1f15802a..3bb9e3c38763107094abd427b80c12f7ea9ddc90 100644 (file)
--- a/libbcache/journal.c
+++ b/libbcache/journal.c
@@ -18,7 +18,8 @@
 #include "io.h"
 #include "keylist.h"
 #include "journal.h"
-#include "super.h"
+#include "super-io.h"
+#include "vstructs.h"
 
 #include <trace/events/bcache.h>
 
@@ -52,19 +53,14 @@ static inline u64 journal_pin_seq(struct journal *j,
        return last_seq(j) + fifo_entry_idx(&j->pin, pin_list);
 }
 
-#define for_each_jset_entry(entry, jset)                               \
-       for (entry = (jset)->start;                                     \
-            entry < bkey_idx(jset, le32_to_cpu((jset)->u64s));         \
-            entry = jset_keys_next(entry))
-
 static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
                                        struct jset_entry *entry, unsigned type)
 {
-       while (entry < bkey_idx(jset, le32_to_cpu(jset->u64s))) {
+       while (entry < vstruct_last(jset)) {
                if (JOURNAL_ENTRY_TYPE(entry) == type)
                        return entry;
 
-               entry = jset_keys_next(entry);
+               entry = vstruct_next(entry);
        }
 
        return NULL;
@@ -73,14 +69,11 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
 #define for_each_jset_entry_type(entry, jset, type)                    \
        for (entry = (jset)->start;                                     \
             (entry = __jset_entry_type_next(jset, entry, type));       \
-            entry = jset_keys_next(entry))
+            entry = vstruct_next(entry))
 
 #define for_each_jset_key(k, _n, entry, jset)                          \
        for_each_jset_entry_type(entry, jset, JOURNAL_ENTRY_BTREE_KEYS) \
-               for (k = (entry)->start;                        \
-                    (k < bkey_idx(entry, le16_to_cpu((entry)->u64s)) &&\
-                     (_n = bkey_next(k), 1));                  \
-                    k = _n)
+               vstruct_for_each_safe(entry, k, _n)
 
 static inline void bch_journal_add_entry(struct journal_buf *buf,
                                         const void *data, size_t u64s,
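
The jset iteration is rewritten in terms of the new vstructs.h helpers: a "vstruct" is any header that states the length of its trailing u64 payload, so walking entries packed back to back in a buffer is just pointer arithmetic on the u64s field. A minimal sketch of the idea (the header layout here is illustrative):

    #include <stdint.h>

    struct vs {
            uint16_t u64s;          /* length of data[], in 8-byte units */
            uint8_t  type;
            uint8_t  pad[5];
            uint64_t data[];
    };

    static inline struct vs *vs_next(struct vs *v)
    {
            return (struct vs *) &v->data[v->u64s];
    }

    /* walk entries in [v, end):
     *      for (; (void *) v < end; v = vs_next(v))
     *              handle(v);
     */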
@@ -199,8 +192,6 @@ redo_peek:
 
        closure_sync(&cl);
 
-       mutex_lock(&c->btree_interior_update_lock);
-
        for (i = 0;; i++) {
                struct btree_interior_update *as;
                struct pending_btree_node_free *d;
@@ -212,6 +203,8 @@ redo_peek:
                }
                n = bl->entries[i];
                mutex_unlock(&j->blacklist_lock);
+redo_wait:
+               mutex_lock(&c->btree_interior_update_lock);
 
                /*
                 * Is the node on the list of pending interior node updates -
@@ -225,11 +218,11 @@ redo_peek:
                                closure_wait(&as->wait, &cl);
                                mutex_unlock(&c->btree_interior_update_lock);
                                closure_sync(&cl);
-                               break;
+                               goto redo_wait;
                        }
-       }
 
-       mutex_unlock(&c->btree_interior_update_lock);
+               mutex_unlock(&c->btree_interior_update_lock);
+       }
 
        mutex_lock(&j->blacklist_lock);
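
The reworked loop above fixes the wait path: after closure_sync() returns, the node may already be part of another interior update, so the code now re-takes btree_interior_update_lock and re-scans (goto redo_wait) instead of breaking out after a single wait. It is the classic condition-wait shape, sketched with pthreads (busy() and wait_for_change() are stand-ins):

    #include <pthread.h>

    extern int busy(void);
    extern void wait_for_change(void);  /* closure_wait + closure_sync analogue */

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    static void wait_until_idle(void)
    {
            pthread_mutex_lock(&lock);
            while (busy()) {                /* re-check after every wakeup */
                    pthread_mutex_unlock(&lock);
                    wait_for_change();      /* must not sleep holding the lock */
                    pthread_mutex_lock(&lock);
            }
            pthread_mutex_unlock(&lock);
    }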
 
@@ -377,7 +370,6 @@ out:
 struct journal_list {
        struct closure          cl;
        struct mutex            lock;
-       struct mutex            cache_set_buffer_lock;
        struct list_head        *head;
        int                     ret;
 };
@@ -394,7 +386,7 @@ static int journal_entry_add(struct cache_set *c, struct journal_list *jlist,
 {
        struct journal_replay *i, *pos;
        struct list_head *where;
-       size_t bytes = __set_bytes(j, le32_to_cpu(j->u64s));
+       size_t bytes = vstruct_bytes(j);
        __le64 last_seq;
        int ret;
 
@@ -422,8 +414,7 @@ static int journal_entry_add(struct cache_set *c, struct journal_list *jlist,
        list_for_each_entry_reverse(i, jlist->head, list) {
                /* Duplicate? */
                if (le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
-                       fsck_err_on(bytes != __set_bytes(&i->j,
-                                               le32_to_cpu(i->j.u64s)) ||
+                       fsck_err_on(bytes != vstruct_bytes(&i->j) ||
                                    memcmp(j, &i->j, bytes), c,
                                    "found duplicate but non identical journal entries (seq %llu)",
                                    le64_to_cpu(j->seq));
@@ -455,11 +446,21 @@ fsck_err:
        return ret;
 }
 
+static struct nonce journal_nonce(const struct jset *jset)
+{
+       return (struct nonce) {{
+               [0] = 0,
+               [1] = ((__le32 *) &jset->seq)[0],
+               [2] = ((__le32 *) &jset->seq)[1],
+               [3] = BCH_NONCE_JOURNAL,
+       }};
+}
+
 static void journal_entry_null_range(void *start, void *end)
 {
        struct jset_entry *entry;
 
-       for (entry = start; entry != end; entry = jset_keys_next(entry)) {
+       for (entry = start; entry != end; entry = vstruct_next(entry)) {
                entry->u64s     = 0;
                entry->btree_id = 0;
                entry->level    = 0;
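
journal_nonce() above derives the nonce from the jset's 64-bit sequence number plus a domain tag: sequence numbers never repeat, and the BCH_NONCE_JOURNAL tag keeps journal nonces disjoint from extent nonces under the same key. The layout, sketched (the constants here are illustrative, not the real values):

    #include <stdint.h>

    #define NONCE_EXTENT  0x10000000u       /* illustrative values only */
    #define NONCE_JOURNAL 0x20000000u

    struct nonce128 { uint32_t d[4]; };

    static struct nonce128 make_journal_nonce(uint64_t seq)
    {
            return (struct nonce128) {{
                    0,                      /* unused word */
                    (uint32_t) seq,         /* low half of the sequence number */
                    (uint32_t) (seq >> 32), /* high half */
                    NONCE_JOURNAL,          /* domain separation tag */
            }};
    }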
@@ -473,7 +474,7 @@ static int journal_validate_key(struct cache_set *c, struct jset *j,
                                struct bkey_i *k, enum bkey_type key_type,
                                const char *type)
 {
-       void *next = jset_keys_next(entry);
+       void *next = vstruct_next(entry);
        const char *invalid;
        char buf[160];
        int ret = 0;
@@ -481,16 +482,16 @@ static int journal_validate_key(struct cache_set *c, struct jset *j,
        if (mustfix_fsck_err_on(!k->k.u64s, c,
                        "invalid %s in journal: k->u64s 0", type)) {
                entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
-               journal_entry_null_range(jset_keys_next(entry), next);
+               journal_entry_null_range(vstruct_next(entry), next);
                return 0;
        }
 
        if (mustfix_fsck_err_on((void *) bkey_next(k) >
-                       (void *) jset_keys_next(entry), c,
+                               (void *) vstruct_next(entry), c,
                        "invalid %s in journal: extends past end of journal entry",
                        type)) {
                entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
-               journal_entry_null_range(jset_keys_next(entry), next);
+               journal_entry_null_range(vstruct_next(entry), next);
                return 0;
        }
 
@@ -499,7 +500,7 @@ static int journal_validate_key(struct cache_set *c, struct jset *j,
                        type, k->k.format)) {
                le16_add_cpu(&entry->u64s, -k->k.u64s);
                memmove(k, bkey_next(k), next - (void *) bkey_next(k));
-               journal_entry_null_range(jset_keys_next(entry), next);
+               journal_entry_null_range(vstruct_next(entry), next);
                return 0;
        }
 
@@ -514,7 +515,7 @@ static int journal_validate_key(struct cache_set *c, struct jset *j,
 
                le16_add_cpu(&entry->u64s, -k->k.u64s);
                memmove(k, bkey_next(k), next - (void *) bkey_next(k));
-               journal_entry_null_range(jset_keys_next(entry), next);
+               journal_entry_null_range(vstruct_next(entry), next);
                return 0;
        }
 fsck_err:
@@ -525,16 +526,17 @@ fsck_err:
 #define JOURNAL_ENTRY_NONE     6
 #define JOURNAL_ENTRY_BAD      7
 
-static int journal_entry_validate(struct cache_set *c, struct jset *j, u64 sector,
+static int journal_entry_validate(struct cache_set *c,
+                                 struct jset *j, u64 sector,
                                  unsigned bucket_sectors_left,
                                  unsigned sectors_read)
 {
        struct jset_entry *entry;
-       size_t bytes = __set_bytes(j, le32_to_cpu(j->u64s));
-       u64 got, expect;
+       size_t bytes = vstruct_bytes(j);
+       struct bch_csum csum;
        int ret = 0;
 
-       if (le64_to_cpu(j->magic) != jset_magic(&c->disk_sb))
+       if (le64_to_cpu(j->magic) != jset_magic(c))
                return JOURNAL_ENTRY_NONE;
 
        if (le32_to_cpu(j->version) != BCACHE_JSET_VERSION) {
@@ -554,25 +556,32 @@ static int journal_entry_validate(struct cache_set *c, struct jset *j, u64 secto
        if (bytes > sectors_read << 9)
                return JOURNAL_ENTRY_REREAD;
 
-       got = le64_to_cpu(j->csum);
-       expect = __csum_set(j, le32_to_cpu(j->u64s), JSET_CSUM_TYPE(j));
-       if (mustfix_fsck_err_on(got != expect, c,
-                       "journal checksum bad (got %llu expect %llu), sector %lluu",
-                       got, expect, sector)) {
+       if (fsck_err_on(!bch_checksum_type_valid(c, JSET_CSUM_TYPE(j)), c,
+                       "journal entry with unknown csum type %llu sector %lluu",
+                       JSET_CSUM_TYPE(j), sector))
+               return JOURNAL_ENTRY_BAD;
+
+       csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j);
+       if (mustfix_fsck_err_on(bch_crc_cmp(csum, j->csum), c,
+                       "journal checksum bad, sector %llu", sector)) {
                /* XXX: retry IO, when we start retrying checksum errors */
                /* XXX: note we might have missing journal entries */
                return JOURNAL_ENTRY_BAD;
        }
 
-       if (mustfix_fsck_err_on(le64_to_cpu(j->last_seq) > le64_to_cpu(j->seq),
-                       c, "invalid journal entry: last_seq > seq"))
+       bch_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
+                   j->encrypted_start,
+                   vstruct_end(j) - (void *) j->encrypted_start);
+
+       if (mustfix_fsck_err_on(le64_to_cpu(j->last_seq) > le64_to_cpu(j->seq), c,
+                       "invalid journal entry: last_seq > seq"))
                j->last_seq = j->seq;
 
-       for_each_jset_entry(entry, j) {
+       vstruct_for_each(j, entry) {
                struct bkey_i *k;
 
-               if (mustfix_fsck_err_on(jset_keys_next(entry) >
-                               bkey_idx(j, le32_to_cpu(j->u64s)), c,
+               if (mustfix_fsck_err_on(vstruct_next(entry) >
+                                       vstruct_last(j), c,
                                "journal entry extents past end of jset")) {
                        j->u64s = cpu_to_le32((u64 *) entry - j->_data);
                        break;
@@ -580,9 +589,7 @@ static int journal_entry_validate(struct cache_set *c, struct jset *j, u64 secto
 
                switch (JOURNAL_ENTRY_TYPE(entry)) {
                case JOURNAL_ENTRY_BTREE_KEYS:
-                       for (k = entry->start;
-                            k < bkey_idx(entry, le16_to_cpu(entry->u64s));
-                            k = bkey_next(k)) {
+                       vstruct_for_each(entry, k) {
                                ret = journal_validate_key(c, j, entry, k,
                                                bkey_type(entry->level,
                                                          entry->btree_id),
@@ -599,7 +606,7 @@ static int journal_entry_validate(struct cache_set *c, struct jset *j, u64 secto
                                        le16_to_cpu(entry->u64s) != k->k.u64s, c,
                                        "invalid btree root journal entry: wrong number of keys")) {
                                journal_entry_null_range(entry,
-                                               jset_keys_next(entry));
+                                               vstruct_next(entry));
                                continue;
                        }
 
@@ -616,14 +623,14 @@ static int journal_entry_validate(struct cache_set *c, struct jset *j, u64 secto
                        if (mustfix_fsck_err_on(le16_to_cpu(entry->u64s) != 1, c,
                                "invalid journal seq blacklist entry: bad size")) {
                                journal_entry_null_range(entry,
-                                               jset_keys_next(entry));
+                                               vstruct_next(entry));
                        }
 
                        break;
                default:
                        mustfix_fsck_err(c, "invalid journal entry type %llu",
                                 JOURNAL_ENTRY_TYPE(entry));
-                       journal_entry_null_range(entry, jset_keys_next(entry));
+                       journal_entry_null_range(entry, vstruct_next(entry));
                        break;
                }
        }
@@ -632,126 +639,127 @@ fsck_err:
        return ret;
 }
 
-static int journal_read_bucket(struct cache *ca, struct journal_list *jlist,
+struct journal_read_buf {
+       void            *data;
+       size_t          size;
+};
+
+static int journal_read_buf_realloc(struct journal_read_buf *b,
+                                   size_t new_size)
+{
+       void *n;
+
+       new_size = roundup_pow_of_two(new_size);
+       n = (void *) __get_free_pages(GFP_KERNEL, get_order(new_size));
+       if (!n)
+               return -ENOMEM;
+
+       free_pages((unsigned long) b->data, get_order(b->size));
+       b->data = n;
+       b->size = new_size;
+       return 0;
+}
+
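
journal_read_buf_realloc() rounds the request up to a power of two and replaces the old allocation outright rather than copying; that is safe because callers only grow the buffer before re-reading the same sectors from disk (and free_pages() on the initial NULL/0 state is a no-op). Typical usage, mirroring bch_journal_read_device() further down:

        struct journal_read_buf buf = { NULL, 0 };

        ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
        if (ret)
                goto err;
        /* ... read buckets, growing buf on JOURNAL_ENTRY_REREAD ... */
        free_pages((unsigned long) buf.data, get_order(buf.size));
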
+static int journal_read_bucket(struct cache *ca,
+                              struct journal_read_buf *buf,
+                              struct journal_list *jlist,
                               unsigned bucket, u64 *seq, bool *entries_found)
 {
        struct cache_set *c = ca->set;
        struct journal_device *ja = &ca->journal;
        struct bio *bio = ja->bio;
-       struct jset *j, *data;
-       unsigned blocks, sectors_read, bucket_offset = 0;
-       unsigned max_entry_sectors = c->journal.entry_size_max >> 9;
-       u64 sector = bucket_to_sector(ca,
-                               journal_bucket(ca->disk_sb.sb, bucket));
+       struct jset *j = NULL;
+       unsigned sectors, sectors_read = 0;
+       u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
+           end = offset + ca->mi.bucket_size;
        bool saw_bad = false;
        int ret = 0;
 
-       data = (void *) __get_free_pages(GFP_KERNEL,
-                               get_order(c->journal.entry_size_max));
-       if (!data) {
-               mutex_lock(&jlist->cache_set_buffer_lock);
-               data = c->journal.buf[0].data;
-       }
-
        pr_debug("reading %u", bucket);
 
-       while (bucket_offset < ca->mi.bucket_size) {
-reread:
-               sectors_read = min_t(unsigned,
-                                    ca->mi.bucket_size - bucket_offset,
-                                    max_entry_sectors);
+       while (offset < end) {
+               if (!sectors_read) {
+reread:                        sectors_read = min_t(unsigned,
+                               end - offset, buf->size >> 9);
 
-               bio_reset(bio);
-               bio->bi_bdev            = ca->disk_sb.bdev;
-               bio->bi_iter.bi_sector  = sector + bucket_offset;
-               bio->bi_iter.bi_size    = sectors_read << 9;
-               bio_set_op_attrs(bio, REQ_OP_READ, 0);
-               bch_bio_map(bio, data);
-
-               ret = submit_bio_wait(bio);
-
-               if (cache_fatal_io_err_on(ret, ca,
-                                         "journal read from sector %llu",
-                                         sector + bucket_offset) ||
-                   bch_meta_read_fault("journal")) {
-                       ret = -EIO;
-                       goto err;
-               }
+                       bio_reset(bio);
+                       bio->bi_bdev            = ca->disk_sb.bdev;
+                       bio->bi_iter.bi_sector  = offset;
+                       bio->bi_iter.bi_size    = sectors_read << 9;
+                       bio_set_op_attrs(bio, REQ_OP_READ, 0);
+                       bch_bio_map(bio, buf->data);
 
-               /* This function could be simpler now since we no longer write
-                * journal entries that overlap bucket boundaries; this means
-                * the start of a bucket will always have a valid journal entry
-                * if it has any journal entries at all.
-                */
+                       ret = submit_bio_wait(bio);
 
-               j = data;
-               while (sectors_read) {
-                       ret = journal_entry_validate(c, j,
-                                       sector + bucket_offset,
-                                       ca->mi.bucket_size - bucket_offset,
-                                       sectors_read);
-                       switch (ret) {
-                       case BCH_FSCK_OK:
-                               break;
-                       case JOURNAL_ENTRY_REREAD:
-                               goto reread;
-                       case JOURNAL_ENTRY_NONE:
-                               if (!saw_bad)
-                                       goto out;
-                               blocks = 1;
-                               goto next_block;
-                       case JOURNAL_ENTRY_BAD:
-                               saw_bad = true;
-                               blocks = 1;
-                               goto next_block;
-                       default:
-                               goto err;
-                       }
+                       if (cache_fatal_io_err_on(ret, ca,
+                                                 "journal read from sector %llu",
+                                                 offset) ||
+                           bch_meta_read_fault("journal"))
+                               return -EIO;
 
-                       /*
-                        * This happens sometimes if we don't have discards on -
-                        * when we've partially overwritten a bucket with new
-                        * journal entries. We don't need the rest of the
-                        * bucket:
-                        */
-                       if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
-                               goto out;
-
-                       ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
-
-                       ret = journal_entry_add(c, jlist, j);
-                       switch (ret) {
-                       case JOURNAL_ENTRY_ADD_OK:
-                               *entries_found = true;
-                               break;
-                       case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
-                               break;
-                       default:
-                               goto err;
+                       j = buf->data;
+               }
+
+               ret = journal_entry_validate(c, j, offset,
+                                       end - offset, sectors_read);
+               switch (ret) {
+               case BCH_FSCK_OK:
+                       break;
+               case JOURNAL_ENTRY_REREAD:
+                       if (vstruct_bytes(j) > buf->size) {
+                               ret = journal_read_buf_realloc(buf,
+                                                       vstruct_bytes(j));
+                               if (ret)
+                                       return ret;
                        }
+                       goto reread;
+               case JOURNAL_ENTRY_NONE:
+                       if (!saw_bad)
+                               return 0;
+                       sectors = c->sb.block_size;
+                       goto next_block;
+               case JOURNAL_ENTRY_BAD:
+                       saw_bad = true;
+                       sectors = c->sb.block_size;
+                       goto next_block;
+               default:
+                       return ret;
+               }
 
-                       if (le64_to_cpu(j->seq) > *seq)
-                               *seq = le64_to_cpu(j->seq);
-next_block:
-                       blocks = __set_blocks(j, le32_to_cpu(j->u64s),
-                                             block_bytes(c));
+               /*
+                * This happens sometimes if we don't have discards on -
+                * when we've partially overwritten a bucket with new
+                * journal entries. We don't need the rest of the
+                * bucket:
+                */
+               if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
+                       return 0;
+
+               ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
 
-                       pr_debug("next");
-                       bucket_offset   += blocks * c->sb.block_size;
-                       sectors_read    -= blocks * c->sb.block_size;
-                       j = ((void *) j) + blocks * block_bytes(c);
+               ret = journal_entry_add(c, jlist, j);
+               switch (ret) {
+               case JOURNAL_ENTRY_ADD_OK:
+                       *entries_found = true;
+                       break;
+               case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
+                       break;
+               default:
+                       return ret;
                }
+
+               if (le64_to_cpu(j->seq) > *seq)
+                       *seq = le64_to_cpu(j->seq);
+
+               sectors = vstruct_sectors(j, c->block_bits);
+next_block:
+               pr_debug("next");
+               offset          += sectors;
+               sectors_read    -= sectors;
+               j = ((void *) j) + (sectors << 9);
        }
-out:
-       ret = 0;
-err:
-       if (data == c->journal.buf[0].data)
-               mutex_unlock(&jlist->cache_set_buffer_lock);
-       else
-               free_pages((unsigned long) data,
-                               get_order(c->journal.entry_size_max));
 
-       return ret;
+       return 0;
 }
 
 static void bch_journal_read_device(struct closure *cl)
@@ -759,15 +767,11 @@ static void bch_journal_read_device(struct closure *cl)
 #define read_bucket(b)                                                 \
        ({                                                              \
                bool entries_found = false;                             \
-               int ret = journal_read_bucket(ca, jlist, b,             \
-                                             &seq, &entries_found);    \
+               ret = journal_read_bucket(ca, &buf, jlist, b, &seq,     \
+                                         &entries_found);              \
+               if (ret)                                                \
+                       goto err;                                       \
                __set_bit(b, bitmap);                                   \
-               if (ret) {                                              \
-                       mutex_lock(&jlist->lock);                       \
-                       jlist->ret = ret;                               \
-                       mutex_unlock(&jlist->lock);                     \
-                       closure_return(cl);                             \
-               }                                                       \
                entries_found;                                          \
         })
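
The reworked read_bucket() macro still evaluates to entries_found, but errors now funnel through the enclosing function's err label (added at the bottom of this function), which records ret in jlist->ret under jlist->lock instead of returning from inside the macro. Callers keep the boolean idiom, e.g.:

        if (read_bucket(l))
                goto bsearch;   /* bucket l held journal entries */
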
 
@@ -777,24 +781,29 @@ static void bch_journal_read_device(struct closure *cl)
        struct journal_list *jlist =
                container_of(cl->parent, struct journal_list, cl);
        struct request_queue *q = bdev_get_queue(ca->disk_sb.bdev);
+       struct journal_read_buf buf = { NULL, 0 };
 
-       unsigned nr_buckets = bch_nr_journal_buckets(ca->disk_sb.sb);
-       DECLARE_BITMAP(bitmap, nr_buckets);
+       DECLARE_BITMAP(bitmap, ja->nr);
        unsigned i, l, r;
        u64 seq = 0;
+       int ret;
 
-       if (!nr_buckets)
-               closure_return(cl);
+       if (!ja->nr)
+               goto out;
+
+       bitmap_zero(bitmap, ja->nr);
+       ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
+       if (ret)
+               goto err;
 
-       bitmap_zero(bitmap, nr_buckets);
-       pr_debug("%u journal buckets", nr_buckets);
+       pr_debug("%u journal buckets", ja->nr);
 
        /*
         * If the device supports discard but not secure discard, we can't do
         * the fancy fibonacci hash/binary search because the live journal
         * entries might not form a contiguous range:
         */
-       for (i = 0; i < nr_buckets; i++)
+       for (i = 0; i < ja->nr; i++)
                read_bucket(i);
        goto search_done;
 
@@ -805,8 +814,8 @@ static void bch_journal_read_device(struct closure *cl)
         * Read journal buckets ordered by golden ratio hash to quickly
         * find a sequence of buckets with valid journal entries
         */
-       for (i = 0; i < nr_buckets; i++) {
-               l = (i * 2654435769U) % nr_buckets;
+       for (i = 0; i < ja->nr; i++) {
+               l = (i * 2654435769U) % ja->nr;
 
                if (test_bit(l, bitmap))
                        break;
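
2654435769 is floor(2^32 / golden ratio), the multiplier of Fibonacci hashing: successive multiples, wrapping mod 2^32, are spread roughly evenly over the 32-bit range, so reducing them mod ja->nr probes the ring of journal buckets in a scattered order and tends to land on a run of live entries quickly. The probe as a standalone helper (hypothetical, for illustration only):

        static unsigned journal_probe_bucket(unsigned i, unsigned nr)
        {
                /* the multiply intentionally wraps mod 2^32 before the
                 * reduction mod nr */
                return (i * 2654435769U) % nr;
        }
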
@@ -821,18 +830,18 @@ static void bch_journal_read_device(struct closure *cl)
         */
        pr_debug("falling back to linear search");
 linear_scan:
-       for (l = find_first_zero_bit(bitmap, nr_buckets);
-            l < nr_buckets;
-            l = find_next_zero_bit(bitmap, nr_buckets, l + 1))
+       for (l = find_first_zero_bit(bitmap, ja->nr);
+            l < ja->nr;
+            l = find_next_zero_bit(bitmap, ja->nr, l + 1))
                if (read_bucket(l))
                        goto bsearch;
 
        /* no journal entries on this device? */
-       if (l == nr_buckets)
-               closure_return(cl);
+       if (l == ja->nr)
+               goto out;
 bsearch:
        /* Binary search */
-       r = find_next_bit(bitmap, nr_buckets, l + 1);
+       r = find_next_bit(bitmap, ja->nr, l + 1);
        pr_debug("starting binary search, l %u r %u", l, r);
 
        while (l + 1 < r) {
@@ -858,9 +867,9 @@ search_done:
         */
        seq = 0;
 
-       for (i = 0; i < nr_buckets; i++)
+       for (i = 0; i < ja->nr; i++)
                if (ja->bucket_seq[i] >= seq &&
-                   ja->bucket_seq[i] != ja->bucket_seq[(i + 1) % nr_buckets]) {
+                   ja->bucket_seq[i] != ja->bucket_seq[(i + 1) % ja->nr]) {
                        /*
                         * When journal_next_bucket() goes to allocate for
                         * the first time, it'll use the bucket after
@@ -875,20 +884,26 @@ search_done:
         * reclaimed - journal reclaim will immediately reclaim whatever isn't
         * pinned when it first runs:
         */
-       ja->last_idx = (ja->cur_idx + 1) % nr_buckets;
+       ja->last_idx = (ja->cur_idx + 1) % ja->nr;
 
        /*
         * Read buckets in reverse order until we stop finding more journal
         * entries:
         */
-       for (i = (ja->cur_idx + nr_buckets - 1) % nr_buckets;
+       for (i = (ja->cur_idx + ja->nr - 1) % ja->nr;
             i != ja->cur_idx;
-            i = (i + nr_buckets - 1) % nr_buckets)
+            i = (i + ja->nr - 1) % ja->nr)
                if (!test_bit(i, bitmap) &&
                    !read_bucket(i))
                        break;
-
+out:
+       free_pages((unsigned long) buf.data, get_order(buf.size));
        closure_return(cl);
+err:
+       mutex_lock(&jlist->lock);
+       jlist->ret = ret;
+       mutex_unlock(&jlist->lock);
+       goto out;
 #undef read_bucket
 }
 
@@ -930,6 +945,19 @@ static int journal_seq_blacklist_read(struct journal *j,
        return 0;
 }
 
+static inline bool journal_has_keys(struct list_head *list)
+{
+       struct journal_replay *i;
+       struct jset_entry *entry;
+       struct bkey_i *k, *_n;
+
+       list_for_each_entry(i, list, list)
+               for_each_jset_key(k, _n, entry, &i->j)
+                       return true;
+
+       return false;
+}
+
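
journal_has_keys() returns true at the first replayable key it finds: for_each_jset_key walks every btree-keys entry in a jset and every bkey inside it, so the nested loop short-circuits immediately. It backs the fsck check added to bch_journal_read() just below, replacing the equivalent check removed from bch_journal_replay() later in this diff. A roughly equivalent open-coded form (a sketch; the entry-type test is paraphrased, not the macro's literal expansion):

        list_for_each_entry(i, list, list)
                vstruct_for_each(&i->j, entry)
                        if (JOURNAL_ENTRY_TYPE(entry) == JOURNAL_ENTRY_BTREE_KEYS &&
                            le16_to_cpu(entry->u64s))
                                return true;
        return false;
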
 int bch_journal_read(struct cache_set *c, struct list_head *list)
 {
        struct jset_entry *prio_ptrs;
@@ -944,7 +972,6 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
 
        closure_init_stack(&jlist.cl);
        mutex_init(&jlist.lock);
-       mutex_init(&jlist.cache_set_buffer_lock);
        jlist.head = list;
        jlist.ret = 0;
 
@@ -964,6 +991,9 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
                return BCH_FSCK_REPAIR_IMPOSSIBLE;
        }
 
+       fsck_err_on(c->sb.clean && journal_has_keys(list), c,
+                   "filesystem marked clean but journal has keys to replay");
+
        j = &list_entry(list->prev, struct journal_replay, list)->j;
 
        unfixable_fsck_err_on(le64_to_cpu(j->seq) -
@@ -1057,7 +1087,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list)
                        struct bkey_s_c k_s_c = bkey_i_to_s_c(k);
 
                        if (btree_type_has_ptrs(type))
-                               __bch_btree_mark_key(c, type, k_s_c);
+                               bch_btree_mark_key_initial(c, type, k_s_c);
                }
 }
 
@@ -1171,10 +1201,9 @@ static enum {
        buf->data->last_seq     = cpu_to_le64(last_seq(j));
 
        j->prev_buf_sectors =
-               __set_blocks(buf->data,
-                            le32_to_cpu(buf->data->u64s) +
-                            journal_entry_u64s_reserve(buf),
-                            block_bytes(c)) * c->sb.block_size;
+               vstruct_blocks_plus(buf->data, c->block_bits,
+                                   journal_entry_u64s_reserve(buf)) *
+               c->sb.block_size;
 
        BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors);
 
@@ -1219,9 +1248,8 @@ static unsigned journal_dev_buckets_available(struct journal *j,
                                              struct cache *ca)
 {
        struct journal_device *ja = &ca->journal;
-       unsigned nr = bch_nr_journal_buckets(ca->disk_sb.sb);
-       unsigned next = (ja->cur_idx + 1) % nr;
-       unsigned available = (ja->last_idx + nr - next) % nr;
+       unsigned next = (ja->cur_idx + 1) % ja->nr;
+       unsigned available = (ja->last_idx + ja->nr - next) % ja->nr;
 
        /*
         * Hack to avoid a deadlock during journal replay:
@@ -1271,7 +1299,7 @@ static int journal_entry_sectors(struct journal *j)
                 * for the previous entry we have to make sure we have space for
                 * it too:
                 */
-               if (bch_extent_has_device(e.c, ca->sb.nr_this_dev)) {
+               if (bch_extent_has_device(e.c, ca->dev_idx)) {
                        if (j->prev_buf_sectors > ca->journal.sectors_free)
                                buckets_required++;
 
@@ -1479,17 +1507,28 @@ int bch_journal_replay(struct cache_set *c, struct list_head *list)
                entries++;
        }
 
+       if (keys) {
+               bch_btree_flush(c);
+
+               /*
+                * Write a new journal entry _before_ we start journalling new data -
+                * otherwise, we could end up with btree node bsets with journal seqs
+                * arbitrarily far in the future vs. the most recently written journal
+                * entry on disk, if we crash before writing the next journal entry:
+                */
+               ret = bch_journal_meta(&c->journal);
+               if (ret)
+                       goto err;
+       }
+
        bch_info(c, "journal replay done, %i keys in %i entries, seq %llu",
                 keys, entries, (u64) atomic64_read(&j->seq));
 
-       fsck_err_on(c->sb.clean && keys, c,
-                   "filesystem marked clean, but journal had keys to replay");
-
        bch_journal_set_replay_done(&c->journal);
 err:
        if (ret)
                bch_err(c, "journal replay error: %d", ret);
-fsck_err:
+
        bch_journal_entries_free(list);
 
        return ret;
@@ -1497,28 +1536,40 @@ fsck_err:
 
 static int bch_set_nr_journal_buckets(struct cache *ca, unsigned nr)
 {
-       unsigned u64s = bch_journal_buckets_offset(ca->disk_sb.sb) + nr;
+       struct journal_device *ja = &ca->journal;
+       struct bch_sb_field_journal *journal_buckets =
+               bch_sb_get_journal(ca->disk_sb.sb);
+       struct bch_sb_field *f;
        u64 *p;
-       int ret;
 
-       ret = bch_super_realloc(&ca->disk_sb, u64s);
-       if (ret)
-               return ret;
+       p = krealloc(ja->bucket_seq, nr * sizeof(u64),
+                    GFP_KERNEL|__GFP_ZERO);
+       if (!p)
+               return -ENOMEM;
+
+       ja->bucket_seq = p;
 
-       p = krealloc(ca->journal.bucket_seq,
-                    nr * sizeof(u64),
+       p = krealloc(ja->buckets, nr * sizeof(u64),
                     GFP_KERNEL|__GFP_ZERO);
        if (!p)
                return -ENOMEM;
 
-       ca->journal.bucket_seq = p;
-       ca->disk_sb.sb->u64s = cpu_to_le16(u64s);
+       ja->buckets = p;
+
+       f = bch_dev_sb_field_resize(&ca->disk_sb, &journal_buckets->field, nr +
+                                   sizeof(*journal_buckets) / sizeof(u64));
+       if (!f)
+               return -ENOMEM;
+       f->type = BCH_SB_FIELD_journal;
 
+       ja->nr = nr;
        return 0;
 }
 
 int bch_cache_journal_alloc(struct cache *ca)
 {
+       struct journal_device *ja = &ca->journal;
+       struct bch_sb_field_journal *journal_buckets;
        int ret;
        unsigned i;
 
@@ -1540,11 +1591,15 @@ int bch_cache_journal_alloc(struct cache *ca)
        if (ret)
                return ret;
 
-       for (i = 0; i < bch_nr_journal_buckets(ca->disk_sb.sb); i++) {
-               unsigned long r = ca->mi.first_bucket + i;
+       journal_buckets = bch_sb_get_journal(ca->disk_sb.sb);
+
+       for (i = 0; i < ja->nr; i++) {
+               u64 bucket = ca->mi.first_bucket + i;
 
-               bch_mark_metadata_bucket(ca, &ca->buckets[r], true);
-               set_journal_bucket(ca->disk_sb.sb, i, r);
+               ja->buckets[i] = bucket;
+               journal_buckets->buckets[i] = cpu_to_le64(bucket);
+
+               bch_mark_metadata_bucket(ca, &ca->buckets[bucket], true);
        }
 
        return 0;
@@ -1749,7 +1804,7 @@ static void journal_reclaim_work(struct work_struct *work)
        struct cache *ca;
        struct journal_entry_pin *pin;
        u64 seq_to_flush = 0;
-       unsigned iter, nr, bucket_to_flush;
+       unsigned iter, bucket_to_flush;
        unsigned long next_flush;
        bool reclaim_lock_held = false, need_flush;
 
@@ -1781,13 +1836,11 @@ static void journal_reclaim_work(struct work_struct *work)
                            blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
                                blkdev_issue_discard(ca->disk_sb.bdev,
                                        bucket_to_sector(ca,
-                                               journal_bucket(ca->disk_sb.sb,
-                                                              ja->last_idx)),
+                                               ja->buckets[ja->last_idx]),
                                        ca->mi.bucket_size, GFP_NOIO, 0);
 
                        spin_lock(&j->lock);
-                       ja->last_idx = (ja->last_idx + 1) %
-                               bch_nr_journal_buckets(ca->disk_sb.sb);
+                       ja->last_idx = (ja->last_idx + 1) % ja->nr;
                        spin_unlock(&j->lock);
 
                        wake_up(&j->wait);
@@ -1798,8 +1851,7 @@ static void journal_reclaim_work(struct work_struct *work)
                 * buckets
                 */
                spin_lock(&j->lock);
-               nr = bch_nr_journal_buckets(ca->disk_sb.sb),
-               bucket_to_flush = (ja->cur_idx + (nr >> 1)) % nr;
+               bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr;
                seq_to_flush = max_t(u64, seq_to_flush,
                                     ja->bucket_seq[bucket_to_flush]);
                spin_unlock(&j->lock);
@@ -1861,7 +1913,7 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
         */
        extent_for_each_ptr_backwards(e, ptr)
                if (!(ca = PTR_CACHE(c, ptr)) ||
-                   ca->mi.state != CACHE_ACTIVE ||
+                   ca->mi.state != BCH_MEMBER_STATE_ACTIVE ||
                    ca->journal.sectors_free <= sectors)
                        __bch_extent_drop_ptr(e, ptr);
                else
@@ -1875,7 +1927,6 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
         */
        group_for_each_cache_rcu(ca, &j->devs, iter) {
                struct journal_device *ja = &ca->journal;
-               unsigned nr_buckets = bch_nr_journal_buckets(ca->disk_sb.sb);
 
                if (replicas >= replicas_want)
                        break;
@@ -1884,21 +1935,20 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
                 * Check that we can use this device, and aren't already using
                 * it:
                 */
-               if (bch_extent_has_device(e.c, ca->sb.nr_this_dev) ||
+               if (bch_extent_has_device(e.c, ca->dev_idx) ||
                    !journal_dev_buckets_available(j, ca) ||
                    sectors > ca->mi.bucket_size)
                        continue;
 
                ja->sectors_free = ca->mi.bucket_size - sectors;
-               ja->cur_idx = (ja->cur_idx + 1) % nr_buckets;
+               ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
                ja->bucket_seq[ja->cur_idx] = atomic64_read(&j->seq);
 
                extent_ptr_append(bkey_i_to_extent(&j->key),
                        (struct bch_extent_ptr) {
                                  .offset = bucket_to_sector(ca,
-                                       journal_bucket(ca->disk_sb.sb,
-                                                      ja->cur_idx)),
-                                 .dev = ca->sb.nr_this_dev,
+                                       ja->buckets[ja->cur_idx]),
+                                 .dev = ca->dev_idx,
                });
                replicas++;
 
@@ -1928,10 +1978,7 @@ static void journal_write_compact(struct jset *jset)
         * If we wanted to be really fancy here, we could sort all the keys in
         * the jset and drop keys that were overwritten - probably not worth it:
         */
-       for (i = jset->start;
-            i < (struct jset_entry *) bkey_idx(jset, le32_to_cpu(jset->u64s)) &&
-            (next = jset_keys_next(i), true);
-            i = next) {
+       vstruct_for_each_safe(jset, i, next) {
                unsigned u64s = le16_to_cpu(i->u64s);
 
                /* Empty entry: */
@@ -1945,7 +1992,7 @@ static void journal_write_compact(struct jset *jset)
                    JOURNAL_ENTRY_TYPE(i) == JOURNAL_ENTRY_TYPE(prev) &&
                    JOURNAL_ENTRY_TYPE(i) == JOURNAL_ENTRY_BTREE_KEYS &&
                    le16_to_cpu(prev->u64s) + u64s <= U16_MAX) {
-                       memmove_u64s_down(jset_keys_next(prev),
+                       memmove_u64s_down(vstruct_next(prev),
                                          i->_data,
                                          u64s);
                        le16_add_cpu(&prev->u64s, u64s);
@@ -1953,12 +2000,12 @@ static void journal_write_compact(struct jset *jset)
                }
 
                /* Couldn't merge, move i into new position (after prev): */
-               prev = prev ? jset_keys_next(prev) : jset->start;
+               prev = prev ? vstruct_next(prev) : jset->start;
                if (i != prev)
                        memmove_u64s_down(prev, i, jset_u64s(u64s));
        }
 
-       prev = prev ? jset_keys_next(prev) : jset->start;
+       prev = prev ? vstruct_next(prev) : jset->start;
        jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
 }
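
journal_write_compact() drops empty entries and merges runs of adjacent btree-keys entries that share the same entry type (and, presumably, btree id and level, checked just off-screen), provided the combined key count still fits the 16-bit u64s field. Each merge reclaims one entry header; with hypothetical sizes:

        /* before: [hdr | 3 u64s of keys][hdr | 3 u64s of keys]
         * after:  [hdr | 6 u64s of keys]
         * saving sizeof(struct jset_entry) bytes, guarded by
         * le16_to_cpu(prev->u64s) + u64s <= U16_MAX */
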
 
@@ -2019,6 +2066,7 @@ static void journal_write(struct closure *cl)
        struct cache_set *c = container_of(j, struct cache_set, journal);
        struct cache *ca;
        struct journal_buf *w = journal_prev_buf(j);
+       struct jset *jset = w->data;
        struct bio *bio;
        struct bch_extent_ptr *ptr;
        unsigned i, sectors, bytes;
@@ -2036,24 +2084,27 @@ static void journal_write(struct closure *cl)
        }
        mutex_unlock(&c->btree_root_lock);
 
-       journal_write_compact(w->data);
+       journal_write_compact(jset);
+
+       jset->read_clock        = cpu_to_le16(c->prio_clock[READ].hand);
+       jset->write_clock       = cpu_to_le16(c->prio_clock[WRITE].hand);
+       jset->magic             = cpu_to_le64(jset_magic(c));
+       jset->version           = cpu_to_le32(BCACHE_JSET_VERSION);
 
-       w->data->read_clock     = cpu_to_le16(c->prio_clock[READ].hand);
-       w->data->write_clock    = cpu_to_le16(c->prio_clock[WRITE].hand);
-       w->data->magic          = cpu_to_le64(jset_magic(&c->disk_sb));
-       w->data->version        = cpu_to_le32(BCACHE_JSET_VERSION);
+       SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
+       SET_JSET_CSUM_TYPE(jset, bch_meta_checksum_type(c));
 
-       SET_JSET_BIG_ENDIAN(w->data, CPU_BIG_ENDIAN);
-       SET_JSET_CSUM_TYPE(w->data, c->opts.metadata_checksum);
-       w->data->csum = cpu_to_le64(__csum_set(w->data,
-                                              le32_to_cpu(w->data->u64s),
-                                              JSET_CSUM_TYPE(w->data)));
+       bch_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
+                   jset->encrypted_start,
+                   vstruct_end(jset) - (void *) jset->encrypted_start);
 
-       sectors = __set_blocks(w->data, le32_to_cpu(w->data->u64s),
-                              block_bytes(c)) * c->sb.block_size;
+       jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
+                                 journal_nonce(jset), jset);
+
+       sectors = vstruct_sectors(jset, c->block_bits);
        BUG_ON(sectors > j->prev_buf_sectors);
 
-       bytes = __set_bytes(w->data, le32_to_cpu(w->data->u64s));
+       bytes = vstruct_bytes(w->data);
        memset((void *) w->data + bytes, 0, (sectors << 9) - bytes);
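
Note the ordering relative to the read side in journal_entry_validate() above: the write path encrypts everything from encrypted_start onward and only then computes the checksum, so the csum covers the ciphertext; the read path verifies the checksum first and then decrypts in place. Decryption is spelled bch_encrypt() on both sides because a stream cipher's keystream XOR is its own inverse. Side by side, condensed from the two hunks:

        /* write: encrypt, then checksum the ciphertext */
        bch_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
                    jset->encrypted_start,
                    vstruct_end(jset) - (void *) jset->encrypted_start);
        jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
                                  journal_nonce(jset), jset);

        /* read: verify the checksum, then decrypt in place */
        csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j);
        if (bch_crc_cmp(csum, j->csum))
                return JOURNAL_ENTRY_BAD;
        bch_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
                    j->encrypted_start,
                    vstruct_end(j) - (void *) j->encrypted_start);
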
 
        if (journal_write_alloc(j, sectors)) {
@@ -2096,7 +2147,7 @@ static void journal_write(struct closure *cl)
                bio->bi_private         = ca;
                bio_set_op_attrs(bio, REQ_OP_WRITE,
                                 REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
-               bch_bio_map(bio, w->data);
+               bch_bio_map(bio, jset);
 
                trace_bcache_journal_write(bio);
                closure_bio_submit_punt(bio, cl, c);
@@ -2105,7 +2156,7 @@ static void journal_write(struct closure *cl)
        }
 
        for_each_cache(ca, c, i)
-               if (ca->mi.state == CACHE_ACTIVE &&
+               if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
                    journal_flushes_device(ca) &&
                    !bch_extent_has_device(bkey_i_to_s_c_extent(&j->key), i)) {
                        percpu_ref_get(&ca->ref);
@@ -2503,7 +2554,7 @@ ssize_t bch_journal_print_debug(struct journal *j, char *buf)
                                 "\tnr\t\t%u\n"
                                 "\tcur_idx\t\t%u (seq %llu)\n"
                                 "\tlast_idx\t%u (seq %llu)\n",
-                                iter, bch_nr_journal_buckets(ca->disk_sb.sb),
+                                iter, ja->nr,
                                 ja->cur_idx,   ja->bucket_seq[ja->cur_idx],
                                 ja->last_idx,  ja->bucket_seq[ja->last_idx]);
        }
@@ -2521,7 +2572,7 @@ static bool bch_journal_writing_to_device(struct cache *ca)
 
        spin_lock(&j->lock);
        ret = bch_extent_has_device(bkey_i_to_s_c_extent(&j->key),
-                                   ca->sb.nr_this_dev);
+                                   ca->dev_idx);
        spin_unlock(&j->lock);
 
        return ret;
@@ -2541,10 +2592,11 @@ static bool bch_journal_writing_to_device(struct cache *ca)
 
 int bch_journal_move(struct cache *ca)
 {
-       unsigned i, nr_buckets;
        u64 last_flushed_seq;
+       struct journal_device *ja = &ca->journal;
        struct cache_set *c = ca->set;
        struct journal *j = &c->journal;
+       unsigned i;
        int ret = 0;            /* Success */
 
        if (bch_journal_writing_to_device(ca)) {
@@ -2585,10 +2637,45 @@ int bch_journal_move(struct cache *ca)
        last_flushed_seq = last_seq(j);
        spin_unlock(&j->lock);
 
-       nr_buckets = bch_nr_journal_buckets(ca->disk_sb.sb);
-
-       for (i = 0; i < nr_buckets; i += 1)
-               BUG_ON(ca->journal.bucket_seq[i] > last_flushed_seq);
+       for (i = 0; i < ja->nr; i += 1)
+               BUG_ON(ja->bucket_seq[i] > last_flushed_seq);
 
        return ret;
 }
+
+void bch_journal_free_cache(struct cache *ca)
+{
+       kfree(ca->journal.buckets);
+       kfree(ca->journal.bucket_seq);
+}
+
+int bch_journal_init_cache(struct cache *ca)
+{
+       struct journal_device *ja = &ca->journal;
+       struct bch_sb_field_journal *journal_buckets =
+               bch_sb_get_journal(ca->disk_sb.sb);
+       unsigned i, journal_entry_pages;
+
+       journal_entry_pages =
+               DIV_ROUND_UP(1U << BCH_SB_JOURNAL_ENTRY_SIZE(ca->disk_sb.sb),
+                            PAGE_SECTORS);
+
+       ja->nr = bch_nr_journal_buckets(journal_buckets);
+
+       ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
+       if (!ja->bucket_seq)
+               return -ENOMEM;
+
+       ca->journal.bio = bio_kmalloc(GFP_KERNEL, journal_entry_pages);
+       if (!ca->journal.bio)
+               return -ENOMEM;
+
+       ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
+       if (!ja->buckets)
+               return -ENOMEM;
+
+       for (i = 0; i < ja->nr; i++)
+               ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
+
+       return 0;
+}
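
bch_journal_init_cache() returns -ENOMEM without unwinding whatever it already allocated; presumably callers are expected to pair it with bch_journal_free_cache() on any failure path (kfree() of the still-NULL members is safe; note the bio is not freed there). A hypothetical caller, for illustration:

        ret = bch_journal_init_cache(ca);
        if (ret) {
                bch_journal_free_cache(ca);
                return ret;
        }
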
index 759ed60961a779c1a1e087a5aaa769f8ccf09230..9274831acae3f24f752b631e51db0098154bf4ce 100644 (file)
 #include <linux/hash.h>
 
 #include "journal_types.h"
-
-static inline struct jset_entry *jset_keys_next(struct jset_entry *j)
-{
-       return (void *) __bkey_idx(j, le16_to_cpu(j->u64s));
-}
+//#include "super-io.h"
 
 /*
  * Only used for holding the journal entries we read in btree_journal_read()
@@ -182,7 +178,7 @@ static inline void bch_journal_add_entry_at(struct journal_buf *buf,
                                            unsigned type, enum btree_id id,
                                            unsigned level, unsigned offset)
 {
-       struct jset_entry *entry = bkey_idx(buf->data, offset);
+       struct jset_entry *entry = vstruct_idx(buf->data, offset);
 
        entry->u64s = cpu_to_le16(u64s);
        entry->btree_id = id;
@@ -336,7 +332,7 @@ static inline int bch_journal_error(struct journal *j)
 
 static inline bool is_journal_device(struct cache *ca)
 {
-       return ca->mi.state == CACHE_ACTIVE && ca->mi.tier == 0;
+       return ca->mi.state == BCH_MEMBER_STATE_ACTIVE && ca->mi.tier == 0;
 }
 
 static inline bool journal_flushes_device(struct cache *ca)
@@ -367,21 +363,16 @@ ssize_t bch_journal_print_debug(struct journal *, char *);
 
 int bch_cache_journal_alloc(struct cache *);
 
-static inline __le64 *__journal_buckets(struct cache_sb *sb)
-{
-       return sb->_data + bch_journal_buckets_offset(sb);
-}
-
-static inline u64 journal_bucket(struct cache_sb *sb, unsigned nr)
+static inline unsigned bch_nr_journal_buckets(struct bch_sb_field_journal *j)
 {
-       return le64_to_cpu(__journal_buckets(sb)[nr]);
-}
-
-static inline void set_journal_bucket(struct cache_sb *sb, unsigned nr, u64 bucket)
-{
-       __journal_buckets(sb)[nr] = cpu_to_le64(bucket);
+       return j
+               ? (__le64 *) vstruct_end(&j->field) - j->buckets
+               : 0;
 }
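
The bucket count is no longer stored as a separate superblock value; it is recovered from the size of the variable-length journal field, as the number of __le64 slots between the start of the buckets array and the end of the field. This is the inverse of the sizing arithmetic in bch_set_nr_journal_buckets(), which resizes the field to nr + sizeof(*journal_buckets) / sizeof(u64) u64s. A sketch of the implied layout (the header size is inferred from that arithmetic, not shown in this diff):

        struct bch_sb_field_journal {
                struct bch_sb_field     field;          /* assumed one u64 of header */
                __le64                  buckets[];      /* bucket numbers follow */
        };

        /* the field spans (1 + nr) u64s, so nr is recovered as
         * (__le64 *) vstruct_end(&j->field) - j->buckets;
         * e.g. a field of 9 u64s describes 8 journal buckets */
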
 
 int bch_journal_move(struct cache *);
 
+void bch_journal_free_cache(struct cache *);
+int bch_journal_init_cache(struct cache *);
+
 #endif /* _BCACHE_JOURNAL_H */
index e3698b5adfeae0eaedf5b3f72e384f5925e77558..5c95e37d6413ce0998305b3ffc87bef48af96479 100644 (file)
@@ -186,7 +186,7 @@ struct journal {
         * ugh: need to get prio_buckets converted over to the eventual new
         * transaction machinery
         */
-       __le64                  prio_buckets[MAX_CACHES_PER_SET];
+       __le64                  prio_buckets[BCH_SB_MEMBERS_MAX];
        unsigned                nr_prio_buckets;
 
        unsigned                write_delay_ms;
@@ -208,7 +208,7 @@ struct journal {
 
 /*
  * Embedded in struct cache. First three fields refer to the array of journal
- * buckets, in cache_sb.
+ * buckets, in bch_sb.
  */
 struct journal_device {
        /*
@@ -229,6 +229,8 @@ struct journal_device {
         * sufficient to read:
         */
        unsigned                last_idx;
+       unsigned                nr;
+       u64                     *buckets;
 
        /* Bio for journal reads/writes to this device */
        struct bio              *bio;
index 5a26e22865042870bbfd7831a163311620eb4b3d..407ca17ec4418a91be5a694bab7daca36e74857c 100644 (file)
@@ -25,7 +25,7 @@ static int issue_migration_move(struct cache *ca,
                return -ENOSPC;
 
        extent_for_each_ptr(bkey_s_c_to_extent(k), ptr)
-               if (ptr->dev == ca->sb.nr_this_dev)
+               if (ptr->dev == ca->dev_idx)
                        goto found;
 
        BUG();
@@ -62,7 +62,7 @@ int bch_move_data_off_device(struct cache *ca)
        u64 seen_key_count;
        int ret = 0;
 
-       BUG_ON(ca->mi.state == CACHE_ACTIVE);
+       BUG_ON(ca->mi.state == BCH_MEMBER_STATE_ACTIVE);
 
        bch_move_ctxt_init(&ctxt, NULL, SECTORS_IN_FLIGHT_PER_DEVICE);
        ctxt.avoid = ca;
@@ -99,7 +99,7 @@ int bch_move_data_off_device(struct cache *ca)
                       !(ret = btree_iter_err(k))) {
                        if (!bkey_extent_is_data(k.k) ||
                            !bch_extent_has_device(bkey_s_c_to_extent(k),
-                                                  ca->sb.nr_this_dev))
+                                                  ca->dev_idx))
                                goto next;
 
                        ret = issue_migration_move(ca, &ctxt, k);
@@ -151,14 +151,14 @@ static int bch_move_btree_off(struct cache *ca, enum btree_id id)
        struct btree *b;
        int ret;
 
-       BUG_ON(ca->mi.state == CACHE_ACTIVE);
+       BUG_ON(ca->mi.state == BCH_MEMBER_STATE_ACTIVE);
 
        closure_init_stack(&cl);
 
        for_each_btree_node(&iter, c, id, POS_MIN, 0, b) {
                struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
 retry:
-               if (!bch_extent_has_device(e, ca->sb.nr_this_dev))
+               if (!bch_extent_has_device(e, ca->dev_idx))
                        continue;
 
                ret = bch_btree_node_rewrite(&iter, b, &cl);
@@ -188,7 +188,7 @@ retry:
                for_each_btree_node(&iter, c, id, POS_MIN, 0, b) {
                        struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
 
-                       BUG_ON(bch_extent_has_device(e, ca->sb.nr_this_dev));
+                       BUG_ON(bch_extent_has_device(e, ca->dev_idx));
                }
                bch_btree_iter_unlock(&iter);
        }
@@ -282,7 +282,7 @@ static int bch_flag_key_bad(struct btree_iter *iter,
        e = bkey_i_to_s_extent(&tmp.key);
 
        extent_for_each_ptr_backwards(e, ptr)
-               if (ptr->dev == ca->sb.nr_this_dev)
+               if (ptr->dev == ca->dev_idx)
                        bch_extent_drop_ptr(e, ptr);
 
        /*
@@ -323,7 +323,7 @@ int bch_flag_data_bad(struct cache *ca)
                        goto advance;
 
                e = bkey_s_c_to_extent(k);
-               if (!bch_extent_has_device(e, ca->sb.nr_this_dev))
+               if (!bch_extent_has_device(e, ca->dev_idx))
                        goto advance;
 
                ret = bch_flag_key_bad(&iter, ca, e);
index f3ab9e8360d9966d802e6356ba765d5eb5d2ca65..655a52331b233725b2a467046990ea4c679fe286 100644 (file)
@@ -5,7 +5,7 @@
 #include "buckets.h"
 #include "io.h"
 #include "move.h"
-#include "super.h"
+#include "super-io.h"
 #include "keylist.h"
 
 #include <linux/ioprio.h>
@@ -63,7 +63,8 @@ static int bch_migrate_index_update(struct bch_write_op *op)
                bkey_start_pos(&bch_keylist_front(keys)->k));
 
        while (1) {
-               struct bkey_i *insert = bch_keylist_front(keys);
+               struct bkey_s_extent insert =
+                       bkey_i_to_s_extent(bch_keylist_front(keys));
                struct bkey_s_c k = bch_btree_iter_peek_with_holes(&iter);
                struct bch_extent_ptr *ptr;
                struct bkey_s_extent e;
@@ -79,17 +80,18 @@ static int bch_migrate_index_update(struct bch_write_op *op)
 
                bkey_reassemble(&new.k, k);
                bch_cut_front(iter.pos, &new.k);
-               bch_cut_back(insert->k.p, &new.k.k);
+               bch_cut_back(insert.k->p, &new.k.k);
                e = bkey_i_to_s_extent(&new.k);
 
                /* hack - promotes can race: */
                if (m->promote)
-                       extent_for_each_ptr(bkey_i_to_s_extent(insert), ptr)
+                       extent_for_each_ptr(insert, ptr)
                                if (bch_extent_has_device(e.c, ptr->dev))
                                        goto nomatch;
 
                ptr = bch_migrate_matching_ptr(m, e);
                if (ptr) {
+                       int nr_new_dirty = bch_extent_nr_dirty_ptrs(insert.s_c);
                        unsigned insert_flags =
                                BTREE_INSERT_ATOMIC|
                                BTREE_INSERT_NOFAIL;
@@ -98,17 +100,22 @@ static int bch_migrate_index_update(struct bch_write_op *op)
                        if (m->move)
                                insert_flags |= BTREE_INSERT_USE_RESERVE;
 
-                       if (m->move)
+                       if (m->move) {
+                               nr_new_dirty -= !ptr->cached;
                                __bch_extent_drop_ptr(e, ptr);
+                       }
+
+                       BUG_ON(nr_new_dirty < 0);
 
                        memcpy_u64s(extent_entry_last(e),
-                                   &insert->v,
-                                   bkey_val_u64s(&insert->k));
-                       e.k->u64s += bkey_val_u64s(&insert->k);
+                                   insert.v,
+                                   bkey_val_u64s(insert.k));
+                       e.k->u64s += bkey_val_u64s(insert.k);
 
                        bch_extent_narrow_crcs(e);
                        bch_extent_drop_redundant_crcs(e);
                        bch_extent_normalize(c, e.s);
+                       bch_extent_mark_replicas_cached(c, e, nr_new_dirty);
 
                        ret = bch_btree_insert_at(c, &op->res,
                                        NULL, op_journal_seq(op),
@@ -148,7 +155,8 @@ void bch_migrate_write_init(struct cache_set *c,
        if (move_ptr)
                m->move_ptr = *move_ptr;
 
-       if (bkey_extent_is_cached(k.k))
+       if (bkey_extent_is_cached(k.k) ||
+           (move_ptr && move_ptr->cached))
                flags |= BCH_WRITE_CACHED;
 
        bch_write_op_init(&m->op, c, &m->wbio,
@@ -160,6 +168,7 @@ void bch_migrate_write_init(struct cache_set *c,
        if (m->move)
                m->op.alloc_reserve = RESERVE_MOVINGGC;
 
+       m->op.nonce             = extent_current_nonce(bkey_s_c_to_extent(k));
        m->op.nr_replicas       = 1;
        m->op.index_update_fn   = bch_migrate_index_update;
 }
index cb4f1654651ccecb32ef2bec0776731d37a10d8e..83407eb1acb0a386eae47502af3233e3dc602fbc 100644 (file)
@@ -28,7 +28,7 @@ static const struct bch_extent_ptr *moving_pred(struct cache *ca,
 
        if (bkey_extent_is_data(k.k) &&
            (ptr = bch_extent_has_device(bkey_s_c_to_extent(k),
-                                        ca->sb.nr_this_dev)) &&
+                                        ca->dev_idx)) &&
            PTR_BUCKET(ca, ptr)->mark.copygc)
                return ptr;
 
index e9b5568c95e34fde92eafb7fa3ae28bad777959a..3a50f8fb64b1dcb6b4e8d96b6d89c0097da56b1b 100644 (file)
@@ -25,7 +25,7 @@ static void notify_get(struct cache_set *c)
        env->envp_idx = 0;
        env->buflen = 0;
 
-       notify_var(c, "SET_UUID=%pU", c->disk_sb.user_uuid.b);
+       notify_var(c, "SET_UUID=%pU", c->sb.user_uuid.b);
 }
 
 static void notify_get_cache(struct cache *ca)
@@ -34,7 +34,7 @@ static void notify_get_cache(struct cache *ca)
        char buf[BDEVNAME_SIZE];
 
        notify_get(c);
-       notify_var(c, "UUID=%pU", ca->disk_sb.sb->disk_uuid.b);
+       notify_var(c, "UUID=%pU", ca->uuid.b);
        notify_var(c, "BLOCKDEV=%s", bdevname(ca->disk_sb.bdev, buf));
 }
 
index 60a2a4d1a5aefbcb373dc63326202d60bac1be41..333654eb79038340ac60aa446edaff35e6ff9b2b 100644 (file)
@@ -29,7 +29,6 @@ const char * const bch_str_hash_types[] = {
        "crc32c",
        "crc64",
        "siphash",
-       "sha1",
        NULL
 };
 
@@ -70,11 +69,11 @@ const char * const bch_uint_opt[] = {
 };
 
 enum bch_opts {
-#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm)     \
+#define BCH_OPT(_name, _choices, _min, _max, _sb_opt, _perm)           \
        Opt_##_name,
 
-       CACHE_SET_VISIBLE_OPTS()
-#undef CACHE_SET_OPT
+       BCH_VISIBLE_OPTS()
+#undef BCH_OPT
 
        Opt_bad_opt,
 };
@@ -144,15 +143,15 @@ static int parse_string_opt(const struct bch_option *opt, const char *s)
 static struct bch_opt_result parse_one_opt(const char *opt)
 {
        static const struct bch_option opt_table[] = {
-#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm)     \
+#define BCH_OPT(_name, _choices, _min, _max, _sb_opt, _perm)           \
                [Opt_##_name] = {                                       \
                        .name = #_name,                                 \
                        .opts = _choices,                               \
                        .min = _min,                                    \
                        .max = _max,                                    \
                },
-               CACHE_SET_VISIBLE_OPTS()
-#undef CACHE_SET_OPT
+               BCH_VISIBLE_OPTS()
+#undef BCH_OPT
        }, *i;
 
        for (i = opt_table;
@@ -186,13 +185,13 @@ int bch_parse_options(struct cache_set_opts *opts, int flags, char *options)
                struct bch_opt_result res = parse_one_opt(p);
 
                switch (res.opt) {
-#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm)     \
+#define BCH_OPT(_name, _choices, _min, _max, _sb_opt, _perm)           \
                case Opt_##_name:                                       \
                        opts->_name = res.val;                          \
                        break;
 
-               CACHE_SET_VISIBLE_OPTS()
-#undef CACHE_SET_OPT
+               BCH_VISIBLE_OPTS()
+#undef BCH_OPT
 
                case Opt_bad_opt:
                        return -EINVAL;
index 70df232c589209c50a66138f02031c8057436a30..1d30848f775a79672fadbbbd6b1fc01aee5aaaa6 100644 (file)
@@ -30,47 +30,47 @@ extern const char * const bch_bool_opt[];
 extern const char * const bch_uint_opt[];
 
 /* dummy option, for options that aren't stored in the superblock */
-LE64_BITMASK(NO_SB_OPT,                struct cache_sb, flags, 0, 0);
-
-#define CACHE_SET_VISIBLE_OPTS()                               \
-       CACHE_SET_OPT(verbose_recovery,                         \
-                     bch_bool_opt, 0, 2,                       \
-                     NO_SB_OPT, false)                         \
-       CACHE_SET_OPT(posix_acl,                                \
-                     bch_bool_opt, 0, 2,                       \
-                     NO_SB_OPT, false)                         \
-       CACHE_SET_OPT(journal_flush_disabled,                   \
-                     bch_bool_opt, 0, 2,                       \
-                     NO_SB_OPT, true)                          \
-       CACHE_SET_OPT(nofsck,                                   \
-                     bch_bool_opt, 0, 2,                       \
-                     NO_SB_OPT, true)                          \
-       CACHE_SET_OPT(fix_errors,                               \
-                     bch_bool_opt, 0, 2,                       \
-                     NO_SB_OPT, true)                          \
-       CACHE_SET_OPT(nochanges,                                \
-                     bch_bool_opt, 0, 2,                       \
-                     NO_SB_OPT, 0)                             \
-       CACHE_SET_OPT(noreplay,                                 \
-                     bch_bool_opt, 0, 2,                       \
-                     NO_SB_OPT, 0)                             \
-       CACHE_SET_OPT(norecovery,                               \
-                     bch_bool_opt, 0, 2,                       \
-                     NO_SB_OPT, 0)                             \
-       CACHE_SET_SB_OPTS()
-
-#define CACHE_SET_OPTS()                                       \
-       CACHE_SET_OPT(read_only,                                \
-                     bch_bool_opt, 0, 2,                       \
-                     NO_SB_OPT, 0)                             \
-       CACHE_SET_VISIBLE_OPTS()
+LE64_BITMASK(NO_SB_OPT,                struct bch_sb, flags[0], 0, 0);
+
+#define BCH_VISIBLE_OPTS()                                     \
+       BCH_OPT(verbose_recovery,                               \
+               bch_bool_opt, 0, 2,                             \
+               NO_SB_OPT, false)                               \
+       BCH_OPT(posix_acl,                                      \
+               bch_bool_opt, 0, 2,                             \
+               NO_SB_OPT, false)                               \
+       BCH_OPT(journal_flush_disabled,                         \
+               bch_bool_opt, 0, 2,                             \
+               NO_SB_OPT, true)                                \
+       BCH_OPT(nofsck,                                         \
+               bch_bool_opt, 0, 2,                             \
+               NO_SB_OPT, true)                                \
+       BCH_OPT(fix_errors,                                     \
+               bch_bool_opt, 0, 2,                             \
+               NO_SB_OPT, true)                                \
+       BCH_OPT(nochanges,                                      \
+               bch_bool_opt, 0, 2,                             \
+               NO_SB_OPT, 0)                                   \
+       BCH_OPT(noreplay,                                       \
+               bch_bool_opt, 0, 2,                             \
+               NO_SB_OPT, 0)                                   \
+       BCH_OPT(norecovery,                                     \
+               bch_bool_opt, 0, 2,                             \
+               NO_SB_OPT, 0)                                   \
+       BCH_SB_OPTS()
+
+#define BCH_OPTS()                                             \
+       BCH_OPT(read_only,                                      \
+               bch_bool_opt, 0, 2,                             \
+               NO_SB_OPT, 0)                                   \
+       BCH_VISIBLE_OPTS()
 
 struct cache_set_opts {
-#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm)\
+#define BCH_OPT(_name, _choices, _min, _max, _sb_opt, _perm)\
        s8 _name;
 
-       CACHE_SET_OPTS()
-#undef CACHE_SET_OPT
+       BCH_OPTS()
+#undef BCH_OPT
 };
 
 static inline struct cache_set_opts cache_set_opts_empty(void)
@@ -85,27 +85,27 @@ static inline struct cache_set_opts cache_set_opts_empty(void)
  * Initial options from superblock - here we don't want any options undefined,
  * any options the superblock doesn't specify are set to 0:
  */
-static inline struct cache_set_opts cache_superblock_opts(struct cache_sb *sb)
+static inline struct cache_set_opts cache_superblock_opts(struct bch_sb *sb)
 {
        return (struct cache_set_opts) {
-#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm)\
+#define BCH_OPT(_name, _choices, _min, _max, _sb_opt, _perm)\
                ._name = _sb_opt##_BITS ? _sb_opt(sb) : 0,
 
-       CACHE_SET_OPTS()
-#undef CACHE_SET_OPT
+       BCH_SB_OPTS()
+#undef BCH_OPT
        };
 }
 
 static inline void cache_set_opts_apply(struct cache_set_opts *dst,
                                        struct cache_set_opts src)
 {
-#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm)\
+#define BCH_OPT(_name, _choices, _min, _max, _sb_opt, _perm)\
        BUILD_BUG_ON(_max > S8_MAX);                            \
        if (src._name >= 0)                                     \
                dst->_name = src._name;
 
-       CACHE_SET_OPTS()
-#undef CACHE_SET_OPT
+       BCH_SB_OPTS()
+#undef BCH_OPT
 }
 
 int bch_parse_options(struct cache_set_opts *, int, char *);
index 5ba80b52b7c19c75db1c83953ae8d947640449d6..3a6c9c8217f03719561f236ad3fe3ec3e66095f5 100644 (file)
  * https://131002.net/siphash/
  */
 
-//#include <sys/param.h>
-//#include <sys/systm.h>
-
 #include <asm/byteorder.h>
+#include <asm/unaligned.h>
+#include <linux/bitops.h>
 #include <linux/string.h>
 
 #include "siphash.h"
 
-static void    SipHash_CRounds(SIPHASH_CTX *, int);
-static void    SipHash_Rounds(SIPHASH_CTX *, int);
+static void SipHash_Rounds(SIPHASH_CTX *ctx, int rounds)
+{
+       while (rounds--) {
+               ctx->v[0] += ctx->v[1];
+               ctx->v[2] += ctx->v[3];
+               ctx->v[1] = rol64(ctx->v[1], 13);
+               ctx->v[3] = rol64(ctx->v[3], 16);
+
+               ctx->v[1] ^= ctx->v[0];
+               ctx->v[3] ^= ctx->v[2];
+               ctx->v[0] = rol64(ctx->v[0], 32);
+
+               ctx->v[2] += ctx->v[1];
+               ctx->v[0] += ctx->v[3];
+               ctx->v[1] = rol64(ctx->v[1], 17);
+               ctx->v[3] = rol64(ctx->v[3], 21);
+
+               ctx->v[1] ^= ctx->v[2];
+               ctx->v[3] ^= ctx->v[0];
+               ctx->v[2] = rol64(ctx->v[2], 32);
+       }
+}
+
+static void SipHash_CRounds(SIPHASH_CTX *ctx, const void *ptr, int rounds)
+{
+       u64 m = get_unaligned_le64(ptr);
 
-void
-SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key)
+       ctx->v[3] ^= m;
+       SipHash_Rounds(ctx, rounds);
+       ctx->v[0] ^= m;
+}
+
+void SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key)
 {
        u64 k0, k1;
 
@@ -71,8 +98,8 @@ SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key)
        ctx->bytes = 0;
 }
 
-void
-SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, const void *src, size_t len)
+void SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf,
+                   const void *src, size_t len)
 {
        const u8 *ptr = src;
        size_t left, used;
@@ -88,7 +115,7 @@ SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, const void *src, size_t len)
 
                if (len >= left) {
                        memcpy(&ctx->buf[used], ptr, left);
-                       SipHash_CRounds(ctx, rc);
+                       SipHash_CRounds(ctx, ctx->buf, rc);
                        len -= left;
                        ptr += left;
                } else {
@@ -98,8 +125,7 @@ SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, const void *src, size_t len)
        }
 
        while (len >= sizeof(ctx->buf)) {
-               memcpy(ctx->buf, ptr, sizeof(ctx->buf));
-               SipHash_CRounds(ctx, rc);
+               SipHash_CRounds(ctx, ptr, rc);
                len -= sizeof(ctx->buf);
                ptr += sizeof(ctx->buf);
        }
@@ -108,8 +134,7 @@ SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, const void *src, size_t len)
                memcpy(&ctx->buf[used], ptr, len);
 }
 
-void
-SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf)
+void SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf)
 {
        u64 r;
 
@@ -118,8 +143,7 @@ SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf)
        *((__le64 *) dst) = cpu_to_le64(r);
 }
 
-u64
-SipHash_End(SIPHASH_CTX *ctx, int rc, int rf)
+u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf)
 {
        u64 r;
        size_t left, used;
@@ -129,7 +153,7 @@ SipHash_End(SIPHASH_CTX *ctx, int rc, int rf)
        memset(&ctx->buf[used], 0, left - 1);
        ctx->buf[7] = ctx->bytes;
 
-       SipHash_CRounds(ctx, rc);
+       SipHash_CRounds(ctx, ctx->buf, rc);
        ctx->v[2] ^= 0xff;
        SipHash_Rounds(ctx, rf);
 
@@ -138,48 +162,11 @@ SipHash_End(SIPHASH_CTX *ctx, int rc, int rf)
        return (r);
 }
 
-u64
-SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len)
+u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len)
 {
        SIPHASH_CTX ctx;
 
        SipHash_Init(&ctx, key);
        SipHash_Update(&ctx, rc, rf, src, len);
-       return (SipHash_End(&ctx, rc, rf));
-}
-
-#define SIP_ROTL(x, b) ((x) << (b)) | ( (x) >> (64 - (b)))
-
-static void
-SipHash_Rounds(SIPHASH_CTX *ctx, int rounds)
-{
-       while (rounds--) {
-               ctx->v[0] += ctx->v[1];
-               ctx->v[2] += ctx->v[3];
-               ctx->v[1] = SIP_ROTL(ctx->v[1], 13);
-               ctx->v[3] = SIP_ROTL(ctx->v[3], 16);
-
-               ctx->v[1] ^= ctx->v[0];
-               ctx->v[3] ^= ctx->v[2];
-               ctx->v[0] = SIP_ROTL(ctx->v[0], 32);
-
-               ctx->v[2] += ctx->v[1];
-               ctx->v[0] += ctx->v[3];
-               ctx->v[1] = SIP_ROTL(ctx->v[1], 17);
-               ctx->v[3] = SIP_ROTL(ctx->v[3], 21);
-
-               ctx->v[1] ^= ctx->v[2];
-               ctx->v[3] ^= ctx->v[0];
-               ctx->v[2] = SIP_ROTL(ctx->v[2], 32);
-       }
-}
-
-static void
-SipHash_CRounds(SIPHASH_CTX *ctx, int rounds)
-{
-       u64 m = le64_to_cpu(*((__le64 *)ctx->buf));
-
-       ctx->v[3] ^= m;
-       SipHash_Rounds(ctx, rounds);
-       ctx->v[0] ^= m;
+       return SipHash_End(&ctx, rc, rf);
 }
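
For reference, the reworked functions keep the BSD-style round-count
parameters, so SipHash-2-4 is rc = 2, rf = 4. A caller sketch, equivalent to
the one-shot SipHash(key, 2, 4, buf, len) (alignment is now handled by
get_unaligned_le64() in SipHash_CRounds(), so any buffer works):

	static u64 hash_buf(const SIPHASH_KEY *key, const void *buf, size_t len)
	{
		SIPHASH_CTX ctx;

		SipHash_Init(&ctx, key);
		SipHash_Update(&ctx, 2, 4, buf, len);
		return SipHash_End(&ctx, 2, 4);
	}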
index a489304ce2e6316eb326a5a1a0a8f7ad05d46cc4..b14d05c9656328c1116f327cfc7b5a957ccf3cf8 100644 (file)
@@ -3,37 +3,74 @@
 
 #include "btree_iter.h"
 #include "checksum.h"
+#include "inode.h"
 #include "siphash.h"
 #include "super.h"
 
-#include <crypto/sha1_base.h>
 #include <linux/crc32c.h>
+#include <crypto/hash.h>
 
-static const SIPHASH_KEY bch_siphash_key = {
-       .k0 = cpu_to_le64(0x5a9585fd80087730ULL),
-       .k1 = cpu_to_le64(0xc8de666d50b45664ULL ),
+struct bch_hash_info {
+       u8                      type;
+       union {
+               __le64          crc_key;
+               SIPHASH_KEY     siphash_key;
+       };
 };
 
+static inline struct bch_hash_info
+bch_hash_info_init(const struct bch_inode_unpacked *bi)
+{
+       /* XXX ick */
+       struct bch_hash_info info = {
+               .type = (bi->i_flags >> INODE_STR_HASH_OFFSET) &
+                       ~(~0 << INODE_STR_HASH_BITS)
+       };
+
+       switch (info.type) {
+       case BCH_STR_HASH_CRC32C:
+       case BCH_STR_HASH_CRC64:
+               info.crc_key = bi->i_hash_seed;
+               break;
+       case BCH_STR_HASH_SIPHASH: {
+               SHASH_DESC_ON_STACK(desc, bch_sha256);
+               u8 digest[crypto_shash_digestsize(bch_sha256)];
+
+               desc->tfm = bch_sha256;
+               desc->flags = 0;
+
+               crypto_shash_digest(desc, (void *) &bi->i_hash_seed,
+                                   sizeof(bi->i_hash_seed), digest);
+               memcpy(&info.siphash_key, digest, sizeof(info.siphash_key));
+               break;
+       }
+       default:
+               BUG();
+       }
+
+       return info;
+}
+
 struct bch_str_hash_ctx {
        union {
-               u32                     crc32c;
-               u64                     crc64;
-               SIPHASH_CTX             siphash;
+               u32             crc32c;
+               u64             crc64;
+               SIPHASH_CTX     siphash;
        };
 };
 
 static inline void bch_str_hash_init(struct bch_str_hash_ctx *ctx,
-                                    enum bch_str_hash_type type)
+                                    const struct bch_hash_info *info)
 {
-       switch (type) {
+       switch (info->type) {
        case BCH_STR_HASH_CRC32C:
-               ctx->crc32c = ~0;
+               ctx->crc32c = crc32c(~0, &info->crc_key, sizeof(info->crc_key));
                break;
        case BCH_STR_HASH_CRC64:
-               ctx->crc64 = ~0;
+               ctx->crc64 = bch_crc64_update(~0, &info->crc_key, sizeof(info->crc_key));
                break;
        case BCH_STR_HASH_SIPHASH:
-               SipHash24_Init(&ctx->siphash, &bch_siphash_key);
+               SipHash24_Init(&ctx->siphash, &info->siphash_key);
                break;
        default:
                BUG();
@@ -41,10 +78,10 @@ static inline void bch_str_hash_init(struct bch_str_hash_ctx *ctx,
 }
 
 static inline void bch_str_hash_update(struct bch_str_hash_ctx *ctx,
-                               enum bch_str_hash_type type,
-                               const void *data, size_t len)
+                                      const struct bch_hash_info *info,
+                                      const void *data, size_t len)
 {
-       switch (type) {
+       switch (info->type) {
        case BCH_STR_HASH_CRC32C:
                ctx->crc32c = crc32c(ctx->crc32c, data, len);
                break;
@@ -60,9 +97,9 @@ static inline void bch_str_hash_update(struct bch_str_hash_ctx *ctx,
 }
 
 static inline u64 bch_str_hash_end(struct bch_str_hash_ctx *ctx,
-                                  enum bch_str_hash_type type)
+                                  const struct bch_hash_info *info)
 {
-       switch (type) {
+       switch (info->type) {
        case BCH_STR_HASH_CRC32C:
                return ctx->crc32c;
        case BCH_STR_HASH_CRC64:
@@ -74,19 +111,6 @@ static inline u64 bch_str_hash_end(struct bch_str_hash_ctx *ctx,
        }
 }
 
-struct bch_hash_info {
-       u64             seed;
-       u8              type;
-};
-
-static inline struct bch_hash_info bch_hash_info_init(const struct bch_inode *bi)
-{
-       return (struct bch_hash_info) {
-               .seed = le64_to_cpu(bi->i_hash_seed),
-               .type = INODE_STR_HASH_TYPE(bi),
-       };
-}
-
 struct bch_hash_desc {
        enum btree_id   btree_id;
        u8              key_type;
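
The net effect of this hunk: string hashes are now keyed per inode (seeded
from i_hash_seed, with the SipHash key derived from it via SHA-256) instead of
by the old global siphash key. A lookup-style caller sketch (the function and
its name/len parameters are illustrative):

	static u64 name_hash(const struct bch_inode_unpacked *dir,
			     const void *name, size_t len)
	{
		struct bch_hash_info info = bch_hash_info_init(dir);
		struct bch_str_hash_ctx ctx;

		/* the per-inode key material is folded in at init time: */
		bch_str_hash_init(&ctx, &info);
		bch_str_hash_update(&ctx, &info, name, len);
		return bch_str_hash_end(&ctx, &info);
	}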
diff --git a/libbcache/super-io.c b/libbcache/super-io.c
new file mode 100644 (file)
index 0000000..66338a1
--- /dev/null
@@ -0,0 +1,798 @@
+
+#include "bcache.h"
+#include "blockdev.h"
+#include "checksum.h"
+#include "error.h"
+#include "io.h"
+#include "journal.h"
+#include "super-io.h"
+#include "super.h"
+#include "vstructs.h"
+
+#include <linux/backing-dev.h>
+
+static inline void __bch_sb_layout_size_assert(void)
+{
+       BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512);
+}
+
+struct bch_sb_field *bch_sb_field_get(struct bch_sb *sb,
+                                     enum bch_sb_field_types type)
+{
+       struct bch_sb_field *f;
+
+       /* XXX: need locking around superblock to access optional fields */
+
+       vstruct_for_each(sb, f)
+               if (le32_to_cpu(f->type) == type)
+                       return f;
+       return NULL;
+}
+
+void bch_free_super(struct bcache_superblock *sb)
+{
+       if (sb->bio)
+               bio_put(sb->bio);
+       if (!IS_ERR_OR_NULL(sb->bdev))
+               blkdev_put(sb->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+
+       free_pages((unsigned long) sb->sb, sb->page_order);
+       memset(sb, 0, sizeof(*sb));
+}
+
+static int __bch_super_realloc(struct bcache_superblock *sb, unsigned order)
+{
+       struct bch_sb *new_sb;
+       struct bio *bio;
+
+       if (sb->page_order >= order && sb->sb)
+               return 0;
+
+       if (dynamic_fault("bcache:add:super_realloc"))
+               return -ENOMEM;
+
+       bio = bio_kmalloc(GFP_KERNEL, 1 << order);
+       if (!bio)
+               return -ENOMEM;
+
+       if (sb->bio)
+               bio_put(sb->bio);
+       sb->bio = bio;
+
+       new_sb = (void *) __get_free_pages(GFP_KERNEL, order);
+       if (!new_sb)
+               return -ENOMEM;
+
+       if (sb->sb)
+               memcpy(new_sb, sb->sb, PAGE_SIZE << sb->page_order);
+
+       free_pages((unsigned long) sb->sb, sb->page_order);
+       sb->sb = new_sb;
+
+       sb->page_order = order;
+
+       return 0;
+}
+
+int bch_dev_sb_realloc(struct bcache_superblock *sb, unsigned u64s)
+{
+       u64 new_bytes = __vstruct_bytes(struct bch_sb, u64s);
+       u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
+
+       if (new_bytes > max_bytes) {
+               char buf[BDEVNAME_SIZE];
+
+               pr_err("%s: superblock too big: want %llu but have %llu",
+                      bdevname(sb->bdev, buf), new_bytes, max_bytes);
+               return -ENOSPC;
+       }
+
+       return __bch_super_realloc(sb, get_order(new_bytes));
+}
+
+static int bch_fs_sb_realloc(struct cache_set *c, unsigned u64s)
+{
+       u64 bytes = __vstruct_bytes(struct bch_sb, u64s);
+       struct bch_sb *sb;
+       unsigned order = get_order(bytes);
+
+       if (c->disk_sb && order <= c->disk_sb_order)
+               return 0;
+
+       sb = (void *) __get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
+       if (!sb)
+               return -ENOMEM;
+
+       if (c->disk_sb)
+               memcpy(sb, c->disk_sb, PAGE_SIZE << c->disk_sb_order);
+
+       free_pages((unsigned long) c->disk_sb, c->disk_sb_order);
+
+       c->disk_sb = sb;
+       c->disk_sb_order = order;
+       return 0;
+}
+
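+/*
+ * Resize an optional field in place: fields after it are shifted and
+ * sb->u64s adjusted. Callers must already have grown the superblock
+ * buffer(s) to fit, which is what the two wrappers below do:
+ */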
+static struct bch_sb_field *__bch_sb_field_resize(struct bch_sb *sb,
+                                                 struct bch_sb_field *f,
+                                                 unsigned u64s)
+{
+       unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0;
+
+       if (!f) {
+               f = vstruct_last(sb);
+               memset(f, 0, sizeof(u64) * u64s);
+               f->u64s = cpu_to_le32(u64s);
+               f->type = 0;
+       } else {
+               void *src, *dst;
+
+               src = vstruct_end(f);
+               f->u64s = cpu_to_le32(u64s);
+               dst = vstruct_end(f);
+
+               memmove(dst, src, vstruct_end(sb) - src);
+
+               if (dst > src)
+                       memset(src, 0, dst - src);
+       }
+
+       le32_add_cpu(&sb->u64s, u64s - old_u64s);
+
+       return f;
+}
+
+struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *c,
+                                           struct bch_sb_field *f,
+                                           unsigned u64s)
+{
+       ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
+       ssize_t d = -old_u64s + u64s;
+       struct cache *ca;
+       unsigned i;
+
+       lockdep_assert_held(&c->sb_lock);
+
+       if (bch_fs_sb_realloc(c, le32_to_cpu(c->disk_sb->u64s) + d))
+               return NULL;
+
+       for_each_cache(ca, c, i) {
+               struct bcache_superblock *sb = &ca->disk_sb;
+
+               if (bch_dev_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) {
+                       percpu_ref_put(&ca->ref);
+                       return NULL;
+               }
+       }
+
+       return __bch_sb_field_resize(c->disk_sb, f, u64s);
+}
+
+struct bch_sb_field *bch_dev_sb_field_resize(struct bcache_superblock *sb,
+                                            struct bch_sb_field *f,
+                                            unsigned u64s)
+{
+       ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
+       ssize_t d = -old_u64s + u64s;
+
+       if (bch_dev_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d))
+               return NULL;
+
+       return __bch_sb_field_resize(sb->sb, f, u64s);
+}
+
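+/*
+ * Every superblock copy is listed in the layout: the first must live at the
+ * default BCH_SB_SECTOR, and successive offsets must be spaced at least
+ * 1 << sb_max_size_bits sectors apart so copies can never overlap:
+ */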
+static const char *validate_sb_layout(struct bch_sb_layout *layout)
+{
+       u64 offset, prev_offset, max_sectors;
+       unsigned i;
+
+       if (uuid_le_cmp(layout->magic, BCACHE_MAGIC))
+               return "Not a bcache superblock layout";
+
+       if (layout->layout_type != 0)
+               return "Invalid superblock layout type";
+
+       if (!layout->nr_superblocks)
+               return "Invalid superblock layout: no superblocks";
+
+       if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset))
+               return "Invalid superblock layout: too many superblocks";
+
+       max_sectors = 1 << layout->sb_max_size_bits;
+
+       prev_offset = le64_to_cpu(layout->sb_offset[0]);
+
+       if (prev_offset != BCH_SB_SECTOR)
+               return "Invalid superblock layout: doesn't have default superblock location";
+
+       for (i = 1; i < layout->nr_superblocks; i++) {
+               offset = le64_to_cpu(layout->sb_offset[i]);
+
+               if (offset < prev_offset + max_sectors)
+                       return "Invalid superblock layout: superblocks overlap";
+               prev_offset = offset;
+       }
+
+       return NULL;
+}
+
+const char *bch_validate_cache_super(struct bcache_superblock *disk_sb)
+{
+       struct bch_sb *sb = disk_sb->sb;
+       struct bch_sb_field *f;
+       struct bch_sb_field_members *sb_mi;
+       struct bch_sb_field_journal *journal;
+       struct cache_member_cpu mi;
+       const char *err;
+       u16 block_size;
+       unsigned i;
+
+       switch (le64_to_cpu(sb->version)) {
+       case BCACHE_SB_VERSION_CDEV_V4:
+               break;
+       default:
+               return"Unsupported superblock version";
+       }
+
+       if (BCH_SB_INITIALIZED(sb) &&
+           le64_to_cpu(sb->version) != BCACHE_SB_VERSION_CDEV_V4)
+               return "Unsupported superblock version";
+
+       block_size = le16_to_cpu(sb->block_size);
+
+       if (!is_power_of_2(block_size) ||
+           block_size > PAGE_SECTORS)
+               return "Bad block size";
+
+       if (bch_is_zero(sb->user_uuid.b, sizeof(uuid_le)))
+               return "Bad user UUID";
+
+       if (bch_is_zero(sb->uuid.b, sizeof(uuid_le)))
+               return "Bad internal UUID";
+
+       if (!sb->nr_devices ||
+           sb->nr_devices <= sb->dev_idx ||
+           sb->nr_devices > BCH_SB_MEMBERS_MAX)
+               return "Bad cache device number in set";
+
+       if (!BCH_SB_META_REPLICAS_WANT(sb) ||
+           BCH_SB_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
+               return "Invalid number of metadata replicas";
+
+       if (!BCH_SB_META_REPLICAS_HAVE(sb) ||
+           BCH_SB_META_REPLICAS_HAVE(sb) >
+           BCH_SB_META_REPLICAS_WANT(sb))
+               return "Invalid number of metadata replicas";
+
+       if (!BCH_SB_DATA_REPLICAS_WANT(sb) ||
+           BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
+               return "Invalid number of data replicas";
+
+       if (!BCH_SB_DATA_REPLICAS_HAVE(sb) ||
+           BCH_SB_DATA_REPLICAS_HAVE(sb) >
+           BCH_SB_DATA_REPLICAS_WANT(sb))
+               return "Invalid number of data replicas";
+
+       if (!BCH_SB_BTREE_NODE_SIZE(sb))
+               return "Btree node size not set";
+
+       if (!is_power_of_2(BCH_SB_BTREE_NODE_SIZE(sb)))
+               return "Btree node size not a power of two";
+
+       if (BCH_SB_BTREE_NODE_SIZE(sb) > BTREE_NODE_SIZE_MAX)
+               return "Btree node size too large";
+
+       if (BCH_SB_GC_RESERVE(sb) < 5)
+               return "gc reserve percentage too small";
+
+       if (1U << BCH_SB_JOURNAL_ENTRY_SIZE(sb) < block_size)
+               return "max journal entry size too small";
+
+       /* 4 MB max: */
+       if (512U << BCH_SB_JOURNAL_ENTRY_SIZE(sb) > JOURNAL_ENTRY_SIZE_MAX)
+               return "max journal entry size too big";
+
+       if (!sb->time_precision ||
+           le32_to_cpu(sb->time_precision) > NSEC_PER_SEC)
+               return "invalid time precision";
+
+       /* validate layout */
+       err = validate_sb_layout(&sb->layout);
+       if (err)
+               return err;
+
+       vstruct_for_each(sb, f) {
+               if (!f->u64s)
+                       return "Invalid superblock: invalid optional field";
+
+               if (vstruct_next(f) > vstruct_last(sb))
+                       return "Invalid superblock: invalid optional field";
+
+               if (le32_to_cpu(f->type) >= BCH_SB_FIELD_NR)
+                       return "Invalid superblock: unknown optional field type";
+       }
+
+       /* Validate member info: */
+       sb_mi = bch_sb_get_members(sb);
+       if (!sb_mi)
+               return "Invalid superblock: member info area missing";
+
+       if ((void *) (sb_mi->members + sb->nr_devices) >
+           vstruct_end(&sb_mi->field))
+               return "Invalid superblock: bad member info";
+
+       mi = cache_mi_to_cpu_mi(sb_mi->members + sb->dev_idx);
+
+       for (i = 0; i < sb->layout.nr_superblocks; i++) {
+               u64 offset = le64_to_cpu(sb->layout.sb_offset[i]);
+               u64 max_size = 1 << sb->layout.sb_max_size_bits;
+
+               if (offset + max_size > mi.first_bucket * mi.bucket_size)
+                       return "Invalid superblock: first bucket comes before end of super";
+       }
+
+       if (mi.nbuckets > LONG_MAX)
+               return "Too many buckets";
+
+       if (mi.nbuckets - mi.first_bucket < 1 << 10)
+               return "Not enough buckets";
+
+       if (!is_power_of_2(mi.bucket_size) ||
+           mi.bucket_size < PAGE_SECTORS ||
+           mi.bucket_size < block_size)
+               return "Bad bucket size";
+
+       if (get_capacity(disk_sb->bdev->bd_disk) <
+           mi.bucket_size * mi.nbuckets)
+               return "Invalid superblock: device too small";
+
+       /* Validate journal buckets: */
+       journal = bch_sb_get_journal(sb);
+       if (journal) {
+               for (i = 0; i < bch_nr_journal_buckets(journal); i++) {
+                       u64 b = le64_to_cpu(journal->buckets[i]);
+
+                       if (b <  mi.first_bucket || b >= mi.nbuckets)
+                               return "bad journal bucket";
+               }
+       }
+
+       return NULL;
+}
+
+/* device open: */
+
+static bool bch_is_open_cache(struct block_device *bdev)
+{
+       struct cache_set *c;
+       struct cache *ca;
+       unsigned i;
+
+       rcu_read_lock();
+       list_for_each_entry(c, &bch_cache_sets, list)
+               for_each_cache_rcu(ca, c, i)
+                       if (ca->disk_sb.bdev == bdev) {
+                               rcu_read_unlock();
+                               return true;
+                       }
+       rcu_read_unlock();
+       return false;
+}
+
+static bool bch_is_open(struct block_device *bdev)
+{
+       lockdep_assert_held(&bch_register_lock);
+
+       return bch_is_open_cache(bdev) || bch_is_open_backing_dev(bdev);
+}
+
+static const char *bch_blkdev_open(const char *path, void *holder,
+                                  struct cache_set_opts opts,
+                                  struct block_device **ret)
+{
+       struct block_device *bdev;
+       fmode_t mode = opts.nochanges > 0
+               ? FMODE_READ
+               : FMODE_READ|FMODE_WRITE|FMODE_EXCL;
+       const char *err;
+
+       *ret = NULL;
+       bdev = blkdev_get_by_path(path, mode, holder);
+
+       if (bdev == ERR_PTR(-EBUSY)) {
+               bdev = lookup_bdev(path);
+               if (IS_ERR(bdev))
+                       return "device busy";
+
+               err = bch_is_open(bdev)
+                       ? "device already registered"
+                       : "device busy";
+
+               bdput(bdev);
+               return err;
+       }
+
+       if (IS_ERR(bdev))
+               return "failed to open device";
+
+       bdev_get_queue(bdev)->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES;
+
+       *ret = bdev;
+       return NULL;
+}
+
+/* Update cached mi: */
+int bch_cache_set_mi_update(struct cache_set *c,
+                           struct bch_member *mi,
+                           unsigned nr_devices)
+{
+       struct cache_member_rcu *new, *old;
+       struct cache *ca;
+       unsigned i;
+
+       lockdep_assert_held(&c->sb_lock);
+
+       new = kzalloc(sizeof(struct cache_member_rcu) +
+                     sizeof(struct cache_member_cpu) * nr_devices,
+                     GFP_KERNEL);
+       if (!new)
+               return -ENOMEM;
+
+       new->nr_devices = nr_devices;
+
+       for (i = 0; i < nr_devices; i++)
+               new->m[i] = cache_mi_to_cpu_mi(&mi[i]);
+
+       rcu_read_lock();
+       for_each_cache(ca, c, i)
+               ca->mi = new->m[i];
+       rcu_read_unlock();
+
+       old = rcu_dereference_protected(c->members,
+                               lockdep_is_held(&c->sb_lock));
+
+       rcu_assign_pointer(c->members, new);
+       if (old)
+               kfree_rcu(old, rcu);
+
+       return 0;
+}
+
+static void bch_sb_update(struct cache_set *c)
+{
+       struct bch_sb *src = c->disk_sb;
+
+       lockdep_assert_held(&c->sb_lock);
+
+       c->sb.uuid              = src->uuid;
+       c->sb.user_uuid         = src->user_uuid;
+       c->sb.block_size        = le16_to_cpu(src->block_size);
+       c->sb.btree_node_size   = BCH_SB_BTREE_NODE_SIZE(src);
+       c->sb.nr_devices        = src->nr_devices;
+       c->sb.clean             = BCH_SB_CLEAN(src);
+       c->sb.meta_replicas_have= BCH_SB_META_REPLICAS_HAVE(src);
+       c->sb.data_replicas_have= BCH_SB_DATA_REPLICAS_HAVE(src);
+       c->sb.str_hash_type     = BCH_SB_STR_HASH_TYPE(src);
+       c->sb.encryption_type   = BCH_SB_ENCRYPTION_TYPE(src);
+       c->sb.time_base_lo      = le64_to_cpu(src->time_base_lo);
+       c->sb.time_base_hi      = le32_to_cpu(src->time_base_hi);
+       c->sb.time_precision    = le32_to_cpu(src->time_precision);
+}
+
+/* doesn't copy member info */
+static void __copy_super(struct bch_sb *dst, struct bch_sb *src)
+{
+       struct bch_sb_field *src_f, *dst_f;
+
+       dst->version            = src->version;
+       dst->seq                = src->seq;
+       dst->uuid               = src->uuid;
+       dst->user_uuid          = src->user_uuid;
+       memcpy(dst->label,      src->label, sizeof(dst->label));
+
+       dst->block_size         = src->block_size;
+       dst->nr_devices         = src->nr_devices;
+
+       dst->time_base_lo       = src->time_base_lo;
+       dst->time_base_hi       = src->time_base_hi;
+       dst->time_precision     = src->time_precision;
+
+       memcpy(dst->flags,      src->flags,     sizeof(dst->flags));
+       memcpy(dst->features,   src->features,  sizeof(dst->features));
+       memcpy(dst->compat,     src->compat,    sizeof(dst->compat));
+
+       vstruct_for_each(src, src_f) {
+               if (src_f->type == BCH_SB_FIELD_journal)
+                       continue;
+
+               dst_f = bch_sb_field_get(dst, src_f->type);
+               dst_f = __bch_sb_field_resize(dst, dst_f,
+                               le32_to_cpu(src_f->u64s));
+
+               memcpy(dst_f, src_f, vstruct_bytes(src_f));
+       }
+}
+
+int bch_sb_to_cache_set(struct cache_set *c, struct bch_sb *src)
+{
+       struct bch_sb_field_members *members =
+               bch_sb_get_members(src);
+       struct bch_sb_field_journal *journal_buckets =
+               bch_sb_get_journal(src);
+       unsigned journal_u64s = journal_buckets
+               ? le32_to_cpu(journal_buckets->field.u64s)
+               : 0;
+
+       lockdep_assert_held(&c->sb_lock);
+
+       if (bch_fs_sb_realloc(c, le32_to_cpu(src->u64s) - journal_u64s))
+               return -ENOMEM;
+
+       if (bch_cache_set_mi_update(c, members->members, src->nr_devices))
+               return -ENOMEM;
+
+       __copy_super(c->disk_sb, src);
+       bch_sb_update(c);
+
+       return 0;
+}
+
+int bch_sb_from_cache_set(struct cache_set *c, struct cache *ca)
+{
+       struct bch_sb *src = c->disk_sb, *dst = ca->disk_sb.sb;
+       struct bch_sb_field_journal *journal_buckets =
+               bch_sb_get_journal(dst);
+       unsigned journal_u64s = journal_buckets
+               ? le32_to_cpu(journal_buckets->field.u64s)
+               : 0;
+       unsigned u64s = le32_to_cpu(src->u64s) + journal_u64s;
+       int ret;
+
+       ret = bch_dev_sb_realloc(&ca->disk_sb, u64s);
+       if (ret)
+               return ret;
+
+       __copy_super(dst, src);
+
+       return 0;
+}
+
+/* read superblock: */
+
+static const char *read_one_super(struct bcache_superblock *sb, u64 offset)
+{
+       struct bch_csum csum;
+       size_t bytes;
+       unsigned order;
+reread:
+       bio_reset(sb->bio);
+       sb->bio->bi_bdev = sb->bdev;
+       sb->bio->bi_iter.bi_sector = offset;
+       sb->bio->bi_iter.bi_size = PAGE_SIZE << sb->page_order;
+       bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
+       bch_bio_map(sb->bio, sb->sb);
+
+       if (submit_bio_wait(sb->bio))
+               return "IO error";
+
+       if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC))
+               return "Not a bcache superblock";
+
+       if (le64_to_cpu(sb->sb->version) != BCACHE_SB_VERSION_CDEV_V4)
+               return "Unsupported superblock version";
+
+       bytes = vstruct_bytes(sb->sb);
+
+       if (bytes > 512 << sb->sb->layout.sb_max_size_bits)
+               return "Bad superblock: too big";
+
+       order = get_order(bytes);
+       if (order > sb->page_order) {
+               if (__bch_super_realloc(sb, order))
+                       return "cannot allocate memory";
+               goto reread;
+       }
+
+       if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR)
+               return "unknown csum type";
+
+       /* XXX: verify MACs */
+       csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb),
+                           (struct nonce) { 0 }, sb->sb);
+
+       if (bch_crc_cmp(csum, sb->sb->csum))
+               return "bad checksum reading superblock";
+
+       return NULL;
+}
+
+const char *bch_read_super(struct bcache_superblock *sb,
+                          struct cache_set_opts opts,
+                          const char *path)
+{
+       struct bch_sb_layout layout;
+       const char *err;
+       unsigned i;
+
+       lockdep_assert_held(&bch_register_lock);
+
+       memset(sb, 0, sizeof(*sb));
+
+       err = bch_blkdev_open(path, &sb, opts, &sb->bdev);
+       if (err)
+               return err;
+
+       err = "cannot allocate memory";
+       if (__bch_super_realloc(sb, 0))
+               goto err;
+
+       err = "dynamic fault";
+       if (cache_set_init_fault("read_super"))
+               goto err;
+
+       err = read_one_super(sb, BCH_SB_SECTOR);
+       if (!err)
+               goto got_super;
+
+       pr_err("error reading default super: %s", err);
+
+       /*
+        * Error reading primary superblock - read location of backup
+        * superblocks:
+        */
+       bio_reset(sb->bio);
+       sb->bio->bi_bdev = sb->bdev;
+       sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR;
+       sb->bio->bi_iter.bi_size = sizeof(struct bch_sb_layout);
+       bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
+       /*
+        * use sb buffer to read layout, since sb buffer is page aligned but
+        * layout won't be:
+        */
+       bch_bio_map(sb->bio, sb->sb);
+
+       err = "IO error";
+       if (submit_bio_wait(sb->bio))
+               goto err;
+
+       memcpy(&layout, sb->sb, sizeof(layout));
+       err = validate_sb_layout(&layout);
+       if (err)
+               goto err;
+
+       for (i = 0; i < layout.nr_superblocks; i++) {
+               u64 offset = le64_to_cpu(layout.sb_offset[i]);
+
+               if (offset == BCH_SB_SECTOR)
+                       continue;
+
+               err = read_one_super(sb, offset);
+               if (!err)
+                       goto got_super;
+       }
+       goto err;
+got_super:
+       pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
+                le64_to_cpu(sb->sb->version),
+                le64_to_cpu(sb->sb->flags),
+                le64_to_cpu(sb->sb->seq),
+                le32_to_cpu(sb->sb->u64s));
+
+       err = "Superblock block size smaller than device block size";
+       if (le16_to_cpu(sb->sb->block_size) << 9 <
+           bdev_logical_block_size(sb->bdev))
+               goto err;
+
+       return NULL;
+err:
+       bch_free_super(sb);
+       return err;
+}
+
+/* write superblock: */
+
+static void write_super_endio(struct bio *bio)
+{
+       struct cache *ca = bio->bi_private;
+
+       /* XXX: return errors directly */
+
+       cache_fatal_io_err_on(bio->bi_error, ca, "superblock write");
+
+       bch_account_io_completion(ca);
+
+       closure_put(&ca->set->sb_write);
+       percpu_ref_put(&ca->ref);
+}
+
+static bool write_one_super(struct cache_set *c, struct cache *ca, unsigned idx)
+{
+       struct bch_sb *sb = ca->disk_sb.sb;
+       struct bio *bio = ca->disk_sb.bio;
+
+       if (idx >= sb->layout.nr_superblocks)
+               return false;
+
+       sb->offset = sb->layout.sb_offset[idx];
+
+       SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum);
+       sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb),
+                               (struct nonce) { 0 }, sb);
+
+       bio_reset(bio);
+       bio->bi_bdev            = ca->disk_sb.bdev;
+       bio->bi_iter.bi_sector  = le64_to_cpu(sb->offset);
+       bio->bi_iter.bi_size    =
+               roundup(vstruct_bytes(sb),
+                       bdev_logical_block_size(ca->disk_sb.bdev));
+       bio->bi_end_io          = write_super_endio;
+       bio->bi_private         = ca;
+       bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
+       bch_bio_map(bio, sb);
+
+       percpu_ref_get(&ca->ref);
+       closure_bio_submit_punt(bio, &c->sb_write, c);
+
+       return true;
+}
+
+void bch_write_super(struct cache_set *c)
+{
+       struct bch_sb_field_members *members =
+               bch_sb_get_members(c->disk_sb);
+       struct closure *cl = &c->sb_write;
+       struct cache *ca;
+       unsigned i, super_idx = 0;
+       bool wrote;
+
+       lockdep_assert_held(&c->sb_lock);
+
+       closure_init_stack(cl);
+
+       le64_add_cpu(&c->disk_sb->seq, 1);
+
+       for_each_cache(ca, c, i)
+               bch_sb_from_cache_set(c, ca);
+
+       do {
+               wrote = false;
+               for_each_cache(ca, c, i)
+                       if (write_one_super(c, ca, super_idx))
+                               wrote = true;
+
+               closure_sync(cl);
+               super_idx++;
+       } while (wrote);
+
+       /* Make new options visible after they're persistent: */
+       bch_cache_set_mi_update(c, members->members, c->sb.nr_devices);
+       bch_sb_update(c);
+}
+
+void bch_check_mark_super_slowpath(struct cache_set *c, const struct bkey_i *k,
+                                  bool meta)
+{
+       struct bch_member *mi;
+       struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
+       const struct bch_extent_ptr *ptr;
+
+       mutex_lock(&c->sb_lock);
+
+       /* recheck, might have raced */
+       if (bch_check_super_marked(c, k, meta)) {
+               mutex_unlock(&c->sb_lock);
+               return;
+       }
+
+       mi = bch_sb_get_members(c->disk_sb)->members;
+
+       extent_for_each_ptr(e, ptr)
+               if (!ptr->cached)
+                       (meta
+                        ? SET_BCH_MEMBER_HAS_METADATA
+                        : SET_BCH_MEMBER_HAS_DATA)(mi + ptr->dev, true);
+
+       bch_write_super(c);
+       mutex_unlock(&c->sb_lock);
+}
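
Putting the field helpers together: optional superblock fields are
variable-length, so adding or growing one goes through the resize helpers,
which reallocate the buffers and shift any later fields. A sketch for a single
device's superblock (the crypt field is assumed only because the header below
adds BCH_SB_FIELD_TYPE(crypt); error handling abbreviated):

	static int grow_crypt_field(struct bcache_superblock *sb, unsigned u64s)
	{
		struct bch_sb_field_crypt *crypt = bch_sb_get_crypt(sb->sb);
		struct bch_sb_field *f;

		f = bch_dev_sb_field_resize(sb, crypt ? &crypt->field : NULL, u64s);
		if (!f)
			return -ENOMEM;

		/* new fields come back zeroed with type 0; the caller tags them: */
		f->type = cpu_to_le32(BCH_SB_FIELD_crypt);
		return 0;
	}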
diff --git a/libbcache/super-io.h b/libbcache/super-io.h
new file mode 100644 (file)
index 0000000..1eda57b
--- /dev/null
@@ -0,0 +1,141 @@
+#ifndef _BCACHE_SUPER_IO_H
+#define _BCACHE_SUPER_IO_H
+
+#include "extents.h"
+#include "super_types.h"
+
+#include <asm/byteorder.h>
+
+struct bch_sb_field *bch_sb_field_get(struct bch_sb *, enum bch_sb_field_types);
+
+#define BCH_SB_FIELD_TYPE(_name)                               \
+static inline struct bch_sb_field_##_name *                    \
+bch_sb_get_##_name(struct bch_sb *sb)                          \
+{                                                              \
+       struct bch_sb_field *f =                                \
+               bch_sb_field_get(sb, BCH_SB_FIELD_##_name);     \
+                                                               \
+       return container_of_or_null(f, struct bch_sb_field_##_name, field);\
+}
+
+BCH_SB_FIELD_TYPE(journal);
+BCH_SB_FIELD_TYPE(members);
+BCH_SB_FIELD_TYPE(crypt);
+
+static inline bool bch_sb_test_feature(struct bch_sb *sb,
+                                      enum bch_sb_features f)
+{
+       unsigned w = f / 64;
+       unsigned b = f % 64;
+
+       return le64_to_cpu(sb->features[w]) & (1ULL << b);
+}
+
+static inline void bch_sb_set_feature(struct bch_sb *sb,
+                                     enum bch_sb_features f)
+{
+       if (!bch_sb_test_feature(sb, f)) {
+               unsigned w = f / 64;
+               unsigned b = f % 64;
+
+               le64_add_cpu(&sb->features[w], 1ULL << b);
+       }
+}
+
+static inline __le64 bch_sb_magic(struct cache_set *c)
+{
+       __le64 ret;
+       memcpy(&ret, &c->sb.uuid, sizeof(ret));
+       return ret;
+}
+
+static inline __u64 jset_magic(struct cache_set *c)
+{
+       return __le64_to_cpu(bch_sb_magic(c) ^ JSET_MAGIC);
+}
+
+static inline __u64 pset_magic(struct cache_set *c)
+{
+       return __le64_to_cpu(bch_sb_magic(c) ^ PSET_MAGIC);
+}
+
+static inline __u64 bset_magic(struct cache_set *c)
+{
+       return __le64_to_cpu(bch_sb_magic(c) ^ BSET_MAGIC);
+}
+
+static inline struct cache_member_cpu cache_mi_to_cpu_mi(struct bch_member *mi)
+{
+       return (struct cache_member_cpu) {
+               .nbuckets       = le64_to_cpu(mi->nbuckets),
+               .first_bucket   = le16_to_cpu(mi->first_bucket),
+               .bucket_size    = le16_to_cpu(mi->bucket_size),
+               .state          = BCH_MEMBER_STATE(mi),
+               .tier           = BCH_MEMBER_TIER(mi),
+               .has_metadata   = BCH_MEMBER_HAS_METADATA(mi),
+               .has_data       = BCH_MEMBER_HAS_DATA(mi),
+               .replacement    = BCH_MEMBER_REPLACEMENT(mi),
+               .discard        = BCH_MEMBER_DISCARD(mi),
+               .valid          = !bch_is_zero(mi->uuid.b, sizeof(uuid_le)),
+       };
+}
+
+int bch_cache_set_mi_update(struct cache_set *, struct bch_member *, unsigned);
+
+int bch_sb_to_cache_set(struct cache_set *, struct bch_sb *);
+int bch_sb_from_cache_set(struct cache_set *, struct cache *);
+
+struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *,
+                               struct bch_sb_field *, unsigned);
+struct bch_sb_field *bch_dev_sb_field_resize(struct bcache_superblock *,
+                               struct bch_sb_field *, unsigned);
+
+void bch_free_super(struct bcache_superblock *);
+int bch_dev_sb_realloc(struct bcache_superblock *, unsigned);
+
+const char *bch_validate_cache_super(struct bcache_superblock *);
+
+const char *bch_read_super(struct bcache_superblock *,
+                          struct cache_set_opts, const char *);
+void bch_write_super(struct cache_set *);
+
+void bch_check_mark_super_slowpath(struct cache_set *,
+                                  const struct bkey_i *, bool);
+
+#define cache_member_info_get(_c)                                      \
+       (rcu_read_lock(), rcu_dereference((_c)->members))
+
+#define cache_member_info_put()        rcu_read_unlock()
+
+static inline bool bch_check_super_marked(struct cache_set *c,
+                                         const struct bkey_i *k, bool meta)
+{
+       struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
+       const struct bch_extent_ptr *ptr;
+       struct cache_member_cpu *mi = cache_member_info_get(c)->m;
+       bool ret = true;
+
+       extent_for_each_ptr(e, ptr)
+               if (!ptr->cached &&
+                   !(meta
+                     ? mi[ptr->dev].has_metadata
+                     : mi[ptr->dev].has_data)) {
+                       ret = false;
+                       break;
+               }
+
+       cache_member_info_put();
+
+       return ret;
+}
+
+static inline void bch_check_mark_super(struct cache_set *c,
+                                       const struct bkey_i *k, bool meta)
+{
+       if (bch_check_super_marked(c, k, meta))
+               return;
+
+       bch_check_mark_super_slowpath(c, k, meta);
+}
+
+#endif /* _BCACHE_SUPER_IO_H */
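
One note on the *_magic() helpers above: each on-disk structure's magic value
is the filesystem UUID folded (XORed) with a per-type constant, so metadata
written by a different filesystem fails the magic check instead of being
misparsed. Sketch of the check a journal reader would make (struct jset and
JSET_MAGIC are defined elsewhere in the tree):

	static bool jset_belongs_to(struct cache_set *c, const struct jset *j)
	{
		/* false for journal blocks written by any other filesystem: */
		return le64_to_cpu(j->magic) == jset_magic(c);
	}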
index 296700b30dd54f824bdbbe8a1cd98984455341c4..c026c0ddafe548e8be37216ef8a443a7902fd312 100644 (file)
 #include "notify.h"
 #include "stats.h"
 #include "super.h"
+#include "super-io.h"
 #include "tier.h"
 #include "writeback.h"
 
 #include <linux/backing-dev.h>
 #include <linux/blkdev.h>
 #include <linux/debugfs.h>
+#include <linux/device.h>
 #include <linux/genhd.h>
 #include <linux/idr.h>
 #include <linux/kthread.h>
@@ -69,70 +71,11 @@ static struct device *bch_chardev;
 static DEFINE_IDR(bch_chardev_minor);
 static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait);
 struct workqueue_struct *bcache_io_wq;
-struct crypto_shash *bch_sha1;
+struct crypto_shash *bch_sha256;
 
 static void bch_cache_stop(struct cache *);
 static int bch_cache_online(struct cache *);
 
-static bool bch_is_open_cache(struct block_device *bdev)
-{
-       struct cache_set *c;
-       struct cache *ca;
-       unsigned i;
-
-       rcu_read_lock();
-       list_for_each_entry(c, &bch_cache_sets, list)
-               for_each_cache_rcu(ca, c, i)
-                       if (ca->disk_sb.bdev == bdev) {
-                               rcu_read_unlock();
-                               return true;
-                       }
-       rcu_read_unlock();
-       return false;
-}
-
-static bool bch_is_open(struct block_device *bdev)
-{
-       lockdep_assert_held(&bch_register_lock);
-
-       return bch_is_open_cache(bdev) || bch_is_open_backing_dev(bdev);
-}
-
-static const char *bch_blkdev_open(const char *path, void *holder,
-                                  struct cache_set_opts opts,
-                                  struct block_device **ret)
-{
-       struct block_device *bdev;
-       fmode_t mode = opts.nochanges > 0
-               ? FMODE_READ
-               : FMODE_READ|FMODE_WRITE|FMODE_EXCL;
-       const char *err;
-
-       *ret = NULL;
-       bdev = blkdev_get_by_path(path, mode, holder);
-
-       if (bdev == ERR_PTR(-EBUSY)) {
-               bdev = lookup_bdev(path);
-               if (IS_ERR(bdev))
-                       return "device busy";
-
-               err = bch_is_open(bdev)
-                       ? "device already registered"
-                       : "device busy";
-
-               bdput(bdev);
-               return err;
-       }
-
-       if (IS_ERR(bdev))
-               return "failed to open device";
-
-       bdev_get_queue(bdev)->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES;
-
-       *ret = bdev;
-       return NULL;
-}
-
 static int bch_congested_fn(void *data, int bdi_bits)
 {
        struct backing_dev_info *bdi;
@@ -168,520 +111,6 @@ static int bch_congested_fn(void *data, int bdi_bits)
        return ret;
 }
 
-/* Superblock */
-
-static struct cache_member_cpu cache_mi_to_cpu_mi(struct cache_member *mi)
-{
-       return (struct cache_member_cpu) {
-               .nbuckets       = le64_to_cpu(mi->nbuckets),
-               .first_bucket   = le16_to_cpu(mi->first_bucket),
-               .bucket_size    = le16_to_cpu(mi->bucket_size),
-               .state          = CACHE_STATE(mi),
-               .tier           = CACHE_TIER(mi),
-               .replication_set= CACHE_REPLICATION_SET(mi),
-               .has_metadata   = CACHE_HAS_METADATA(mi),
-               .has_data       = CACHE_HAS_DATA(mi),
-               .replacement    = CACHE_REPLACEMENT(mi),
-               .discard        = CACHE_DISCARD(mi),
-               .valid          = !bch_is_zero(mi->uuid.b, sizeof(uuid_le)),
-       };
-}
-
-static const char *validate_cache_super(struct bcache_superblock *disk_sb)
-{
-       struct cache_sb *sb = disk_sb->sb;
-       struct cache_member_cpu mi;
-       u16 block_size;
-       unsigned i;
-
-       switch (le64_to_cpu(sb->version)) {
-       case BCACHE_SB_VERSION_CDEV_V0:
-       case BCACHE_SB_VERSION_CDEV_WITH_UUID:
-       case BCACHE_SB_VERSION_CDEV_V2:
-       case BCACHE_SB_VERSION_CDEV_V3:
-               break;
-       default:
-               return"Unsupported superblock version";
-       }
-
-       if (CACHE_SET_SYNC(sb) &&
-           le64_to_cpu(sb->version) != BCACHE_SB_VERSION_CDEV_V3)
-               return "Unsupported superblock version";
-
-       block_size = le16_to_cpu(sb->block_size);
-
-       if (!is_power_of_2(block_size) ||
-           block_size > PAGE_SECTORS)
-               return "Bad block size";
-
-       if (bch_is_zero(sb->disk_uuid.b, sizeof(uuid_le)))
-               return "Bad disk UUID";
-
-       if (bch_is_zero(sb->user_uuid.b, sizeof(uuid_le)))
-               return "Bad user UUID";
-
-       if (bch_is_zero(sb->set_uuid.b, sizeof(uuid_le)))
-               return "Bad set UUID";
-
-       if (!sb->nr_in_set ||
-           sb->nr_in_set <= sb->nr_this_dev ||
-           sb->nr_in_set > MAX_CACHES_PER_SET)
-               return "Bad cache device number in set";
-
-       if (!CACHE_SET_META_REPLICAS_WANT(sb) ||
-           CACHE_SET_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
-               return "Invalid number of metadata replicas";
-
-       if (!CACHE_SET_META_REPLICAS_HAVE(sb) ||
-           CACHE_SET_META_REPLICAS_HAVE(sb) >
-           CACHE_SET_META_REPLICAS_WANT(sb))
-               return "Invalid number of metadata replicas";
-
-       if (!CACHE_SET_DATA_REPLICAS_WANT(sb) ||
-           CACHE_SET_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
-               return "Invalid number of data replicas";
-
-       if (!CACHE_SET_DATA_REPLICAS_HAVE(sb) ||
-           CACHE_SET_DATA_REPLICAS_HAVE(sb) >
-           CACHE_SET_DATA_REPLICAS_WANT(sb))
-               return "Invalid number of data replicas";
-
-       if (CACHE_SB_CSUM_TYPE(sb) >= BCH_CSUM_NR)
-               return "Invalid checksum type";
-
-       if (!CACHE_SET_BTREE_NODE_SIZE(sb))
-               return "Btree node size not set";
-
-       if (!is_power_of_2(CACHE_SET_BTREE_NODE_SIZE(sb)))
-               return "Btree node size not a power of two";
-
-       if (CACHE_SET_BTREE_NODE_SIZE(sb) > BTREE_NODE_SIZE_MAX)
-               return "Btree node size too large";
-
-       /* Default value, for old filesystems: */
-       if (!CACHE_SET_GC_RESERVE(sb))
-               SET_CACHE_SET_GC_RESERVE(sb, 10);
-
-       if (CACHE_SET_GC_RESERVE(sb) < 5)
-               return "gc reserve percentage too small";
-
-       if (!CACHE_SET_JOURNAL_ENTRY_SIZE(sb))
-               SET_CACHE_SET_JOURNAL_ENTRY_SIZE(sb, 9);
-
-       /* 4 mb max: */
-       if (512U << CACHE_SET_JOURNAL_ENTRY_SIZE(sb) > JOURNAL_ENTRY_SIZE_MAX)
-               return "max journal entry size too big";
-
-       if (le16_to_cpu(sb->u64s) < bch_journal_buckets_offset(sb))
-               return "Invalid superblock: member info area missing";
-
-       mi = cache_mi_to_cpu_mi(sb->members + sb->nr_this_dev);
-
-       if (mi.nbuckets > LONG_MAX)
-               return "Too many buckets";
-
-       if (mi.nbuckets < 1 << 8)
-               return "Not enough buckets";
-
-       if (!is_power_of_2(mi.bucket_size) ||
-           mi.bucket_size < PAGE_SECTORS ||
-           mi.bucket_size < block_size)
-               return "Bad bucket size";
-
-       if (get_capacity(disk_sb->bdev->bd_disk) <
-           mi.bucket_size * mi.nbuckets)
-               return "Invalid superblock: device too small";
-
-       if (le64_to_cpu(sb->offset) +
-           (__set_blocks(sb, le16_to_cpu(sb->u64s),
-                         block_size << 9) * block_size) >
-           mi.first_bucket * mi.bucket_size)
-               return "Invalid superblock: first bucket comes before end of super";
-
-       for (i = 0; i < bch_nr_journal_buckets(sb); i++)
-               if (journal_bucket(sb, i) <  mi.first_bucket ||
-                   journal_bucket(sb, i) >= mi.nbuckets)
-                       return "bad journal bucket";
-
-       return NULL;
-}
-
-void free_super(struct bcache_superblock *sb)
-{
-       if (sb->bio)
-               bio_put(sb->bio);
-       if (!IS_ERR_OR_NULL(sb->bdev))
-               blkdev_put(sb->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
-
-       free_pages((unsigned long) sb->sb, sb->page_order);
-       memset(sb, 0, sizeof(*sb));
-}
-
-static int __bch_super_realloc(struct bcache_superblock *sb, unsigned order)
-{
-       struct cache_sb *new_sb;
-       struct bio *bio;
-
-       if (sb->page_order >= order && sb->sb)
-               return 0;
-
-       new_sb = (void *) __get_free_pages(GFP_KERNEL, order);
-       if (!new_sb)
-               return -ENOMEM;
-
-       bio = (dynamic_fault("bcache:add:super_realloc")
-              ? NULL
-              : bio_kmalloc(GFP_KERNEL, 1 << order));
-       if (!bio) {
-               free_pages((unsigned long) new_sb, order);
-               return -ENOMEM;
-       }
-
-       if (sb->sb)
-               memcpy(new_sb, sb->sb, PAGE_SIZE << sb->page_order);
-
-       free_pages((unsigned long) sb->sb, sb->page_order);
-       sb->sb = new_sb;
-
-       if (sb->bio)
-               bio_put(sb->bio);
-       sb->bio = bio;
-
-       sb->page_order = order;
-
-       return 0;
-}
-
-int bch_super_realloc(struct bcache_superblock *sb, unsigned u64s)
-{
-       struct cache_member *mi = sb->sb->members + sb->sb->nr_this_dev;
-       char buf[BDEVNAME_SIZE];
-       size_t bytes = __set_bytes((struct cache_sb *) NULL, u64s);
-       u64 want = bytes + (SB_SECTOR << 9);
-
-       u64 first_bucket_offset = (u64) le16_to_cpu(mi->first_bucket) *
-               ((u64) le16_to_cpu(mi->bucket_size) << 9);
-
-       if (want > first_bucket_offset) {
-               pr_err("%s: superblock too big: want %llu but have %llu",
-                      bdevname(sb->bdev, buf), want, first_bucket_offset);
-               return -ENOSPC;
-       }
-
-       return __bch_super_realloc(sb, get_order(bytes));
-}
-
-static const char *read_super(struct bcache_superblock *sb,
-                             struct cache_set_opts opts,
-                             const char *path)
-{
-       const char *err;
-       unsigned order = 0;
-
-       lockdep_assert_held(&bch_register_lock);
-
-       memset(sb, 0, sizeof(*sb));
-
-       err = bch_blkdev_open(path, &sb, opts, &sb->bdev);
-       if (err)
-               return err;
-retry:
-       err = "cannot allocate memory";
-       if (__bch_super_realloc(sb, order))
-               goto err;
-
-       err = "dynamic fault";
-       if (cache_set_init_fault("read_super"))
-               goto err;
-
-       bio_reset(sb->bio);
-       sb->bio->bi_bdev = sb->bdev;
-       sb->bio->bi_iter.bi_sector = SB_SECTOR;
-       sb->bio->bi_iter.bi_size = PAGE_SIZE << sb->page_order;
-       bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
-       bch_bio_map(sb->bio, sb->sb);
-
-       err = "IO error";
-       if (submit_bio_wait(sb->bio))
-               goto err;
-
-       err = "Not a bcache superblock";
-       if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC))
-               goto err;
-
-       err = "Superblock has incorrect offset";
-       if (le64_to_cpu(sb->sb->offset) != SB_SECTOR)
-               goto err;
-
-       pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
-                le64_to_cpu(sb->sb->version),
-                le64_to_cpu(sb->sb->flags),
-                le64_to_cpu(sb->sb->seq),
-                le16_to_cpu(sb->sb->u64s));
-
-       err = "Superblock block size smaller than device block size";
-       if (le16_to_cpu(sb->sb->block_size) << 9 <
-           bdev_logical_block_size(sb->bdev))
-               goto err;
-
-       order = get_order(__set_bytes(sb->sb, le16_to_cpu(sb->sb->u64s)));
-       if (order > sb->page_order)
-               goto retry;
-
-       err = "bad checksum reading superblock";
-       if (le64_to_cpu(sb->sb->csum) !=
-           __csum_set(sb->sb, le16_to_cpu(sb->sb->u64s),
-                      le64_to_cpu(sb->sb->version) <
-                      BCACHE_SB_VERSION_CDEV_V3
-                      ? BCH_CSUM_CRC64
-                      : CACHE_SB_CSUM_TYPE(sb->sb)))
-               goto err;
-
-       return NULL;
-err:
-       free_super(sb);
-       return err;
-}
-
-void __write_super(struct cache_set *c, struct bcache_superblock *disk_sb)
-{
-       struct cache_sb *sb = disk_sb->sb;
-       struct bio *bio = disk_sb->bio;
-
-       bio->bi_bdev            = disk_sb->bdev;
-       bio->bi_iter.bi_sector  = SB_SECTOR;
-       bio->bi_iter.bi_size    =
-               roundup(__set_bytes(sb, le16_to_cpu(sb->u64s)),
-                       bdev_logical_block_size(disk_sb->bdev));
-       bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
-       bch_bio_map(bio, sb);
-
-       pr_debug("ver %llu, flags %llu, seq %llu",
-                le64_to_cpu(sb->version),
-                le64_to_cpu(sb->flags),
-                le64_to_cpu(sb->seq));
-
-       bch_generic_make_request(bio, c);
-}
-
-static void write_super_endio(struct bio *bio)
-{
-       struct cache *ca = bio->bi_private;
-
-       /* XXX: return errors directly */
-
-       cache_fatal_io_err_on(bio->bi_error, ca, "superblock write");
-
-       bch_account_io_completion(ca);
-
-       closure_put(&ca->set->sb_write);
-       percpu_ref_put(&ca->ref);
-}
-
-static void bcache_write_super_unlock(struct closure *cl)
-{
-       struct cache_set *c = container_of(cl, struct cache_set, sb_write);
-
-       up(&c->sb_write_mutex);
-}
-
-/* Update cached mi: */
-static int cache_set_mi_update(struct cache_set *c,
-                              struct cache_member *mi,
-                              unsigned nr_in_set)
-{
-       struct cache_member_rcu *new, *old;
-       struct cache *ca;
-       unsigned i;
-
-       mutex_lock(&c->mi_lock);
-
-       new = kzalloc(sizeof(struct cache_member_rcu) +
-                     sizeof(struct cache_member_cpu) * nr_in_set,
-                     GFP_KERNEL);
-       if (!new) {
-               mutex_unlock(&c->mi_lock);
-               return -ENOMEM;
-       }
-
-       new->nr_in_set = nr_in_set;
-
-       for (i = 0; i < nr_in_set; i++)
-               new->m[i] = cache_mi_to_cpu_mi(&mi[i]);
-
-       rcu_read_lock();
-       for_each_cache(ca, c, i)
-               ca->mi = new->m[i];
-       rcu_read_unlock();
-
-       old = rcu_dereference_protected(c->members,
-                               lockdep_is_held(&c->mi_lock));
-
-       rcu_assign_pointer(c->members, new);
-       if (old)
-               kfree_rcu(old, rcu);
-
-       mutex_unlock(&c->mi_lock);
-       return 0;
-}
-
-/* doesn't copy member info */
-static void __copy_super(struct cache_sb *dst, struct cache_sb *src)
-{
-       dst->version            = src->version;
-       dst->seq                = src->seq;
-       dst->user_uuid          = src->user_uuid;
-       dst->set_uuid           = src->set_uuid;
-       memcpy(dst->label, src->label, SB_LABEL_SIZE);
-       dst->flags              = src->flags;
-       dst->flags2             = src->flags2;
-       dst->nr_in_set          = src->nr_in_set;
-       dst->block_size         = src->block_size;
-}
-
-static int cache_sb_to_cache_set(struct cache_set *c, struct cache_sb *src)
-{
-       struct cache_member *new;
-
-       lockdep_assert_held(&bch_register_lock);
-
-       new = kzalloc(sizeof(struct cache_member) * src->nr_in_set,
-                     GFP_KERNEL);
-       if (!new)
-               return -ENOMEM;
-
-       memcpy(new, src->members,
-              src->nr_in_set * sizeof(struct cache_member));
-
-       if (cache_set_mi_update(c, new, src->nr_in_set)) {
-               kfree(new);
-               return -ENOMEM;
-       }
-
-       kfree(c->disk_mi);
-       c->disk_mi = new;
-
-       __copy_super(&c->disk_sb, src);
-
-       c->sb.block_size        = le16_to_cpu(src->block_size);
-       c->sb.btree_node_size   = CACHE_SET_BTREE_NODE_SIZE(src);
-       c->sb.nr_in_set         = src->nr_in_set;
-       c->sb.clean             = CACHE_SET_CLEAN(src);
-       c->sb.meta_replicas_have= CACHE_SET_META_REPLICAS_HAVE(src);
-       c->sb.data_replicas_have= CACHE_SET_DATA_REPLICAS_HAVE(src);
-       c->sb.str_hash_type     = CACHE_SET_STR_HASH_TYPE(src);
-
-       return 0;
-}
-
-static int cache_sb_from_cache_set(struct cache_set *c, struct cache *ca)
-{
-       struct cache_sb *src = &c->disk_sb, *dst = ca->disk_sb.sb;
-
-       if (src->nr_in_set != dst->nr_in_set) {
-               /*
-                * We have to preserve the list of journal buckets on the
-                * cache's superblock:
-                */
-               unsigned old_offset = bch_journal_buckets_offset(dst);
-               unsigned u64s = bch_journal_buckets_offset(src)
-                       + bch_nr_journal_buckets(dst);
-               int ret = bch_super_realloc(&ca->disk_sb, u64s);
-
-               if (ret)
-                       return ret;
-
-               dst->nr_in_set  = src->nr_in_set;
-               dst->u64s       = cpu_to_le16(u64s);
-
-               memmove(dst->_data + bch_journal_buckets_offset(dst),
-                       dst->_data + old_offset,
-                       bch_nr_journal_buckets(dst) * sizeof(u64));
-       }
-
-       memcpy(dst->_data,
-              c->disk_mi,
-              src->nr_in_set * sizeof(struct cache_member));
-
-       __copy_super(dst, src);
-
-       return 0;
-}
-
-static void __bcache_write_super(struct cache_set *c)
-{
-       struct closure *cl = &c->sb_write;
-       struct cache *ca;
-       unsigned i;
-
-       cache_set_mi_update(c, c->disk_mi, c->sb.nr_in_set);
-
-       closure_init(cl, &c->cl);
-
-       if (c->opts.nochanges)
-               goto no_io;
-
-       le64_add_cpu(&c->disk_sb.seq, 1);
-
-       for_each_cache(ca, c, i) {
-               struct cache_sb *sb = ca->disk_sb.sb;
-               struct bio *bio = ca->disk_sb.bio;
-
-               cache_sb_from_cache_set(c, ca);
-
-               SET_CACHE_SB_CSUM_TYPE(sb, c->opts.metadata_checksum);
-               sb->csum = cpu_to_le64(__csum_set(sb,
-                                                 le16_to_cpu(sb->u64s),
-                                                 CACHE_SB_CSUM_TYPE(sb)));
-
-               bio_reset(bio);
-               bio->bi_bdev    = ca->disk_sb.bdev;
-               bio->bi_end_io  = write_super_endio;
-               bio->bi_private = ca;
-
-               closure_get(cl);
-               percpu_ref_get(&ca->ref);
-               __write_super(c, &ca->disk_sb);
-       }
-no_io:
-       closure_return_with_destructor(cl, bcache_write_super_unlock);
-}
-
-void bcache_write_super(struct cache_set *c)
-{
-       down(&c->sb_write_mutex);
-       __bcache_write_super(c);
-}
-
-void bch_check_mark_super_slowpath(struct cache_set *c, const struct bkey_i *k,
-                                  bool meta)
-{
-       struct cache_member *mi;
-       struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
-       const struct bch_extent_ptr *ptr;
-
-       if (!CACHE_SET_SYNC(&c->disk_sb))
-               return;
-
-       down(&c->sb_write_mutex);
-
-       /* recheck, might have raced */
-       if (bch_check_super_marked(c, k, meta)) {
-               up(&c->sb_write_mutex);
-               return;
-       }
-
-       mi = c->disk_mi;
-
-       extent_for_each_ptr(e, ptr)
-               if (bch_extent_ptr_is_dirty(c, e, ptr))
-                       (meta
-                        ? SET_CACHE_HAS_METADATA
-                        : SET_CACHE_HAS_DATA)(mi + ptr->dev, true);
-
-       __bcache_write_super(c);
-}
-
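
[Editor's note] The helpers removed above were the last users of the semaphore-protected bcache_write_super() path. Every superblock update below follows one pattern instead: take c->sb_lock, mutate the in-memory struct bch_sb, call bch_write_super(), drop the lock. A minimal sketch of that pattern, assuming bch_write_super() still bumps disk_sb->seq and issues the per-member writes the way the deleted __bcache_write_super() did:

	/* Sketch only: the superblock-update discipline this commit
	 * converges on. The helper body is an assumption; only the
	 * locking pattern is taken from this diff. */
	static void example_sb_update(struct cache_set *c)
	{
		mutex_lock(&c->sb_lock);

		SET_BCH_SB_CLEAN(c->disk_sb, false);	/* mutate in-memory sb */
		bch_write_super(c);			/* persist to all members */

		mutex_unlock(&c->sb_lock);
	}
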
 /* Cache set RO/RW: */
 
 /*
@@ -768,8 +197,10 @@ static void bch_cache_set_read_only_work(struct work_struct *work)
 
                if (!bch_journal_error(&c->journal) &&
                    !test_bit(CACHE_SET_ERROR, &c->flags)) {
-                       SET_CACHE_SET_CLEAN(&c->disk_sb, true);
-                       bcache_write_super(c);
+                       mutex_lock(&c->sb_lock);
+                       SET_BCH_SB_CLEAN(c->disk_sb, true);
+                       bch_write_super(c);
+                       mutex_unlock(&c->sb_lock);
                }
        } else {
                /*
@@ -848,7 +279,7 @@ static const char *__bch_cache_set_read_write(struct cache_set *c)
 
        err = "error starting allocator thread";
        for_each_cache(ca, c, i)
-               if (ca->mi.state == CACHE_ACTIVE &&
+               if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
                    bch_cache_allocator_start(ca)) {
                        percpu_ref_put(&ca->ref);
                        goto err;
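
[Editor's note] CACHE_ACTIVE and friends become a namespaced enum here. All four states appear in the sysfs state_rw handler later in this diff; the assumed shape of the enum (the exact numeric values are not shown in this commit):

	enum bch_member_state {
		BCH_MEMBER_STATE_ACTIVE,
		BCH_MEMBER_STATE_RO,
		BCH_MEMBER_STATE_FAILED,
		BCH_MEMBER_STATE_SPARE,
	};
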
@@ -859,7 +290,7 @@ static const char *__bch_cache_set_read_write(struct cache_set *c)
                goto err;
 
        for_each_cache(ca, c, i) {
-               if (ca->mi.state != CACHE_ACTIVE)
+               if (ca->mi.state != BCH_MEMBER_STATE_ACTIVE)
                        continue;
 
                err = "error starting moving GC thread";
@@ -913,6 +344,7 @@ static void cache_set_free(struct cache_set *c)
        cancel_work_sync(&c->bio_submit_work);
        cancel_work_sync(&c->read_retry_work);
 
+       bch_cache_set_encryption_free(c);
        bch_btree_cache_free(c);
        bch_journal_free(&c->journal);
        bch_io_clock_exit(&c->io_clock[WRITE]);
@@ -939,7 +371,7 @@ static void cache_set_free(struct cache_set *c)
                destroy_workqueue(c->wq);
 
        kfree_rcu(rcu_dereference_protected(c->members, 1), rcu); /* shutting down */
-       kfree(c->disk_mi);
+       free_pages((unsigned long) c->disk_sb, c->disk_sb_order);
        kfree(c);
        module_put(THIS_MODULE);
 }
@@ -1043,15 +475,18 @@ void bch_cache_set_unregister(struct cache_set *c)
 
 static unsigned cache_set_nr_devices(struct cache_set *c)
 {
+       struct bch_sb_field_members *mi;
        unsigned i, nr = 0;
-       struct cache_member *mi = c->disk_mi;
 
-       lockdep_assert_held(&bch_register_lock);
+       mutex_lock(&c->sb_lock);
+       mi = bch_sb_get_members(c->disk_sb);
 
-       for (i = 0; i < c->disk_sb.nr_in_set; i++)
-               if (!bch_is_zero(mi[i].uuid.b, sizeof(uuid_le)))
+       for (i = 0; i < c->disk_sb->nr_devices; i++)
+               if (!bch_is_zero(mi->members[i].uuid.b, sizeof(uuid_le)))
                        nr++;
 
+       mutex_unlock(&c->sb_lock);
+
        return nr;
 }
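
[Editor's note] Member info now lives in a variable-length superblock field rather than the fixed c->disk_mi array, so lookups go through bch_sb_get_members() under c->sb_lock. The new super-io.c is not shown in this section; a plausible sketch of the generic field lookup underneath it, assuming each bch_sb_field carries a type tag and a u64s length and is walked with the vstruct helpers added later in this diff:

	/* Hypothetical: how bch_sb_get_members() presumably finds its
	 * field. Field layout (le32 type tag) is an assumption. */
	static struct bch_sb_field *bch_sb_field_get_sketch(struct bch_sb *sb,
							    unsigned type)
	{
		struct bch_sb_field *f;

		vstruct_for_each(sb, f)
			if (le32_to_cpu(f->type) == type)
				return f;

		return NULL;	/* field absent, e.g. freshly formatted sb */
	}
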
 
@@ -1059,7 +494,7 @@ static unsigned cache_set_nr_online_devices(struct cache_set *c)
 {
        unsigned i, nr = 0;
 
-       for (i = 0; i < c->sb.nr_in_set; i++)
+       for (i = 0; i < c->sb.nr_devices; i++)
                if (c->cache[i])
                        nr++;
 
@@ -1069,7 +504,7 @@ static unsigned cache_set_nr_online_devices(struct cache_set *c)
 #define alloc_bucket_pages(gfp, ca)                    \
        ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(ca))))
 
-static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb,
+static struct cache_set *bch_cache_set_alloc(struct bch_sb *sb,
                                             struct cache_set_opts opts)
 {
        struct cache_set *c;
@@ -1083,13 +518,12 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb,
 
        c->minor                = -1;
 
-       sema_init(&c->sb_write_mutex, 1);
+       mutex_init(&c->sb_lock);
        INIT_RADIX_TREE(&c->devices, GFP_KERNEL);
        mutex_init(&c->btree_cache_lock);
        mutex_init(&c->bucket_lock);
        mutex_init(&c->btree_root_lock);
        INIT_WORK(&c->read_only_work, bch_cache_set_read_only_work);
-       mutex_init(&c->mi_lock);
 
        init_rwsem(&c->gc_lock);
 
@@ -1146,10 +580,16 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb,
 
        mutex_init(&c->uevent_lock);
 
-       if (cache_sb_to_cache_set(c, sb))
+       mutex_lock(&c->sb_lock);
+
+       if (bch_sb_to_cache_set(c, sb)) {
+               mutex_unlock(&c->sb_lock);
                goto err;
+       }
+
+       mutex_unlock(&c->sb_lock);
 
-       scnprintf(c->name, sizeof(c->name), "%pU", &c->disk_sb.user_uuid);
+       scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid);
 
        c->opts = cache_superblock_opts(sb);
        cache_set_opts_apply(&c->opts, opts);
@@ -1165,7 +605,7 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb,
        iter_size = (btree_blocks(c) + 1) * 2 *
                sizeof(struct btree_node_iter_set);
 
-       journal_entry_bytes = 512U << CACHE_SET_JOURNAL_ENTRY_SIZE(sb);
+       journal_entry_bytes = 512U << BCH_SB_JOURNAL_ENTRY_SIZE(sb);
 
        if (!(c->wq = alloc_workqueue("bcache",
                                WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
@@ -1185,7 +625,7 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb,
            mempool_init_page_pool(&c->bio_bounce_pages,
                                   max_t(unsigned,
                                         c->sb.btree_node_size,
-                                        CRC32_EXTENT_SIZE_MAX) /
+                                        BCH_ENCODED_EXTENT_MAX) /
                                   PAGE_SECTORS, 0) ||
            !(c->bucket_stats_percpu = alloc_percpu(struct bucket_stats_cache_set)) ||
            lg_lock_init(&c->bucket_stats_lock) ||
@@ -1196,7 +636,9 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb,
            bch_io_clock_init(&c->io_clock[WRITE]) ||
            bch_journal_alloc(&c->journal, journal_entry_bytes) ||
            bch_btree_cache_alloc(c) ||
-           bch_compress_init(c))
+           bch_cache_set_encryption_init(c) ||
+           bch_compress_init(c) ||
+           bch_check_set_has_compressed_data(c, c->opts.compression))
                goto err;
 
        c->bdi.ra_pages         = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
@@ -1247,7 +689,7 @@ static int bch_cache_set_online(struct cache_set *c)
        if (IS_ERR(c->chardev))
                return PTR_ERR(c->chardev);
 
-       if (kobject_add(&c->kobj, NULL, "%pU", c->disk_sb.user_uuid.b) ||
+       if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ||
            kobject_add(&c->internal, &c->kobj, "internal") ||
            kobject_add(&c->opts_dir, &c->kobj, "options") ||
            kobject_add(&c->time_stats, &c->kobj, "time_stats") ||
@@ -1267,6 +709,7 @@ static int bch_cache_set_online(struct cache_set *c)
 static const char *run_cache_set(struct cache_set *c)
 {
        const char *err = "cannot allocate memory";
+       struct bch_sb_field_members *mi;
        struct cache *ca;
        unsigned i, id;
        time64_t now;
@@ -1285,15 +728,9 @@ static const char *run_cache_set(struct cache_set *c)
         * we start testing it.
         */
        for_each_cache(ca, c, i)
-               cache_sb_from_cache_set(c, ca);
+               bch_sb_from_cache_set(c, ca);
 
-       /*
-        * CACHE_SET_SYNC is true if the cache set has already been run
-        * and potentially has data.
-        * It is false if it is the first time it is run.
-        */
-
-       if (CACHE_SET_SYNC(&c->disk_sb)) {
+       if (BCH_SB_INITIALIZED(c->disk_sb)) {
                ret = bch_journal_read(c, &journal);
                if (ret)
                        goto err;
@@ -1363,7 +800,7 @@ static const char *run_cache_set(struct cache_set *c)
 
                err = "error starting allocator thread";
                for_each_cache(ca, c, i)
-                       if (ca->mi.state == CACHE_ACTIVE &&
+                       if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
                            bch_cache_allocator_start(ca)) {
                                percpu_ref_put(&ca->ref);
                                goto err;
@@ -1381,25 +818,16 @@ static const char *run_cache_set(struct cache_set *c)
                if (c->opts.norecovery)
                        goto recovery_done;
 
-               /*
-                * Write a new journal entry _before_ we start journalling new
-                * data - otherwise, we could end up with btree node bsets with
-                * journal seqs arbitrarily far in the future vs. the most
-                * recently written journal entry on disk, if we crash before
-                * writing the next journal entry:
-                */
-               err = "error writing journal entry";
-               if (bch_journal_meta(&c->journal))
-                       goto err;
-
                bch_verbose(c, "starting fsck:");
                err = "error in fsck";
                ret = bch_fsck(c, !c->opts.nofsck);
                if (ret)
                        goto err;
+
                bch_verbose(c, "fsck done");
        } else {
-               struct bkey_i_inode inode;
+               struct bch_inode_unpacked inode;
+               struct bkey_inode_buf packed_inode;
                struct closure cl;
 
                closure_init_stack(&cl);
@@ -1424,7 +852,7 @@ static const char *run_cache_set(struct cache_set *c)
 
                err = "error starting allocator thread";
                for_each_cache(ca, c, i)
-                       if (ca->mi.state == CACHE_ACTIVE &&
+                       if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
                            bch_cache_allocator_start(ca)) {
                                percpu_ref_put(&ca->ref);
                                goto err;
@@ -1442,10 +870,13 @@ static const char *run_cache_set(struct cache_set *c)
 
                bch_inode_init(c, &inode, 0, 0,
                               S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
-               inode.k.p.inode = BCACHE_ROOT_INO;
+               inode.inum = BCACHE_ROOT_INO;
+
+               bch_inode_pack(&packed_inode, &inode);
 
                err = "error creating root directory";
-               if (bch_btree_insert(c, BTREE_ID_INODES, &inode.k_i,
+               if (bch_btree_insert(c, BTREE_ID_INODES,
+                                    &packed_inode.inode.k_i,
                                     NULL, NULL, NULL, 0))
                        goto err;
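
[Editor's note] Inodes are no longer inserted as raw bkey_i_inode: they are built in unpacked form and encoded with bch_inode_pack() into a bkey_inode_buf before hitting the btree. Restating the flow above as a standalone usage sketch, with names as in this diff:

	/* Usage sketch of the new packed-inode flow shown above. */
	static int create_root_inode_sketch(struct cache_set *c)
	{
		struct bch_inode_unpacked inode;
		struct bkey_inode_buf packed_inode;

		bch_inode_init(c, &inode, 0, 0,
			       S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
		inode.inum = BCACHE_ROOT_INO;

		bch_inode_pack(&packed_inode, &inode);	/* encode on-disk form */

		return bch_btree_insert(c, BTREE_ID_INODES,
					&packed_inode.inode.k_i,
					NULL, NULL, NULL, 0);
	}
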
 
@@ -1462,16 +893,21 @@ recovery_done:
                        goto err;
        }
 
+       mutex_lock(&c->sb_lock);
+       mi = bch_sb_get_members(c->disk_sb);
        now = ktime_get_seconds();
+
        rcu_read_lock();
        for_each_cache_rcu(ca, c, i)
-               c->disk_mi[ca->sb.nr_this_dev].last_mount = cpu_to_le64(now);
+               mi->members[ca->dev_idx].last_mount = cpu_to_le64(now);
        rcu_read_unlock();
 
-       /* Mark cache set as initialized: */
-       SET_CACHE_SET_SYNC(&c->disk_sb, true);
-       SET_CACHE_SET_CLEAN(&c->disk_sb, false);
-       bcache_write_super(c);
+       SET_BCH_SB_INITIALIZED(c->disk_sb, true);
+       SET_BCH_SB_CLEAN(c->disk_sb, false);
+       c->disk_sb->version = BCACHE_SB_VERSION_CDEV;
+
+       bch_write_super(c);
+       mutex_unlock(&c->sb_lock);
 
        err = "dynamic fault";
        if (cache_set_init_fault("run_cache_set"))
@@ -1527,41 +963,46 @@ err:
        goto out;
 }
 
-static const char *can_add_cache(struct cache_sb *sb,
+static const char *can_add_cache(struct bch_sb *sb,
                                 struct cache_set *c)
 {
+       struct bch_sb_field_members *sb_mi;
+
+       sb_mi = bch_sb_get_members(sb);
+       if (!sb_mi)
+               return "Invalid superblock: member info area missing";
+
        if (le16_to_cpu(sb->block_size) != c->sb.block_size)
                return "mismatched block size";
 
-       if (le16_to_cpu(sb->members[sb->nr_this_dev].bucket_size) <
-           CACHE_SET_BTREE_NODE_SIZE(&c->disk_sb))
+       if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) <
+           BCH_SB_BTREE_NODE_SIZE(c->disk_sb))
                return "new cache bucket_size is too small";
 
        return NULL;
 }
 
-static const char *can_attach_cache(struct cache_sb *sb, struct cache_set *c)
+static const char *can_attach_cache(struct bch_sb *sb, struct cache_set *c)
 {
+       struct bch_sb_field_members *mi = bch_sb_get_members(c->disk_sb);
+       struct bch_sb_field_members *dev_mi = bch_sb_get_members(sb);
+       uuid_le dev_uuid = dev_mi->members[sb->dev_idx].uuid;
        const char *err;
-       bool match;
 
        err = can_add_cache(sb, c);
        if (err)
                return err;
 
+       if (bch_is_zero(&dev_uuid, sizeof(dev_uuid)))
+               return "device has been removed";
+
        /*
         * When attaching an existing device, the cache set superblock must
         * already contain member_info with a matching UUID
         */
-       match = le64_to_cpu(sb->seq) <= le64_to_cpu(c->disk_sb.seq)
-               ? (sb->nr_this_dev < c->disk_sb.nr_in_set &&
-                  !memcmp(&c->disk_mi[sb->nr_this_dev].uuid,
-                          &sb->disk_uuid, sizeof(uuid_le)))
-               : (sb->nr_this_dev < sb->nr_in_set &&
-                  !memcmp(&sb->members[sb->nr_this_dev].uuid,
-                          &sb->disk_uuid, sizeof(uuid_le)));
-
-       if (!match)
+       if (sb->dev_idx >= c->disk_sb->nr_devices ||
+           memcmp(&mi->members[sb->dev_idx].uuid,
+                  &dev_uuid, sizeof(uuid_le)))
                return "cache sb does not match set";
 
        return NULL;
@@ -1572,13 +1013,14 @@ static const char *can_attach_cache(struct cache_sb *sb, struct cache_set *c)
 bool bch_cache_read_only(struct cache *ca)
 {
        struct cache_set *c = ca->set;
+       struct bch_sb_field_members *mi;
        char buf[BDEVNAME_SIZE];
 
        bdevname(ca->disk_sb.bdev, buf);
 
        lockdep_assert_held(&bch_register_lock);
 
-       if (ca->mi.state != CACHE_ACTIVE)
+       if (ca->mi.state != BCH_MEMBER_STATE_ACTIVE)
                return false;
 
        if (!bch_cache_may_remove(ca)) {
@@ -1609,8 +1051,12 @@ bool bch_cache_read_only(struct cache *ca)
        bch_notice(c, "%s read only", bdevname(ca->disk_sb.bdev, buf));
        bch_notify_cache_read_only(ca);
 
-       SET_CACHE_STATE(&c->disk_mi[ca->sb.nr_this_dev], CACHE_RO);
-       bcache_write_super(c);
+       mutex_lock(&c->sb_lock);
+       mi = bch_sb_get_members(c->disk_sb);
+       SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx],
+                            BCH_MEMBER_STATE_RO);
+       bch_write_super(c);
+       mutex_unlock(&c->sb_lock);
        return true;
 }
 
@@ -1618,7 +1064,7 @@ static const char *__bch_cache_read_write(struct cache_set *c, struct cache *ca)
 {
        lockdep_assert_held(&bch_register_lock);
 
-       if (ca->mi.state == CACHE_ACTIVE)
+       if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE)
                return NULL;
 
        if (test_bit(CACHE_DEV_REMOVING, &ca->flags))
@@ -1645,14 +1091,19 @@ static const char *__bch_cache_read_write(struct cache_set *c, struct cache *ca)
 const char *bch_cache_read_write(struct cache *ca)
 {
        struct cache_set *c = ca->set;
+       struct bch_sb_field_members *mi;
        const char *err;
 
        err = __bch_cache_read_write(c, ca);
        if (err)
                return err;
 
-       SET_CACHE_STATE(&c->disk_mi[ca->sb.nr_this_dev], CACHE_ACTIVE);
-       bcache_write_super(c);
+       mutex_lock(&c->sb_lock);
+       mi = bch_sb_get_members(c->disk_sb);
+       SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx],
+                            BCH_MEMBER_STATE_ACTIVE);
+       bch_write_super(c);
+       mutex_unlock(&c->sb_lock);
 
        return NULL;
 }
@@ -1681,14 +1132,14 @@ static void bch_cache_free_work(struct work_struct *work)
        if (c && c->kobj.state_in_sysfs) {
                char buf[12];
 
-               sprintf(buf, "cache%u", ca->sb.nr_this_dev);
+               sprintf(buf, "cache%u", ca->dev_idx);
                sysfs_remove_link(&c->kobj, buf);
        }
 
        if (ca->kobj.state_in_sysfs)
                kobject_del(&ca->kobj);
 
-       free_super(&ca->disk_sb);
+       bch_free_super(&ca->disk_sb);
 
        /*
         * bch_cache_stop can be called in the middle of initialization
@@ -1697,10 +1148,10 @@ static void bch_cache_free_work(struct work_struct *work)
         * However, they were zeroed when the object was allocated.
         */
 
+       bch_journal_free_cache(ca);
        free_percpu(ca->sectors_written);
        bioset_exit(&ca->replica_set);
        free_percpu(ca->bucket_stats_percpu);
-       kfree(ca->journal.bucket_seq);
        free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
        kfree(ca->prio_buckets);
        kfree(ca->bio_prio);
@@ -1754,8 +1205,8 @@ static void bch_cache_stop(struct cache *ca)
        lockdep_assert_held(&bch_register_lock);
 
        if (c) {
-               BUG_ON(rcu_access_pointer(c->cache[ca->sb.nr_this_dev]) != ca);
-               rcu_assign_pointer(c->cache[ca->sb.nr_this_dev], NULL);
+               BUG_ON(rcu_access_pointer(c->cache[ca->dev_idx]) != ca);
+               rcu_assign_pointer(c->cache[ca->dev_idx], NULL);
        }
 
        call_rcu(&ca->free_rcu, bch_cache_free_rcu);
@@ -1764,10 +1215,11 @@ static void bch_cache_stop(struct cache *ca)
 static void bch_cache_remove_work(struct work_struct *work)
 {
        struct cache *ca = container_of(work, struct cache, remove_work);
+       struct bch_sb_field_members *mi;
        struct cache_set *c = ca->set;
        char name[BDEVNAME_SIZE];
        bool force = test_bit(CACHE_DEV_FORCE_REMOVE, &ca->flags);
-       unsigned dev = ca->sb.nr_this_dev;
+       unsigned dev_idx = ca->dev_idx;
 
        bdevname(ca->disk_sb.bdev, name);
 
@@ -1780,17 +1232,21 @@ static void bch_cache_remove_work(struct work_struct *work)
        if (!ca->mi.has_data) {
                /* Nothing to do: */
        } else if (!bch_move_data_off_device(ca)) {
-               lockdep_assert_held(&bch_register_lock);
-               SET_CACHE_HAS_DATA(&c->disk_mi[ca->sb.nr_this_dev], false);
+               mutex_lock(&c->sb_lock);
+               mi = bch_sb_get_members(c->disk_sb);
+               SET_BCH_MEMBER_HAS_DATA(&mi->members[ca->dev_idx], false);
 
-               bcache_write_super(c);
+               bch_write_super(c);
+               mutex_unlock(&c->sb_lock);
        } else if (force) {
                bch_flag_data_bad(ca);
 
-               lockdep_assert_held(&bch_register_lock);
-               SET_CACHE_HAS_DATA(&c->disk_mi[ca->sb.nr_this_dev], false);
+               mutex_lock(&c->sb_lock);
+               mi = bch_sb_get_members(c->disk_sb);
+               SET_BCH_MEMBER_HAS_DATA(&mi->members[ca->dev_idx], false);
 
-               bcache_write_super(c);
+               bch_write_super(c);
+               mutex_unlock(&c->sb_lock);
        } else {
                bch_err(c, "Remove of %s failed, unable to migrate data off",
                        name);
@@ -1803,10 +1259,12 @@ static void bch_cache_remove_work(struct work_struct *work)
        if (!ca->mi.has_metadata) {
                /* Nothing to do: */
        } else if (!bch_move_meta_data_off_device(ca)) {
-               lockdep_assert_held(&bch_register_lock);
-               SET_CACHE_HAS_METADATA(&c->disk_mi[ca->sb.nr_this_dev], false);
+               mutex_lock(&c->sb_lock);
+               mi = bch_sb_get_members(c->disk_sb);
+               SET_BCH_MEMBER_HAS_METADATA(&mi->members[ca->dev_idx], false);
 
-               bcache_write_super(c);
+               bch_write_super(c);
+               mutex_unlock(&c->sb_lock);
        } else {
                bch_err(c, "Remove of %s failed, unable to migrate metadata off",
                        name);
@@ -1821,7 +1279,7 @@ static void bch_cache_remove_work(struct work_struct *work)
        bch_notify_cache_removed(ca);
 
        spin_lock(&c->journal.lock);
-       c->journal.prio_buckets[dev] = 0;
+       c->journal.prio_buckets[dev_idx] = 0;
        spin_unlock(&c->journal.lock);
 
        bch_journal_meta(&c->journal);
@@ -1844,12 +1302,16 @@ static void bch_cache_remove_work(struct work_struct *work)
        lockdep_assert_held(&bch_register_lock);
 
        /*
-        * Free this device's slot in the cache_member array - all pointers to
+        * Free this device's slot in the bch_member array - all pointers to
         * this device must be gone:
         */
-       memset(&c->disk_mi[dev].uuid, 0, sizeof(c->disk_mi[dev].uuid));
+       mutex_lock(&c->sb_lock);
+       mi = bch_sb_get_members(c->disk_sb);
+       memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid));
+
+       bch_write_super(c);
+       mutex_unlock(&c->sb_lock);
 
-       bcache_write_super(c);
        mutex_unlock(&bch_register_lock);
 
        closure_put(&c->cl);
@@ -1891,7 +1353,7 @@ static int bch_cache_online(struct cache *ca)
 
        lockdep_assert_held(&bch_register_lock);
 
-       sprintf(buf, "cache%u", ca->sb.nr_this_dev);
+       sprintf(buf, "cache%u", ca->dev_idx);
 
        if (kobject_add(&ca->kobj,
                        &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj,
@@ -1907,13 +1369,14 @@ static const char *cache_alloc(struct bcache_superblock *sb,
                               struct cache_set *c,
                               struct cache **ret)
 {
+       struct bch_member *member;
        size_t reserve_none, movinggc_reserve, free_inc_reserve, total_reserve;
        size_t heap_size;
-       unsigned i, journal_entry_pages;
+       unsigned i;
        const char *err = "cannot allocate memory";
        struct cache *ca;
 
-       if (c->sb.nr_in_set == 1)
+       if (c->sb.nr_devices == 1)
                bdevname(sb->bdev, c->name);
 
        if (cache_set_init_fault("cache_alloc"))
@@ -1934,7 +1397,7 @@ static const char *cache_alloc(struct bcache_superblock *sb,
        spin_lock_init(&ca->self.lock);
        ca->self.nr_devices = 1;
        rcu_assign_pointer(ca->self.d[0].dev, ca);
-       ca->sb.nr_this_dev = sb->sb->nr_this_dev;
+       ca->dev_idx = sb->sb->dev_idx;
 
        INIT_WORK(&ca->free_work, bch_cache_free_work);
        INIT_WORK(&ca->remove_work, bch_cache_remove_work);
@@ -1953,8 +1416,11 @@ static const char *cache_alloc(struct bcache_superblock *sb,
        if (cache_set_init_fault("cache_alloc"))
                goto err;
 
-       ca->mi = cache_mi_to_cpu_mi(ca->disk_sb.sb->members +
-                                   ca->disk_sb.sb->nr_this_dev);
+       member = bch_sb_get_members(ca->disk_sb.sb)->members +
+               ca->disk_sb.sb->dev_idx;
+
+       ca->mi = cache_mi_to_cpu_mi(member);
+       ca->uuid = member->uuid;
        ca->bucket_bits = ilog2(ca->mi.bucket_size);
 
        /* XXX: tune these */
@@ -1968,10 +1434,6 @@ static const char *cache_alloc(struct bcache_superblock *sb,
        free_inc_reserve = movinggc_reserve / 2;
        heap_size = movinggc_reserve * 8;
 
-       journal_entry_pages =
-               DIV_ROUND_UP(1U << CACHE_SET_JOURNAL_ENTRY_SIZE(ca->disk_sb.sb),
-                            PAGE_SECTORS);
-
        if (!init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
            !init_fifo(&ca->free[RESERVE_BTREE], BTREE_NODE_RESERVE, GFP_KERNEL) ||
            !init_fifo(&ca->free[RESERVE_MOVINGGC],
@@ -1987,13 +1449,11 @@ static const char *cache_alloc(struct bcache_superblock *sb,
                                          2, GFP_KERNEL)) ||
            !(ca->disk_buckets  = alloc_bucket_pages(GFP_KERNEL, ca)) ||
            !(ca->bucket_stats_percpu = alloc_percpu(struct bucket_stats_cache)) ||
-           !(ca->journal.bucket_seq = kcalloc(bch_nr_journal_buckets(ca->disk_sb.sb),
-                                              sizeof(u64), GFP_KERNEL)) ||
-           !(ca->journal.bio = bio_kmalloc(GFP_KERNEL, journal_entry_pages)) ||
-           !(ca->bio_prio = bio_kmalloc(GFP_KERNEL, bucket_pages(ca))) ||
+           !(ca->bio_prio = bio_kmalloc(GFP_NOIO, bucket_pages(ca))) ||
            bioset_init(&ca->replica_set, 4,
                        offsetof(struct bch_write_bio, bio)) ||
-           !(ca->sectors_written = alloc_percpu(*ca->sectors_written)))
+           !(ca->sectors_written = alloc_percpu(*ca->sectors_written)) ||
+           bch_journal_init_cache(ca))
                goto err;
 
        ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
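
[Editor's note] The journal's per-device allocations (the bucket_seq array and the journal bio) move out of cache_alloc() into bch_journal_init_cache(), paired with the bch_journal_free_cache() call added to the teardown path earlier in this diff. A sketch reconstructed from the lines removed here, assuming the bucket-count helper survives the bch_sb conversion:

	/* Sketch only: reconstructed from the removed cache_alloc() lines. */
	int bch_journal_init_cache(struct cache *ca)
	{
		unsigned entry_pages =
			DIV_ROUND_UP(1U << BCH_SB_JOURNAL_ENTRY_SIZE(ca->disk_sb.sb),
				     PAGE_SECTORS);

		ca->journal.bucket_seq =
			kcalloc(bch_nr_journal_buckets(ca->disk_sb.sb),
				sizeof(u64), GFP_KERNEL);
		ca->journal.bio = bio_kmalloc(GFP_KERNEL, entry_pages);

		if (!ca->journal.bucket_seq || !ca->journal.bio)
			return -ENOMEM;
		return 0;
	}
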
@@ -2006,15 +1466,6 @@ static const char *cache_alloc(struct bcache_superblock *sb,
        ca->copygc_write_point.group = &ca->self;
        ca->tiering_write_point.group = &ca->self;
 
-       kobject_get(&c->kobj);
-       ca->set = c;
-
-       kobject_get(&ca->kobj);
-       rcu_assign_pointer(c->cache[ca->sb.nr_this_dev], ca);
-
-       if (le64_to_cpu(ca->disk_sb.sb->seq) > le64_to_cpu(c->disk_sb.seq))
-               cache_sb_to_cache_set(c, ca->disk_sb.sb);
-
        /*
         * Increase journal write timeout if flushes to this device are
         * expensive:
@@ -2024,6 +1475,19 @@ static const char *cache_alloc(struct bcache_superblock *sb,
                c->journal.write_delay_ms =
                        max(c->journal.write_delay_ms, 1000U);
 
+       kobject_get(&c->kobj);
+       ca->set = c;
+
+       kobject_get(&ca->kobj);
+       rcu_assign_pointer(c->cache[ca->dev_idx], ca);
+
+       mutex_lock(&c->sb_lock);
+
+       if (le64_to_cpu(ca->disk_sb.sb->seq) > le64_to_cpu(c->disk_sb->seq))
+               bch_sb_to_cache_set(c, ca->disk_sb.sb);
+
+       mutex_unlock(&c->sb_lock);
+
        err = "error creating kobject";
        if (c->kobj.state_in_sysfs &&
            bch_cache_online(ca))
@@ -2046,7 +1510,7 @@ static struct cache_set *cache_set_lookup(uuid_le uuid)
        lockdep_assert_held(&bch_register_lock);
 
        list_for_each_entry(c, &bch_cache_sets, list)
-               if (!memcmp(&c->disk_sb.set_uuid, &uuid, sizeof(uuid_le)))
+               if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le)))
                        return c;
 
        return NULL;
@@ -2060,13 +1524,13 @@ static const char *register_cache(struct bcache_superblock *sb,
        struct cache_set *c;
        bool allocated_cache_set = false;
 
-       err = validate_cache_super(sb);
+       err = bch_validate_cache_super(sb);
        if (err)
                return err;
 
        bdevname(sb->bdev, name);
 
-       c = cache_set_lookup(sb->sb->set_uuid);
+       c = cache_set_lookup(sb->sb->uuid);
        if (c) {
                err = can_attach_cache(sb->sb, c);
                if (err)
@@ -2106,20 +1570,23 @@ int bch_cache_set_add_cache(struct cache_set *c, const char *path)
        struct bcache_superblock sb;
        const char *err;
        struct cache *ca;
-       struct cache_member *new_mi = NULL;
-       struct cache_member mi;
-       unsigned nr_this_dev, nr_in_set, u64s;
+       struct bch_sb_field *f;
+       struct bch_sb_field_members *mi, *dev_mi;
+       struct bch_member saved_mi;
+       unsigned dev_idx, nr_devices, u64s;
        int ret = -EINVAL;
 
        mutex_lock(&bch_register_lock);
 
-       err = read_super(&sb, c->opts, path);
+       err = bch_read_super(&sb, c->opts, path);
        if (err)
-               goto err_unlock;
+               goto err_unlock_register;
 
-       err = validate_cache_super(&sb);
+       err = bch_validate_cache_super(&sb);
        if (err)
-               goto err_unlock;
+               goto err_unlock_register;
+
+       mutex_lock(&c->sb_lock);
 
        err = can_add_cache(sb.sb, c);
        if (err)
@@ -2129,8 +1596,9 @@ int bch_cache_set_add_cache(struct cache_set *c, const char *path)
         * Preserve the old cache member information (esp. tier)
         * before we start bashing the disk stuff.
         */
-       mi = sb.sb->members[sb.sb->nr_this_dev];
-       mi.last_mount = cpu_to_le64(ktime_get_seconds());
+       dev_mi = bch_sb_get_members(sb.sb);
+       saved_mi = dev_mi->members[sb.sb->dev_idx];
+       saved_mi.last_mount = cpu_to_le64(ktime_get_seconds());
 
        down_read(&c->gc_lock);
 
@@ -2140,9 +1608,10 @@ int bch_cache_set_add_cache(struct cache_set *c, const char *path)
        if (test_bit(CACHE_SET_GC_FAILURE, &c->flags))
                goto no_slot;
 
-       for (nr_this_dev = 0; nr_this_dev < MAX_CACHES_PER_SET; nr_this_dev++)
-               if (nr_this_dev >= c->sb.nr_in_set ||
-                   bch_is_zero(c->disk_mi[nr_this_dev].uuid.b,
+       mi = bch_sb_get_members(c->disk_sb);
+       for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++)
+               if (dev_idx >= c->sb.nr_devices ||
+                   bch_is_zero(mi->members[dev_idx].uuid.b,
                                 sizeof(uuid_le)))
                        goto have_slot;
 no_slot:
@@ -2153,52 +1622,46 @@ no_slot:
        goto err_unlock;
 
 have_slot:
-       nr_in_set = max_t(unsigned, nr_this_dev + 1, c->sb.nr_in_set);
        up_read(&c->gc_lock);
 
-       u64s = nr_in_set * (sizeof(struct cache_member) / sizeof(u64));
+       nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
+       u64s = (sizeof(struct bch_sb_field_members) +
+               sizeof(struct bch_member) * nr_devices) / sizeof(u64);
        err = "no space in superblock for member info";
-       if (bch_super_realloc(&sb, u64s))
+
+       f = bch_fs_sb_field_resize(c, &mi->field, u64s);
+       if (!f)
                goto err_unlock;
 
-       new_mi = dynamic_fault("bcache:add:member_info_realloc")
-               ? NULL
-               : kmalloc(sizeof(struct cache_member) * nr_in_set,
-                         GFP_KERNEL);
-       if (!new_mi) {
-               err = "cannot allocate memory";
-               ret = -ENOMEM;
+       mi = container_of(f, struct bch_sb_field_members, field);
+
+       f = bch_dev_sb_field_resize(&sb, &dev_mi->field, u64s);
+       if (!f)
                goto err_unlock;
-       }
 
-       memcpy(new_mi, c->disk_mi,
-              sizeof(struct cache_member) * nr_in_set);
-       new_mi[nr_this_dev] = mi;
+       dev_mi = container_of(f, struct bch_sb_field_members, field);
+       memcpy(dev_mi, mi, u64s * sizeof(u64));
+       dev_mi->members[dev_idx] = saved_mi;
 
-       sb.sb->nr_this_dev      = nr_this_dev;
-       sb.sb->nr_in_set        = nr_in_set;
-       sb.sb->u64s             = cpu_to_le16(u64s);
-       memcpy(sb.sb->members, new_mi,
-              sizeof(struct cache_member) * nr_in_set);
+       sb.sb->dev_idx          = dev_idx;
+       sb.sb->nr_devices       = nr_devices;
 
-       if (cache_set_mi_update(c, new_mi, nr_in_set)) {
+       if (bch_cache_set_mi_update(c, dev_mi->members, nr_devices)) {
                err = "cannot allocate memory";
                ret = -ENOMEM;
                goto err_unlock;
        }
 
        /* commit new member info */
-       swap(c->disk_mi, new_mi);
-       kfree(new_mi);
-       new_mi = NULL;
-       c->disk_sb.nr_in_set = nr_in_set;
-       c->sb.nr_in_set = nr_in_set;
+       memcpy(mi, dev_mi, u64s * sizeof(u64));
+       c->disk_sb->nr_devices  = nr_devices;
+       c->sb.nr_devices        = nr_devices;
 
        err = cache_alloc(&sb, c, &ca);
        if (err)
                goto err_unlock;
 
-       bcache_write_super(c);
+       bch_write_super(c);
 
        err = "journal alloc failed";
        if (bch_cache_journal_alloc(ca))
@@ -2206,21 +1669,23 @@ have_slot:
 
        bch_notify_cache_added(ca);
 
-       if (ca->mi.state == CACHE_ACTIVE) {
+       if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE) {
                err = __bch_cache_read_write(c, ca);
                if (err)
                        goto err_put;
        }
 
        kobject_put(&ca->kobj);
+       mutex_unlock(&c->sb_lock);
        mutex_unlock(&bch_register_lock);
        return 0;
 err_put:
        bch_cache_stop(ca);
 err_unlock:
-       kfree(new_mi);
-       free_super(&sb);
+       mutex_unlock(&c->sb_lock);
+err_unlock_register:
        mutex_unlock(&bch_register_lock);
+       bch_free_super(&sb);
 
        bch_err(c, "Unable to add device: %s", err);
        return ret ?: -EINVAL;
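
[Editor's note] bch_fs_sb_field_resize() and bch_dev_sb_field_resize() are new with the super-io rework; only their call sites appear in this diff. A hypothetical sketch of the core resize step, assuming fields are packed back to back, that u64s counts the whole field (as the have_slot computation above implies), and that the buffer has already been grown to fit; the real helpers presumably also reallocate the superblock pages and zero the new space:

	/* Hypothetical sketch; callers set f->type on a new field. */
	static struct bch_sb_field *sb_field_resize_sketch(struct bch_sb *sb,
							   struct bch_sb_field *f,
							   unsigned u64s)
	{
		unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0;

		if (f)		/* shift the fields after f */
			memmove((u64 *) f + u64s, (u64 *) f + old_u64s,
				((u64 *) vstruct_end(sb) -
				 ((u64 *) f + old_u64s)) * sizeof(u64));
		else		/* append a new field */
			f = vstruct_end(sb);

		sb->u64s = cpu_to_le32(le32_to_cpu(sb->u64s) + u64s - old_u64s);
		f->u64s = cpu_to_le32(u64s);
		return f;
	}
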
@@ -2250,14 +1715,14 @@ const char *bch_register_cache_set(char * const *devices, unsigned nr_devices,
                goto err;
 
        /*
-        * read_super() needs to happen under register_lock, so that the
+        * bch_read_super() needs to happen under register_lock, so that the
         * exclusive open is atomic with adding the new cache set to the list of
         * cache sets:
         */
        mutex_lock(&bch_register_lock);
 
        for (i = 0; i < nr_devices; i++) {
-               err = read_super(&sb[i], opts, devices[i]);
+               err = bch_read_super(&sb[i], opts, devices[i]);
                if (err)
                        goto err_unlock;
 
@@ -2265,13 +1730,13 @@ const char *bch_register_cache_set(char * const *devices, unsigned nr_devices,
                if (__SB_IS_BDEV(le64_to_cpu(sb[i].sb->version)))
                        goto err_unlock;
 
-               err = validate_cache_super(&sb[i]);
+               err = bch_validate_cache_super(&sb[i]);
                if (err)
                        goto err_unlock;
        }
 
        err = "cache set already registered";
-       if (cache_set_lookup(sb->sb->set_uuid))
+       if (cache_set_lookup(sb->sb->uuid))
                goto err_unlock;
 
        err = "cannot allocate memory";
@@ -2317,7 +1782,7 @@ err_unlock:
        mutex_unlock(&bch_register_lock);
 err:
        for (i = 0; i < nr_devices; i++)
-               free_super(&sb[i]);
+               bch_free_super(&sb[i]);
        goto out;
 }
 
@@ -2329,7 +1794,7 @@ const char *bch_register_one(const char *path)
 
        mutex_lock(&bch_register_lock);
 
-       err = read_super(&sb, opts, path);
+       err = bch_read_super(&sb, opts, path);
        if (err)
                goto err;
 
@@ -2338,7 +1803,7 @@ const char *bch_register_one(const char *path)
        else
                err = register_cache(&sb, opts);
 
-       free_super(&sb);
+       bch_free_super(&sb);
 err:
        mutex_unlock(&bch_register_lock);
        return err;
@@ -2440,8 +1905,8 @@ static void bcache_exit(void)
                class_destroy(bch_chardev_class);
        if (bch_chardev_major > 0)
                unregister_chrdev(bch_chardev_major, "bcache");
-       if (!IS_ERR_OR_NULL(bch_sha1))
-               crypto_free_shash(bch_sha1);
+       if (!IS_ERR_OR_NULL(bch_sha256))
+               crypto_free_shash(bch_sha256);
        unregister_reboot_notifier(&reboot);
 }
 
@@ -2459,8 +1924,8 @@ static int __init bcache_init(void)
        closure_debug_init();
        bkey_pack_test();
 
-       bch_sha1 = crypto_alloc_shash("sha1", 0, 0);
-       if (IS_ERR(bch_sha1))
+       bch_sha256 = crypto_alloc_shash("sha256", 0, 0);
+       if (IS_ERR(bch_sha256))
                goto err;
 
        bch_chardev_major = register_chrdev(0, "bcache-ctl", &bch_chardev_fops);
index 635e1a6f5cf30a5966f4fe24d757c28a05cb4788..014d7aed50a2e17a77b08ae463aa9d941ace3398 100644 (file)
@@ -18,17 +18,12 @@ static inline sector_t bucket_remainder(const struct cache *ca, sector_t s)
        return s & (ca->mi.bucket_size - 1);
 }
 
-#define cache_member_info_get(_c)                                      \
-       (rcu_read_lock(), rcu_dereference((_c)->members))
-
-#define cache_member_info_put()        rcu_read_unlock()
-
 static inline struct cache *bch_next_cache_rcu(struct cache_set *c,
                                               unsigned *iter)
 {
        struct cache *ret = NULL;
 
-       while (*iter < c->sb.nr_in_set &&
+       while (*iter < c->sb.nr_devices &&
               !(ret = rcu_dereference(c->cache[*iter])))
                (*iter)++;
 
@@ -59,40 +54,6 @@ static inline struct cache *bch_get_next_cache(struct cache_set *c,
             (ca = bch_get_next_cache(c, &(iter)));                     \
             percpu_ref_put(&ca->ref), (iter)++)
 
-void bch_check_mark_super_slowpath(struct cache_set *,
-                                  const struct bkey_i *, bool);
-
-static inline bool bch_check_super_marked(struct cache_set *c,
-                                         const struct bkey_i *k, bool meta)
-{
-       struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
-       const struct bch_extent_ptr *ptr;
-       struct cache_member_cpu *mi = cache_member_info_get(c)->m;
-       bool ret = true;
-
-       extent_for_each_ptr(e, ptr)
-               if (!(meta
-                     ? mi[ptr->dev].has_metadata
-                     : mi[ptr->dev].has_data) &&
-                   bch_extent_ptr_is_dirty(c, e, ptr)) {
-                       ret = false;
-                       break;
-               }
-
-       cache_member_info_put();
-
-       return ret;
-}
-
-static inline void bch_check_mark_super(struct cache_set *c,
-                                       const struct bkey_i *k, bool meta)
-{
-       if (bch_check_super_marked(c, k, meta))
-               return;
-
-       bch_check_mark_super_slowpath(c, k, meta);
-}
-
 static inline bool bch_cache_may_remove(struct cache *ca)
 {
        struct cache_set *c = ca->set;
@@ -119,11 +80,6 @@ static inline bool bch_cache_may_remove(struct cache *ca)
                rcu_access_pointer(tier->d[0].dev) != ca;
 }
 
-void free_super(struct bcache_superblock *);
-int bch_super_realloc(struct bcache_superblock *, unsigned);
-void bcache_write_super(struct cache_set *);
-void __write_super(struct cache_set *, struct bcache_superblock *);
-
 void bch_cache_set_release(struct kobject *);
 void bch_cache_release(struct kobject *);
 
@@ -149,7 +105,7 @@ extern struct mutex bch_register_lock;
 extern struct list_head bch_cache_sets;
 extern struct idr bch_cache_set_minor;
 extern struct workqueue_struct *bcache_io_wq;
-extern struct crypto_shash *bch_sha1;
+extern struct crypto_shash *bch_sha256;
 
 extern struct kobj_type bch_cache_set_ktype;
 extern struct kobj_type bch_cache_set_internal_ktype;
index d89f780f544fd68db38f7216cfe583e436c62c92..41eaf0dd50d28565868b71987e96c826df471dff 100644 (file)
@@ -2,7 +2,7 @@
 #define _BCACHE_SUPER_TYPES_H
 
 struct bcache_superblock {
-       struct cache_sb         *sb;
+       struct bch_sb           *sb;
        struct block_device     *bdev;
        struct bio              *bio;
        unsigned                page_order;
index 58a712594c66a74676d92381b60c8c32fbd74f05..57b7dd9d5edbedb53296938895091e5970c5b84c 100644 (file)
@@ -8,9 +8,11 @@
 #include "bcache.h"
 #include "alloc.h"
 #include "blockdev.h"
+#include "compress.h"
 #include "sysfs.h"
 #include "btree_cache.h"
 #include "btree_iter.h"
+#include "btree_update.h"
 #include "btree_gc.h"
 #include "buckets.h"
 #include "inode.h"
@@ -19,6 +21,7 @@
 #include "move.h"
 #include "opts.h"
 #include "request.h"
+#include "super-io.h"
 #include "writeback.h"
 
 #include <linux/blkdev.h>
@@ -139,14 +142,14 @@ read_attribute(tier);
        BCH_DEBUG_PARAMS()
 #undef BCH_DEBUG_PARAM
 
-#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm)     \
+#define BCH_OPT(_name, _choices, _min, _max, _sb_opt, _perm)           \
        static struct attribute sysfs_opt_##_name = {                   \
                .name = #_name,                                         \
                .mode = S_IRUGO|(_perm ? S_IWUSR : 0)                   \
        };
 
-       CACHE_SET_VISIBLE_OPTS()
-#undef CACHE_SET_OPT
+       BCH_VISIBLE_OPTS()
+#undef BCH_OPT
 
 #define BCH_TIME_STAT(name, frequency_units, duration_units)           \
        sysfs_time_stats_attribute(name, frequency_units, duration_units);
@@ -193,8 +196,8 @@ SHOW(bch_cached_dev)
        sysfs_print(state,              states[BDEV_STATE(dc->disk_sb.sb)]);
 
        if (attr == &sysfs_label) {
-               memcpy(buf, dc->disk_sb.sb->label, SB_LABEL_SIZE);
-               buf[SB_LABEL_SIZE + 1] = '\0';
+               buf[BCH_SB_LABEL_SIZE] = '\0';
+               buf[BCH_SB_LABEL_SIZE + 1] = '\0';
                strcat(buf, "\n");
                return strlen(buf);
        }
@@ -248,24 +251,25 @@ STORE(__cached_dev)
                u64 journal_seq = 0;
                int ret = 0;
 
-               if (size > SB_LABEL_SIZE)
+               if (size > BCH_SB_LABEL_SIZE)
                        return -EINVAL;
 
                mutex_lock(&dc->disk.inode_lock);
 
                memcpy(dc->disk_sb.sb->label, buf, size);
-               if (size < SB_LABEL_SIZE)
+               if (size < BCH_SB_LABEL_SIZE)
                        dc->disk_sb.sb->label[size] = '\0';
                if (size && dc->disk_sb.sb->label[size - 1] == '\n')
                        dc->disk_sb.sb->label[size - 1] = '\0';
 
                memcpy(dc->disk.inode.v.i_label,
-                      dc->disk_sb.sb->label, SB_LABEL_SIZE);
+                      dc->disk_sb.sb->label, BCH_SB_LABEL_SIZE);
 
                bch_write_bdev_super(dc, NULL);
 
                if (dc->disk.c)
-                       ret = bch_inode_update(dc->disk.c, &dc->disk.inode.k_i,
+                       ret = bch_btree_update(dc->disk.c, BTREE_ID_INODES,
+                                              &dc->disk.inode.k_i,
                                               &journal_seq);
 
                mutex_unlock(&dc->disk.inode_lock);
@@ -367,8 +371,8 @@ SHOW(bch_blockdev_volume)
        sysfs_hprint(size,      le64_to_cpu(d->inode.v.i_size));
 
        if (attr == &sysfs_label) {
-               memcpy(buf, d->inode.v.i_label, SB_LABEL_SIZE);
-               buf[SB_LABEL_SIZE + 1] = '\0';
+               memcpy(buf, d->inode.v.i_label, BCH_SB_LABEL_SIZE);
+               buf[BCH_SB_LABEL_SIZE] = '\0';
                strcat(buf, "\n");
                return strlen(buf);
        }
@@ -397,7 +401,8 @@ STORE(__bch_blockdev_volume)
                        }
                }
                d->inode.v.i_size = cpu_to_le64(v);
-               ret = bch_inode_update(d->c, &d->inode.k_i, &journal_seq);
+               ret = bch_btree_update(d->c, BTREE_ID_INODES,
+                                      &d->inode.k_i, &journal_seq);
 
                mutex_unlock(&d->inode_lock);
 
@@ -417,8 +422,9 @@ STORE(__bch_blockdev_volume)
 
                mutex_lock(&d->inode_lock);
 
-               memcpy(d->inode.v.i_label, buf, SB_LABEL_SIZE);
-               ret = bch_inode_update(d->c, &d->inode.k_i, &journal_seq);
+               memcpy(d->inode.v.i_label, buf, BCH_SB_LABEL_SIZE);
+               ret = bch_btree_update(d->c, BTREE_ID_INODES,
+                                      &d->inode.k_i, &journal_seq);
 
                mutex_unlock(&d->inode_lock);
 
@@ -677,10 +683,8 @@ SHOW(bch_cache_set)
        sysfs_print(tiering_percent,            c->tiering_percent);
        sysfs_pd_controller_show(tiering,       &c->tiering_pd);
 
-       sysfs_printf(meta_replicas_have, "%llu",
-                    CACHE_SET_META_REPLICAS_HAVE(&c->disk_sb));
-       sysfs_printf(data_replicas_have, "%llu",
-                    CACHE_SET_DATA_REPLICAS_HAVE(&c->disk_sb));
+       sysfs_printf(meta_replicas_have, "%u",  c->sb.meta_replicas_have);
+       sysfs_printf(data_replicas_have, "%u",  c->sb.data_replicas_have);
 
        /* Debugging: */
 
@@ -705,7 +709,7 @@ SHOW(bch_cache_set)
        if (attr == &sysfs_compression_stats)
                return bch_compression_stats(c, buf);
 
-       sysfs_printf(internal_uuid, "%pU", c->disk_sb.set_uuid.b);
+       sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b);
 
        return 0;
 }
@@ -945,15 +949,15 @@ SHOW(bch_cache_set_opts_dir)
 {
        struct cache_set *c = container_of(kobj, struct cache_set, opts_dir);
 
-#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm)     \
+#define BCH_OPT(_name, _choices, _min, _max, _sb_opt, _perm)           \
        if (attr == &sysfs_opt_##_name)                                 \
                return _choices == bch_bool_opt || _choices == bch_uint_opt\
                        ? snprintf(buf, PAGE_SIZE, "%i\n", c->opts._name)\
                        : bch_snprint_string_list(buf, PAGE_SIZE,       \
                                                _choices, c->opts._name);\
 
-       CACHE_SET_VISIBLE_OPTS()
-#undef CACHE_SET_OPT
+       BCH_VISIBLE_OPTS()
+#undef BCH_OPT
 
        return 0;
 }
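
[Editor's note] The CACHE_SET_OPT to BCH_OPT rename keeps the x-macro trick: each option expands to one if-on-its-sysfs-attribute. Written out by hand for a hypothetical string-list option named compression (the choices table name here is an assumption, not from this diff), the SHOW body above becomes:

	if (attr == &sysfs_opt_compression)
		return bch_compression_types == bch_bool_opt ||
		       bch_compression_types == bch_uint_opt
			? snprintf(buf, PAGE_SIZE, "%i\n", c->opts.compression)
			: bch_snprint_string_list(buf, PAGE_SIZE,
						  bch_compression_types,
						  c->opts.compression);
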
@@ -962,7 +966,7 @@ STORE(bch_cache_set_opts_dir)
 {
        struct cache_set *c = container_of(kobj, struct cache_set, opts_dir);
 
-#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm)     \
+#define BCH_OPT(_name, _choices, _min, _max, _sb_opt, _perm)           \
        if (attr == &sysfs_opt_##_name) {                               \
                ssize_t v = (_choices == bch_bool_opt ||                \
                             _choices == bch_uint_opt)                  \
@@ -972,18 +976,28 @@ STORE(bch_cache_set_opts_dir)
                if (v < 0)                                              \
                        return v;                                       \
                                                                        \
-               c->opts._name = v;                                      \
+               mutex_lock(&c->sb_lock);                                \
+               if (attr == &sysfs_opt_compression) {                   \
+                       int ret = bch_check_set_has_compressed_data(c, v);\
+                       if (ret) {                                      \
+                               mutex_unlock(&c->sb_lock);              \
+                               return ret;                             \
+                       }                                               \
+               }                                                       \
                                                                        \
-               if (_sb_opt##_BITS && v != _sb_opt(&c->disk_sb)) {      \
-                       SET_##_sb_opt(&c->disk_sb, v);                  \
-                       bcache_write_super(c);                          \
+               if (_sb_opt##_BITS && v != _sb_opt(c->disk_sb)) {       \
+                       SET_##_sb_opt(c->disk_sb, v);                   \
+                       bch_write_super(c);                     \
                }                                                       \
                                                                        \
+               c->opts._name = v;                                      \
+               mutex_unlock(&c->sb_lock);                              \
+                                                                       \
                return size;                                            \
        }
 
-       CACHE_SET_VISIBLE_OPTS()
-#undef CACHE_SET_OPT
+       BCH_VISIBLE_OPTS()
+#undef BCH_OPT
 
        return size;
 }
@@ -993,11 +1007,11 @@ static void bch_cache_set_opts_dir_release(struct kobject *k)
 }
 
 static struct attribute *bch_cache_set_opts_dir_files[] = {
-#define CACHE_SET_OPT(_name, _choices, _min, _max, _sb_opt, _perm)     \
+#define BCH_OPT(_name, _choices, _min, _max, _sb_opt, _perm)   \
        &sysfs_opt_##_name,
 
-       CACHE_SET_VISIBLE_OPTS()
-#undef CACHE_SET_OPT
+       BCH_VISIBLE_OPTS()
+#undef BCH_OPT
 
        NULL
 };
@@ -1176,7 +1190,7 @@ SHOW(bch_cache)
        struct cache_set *c = ca->set;
        struct bucket_stats_cache stats = bch_bucket_stats_read_cache(ca);
 
-       sysfs_printf(uuid,              "%pU\n", ca->disk_sb.sb->disk_uuid.b);
+       sysfs_printf(uuid,              "%pU\n", ca->uuid.b);
 
        sysfs_hprint(bucket_size,       bucket_bytes(ca));
        sysfs_print(bucket_size_bytes,  bucket_bytes(ca));
@@ -1242,17 +1256,21 @@ STORE(__bch_cache)
 {
        struct cache *ca = container_of(kobj, struct cache, kobj);
        struct cache_set *c = ca->set;
-       struct cache_member *mi = &c->disk_mi[ca->sb.nr_this_dev];
+       struct bch_member *mi;
 
        sysfs_pd_controller_store(copy_gc, &ca->moving_gc_pd);
 
        if (attr == &sysfs_discard) {
                bool v = strtoul_or_return(buf);
 
-               if (v != CACHE_DISCARD(mi)) {
-                       SET_CACHE_DISCARD(mi, v);
-                       bcache_write_super(c);
+               mutex_lock(&c->sb_lock);
+               mi = &bch_sb_get_members(c->disk_sb)->members[ca->dev_idx];
+
+               if (v != BCH_MEMBER_DISCARD(mi)) {
+                       SET_BCH_MEMBER_DISCARD(mi, v);
+                       bch_write_super(c);
                }
+               mutex_unlock(&c->sb_lock);
        }
 
        if (attr == &sysfs_cache_replacement_policy) {
@@ -1261,10 +1279,14 @@ STORE(__bch_cache)
                if (v < 0)
                        return v;
 
-               if ((unsigned) v != CACHE_REPLACEMENT(mi)) {
-                       SET_CACHE_REPLACEMENT(mi, v);
-                       bcache_write_super(c);
+               mutex_lock(&c->sb_lock);
+               mi = &bch_sb_get_members(c->disk_sb)->members[ca->dev_idx];
+
+               if ((unsigned) v != BCH_MEMBER_REPLACEMENT(mi)) {
+                       SET_BCH_MEMBER_REPLACEMENT(mi, v);
+                       bch_write_super(c);
                }
+               mutex_unlock(&c->sb_lock);
        }
 
        if (attr == &sysfs_state_rw) {
@@ -1279,14 +1301,14 @@ STORE(__bch_cache)
                        return size;
 
                switch (v) {
-               case CACHE_ACTIVE:
+               case BCH_MEMBER_STATE_ACTIVE:
                        err = bch_cache_read_write(ca);
                        break;
-               case CACHE_RO:
+               case BCH_MEMBER_STATE_RO:
                        bch_cache_read_only(ca);
                        break;
-               case CACHE_FAILED:
-               case CACHE_SPARE:
+               case BCH_MEMBER_STATE_FAILED:
+               case BCH_MEMBER_STATE_SPARE:
                        /*
                         * XXX: need to migrate data off and set correct state
                         */
index 39b04f7b234bf99ddeb0d1da72ddb14f5463e350..4686459433e9d12725b2dde1dd79150f96ca6243 100644 (file)
@@ -8,6 +8,7 @@
 #include "io.h"
 #include "keylist.h"
 #include "move.h"
+#include "super-io.h"
 #include "tier.h"
 
 #include <linux/freezer.h>
@@ -40,7 +41,7 @@ static bool tiering_pred(struct cache_set *c,
 
                mi = cache_member_info_get(c);
                extent_for_each_ptr(e, ptr)
-                       if (ptr->dev < mi->nr_in_set &&
+                       if (ptr->dev < mi->nr_devices &&
                            mi->m[ptr->dev].tier >= s->tier_idx)
                                replicas++;
                cache_member_info_put();
diff --git a/libbcache/vstructs.h b/libbcache/vstructs.h
new file mode 100644 (file)
index 0000000..ce2cece
--- /dev/null
@@ -0,0 +1,62 @@
+#ifndef _VSTRUCTS_H
+#define _VSTRUCTS_H
+
+#include "util.h"
+
+/*
+ * NOTE: we can't differentiate between __le64 and u64 with type_is - this
+ * assumes u64 is little endian:
+ */
+#define __vstruct_u64s(_s)                                             \
+({                                                                     \
+       ( type_is((_s)->u64s, u64) ? le64_to_cpu((_s)->u64s)            \
+       : type_is((_s)->u64s, u32) ? le32_to_cpu((_s)->u64s)            \
+       : type_is((_s)->u64s, u16) ? le16_to_cpu((_s)->u64s)            \
+       : ((_s)->u64s));                                                \
+})
+
+#define __vstruct_bytes(_type, _u64s)                                  \
+({                                                                     \
+       BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64));             \
+                                                                       \
+       (offsetof(_type, _data) + (_u64s) * sizeof(u64));               \
+})
+
+#define vstruct_bytes(_s)                                              \
+       __vstruct_bytes(typeof(*(_s)), __vstruct_u64s(_s))
+
+#define __vstruct_blocks(_type, _sector_block_bits, _u64s)             \
+       (round_up(__vstruct_bytes(_type, _u64s),                        \
+                 512 << (_sector_block_bits)) >> (9 + (_sector_block_bits)))
+
+#define vstruct_blocks(_s, _sector_block_bits)                         \
+       __vstruct_blocks(typeof(*(_s)), _sector_block_bits, __vstruct_u64s(_s))
+
+#define vstruct_blocks_plus(_s, _sector_block_bits, _u64s)             \
+       __vstruct_blocks(typeof(*(_s)), _sector_block_bits,             \
+                        __vstruct_u64s(_s) + (_u64s))
+
+#define vstruct_sectors(_s, _sector_block_bits)                                \
+       (round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9)
+
+#define vstruct_next(_s)                                               \
+       ((typeof(_s))                   ((_s)->_data + __vstruct_u64s(_s)))
+#define vstruct_last(_s)                                               \
+       ((typeof(&(_s)->start[0]))      ((_s)->_data + __vstruct_u64s(_s)))
+#define vstruct_end(_s)                                                        \
+       ((void *)                       ((_s)->_data + __vstruct_u64s(_s)))
+
+#define vstruct_for_each(_s, _i)                                       \
+       for (_i = (_s)->start;                                          \
+            _i < vstruct_last(_s);                                     \
+            _i = vstruct_next(_i))
+
+#define vstruct_for_each_safe(_s, _i, _t)                              \
+       for (_i = (_s)->start;                                          \
+            _i < vstruct_last(_s) && (_t = vstruct_next(_i), true);    \
+            _i = _t)
+
+#define vstruct_idx(_s, _idx)                                          \
+       ((typeof(&(_s)->start[0])) ((_s)->_data + (_idx)))
+
+#endif /* _VSTRUCTS_H */
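
[Editor's note] vstructs.h generalizes the u64s-counted variable-length structures used throughout the new superblock and journal formats. A small usage sketch with hypothetical types that follow the required convention: a u64s count, a _data member for u64-granularity address arithmetic, and a start[] array of entries that are themselves u64s-counted. Here each entry's u64s counts its payload (because _data sits after the header); placing _data at offset 0 instead makes u64s count the whole entry, and both layouts work with these macros.

	struct example_entry {
		__le16		u64s;	/* payload size after header, in u64s */
		/* entry payload follows */
		__u64		_data[0];
	};

	struct example_container {
		__le16		u64s;	/* total size of entries, in u64s */
		union {
			struct example_entry	start[0];
			__u64			_data[0];
		};
	};

	/* Walk every entry, trusting each entry's own u64s field: */
	static void walk_entries(struct example_container *container)
	{
		struct example_entry *i;

		vstruct_for_each(container, i)
			pr_debug("entry with %u payload u64s",
				 le16_to_cpu(i->u64s));
	}
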
index e9e0a9a7512f91d48598b373c5438150ddd71a05..56a8e8f8964a1d34d3c504dcd1baa6a4d0f61c79 100644 (file)
@@ -9,7 +9,6 @@
 
 #include <linux/posix_acl_xattr.h>
 #include <linux/xattr.h>
-#include <crypto/hash.h>
 
 struct xattr_search_key {
        u8              type;
@@ -22,37 +21,13 @@ struct xattr_search_key {
 static u64 bch_xattr_hash(const struct bch_hash_info *info,
                          const struct xattr_search_key *key)
 {
-       switch (info->type) {
-       case BCH_STR_HASH_SHA1: {
-               SHASH_DESC_ON_STACK(desc, bch_sha1);
-               u8 digest[SHA1_DIGEST_SIZE];
-               u64 ret;
+       struct bch_str_hash_ctx ctx;
 
-               desc->tfm = bch_sha1;
-               desc->flags = 0;
-               crypto_shash_init(desc);
+       bch_str_hash_init(&ctx, info);
+       bch_str_hash_update(&ctx, info, &key->type, sizeof(key->type));
+       bch_str_hash_update(&ctx, info, key->name.name, key->name.len);
 
-               crypto_shash_update(desc, (void *) &info->seed, sizeof(info->seed));
-
-               crypto_shash_update(desc, (void *) &key->type, sizeof(key->type));
-               crypto_shash_update(desc, (void *) key->name.name, key->name.len);
-
-               crypto_shash_final(desc, digest);
-               memcpy(&ret, &digest, sizeof(ret));
-               return ret >> 1;
-       }
-       default: {
-               struct bch_str_hash_ctx ctx;
-
-               bch_str_hash_init(&ctx, info->type);
-               bch_str_hash_update(&ctx, info->type, &info->seed, sizeof(info->seed));
-
-               bch_str_hash_update(&ctx, info->type, &key->type, sizeof(key->type));
-               bch_str_hash_update(&ctx, info->type, key->name.name, key->name.len);
-
-               return bch_str_hash_end(&ctx, info->type);
-       }
-       }
+       return bch_str_hash_end(&ctx, info);
 }
 
 #define xattr_val(_xattr)      ((_xattr)->x_name + (_xattr)->x_name_len)
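
The same init/update/end sequence now serves every hash type; a sketch of
another caller in the style of bch_xattr_hash() above (the dirent case is
illustrative; the signatures are the ones this patch introduces):

	static u64 sketch_dirent_hash(const struct bch_hash_info *info,
				      const struct qstr *name)
	{
		struct bch_str_hash_ctx ctx;

		bch_str_hash_init(&ctx, info);
		bch_str_hash_update(&ctx, info, name->name, name->len);

		return bch_str_hash_end(&ctx, info);
	}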
diff --git a/linux/crypto/algapi.c b/linux/crypto/algapi.c
deleted file mode 100644 (file)
index 5e8e97b..0000000
--- a/linux/crypto/algapi.c
+++ /dev/null
@@ -1,315 +0,0 @@
-/*
- * Cryptographic API for algorithms (i.e., low-level API).
- *
- * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- */
-
-#include <linux/byteorder.h>
-#include <linux/err.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/list.h>
-#include <linux/module.h>
-#include <linux/printk.h>
-#include <linux/rtnetlink.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-
-#include "internal.h"
-
-static inline int crypto_set_driver_name(struct crypto_alg *alg)
-{
-       static const char suffix[] = "-generic";
-       char *driver_name = alg->cra_driver_name;
-       int len;
-
-       if (*driver_name)
-               return 0;
-
-       len = strlcpy(driver_name, alg->cra_name, CRYPTO_MAX_ALG_NAME);
-       if (len + sizeof(suffix) > CRYPTO_MAX_ALG_NAME)
-               return -ENAMETOOLONG;
-
-       memcpy(driver_name + len, suffix, sizeof(suffix));
-       return 0;
-}
-
-static int crypto_check_alg(struct crypto_alg *alg)
-{
-       if (alg->cra_alignmask & (alg->cra_alignmask + 1))
-               return -EINVAL;
-
-       if (alg->cra_blocksize > PAGE_SIZE / 8)
-               return -EINVAL;
-
-       if (alg->cra_priority < 0)
-               return -EINVAL;
-
-       atomic_set(&alg->cra_refcnt, 1);
-
-       return crypto_set_driver_name(alg);
-}
-
-static int __crypto_register_alg(struct crypto_alg *alg)
-{
-       struct crypto_alg *q;
-       int ret = -EAGAIN;
-
-       INIT_LIST_HEAD(&alg->cra_users);
-
-       ret = -EEXIST;
-
-       list_for_each_entry(q, &crypto_alg_list, cra_list) {
-               if (q == alg)
-                       goto err;
-
-               if (!strcmp(q->cra_driver_name, alg->cra_name) ||
-                   !strcmp(q->cra_name, alg->cra_driver_name))
-                       goto err;
-       }
-
-       list_add(&alg->cra_list, &crypto_alg_list);
-       return 0;
-err:
-       return ret;
-}
-
-void crypto_remove_final(struct list_head *list)
-{
-       struct crypto_alg *alg;
-       struct crypto_alg *n;
-
-       list_for_each_entry_safe(alg, n, list, cra_list) {
-               list_del_init(&alg->cra_list);
-               crypto_alg_put(alg);
-       }
-}
-
-int crypto_register_alg(struct crypto_alg *alg)
-{
-       int err;
-
-       err = crypto_check_alg(alg);
-       if (err)
-               return err;
-
-       down_write(&crypto_alg_sem);
-       err = __crypto_register_alg(alg);
-       up_write(&crypto_alg_sem);
-
-       return err;
-}
-
-static int crypto_remove_alg(struct crypto_alg *alg, struct list_head *list)
-{
-       if (unlikely(list_empty(&alg->cra_list)))
-               return -ENOENT;
-
-       list_del_init(&alg->cra_list);
-       return 0;
-}
-
-int crypto_unregister_alg(struct crypto_alg *alg)
-{
-       int ret;
-       LIST_HEAD(list);
-
-       down_write(&crypto_alg_sem);
-       ret = crypto_remove_alg(alg, &list);
-       up_write(&crypto_alg_sem);
-
-       if (ret)
-               return ret;
-
-       BUG_ON(atomic_read(&alg->cra_refcnt) != 1);
-       if (alg->cra_destroy)
-               alg->cra_destroy(alg);
-
-       crypto_remove_final(&list);
-       return 0;
-}
-
-int crypto_register_algs(struct crypto_alg *algs, int count)
-{
-       int i, ret;
-
-       for (i = 0; i < count; i++) {
-               ret = crypto_register_alg(&algs[i]);
-               if (ret)
-                       goto err;
-       }
-
-       return 0;
-
-err:
-       for (--i; i >= 0; --i)
-               crypto_unregister_alg(&algs[i]);
-
-       return ret;
-}
-
-int crypto_unregister_algs(struct crypto_alg *algs, int count)
-{
-       int i, ret;
-
-       for (i = 0; i < count; i++) {
-               ret = crypto_unregister_alg(&algs[i]);
-               if (ret)
-                       pr_err("Failed to unregister %s %s: %d\n",
-                              algs[i].cra_driver_name, algs[i].cra_name, ret);
-       }
-
-       return 0;
-}
-
-struct crypto_attr_type *crypto_get_attr_type(struct rtattr **tb)
-{
-       struct rtattr *rta = tb[0];
-       struct crypto_attr_type *algt;
-
-       if (!rta)
-               return ERR_PTR(-ENOENT);
-       if (RTA_PAYLOAD(rta) < sizeof(*algt))
-               return ERR_PTR(-EINVAL);
-       if (rta->rta_type != CRYPTOA_TYPE)
-               return ERR_PTR(-EINVAL);
-
-       algt = RTA_DATA(rta);
-
-       return algt;
-}
-
-int crypto_check_attr_type(struct rtattr **tb, u32 type)
-{
-       struct crypto_attr_type *algt;
-
-       algt = crypto_get_attr_type(tb);
-       if (IS_ERR(algt))
-               return PTR_ERR(algt);
-
-       if ((algt->type ^ type) & algt->mask)
-               return -EINVAL;
-
-       return 0;
-}
-
-const char *crypto_attr_alg_name(struct rtattr *rta)
-{
-       struct crypto_attr_alg *alga;
-
-       if (!rta)
-               return ERR_PTR(-ENOENT);
-       if (RTA_PAYLOAD(rta) < sizeof(*alga))
-               return ERR_PTR(-EINVAL);
-       if (rta->rta_type != CRYPTOA_ALG)
-               return ERR_PTR(-EINVAL);
-
-       alga = RTA_DATA(rta);
-       alga->name[CRYPTO_MAX_ALG_NAME - 1] = 0;
-
-       return alga->name;
-}
-
-struct crypto_alg *crypto_attr_alg2(struct rtattr *rta,
-                                   const struct crypto_type *frontend,
-                                   u32 type, u32 mask)
-{
-       const char *name;
-
-       name = crypto_attr_alg_name(rta);
-       if (IS_ERR(name))
-               return ERR_CAST(name);
-
-       return crypto_find_alg(name, frontend, type, mask);
-}
-
-int crypto_attr_u32(struct rtattr *rta, u32 *num)
-{
-       struct crypto_attr_u32 *nu32;
-
-       if (!rta)
-               return -ENOENT;
-       if (RTA_PAYLOAD(rta) < sizeof(*nu32))
-               return -EINVAL;
-       if (rta->rta_type != CRYPTOA_U32)
-               return -EINVAL;
-
-       nu32 = RTA_DATA(rta);
-       *num = nu32->num;
-
-       return 0;
-}
-
-static inline void crypto_inc_byte(u8 *a, unsigned int size)
-{
-       u8 *b = (a + size);
-       u8 c;
-
-       for (; size; size--) {
-               c = *--b + 1;
-               *b = c;
-               if (c)
-                       break;
-       }
-}
-
-void crypto_inc(u8 *a, unsigned int size)
-{
-       __be32 *b = (__be32 *)(a + size);
-       u32 c;
-
-       for (; size >= 4; size -= 4) {
-               c = be32_to_cpu(*--b) + 1;
-               *b = cpu_to_be32(c);
-               if (c)
-                       return;
-       }
-
-       crypto_inc_byte(a, size);
-}
-
-static inline void crypto_xor_byte(u8 *a, const u8 *b, unsigned int size)
-{
-       for (; size; size--)
-               *a++ ^= *b++;
-}
-
-void crypto_xor(u8 *dst, const u8 *src, unsigned int size)
-{
-       u32 *a = (u32 *)dst;
-       u32 *b = (u32 *)src;
-
-       for (; size >= 4; size -= 4)
-               *a++ ^= *b++;
-
-       crypto_xor_byte((u8 *)a, (u8 *)b, size);
-}
-
-unsigned int crypto_alg_extsize(struct crypto_alg *alg)
-{
-       return alg->cra_ctxsize +
-              (alg->cra_alignmask & ~(crypto_tfm_ctx_alignment() - 1));
-}
-
-int crypto_type_has_alg(const char *name, const struct crypto_type *frontend,
-                       u32 type, u32 mask)
-{
-       int ret = 0;
-       struct crypto_alg *alg = crypto_find_alg(name, frontend, type, mask);
-
-       if (!IS_ERR(alg)) {
-               crypto_alg_put(alg);
-               ret = 1;
-       }
-
-       return ret;
-}
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Cryptographic algorithms API");
diff --git a/linux/crypto/api.c b/linux/crypto/api.c
index 513a48aa540d5f6c03d5460f5ee4b1c2a69f12b1..2d24630e00ed6169ca938594197b016247722e15 100644 (file)
--- a/linux/crypto/api.c
+++ b/linux/crypto/api.c
@@ -1,12 +1,7 @@
 /*
- * Scatterlist Cryptographic API.
+ * Cryptographic API for algorithms (i.e., low-level API).
  *
- * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
- * Copyright (c) 2002 David S. Miller (davem@redhat.com)
- * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au>
- *
- * Portions derived from Cryptoapi, by Alexander Kjeldaas <astor@fast.no>
- * and Nettle, by Niels Möller.
+ * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
 #include <linux/err.h>
 #include <linux/errno.h>
 #include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/param.h>
-#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/rwsem.h>
 #include <linux/slab.h>
 #include <linux/string.h>
+
+#include <crypto/algapi.h>
 #include "internal.h"
 
-LIST_HEAD(crypto_alg_list);
-DECLARE_RWSEM(crypto_alg_sem);
+static LIST_HEAD(crypto_alg_list);
+static DECLARE_RWSEM(crypto_alg_sem);
 
-static struct crypto_alg *__crypto_alg_lookup(const char *name, u32 type,
-                                             u32 mask)
+static unsigned crypto_ctxsize(struct crypto_alg *alg, u32 type, u32 mask)
 {
-       struct crypto_alg *q, *alg = NULL;
-       int best = -2;
-
-       list_for_each_entry(q, &crypto_alg_list, cra_list) {
-               int exact, fuzzy;
-
-               if ((q->cra_flags ^ type) & mask)
-                       continue;
-
-               exact = !strcmp(q->cra_driver_name, name);
-               fuzzy = !strcmp(q->cra_name, name);
-               if (!exact && !(fuzzy && q->cra_priority > best))
-                       continue;
-
-               if (unlikely(!crypto_alg_get(q)))
-                       continue;
-
-               best = q->cra_priority;
-               if (alg)
-                       crypto_alg_put(alg);
-               alg = q;
-
-               if (exact)
-                       break;
-       }
+       return alg->cra_type->ctxsize(alg, type, mask);
+}
 
-       return alg;
+unsigned crypto_alg_extsize(struct crypto_alg *alg)
+{
+       return alg->cra_ctxsize;
 }
 
 struct crypto_alg *crypto_alg_mod_lookup(const char *name, u32 type, u32 mask)
 {
        struct crypto_alg *alg;
 
-       /*
-        * If the internal flag is set for a cipher, require a caller to
-        * to invoke the cipher with the internal flag to use that cipher.
-        * Also, if a caller wants to allocate a cipher that may or may
-        * not be an internal cipher, use type | CRYPTO_ALG_INTERNAL and
-        * !(mask & CRYPTO_ALG_INTERNAL).
-        */
-       if (!((type | mask) & CRYPTO_ALG_INTERNAL))
-               mask |= CRYPTO_ALG_INTERNAL;
-
        down_read(&crypto_alg_sem);
-       alg = __crypto_alg_lookup(name, type, mask);
-       up_read(&crypto_alg_sem);
+       list_for_each_entry(alg, &crypto_alg_list, cra_list)
+               if (!((alg->cra_flags ^ type) & mask) &&
+                   !strcmp(alg->cra_name, name))
+                       goto found;
 
-       return alg ?: ERR_PTR(-ENOENT);
-}
-
-static int crypto_init_ops(struct crypto_tfm *tfm, u32 type, u32 mask)
-{
-       const struct crypto_type *type_obj = tfm->__crt_alg->cra_type;
-
-       if (type_obj)
-               return type_obj->init(tfm, type, mask);
-
-       switch (crypto_tfm_alg_type(tfm)) {
-       case CRYPTO_ALG_TYPE_CIPHER:
-               return crypto_init_cipher_ops(tfm);
-       default:
-               break;
-       }
+       alg = ERR_PTR(-ENOENT);
+found:
+       up_read(&crypto_alg_sem);
 
-       BUG();
-       return -EINVAL;
+       return alg;
 }
 
 static void crypto_exit_ops(struct crypto_tfm *tfm)
 {
-       const struct crypto_type *type = tfm->__crt_alg->cra_type;
-
-       if (type) {
-               if (tfm->exit)
-                       tfm->exit(tfm);
-               return;
-       }
-
-       switch (crypto_tfm_alg_type(tfm)) {
-       case CRYPTO_ALG_TYPE_CIPHER:
-               crypto_exit_cipher_ops(tfm);
-               break;
-
-       default:
-               BUG();
-       }
-}
-
-static unsigned int crypto_ctxsize(struct crypto_alg *alg, u32 type, u32 mask)
-{
-       const struct crypto_type *type_obj = alg->cra_type;
-       unsigned int len;
-
-       len = alg->cra_alignmask & ~(crypto_tfm_ctx_alignment() - 1);
-       if (type_obj)
-               return len + type_obj->ctxsize(alg, type, mask);
-
-       switch (alg->cra_flags & CRYPTO_ALG_TYPE_MASK) {
-       default:
-               BUG();
-
-       case CRYPTO_ALG_TYPE_CIPHER:
-               len += crypto_cipher_ctxsize(alg);
-               break;
-       }
-
-       return len;
+       if (tfm->exit)
+               tfm->exit(tfm);
 }
 
-struct crypto_tfm *__crypto_alloc_tfm(struct crypto_alg *alg, u32 type,
-                                     u32 mask)
+static struct crypto_tfm *__crypto_alloc_tfm(struct crypto_alg *alg,
+                                            u32 type, u32 mask)
 {
        struct crypto_tfm *tfm = NULL;
-       unsigned int tfm_size;
+       unsigned tfm_size;
        int err = -ENOMEM;
 
        tfm_size = sizeof(*tfm) + crypto_ctxsize(alg, type, mask);
        tfm = kzalloc(tfm_size, GFP_KERNEL);
        if (tfm == NULL)
-               goto out_err;
+               return ERR_PTR(-ENOMEM);
 
        tfm->__crt_alg = alg;
 
-       err = crypto_init_ops(tfm, type, mask);
+       err = alg->cra_type->init(tfm, type, mask);
        if (err)
                goto out_free_tfm;
 
        if (!tfm->exit && alg->cra_init && (err = alg->cra_init(tfm)))
                goto cra_init_failed;
 
-       goto out;
+       return tfm;
 
 cra_init_failed:
        crypto_exit_ops(tfm);
 out_free_tfm:
        kfree(tfm);
-out_err:
-       tfm = ERR_PTR(err);
-out:
-       return tfm;
+       return ERR_PTR(err);
 }
 
-/*
- *     crypto_alloc_base - Locate algorithm and allocate transform
- *     @alg_name: Name of algorithm
- *     @type: Type of algorithm
- *     @mask: Mask for type comparison
- *
- *     This function should not be used by new algorithm types.
- *     Please use crypto_alloc_tfm instead.
- *
- *     crypto_alloc_base() will first attempt to locate an already loaded
- *     algorithm.  If that fails and the kernel supports dynamically loadable
- *     modules, it will then attempt to load a module of the same name or
- *     alias.  If that fails it will send a query to any loaded crypto manager
- *     to construct an algorithm on the fly.  A refcount is grabbed on the
- *     algorithm which is then associated with the new transform.
- *
- *     The returned transform is of a non-determinate type.  Most people
- *     should use one of the more specific allocation functions such as
- *     crypto_alloc_blkcipher.
- *
- *     In case of error the return value is an error pointer.
- */
 struct crypto_tfm *crypto_alloc_base(const char *alg_name, u32 type, u32 mask)
 {
        struct crypto_alg *alg;
@@ -208,31 +100,29 @@ struct crypto_tfm *crypto_alloc_base(const char *alg_name, u32 type, u32 mask)
        }
 
        tfm = __crypto_alloc_tfm(alg, type, mask);
-       if (IS_ERR(tfm)) {
-               crypto_alg_put(alg);
+       if (IS_ERR(tfm))
                return tfm;
-       }
 
        return tfm;
 }
 
-void *crypto_create_tfm(struct crypto_alg *alg,
-                       const struct crypto_type *frontend)
+static void *crypto_create_tfm(struct crypto_alg *alg,
+                              const struct crypto_type *frontend)
 {
-       char *mem;
        struct crypto_tfm *tfm = NULL;
-       unsigned int tfmsize;
-       unsigned int total;
+       unsigned tfmsize;
+       unsigned total;
+       void *mem;
        int err = -ENOMEM;
 
        tfmsize = frontend->tfmsize;
        total = tfmsize + sizeof(*tfm) + frontend->extsize(alg);
 
        mem = kzalloc(total, GFP_KERNEL);
-       if (mem == NULL)
+       if (!mem)
                goto out_err;
 
-       tfm = (struct crypto_tfm *)(mem + tfmsize);
+       tfm = mem + tfmsize;
        tfm->__crt_alg = alg;
 
        err = frontend->init_tfm(tfm);
@@ -254,28 +144,23 @@ out:
        return mem;
 }
 
-struct crypto_alg *crypto_find_alg(const char *alg_name,
-                                  const struct crypto_type *frontend,
-                                  u32 type, u32 mask)
+static struct crypto_alg *crypto_find_alg(const char *alg_name,
+                                         const struct crypto_type *frontend,
+                                         u32 type, u32 mask)
 {
-       struct crypto_alg *(*lookup)(const char *name, u32 type, u32 mask) =
-               crypto_alg_mod_lookup;
-
        if (frontend) {
                type &= frontend->maskclear;
                mask &= frontend->maskclear;
                type |= frontend->type;
                mask |= frontend->maskset;
-
-               if (frontend->lookup)
-                       lookup = frontend->lookup;
        }
 
-       return lookup(alg_name, type, mask);
+       return crypto_alg_mod_lookup(alg_name, type, mask);
 }
 
 void *crypto_alloc_tfm(const char *alg_name,
-                      const struct crypto_type *frontend, u32 type, u32 mask)
+                      const struct crypto_type *frontend,
+                      u32 type, u32 mask)
 {
        struct crypto_alg *alg;
        void *tfm;
@@ -285,10 +170,8 @@ void *crypto_alloc_tfm(const char *alg_name,
                return ERR_CAST(alg);
 
        tfm = crypto_create_tfm(alg, frontend);
-       if (IS_ERR(tfm)) {
-               crypto_alg_put(alg);
+       if (IS_ERR(tfm))
                return tfm;
-       }
 
        return tfm;
 }
@@ -305,22 +188,16 @@ void crypto_destroy_tfm(void *mem, struct crypto_tfm *tfm)
        if (!tfm->exit && alg->cra_exit)
                alg->cra_exit(tfm);
        crypto_exit_ops(tfm);
-       crypto_alg_put(alg);
        kzfree(mem);
 }
 
-int crypto_has_alg(const char *name, u32 type, u32 mask)
+int crypto_register_alg(struct crypto_alg *alg)
 {
-       int ret = 0;
-       struct crypto_alg *alg = crypto_alg_mod_lookup(name, type, mask);
+       INIT_LIST_HEAD(&alg->cra_users);
 
-       if (!IS_ERR(alg)) {
-               crypto_alg_put(alg);
-               ret = 1;
-       }
+       down_write(&crypto_alg_sem);
+       list_add(&alg->cra_list, &crypto_alg_list);
+       up_write(&crypto_alg_sem);
 
-       return ret;
+       return 0;
 }
-
-MODULE_DESCRIPTION("Cryptographic core API");
-MODULE_LICENSE("GPL");
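
The rewritten lookup is an exact cra_name match filtered by
(cra_flags ^ type) & mask -- no larvals, no module loading, no priority
resolution. Sketch of what a caller sees (error handling elided):

	struct crypto_alg *alg;

	alg = crypto_alg_mod_lookup("sha256",
				    CRYPTO_ALG_TYPE_SHASH,
				    CRYPTO_ALG_TYPE_MASK);
	if (IS_ERR(alg))
		return PTR_ERR(alg);	/* -ENOENT if nothing registered */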
diff --git a/linux/crypto/blkcipher.c b/linux/crypto/blkcipher.c
new file mode 100644 (file)
index 0000000..31f9141
--- /dev/null
+++ b/linux/crypto/blkcipher.c
@@ -0,0 +1,47 @@
+/*
+ * Block chaining cipher operations.
+ *
+ * Generic encrypt/decrypt wrapper for ciphers, handles operations across
+ * multiple page boundaries by using temporary blocks.  In user context,
+ * the kernel is given a chance to schedule us once per page.
+ *
+ * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include <crypto/algapi.h>
+#include "internal.h"
+
+static unsigned crypto_blkcipher_ctxsize(struct crypto_alg *alg,
+                                        u32 type, u32 mask)
+{
+       return alg->cra_ctxsize;
+}
+
+static int crypto_init_blkcipher_ops(struct crypto_tfm *tfm, u32 type, u32 mask)
+{
+       struct blkcipher_tfm *crt = &tfm->crt_blkcipher;
+       struct blkcipher_alg *alg = &tfm->__crt_alg->cra_blkcipher;
+
+       BUG_ON((mask & CRYPTO_ALG_TYPE_MASK) != CRYPTO_ALG_TYPE_MASK);
+
+       crt->setkey     = alg->setkey;
+       crt->encrypt    = alg->encrypt;
+       crt->decrypt    = alg->decrypt;
+       return 0;
+}
+
+const struct crypto_type crypto_blkcipher_type = {
+       .ctxsize        = crypto_blkcipher_ctxsize,
+       .init           = crypto_init_blkcipher_ops,
+};
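
Usage sketch for this trimmed-down type, assuming the shim keeps the
kernel-style crypto_blkcipher wrapper inlines in <linux/crypto.h> (the
wrapper names follow the 4.x kernel API and are an assumption here, not
something this patch guarantees):

	/* Assumed wrappers; illustrative only. */
	struct crypto_blkcipher *chacha =
		crypto_alloc_blkcipher("chacha20", 0, CRYPTO_ALG_ASYNC);
	struct blkcipher_desc desc = { .tfm = chacha, .info = iv };
	struct scatterlist sg;

	sg_init_one(&sg, buf, len);
	crypto_blkcipher_setkey(chacha, key, 32);	   /* -> alg->setkey */
	crypto_blkcipher_encrypt_iv(&desc, &sg, &sg, len); /* in place */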
diff --git a/linux/crypto/chacha20_generic.c b/linux/crypto/chacha20_generic.c
new file mode 100644 (file)
index 0000000..7ac6832
--- /dev/null
+++ b/linux/crypto/chacha20_generic.c
@@ -0,0 +1,99 @@
+/*
+ * ChaCha20 256-bit cipher algorithm, RFC7539
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/byteorder.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/scatterlist.h>
+#include <asm/unaligned.h>
+
+#include <linux/crypto.h>
+#include <crypto/algapi.h>
+#include <crypto/chacha20.h>
+
+#include <sodium/crypto_stream_chacha20.h>
+
+struct chacha20_ctx {
+       u32 key[8];
+};
+
+static int crypto_chacha20_setkey(struct crypto_tfm *tfm, const u8 *key,
+                                 unsigned int keysize)
+{
+       struct chacha20_ctx *ctx = crypto_tfm_ctx(tfm);
+       int i;
+
+       if (keysize != CHACHA20_KEY_SIZE)
+               return -EINVAL;
+
+       for (i = 0; i < ARRAY_SIZE(ctx->key); i++)
+               ctx->key[i] = get_unaligned_le32(key + i * sizeof(u32));
+
+       return 0;
+}
+
+static int crypto_chacha20_crypt(struct blkcipher_desc *desc,
+                                struct scatterlist *dst,
+                                struct scatterlist *src,
+                                unsigned nbytes)
+{
+       struct chacha20_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+       struct scatterlist *sg = src;
+       u32 iv[4];
+       int ret;
+
+       BUG_ON(src != dst);
+
+       memcpy(iv, desc->info, sizeof(iv));
+
+       while (1) {
+               ret = crypto_stream_chacha20_xor_ic(sg_virt(sg),
+                                                   sg_virt(sg),
+                                                   sg->length,
+                                                   (void *) &iv[2],
+                                                   iv[0] | ((u64) iv[1] << 32),
+                                                   (void *) ctx->key);
+               BUG_ON(ret);
+
+               nbytes -= sg->length;
+
+               if (sg_is_last(sg))
+                       break;
+
+               BUG_ON(sg->length % CHACHA20_BLOCK_SIZE);
+               iv[0] += sg->length / CHACHA20_BLOCK_SIZE;
+               sg = sg_next(sg);
+       }
+
+       BUG_ON(nbytes);
+
+       return 0;
+}
+
+static struct crypto_alg alg = {
+       .cra_name               = "chacha20",
+       .cra_flags              = CRYPTO_ALG_TYPE_BLKCIPHER,
+       .cra_type               = &crypto_blkcipher_type,
+       .cra_ctxsize            = sizeof(struct chacha20_ctx),
+       .cra_u                  = {
+               .blkcipher = {
+                       .setkey         = crypto_chacha20_setkey,
+                       .encrypt        = crypto_chacha20_crypt,
+                       .decrypt        = crypto_chacha20_crypt,
+               },
+       },
+};
+
+__attribute__((constructor(110)))
+static int chacha20_generic_mod_init(void)
+{
+       return crypto_register_alg(&alg);
+}
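
The 16-byte desc->info this consumes follows directly from the
crypto_stream_chacha20_xor_ic() call above (all four initializer names
below are illustrative):

	/*
	 * iv[0], iv[1]: 64-bit starting block counter, low word first;
	 *               only iv[0] is advanced across scatterlist entries.
	 * iv[2], iv[3]: the 8-byte nonce handed to libsodium.
	 */
	u32 iv[4] = { counter_lo, counter_hi, nonce_lo, nonce_hi };

Note also the constraints the BUG_ON()s encode: encryption is in place
(src == dst), and every scatterlist entry except the last must be a
multiple of CHACHA20_BLOCK_SIZE.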
diff --git a/linux/crypto/cipher.c b/linux/crypto/cipher.c
deleted file mode 100644 (file)
index 6f47ac6..0000000
--- a/linux/crypto/cipher.c
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Cryptographic API.
- *
- * Cipher operations.
- *
- * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
- * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/crypto.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include "internal.h"
-
-static int setkey_unaligned(struct crypto_tfm *tfm, const u8 *key,
-                           unsigned int keylen)
-{
-       struct cipher_alg *cia = &tfm->__crt_alg->cra_cipher;
-       unsigned long alignmask = crypto_tfm_alg_alignmask(tfm);
-       int ret;
-       u8 *buffer, *alignbuffer;
-       unsigned long absize;
-
-       absize = keylen + alignmask;
-       buffer = kmalloc(absize, GFP_ATOMIC);
-       if (!buffer)
-               return -ENOMEM;
-
-       alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1);
-       memcpy(alignbuffer, key, keylen);
-       ret = cia->cia_setkey(tfm, alignbuffer, keylen);
-       memset(alignbuffer, 0, keylen);
-       kfree(buffer);
-       return ret;
-
-}
-
-static int setkey_default(struct crypto_tfm *tfm, const u8 *key,
-                         unsigned int keylen)
-{
-       struct cipher_alg *cia = &tfm->__crt_alg->cra_cipher;
-       unsigned long alignmask = crypto_tfm_alg_alignmask(tfm);
-
-       tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK;
-       if (keylen < cia->cia_min_keysize || keylen > cia->cia_max_keysize) {
-               tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
-               return -EINVAL;
-       }
-
-       if ((unsigned long)key & alignmask)
-               return setkey_unaligned(tfm, key, keylen);
-
-       return cia->cia_setkey(tfm, key, keylen);
-}
-
-static void cipher_crypt_unaligned(void (*fn)(struct crypto_tfm *, u8 *,
-                                             const u8 *),
-                                  struct crypto_tfm *tfm,
-                                  u8 *dst, const u8 *src)
-{
-       unsigned long alignmask = crypto_tfm_alg_alignmask(tfm);
-       unsigned int size = crypto_tfm_alg_blocksize(tfm);
-       u8 buffer[size + alignmask];
-       u8 *tmp = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1);
-
-       memcpy(tmp, src, size);
-       fn(tfm, tmp, tmp);
-       memcpy(dst, tmp, size);
-}
-
-static void cipher_encrypt_unaligned(struct crypto_tfm *tfm,
-                                    u8 *dst, const u8 *src)
-{
-       unsigned long alignmask = crypto_tfm_alg_alignmask(tfm);
-       struct cipher_alg *cipher = &tfm->__crt_alg->cra_cipher;
-
-       if (unlikely(((unsigned long)dst | (unsigned long)src) & alignmask)) {
-               cipher_crypt_unaligned(cipher->cia_encrypt, tfm, dst, src);
-               return;
-       }
-
-       cipher->cia_encrypt(tfm, dst, src);
-}
-
-static void cipher_decrypt_unaligned(struct crypto_tfm *tfm,
-                                    u8 *dst, const u8 *src)
-{
-       unsigned long alignmask = crypto_tfm_alg_alignmask(tfm);
-       struct cipher_alg *cipher = &tfm->__crt_alg->cra_cipher;
-
-       if (unlikely(((unsigned long)dst | (unsigned long)src) & alignmask)) {
-               cipher_crypt_unaligned(cipher->cia_decrypt, tfm, dst, src);
-               return;
-       }
-
-       cipher->cia_decrypt(tfm, dst, src);
-}
-
-int crypto_init_cipher_ops(struct crypto_tfm *tfm)
-{
-       struct cipher_tfm *ops = &tfm->crt_cipher;
-       struct cipher_alg *cipher = &tfm->__crt_alg->cra_cipher;
-
-       ops->cit_setkey = setkey_default;
-       ops->cit_encrypt_one = crypto_tfm_alg_alignmask(tfm) ?
-               cipher_encrypt_unaligned : cipher->cia_encrypt;
-       ops->cit_decrypt_one = crypto_tfm_alg_alignmask(tfm) ?
-               cipher_decrypt_unaligned : cipher->cia_decrypt;
-
-       return 0;
-}
-
-void crypto_exit_cipher_ops(struct crypto_tfm *tfm)
-{
-}
diff --git a/linux/crypto/internal.h b/linux/crypto/internal.h
index b00dcea2529bff22e753ad0f6739157d65a3f38c..5b21f836f6f6a43156f0c060a7faa944d70b54dd 100644 (file)
--- a/linux/crypto/internal.h
+++ b/linux/crypto/internal.h
 #ifndef _CRYPTO_INTERNAL_H
 #define _CRYPTO_INTERNAL_H
 
-#include <crypto/algapi.h>
-#include <linux/completion.h>
-#include <linux/mm.h>
-#include <linux/list.h>
-#include <linux/kernel.h>
-#include <linux/notifier.h>
-#include <linux/rwsem.h>
-#include <linux/slab.h>
+struct crypto_type;
+struct crypto_alg;
 
-struct crypto_instance;
-struct crypto_template;
-
-struct crypto_larval {
-       struct crypto_alg alg;
-       struct crypto_alg *adult;
-       struct completion completion;
-       u32 mask;
-};
-
-extern struct list_head crypto_alg_list;
-extern struct rw_semaphore crypto_alg_sem;
-
-static inline unsigned int crypto_cipher_ctxsize(struct crypto_alg *alg)
-{
-       return alg->cra_ctxsize;
-}
-
-int crypto_init_cipher_ops(struct crypto_tfm *tfm);
-void crypto_exit_cipher_ops(struct crypto_tfm *tfm);
-
-void crypto_remove_final(struct list_head *list);
-struct crypto_tfm *__crypto_alloc_tfm(struct crypto_alg *alg, u32 type,
-                                     u32 mask);
-void *crypto_create_tfm(struct crypto_alg *alg,
-                       const struct crypto_type *frontend);
-struct crypto_alg *crypto_find_alg(const char *alg_name,
-                                  const struct crypto_type *frontend,
-                                  u32 type, u32 mask);
-void *crypto_alloc_tfm(const char *alg_name,
-                      const struct crypto_type *frontend, u32 type, u32 mask);
-
-int crypto_register_notifier(struct notifier_block *nb);
-int crypto_unregister_notifier(struct notifier_block *nb);
-
-unsigned int crypto_alg_extsize(struct crypto_alg *alg);
-
-int crypto_type_has_alg(const char *name, const struct crypto_type *frontend,
-                       u32 type, u32 mask);
-
-static inline struct crypto_alg *crypto_alg_get(struct crypto_alg *alg)
-{
-       atomic_inc(&alg->cra_refcnt);
-       return alg;
-}
-
-static inline void crypto_alg_put(struct crypto_alg *alg)
-{
-       if (atomic_dec_and_test(&alg->cra_refcnt) && alg->cra_destroy)
-               alg->cra_destroy(alg);
-}
+void *crypto_alloc_tfm(const char *, const struct crypto_type *, u32, u32);
+unsigned int crypto_alg_extsize(struct crypto_alg *);
 
 #endif /* _CRYPTO_INTERNAL_H */
 
diff --git a/linux/crypto/poly1305_generic.c b/linux/crypto/poly1305_generic.c
new file mode 100644 (file)
index 0000000..5d385d5
--- /dev/null
+++ b/linux/crypto/poly1305_generic.c
@@ -0,0 +1,76 @@
+/*
+ * Poly1305 authenticator algorithm, RFC7539
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * Based on public domain code by Andrew Moon and Daniel J. Bernstein.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/byteorder.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <asm/unaligned.h>
+
+#include <linux/crypto.h>
+#include <crypto/algapi.h>
+#include <crypto/internal/hash.h>
+#include <crypto/poly1305.h>
+
+struct poly1305_desc_ctx {
+       bool                                    key_done;
+       crypto_onetimeauth_poly1305_state       s;
+};
+
+
+static int poly1305_init(struct shash_desc *desc)
+{
+       struct poly1305_desc_ctx *state = shash_desc_ctx(desc);
+
+       state->key_done = false;
+       return 0;
+}
+
+static int poly1305_update(struct shash_desc *desc,
+                          const u8 *src, unsigned len)
+{
+       struct poly1305_desc_ctx *state = shash_desc_ctx(desc);
+
+       if (!state->key_done) {
+               BUG_ON(len != crypto_onetimeauth_poly1305_KEYBYTES);
+
+               state->key_done = true;
+               return crypto_onetimeauth_poly1305_init(&state->s, src);
+       }
+
+       return crypto_onetimeauth_poly1305_update(&state->s, src, len);
+}
+
+static int poly1305_final(struct shash_desc *desc, u8 *out)
+{
+       struct poly1305_desc_ctx *state = shash_desc_ctx(desc);
+
+       return crypto_onetimeauth_poly1305_final(&state->s, out);
+}
+
+static struct shash_alg poly1305_alg = {
+       .digestsize     = crypto_onetimeauth_poly1305_BYTES,
+       .init           = poly1305_init,
+       .update         = poly1305_update,
+       .final          = poly1305_final,
+       .descsize       = sizeof(struct poly1305_desc_ctx),
+       .base           = {
+               .cra_name       = "poly1305",
+               .cra_flags      = CRYPTO_ALG_TYPE_SHASH,
+       },
+};
+
+__attribute__((constructor(110)))
+static int poly1305_mod_init(void)
+{
+       return crypto_register_shash(&poly1305_alg);
+}
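
The key_done flag above routes the one-time key through the normal update
path. Usage sketch (desc setup elided; the crypto_shash_* calls are assumed
to be the usual inline wrappers from this tree's <crypto/hash.h>):

	crypto_shash_init(desc);
	/* first update after init must be exactly the 32-byte key: */
	crypto_shash_update(desc, key, crypto_onetimeauth_poly1305_KEYBYTES);
	crypto_shash_update(desc, data, len);	/* the authenticated message */
	crypto_shash_final(desc, mac);		/* 16-byte tag */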
diff --git a/linux/crypto/sha1_generic.c b/linux/crypto/sha1_generic.c
deleted file mode 100644 (file)
index 31b5d12..0000000
--- a/linux/crypto/sha1_generic.c
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Cryptographic API.
- *
- * SHA1 Secure Hash Algorithm.
- *
- * Derived from cryptoapi implementation, adapted for in-place
- * scatterlist interface.
- *
- * Copyright (c) Alan Smithee.
- * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
- * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- */
-#include <crypto/internal/hash.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/cryptohash.h>
-#include <linux/types.h>
-#include <crypto/sha.h>
-#include <crypto/sha1_base.h>
-#include <asm/byteorder.h>
-
-const u8 sha1_zero_message_hash[SHA1_DIGEST_SIZE] = {
-       0xda, 0x39, 0xa3, 0xee, 0x5e, 0x6b, 0x4b, 0x0d,
-       0x32, 0x55, 0xbf, 0xef, 0x95, 0x60, 0x18, 0x90,
-       0xaf, 0xd8, 0x07, 0x09
-};
-
-static void sha1_generic_block_fn(struct sha1_state *sst, u8 const *src,
-                                 int blocks)
-{
-       u32 temp[SHA_WORKSPACE_WORDS];
-
-       while (blocks--) {
-               sha_transform(sst->state, src, temp);
-               src += SHA1_BLOCK_SIZE;
-       }
-       memzero_explicit(temp, sizeof(temp));
-}
-
-int crypto_sha1_update(struct shash_desc *desc, const u8 *data,
-                      unsigned int len)
-{
-       return sha1_base_do_update(desc, data, len, sha1_generic_block_fn);
-}
-
-static int sha1_final(struct shash_desc *desc, u8 *out)
-{
-       sha1_base_do_finalize(desc, sha1_generic_block_fn);
-       return sha1_base_finish(desc, out);
-}
-
-int crypto_sha1_finup(struct shash_desc *desc, const u8 *data,
-                     unsigned int len, u8 *out)
-{
-       sha1_base_do_update(desc, data, len, sha1_generic_block_fn);
-       return sha1_final(desc, out);
-}
-
-static struct shash_alg alg = {
-       .digestsize     =       SHA1_DIGEST_SIZE,
-       .init           =       sha1_base_init,
-       .update         =       crypto_sha1_update,
-       .final          =       sha1_final,
-       .finup          =       crypto_sha1_finup,
-       .descsize       =       sizeof(struct sha1_state),
-       .base           =       {
-               .cra_name       =       "sha1",
-               .cra_driver_name=       "sha1-generic",
-               .cra_flags      =       CRYPTO_ALG_TYPE_SHASH,
-               .cra_blocksize  =       SHA1_BLOCK_SIZE,
-               .cra_module     =       THIS_MODULE,
-       }
-};
-
-__attribute__((constructor(110)))
-static int __init sha1_generic_mod_init(void)
-{
-       return crypto_register_shash(&alg);
-}
diff --git a/linux/crypto/sha256_generic.c b/linux/crypto/sha256_generic.c
new file mode 100644 (file)
index 0000000..0bd272f
--- /dev/null
+++ b/linux/crypto/sha256_generic.c
@@ -0,0 +1,69 @@
+/*
+ * Cryptographic API.
+ *
+ * SHA-256, as specified in
+ * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf
+ *
+ * SHA-256 code by Jean-Luc Cooke <jlcooke@certainkey.com>.
+ *
+ * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ * SHA224 Support Copyright 2007 Intel Corporation <jonathan.lynch@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) 
+ * any later version.
+ *
+ */
+
+#include <linux/bitops.h>
+#include <linux/byteorder.h>
+#include <linux/types.h>
+#include <asm/unaligned.h>
+
+#include <linux/crypto.h>
+#include <crypto/internal/hash.h>
+
+#include <sodium/crypto_hash_sha256.h>
+
+static int sha256_init(struct shash_desc *desc)
+{
+       crypto_hash_sha256_state *state = shash_desc_ctx(desc);
+
+       return crypto_hash_sha256_init(state);
+}
+
+static int sha256_update(struct shash_desc *desc, const u8 *data,
+                         unsigned int len)
+{
+       crypto_hash_sha256_state *state = shash_desc_ctx(desc);
+
+       return crypto_hash_sha256_update(state, data, len);
+}
+
+static int sha256_final(struct shash_desc *desc, u8 *out)
+{
+       crypto_hash_sha256_state *state = shash_desc_ctx(desc);
+
+       return crypto_hash_sha256_final(state, out);
+}
+
+static struct shash_alg sha256_alg = {
+       .digestsize     = crypto_hash_sha256_BYTES,
+       .init           = sha256_init,
+       .update         = sha256_update,
+       .final          = sha256_final,
+       .descsize       = sizeof(crypto_hash_sha256_state),
+       .base           = {
+               .cra_name       = "sha256",
+               .cra_flags      = CRYPTO_ALG_TYPE_SHASH,
+       }
+};
+
+__attribute__((constructor(110)))
+static int __init sha256_generic_mod_init(void)
+{
+       return crypto_register_shash(&sha256_alg);
+}
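
Usage sketch for the libsodium-backed shash (SHASH_DESC_ON_STACK and the
crypto_shash_* wrappers are assumed from <crypto/hash.h>, as in the old
bch_xattr_hash(); error handling elided):

	struct crypto_shash *tfm = crypto_alloc_shash("sha256", 0, 0);
	SHASH_DESC_ON_STACK(desc, tfm);
	u8 digest[32];		/* crypto_hash_sha256_BYTES */

	desc->tfm = tfm;
	crypto_shash_init(desc);
	crypto_shash_update(desc, buf, buflen);
	crypto_shash_final(desc, digest);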
diff --git a/linux/crypto/shash.c b/linux/crypto/shash.c
index 406ddfe82a1e38b3a8c0906bb84f71f6ae49bf56..4f07a8b8221c3cefe267e0c7ec5e9ca032eee11f 100644 (file)
--- a/linux/crypto/shash.c
+++ b/linux/crypto/shash.c
 #include <crypto/internal/hash.h>
 #include <linux/err.h>
 #include <linux/kernel.h>
-#include <linux/module.h>
 #include <linux/printk.h>
 #include <linux/slab.h>
 
 #include "internal.h"
 
-static int shash_no_setkey(struct crypto_shash *tfm, const u8 *key,
-                          unsigned int keylen)
-{
-       return -ENOSYS;
-}
-
-static int shash_setkey_unaligned(struct crypto_shash *tfm, const u8 *key,
-                                 unsigned int keylen)
-{
-       struct shash_alg *shash = crypto_shash_alg(tfm);
-       unsigned long alignmask = crypto_shash_alignmask(tfm);
-       unsigned long absize;
-       u8 *buffer, *alignbuffer;
-       int err;
-
-       absize = keylen + (alignmask & ~(crypto_tfm_ctx_alignment() - 1));
-       buffer = kmalloc(absize, GFP_KERNEL);
-       if (!buffer)
-               return -ENOMEM;
-
-       alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1);
-       memcpy(alignbuffer, key, keylen);
-       err = shash->setkey(tfm, alignbuffer, keylen);
-       kzfree(buffer);
-       return err;
-}
-
-int crypto_shash_setkey(struct crypto_shash *tfm, const u8 *key,
-                       unsigned int keylen)
-{
-       struct shash_alg *shash = crypto_shash_alg(tfm);
-       unsigned long alignmask = crypto_shash_alignmask(tfm);
-
-       if ((unsigned long)key & alignmask)
-               return shash_setkey_unaligned(tfm, key, keylen);
-
-       return shash->setkey(tfm, key, keylen);
-}
-
-static inline unsigned int shash_align_buffer_size(unsigned len,
-                                                  unsigned long mask)
-{
-       typedef u8 __attribute__ ((aligned)) u8_aligned;
-       return len + (mask & ~(__alignof__(u8_aligned) - 1));
-}
-
-static int shash_update_unaligned(struct shash_desc *desc, const u8 *data,
-                                 unsigned int len)
-{
-       struct crypto_shash *tfm = desc->tfm;
-       struct shash_alg *shash = crypto_shash_alg(tfm);
-       unsigned long alignmask = crypto_shash_alignmask(tfm);
-       unsigned int unaligned_len = alignmask + 1 -
-                                    ((unsigned long)data & alignmask);
-       u8 ubuf[shash_align_buffer_size(unaligned_len, alignmask)]
-               __attribute__ ((aligned));
-       u8 *buf = PTR_ALIGN(&ubuf[0], alignmask + 1);
-       int err;
-
-       if (unaligned_len > len)
-               unaligned_len = len;
-
-       memcpy(buf, data, unaligned_len);
-       err = shash->update(desc, buf, unaligned_len);
-       memset(buf, 0, unaligned_len);
-
-       return err ?:
-              shash->update(desc, data + unaligned_len, len - unaligned_len);
-}
-
-int crypto_shash_update(struct shash_desc *desc, const u8 *data,
-                       unsigned int len)
-{
-       struct crypto_shash *tfm = desc->tfm;
-       struct shash_alg *shash = crypto_shash_alg(tfm);
-       unsigned long alignmask = crypto_shash_alignmask(tfm);
-
-       if ((unsigned long)data & alignmask)
-               return shash_update_unaligned(desc, data, len);
-
-       return shash->update(desc, data, len);
-}
-
-static int shash_final_unaligned(struct shash_desc *desc, u8 *out)
-{
-       struct crypto_shash *tfm = desc->tfm;
-       unsigned long alignmask = crypto_shash_alignmask(tfm);
-       struct shash_alg *shash = crypto_shash_alg(tfm);
-       unsigned int ds = crypto_shash_digestsize(tfm);
-       u8 ubuf[shash_align_buffer_size(ds, alignmask)]
-               __attribute__ ((aligned));
-       u8 *buf = PTR_ALIGN(&ubuf[0], alignmask + 1);
-       int err;
-
-       err = shash->final(desc, buf);
-       if (err)
-               goto out;
-
-       memcpy(out, buf, ds);
-
-out:
-       memset(buf, 0, ds);
-       return err;
-}
-
-int crypto_shash_final(struct shash_desc *desc, u8 *out)
-{
-       struct crypto_shash *tfm = desc->tfm;
-       struct shash_alg *shash = crypto_shash_alg(tfm);
-       unsigned long alignmask = crypto_shash_alignmask(tfm);
-
-       if ((unsigned long)out & alignmask)
-               return shash_final_unaligned(desc, out);
-
-       return shash->final(desc, out);
-}
-
-static int shash_finup_unaligned(struct shash_desc *desc, const u8 *data,
-                                unsigned int len, u8 *out)
+static int shash_finup(struct shash_desc *desc, const u8 *data,
+                      unsigned len, u8 *out)
 {
        return crypto_shash_update(desc, data, len) ?:
               crypto_shash_final(desc, out);
 }
 
-int crypto_shash_finup(struct shash_desc *desc, const u8 *data,
-                      unsigned int len, u8 *out)
-{
-       struct crypto_shash *tfm = desc->tfm;
-       struct shash_alg *shash = crypto_shash_alg(tfm);
-       unsigned long alignmask = crypto_shash_alignmask(tfm);
-
-       if (((unsigned long)data | (unsigned long)out) & alignmask)
-               return shash_finup_unaligned(desc, data, len, out);
-
-       return shash->finup(desc, data, len, out);
-}
-
-static int shash_digest_unaligned(struct shash_desc *desc, const u8 *data,
-                                 unsigned int len, u8 *out)
+static int shash_digest(struct shash_desc *desc, const u8 *data,
+                                 unsigned len, u8 *out)
 {
        return crypto_shash_init(desc) ?:
               crypto_shash_finup(desc, data, len, out);
 }
 
-int crypto_shash_digest(struct shash_desc *desc, const u8 *data,
-                       unsigned int len, u8 *out)
-{
-       struct crypto_shash *tfm = desc->tfm;
-       struct shash_alg *shash = crypto_shash_alg(tfm);
-       unsigned long alignmask = crypto_shash_alignmask(tfm);
-
-       if (((unsigned long)data | (unsigned long)out) & alignmask)
-               return shash_digest_unaligned(desc, data, len, out);
-
-       return shash->digest(desc, data, len, out);
-}
-
-static int shash_default_export(struct shash_desc *desc, void *out)
-{
-       memcpy(out, shash_desc_ctx(desc), crypto_shash_descsize(desc->tfm));
-       return 0;
-}
-
-static int shash_default_import(struct shash_desc *desc, const void *in)
-{
-       memcpy(shash_desc_ctx(desc), in, crypto_shash_descsize(desc->tfm));
-       return 0;
-}
-
 static int crypto_shash_init_tfm(struct crypto_tfm *tfm)
 {
        struct crypto_shash *hash = __crypto_shash_cast(tfm);
@@ -197,98 +41,32 @@ static int crypto_shash_init_tfm(struct crypto_tfm *tfm)
 }
 
 static const struct crypto_type crypto_shash_type = {
-       .extsize = crypto_alg_extsize,
-       .init_tfm = crypto_shash_init_tfm,
-       .maskclear = ~CRYPTO_ALG_TYPE_MASK,
-       .maskset = CRYPTO_ALG_TYPE_MASK,
-       .type = CRYPTO_ALG_TYPE_SHASH,
-       .tfmsize = offsetof(struct crypto_shash, base),
+       .extsize        = crypto_alg_extsize,
+       .init_tfm       = crypto_shash_init_tfm,
+       .maskclear      = ~CRYPTO_ALG_TYPE_MASK,
+       .maskset        = CRYPTO_ALG_TYPE_MASK,
+       .type           = CRYPTO_ALG_TYPE_SHASH,
+       .tfmsize        = offsetof(struct crypto_shash, base),
 };
 
-struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type,
-                                       u32 mask)
+struct crypto_shash *crypto_alloc_shash(const char *alg_name,
+                                       u32 type, u32 mask)
 {
        return crypto_alloc_tfm(alg_name, &crypto_shash_type, type, mask);
 }
 
-static int shash_prepare_alg(struct shash_alg *alg)
+int crypto_register_shash(struct shash_alg *alg)
 {
        struct crypto_alg *base = &alg->base;
 
-       if (alg->digestsize > PAGE_SIZE / 8 ||
-           alg->descsize > PAGE_SIZE / 8 ||
-           alg->statesize > PAGE_SIZE / 8)
-               return -EINVAL;
-
        base->cra_type = &crypto_shash_type;
        base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK;
        base->cra_flags |= CRYPTO_ALG_TYPE_SHASH;
 
        if (!alg->finup)
-               alg->finup = shash_finup_unaligned;
+               alg->finup = shash_finup;
        if (!alg->digest)
-               alg->digest = shash_digest_unaligned;
-       if (!alg->export) {
-               alg->export = shash_default_export;
-               alg->import = shash_default_import;
-               alg->statesize = alg->descsize;
-       }
-       if (!alg->setkey)
-               alg->setkey = shash_no_setkey;
-
-       return 0;
-}
-
-int crypto_register_shash(struct shash_alg *alg)
-{
-       struct crypto_alg *base = &alg->base;
-       int err;
-
-       err = shash_prepare_alg(alg);
-       if (err)
-               return err;
+               alg->digest = shash_digest;
 
        return crypto_register_alg(base);
 }
-
-int crypto_unregister_shash(struct shash_alg *alg)
-{
-       return crypto_unregister_alg(&alg->base);
-}
-
-int crypto_register_shashes(struct shash_alg *algs, int count)
-{
-       int i, ret;
-
-       for (i = 0; i < count; i++) {
-               ret = crypto_register_shash(&algs[i]);
-               if (ret)
-                       goto err;
-       }
-
-       return 0;
-
-err:
-       for (--i; i >= 0; --i)
-               crypto_unregister_shash(&algs[i]);
-
-       return ret;
-}
-
-int crypto_unregister_shashes(struct shash_alg *algs, int count)
-{
-       int i, ret;
-
-       for (i = count - 1; i >= 0; --i) {
-               ret = crypto_unregister_shash(&algs[i]);
-               if (ret)
-                       pr_err("Failed to unregister %s %s: %d\n",
-                              algs[i].base.cra_driver_name,
-                              algs[i].base.cra_name, ret);
-       }
-
-       return 0;
-}
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Synchronous cryptographic hash type");
diff --git a/linux/lz4hc_compress.c b/linux/lz4hc_compress.c
deleted file mode 100644 (file)
index b64ded0..0000000
--- a/linux/lz4hc_compress.c
+++ /dev/null
@@ -1,454 +0,0 @@
-/*
- * LZ4 HC - High Compression Mode of LZ4
- * Copyright (C) 2011-2012, Yann Collet.
- * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- *
- *     * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following disclaimer
- * in the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * You can contact the author at :
- * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
- * - LZ4 source repository : http://code.google.com/p/lz4/
- *
- *  Changed for kernel use by:
- *  Chanho Min <chanho.min@lge.com>
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/lz4.h>
-#include <asm/unaligned.h>
-#include "lz4defs.h"
-
-struct lz4hc_data {
-       const u8 *base;
-       HTYPE hashtable[HASHTABLESIZE];
-       u16 chaintable[MAXD];
-       const u8 *nexttoupdate;
-} __attribute__((__packed__));
-
-static inline int lz4hc_init(struct lz4hc_data *hc4, const u8 *base)
-{
-       memset((void *)hc4->hashtable, 0, sizeof(hc4->hashtable));
-       memset(hc4->chaintable, 0xFF, sizeof(hc4->chaintable));
-
-#if LZ4_ARCH64
-       hc4->nexttoupdate = base + 1;
-#else
-       hc4->nexttoupdate = base;
-#endif
-       hc4->base = base;
-       return 1;
-}
-
-/* Update chains up to ip (excluded) */
-static inline void lz4hc_insert(struct lz4hc_data *hc4, const u8 *ip)
-{
-       u16 *chaintable = hc4->chaintable;
-       HTYPE *hashtable  = hc4->hashtable;
-#if LZ4_ARCH64
-       const u8 * const base = hc4->base;
-#else
-       const int base = 0;
-#endif
-
-       while (hc4->nexttoupdate < ip) {
-               const u8 *p = hc4->nexttoupdate;
-               size_t delta = p - (hashtable[HASH_VALUE(p)] + base);
-               if (delta > MAX_DISTANCE)
-                       delta = MAX_DISTANCE;
-               chaintable[(size_t)(p) & MAXD_MASK] = (u16)delta;
-               hashtable[HASH_VALUE(p)] = (p) - base;
-               hc4->nexttoupdate++;
-       }
-}
-
-static inline int lz4hc_insertandfindbestmatch(struct lz4hc_data *hc4,
-               const u8 *ip, const u8 *const matchlimit, const u8 **matchpos)
-{
-       u16 *const chaintable = hc4->chaintable;
-       HTYPE *const hashtable = hc4->hashtable;
-       const u8 *ref;
-#if LZ4_ARCH64
-       const u8 * const base = hc4->base;
-#else
-       const int base = 0;
-#endif
-       int nbattempts = MAX_NB_ATTEMPTS;
-       size_t repl = 0, ml = 0;
-       u16 delta;
-
-       /* HC4 match finder */
-       lz4hc_insert(hc4, ip);
-       ref = hashtable[HASH_VALUE(ip)] + base;
-
-       /* potential repetition */
-       if (ref >= ip-4) {
-               /* confirmed */
-               if (A32(ref) == A32(ip)) {
-                       delta = (u16)(ip-ref);
-                       repl = ml  = common_length(ip + MINMATCH,
-                                       ref + MINMATCH, matchlimit) + MINMATCH;
-                       *matchpos = ref;
-               }
-               ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK];
-       }
-
-       while ((ref >= ip - MAX_DISTANCE) && nbattempts) {
-               nbattempts--;
-               if (*(ref + ml) == *(ip + ml)) {
-                       if (A32(ref) == A32(ip)) {
-                               size_t mlt =
-                                       common_length(ip + MINMATCH,
-                                       ref + MINMATCH, matchlimit) + MINMATCH;
-                               if (mlt > ml) {
-                                       ml = mlt;
-                                       *matchpos = ref;
-                               }
-                       }
-               }
-               ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK];
-       }
-
-       /* Complete table */
-       if (repl) {
-               const u8 *ptr = ip;
-               const u8 *end;
-               end = ip + repl - (MINMATCH-1);
-               /* Pre-Load */
-               while (ptr < end - delta) {
-                       chaintable[(size_t)(ptr) & MAXD_MASK] = delta;
-                       ptr++;
-               }
-               do {
-                       chaintable[(size_t)(ptr) & MAXD_MASK] = delta;
-                       /* Head of chain */
-                       hashtable[HASH_VALUE(ptr)] = (ptr) - base;
-                       ptr++;
-               } while (ptr < end);
-               hc4->nexttoupdate = end;
-       }
-
-       return (int)ml;
-}
-
-static inline int lz4hc_insertandgetwidermatch(struct lz4hc_data *hc4,
-       const u8 *ip, const u8 *startlimit, const u8 *matchlimit, int longest,
-       const u8 **matchpos, const u8 **startpos)
-{
-       u16 *const chaintable = hc4->chaintable;
-       HTYPE *const hashtable = hc4->hashtable;
-#if LZ4_ARCH64
-       const u8 * const base = hc4->base;
-#else
-       const int base = 0;
-#endif
-       const u8 *ref;
-       int nbattempts = MAX_NB_ATTEMPTS;
-       int delta = (int)(ip - startlimit);
-
-       /* First Match */
-       lz4hc_insert(hc4, ip);
-       ref = hashtable[HASH_VALUE(ip)] + base;
-
-       while ((ref >= ip - MAX_DISTANCE) && (ref >= hc4->base)
-               && (nbattempts)) {
-               nbattempts--;
-               if (*(startlimit + longest) == *(ref - delta + longest)) {
-                       if (A32(ref) == A32(ip)) {
-                               const u8 *reft = ref;
-                               const u8 *startt = ip;
-                               unsigned length =
-                                       common_length(ip + MINMATCH,
-                                                     ref + MINMATCH,
-                                                     matchlimit);
-
-                               while ((startt > startlimit)
-                                       && (reft > hc4->base)
-                                       && (startt[-1] == reft[-1])) {
-                                       startt--;
-                                       reft--;
-                                       length++;
-                               }
-
-                               if (length > longest) {
-                                       longest = length;
-                                       *matchpos = reft;
-                                       *startpos = startt;
-                               }
-                       }
-               }
-               ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK];
-       }
-       return longest;
-}
-
-static inline int lz4_encodesequence(const u8 **ip, u8 **op, const u8 **anchor,
-               int ml, const u8 *ref)
-{
-       unsigned length;
-       u8 *token;
-
-       /* Encode Literal length */
-       length = *ip - *anchor;
-       token = (*op)++;
-       *token = encode_length(op, length) << ML_BITS;
-
-       /* Copy Literals */
-       MEMCPY_ADVANCE_CHUNKED(*op, *anchor, length);
-
-       /* Encode Offset */
-       PUT_LE16_ADVANCE(*op, (u16)(*ip - ref));
-
-       *token += encode_length(op, ml - MINMATCH);
-
-       /* Prepare next loop */
-       *ip += ml;
-       *anchor = *ip;
-
-       return 0;
-}
-
-static int lz4_compresshcctx(struct lz4hc_data *ctx,
-               const char *source,
-               char *dest,
-               int isize)
-{
-       const u8 *ip = (const u8 *)source;
-       const u8 *anchor = ip;
-       const u8 *const iend = ip + isize;
-       const u8 *const mflimit = iend - MFLIMIT;
-       const u8 *const matchlimit = (iend - LASTLITERALS);
-
-       u8 *op = (u8 *)dest;
-
-       int ml, ml2, ml3, ml0;
-       const u8 *ref = NULL;
-       const u8 *start2 = NULL;
-       const u8 *ref2 = NULL;
-       const u8 *start3 = NULL;
-       const u8 *ref3 = NULL;
-       const u8 *start0;
-       const u8 *ref0;
-       int lastrun;
-
-       ip++;
-
-       /* Main Loop */
-       while (ip < mflimit) {
-               ml = lz4hc_insertandfindbestmatch(ctx, ip, matchlimit, (&ref));
-               if (!ml) {
-                       ip++;
-                       continue;
-               }
-
-               /* saved, in case we skip too far and need to back up */
-               start0 = ip;
-               ref0 = ref;
-               ml0 = ml;
-_search2:
-               if (ip+ml < mflimit)
-                       ml2 = lz4hc_insertandgetwidermatch(ctx, ip + ml - 2,
-                               ip + 1, matchlimit, ml, &ref2, &start2);
-               else
-                       ml2 = ml;
-               /* No better match */
-               if (ml2 == ml) {
-                       lz4_encodesequence(&ip, &op, &anchor, ml, ref);
-                       continue;
-               }
-
-               if (start0 < ip) {
-                       /* empirical */
-                       if (start2 < ip + ml0) {
-                               ip = start0;
-                               ref = ref0;
-                               ml = ml0;
-                       }
-               }
-               /*
-                * Here, start0 == ip.
-                * First match too small: discard it and retry from the
-                * second match.
-                */
-               if ((start2 - ip) < 3) {
-                       ml = ml2;
-                       ip = start2;
-                       ref = ref2;
-                       goto _search2;
-               }
-
-_search3:
-               /*
-                * Currently we have :
-                * Currently we have:
-                * ip1+3 <= ip2 (usually < ip1+ml1)
-                */
-               if ((start2 - ip) < OPTIMAL_ML) {
-                       int correction;
-                       int new_ml = ml;
-                       if (new_ml > OPTIMAL_ML)
-                               new_ml = OPTIMAL_ML;
-                       if (ip + new_ml > start2 + ml2 - MINMATCH)
-                               new_ml = (int)(start2 - ip) + ml2 - MINMATCH;
-                       correction = new_ml - (int)(start2 - ip);
-                       if (correction > 0) {
-                               start2 += correction;
-                               ref2 += correction;
-                               ml2 -= correction;
-                       }
-               }
-               /*
-                * Now, we have start2 = ip+new_ml,
-                * with new_ml=min(ml, OPTIMAL_ML=18)
-                */
-               if (start2 + ml2 < mflimit)
-                       ml3 = lz4hc_insertandgetwidermatch(ctx,
-                               start2 + ml2 - 3, start2, matchlimit,
-                               ml2, &ref3, &start3);
-               else
-                       ml3 = ml2;
-
-               /* No better match: 2 sequences to encode */
-               if (ml3 == ml2) {
-                       /* ip & ref are known; Now for ml */
-                       if (start2 < ip+ml)
-                               ml = (int)(start2 - ip);
-
-                       /* Now, encode 2 sequences */
-                       lz4_encodesequence(&ip, &op, &anchor, ml, ref);
-                       ip = start2;
-                       lz4_encodesequence(&ip, &op, &anchor, ml2, ref2);
-                       continue;
-               }
-
-               /* Not enough space for match 2: remove it */
-               if (start3 < ip + ml + 3) {
-                       /*
-                        * can write Seq1 immediately ==> Seq2 is removed,
-                        * so Seq3 becomes Seq1
-                        */
-                       if (start3 >= (ip + ml)) {
-                               if (start2 < ip + ml) {
-                                       int correction =
-                                               (int)(ip + ml - start2);
-                                       start2 += correction;
-                                       ref2 += correction;
-                                       ml2 -= correction;
-                                       if (ml2 < MINMATCH) {
-                                               start2 = start3;
-                                               ref2 = ref3;
-                                               ml2 = ml3;
-                                       }
-                               }
-
-                               lz4_encodesequence(&ip, &op, &anchor, ml, ref);
-                               ip  = start3;
-                               ref = ref3;
-                               ml  = ml3;
-
-                               start0 = start2;
-                               ref0 = ref2;
-                               ml0 = ml2;
-                               goto _search2;
-                       }
-
-                       start2 = start3;
-                       ref2 = ref3;
-                       ml2 = ml3;
-                       goto _search3;
-               }
-
-               /*
-                * OK, now we have 3 ascending matches; let's write at least
-                * the first one.  ip & ref are known; now for ml.
-                */
-               if (start2 < ip + ml) {
-                       if ((start2 - ip) < (int)ML_MASK) {
-                               int correction;
-                               if (ml > OPTIMAL_ML)
-                                       ml = OPTIMAL_ML;
-                               if (ip + ml > start2 + ml2 - MINMATCH)
-                                       ml = (int)(start2 - ip) + ml2
-                                               - MINMATCH;
-                               correction = ml - (int)(start2 - ip);
-                               if (correction > 0) {
-                                       start2 += correction;
-                                       ref2 += correction;
-                                       ml2 -= correction;
-                               }
-                       } else
-                               ml = (int)(start2 - ip);
-               }
-               lz4_encodesequence(&ip, &op, &anchor, ml, ref);
-
-               ip = start2;
-               ref = ref2;
-               ml = ml2;
-
-               start2 = start3;
-               ref2 = ref3;
-               ml2 = ml3;
-
-               goto _search3;
-       }
-
-       /* Encode Last Literals */
-       lastrun = (int)(iend - anchor);
-       if (lastrun >= (int)RUN_MASK) {
-               *op++ = (RUN_MASK << ML_BITS);
-               lastrun -= RUN_MASK;
-               for (; lastrun > 254; lastrun -= 255)
-                       *op++ = 255;
-               *op++ = (u8) lastrun;
-       } else
-               *op++ = (lastrun << ML_BITS);
-       memcpy(op, anchor, iend - anchor);
-       op += iend - anchor;
-       /* End */
-       return (int) (((char *)op) - dest);
-}
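
The tail of lz4_compresshcctx() flushes everything after the last match as a
literal-only run, using the same RUN_MASK extension scheme. For reference,
the matching decoder step looks roughly like this (a sketch, not code from
this file):

    #include <stddef.h>
    #include <stdint.h>

    /* Decode one token nibble plus its optional extension bytes. */
    static size_t decode_length(const uint8_t **ip, unsigned nibble)
    {
            size_t len = nibble;
            uint8_t b;

            if (nibble == 15) {          /* RUN_MASK/ML_MASK reached */
                    do {
                            b = *(*ip)++;
                            len += b;
                    } while (b == 255);  /* 255 means more bytes follow */
            }
            return len;
    }
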
-
-int lz4hc_compress(const unsigned char *src, size_t src_len,
-                       unsigned char *dst, size_t *dst_len, void *wrkmem)
-{
-       int ret = -1;
-       int out_len = 0;
-
-       struct lz4hc_data *hc4 = (struct lz4hc_data *)wrkmem;
-       lz4hc_init(hc4, (const u8 *)src);
-       out_len = lz4_compresshcctx(hc4, (const char *)src,
-               (char *)dst, (int)src_len);
-
-       if (out_len < 0)
-               goto exit;
-
-       *dst_len = out_len;
-       return 0;
-
-exit:
-       return ret;
-}
-EXPORT_SYMBOL(lz4hc_compress);
-
-MODULE_LICENSE("Dual BSD/GPL");
-MODULE_DESCRIPTION("LZ4HC compressor");
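
For context on the interface being deleted: lz4hc_compress() takes a
caller-supplied workspace large enough for struct lz4hc_data (in-kernel
callers size it with LZ4HC_MEM_COMPRESS from <linux/lz4.h>) and returns 0 on
success with *dst_len set. The output buffer must be sized for the worst
case, since this legacy API does no bounds checking. A hedged sketch of a
call site:

    #include <linux/errno.h>
    #include <linux/lz4.h>
    #include <linux/vmalloc.h>

    /*
     * Illustrative only: returns the compressed size or a negative errno;
     * dst must be able to hold the worst-case output.
     */
    static int compress_example(const unsigned char *src, size_t src_len,
                                unsigned char *dst)
    {
            void *wrkmem = vmalloc(LZ4HC_MEM_COMPRESS);
            size_t dst_len;
            int ret;

            if (!wrkmem)
                    return -ENOMEM;

            ret = lz4hc_compress(src, src_len, dst, &dst_len, wrkmem);
            vfree(wrkmem);

            return ret ? -EINVAL : (int)dst_len;
    }
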
diff --git a/linux/sha1.c b/linux/sha1.c
deleted file mode 100644 (file)
index 5a56dfd..0000000
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
- * SHA1 routine optimized to do word accesses rather than byte accesses,
- * and to avoid unnecessary copies into the context array.
- *
- * This was based on the git SHA1 implementation.
- */
-
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/bitops.h>
-#include <linux/cryptohash.h>
-#include <asm/unaligned.h>
-
-/*
- * If you have 32 registers or more, the compiler can (and should)
- * try to change the array[] accesses into registers. However, on
- * machines with fewer than ~25 registers, that won't really work,
- * and at least gcc will make an unholy mess of it.
- *
- * So to avoid that mess which just slows things down, we force
- * the stores to memory to actually happen (we might be better off
- * with a 'W(t)=(val); asm("" : "+m" (W(t)));' there instead, as
- * suggested by Artur Skawina - that will also make gcc unable to
- * try to do the silly "optimize away loads" part because it won't
- * see what the value will be).
- *
- * Ben Herrenschmidt reports that on PPC, the C version comes close
- * to the optimized asm with this (ie on PPC you don't want that
- * 'volatile', since there are lots of registers).
- *
- * On ARM we get the best code generation by forcing a full memory barrier
- * between each SHA_ROUND; otherwise gcc happily gets wild with spilling and
- * the stack frame size simply explodes and performance goes down the drain.
- */
-
-#ifdef CONFIG_X86
-  #define setW(x, val) (*(volatile __u32 *)&W(x) = (val))
-#elif defined(CONFIG_ARM)
-  #define setW(x, val) do { W(x) = (val); __asm__("":::"memory"); } while (0)
-#else
-  #define setW(x, val) (W(x) = (val))
-#endif
-
-/* This "rolls" over the 512-bit array */
-#define W(x) (array[(x)&15])
-
-/*
- * Where do we get the source from? The first 16 iterations get it from
- * the input data; the remaining 64 mix it from the 512-bit array.
- */
-#define SHA_SRC(t) get_unaligned_be32((__u32 *)data + t)
-#define SHA_MIX(t) rol32(W(t+13) ^ W(t+8) ^ W(t+2) ^ W(t), 1)
-
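
SHA_MIX plus the W() window implements the SHA-1 message schedule
W[t] = rol32(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1) with every index
reduced mod 16 ((t-3) mod 16 == (t+13) mod 16, and so on), so the 80-round
schedule fits in 16 words of workspace. The loop form below is equivalent;
in the actual transform the expansion is interleaved with the rounds:

    #include <stdint.h>

    static inline uint32_t rol32(uint32_t v, int n)
    {
            return (v << n) | (v >> (32 - n));
    }

    /* Loop-form equivalent of SHA_MIX over the rolling 16-word window. */
    static void expand_schedule(uint32_t array[16])
    {
            int t;

            for (t = 16; t < 80; t++)
                    array[t & 15] = rol32(array[(t + 13) & 15] ^
                                          array[(t + 8) & 15] ^
                                          array[(t + 2) & 15] ^
                                          array[t & 15], 1);
    }
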
-#define SHA_ROUND(t, input, fn, constant, A, B, C, D, E) do { \
-       __u32 TEMP = input(t); setW(t, TEMP); \
-       E += TEMP + rol32(A,5) + (fn) + (constant); \
-       B = ror32(B, 2); } while (0)
-
-#define T_0_15(t, A, B, C, D, E)  SHA_ROUND(t, SHA_SRC, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E )
-#define T_16_19(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E )
-#define T_20_39(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0x6ed9eba1, A, B, C, D, E )
-#define T_40_59(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, ((B&C)+(D&(B^C))) , 0x8f1bbcdc, A, B, C, D, E )
-#define T_60_79(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) ,  0xca62c1d6, A, B, C, D, E )
-
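
The boolean expressions in these round macros are operation-reduced forms of
the textbook SHA-1 functions: (((C^D)&B)^D) computes Ch(B,C,D) =
(B&C)|(~B&D) in three operations, and ((B&C)+(D&(B^C))) computes Maj(B,C,D)
= (B&C)|(B&D)|(C&D); the '+' is safe because the two terms never share a set
bit. A quick self-contained check (illustrative, not from this file):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint32_t b, c, d;

            /*
             * The functions are bitwise, so checking the eight single-bit
             * combinations proves the identities for whole 32-bit words.
             */
            for (b = 0; b < 2; b++)
                    for (c = 0; c < 2; c++)
                            for (d = 0; d < 2; d++) {
                                    uint32_t ch  = (b & c) | (~b & d);
                                    uint32_t maj = (b & c) | (b & d) | (c & d);

                                    assert((((c ^ d) & b) ^ d) == ch);
                                    assert(((b & c) + (d & (b ^ c))) == maj);
                            }
            puts("identities hold");
            return 0;
    }
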
-/**
- * sha_transform - single block SHA1 transform
- *
- * @digest: 160 bit digest to update
- * @data:   512 bits of data to hash
- * @array:  16 words of workspace (see note)
- *
- * This function generates a SHA1 digest for a single 512-bit block.
- * Be warned: it does not handle padding or finalization, so do not
- * confuse it with the full FIPS 180-1 digest algorithm for variable-
- * length messages.
- *
- * Note: If the hash is security sensitive, the caller should be sure
- * to clear the workspace. This is left to the caller to avoid
- * unnecessary clears between chained hashing operations.
- */
-void sha_transform(__u32 *digest, const char *data, __u32 *array)
-{
-       __u32 A, B, C, D, E;
-
-       A = digest[0];
-       B = digest[1];
-       C = digest[2];
-       D = digest[3];
-       E = digest[4];
-
-       /* Round 1 - iterations 0-15 take their input from 'data' */
-       T_0_15( 0, A, B, C, D, E);
-       T_0_15( 1, E, A, B, C, D);
-       T_0_15( 2, D, E, A, B, C);
-       T_0_15( 3, C, D, E, A, B);
-       T_0_15( 4, B, C, D, E, A);
-       T_0_15( 5, A, B, C, D, E);
-       T_0_15( 6, E, A, B, C, D);
-       T_0_15( 7, D, E, A, B, C);
-       T_0_15( 8, C, D, E, A, B);
-       T_0_15( 9, B, C, D, E, A);
-       T_0_15(10, A, B, C, D, E);
-       T_0_15(11, E, A, B, C, D);
-       T_0_15(12, D, E, A, B, C);
-       T_0_15(13, C, D, E, A, B);
-       T_0_15(14, B, C, D, E, A);
-       T_0_15(15, A, B, C, D, E);
-
-       /* Round 1 - tail. Input from 512-bit mixing array */
-       T_16_19(16, E, A, B, C, D);
-       T_16_19(17, D, E, A, B, C);
-       T_16_19(18, C, D, E, A, B);
-       T_16_19(19, B, C, D, E, A);
-
-       /* Round 2 */
-       T_20_39(20, A, B, C, D, E);
-       T_20_39(21, E, A, B, C, D);
-       T_20_39(22, D, E, A, B, C);
-       T_20_39(23, C, D, E, A, B);
-       T_20_39(24, B, C, D, E, A);
-       T_20_39(25, A, B, C, D, E);
-       T_20_39(26, E, A, B, C, D);
-       T_20_39(27, D, E, A, B, C);
-       T_20_39(28, C, D, E, A, B);
-       T_20_39(29, B, C, D, E, A);
-       T_20_39(30, A, B, C, D, E);
-       T_20_39(31, E, A, B, C, D);
-       T_20_39(32, D, E, A, B, C);
-       T_20_39(33, C, D, E, A, B);
-       T_20_39(34, B, C, D, E, A);
-       T_20_39(35, A, B, C, D, E);
-       T_20_39(36, E, A, B, C, D);
-       T_20_39(37, D, E, A, B, C);
-       T_20_39(38, C, D, E, A, B);
-       T_20_39(39, B, C, D, E, A);
-
-       /* Round 3 */
-       T_40_59(40, A, B, C, D, E);
-       T_40_59(41, E, A, B, C, D);
-       T_40_59(42, D, E, A, B, C);
-       T_40_59(43, C, D, E, A, B);
-       T_40_59(44, B, C, D, E, A);
-       T_40_59(45, A, B, C, D, E);
-       T_40_59(46, E, A, B, C, D);
-       T_40_59(47, D, E, A, B, C);
-       T_40_59(48, C, D, E, A, B);
-       T_40_59(49, B, C, D, E, A);
-       T_40_59(50, A, B, C, D, E);
-       T_40_59(51, E, A, B, C, D);
-       T_40_59(52, D, E, A, B, C);
-       T_40_59(53, C, D, E, A, B);
-       T_40_59(54, B, C, D, E, A);
-       T_40_59(55, A, B, C, D, E);
-       T_40_59(56, E, A, B, C, D);
-       T_40_59(57, D, E, A, B, C);
-       T_40_59(58, C, D, E, A, B);
-       T_40_59(59, B, C, D, E, A);
-
-       /* Round 4 */
-       T_60_79(60, A, B, C, D, E);
-       T_60_79(61, E, A, B, C, D);
-       T_60_79(62, D, E, A, B, C);
-       T_60_79(63, C, D, E, A, B);
-       T_60_79(64, B, C, D, E, A);
-       T_60_79(65, A, B, C, D, E);
-       T_60_79(66, E, A, B, C, D);
-       T_60_79(67, D, E, A, B, C);
-       T_60_79(68, C, D, E, A, B);
-       T_60_79(69, B, C, D, E, A);
-       T_60_79(70, A, B, C, D, E);
-       T_60_79(71, E, A, B, C, D);
-       T_60_79(72, D, E, A, B, C);
-       T_60_79(73, C, D, E, A, B);
-       T_60_79(74, B, C, D, E, A);
-       T_60_79(75, A, B, C, D, E);
-       T_60_79(76, E, A, B, C, D);
-       T_60_79(77, D, E, A, B, C);
-       T_60_79(78, C, D, E, A, B);
-       T_60_79(79, B, C, D, E, A);
-
-       digest[0] += A;
-       digest[1] += B;
-       digest[2] += C;
-       digest[3] += D;
-       digest[4] += E;
-}
-EXPORT_SYMBOL(sha_transform);
-
-/**
- * sha_init - initialize the vectors for a SHA1 digest
- * @buf: vector to initialize
- */
-void sha_init(__u32 *buf)
-{
-       buf[0] = 0x67452301;
-       buf[1] = 0xefcdab89;
-       buf[2] = 0x98badcfe;
-       buf[3] = 0x10325476;
-       buf[4] = 0xc3d2e1f0;
-}
-EXPORT_SYMBOL(sha_init);
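
Putting the deleted API together: sha_init() seeds the five chaining words
and sha_transform() consumes one 64-byte block, with all padding left to the
caller. A hedged sketch hashing the classic test vector "abc" (one padded
block; expected digest a9993e364706816aba3e25717850c26c9cd0d89d), assuming
the declarations from the likewise-removed <linux/cryptohash.h> are still in
scope:

    #include <stdio.h>
    #include <string.h>

    #include <linux/cryptohash.h>  /* sha_init(), sha_transform() */

    int main(void)
    {
            __u32 digest[SHA_DIGEST_WORDS], ws[SHA_WORKSPACE_WORDS];
            char block[64];
            int i;

            memset(block, 0, sizeof(block));
            memcpy(block, "abc", 3);
            block[3] = (char)0x80;  /* FIPS 180 '1'-bit terminator */
            block[63] = 24;         /* bit length (3 * 8), big endian */

            sha_init(digest);
            sha_transform(digest, block, ws);

            for (i = 0; i < 5; i++)
                    printf("%08x", digest[i]);
            printf("\n");
            return 0;
    }
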