]> git.sesse.net Git - bcachefs-tools-debian/commitdiff
Update bcachefs sources to 386f00b639 bcachefs: Snapshot creation, deletion
authorKent Overstreet <kent.overstreet@gmail.com>
Sun, 26 Sep 2021 22:19:46 +0000 (18:19 -0400)
committerKent Overstreet <kent.overstreet@gmail.com>
Sun, 26 Sep 2021 23:50:47 +0000 (19:50 -0400)
47 files changed:
.bcachefs_revision
cmd_debug.c
cmd_migrate.c
libbcachefs/acl.c
libbcachefs/acl.h
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/bcachefs_ioctl.h
libbcachefs/bkey.h
libbcachefs/bkey_methods.c
libbcachefs/btree_iter.c
libbcachefs/btree_iter.h
libbcachefs/btree_key_cache.c
libbcachefs/btree_locking.h
libbcachefs/btree_types.h
libbcachefs/btree_update.h
libbcachefs/btree_update_leaf.c
libbcachefs/buckets.c
libbcachefs/dirent.c
libbcachefs/dirent.h
libbcachefs/extents.c
libbcachefs/extents.h
libbcachefs/fs-common.c
libbcachefs/fs-common.h
libbcachefs/fs-io.c
libbcachefs/fs-ioctl.c
libbcachefs/fs.c
libbcachefs/fs.h
libbcachefs/fsck.c
libbcachefs/inode.c
libbcachefs/inode.h
libbcachefs/io.c
libbcachefs/io.h
libbcachefs/io_types.h
libbcachefs/migrate.c
libbcachefs/move.c
libbcachefs/opts.c
libbcachefs/opts.h
libbcachefs/recovery.c
libbcachefs/reflink.c
libbcachefs/reflink.h
libbcachefs/str_hash.h
libbcachefs/subvolume.c [new file with mode: 0644]
libbcachefs/subvolume.h [new file with mode: 0644]
libbcachefs/super.c
libbcachefs/xattr.c
libbcachefs/xattr.h

index d53addfb80e73e22d40dbb7055d92044ce75c4be..76bc725683cf45cc9927ca7886a191635fa5986e 100644 (file)
@@ -1 +1 @@
-bd6ed9fb42c0aa36d1f4a21eeab45fe12e1fb792
+386f00b6399a1eb38053c236aae87678f3535df7
index b3a6ea0c6d804de8afef5ec2c642780b9be9d798..aee19fbf48b4ea6c80550b9814d2f215de43a1b3 100644 (file)
@@ -191,6 +191,7 @@ static void list_keys(struct bch_fs *c, enum btree_id btree_id,
        bch2_trans_init(&trans, c, 0, 0);
 
        for_each_btree_key(&trans, iter, btree_id, start,
+                          BTREE_ITER_ALL_SNAPSHOTS|
                           BTREE_ITER_PREFETCH, k, ret) {
                if (bkey_cmp(k.k->p, end) > 0)
                        break;
index 51260906dccd95f71ed71627095dbe187777ba47..41cfe5d9ab4d9d9567bfe098ac56cd4a93030c23 100644 (file)
@@ -138,8 +138,9 @@ static void create_link(struct bch_fs *c,
        struct bch_inode_unpacked inode;
 
        int ret = bch2_trans_do(c, NULL, NULL, 0,
-               bch2_link_trans(&trans, parent->bi_inum, inum,
-                               &parent_u, &inode, &qstr));
+               bch2_link_trans(&trans,
+                               (subvol_inum) { 1, parent->bi_inum }, &parent_u,
+                               (subvol_inum) { 1, inum }, &inode, &qstr));
        if (ret)
                die("error creating hardlink: %s", strerror(-ret));
 }
@@ -155,9 +156,10 @@ static struct bch_inode_unpacked create_file(struct bch_fs *c,
 
        int ret = bch2_trans_do(c, NULL, NULL, 0,
                bch2_create_trans(&trans,
-                                 parent->bi_inum, parent,
+                                 (subvol_inum) { 1, parent->bi_inum }, parent,
                                  &new_inode, &qstr,
-                                 uid, gid, mode, rdev, NULL, NULL));
+                                 uid, gid, mode, rdev, NULL, NULL,
+                                 (subvol_inum) {}, 0));
        if (ret)
                die("error creating file: %s", strerror(-ret));
 
@@ -225,7 +227,9 @@ static void copy_xattrs(struct bch_fs *c, struct bch_inode_unpacked *dst,
                const struct xattr_handler *h = xattr_resolve_name(&attr);
 
                int ret = bch2_trans_do(c, NULL, NULL, 0,
-                               bch2_xattr_set(&trans, dst->bi_inum, &hash_info, attr,
+                               bch2_xattr_set(&trans,
+                                              (subvol_inum) { 1, dst->bi_inum },
+                                              &hash_info, attr,
                                               val, val_size, h->flags, 0));
                if (ret < 0)
                        die("error creating xattr: %s", strerror(-ret));
@@ -569,7 +573,8 @@ static void copy_fs(struct bch_fs *c, int src_fd, const char *src_path,
        syncfs(src_fd);
 
        struct bch_inode_unpacked root_inode;
-       int ret = bch2_inode_find_by_inum(c, BCACHEFS_ROOT_INO, &root_inode);
+       int ret = bch2_inode_find_by_inum(c, (subvol_inum) { 1, BCACHEFS_ROOT_INO },
+                                         &root_inode);
        if (ret)
                die("error looking up root directory: %s", strerror(-ret));
 
index 2146a63d1846353fbac2badd7e028a702bf24e84..f92b52e49254d32c34ae1316bb24e685011064fc 100644 (file)
@@ -229,7 +229,7 @@ retry:
        bch2_trans_begin(&trans);
 
        ret = bch2_hash_lookup(&trans, &iter, bch2_xattr_hash_desc,
-                       &hash, inode->v.i_ino,
+                       &hash, inode_inum(inode),
                        &X_SEARCH(acl_to_xattr_type(type), "", 0),
                        0);
        if (ret) {
@@ -259,11 +259,11 @@ out:
        return acl;
 }
 
-int bch2_set_acl_trans(struct btree_trans *trans,
+int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum,
                       struct bch_inode_unpacked *inode_u,
-                      const struct bch_hash_info *hash_info,
                       struct posix_acl *acl, int type)
 {
+       struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode_u);
        int ret;
 
        if (type == ACL_TYPE_DEFAULT &&
@@ -276,14 +276,14 @@ int bch2_set_acl_trans(struct btree_trans *trans,
                if (IS_ERR(xattr))
                        return PTR_ERR(xattr);
 
-               ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
-                                   inode_u->bi_inum, &xattr->k_i, 0);
+               ret = bch2_hash_set(trans, bch2_xattr_hash_desc, &hash_info,
+                                   inum, &xattr->k_i, 0);
        } else {
                struct xattr_search_key search =
                        X_SEARCH(acl_to_xattr_type(type), "", 0);
 
-               ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, hash_info,
-                                      inode_u->bi_inum, &search);
+               ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, &hash_info,
+                                      inum, &search);
        }
 
        return ret == -ENOENT ? 0 : ret;
@@ -297,7 +297,6 @@ int bch2_set_acl(struct user_namespace *mnt_userns,
        struct btree_trans trans;
        struct btree_iter inode_iter = { NULL };
        struct bch_inode_unpacked inode_u;
-       struct bch_hash_info hash_info;
        struct posix_acl *acl;
        umode_t mode;
        int ret;
@@ -308,7 +307,7 @@ retry:
        bch2_trans_begin(&trans);
        acl = _acl;
 
-       ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode->v.i_ino,
+       ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode),
                              BTREE_ITER_INTENT);
        if (ret)
                goto btree_err;
@@ -321,9 +320,7 @@ retry:
                        goto btree_err;
        }
 
-       hash_info = bch2_hash_info_init(c, &inode_u);
-
-       ret = bch2_set_acl_trans(&trans, &inode_u, &hash_info, acl, type);
+       ret = bch2_set_acl_trans(&trans, inode_inum(inode), &inode_u, acl, type);
        if (ret)
                goto btree_err;
 
@@ -352,7 +349,7 @@ err:
        return ret;
 }
 
-int bch2_acl_chmod(struct btree_trans *trans,
+int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
                   struct bch_inode_unpacked *inode,
                   umode_t mode,
                   struct posix_acl **new_acl)
@@ -366,7 +363,7 @@ int bch2_acl_chmod(struct btree_trans *trans,
        int ret;
 
        ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
-                       &hash_info, inode->bi_inum,
+                              &hash_info, inum,
                        &X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0),
                        BTREE_ITER_INTENT);
        if (ret)
index 25fc54dd08845884dd0b8e0fad064920ae230741..2ad214bd64aa5fd06290293ce2b62d45c878e41e 100644 (file)
@@ -28,25 +28,24 @@ typedef struct {
 
 struct posix_acl *bch2_get_acl(struct inode *, int);
 
-int bch2_set_acl_trans(struct btree_trans *,
+int bch2_set_acl_trans(struct btree_trans *, subvol_inum,
                       struct bch_inode_unpacked *,
-                      const struct bch_hash_info *,
                       struct posix_acl *, int);
 int bch2_set_acl(struct user_namespace *, struct inode *, struct posix_acl *, int);
-int bch2_acl_chmod(struct btree_trans *, struct bch_inode_unpacked *,
+int bch2_acl_chmod(struct btree_trans *, subvol_inum,
+                  struct bch_inode_unpacked *,
                   umode_t, struct posix_acl **);
 
 #else
 
-static inline int bch2_set_acl_trans(struct btree_trans *trans,
+static inline int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum,
                                     struct bch_inode_unpacked *inode_u,
-                                    const struct bch_hash_info *hash_info,
                                     struct posix_acl *acl, int type)
 {
        return 0;
 }
 
-static inline int bch2_acl_chmod(struct btree_trans *trans,
+static inline int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
                                 struct bch_inode_unpacked *inode,
                                 umode_t mode,
                                 struct posix_acl **new_acl)
index 9975fc173ccb7a3a87030353386f719bdc3bcb0f..0efb1aaa4bf680e0873b76a76422f6367514bfd0 100644 (file)
@@ -380,6 +380,8 @@ enum gc_phase {
        GC_PHASE_BTREE_alloc,
        GC_PHASE_BTREE_quotas,
        GC_PHASE_BTREE_reflink,
+       GC_PHASE_BTREE_subvolumes,
+       GC_PHASE_BTREE_snapshots,
 
        GC_PHASE_PENDING_DELETE,
 };
@@ -563,6 +565,21 @@ struct btree_path_buf {
 
 #define REPLICAS_DELTA_LIST_MAX        (1U << 16)
 
+struct snapshot_t {
+       u32                     parent;
+       u32                     children[2];
+       u32                     subvol; /* Nonzero only if a subvolume points to this node: */
+       u32                     equiv;
+};
+
+typedef struct {
+       u32             subvol;
+       u64             inum;
+} subvol_inum;
+
+#define BCACHEFS_ROOT_SUBVOL_INUM                                      \
+       ((subvol_inum) { BCACHEFS_ROOT_SUBVOL,  BCACHEFS_ROOT_INO })
+
 struct bch_fs {
        struct closure          cl;
 
@@ -634,6 +651,12 @@ struct bch_fs {
        struct closure          sb_write;
        struct mutex            sb_lock;
 
+       /* snapshot.c: */
+       GENRADIX(struct snapshot_t) snapshots;
+       struct bch_snapshot_table __rcu *snapshot_table;
+       struct mutex            snapshot_table_lock;
+       struct work_struct      snapshot_delete_work;
+
        /* BTREE CACHE */
        struct bio_set          btree_bio;
        struct workqueue_struct *io_complete_wq;
index 98779e46bbd0878d3773023aeaf2d276bb252a87..c082d5fce79aed80932debd7df3b0b22a1d614a3 100644 (file)
@@ -323,7 +323,7 @@ static inline void bkey_init(struct bkey *k)
 */
 #define BCH_BKEY_TYPES()                               \
        x(deleted,              0)                      \
-       x(discard,              1)                      \
+       x(whiteout,             1)                      \
        x(error,                2)                      \
        x(cookie,               3)                      \
        x(hash_whiteout,        4)                      \
@@ -342,7 +342,9 @@ static inline void bkey_init(struct bkey *k)
        x(inline_data,          17)                     \
        x(btree_ptr_v2,         18)                     \
        x(indirect_inline_data, 19)                     \
-       x(alloc_v2,             20)
+       x(alloc_v2,             20)                     \
+       x(subvolume,            21)                     \
+       x(snapshot,             22)
 
 enum bch_bkey_type {
 #define x(name, nr) KEY_TYPE_##name    = nr,
@@ -355,7 +357,7 @@ struct bch_deleted {
        struct bch_val          v;
 };
 
-struct bch_discard {
+struct bch_whiteout {
        struct bch_val          v;
 };
 
@@ -686,6 +688,10 @@ struct bch_inode_generation {
        __le32                  pad;
 } __attribute__((packed, aligned(8)));
 
+/*
+ * bi_subvol and bi_parent_subvol are only set for subvolume roots:
+ */
+
 #define BCH_INODE_FIELDS()                     \
        x(bi_atime,                     96)     \
        x(bi_ctime,                     96)     \
@@ -709,7 +715,9 @@ struct bch_inode_generation {
        x(bi_erasure_code,              16)     \
        x(bi_fields_set,                16)     \
        x(bi_dir,                       64)     \
-       x(bi_dir_offset,                64)
+       x(bi_dir_offset,                64)     \
+       x(bi_subvol,                    32)     \
+       x(bi_parent_subvol,             32)
 
 /* subset of BCH_INODE_FIELDS */
 #define BCH_INODE_OPTS()                       \
@@ -792,6 +800,9 @@ struct bch_dirent {
        __u8                    d_name[];
 } __attribute__((packed, aligned(8)));
 
+#define DT_SUBVOL      16
+#define BCH_DT_MAX     17
+
 #define BCH_NAME_MAX   (U8_MAX * sizeof(u64) -                         \
                         sizeof(struct bkey) -                          \
                         offsetof(struct bch_dirent, d_name))
@@ -928,6 +939,42 @@ struct bch_inline_data {
        u8                      data[0];
 };
 
+/* Subvolumes: */
+
+#define SUBVOL_POS_MIN         POS(0, 1)
+#define SUBVOL_POS_MAX         POS(0, S32_MAX)
+#define BCACHEFS_ROOT_SUBVOL   1
+
+struct bch_subvolume {
+       struct bch_val          v;
+       __le32                  flags;
+       __le32                  snapshot;
+       __le64                  inode;
+};
+
+LE32_BITMASK(BCH_SUBVOLUME_RO,         struct bch_subvolume, flags,  0,  1)
+/*
+ * We need to know whether a subvolume is a snapshot so we can know whether we
+ * can delete it (or whether it should just be rm -rf'd)
+ */
+LE32_BITMASK(BCH_SUBVOLUME_SNAP,       struct bch_subvolume, flags,  1,  2)
+
+/* Snapshots */
+
+struct bch_snapshot {
+       struct bch_val          v;
+       __le32                  flags;
+       __le32                  parent;
+       __le32                  children[2];
+       __le32                  subvol;
+       __le32                  pad;
+};
+
+LE32_BITMASK(BCH_SNAPSHOT_DELETED,     struct bch_snapshot, flags,  0,  1)
+
+/* True if a subvolume points to this snapshot node: */
+LE32_BITMASK(BCH_SNAPSHOT_SUBVOL,      struct bch_snapshot, flags,  1,  2)
+
 /* Optional/variable size superblock sections: */
 
 struct bch_sb_field {
@@ -1695,7 +1742,9 @@ LE32_BITMASK(JSET_NO_FLUSH,       struct jset, flags, 5, 6);
        x(alloc,        4)                      \
        x(quotas,       5)                      \
        x(stripes,      6)                      \
-       x(reflink,      7)
+       x(reflink,      7)                      \
+       x(subvolumes,   8)                      \
+       x(snapshots,    9)
 
 enum btree_id {
 #define x(kwd, val) BTREE_ID_##kwd = val,
index f679fc2151bc4cfdd2e18a42674352f87e7fba7e..930981ad55355a2ad64eea94681fc6010b50fbb7 100644 (file)
@@ -78,6 +78,9 @@ struct bch_ioctl_incremental {
 #define BCH_IOCTL_DISK_RESIZE  _IOW(0xbc,      14,  struct bch_ioctl_disk_resize)
 #define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc,15,  struct bch_ioctl_disk_resize_journal)
 
+#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc,  16,  struct bch_ioctl_subvolume)
+#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17,  struct bch_ioctl_subvolume)
+
 /* ioctl below act on a particular file, not the filesystem as a whole: */
 
 #define BCHFS_IOC_REINHERIT_ATTRS      _IOR(0xbc, 64, const char __user *)
@@ -349,4 +352,16 @@ struct bch_ioctl_disk_resize_journal {
        __u64                   nbuckets;
 };
 
+struct bch_ioctl_subvolume {
+       __u32                   flags;
+       __u32                   dirfd;
+       __u16                   mode;
+       __u16                   pad[3];
+       __u64                   dst_ptr;
+       __u64                   src_ptr;
+};
+
+#define BCH_SUBVOL_SNAPSHOT_CREATE     (1U << 0)
+#define BCH_SUBVOL_SNAPSHOT_RO         (1U << 1)
+
 #endif /* _BCACHEFS_IOCTL_H */
index c4a66f28ef4be08682064acac15cd1cb8f641656..7dee3d8e0a3d169160fab7018c6fe1ef55660eb5 100644 (file)
@@ -55,7 +55,7 @@ static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
 #define bkey_deleted(_k)       ((_k)->type == KEY_TYPE_deleted)
 
 #define bkey_whiteout(_k)                              \
-       ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_discard)
+       ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)
 
 enum bkey_lr_packed {
        BKEY_PACKED_BOTH,
index a03b5514a802288fefe91c18e6e99f8951aefabe..874defd8aff8b81e931723e489a2f1a4181c3593 100644 (file)
@@ -11,6 +11,7 @@
 #include "inode.h"
 #include "quota.h"
 #include "reflink.h"
+#include "subvolume.h"
 #include "xattr.h"
 
 const char * const bch2_bkey_types[] = {
@@ -30,7 +31,7 @@ static const char *deleted_key_invalid(const struct bch_fs *c,
        .key_invalid = deleted_key_invalid,             \
 }
 
-#define bch2_bkey_ops_discard (struct bkey_ops) {      \
+#define bch2_bkey_ops_whiteout (struct bkey_ops) {     \
        .key_invalid = deleted_key_invalid,             \
 }
 
@@ -100,6 +101,8 @@ const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k)
 
 static unsigned bch2_key_types_allowed[] = {
        [BKEY_TYPE_extents] =
+               (1U << KEY_TYPE_deleted)|
+               (1U << KEY_TYPE_whiteout)|
                (1U << KEY_TYPE_error)|
                (1U << KEY_TYPE_cookie)|
                (1U << KEY_TYPE_extent)|
@@ -107,26 +110,43 @@ static unsigned bch2_key_types_allowed[] = {
                (1U << KEY_TYPE_reflink_p)|
                (1U << KEY_TYPE_inline_data),
        [BKEY_TYPE_inodes] =
+               (1U << KEY_TYPE_deleted)|
+               (1U << KEY_TYPE_whiteout)|
                (1U << KEY_TYPE_inode)|
                (1U << KEY_TYPE_inode_generation),
        [BKEY_TYPE_dirents] =
+               (1U << KEY_TYPE_deleted)|
+               (1U << KEY_TYPE_whiteout)|
                (1U << KEY_TYPE_hash_whiteout)|
                (1U << KEY_TYPE_dirent),
        [BKEY_TYPE_xattrs] =
+               (1U << KEY_TYPE_deleted)|
+               (1U << KEY_TYPE_whiteout)|
                (1U << KEY_TYPE_cookie)|
                (1U << KEY_TYPE_hash_whiteout)|
                (1U << KEY_TYPE_xattr),
        [BKEY_TYPE_alloc] =
+               (1U << KEY_TYPE_deleted)|
                (1U << KEY_TYPE_alloc)|
                (1U << KEY_TYPE_alloc_v2),
        [BKEY_TYPE_quotas] =
+               (1U << KEY_TYPE_deleted)|
                (1U << KEY_TYPE_quota),
        [BKEY_TYPE_stripes] =
+               (1U << KEY_TYPE_deleted)|
                (1U << KEY_TYPE_stripe),
        [BKEY_TYPE_reflink] =
+               (1U << KEY_TYPE_deleted)|
                (1U << KEY_TYPE_reflink_v)|
                (1U << KEY_TYPE_indirect_inline_data),
+       [BKEY_TYPE_subvolumes] =
+               (1U << KEY_TYPE_deleted)|
+               (1U << KEY_TYPE_subvolume),
+       [BKEY_TYPE_snapshots] =
+               (1U << KEY_TYPE_deleted)|
+               (1U << KEY_TYPE_snapshot),
        [BKEY_TYPE_btree] =
+               (1U << KEY_TYPE_deleted)|
                (1U << KEY_TYPE_btree_ptr)|
                (1U << KEY_TYPE_btree_ptr_v2),
 };
@@ -134,21 +154,18 @@ static unsigned bch2_key_types_allowed[] = {
 const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
                                enum btree_node_type type)
 {
-       unsigned key_types_allowed = (1U << KEY_TYPE_deleted)|
-               bch2_key_types_allowed[type] ;
-
        if (k.k->u64s < BKEY_U64s)
                return "u64s too small";
 
-       if (!(key_types_allowed & (1U << k.k->type)))
+       if (!(bch2_key_types_allowed[type] & (1U << k.k->type)))
                return "invalid key type for this btree";
 
        if (type == BKEY_TYPE_btree &&
            bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
                return "value too big";
 
-       if (btree_node_type_is_extents(type)) {
-               if ((k.k->size == 0) != bkey_deleted(k.k))
+       if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
+               if (k.k->size == 0)
                        return "bad size field";
 
                if (k.k->size > k.k->p.offset)
@@ -165,7 +182,7 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
 
        if (type != BKEY_TYPE_btree &&
            btree_type_has_snapshots(type) &&
-           k.k->p.snapshot != U32_MAX)
+           !k.k->p.snapshot)
                return "invalid snapshot field";
 
        if (type != BKEY_TYPE_btree &&
index ce4d7c7e6f9b655dff015aa3093653579264965e..b5484d7702a49277de8f607153cc3f9633481f74 100644 (file)
@@ -13,6 +13,7 @@
 #include "extents.h"
 #include "journal.h"
 #include "replicas.h"
+#include "subvolume.h"
 
 #include <linux/prefetch.h>
 #include <trace/events/bcachefs.h>
@@ -152,7 +153,7 @@ bool __bch2_btree_node_relock(struct btree_trans *trans,
        if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
            (btree_node_lock_seq_matches(path, b, level) &&
             btree_node_lock_increment(trans, b, level, want))) {
-               mark_btree_node_locked(trans, path, level, want);
+               mark_btree_node_locked(path, level, want);
                return true;
        } else {
                return false;
@@ -188,7 +189,7 @@ static bool bch2_btree_node_upgrade(struct btree_trans *trans,
 
        return false;
 success:
-       mark_btree_node_intent_locked(trans, path, level);
+       mark_btree_node_intent_locked(path, level);
        return true;
 }
 
@@ -674,6 +675,9 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
 
 static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
 {
+       BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
+              !iter->pos.snapshot);
+
        BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
               iter->pos.snapshot != iter->snapshot);
 
@@ -681,6 +685,55 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
               bkey_cmp(iter->pos, iter->k.p) > 0);
 }
 
+static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
+{
+       struct btree_trans *trans = iter->trans;
+       struct btree_iter copy;
+       struct bkey_s_c prev;
+       int ret = 0;
+
+       if (!bch2_debug_check_iterators)
+               return 0;
+
+       if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS))
+               return 0;
+
+       if (bkey_err(k) || !k.k)
+               return 0;
+
+       BUG_ON(!bch2_snapshot_is_ancestor(trans->c,
+                                         iter->snapshot,
+                                         k.k->p.snapshot));
+
+       bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
+                            BTREE_ITER_ALL_SNAPSHOTS);
+       prev = bch2_btree_iter_prev(&copy);
+       if (!prev.k)
+               goto out;
+
+       ret = bkey_err(prev);
+       if (ret)
+               goto out;
+
+       if (!bkey_cmp(prev.k->p, k.k->p) &&
+           bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
+                                     prev.k->p.snapshot) > 0) {
+               char buf1[100], buf2[200];
+
+               bch2_bkey_to_text(&PBUF(buf1), k.k);
+               bch2_bkey_to_text(&PBUF(buf2), prev.k);
+
+               panic("iter snap %u\n"
+                     "k    %s\n"
+                     "prev %s\n",
+                     iter->snapshot,
+                     buf1, buf2);
+       }
+out:
+       bch2_trans_iter_exit(trans, &copy);
+       return ret;
+}
+
 #else
 
 static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
@@ -689,6 +742,7 @@ static inline void bch2_btree_path_verify(struct btree_trans *trans,
                                          struct btree_path *path) {}
 static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
 static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
+static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; }
 
 #endif
 
@@ -896,12 +950,12 @@ static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
                        bch2_btree_node_iter_peek_all(&l->iter, l->b));
 }
 
-static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
+static inline struct bkey_s_c btree_path_level_peek(struct bch_fs *c,
                                                    struct btree_path *path,
                                                    struct btree_path_level *l,
                                                    struct bkey *u)
 {
-       struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
+       struct bkey_s_c k = __btree_iter_unpack(c, l, u,
                        bch2_btree_node_iter_peek(&l->iter, l->b));
 
        path->pos = k.k ? k.k->p : l->b->key.k.p;
@@ -1041,7 +1095,7 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
                            t != BTREE_NODE_UNLOCKED) {
                                btree_node_unlock(path, b->c.level);
                                six_lock_increment(&b->c.lock, t);
-                               mark_btree_node_locked(trans, path, b->c.level, t);
+                               mark_btree_node_locked(path, b->c.level, t);
                        }
 
                        btree_path_level_init(trans, path, b);
@@ -1118,7 +1172,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans,
                        for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
                                path->l[i].b = NULL;
 
-                       mark_btree_node_locked(trans, path, path->level, lock_type);
+                       mark_btree_node_locked(path, path->level, lock_type);
                        btree_path_level_init(trans, path, b);
                        return 0;
                }
@@ -1210,7 +1264,7 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
        if (unlikely(ret))
                goto err;
 
-       mark_btree_node_locked(trans, path, level, lock_type);
+       mark_btree_node_locked(path, level, lock_type);
        btree_path_level_init(trans, path, b);
 
        if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 &&
@@ -1252,10 +1306,6 @@ retry_all:
 
        btree_trans_verify_sorted(trans);
 
-#ifdef CONFIG_BCACHEFS_DEBUG
-       trans->traverse_all_idx = U8_MAX;
-#endif
-
        for (i = trans->nr_sorted - 2; i >= 0; --i) {
                struct btree_path *path1 = trans->paths + trans->sorted[i];
                struct btree_path *path2 = trans->paths + trans->sorted[i + 1];
@@ -1294,9 +1344,6 @@ retry_all:
                path = trans->paths + trans->sorted[i];
 
                EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
-#ifdef CONFIG_BCACHEFS_DEBUG
-               trans->traverse_all_idx = path->idx;
-#endif
 
                ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_);
                if (ret)
@@ -1985,11 +2032,25 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
                }
 
                if (likely(k.k)) {
-                       if (likely(!bkey_deleted(k.k)))
-                               break;
+                       /*
+                        * We can never have a key in a leaf node at POS_MAX, so
+                        * we don't have to check these successor() calls:
+                        */
+                       if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
+                           !bch2_snapshot_is_ancestor(trans->c,
+                                                      iter->snapshot,
+                                                      k.k->p.snapshot)) {
+                               search_key = bpos_successor(k.k->p);
+                               continue;
+                       }
+
+                       if (bkey_whiteout(k.k) &&
+                           !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
+                               search_key = bkey_successor(iter, k.k->p);
+                               continue;
+                       }
 
-                       /* Advance to next key: */
-                       search_key = bkey_successor(iter, k.k->p);
+                       break;
                } else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) {
                        /* Advance to next leaf node: */
                        search_key = bpos_successor(iter->path->l[0].b->key.k.p);
@@ -2010,6 +2071,9 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
        else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
                iter->pos = bkey_start_pos(k.k);
 
+       if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+               iter->pos.snapshot = iter->snapshot;
+
        cmp = bpos_cmp(k.k->p, iter->path->pos);
        if (cmp) {
                iter->path = bch2_btree_path_make_mut(trans, iter->path,
@@ -2022,6 +2086,10 @@ out:
 
        bch2_btree_iter_verify_entry_exit(iter);
        bch2_btree_iter_verify(iter);
+       ret = bch2_btree_iter_verify_ret(iter, k);
+       if (unlikely(ret))
+               return bkey_s_c_err(ret);
+
        return k;
 }
 
@@ -2045,7 +2113,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
 {
        struct btree_trans *trans = iter->trans;
        struct bpos search_key = iter->pos;
+       struct btree_path *saved_path = NULL;
        struct bkey_s_c k;
+       struct bkey saved_k;
+       const struct bch_val *saved_v;
        int ret;
 
        EBUG_ON(iter->path->cached || iter->path->level);
@@ -2053,6 +2124,9 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
        bch2_btree_iter_verify(iter);
        bch2_btree_iter_verify_entry_exit(iter);
 
+       if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+               search_key.snapshot = U32_MAX;
+
        while (1) {
                iter->path = btree_path_set_pos(trans, iter->path, search_key,
                                                iter->flags & BTREE_ITER_INTENT);
@@ -2065,18 +2139,61 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
                        goto out;
                }
 
-               k = btree_path_level_peek(trans, iter->path,
+               k = btree_path_level_peek(trans->c, iter->path,
                                          &iter->path->l[0], &iter->k);
                if (!k.k ||
                    ((iter->flags & BTREE_ITER_IS_EXTENTS)
-                    ? bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0
-                    : bkey_cmp(k.k->p, iter->pos) > 0))
+                    ? bpos_cmp(bkey_start_pos(k.k), search_key) >= 0
+                    : bpos_cmp(k.k->p, search_key) > 0))
                        k = btree_path_level_prev(trans->c, iter->path,
                                                  &iter->path->l[0], &iter->k);
 
                btree_path_check_sort(trans, iter->path, 0);
 
                if (likely(k.k)) {
+                       if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) {
+                               if (k.k->p.snapshot == iter->snapshot)
+                                       goto got_key;
+
+                               /*
+                                * If we have a saved candidate, and we're no
+                                * longer at the same _key_ (not pos), return
+                                * that candidate
+                                */
+                               if (saved_path && bkey_cmp(k.k->p, saved_k.p)) {
+                                       bch2_path_put(trans, iter->path,
+                                                     iter->flags & BTREE_ITER_INTENT);
+                                       iter->path = saved_path;
+                                       saved_path = NULL;
+                                       iter->k = saved_k;
+                                       k.v     = saved_v;
+                                       goto got_key;
+                               }
+
+                               if (bch2_snapshot_is_ancestor(iter->trans->c,
+                                                             iter->snapshot,
+                                                             k.k->p.snapshot)) {
+                                       if (saved_path)
+                                               bch2_path_put(trans, saved_path,
+                                                     iter->flags & BTREE_ITER_INTENT);
+                                       saved_path = btree_path_clone(trans, iter->path,
+                                                               iter->flags & BTREE_ITER_INTENT);
+                                       saved_k = *k.k;
+                                       saved_v = k.v;
+                               }
+
+                               search_key = bpos_predecessor(k.k->p);
+                               continue;
+                       }
+got_key:
+                       if (bkey_whiteout(k.k) &&
+                           !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
+                               search_key = bkey_predecessor(iter, k.k->p);
+                               if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+                                       search_key.snapshot = U32_MAX;
+                               continue;
+                       }
+
                        break;
                } else if (likely(bpos_cmp(iter->path->l[0].b->data->min_key, POS_MIN))) {
                        /* Advance to previous leaf node: */
@@ -2094,7 +2211,12 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
        /* Extents can straddle iter->pos: */
        if (bkey_cmp(k.k->p, iter->pos) < 0)
                iter->pos = k.k->p;
+
+       if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+               iter->pos.snapshot = iter->snapshot;
 out:
+       if (saved_path)
+               bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_INTENT);
        iter->path->should_be_locked = true;
 
        bch2_btree_iter_verify_entry_exit(iter);
@@ -2143,7 +2265,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
        if (unlikely(ret))
                return bkey_s_c_err(ret);
 
-       if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) {
+       if ((iter->flags & BTREE_ITER_CACHED) ||
+           !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
                struct bkey_i *next_update;
 
                next_update = iter->flags & BTREE_ITER_WITH_UPDATES
@@ -2202,6 +2325,9 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
 
        bch2_btree_iter_verify_entry_exit(iter);
        bch2_btree_iter_verify(iter);
+       ret = bch2_btree_iter_verify_ret(iter, k);
+       if (unlikely(ret))
+               return bkey_s_c_err(ret);
 
        return k;
 }
@@ -2352,13 +2478,13 @@ static void __bch2_trans_iter_init(struct btree_trans *trans,
            btree_node_type_is_extents(btree_id))
                flags |= BTREE_ITER_IS_EXTENTS;
 
-       if (!btree_type_has_snapshots(btree_id) &&
-           !(flags & __BTREE_ITER_ALL_SNAPSHOTS))
+       if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
+           !btree_type_has_snapshots(btree_id))
                flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
 
-       if (!(flags & BTREE_ITER_ALL_SNAPSHOTS))
-               pos.snapshot = btree_type_has_snapshots(btree_id)
-                       ? U32_MAX : 0;
+       if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+           btree_type_has_snapshots(btree_id))
+               flags |= BTREE_ITER_FILTER_SNAPSHOTS;
 
        iter->trans     = trans;
        iter->path      = NULL;
index be1bb489f3d63816b325368935c3a78b54a96a70..19ca73f5ea2265ad8dee726c1a1c97c7c6584687 100644 (file)
@@ -234,6 +234,15 @@ static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *it
        iter->pos = bkey_start_pos(&iter->k);
 }
 
+static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 snapshot)
+{
+       struct bpos pos = iter->pos;
+
+       iter->snapshot = snapshot;
+       pos.snapshot = snapshot;
+       bch2_btree_iter_set_pos(iter, pos);
+}
+
 /*
  * Unlocks before scheduling
  * Note: does not revalidate iterator
index 938ced36af73c3eb6d7be666d55cc953890f5371..4f1bc1d165aa6331fe140c507987b9786b6f3f11 100644 (file)
@@ -163,6 +163,11 @@ btree_key_cache_create(struct btree_key_cache *c,
                was_new = false;
        }
 
+       if (btree_id == BTREE_ID_subvolumes)
+               six_lock_pcpu_alloc(&ck->c.lock);
+       else
+               six_lock_pcpu_free(&ck->c.lock);
+
        ck->c.level             = 0;
        ck->c.btree_id          = btree_id;
        ck->key.btree_id        = btree_id;
@@ -296,7 +301,7 @@ retry:
                if (!ck)
                        goto retry;
 
-               mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
+               mark_btree_node_locked(path, 0, SIX_LOCK_intent);
                path->locks_want = 1;
        } else {
                enum six_lock_type lock_want = __btree_lock_want(path, 0);
@@ -318,7 +323,7 @@ retry:
                        goto retry;
                }
 
-               mark_btree_node_locked(trans, path, 0, lock_want);
+               mark_btree_node_locked(path, 0, lock_want);
        }
 
        path->l[0].lock_seq     = ck->c.lock.state.seq;
@@ -366,7 +371,8 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
 
        bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
                             BTREE_ITER_SLOTS|
-                            BTREE_ITER_INTENT);
+                            BTREE_ITER_INTENT|
+                            BTREE_ITER_ALL_SNAPSHOTS);
        bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
                             BTREE_ITER_CACHED|
                             BTREE_ITER_CACHED_NOFILL|
index 5c6b758070e165c040214c1d15d85b675507f685..d599008c5fc18c6ac7d8480d5862c50e059a9a62 100644 (file)
@@ -58,8 +58,7 @@ static inline void mark_btree_node_unlocked(struct btree_path *path,
        path->nodes_intent_locked &= ~(1 << level);
 }
 
-static inline void mark_btree_node_locked(struct btree_trans *trans,
-                                         struct btree_path *path,
+static inline void mark_btree_node_locked(struct btree_path *path,
                                          unsigned level,
                                          enum six_lock_type type)
 {
@@ -69,19 +68,12 @@ static inline void mark_btree_node_locked(struct btree_trans *trans,
 
        path->nodes_locked |= 1 << level;
        path->nodes_intent_locked |= type << level;
-#ifdef CONFIG_BCACHEFS_DEBUG
-       path->ip_locked = _RET_IP_;
-       BUG_ON(trans->in_traverse_all &&
-              trans->traverse_all_idx != U8_MAX &&
-              path->sorted_idx > trans->paths[trans->traverse_all_idx].sorted_idx);
-#endif
 }
 
-static inline void mark_btree_node_intent_locked(struct btree_trans *trans,
-                                                struct btree_path *path,
+static inline void mark_btree_node_intent_locked(struct btree_path *path,
                                                 unsigned level)
 {
-       mark_btree_node_locked(trans, path, level, SIX_LOCK_intent);
+       mark_btree_node_locked(path, level, SIX_LOCK_intent);
 }
 
 static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level)
@@ -120,9 +112,6 @@ static inline void __bch2_btree_path_unlock(struct btree_path *path)
 
        while (path->nodes_locked)
                btree_node_unlock(path, __ffs(path->nodes_locked));
-#ifdef CONFIG_BCACHEFS_DEBUG
-       path->ip_locked = 0;
-#endif
 }
 
 static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type)
index ccf91ebd94aa15410928bbcb2d685e949f9f736b..7fcd2ceb51e93a4f9e119776f91c7afcf3b003dc 100644 (file)
@@ -209,6 +209,7 @@ struct btree_node_iter {
 #define BTREE_ITER_WITH_UPDATES                (1 << 10)
 #define __BTREE_ITER_ALL_SNAPSHOTS     (1 << 11)
 #define BTREE_ITER_ALL_SNAPSHOTS       (1 << 12)
+#define BTREE_ITER_FILTER_SNAPSHOTS    (1 << 13)
 
 enum btree_path_uptodate {
        BTREE_ITER_UPTODATE             = 0,
@@ -255,7 +256,6 @@ struct btree_path {
        }                       l[BTREE_MAX_DEPTH];
 #ifdef CONFIG_BCACHEFS_DEBUG
        unsigned long           ip_allocated;
-       unsigned long           ip_locked;
 #endif
 };
 
@@ -369,7 +369,6 @@ struct btree_trans {
        struct bpos             locking_pos;
        u8                      locking_btree_id;
        u8                      locking_level;
-       u8                      traverse_all_idx;
        pid_t                   pid;
 #endif
        unsigned long           ip;
@@ -607,7 +606,8 @@ static inline bool btree_node_is_extents(struct btree *b)
 
 #define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS               \
        ((1U << BKEY_TYPE_alloc)|                       \
-        (1U << BKEY_TYPE_stripes))
+        (1U << BKEY_TYPE_stripes)|                     \
+        (1U << BKEY_TYPE_snapshots))
 
 #define BTREE_NODE_TYPE_HAS_TRIGGERS                   \
        (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS|            \
@@ -654,7 +654,8 @@ enum btree_update_flags {
 
 #define BTREE_TRIGGER_WANTS_OLD_AND_NEW                \
        ((1U << KEY_TYPE_stripe)|               \
-        (1U << KEY_TYPE_inode))
+        (1U << KEY_TYPE_inode)|                \
+        (1U << KEY_TYPE_snapshot))
 
 static inline bool btree_node_type_needs_gc(enum btree_node_type type)
 {
@@ -671,11 +672,6 @@ struct btree_root {
        s8                      error;
 };
 
-/*
- * Optional hook that will be called just prior to a btree node update, when
- * we're holding the write lock and we know what key is about to be overwritten:
- */
-
 enum btree_insert_ret {
        BTREE_INSERT_OK,
        /* leaf node needs to be split */
@@ -696,8 +692,4 @@ enum btree_node_sibling {
        btree_next_sib,
 };
 
-typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *,
-                                                       struct btree *,
-                                                       struct btree_node_iter *);
-
 #endif /* _BCACHEFS_BTREE_TYPES_H */
index 23b73d3a172cf2ff71d6ad3f85acb1255dddea17..4d0ece342cf6270c24864dc08fe54c388617f510 100644 (file)
@@ -61,7 +61,7 @@ int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
                     struct disk_reservation *, u64 *, int flags);
 
 int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id,
-                                 struct bpos, struct bpos, u64 *);
+                                 struct bpos, struct bpos, unsigned, u64 *);
 int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
                            struct bpos, struct bpos, u64 *);
 
index a0da96737700b85f5bb82dcdba8b66c267649f78..f69f919d83ac91f0d81a541d8360cbbced57d461 100644 (file)
@@ -15,6 +15,7 @@
 #include "journal.h"
 #include "journal_reclaim.h"
 #include "keylist.h"
+#include "subvolume.h"
 #include "replicas.h"
 
 #include <linux/prefetch.h>
@@ -245,6 +246,11 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
        BUG_ON(i->cached        != i->path->cached);
        BUG_ON(i->level         != i->path->level);
        BUG_ON(i->btree_id      != i->path->btree_id);
+       EBUG_ON(!i->level &&
+               !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) &&
+               test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) &&
+               i->k->k.p.snapshot &&
+               bch2_snapshot_internal_node(trans->c, i->k->k.p.snapshot));
 }
 
 static noinline int
@@ -934,6 +940,43 @@ err:
        goto retry;
 }
 
+static int check_pos_snapshot_overwritten(struct btree_trans *trans,
+                                         enum btree_id id,
+                                         struct bpos pos)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
+
+       if (!snapshot_t(c, pos.snapshot)->children[0])
+               return 0;
+
+       bch2_trans_iter_init(trans, &iter, id, pos,
+                            BTREE_ITER_NOT_EXTENTS|
+                            BTREE_ITER_ALL_SNAPSHOTS);
+       while (1) {
+               k = bch2_btree_iter_prev(&iter);
+               ret = bkey_err(k);
+               if (ret)
+                       break;
+
+               if (!k.k)
+                       break;
+
+               if (bkey_cmp(pos, k.k->p))
+                       break;
+
+               if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
+                       ret = 1;
+                       break;
+               }
+       }
+       bch2_trans_iter_exit(trans, &iter);
+
+       return ret;
+}
+
 static int bch2_trans_update_extent(struct btree_trans *trans,
                                    struct btree_iter *orig_iter,
                                    struct bkey_i *insert,
@@ -958,6 +1001,28 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
                goto out;
 
        if (bch2_bkey_maybe_mergable(k.k, &insert->k)) {
+               /*
+                * We can't merge extents if they belong to interior snapshot
+                * tree nodes, and there's a snapshot in which one extent is
+                * visible and the other is not - i.e. if visibility is
+                * different.
+                *
+                * Instead of checking if visibilitiy of the two extents is
+                * different, for now we just check if either has been
+                * overwritten:
+                */
+               ret = check_pos_snapshot_overwritten(trans, btree_id, insert->k.p);
+               if (ret < 0)
+                       goto err;
+               if (ret)
+                       goto nomerge1;
+
+               ret = check_pos_snapshot_overwritten(trans, btree_id, k.k->p);
+               if (ret < 0)
+                       goto err;
+               if (ret)
+                       goto nomerge1;
+
                update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
                if ((ret = PTR_ERR_OR_ZERO(update)))
                        goto err;
@@ -973,22 +1038,26 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
                        goto next;
                }
        }
-
-       if (!bkey_cmp(k.k->p, bkey_start_pos(&insert->k)))
+nomerge1:
+       ret = 0;
+       if (!bkey_cmp(k.k->p, start))
                goto next;
 
        while (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) > 0) {
+               bool front_split = bkey_cmp(bkey_start_pos(k.k), start) < 0;
+               bool back_split  = bkey_cmp(k.k->p, insert->k.p) > 0;
+
                /*
                 * If we're going to be splitting a compressed extent, note it
                 * so that __bch2_trans_commit() can increase our disk
                 * reservation:
                 */
-               if (bkey_cmp(bkey_start_pos(k.k), start) < 0 &&
-                   bkey_cmp(k.k->p, insert->k.p) > 0 &&
+               if (((front_split && back_split) ||
+                    ((front_split || back_split) && k.k->p.snapshot != insert->k.p.snapshot)) &&
                    (compressed_sectors = bch2_bkey_sectors_compressed(k)))
                        trans->extra_journal_res += compressed_sectors;
 
-               if (bkey_cmp(bkey_start_pos(k.k), start) < 0) {
+               if (front_split) {
                        update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
                        if ((ret = PTR_ERR_OR_ZERO(update)))
                                goto err;
@@ -999,6 +1068,32 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
 
                        bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p,
                                             BTREE_ITER_NOT_EXTENTS|
+                                            BTREE_ITER_ALL_SNAPSHOTS|
+                                            BTREE_ITER_INTENT);
+                       ret   = bch2_btree_iter_traverse(&update_iter) ?:
+                               bch2_trans_update(trans, &update_iter, update,
+                                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+                                                 flags);
+                       bch2_trans_iter_exit(trans, &update_iter);
+
+                       if (ret)
+                               goto err;
+               }
+
+               if (k.k->p.snapshot != insert->k.p.snapshot &&
+                   (front_split || back_split)) {
+                       update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+                       if ((ret = PTR_ERR_OR_ZERO(update)))
+                               goto err;
+
+                       bkey_reassemble(update, k);
+
+                       bch2_cut_front(start, update);
+                       bch2_cut_back(insert->k.p, update);
+
+                       bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p,
+                                            BTREE_ITER_NOT_EXTENTS|
+                                            BTREE_ITER_ALL_SNAPSHOTS|
                                             BTREE_ITER_INTENT);
                        ret   = bch2_btree_iter_traverse(&update_iter) ?:
                                bch2_trans_update(trans, &update_iter, update,
@@ -1010,12 +1105,32 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
                }
 
                if (bkey_cmp(k.k->p, insert->k.p) <= 0) {
-                       ret = bch2_btree_delete_at(trans, &iter, flags);
+                       update = bch2_trans_kmalloc(trans, sizeof(*update));
+                       if ((ret = PTR_ERR_OR_ZERO(update)))
+                               goto err;
+
+                       bkey_init(&update->k);
+                       update->k.p = k.k->p;
+
+                       if (insert->k.p.snapshot != k.k->p.snapshot) {
+                               update->k.p.snapshot = insert->k.p.snapshot;
+                               update->k.type = KEY_TYPE_whiteout;
+                       }
+
+                       bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p,
+                                            BTREE_ITER_NOT_EXTENTS|
+                                            BTREE_ITER_INTENT);
+                       ret   = bch2_btree_iter_traverse(&update_iter) ?:
+                               bch2_trans_update(trans, &update_iter, update,
+                                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+                                                 flags);
+                       bch2_trans_iter_exit(trans, &update_iter);
+
                        if (ret)
                                goto err;
                }
 
-               if (bkey_cmp(k.k->p, insert->k.p) > 0) {
+               if (back_split) {
                        update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
                        if ((ret = PTR_ERR_OR_ZERO(update)))
                                goto err;
@@ -1023,10 +1138,15 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
                        bkey_reassemble(update, k);
                        bch2_cut_front(insert->k.p, update);
 
-                       ret = bch2_trans_update(trans, &iter, update, flags);
+                       bch2_trans_copy_iter(&update_iter, &iter);
+                       update_iter.pos = update->k.p;
+                       ret   = bch2_trans_update(trans, &update_iter, update,
+                                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+                                                 flags);
+                       bch2_trans_iter_exit(trans, &update_iter);
+
                        if (ret)
                                goto err;
-
                        goto out;
                }
 next:
@@ -1037,7 +1157,23 @@ next:
                        goto out;
        }
 
-       bch2_bkey_merge(c, bkey_i_to_s(insert), k);
+       if (bch2_bkey_maybe_mergable(&insert->k, k.k)) {
+               ret = check_pos_snapshot_overwritten(trans, btree_id, insert->k.p);
+               if (ret < 0)
+                       goto out;
+               if (ret)
+                       goto nomerge2;
+
+               ret = check_pos_snapshot_overwritten(trans, btree_id, k.k->p);
+               if (ret < 0)
+                       goto out;
+               if (ret)
+                       goto nomerge2;
+
+               bch2_bkey_merge(c, bkey_i_to_s(insert), k);
+       }
+nomerge2:
+       ret = 0;
 out:
        if (!bkey_deleted(&insert->k)) {
                /*
@@ -1057,6 +1193,39 @@ err:
        return ret;
 }
 
+/*
+ * When deleting, check if we need to emit a whiteout (because we're overwriting
+ * something in an ancestor snapshot)
+ */
+static int need_whiteout_for_snapshot(struct btree_trans *trans,
+                                     enum btree_id btree_id, struct bpos pos)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       u32 snapshot = pos.snapshot;
+       int ret;
+
+       if (!bch2_snapshot_parent(trans->c, pos.snapshot))
+               return 0;
+
+       pos.snapshot++;
+
+       for_each_btree_key(trans, iter, btree_id, pos,
+                          BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+               if (bkey_cmp(k.k->p, pos))
+                       break;
+
+               if (bch2_snapshot_is_ancestor(trans->c, snapshot,
+                                             k.k->p.snapshot)) {
+                       ret = !bkey_whiteout(k.k);
+                       break;
+               }
+       }
+       bch2_trans_iter_exit(trans, &iter);
+
+       return ret;
+}
+
 int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
                      struct bkey_i *k, enum btree_update_flags flags)
 {
@@ -1089,6 +1258,16 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
                       btree_insert_entry_cmp(i - 1, i) >= 0);
 #endif
 
+       if (bkey_deleted(&n.k->k) &&
+           (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) {
+               int ret = need_whiteout_for_snapshot(trans, n.btree_id, n.k->k.p);
+               if (unlikely(ret < 0))
+                       return ret;
+
+               if (ret)
+                       n.k->k.type = KEY_TYPE_whiteout;
+       }
+
        /*
         * Pending updates are kept sorted: first, find position of new update,
         * then delete/trim any updates the new update overwrites:
@@ -1175,13 +1354,14 @@ int bch2_btree_delete_at(struct btree_trans *trans,
 
 int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
                                  struct bpos start, struct bpos end,
+                                 unsigned iter_flags,
                                  u64 *journal_seq)
 {
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret = 0;
 
-       bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT);
+       bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT|iter_flags);
 retry:
        while ((bch2_trans_begin(trans),
               (k = bch2_btree_iter_peek(&iter)).k) &&
@@ -1248,5 +1428,5 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
                            u64 *journal_seq)
 {
        return bch2_trans_do(c, NULL, journal_seq, 0,
-                            bch2_btree_delete_range_trans(&trans, id, start, end, journal_seq));
+                            bch2_btree_delete_range_trans(&trans, id, start, end, 0, journal_seq));
 }
index df12416eff8ecb041c67b8e64574a4fc901d8264..5fd3aabb76692b45b74f2d317bbb951f9aa3939b 100644 (file)
@@ -16,6 +16,7 @@
 #include "movinggc.h"
 #include "reflink.h"
 #include "replicas.h"
+#include "subvolume.h"
 
 #include <linux/preempt.h>
 #include <trace/events/bcachefs.h>
@@ -1200,6 +1201,8 @@ static int bch2_mark_key_locked(struct bch_fs *c,
                return bch2_mark_reservation(c, old, new, journal_seq, flags);
        case KEY_TYPE_reflink_p:
                return bch2_mark_reflink_p(c, old, new, journal_seq, flags);
+       case KEY_TYPE_snapshot:
+               return bch2_mark_snapshot(c, old, new, journal_seq, flags);
        default:
                return 0;
        }
index 1d510f7728b6853bb89f6dd1bc60e25352f6273c..8653a106809df91683ceb25326ea61a7d3102983 100644 (file)
@@ -8,6 +8,7 @@
 #include "fs.h"
 #include "keylist.h"
 #include "str_hash.h"
+#include "subvolume.h"
 
 #include <linux/dcache.h>
 
@@ -99,7 +100,8 @@ const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k)
        if (memchr(d.v->d_name, '/', len))
                return "invalid name";
 
-       if (le64_to_cpu(d.v->d_inum) == d.k->p.inode)
+       if (d.v->d_type != DT_SUBVOL &&
+           le64_to_cpu(d.v->d_inum) == d.k->p.inode)
                return "dirent points to own directory";
 
        return NULL;
@@ -113,7 +115,7 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c,
        bch_scnmemcpy(out, d.v->d_name,
                      bch2_dirent_name_bytes(d));
        pr_buf(out, " -> %llu type %s", d.v->d_inum,
-              d.v->d_type < DT_MAX
+              d.v->d_type < BCH_DT_MAX
               ? bch2_d_types[d.v->d_type]
               : "(bad d_type)");
 }
@@ -149,8 +151,8 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
        return dirent;
 }
 
-int bch2_dirent_create(struct btree_trans *trans,
-                      u64 dir_inum, const struct bch_hash_info *hash_info,
+int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
+                      const struct bch_hash_info *hash_info,
                       u8 type, const struct qstr *name, u64 dst_inum,
                       u64 *dir_offset, int flags)
 {
@@ -163,7 +165,7 @@ int bch2_dirent_create(struct btree_trans *trans,
                return ret;
 
        ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
-                           dir_inum, &dirent->k_i, flags);
+                           dir, &dirent->k_i, flags);
        *dir_offset = dirent->k.p.offset;
 
        return ret;
@@ -176,22 +178,86 @@ static void dirent_copy_target(struct bkey_i_dirent *dst,
        dst->v.d_type = src.v->d_type;
 }
 
+int __bch2_dirent_read_target(struct btree_trans *trans,
+                             struct bkey_s_c_dirent d,
+                             u32 *subvol, u32 *snapshot, u64 *inum,
+                             bool is_fsck)
+{
+       int ret = 0;
+
+       *subvol         = 0;
+       *snapshot       = d.k->p.snapshot;
+
+       if (likely(d.v->d_type != DT_SUBVOL)) {
+               *inum = le64_to_cpu(d.v->d_inum);
+       } else {
+               struct btree_iter iter;
+               struct bkey_s_c k;
+               struct bkey_s_c_subvolume s;
+               int ret;
+
+               *subvol = le64_to_cpu(d.v->d_inum);
+               bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes,
+                                    POS(0, *subvol),
+                                    BTREE_ITER_CACHED);
+               k = bch2_btree_iter_peek_slot(&iter);
+               ret = bkey_err(k);
+               if (ret)
+                       goto err;
+
+               if (k.k->type != KEY_TYPE_subvolume) {
+                       ret = -ENOENT;
+                       goto err;
+               }
+
+               s = bkey_s_c_to_subvolume(k);
+               *snapshot       = le32_to_cpu(s.v->snapshot);
+               *inum           = le64_to_cpu(s.v->inode);
+err:
+               if (ret == -ENOENT && !is_fsck)
+                       bch2_fs_inconsistent(trans->c, "pointer to missing subvolume %u",
+                                            *subvol);
+
+               bch2_trans_iter_exit(trans, &iter);
+       }
+
+       return ret;
+}
+
+static int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
+                                  struct bkey_s_c_dirent d, subvol_inum *target)
+{
+       u32 snapshot;
+       int ret = 0;
+
+       ret = __bch2_dirent_read_target(trans, d, &target->subvol, &snapshot,
+                                       &target->inum, false);
+       if (!target->subvol)
+               target->subvol = dir.subvol;
+
+       return ret;
+}
+
 int bch2_dirent_rename(struct btree_trans *trans,
-                      u64 src_dir, struct bch_hash_info *src_hash,
-                      u64 dst_dir, struct bch_hash_info *dst_hash,
-                      const struct qstr *src_name, u64 *src_inum, u64 *src_offset,
-                      const struct qstr *dst_name, u64 *dst_inum, u64 *dst_offset,
-                      enum bch_rename_mode mode)
+               subvol_inum src_dir, struct bch_hash_info *src_hash,
+               subvol_inum dst_dir, struct bch_hash_info *dst_hash,
+               const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset,
+               const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset,
+               enum bch_rename_mode mode)
 {
        struct btree_iter src_iter = { NULL };
        struct btree_iter dst_iter = { NULL };
        struct bkey_s_c old_src, old_dst;
        struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
        struct bpos dst_pos =
-               POS(dst_dir, bch2_dirent_hash(dst_hash, dst_name));
+               POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name));
        int ret = 0;
 
-       *src_inum = *dst_inum = 0;
+       if (src_dir.subvol != dst_dir.subvol)
+               return -EXDEV;
+
+       memset(src_inum, 0, sizeof(*src_inum));
+       memset(dst_inum, 0, sizeof(*dst_inum));
 
        /*
         * Lookup dst:
@@ -214,8 +280,12 @@ int bch2_dirent_rename(struct btree_trans *trans,
        if (ret)
                goto out;
 
-       if (mode != BCH_RENAME)
-               *dst_inum = le64_to_cpu(bkey_s_c_to_dirent(old_dst).v->d_inum);
+       if (mode != BCH_RENAME) {
+               ret = bch2_dirent_read_target(trans, dst_dir,
+                               bkey_s_c_to_dirent(old_dst), dst_inum);
+               if (ret)
+                       goto out;
+       }
        if (mode != BCH_RENAME_EXCHANGE)
                *src_offset = dst_iter.pos.offset;
 
@@ -231,7 +301,10 @@ int bch2_dirent_rename(struct btree_trans *trans,
        if (ret)
                goto out;
 
-       *src_inum = le64_to_cpu(bkey_s_c_to_dirent(old_src).v->d_inum);
+       ret = bch2_dirent_read_target(trans, src_dir,
+                       bkey_s_c_to_dirent(old_src), src_inum);
+       if (ret)
+               goto out;
 
        /* Create new dst key: */
        new_dst = dirent_create_key(trans, 0, dst_name, 0);
@@ -310,63 +383,79 @@ out:
        return ret;
 }
 
-int bch2_dirent_delete_at(struct btree_trans *trans,
-                         const struct bch_hash_info *hash_info,
-                         struct btree_iter *iter)
-{
-       return bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
-                                  hash_info, iter);
-}
-
 int __bch2_dirent_lookup_trans(struct btree_trans *trans,
                               struct btree_iter *iter,
-                              u64 dir_inum,
+                              subvol_inum dir,
                               const struct bch_hash_info *hash_info,
-                              const struct qstr *name, unsigned flags)
+                              const struct qstr *name, subvol_inum *inum,
+                              unsigned flags)
 {
-       return bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
-                               hash_info, dir_inum, name, flags);
+       struct bkey_s_c k;
+       struct bkey_s_c_dirent d;
+       u32 snapshot;
+       int ret;
+
+       ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
+       if (ret)
+               return ret;
+
+       ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
+                              hash_info, dir, name, flags);
+       if (ret)
+               return ret;
+
+       k = bch2_btree_iter_peek_slot(iter);
+       ret = bkey_err(k);
+       if (ret) {
+               bch2_trans_iter_exit(trans, iter);
+               return ret;
+       }
+
+       d = bkey_s_c_to_dirent(k);
+
+       ret = bch2_dirent_read_target(trans, dir, d, inum);
+       if (ret)
+               bch2_trans_iter_exit(trans, iter);
+
+       return ret;
 }
 
-u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum,
+u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
                       const struct bch_hash_info *hash_info,
-                      const struct qstr *name)
+                      const struct qstr *name, subvol_inum *inum)
 {
        struct btree_trans trans;
        struct btree_iter iter;
-       struct bkey_s_c k;
-       u64 inum = 0;
-       int ret = 0;
+       int ret;
 
        bch2_trans_init(&trans, c, 0, 0);
+retry:
+       bch2_trans_begin(&trans);
 
-       ret = __bch2_dirent_lookup_trans(&trans, &iter, dir_inum,
-                                        hash_info, name, 0);
-       if (ret)
-               goto out;
-
-       k = bch2_btree_iter_peek_slot(&iter);
-       ret = bkey_err(k);
-       if (ret)
-               goto out;
+       ret = __bch2_dirent_lookup_trans(&trans, &iter, dir, hash_info,
+                                         name, inum, 0);
 
-       inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum);
        bch2_trans_iter_exit(&trans, &iter);
-out:
-       BUG_ON(ret == -EINTR);
+       if (ret == -EINTR)
+               goto retry;
        bch2_trans_exit(&trans);
-       return inum;
+       return ret;
 }
 
-int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum)
+int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir)
 {
        struct btree_iter iter;
        struct bkey_s_c k;
+       u32 snapshot;
        int ret;
 
+       ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
+       if (ret)
+               return ret;
+
        for_each_btree_key(trans, iter, BTREE_ID_dirents,
-                          POS(dir_inum, 0), 0, k, ret) {
-               if (k.k->p.inode > dir_inum)
+                          SPOS(dir.inum, 0, snapshot), 0, k, ret) {
+               if (k.k->p.inode > dir.inum)
                        break;
 
                if (k.k->type == KEY_TYPE_dirent) {
@@ -379,19 +468,26 @@ int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum)
        return ret;
 }
 
-int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx)
+int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
 {
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_s_c_dirent dirent;
+       u32 snapshot;
        int ret;
 
        bch2_trans_init(&trans, c, 0, 0);
+retry:
+       bch2_trans_begin(&trans);
+
+       ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+       if (ret)
+               goto err;
 
        for_each_btree_key(&trans, iter, BTREE_ID_dirents,
-                          POS(inum, ctx->pos), 0, k, ret) {
-               if (k.k->p.inode > inum)
+                          SPOS(inum.inum, ctx->pos, snapshot), 0, k, ret) {
+               if (k.k->p.inode > inum.inum)
                        break;
 
                if (k.k->type != KEY_TYPE_dirent)
@@ -407,11 +503,14 @@ int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx)
                if (!dir_emit(ctx, dirent.v->d_name,
                              bch2_dirent_name_bytes(dirent),
                              le64_to_cpu(dirent.v->d_inum),
-                             dirent.v->d_type))
+                             vfs_d_type(dirent.v->d_type)))
                        break;
                ctx->pos = dirent.k->p.offset + 1;
        }
        bch2_trans_iter_exit(&trans, &iter);
+err:
+       if (ret == -EINTR)
+               goto retry;
 
        ret = bch2_trans_exit(&trans) ?: ret;
 
index c14f6029e1c98b7064440eab94564079f00a11bf..e7f65fbd8e65fa984050619058388f25bfca0404 100644 (file)
@@ -29,13 +29,17 @@ static inline unsigned dirent_val_u64s(unsigned len)
                            sizeof(u64));
 }
 
-int bch2_dirent_create(struct btree_trans *, u64,
+int bch2_dirent_create(struct btree_trans *, subvol_inum,
                       const struct bch_hash_info *, u8,
                       const struct qstr *, u64, u64 *, int);
 
-int bch2_dirent_delete_at(struct btree_trans *,
-                         const struct bch_hash_info *,
-                         struct btree_iter *);
+int __bch2_dirent_read_target(struct btree_trans *, struct bkey_s_c_dirent,
+                             u32 *, u32 *, u64 *, bool);
+
+static inline unsigned vfs_d_type(unsigned type)
+{
+       return type == DT_SUBVOL ? DT_DIR : type;
+}
 
 enum bch_rename_mode {
        BCH_RENAME,
@@ -44,19 +48,20 @@ enum bch_rename_mode {
 };
 
 int bch2_dirent_rename(struct btree_trans *,
-                      u64, struct bch_hash_info *,
-                      u64, struct bch_hash_info *,
-                      const struct qstr *, u64 *, u64 *,
-                      const struct qstr *, u64 *, u64 *,
+                      subvol_inum, struct bch_hash_info *,
+                      subvol_inum, struct bch_hash_info *,
+                      const struct qstr *, subvol_inum *, u64 *,
+                      const struct qstr *, subvol_inum *, u64 *,
                       enum bch_rename_mode);
 
-int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, u64,
-                          const struct bch_hash_info *,
-                          const struct qstr *, unsigned);
-u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *,
-                      const struct qstr *);
+int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *,
+                              subvol_inum, const struct bch_hash_info *,
+                              const struct qstr *, subvol_inum *, unsigned);
+u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum,
+                      const struct bch_hash_info *,
+                      const struct qstr *, subvol_inum *);
 
-int bch2_empty_dir_trans(struct btree_trans *, u64);
-int bch2_readdir(struct bch_fs *, u64, struct dir_context *);
+int bch2_empty_dir_trans(struct btree_trans *, subvol_inum);
+int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *);
 
 #endif /* _BCACHEFS_DIRENT_H */
index f66640c2a5edd73ad8c17059caed61df37a241b4..6c2eed77a3267da7fc70ca5df1a2c40a8accf17c 100644 (file)
@@ -612,38 +612,6 @@ bool bch2_bkey_is_incompressible(struct bkey_s_c k)
        return false;
 }
 
-bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
-                               unsigned nr_replicas, bool compressed)
-{
-       struct btree_trans trans;
-       struct btree_iter iter;
-       struct bpos end = pos;
-       struct bkey_s_c k;
-       bool ret = true;
-       int err;
-
-       end.offset += size;
-
-       bch2_trans_init(&trans, c, 0, 0);
-
-       for_each_btree_key(&trans, iter, BTREE_ID_extents, pos,
-                          BTREE_ITER_SLOTS, k, err) {
-               if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
-                       break;
-
-               if (nr_replicas > bch2_bkey_replicas(c, k) ||
-                   (!compressed && bch2_bkey_sectors_compressed(k))) {
-                       ret = false;
-                       break;
-               }
-       }
-       bch2_trans_iter_exit(&trans, &iter);
-
-       bch2_trans_exit(&trans);
-
-       return ret;
-}
-
 unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
 {
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
index 43cef0a3bdf3870f47afc849f5f54700e3664824..afd3067bb64eb83be7c16f954094f9c148c72be7 100644 (file)
@@ -567,7 +567,6 @@ unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
 unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c);
 bool bch2_bkey_is_incompressible(struct bkey_s_c);
 unsigned bch2_bkey_sectors_compressed(struct bkey_s_c);
-bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned, bool);
 
 unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c);
 unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
index 6bc82559c9b17e64866a3f11a74d9e246320b011..3e8e3c5bf87038db8252608b80a200116ec74080 100644 (file)
 #include "dirent.h"
 #include "fs-common.h"
 #include "inode.h"
+#include "subvolume.h"
 #include "xattr.h"
 
 #include <linux/posix_acl.h>
 
-int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
+static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode)
+{
+       return S_ISDIR(inode->bi_mode) && !inode->bi_subvol;
+}
+
+int bch2_create_trans(struct btree_trans *trans,
+                     subvol_inum dir,
                      struct bch_inode_unpacked *dir_u,
                      struct bch_inode_unpacked *new_inode,
                      const struct qstr *name,
                      uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
                      struct posix_acl *default_acl,
-                     struct posix_acl *acl)
+                     struct posix_acl *acl,
+                     subvol_inum snapshot_src,
+                     unsigned flags)
 {
        struct bch_fs *c = trans->c;
        struct btree_iter dir_iter = { NULL };
        struct btree_iter inode_iter = { NULL };
-       struct bch_hash_info hash = bch2_hash_info_init(c, new_inode);
+       subvol_inum new_inum = dir;
        u64 now = bch2_current_time(c);
        u64 cpu = raw_smp_processor_id();
-       u64 dir_offset = 0;
+       u64 dir_target;
+       u32 snapshot;
+       unsigned dir_type = mode_to_type(mode);
        int ret;
 
-       ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir_inum, BTREE_ITER_INTENT);
+       ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
        if (ret)
                goto err;
 
-       bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
-
-       if (!name)
-               new_inode->bi_flags |= BCH_INODE_UNLINKED;
-
-       ret = bch2_inode_create(trans, &inode_iter, new_inode, U32_MAX, cpu);
+       ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
        if (ret)
                goto err;
 
-       if (default_acl) {
-               ret = bch2_set_acl_trans(trans, new_inode, &hash,
-                                        default_acl, ACL_TYPE_DEFAULT);
+       if (!(flags & BCH_CREATE_SNAPSHOT)) {
+               /* Normal create path - allocate a new inode: */
+               bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
+
+               if (flags & BCH_CREATE_TMPFILE)
+                       new_inode->bi_flags |= BCH_INODE_UNLINKED;
+
+               ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu);
                if (ret)
                        goto err;
+
+               snapshot_src = (subvol_inum) { 0 };
+       } else {
+               /*
+                * Creating a snapshot - we're not allocating a new inode, but
+                * we do have to lookup the root inode of the subvolume we're
+                * snapshotting and update it (in the new snapshot):
+                */
+
+               if (!snapshot_src.inum) {
+                       /* Inode wasn't specified, just snapshot: */
+                       struct btree_iter subvol_iter;
+                       struct bkey_s_c k;
+
+                       bch2_trans_iter_init(trans, &subvol_iter, BTREE_ID_subvolumes,
+                                            POS(0, snapshot_src.subvol), 0);
+                       k = bch2_btree_iter_peek_slot(&subvol_iter);
+
+                       ret = bkey_err(k);
+                       if (!ret && k.k->type != KEY_TYPE_subvolume) {
+                               bch_err(c, "subvolume %u not found",
+                                       snapshot_src.subvol);
+                               ret = -ENOENT;
+                       }
+
+                       if (!ret)
+                               snapshot_src.inum = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode);
+                       bch2_trans_iter_exit(trans, &subvol_iter);
+
+                       if (ret)
+                               goto err;
+               }
+
+               ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src,
+                                     BTREE_ITER_INTENT);
+               if (ret)
+                       goto err;
+
+               if (new_inode->bi_subvol != snapshot_src.subvol) {
+                       /* Not a subvolume root: */
+                       ret = -EINVAL;
+                       goto err;
+               }
+
+               /*
+                * If we're not root, we have to own the subvolume being
+                * snapshotted:
+                */
+               if (uid && new_inode->bi_uid != uid) {
+                       ret = -EPERM;
+                       goto err;
+               }
+
+               flags |= BCH_CREATE_SUBVOL;
        }
 
-       if (acl) {
-               ret = bch2_set_acl_trans(trans, new_inode, &hash,
-                                        acl, ACL_TYPE_ACCESS);
+       new_inum.inum   = new_inode->bi_inum;
+       dir_target      = new_inode->bi_inum;
+
+       if (flags & BCH_CREATE_SUBVOL) {
+               u32 new_subvol, dir_snapshot;
+
+               ret = bch2_subvolume_create(trans, new_inode->bi_inum,
+                                           snapshot_src.subvol,
+                                           &new_subvol, &snapshot,
+                                           (flags & BCH_CREATE_SNAPSHOT_RO) != 0);
                if (ret)
                        goto err;
+
+               new_inode->bi_parent_subvol     = dir.subvol;
+               new_inode->bi_subvol            = new_subvol;
+               new_inum.subvol                 = new_subvol;
+               dir_target                      = new_subvol;
+               dir_type                        = DT_SUBVOL;
+
+               ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &dir_snapshot);
+               if (ret)
+                       goto err;
+
+               bch2_btree_iter_set_snapshot(&dir_iter, dir_snapshot);
+               ret = bch2_btree_iter_traverse(&dir_iter);
+               if (ret)
+                       goto err;
+       }
+
+       if (!(flags & BCH_CREATE_SNAPSHOT)) {
+               if (default_acl) {
+                       ret = bch2_set_acl_trans(trans, new_inum, new_inode,
+                                                default_acl, ACL_TYPE_DEFAULT);
+                       if (ret)
+                               goto err;
+               }
+
+               if (acl) {
+                       ret = bch2_set_acl_trans(trans, new_inum, new_inode,
+                                                acl, ACL_TYPE_ACCESS);
+                       if (ret)
+                               goto err;
+               }
        }
 
-       if (name) {
+       if (!(flags & BCH_CREATE_TMPFILE)) {
                struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u);
-               dir_u->bi_mtime = dir_u->bi_ctime = now;
+               u64 dir_offset;
 
-               if (S_ISDIR(new_inode->bi_mode))
+               if (is_subdir_for_nlink(new_inode))
                        dir_u->bi_nlink++;
+               dir_u->bi_mtime = dir_u->bi_ctime = now;
 
                ret = bch2_inode_write(trans, &dir_iter, dir_u);
                if (ret)
                        goto err;
 
-               ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
-                                        mode_to_type(new_inode->bi_mode),
-                                        name, new_inode->bi_inum,
+               ret = bch2_dirent_create(trans, dir, &dir_hash,
+                                        dir_type,
+                                        name,
+                                        dir_target,
                                         &dir_offset,
                                         BCH_HASH_SET_MUST_CREATE);
                if (ret)
                        goto err;
-       }
 
-       if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
-               new_inode->bi_dir               = dir_u->bi_inum;
-               new_inode->bi_dir_offset        = dir_offset;
+               if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
+                       new_inode->bi_dir               = dir_u->bi_inum;
+                       new_inode->bi_dir_offset        = dir_offset;
+               }
        }
 
-       /* XXX use bch2_btree_iter_set_snapshot() */
-       inode_iter.snapshot = U32_MAX;
-       bch2_btree_iter_set_pos(&inode_iter, SPOS(0, new_inode->bi_inum, U32_MAX));
+       inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
+       bch2_btree_iter_set_snapshot(&inode_iter, snapshot);
 
        ret   = bch2_btree_iter_traverse(&inode_iter) ?:
                bch2_inode_write(trans, &inode_iter, new_inode);
@@ -91,9 +195,10 @@ err:
        return ret;
 }
 
-int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
-                   u64 inum, struct bch_inode_unpacked *dir_u,
-                   struct bch_inode_unpacked *inode_u, const struct qstr *name)
+int bch2_link_trans(struct btree_trans *trans,
+                   subvol_inum dir,  struct bch_inode_unpacked *dir_u,
+                   subvol_inum inum, struct bch_inode_unpacked *inode_u,
+                   const struct qstr *name)
 {
        struct bch_fs *c = trans->c;
        struct btree_iter dir_iter = { NULL };
@@ -103,6 +208,9 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
        u64 dir_offset = 0;
        int ret;
 
+       if (dir.subvol != inum.subvol)
+               return -EXDEV;
+
        ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT);
        if (ret)
                goto err;
@@ -110,7 +218,7 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
        inode_u->bi_ctime = now;
        bch2_inode_nlink_inc(inode_u);
 
-       ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir_inum, BTREE_ITER_INTENT);
+       ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
        if (ret)
                goto err;
 
@@ -118,15 +226,15 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
 
        dir_hash = bch2_hash_info_init(c, dir_u);
 
-       ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
+       ret = bch2_dirent_create(trans, dir, &dir_hash,
                                 mode_to_type(inode_u->bi_mode),
-                                name, inum, &dir_offset,
+                                name, inum.inum, &dir_offset,
                                 BCH_HASH_SET_MUST_CREATE);
        if (ret)
                goto err;
 
        if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
-               inode_u->bi_dir         = dir_inum;
+               inode_u->bi_dir         = dir.inum;
                inode_u->bi_dir_offset  = dir_offset;
        }
 
@@ -139,55 +247,83 @@ err:
 }
 
 int bch2_unlink_trans(struct btree_trans *trans,
-                     u64 dir_inum, struct bch_inode_unpacked *dir_u,
+                     subvol_inum dir,
+                     struct bch_inode_unpacked *dir_u,
                      struct bch_inode_unpacked *inode_u,
-                     const struct qstr *name)
+                     const struct qstr *name,
+                     int deleting_snapshot)
 {
        struct bch_fs *c = trans->c;
        struct btree_iter dir_iter = { NULL };
        struct btree_iter dirent_iter = { NULL };
        struct btree_iter inode_iter = { NULL };
        struct bch_hash_info dir_hash;
-       u64 inum, now = bch2_current_time(c);
+       subvol_inum inum;
+       u64 now = bch2_current_time(c);
        struct bkey_s_c k;
        int ret;
 
-       ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir_inum, BTREE_ITER_INTENT);
+       ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
        if (ret)
                goto err;
 
        dir_hash = bch2_hash_info_init(c, dir_u);
 
-       ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir_inum, &dir_hash,
-                                        name, BTREE_ITER_INTENT);
+       ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
+                                        name, &inum, BTREE_ITER_INTENT);
        if (ret)
                goto err;
 
-       k = bch2_btree_iter_peek_slot(&dirent_iter);
-       ret = bkey_err(k);
+       ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum,
+                             BTREE_ITER_INTENT);
        if (ret)
                goto err;
 
-       inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum);
-
-       ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT);
-       if (ret)
+       if (deleting_snapshot == 1 && !inode_u->bi_subvol) {
+               ret = -ENOENT;
                goto err;
+       }
+
+       if (deleting_snapshot <= 0 && S_ISDIR(inode_u->bi_mode)) {
+               ret = bch2_empty_dir_trans(trans, inum);
+               if (ret)
+                       goto err;
+       }
+
+       if (inode_u->bi_subvol) {
+               ret = bch2_subvolume_delete(trans, inode_u->bi_subvol,
+                                           deleting_snapshot);
+               if (ret)
+                       goto err;
+
+               k = bch2_btree_iter_peek_slot(&dirent_iter);
+               ret = bkey_err(k);
+               if (ret)
+                       goto err;
+
+               /*
+                * If we're deleting a subvolume, we need to really delete the
+                * dirent, not just emit a whiteout in the current snapshot:
+                */
+               bch2_btree_iter_set_snapshot(&dirent_iter, k.k->p.snapshot);
+               ret = bch2_btree_iter_traverse(&dirent_iter);
+               if (ret)
+                       goto err;
+       }
 
-       if (inode_u->bi_dir             == k.k->p.inode &&
-           inode_u->bi_dir_offset      == k.k->p.offset) {
+       if (inode_u->bi_dir             == dirent_iter.pos.inode &&
+           inode_u->bi_dir_offset      == dirent_iter.pos.offset) {
                inode_u->bi_dir         = 0;
                inode_u->bi_dir_offset  = 0;
        }
 
        dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now;
-       dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode);
+       dir_u->bi_nlink -= is_subdir_for_nlink(inode_u);
        bch2_inode_nlink_dec(inode_u);
 
-       ret =   (S_ISDIR(inode_u->bi_mode)
-                ? bch2_empty_dir_trans(trans, inum)
-                : 0) ?:
-               bch2_dirent_delete_at(trans, &dir_hash, &dirent_iter) ?:
+       ret =   bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
+                                   &dir_hash, &dirent_iter,
+                                   BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
                bch2_inode_write(trans, &dir_iter, dir_u) ?:
                bch2_inode_write(trans, &inode_iter, inode_u);
 err:
@@ -222,8 +358,8 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u,
 }
 
 int bch2_rename_trans(struct btree_trans *trans,
-                     u64 src_dir, struct bch_inode_unpacked *src_dir_u,
-                     u64 dst_dir, struct bch_inode_unpacked *dst_dir_u,
+                     subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u,
+                     subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u,
                      struct bch_inode_unpacked *src_inode_u,
                      struct bch_inode_unpacked *dst_inode_u,
                      const struct qstr *src_name,
@@ -236,7 +372,8 @@ int bch2_rename_trans(struct btree_trans *trans,
        struct btree_iter src_inode_iter = { NULL };
        struct btree_iter dst_inode_iter = { NULL };
        struct bch_hash_info src_hash, dst_hash;
-       u64 src_inode, src_offset, dst_inode, dst_offset;
+       subvol_inum src_inum, dst_inum;
+       u64 src_offset, dst_offset;
        u64 now = bch2_current_time(c);
        int ret;
 
@@ -247,7 +384,8 @@ int bch2_rename_trans(struct btree_trans *trans,
 
        src_hash = bch2_hash_info_init(c, src_dir_u);
 
-       if (dst_dir != src_dir) {
+       if (dst_dir.inum        != src_dir.inum ||
+           dst_dir.subvol      != src_dir.subvol) {
                ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir,
                                      BTREE_ITER_INTENT);
                if (ret)
@@ -262,19 +400,19 @@ int bch2_rename_trans(struct btree_trans *trans,
        ret = bch2_dirent_rename(trans,
                                 src_dir, &src_hash,
                                 dst_dir, &dst_hash,
-                                src_name, &src_inode, &src_offset,
-                                dst_name, &dst_inode, &dst_offset,
+                                src_name, &src_inum, &src_offset,
+                                dst_name, &dst_inum, &dst_offset,
                                 mode);
        if (ret)
                goto err;
 
-       ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inode,
+       ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum,
                              BTREE_ITER_INTENT);
        if (ret)
                goto err;
 
-       if (dst_inode) {
-               ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inode,
+       if (dst_inum.inum) {
+               ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum,
                                      BTREE_ITER_INTENT);
                if (ret)
                        goto err;
@@ -305,7 +443,7 @@ int bch2_rename_trans(struct btree_trans *trans,
                }
 
                if (S_ISDIR(dst_inode_u->bi_mode) &&
-                   bch2_empty_dir_trans(trans, dst_inode)) {
+                   bch2_empty_dir_trans(trans, dst_inum)) {
                        ret = -ENOTEMPTY;
                        goto err;
                }
@@ -324,12 +462,12 @@ int bch2_rename_trans(struct btree_trans *trans,
                goto err;
        }
 
-       if (S_ISDIR(src_inode_u->bi_mode)) {
+       if (is_subdir_for_nlink(src_inode_u)) {
                src_dir_u->bi_nlink--;
                dst_dir_u->bi_nlink++;
        }
 
-       if (dst_inode && S_ISDIR(dst_inode_u->bi_mode)) {
+       if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) {
                dst_dir_u->bi_nlink--;
                src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE;
        }
@@ -340,22 +478,22 @@ int bch2_rename_trans(struct btree_trans *trans,
        src_dir_u->bi_mtime             = now;
        src_dir_u->bi_ctime             = now;
 
-       if (src_dir != dst_dir) {
+       if (src_dir.inum != dst_dir.inum) {
                dst_dir_u->bi_mtime     = now;
                dst_dir_u->bi_ctime     = now;
        }
 
        src_inode_u->bi_ctime           = now;
 
-       if (dst_inode)
+       if (dst_inum.inum)
                dst_inode_u->bi_ctime   = now;
 
        ret =   bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?:
-               (src_dir != dst_dir
+               (src_dir.inum != dst_dir.inum
                 ? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u)
                 : 0 ) ?:
                bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?:
-               (dst_inode
+               (dst_inum.inum
                 ? bch2_inode_write(trans, &dst_inode_iter, dst_inode_u)
                 : 0 );
 err:
index 2273b7961c9be6ab2d8960fda7f86068d4e44cad..9bb0a9676147216aea603aeb11493c139699ac49 100644 (file)
@@ -4,27 +4,33 @@
 
 struct posix_acl;
 
-int bch2_create_trans(struct btree_trans *, u64,
+#define BCH_CREATE_TMPFILE             (1U << 0)
+#define BCH_CREATE_SUBVOL              (1U << 1)
+#define BCH_CREATE_SNAPSHOT            (1U << 2)
+#define BCH_CREATE_SNAPSHOT_RO         (1U << 3)
+
+int bch2_create_trans(struct btree_trans *, subvol_inum,
                      struct bch_inode_unpacked *,
                      struct bch_inode_unpacked *,
                      const struct qstr *,
                      uid_t, gid_t, umode_t, dev_t,
                      struct posix_acl *,
-                     struct posix_acl *);
+                     struct posix_acl *,
+                     subvol_inum, unsigned);
 
-int bch2_link_trans(struct btree_trans *, u64,
-                   u64, struct bch_inode_unpacked *,
-                   struct bch_inode_unpacked *,
+int bch2_link_trans(struct btree_trans *,
+                   subvol_inum, struct bch_inode_unpacked *,
+                   subvol_inum, struct bch_inode_unpacked *,
                    const struct qstr *);
 
-int bch2_unlink_trans(struct btree_trans *,
-                     u64, struct bch_inode_unpacked *,
+int bch2_unlink_trans(struct btree_trans *, subvol_inum,
+                     struct bch_inode_unpacked *,
                      struct bch_inode_unpacked *,
-                     const struct qstr *);
+                     const struct qstr *, int);
 
 int bch2_rename_trans(struct btree_trans *,
-                     u64, struct bch_inode_unpacked *,
-                     u64, struct bch_inode_unpacked *,
+                     subvol_inum, struct bch_inode_unpacked *,
+                     subvol_inum, struct bch_inode_unpacked *,
                      struct bch_inode_unpacked *,
                      struct bch_inode_unpacked *,
                      const struct qstr *,
index 2921037713d12677d1e02ba7d2151411469aa64e..c07755c6916d2a6b010679e55b13cd35f8bc4713 100644 (file)
@@ -786,23 +786,35 @@ static void readpage_bio_extend(struct readpages_iter *iter,
        }
 }
 
-static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter,
-                      struct bch_read_bio *rbio, u64 inum,
+static void bchfs_read(struct btree_trans *trans,
+                      struct bch_read_bio *rbio,
+                      subvol_inum inum,
                       struct readpages_iter *readpages_iter)
 {
        struct bch_fs *c = trans->c;
+       struct btree_iter iter;
        struct bkey_buf sk;
        int flags = BCH_READ_RETRY_IF_STALE|
                BCH_READ_MAY_PROMOTE;
+       u32 snapshot;
        int ret = 0;
 
        rbio->c = c;
        rbio->start_time = local_clock();
+       rbio->subvol = inum.subvol;
 
        bch2_bkey_buf_init(&sk);
 retry:
        bch2_trans_begin(trans);
+       iter = (struct btree_iter) { NULL };
 
+       ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+       if (ret)
+               goto err;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+                            SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot),
+                            BTREE_ITER_SLOTS|BTREE_ITER_FILTER_SNAPSHOTS);
        while (1) {
                struct bkey_s_c k;
                unsigned bytes, sectors, offset_into_extent;
@@ -817,15 +829,15 @@ retry:
                        break;
                }
 
-               bch2_btree_iter_set_pos(iter,
-                               POS(inum, rbio->bio.bi_iter.bi_sector));
+               bch2_btree_iter_set_pos(&iter,
+                               POS(inum.inum, rbio->bio.bi_iter.bi_sector));
 
-               k = bch2_btree_iter_peek_slot(iter);
+               k = bch2_btree_iter_peek_slot(&iter);
                ret = bkey_err(k);
                if (ret)
                        break;
 
-               offset_into_extent = iter->pos.offset -
+               offset_into_extent = iter.pos.offset -
                        bkey_start_offset(k.k);
                sectors = k.k->size - offset_into_extent;
 
@@ -855,7 +867,7 @@ retry:
                if (bkey_extent_is_allocation(k.k))
                        bch2_add_page_sectors(&rbio->bio, k);
 
-               bch2_read_extent(trans, rbio, iter->pos,
+               bch2_read_extent(trans, rbio, iter.pos,
                                 data_btree, k, offset_into_extent, flags);
 
                if (flags & BCH_READ_LAST_FRAGMENT)
@@ -864,12 +876,14 @@ retry:
                swap(rbio->bio.bi_iter.bi_size, bytes);
                bio_advance(&rbio->bio, bytes);
        }
+err:
+       bch2_trans_iter_exit(trans, &iter);
 
        if (ret == -EINTR)
                goto retry;
 
        if (ret) {
-               bch_err_inum_ratelimited(c, inum,
+               bch_err_inum_ratelimited(c, inum.inum,
                                "read error %i from btree lookup", ret);
                rbio->bio.bi_status = BLK_STS_IOERR;
                bio_endio(&rbio->bio);
@@ -884,7 +898,6 @@ void bch2_readahead(struct readahead_control *ractl)
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
        struct btree_trans trans;
-       struct btree_iter iter;
        struct page *page;
        struct readpages_iter readpages_iter;
        int ret;
@@ -893,8 +906,6 @@ void bch2_readahead(struct readahead_control *ractl)
        BUG_ON(ret);
 
        bch2_trans_init(&trans, c, 0, 0);
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS_MIN,
-                            BTREE_ITER_SLOTS);
 
        bch2_pagecache_add_get(&inode->ei_pagecache_lock);
 
@@ -915,22 +926,20 @@ void bch2_readahead(struct readahead_control *ractl)
                rbio->bio.bi_end_io = bch2_readpages_end_io;
                BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));
 
-               bchfs_read(&trans, &iter, rbio, inode->v.i_ino,
+               bchfs_read(&trans, rbio, inode_inum(inode),
                           &readpages_iter);
        }
 
        bch2_pagecache_add_put(&inode->ei_pagecache_lock);
 
-       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        kfree(readpages_iter.pages);
 }
 
 static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
-                            u64 inum, struct page *page)
+                            subvol_inum inum, struct page *page)
 {
        struct btree_trans trans;
-       struct btree_iter iter;
 
        bch2_page_state_create(page, __GFP_NOFAIL);
 
@@ -940,12 +949,7 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
        BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));
 
        bch2_trans_init(&trans, c, 0, 0);
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS_MIN,
-                            BTREE_ITER_SLOTS);
-
-       bchfs_read(&trans, &iter, rbio, inum, NULL);
-
-       bch2_trans_iter_exit(&trans, &iter);
+       bchfs_read(&trans, rbio, inum, NULL);
        bch2_trans_exit(&trans);
 }
 
@@ -959,7 +963,7 @@ int bch2_readpage(struct file *file, struct page *page)
        rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), opts);
        rbio->bio.bi_end_io = bch2_readpages_end_io;
 
-       __bchfs_readpage(c, rbio, inode->v.i_ino, page);
+       __bchfs_readpage(c, rbio, inode_inum(inode), page);
        return 0;
 }
 
@@ -982,7 +986,7 @@ static int bch2_read_single_page(struct page *page,
        rbio->bio.bi_private = &done;
        rbio->bio.bi_end_io = bch2_read_single_page_end_io;
 
-       __bchfs_readpage(c, rbio, inode->v.i_ino, page);
+       __bchfs_readpage(c, rbio, inode_inum(inode), page);
        wait_for_completion(&done);
 
        ret = blk_status_to_errno(rbio->bio.bi_status);
@@ -1126,6 +1130,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c,
        op->nr_replicas         = nr_replicas;
        op->res.nr_replicas     = nr_replicas;
        op->write_point         = writepoint_hashed(inode->ei_last_dirtied);
+       op->subvol              = inode->ei_subvol;
        op->pos                 = POS(inode->v.i_ino, sector);
        op->wbio.bio.bi_iter.bi_sector = sector;
        op->wbio.bio.bi_opf     = wbc_to_write_flags(wbc);
@@ -1758,7 +1763,7 @@ start:
                if (iter->count)
                        closure_get(&dio->cl);
 
-               bch2_read(c, rbio_init(bio, opts), inode->v.i_ino);
+               bch2_read(c, rbio_init(bio, opts), inode_inum(inode));
        }
 
        iter->count += shorten;
@@ -1813,6 +1818,50 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 
 /* O_DIRECT writes */
 
+static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum,
+                                      u64 offset, u64 size,
+                                      unsigned nr_replicas, bool compressed)
+{
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       u64 end = offset + size;
+       u32 snapshot;
+       bool ret = true;
+       int err;
+
+       bch2_trans_init(&trans, c, 0, 0);
+retry:
+       bch2_trans_begin(&trans);
+
+       err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+       if (err)
+               goto err;
+
+       for_each_btree_key(&trans, iter, BTREE_ID_extents,
+                          SPOS(inum.inum, offset, snapshot),
+                          BTREE_ITER_SLOTS, k, err) {
+               if (bkey_cmp(bkey_start_pos(k.k), POS(inum.inum, end)) >= 0)
+                       break;
+
+               if (k.k->p.snapshot != snapshot ||
+                   nr_replicas > bch2_bkey_replicas(c, k) ||
+                   (!compressed && bch2_bkey_sectors_compressed(k))) {
+                       ret = false;
+                       break;
+               }
+       }
+
+       offset = iter.pos.offset;
+       bch2_trans_iter_exit(&trans, &iter);
+err:
+       if (err == -EINTR)
+               goto retry;
+       bch2_trans_exit(&trans);
+
+       return err ? false : ret;
+}
+
 static void bch2_dio_write_loop_async(struct bch_write_op *);
 
 static long bch2_dio_write_loop(struct dio_write *dio)
@@ -1891,6 +1940,7 @@ static long bch2_dio_write_loop(struct dio_write *dio)
                op_journal_seq_set(&dio->op, &inode->ei_journal_seq);
                dio->op.write_point     = writepoint_hashed((unsigned long) current);
                dio->op.nr_replicas     = dio->op.opts.data_replicas;
+               dio->op.subvol          = inode->ei_subvol;
                dio->op.pos             = POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
 
                if ((req->ki_flags & IOCB_DSYNC) &&
@@ -1901,8 +1951,8 @@ static long bch2_dio_write_loop(struct dio_write *dio)
                ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio),
                                                dio->op.opts.data_replicas, 0);
                if (unlikely(ret) &&
-                   !bch2_check_range_allocated(c, dio->op.pos,
-                               bio_sectors(bio),
+                   !bch2_check_range_allocated(c, inode_inum(inode),
+                               dio->op.pos.offset, bio_sectors(bio),
                                dio->op.opts.data_replicas,
                                dio->op.opts.compression != 0))
                        goto err;
@@ -2146,9 +2196,9 @@ out:
 
 /* truncate: */
 
-static inline int range_has_data(struct bch_fs *c,
-                                 struct bpos start,
-                                 struct bpos end)
+static inline int range_has_data(struct bch_fs *c, u32 subvol,
+                                struct bpos start,
+                                struct bpos end)
 {
        struct btree_trans trans;
        struct btree_iter iter;
@@ -2156,6 +2206,12 @@ static inline int range_has_data(struct bch_fs *c,
        int ret = 0;
 
        bch2_trans_init(&trans, c, 0, 0);
+retry:
+       bch2_trans_begin(&trans);
+
+       ret = bch2_subvolume_get_snapshot(&trans, subvol, &start.snapshot);
+       if (ret)
+               goto err;
 
        for_each_btree_key(&trans, iter, BTREE_ID_extents, start, 0, k, ret) {
                if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
@@ -2166,7 +2222,11 @@ static inline int range_has_data(struct bch_fs *c,
                        break;
                }
        }
+       start = iter.pos;
        bch2_trans_iter_exit(&trans, &iter);
+err:
+       if (ret == -EINTR)
+               goto retry;
 
        return bch2_trans_exit(&trans) ?: ret;
 }
@@ -2198,7 +2258,7 @@ static int __bch2_truncate_page(struct bch_inode_info *inode,
                 * XXX: we're doing two index lookups when we end up reading the
                 * page
                 */
-               ret = range_has_data(c,
+               ret = range_has_data(c, inode->ei_subvol,
                                POS(inode->v.i_ino, index << PAGE_SECTOR_SHIFT),
                                POS(inode->v.i_ino, (index + 1) << PAGE_SECTOR_SHIFT));
                if (ret <= 0)
@@ -2332,7 +2392,7 @@ int bch2_truncate(struct user_namespace *mnt_userns,
        inode_dio_wait(&inode->v);
        bch2_pagecache_block_get(&inode->ei_pagecache_lock);
 
-       ret = bch2_inode_find_by_inum(c, inode->v.i_ino, &inode_u);
+       ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u);
        if (ret)
                goto err;
 
@@ -2390,7 +2450,7 @@ int bch2_truncate(struct user_namespace *mnt_userns,
 
        truncate_setsize(&inode->v, iattr->ia_size);
 
-       ret = bch2_fpunch(c, inode->v.i_ino,
+       ret = bch2_fpunch(c, inode_inum(inode),
                        round_up(iattr->ia_size, block_bytes(c)) >> 9,
                        U64_MAX, &inode->ei_journal_seq, &i_sectors_delta);
        i_sectors_acct(c, inode, NULL, i_sectors_delta);
@@ -2450,7 +2510,7 @@ static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len
        if (discard_start < discard_end) {
                s64 i_sectors_delta = 0;
 
-               ret = bch2_fpunch(c, inode->v.i_ino,
+               ret = bch2_fpunch(c, inode_inum(inode),
                                  discard_start, discard_end,
                                  &inode->ei_journal_seq,
                                  &i_sectors_delta);
@@ -2529,7 +2589,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
        } else {
                s64 i_sectors_delta = 0;
 
-               ret = bch2_fpunch(c, inode->v.i_ino,
+               ret = bch2_fpunch(c, inode_inum(inode),
                                  offset >> 9, (offset + len) >> 9,
                                  &inode->ei_journal_seq,
                                  &i_sectors_delta);
@@ -2556,6 +2616,18 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
                struct bpos move_pos = POS(inode->v.i_ino, offset >> 9);
                struct bpos atomic_end;
                unsigned trigger_flags = 0;
+               u32 snapshot;
+
+               bch2_trans_begin(&trans);
+
+               ret = bch2_subvolume_get_snapshot(&trans,
+                                       inode->ei_subvol, &snapshot);
+               if (ret)
+                       continue;
+
+               bch2_btree_iter_set_snapshot(&src, snapshot);
+               bch2_btree_iter_set_snapshot(&dst, snapshot);
+               bch2_btree_iter_set_snapshot(&del, snapshot);
 
                bch2_trans_begin(&trans);
 
@@ -2676,9 +2748,17 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
                struct bkey_i_reservation reservation;
                struct bkey_s_c k;
                unsigned sectors;
+               u32 snapshot;
 
                bch2_trans_begin(&trans);
 
+               ret = bch2_subvolume_get_snapshot(&trans,
+                                       inode->ei_subvol, &snapshot);
+               if (ret)
+                       goto bkey_err;
+
+               bch2_btree_iter_set_snapshot(&iter, snapshot);
+
                k = bch2_btree_iter_peek_slot(&iter);
                if ((ret = bkey_err(k)))
                        goto bkey_err;
@@ -2725,7 +2805,8 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
                        reservation.v.nr_replicas = disk_res.nr_replicas;
                }
 
-               ret = bch2_extent_update(&trans, &iter, &reservation.k_i,
+               ret = bch2_extent_update(&trans, inode_inum(inode), &iter,
+                                        &reservation.k_i,
                                &disk_res, &inode->ei_journal_seq,
                                0, &i_sectors_delta, true);
                i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
@@ -2927,8 +3008,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
        mark_range_unallocated(src, pos_src, pos_src + aligned_len);
 
        ret = bch2_remap_range(c,
-                              POS(dst->v.i_ino, pos_dst >> 9),
-                              POS(src->v.i_ino, pos_src >> 9),
+                              inode_inum(dst), pos_dst >> 9,
+                              inode_inum(src), pos_src >> 9,
                               aligned_len >> 9,
                               &dst->ei_journal_seq,
                               pos_dst + len, &i_sectors_delta);
@@ -3019,7 +3100,9 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
+       subvol_inum inum = inode_inum(inode);
        u64 isize, next_data = MAX_LFS_FILESIZE;
+       u32 snapshot;
        int ret;
 
        isize = i_size_read(&inode->v);
@@ -3027,9 +3110,15 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
                return -ENXIO;
 
        bch2_trans_init(&trans, c, 0, 0);
+retry:
+       bch2_trans_begin(&trans);
+
+       ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+       if (ret)
+               goto err;
 
        for_each_btree_key(&trans, iter, BTREE_ID_extents,
-                          POS(inode->v.i_ino, offset >> 9), 0, k, ret) {
+                          SPOS(inode->v.i_ino, offset >> 9, snapshot), 0, k, ret) {
                if (k.k->p.inode != inode->v.i_ino) {
                        break;
                } else if (bkey_extent_is_data(k.k)) {
@@ -3039,6 +3128,9 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
                        break;
        }
        bch2_trans_iter_exit(&trans, &iter);
+err:
+       if (ret == -EINTR)
+               goto retry;
 
        ret = bch2_trans_exit(&trans) ?: ret;
        if (ret)
@@ -3115,7 +3207,9 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
+       subvol_inum inum = inode_inum(inode);
        u64 isize, next_hole = MAX_LFS_FILESIZE;
+       u32 snapshot;
        int ret;
 
        isize = i_size_read(&inode->v);
@@ -3123,9 +3217,15 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
                return -ENXIO;
 
        bch2_trans_init(&trans, c, 0, 0);
+retry:
+       bch2_trans_begin(&trans);
+
+       ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+       if (ret)
+               goto err;
 
        for_each_btree_key(&trans, iter, BTREE_ID_extents,
-                          POS(inode->v.i_ino, offset >> 9),
+                          SPOS(inode->v.i_ino, offset >> 9, snapshot),
                           BTREE_ITER_SLOTS, k, ret) {
                if (k.k->p.inode != inode->v.i_ino) {
                        next_hole = bch2_seek_pagecache_hole(&inode->v,
@@ -3143,6 +3243,9 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
                }
        }
        bch2_trans_iter_exit(&trans, &iter);
+err:
+       if (ret == -EINTR)
+               goto retry;
 
        ret = bch2_trans_exit(&trans) ?: ret;
        if (ret)
index 91a0e761c8e70d5179d42a29fade38ed61666350..3ed53f420e7e713d6b8d51a19bc42d604610a61a 100644 (file)
 #include "quota.h"
 
 #include <linux/compat.h>
+#include <linux/fsnotify.h>
 #include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/security.h>
+#include <linux/writeback.h>
 
 #define FS_IOC_GOINGDOWN            _IOR('X', 125, __u32)
 #define FSOP_GOING_FLAGS_DEFAULT       0x0     /* going down */
@@ -192,7 +196,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c,
        char *kname = NULL;
        struct qstr qstr;
        int ret = 0;
-       u64 inum;
+       subvol_inum inum;
 
        kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL);
        if (!kname)
@@ -205,10 +209,8 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c,
        qstr.len        = ret;
        qstr.name       = kname;
 
-       ret = -ENOENT;
-       inum = bch2_dirent_lookup(c, src->v.i_ino, &hash,
-                                 &qstr);
-       if (!inum)
+       ret = bch2_dirent_lookup(c, inode_inum(src), &hash, &qstr, &inum);
+       if (ret)
                goto err1;
 
        vinode = bch2_vfs_inode_get(c, inum);
@@ -294,6 +296,154 @@ err:
        return ret;
 }
 
+static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
+                               struct bch_ioctl_subvolume arg)
+{
+       struct inode *dir;
+       struct bch_inode_info *inode;
+       struct user_namespace *s_user_ns;
+       struct dentry *dst_dentry;
+       struct path src_path, dst_path;
+       int how = LOOKUP_FOLLOW;
+       int error;
+       subvol_inum snapshot_src = { 0 };
+       unsigned lookup_flags = 0;
+       unsigned create_flags = BCH_CREATE_SUBVOL;
+
+       if (arg.flags & ~(BCH_SUBVOL_SNAPSHOT_CREATE|
+                         BCH_SUBVOL_SNAPSHOT_RO))
+               return -EINVAL;
+
+       if (!(arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) &&
+           (arg.src_ptr ||
+            (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)))
+               return -EINVAL;
+
+       if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE)
+               create_flags |= BCH_CREATE_SNAPSHOT;
+
+       if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)
+               create_flags |= BCH_CREATE_SNAPSHOT_RO;
+
+       /* why do we need this lock? */
+       down_read(&c->vfs_sb->s_umount);
+
+       if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE)
+               sync_inodes_sb(c->vfs_sb);
+retry:
+       if (arg.src_ptr) {
+               error = user_path_at(arg.dirfd,
+                               (const char __user *)(unsigned long)arg.src_ptr,
+                               how, &src_path);
+               if (error)
+                       goto err1;
+
+               if (src_path.dentry->d_sb->s_fs_info != c) {
+                       path_put(&src_path);
+                       error = -EXDEV;
+                       goto err1;
+               }
+
+               snapshot_src = inode_inum(to_bch_ei(src_path.dentry->d_inode));
+       }
+
+       dst_dentry = user_path_create(arg.dirfd,
+                       (const char __user *)(unsigned long)arg.dst_ptr,
+                       &dst_path, lookup_flags);
+       error = PTR_ERR_OR_ZERO(dst_dentry);
+       if (error)
+               goto err2;
+
+       if (dst_dentry->d_sb->s_fs_info != c) {
+               error = -EXDEV;
+               goto err3;
+       }
+
+       if (dst_dentry->d_inode) {
+               error = -EEXIST;
+               goto err3;
+       }
+
+       dir = dst_path.dentry->d_inode;
+       if (IS_DEADDIR(dir)) {
+               error = -ENOENT;
+               goto err3;
+       }
+
+       s_user_ns = dir->i_sb->s_user_ns;
+       if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
+           !kgid_has_mapping(s_user_ns, current_fsgid())) {
+               error = -EOVERFLOW;
+               goto err3;
+       }
+
+       error = inode_permission(file_mnt_user_ns(filp),
+                                dir, MAY_WRITE | MAY_EXEC);
+       if (error)
+               goto err3;
+
+       if (!IS_POSIXACL(dir))
+               arg.mode &= ~current_umask();
+
+       error = security_path_mkdir(&dst_path, dst_dentry, arg.mode);
+       if (error)
+               goto err3;
+
+       if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) &&
+           !arg.src_ptr)
+               snapshot_src.subvol = to_bch_ei(dir)->ei_inode.bi_subvol;
+
+       inode = __bch2_create(file_mnt_user_ns(filp), to_bch_ei(dir),
+                             dst_dentry, arg.mode|S_IFDIR,
+                             0, snapshot_src, create_flags);
+       error = PTR_ERR_OR_ZERO(inode);
+       if (error)
+               goto err3;
+
+       d_instantiate(dst_dentry, &inode->v);
+       fsnotify_mkdir(dir, dst_dentry);
+err3:
+       done_path_create(&dst_path, dst_dentry);
+err2:
+       if (arg.src_ptr)
+               path_put(&src_path);
+
+       if (retry_estale(error, lookup_flags)) {
+               lookup_flags |= LOOKUP_REVAL;
+               goto retry;
+       }
+err1:
+       up_read(&c->vfs_sb->s_umount);
+
+       return error;
+}
+
+static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
+                               struct bch_ioctl_subvolume arg)
+{
+       struct path path;
+       int ret = 0;
+
+       if (arg.flags)
+               return -EINVAL;
+
+       ret = user_path_at(arg.dirfd,
+                       (const char __user *)(unsigned long)arg.dst_ptr,
+                       LOOKUP_FOLLOW, &path);
+       if (ret)
+               return ret;
+
+       if (path.dentry->d_sb->s_fs_info != c) {
+               path_put(&path);
+               return -EXDEV;
+       }
+
+       ret = __bch2_unlink(path.dentry->d_parent->d_inode, path.dentry, 1);
+       path_put(&path);
+
+       return ret;
+}
+
 long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 {
        struct bch_inode_info *inode = file_bch_inode(file);
@@ -324,6 +474,22 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
        case FS_IOC_GOINGDOWN:
                return bch2_ioc_goingdown(c, (u32 __user *) arg);
 
+       case BCH_IOCTL_SUBVOLUME_CREATE: {
+               struct bch_ioctl_subvolume i;
+
+               if (copy_from_user(&i, (void __user *) arg, sizeof(i)))
+                       return -EFAULT;
+               return bch2_ioctl_subvolume_create(c, file, i);
+       }
+
+       case BCH_IOCTL_SUBVOLUME_DESTROY: {
+               struct bch_ioctl_subvolume i;
+
+               if (copy_from_user(&i, (void __user *) arg, sizeof(i)))
+                       return -EFAULT;
+               return bch2_ioctl_subvolume_destroy(c, file, i);
+       }
+
        default:
                return bch2_fs_ioctl(c, cmd, (void __user *) arg);
        }
index 6cc56871d26d85bea6bcd72bd88cb3f289fd5942..2094c18cd87412a9c6307910aecc419537f305d8 100644 (file)
@@ -36,7 +36,7 @@
 
 static struct kmem_cache *bch2_inode_cache;
 
-static void bch2_vfs_inode_init(struct bch_fs *,
+static void bch2_vfs_inode_init(struct bch_fs *, subvol_inum,
                                struct bch_inode_info *,
                                struct bch_inode_unpacked *);
 
@@ -149,7 +149,7 @@ int __must_check bch2_write_inode(struct bch_fs *c,
 retry:
        bch2_trans_begin(&trans);
 
-       ret   = bch2_inode_peek(&trans, &iter, &inode_u, inode->v.i_ino,
+       ret   = bch2_inode_peek(&trans, &iter, &inode_u, inode_inum(inode),
                                BTREE_ITER_INTENT) ?:
                (set ? set(inode, &inode_u, p) : 0) ?:
                bch2_inode_write(&trans, &iter, &inode_u) ?:
@@ -208,13 +208,42 @@ int bch2_fs_quota_transfer(struct bch_fs *c,
        return ret;
 }
 
-struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum)
+static int bch2_iget5_test(struct inode *vinode, void *p)
+{
+       struct bch_inode_info *inode = to_bch_ei(vinode);
+       subvol_inum *inum = p;
+
+       return inode->ei_subvol == inum->subvol &&
+               inode->ei_inode.bi_inum == inum->inum;
+}
+
+static int bch2_iget5_set(struct inode *vinode, void *p)
+{
+       struct bch_inode_info *inode = to_bch_ei(vinode);
+       subvol_inum *inum = p;
+
+       inode->v.i_ino          = inum->inum;
+       inode->ei_subvol        = inum->subvol;
+       inode->ei_inode.bi_inum = inum->inum;
+       return 0;
+}
+
+static unsigned bch2_inode_hash(subvol_inum inum)
+{
+       return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
+}
+
+struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
 {
        struct bch_inode_unpacked inode_u;
        struct bch_inode_info *inode;
        int ret;
 
-       inode = to_bch_ei(iget_locked(c->vfs_sb, inum));
+       inode = to_bch_ei(iget5_locked(c->vfs_sb,
+                                      bch2_inode_hash(inum),
+                                      bch2_iget5_test,
+                                      bch2_iget5_set,
+                                      &inum));
        if (unlikely(!inode))
                return ERR_PTR(-ENOMEM);
        if (!(inode->v.i_state & I_NEW))
@@ -226,26 +255,20 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum)
                return ERR_PTR(ret);
        }
 
-       bch2_vfs_inode_init(c, inode, &inode_u);
+       bch2_vfs_inode_init(c, inum, inode, &inode_u);
 
-       inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum);
+       inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum.inum);
 
        unlock_new_inode(&inode->v);
 
        return &inode->v;
 }
 
-static int inum_test(struct inode *inode, void *p)
-{
-       unsigned long *ino = p;
-
-       return *ino == inode->i_ino;
-}
-
-static struct bch_inode_info *
+struct bch_inode_info *
 __bch2_create(struct user_namespace *mnt_userns,
              struct bch_inode_info *dir, struct dentry *dentry,
-             umode_t mode, dev_t rdev, bool tmpfile)
+             umode_t mode, dev_t rdev, subvol_inum snapshot_src,
+             unsigned flags)
 {
        struct bch_fs *c = dir->v.i_sb->s_fs_info;
        struct btree_trans trans;
@@ -253,6 +276,7 @@ __bch2_create(struct user_namespace *mnt_userns,
        struct bch_inode_info *inode, *old;
        struct bch_inode_unpacked inode_u;
        struct posix_acl *default_acl = NULL, *acl = NULL;
+       subvol_inum inum;
        u64 journal_seq = 0;
        int ret;
 
@@ -273,20 +297,23 @@ __bch2_create(struct user_namespace *mnt_userns,
 
        bch2_inode_init_early(c, &inode_u);
 
-       if (!tmpfile)
+       if (!(flags & BCH_CREATE_TMPFILE))
                mutex_lock(&dir->ei_update_lock);
 
        bch2_trans_init(&trans, c, 8,
-                       2048 + (!tmpfile ? dentry->d_name.len : 0));
+                       2048 + (!(flags & BCH_CREATE_TMPFILE)
+                               ? dentry->d_name.len : 0));
 retry:
        bch2_trans_begin(&trans);
 
-       ret   = bch2_create_trans(&trans, dir->v.i_ino, &dir_u, &inode_u,
-                                 !tmpfile ? &dentry->d_name : NULL,
+       ret   = bch2_create_trans(&trans,
+                                 inode_inum(dir), &dir_u, &inode_u,
+                                 !(flags & BCH_CREATE_TMPFILE)
+                                 ? &dentry->d_name : NULL,
                                  from_kuid(mnt_userns, current_fsuid()),
                                  from_kgid(mnt_userns, current_fsgid()),
                                  mode, rdev,
-                                 default_acl, acl) ?:
+                                 default_acl, acl, snapshot_src, flags) ?:
                bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
                                KEY_TYPE_QUOTA_PREALLOC);
        if (unlikely(ret))
@@ -302,14 +329,17 @@ err_before_quota:
                goto err_trans;
        }
 
-       if (!tmpfile) {
+       if (!(flags & BCH_CREATE_TMPFILE)) {
                bch2_inode_update_after_write(c, dir, &dir_u,
                                              ATTR_MTIME|ATTR_CTIME);
                journal_seq_copy(c, dir, journal_seq);
                mutex_unlock(&dir->ei_update_lock);
        }
 
-       bch2_vfs_inode_init(c, inode, &inode_u);
+       inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
+       inum.inum = inode_u.bi_inum;
+
+       bch2_vfs_inode_init(c, inum, inode, &inode_u);
        journal_seq_copy(c, inode, journal_seq);
 
        set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
@@ -322,8 +352,12 @@ err_before_quota:
         */
 
        inode->v.i_state |= I_CREATING;
-       old = to_bch_ei(inode_insert5(&inode->v, inode->v.i_ino,
-                                     inum_test, NULL, &inode->v.i_ino));
+
+       old = to_bch_ei(inode_insert5(&inode->v,
+                                     bch2_inode_hash(inum),
+                                     bch2_iget5_test,
+                                     bch2_iget5_set,
+                                     &inum));
        BUG_ON(!old);
 
        if (unlikely(old != inode)) {
@@ -350,7 +384,7 @@ err:
        posix_acl_release(acl);
        return inode;
 err_trans:
-       if (!tmpfile)
+       if (!(flags & BCH_CREATE_TMPFILE))
                mutex_unlock(&dir->ei_update_lock);
 
        bch2_trans_exit(&trans);
@@ -369,12 +403,13 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
        struct bch_inode_info *dir = to_bch_ei(vdir);
        struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
        struct inode *vinode = NULL;
-       u64 inum;
+       subvol_inum inum = { .subvol = 1 };
+       int ret;
 
-       inum = bch2_dirent_lookup(c, dir->v.i_ino, &hash,
-                                 &dentry->d_name);
+       ret = bch2_dirent_lookup(c, inode_inum(dir), &hash,
+                                &dentry->d_name, &inum);
 
-       if (inum)
+       if (!ret)
                vinode = bch2_vfs_inode_get(c, inum);
 
        return d_splice_alias(vinode, dentry);
@@ -385,7 +420,8 @@ static int bch2_mknod(struct user_namespace *mnt_userns,
                      umode_t mode, dev_t rdev)
 {
        struct bch_inode_info *inode =
-               __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, rdev, false);
+               __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, rdev,
+                             (subvol_inum) { 0 }, 0);
 
        if (IS_ERR(inode))
                return PTR_ERR(inode);
@@ -415,8 +451,8 @@ static int __bch2_link(struct bch_fs *c,
 
        ret = __bch2_trans_do(&trans, NULL, &inode->ei_journal_seq, 0,
                        bch2_link_trans(&trans,
-                                       dir->v.i_ino,
-                                       inode->v.i_ino, &dir_u, &inode_u,
+                                       inode_inum(dir),   &dir_u,
+                                       inode_inum(inode), &inode_u,
                                        &dentry->d_name));
 
        if (likely(!ret)) {
@@ -452,7 +488,8 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
        return 0;
 }
 
-static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
+int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
+                 int deleting_snapshot)
 {
        struct bch_fs *c = vdir->i_sb->s_fs_info;
        struct bch_inode_info *dir = to_bch_ei(vdir);
@@ -467,8 +504,9 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
        ret = __bch2_trans_do(&trans, NULL, &dir->ei_journal_seq,
                              BTREE_INSERT_NOFAIL,
                        bch2_unlink_trans(&trans,
-                                         dir->v.i_ino, &dir_u,
-                                         &inode_u, &dentry->d_name));
+                                         inode_inum(dir), &dir_u,
+                                         &inode_u, &dentry->d_name,
+                                         deleting_snapshot));
 
        if (likely(!ret)) {
                BUG_ON(inode_u.bi_inum != inode->v.i_ino);
@@ -486,6 +524,11 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
        return ret;
 }
 
+static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
+{
+       return __bch2_unlink(vdir, dentry, -1);
+}
+
 static int bch2_symlink(struct user_namespace *mnt_userns,
                        struct inode *vdir, struct dentry *dentry,
                        const char *symname)
@@ -494,7 +537,8 @@ static int bch2_symlink(struct user_namespace *mnt_userns,
        struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
        int ret;
 
-       inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0, true);
+       inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
+                             (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
        if (unlikely(IS_ERR(inode)))
                return PTR_ERR(inode);
 
@@ -587,8 +631,8 @@ static int bch2_rename2(struct user_namespace *mnt_userns,
 
        ret = __bch2_trans_do(&trans, NULL, &journal_seq, 0,
                        bch2_rename_trans(&trans,
-                                         src_dir->v.i_ino, &src_dir_u,
-                                         dst_dir->v.i_ino, &dst_dir_u,
+                                         inode_inum(src_dir), &src_dir_u,
+                                         inode_inum(dst_dir), &dst_dir_u,
                                          &src_inode_u,
                                          &dst_inode_u,
                                          &src_dentry->d_name,
@@ -711,7 +755,7 @@ retry:
        kfree(acl);
        acl = NULL;
 
-       ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode->v.i_ino,
+       ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode),
                              BTREE_ITER_INTENT);
        if (ret)
                goto btree_err;
@@ -719,7 +763,8 @@ retry:
        bch2_setattr_copy(mnt_userns, inode, &inode_u, attr);
 
        if (attr->ia_valid & ATTR_MODE) {
-               ret = bch2_acl_chmod(&trans, &inode_u, inode_u.bi_mode, &acl);
+               ret = bch2_acl_chmod(&trans, inode_inum(inode), &inode_u,
+                                    inode_u.bi_mode, &acl);
                if (ret)
                        goto btree_err;
        }
@@ -810,7 +855,8 @@ static int bch2_tmpfile(struct user_namespace *mnt_userns,
                        struct inode *vdir, struct dentry *dentry, umode_t mode)
 {
        struct bch_inode_info *inode =
-               __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, 0, true);
+               __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, 0,
+                             (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
 
        if (IS_ERR(inode))
                return PTR_ERR(inode);
@@ -885,6 +931,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
        struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
        unsigned offset_into_extent, sectors;
        bool have_extent = false;
+       u32 snapshot;
        int ret = 0;
 
        ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
@@ -894,15 +941,21 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
        if (start + len < start)
                return -EINVAL;
 
+       start >>= 9;
+
        bch2_bkey_buf_init(&cur);
        bch2_bkey_buf_init(&prev);
        bch2_trans_init(&trans, c, 0, 0);
-
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
-                            POS(ei->v.i_ino, start >> 9), 0);
 retry:
        bch2_trans_begin(&trans);
 
+       ret = bch2_subvolume_get_snapshot(&trans, ei->ei_subvol, &snapshot);
+       if (ret)
+               goto err;
+
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+                            SPOS(ei->v.i_ino, start, snapshot), 0);
+
        while ((k = bch2_btree_iter_peek(&iter)).k &&
               !(ret = bkey_err(k)) &&
               bkey_cmp(iter.pos, end) < 0) {
@@ -951,7 +1004,9 @@ retry:
                bch2_btree_iter_set_pos(&iter,
                        POS(iter.pos.inode, iter.pos.offset + sectors));
        }
-
+       start = iter.pos.offset;
+       bch2_trans_iter_exit(&trans, &iter);
+err:
        if (ret == -EINTR)
                goto retry;
 
@@ -959,7 +1014,6 @@ retry:
                ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
                                       FIEMAP_EXTENT_LAST);
 
-       bch2_trans_iter_exit(&trans, &iter);
        ret = bch2_trans_exit(&trans) ?: ret;
        bch2_bkey_buf_exit(&cur, c);
        bch2_bkey_buf_exit(&prev, c);
@@ -996,7 +1050,7 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
        if (!dir_emit_dots(file, ctx))
                return 0;
 
-       return bch2_readdir(c, inode->v.i_ino, ctx);
+       return bch2_readdir(c, inode_inum(inode), ctx);
 }
 
 static const struct file_operations bch_file_operations = {
@@ -1096,6 +1150,7 @@ static const struct address_space_operations bch_address_space_operations = {
        .error_remove_page = generic_error_remove_page,
 };
 
+#if 0
 static struct inode *bch2_nfs_get_inode(struct super_block *sb,
                u64 ino, u32 generation)
 {
@@ -1129,14 +1184,15 @@ static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid,
        return generic_fh_to_parent(sb, fid, fh_len, fh_type,
                                    bch2_nfs_get_inode);
 }
+#endif
 
 static const struct export_operations bch_export_ops = {
-       .fh_to_dentry   = bch2_fh_to_dentry,
-       .fh_to_parent   = bch2_fh_to_parent,
+       //.fh_to_dentry = bch2_fh_to_dentry,
+       //.fh_to_parent = bch2_fh_to_parent,
        //.get_parent   = bch2_get_parent,
 };
 
-static void bch2_vfs_inode_init(struct bch_fs *c,
+static void bch2_vfs_inode_init(struct bch_fs *c, subvol_inum inum,
                                struct bch_inode_info *inode,
                                struct bch_inode_unpacked *bi)
 {
@@ -1152,6 +1208,7 @@ static void bch2_vfs_inode_init(struct bch_fs *c,
        inode->ei_journal_seq   = 0;
        inode->ei_quota_reserved = 0;
        inode->ei_qid           = bch_qid(bi);
+       inode->ei_subvol        = inum.subvol;
 
        inode->v.i_mapping->a_ops = &bch_address_space_operations;
 
@@ -1249,7 +1306,7 @@ static void bch2_evict_inode(struct inode *vinode)
                                KEY_TYPE_QUOTA_WARN);
                bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
                                KEY_TYPE_QUOTA_WARN);
-               bch2_inode_rm(c, inode->v.i_ino, true);
+               bch2_inode_rm(c, inode_inum(inode), true);
        }
 }
 
@@ -1593,7 +1650,7 @@ got_sb:
                sb->s_flags     |= SB_POSIXACL;
 #endif
 
-       vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_INO);
+       vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
        if (IS_ERR(vinode)) {
                bch_err(c, "error mounting: error getting root inode %i",
                        (int) PTR_ERR(vinode));
index 36cc6ba2d644f3c10b3ef3c2428bb21b6e84ed43..48fc504e2da20dd69bfa644edf38d26867f4b554 100644 (file)
@@ -45,10 +45,20 @@ struct bch_inode_info {
        struct mutex            ei_quota_lock;
        struct bch_qid          ei_qid;
 
+       u32                     ei_subvol;
+
        /* copy of inode in btree: */
        struct bch_inode_unpacked ei_inode;
 };
 
+static inline subvol_inum inode_inum(struct bch_inode_info *inode)
+{
+       return (subvol_inum) {
+               .subvol = inode->ei_subvol,
+               .inum   = inode->ei_inode.bi_inum,
+       };
+}
+
 /*
  * Set if we've gotten a btree error for this inode, and thus the vfs inode and
  * btree inode may be inconsistent:
@@ -135,6 +145,10 @@ struct bch_inode_unpacked;
 
 #ifndef NO_BCACHEFS_FS
 
+struct bch_inode_info *
+__bch2_create(struct user_namespace *, struct bch_inode_info *,
+             struct dentry *, umode_t, dev_t, subvol_inum, unsigned);
+
 int bch2_fs_quota_transfer(struct bch_fs *,
                           struct bch_inode_info *,
                           struct bch_qid,
@@ -154,7 +168,7 @@ static inline int bch2_set_projid(struct bch_fs *c,
                                      KEY_TYPE_QUOTA_PREALLOC);
 }
 
-struct inode *bch2_vfs_inode_get(struct bch_fs *, u64);
+struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum);
 
 /* returns 0 if we want to do the update, or error is passed up */
 typedef int (*inode_set_fn)(struct bch_inode_info *,
@@ -170,6 +184,7 @@ int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
 int bch2_setattr_nonsize(struct user_namespace *,
                         struct bch_inode_info *,
                         struct iattr *);
+int __bch2_unlink(struct inode *, struct dentry *, int);
 
 void bch2_vfs_exit(void);
 int bch2_vfs_init(void);
index eb979e79eaac9c226ff90c70ef912395d4506448..16a1eae9b374b953ddb9f7078214745325fc6ddb 100644 (file)
@@ -9,6 +9,7 @@
 #include "fsck.h"
 #include "inode.h"
 #include "keylist.h"
+#include "subvolume.h"
 #include "super.h"
 #include "xattr.h"
 
@@ -17,7 +18,8 @@
 
 #define QSTR(n) { { { .len = strlen(n) } }, .name = n }
 
-static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum)
+static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
+                                   u32 snapshot)
 {
        struct btree_iter iter;
        struct bkey_s_c k;
@@ -25,7 +27,7 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum)
        int ret;
 
        for_each_btree_key(trans, iter, BTREE_ID_extents,
-                          POS(inum, 0), 0, k, ret) {
+                          SPOS(inum, 0, snapshot), 0, k, ret) {
                if (k.k->p.inode != inum)
                        break;
 
@@ -38,6 +40,100 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum)
        return ret ?: sectors;
 }
 
+static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
+                                   u32 snapshot)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_s_c_dirent d;
+       u64 subdirs = 0;
+       int ret;
+
+       for_each_btree_key(trans, iter, BTREE_ID_dirents,
+                          SPOS(inum, 0, snapshot), 0, k, ret) {
+               if (k.k->p.inode != inum)
+                       break;
+
+               if (k.k->type != KEY_TYPE_dirent)
+                       continue;
+
+               d = bkey_s_c_to_dirent(k);
+               if (d.v->d_type == DT_DIR)
+                       subdirs++;
+       }
+
+       bch2_trans_iter_exit(trans, &iter);
+
+       return ret ?: subdirs;
+}
+
+static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot,
+                                   u32 *subvol)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
+                            POS(0, snapshot), 0);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       if (k.k->type != KEY_TYPE_snapshot) {
+               bch_err(trans->c, "snapshot %u not fonud", snapshot);
+               ret = -ENOENT;
+               goto err;
+       }
+
+       *subvol = le32_to_cpu(bkey_s_c_to_snapshot(k).v->subvol);
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+
+}
+
+static int snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot,
+                                 u32 *subvol)
+{
+       return lockrestart_do(trans, __snapshot_lookup_subvol(trans, snapshot, subvol));
+}
+
+static int __subvol_lookup(struct btree_trans *trans, u32 subvol,
+                          u32 *snapshot, u64 *inum)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes,
+                            POS(0, subvol), 0);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       if (k.k->type != KEY_TYPE_subvolume) {
+               bch_err(trans->c, "subvolume %u not fonud", subvol);
+               ret = -ENOENT;
+               goto err;
+       }
+
+       *snapshot = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot);
+       *inum = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode);
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+
+}
+
+static int subvol_lookup(struct btree_trans *trans, u32 subvol,
+                        u32 *snapshot, u64 *inum)
+{
+       return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum));
+}
+
 static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
                          struct bch_inode_unpacked *inode,
                          u32 *snapshot)
@@ -47,14 +143,13 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
        int ret;
 
        bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
-                            POS(0, inode_nr), 0);
+                            SPOS(0, inode_nr, *snapshot), 0);
        k = bch2_btree_iter_peek_slot(&iter);
        ret = bkey_err(k);
        if (ret)
                goto err;
 
-       if (snapshot)
-               *snapshot = iter.pos.snapshot;
+       *snapshot = iter.pos.snapshot;
        ret = k.k->type == KEY_TYPE_inode
                ? bch2_inode_unpack(bkey_s_c_to_inode(k), inode)
                : -ENOENT;
@@ -70,6 +165,36 @@ static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
        return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot));
 }
 
+static int __lookup_dirent(struct btree_trans *trans,
+                          struct bch_hash_info hash_info,
+                          subvol_inum dir, struct qstr *name,
+                          u64 *target, unsigned *type)
+{
+       struct btree_iter iter;
+       struct bkey_s_c_dirent d;
+       int ret;
+
+       ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc,
+                              &hash_info, dir, name, 0);
+       if (ret)
+               return ret;
+
+       d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
+       *target = le64_to_cpu(d.v->d_inum);
+       *type = d.v->d_type;
+       bch2_trans_iter_exit(trans, &iter);
+       return 0;
+}
+
+static int lookup_dirent(struct btree_trans *trans,
+                        struct bch_hash_info hash_info,
+                        subvol_inum dir, struct qstr *name,
+                        u64 *target, unsigned *type)
+{
+       return lockrestart_do(trans,
+               __lookup_dirent(trans, hash_info, dir, name, target, type));
+}
+
 static int __write_inode(struct btree_trans *trans,
                         struct bch_inode_unpacked *inode,
                         u32 snapshot)
@@ -100,6 +225,71 @@ static int write_inode(struct btree_trans *trans,
        return ret;
 }
 
+static int fsck_inode_rm(struct btree_trans *trans, u64 inum, u32 snapshot)
+{
+       struct btree_iter iter = { NULL };
+       struct bkey_i_inode_generation delete;
+       struct bch_inode_unpacked inode_u;
+       struct bkey_s_c k;
+       int ret;
+
+       ret   = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
+                                             SPOS(inum, 0, snapshot),
+                                             SPOS(inum, U64_MAX, snapshot),
+                                             0, NULL) ?:
+               bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
+                                             SPOS(inum, 0, snapshot),
+                                             SPOS(inum, U64_MAX, snapshot),
+                                             0, NULL) ?:
+               bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
+                                             SPOS(inum, 0, snapshot),
+                                             SPOS(inum, U64_MAX, snapshot),
+                                             0, NULL);
+       if (ret)
+               goto err;
+retry:
+       bch2_trans_begin(trans);
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
+                            SPOS(0, inum, snapshot), BTREE_ITER_INTENT);
+       k = bch2_btree_iter_peek_slot(&iter);
+
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       if (k.k->type != KEY_TYPE_inode) {
+               bch2_fs_inconsistent(trans->c,
+                                    "inode %llu:%u not found when deleting",
+                                    inum, snapshot);
+               ret = -EIO;
+               goto err;
+       }
+
+       bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u);
+
+       /* Subvolume root? */
+       if (inode_u.bi_subvol) {
+               ret = bch2_subvolume_delete(trans, inode_u.bi_subvol, -1);
+               if (ret)
+                       goto err;
+       }
+
+       bkey_inode_generation_init(&delete.k_i);
+       delete.k.p = iter.pos;
+       delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
+
+       ret   = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
+               bch2_trans_commit(trans, NULL, NULL,
+                               BTREE_INSERT_NOFAIL);
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       if (ret == -EINTR)
+               goto retry;
+
+       return ret;
+}
+
 static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
 {
        struct bch_fs *c = trans->c;
@@ -117,7 +307,7 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
        bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
 
        ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
-                                 &dir_hash_info, &iter);
+                                 &dir_hash_info, &iter, 0);
        bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
@@ -134,29 +324,49 @@ static int remove_dirent(struct btree_trans *trans, struct bpos pos)
 }
 
 /* Get lost+found, create if it doesn't exist: */
-static int lookup_lostfound(struct btree_trans *trans,
+static int lookup_lostfound(struct btree_trans *trans, u32 subvol,
                            struct bch_inode_unpacked *lostfound)
 {
        struct bch_fs *c = trans->c;
        struct bch_inode_unpacked root;
        struct bch_hash_info root_hash_info;
        struct qstr lostfound_str = QSTR("lost+found");
-       u64 inum;
+       subvol_inum root_inum = { .subvol = subvol };
+       u64 inum = 0;
+       unsigned d_type = 0;
        u32 snapshot;
        int ret;
 
-       ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root, &snapshot);
-       if (ret && ret != -ENOENT)
+       ret = subvol_lookup(trans, subvol, &snapshot, &root_inum.inum);
+       if (ret)
+               return ret;
+
+       ret = lookup_inode(trans, root_inum.inum, &root, &snapshot);
+       if (ret) {
+               bch_err(c, "error fetching subvol root: %i", ret);
                return ret;
+       }
 
        root_hash_info = bch2_hash_info_init(c, &root);
-       inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info,
-                                 &lostfound_str);
-       if (!inum) {
+
+       ret = lookup_dirent(trans, root_hash_info, root_inum,
+                           &lostfound_str, &inum, &d_type);
+       if (ret == -ENOENT) {
                bch_notice(c, "creating lost+found");
                goto create_lostfound;
        }
 
+       if (ret) {
+               bch_err(c, "error looking up lost+found: %i", ret);
+               return ret;
+       }
+
+       if (d_type != DT_DIR) {
+               bch_err(c, "error looking up lost+found: not a directory");
+               return ret;
+
+       }
+
        ret = lookup_inode(trans, inum, lostfound, &snapshot);
        if (ret && ret != -ENOENT) {
                /*
@@ -174,11 +384,10 @@ create_lostfound:
                ret = __bch2_trans_do(trans, NULL, NULL,
                                      BTREE_INSERT_NOFAIL|
                                      BTREE_INSERT_LAZY_RW,
-                       bch2_create_trans(trans,
-                                         BCACHEFS_ROOT_INO, &root,
-                                         lostfound,
-                                         &lostfound_str,
-                                         0, 0, S_IFDIR|0700, 0, NULL, NULL));
+                       bch2_create_trans(trans, root_inum, &root,
+                                         lostfound, &lostfound_str,
+                                         0, 0, S_IFDIR|0700, 0, NULL, NULL,
+                                         (subvol_inum) { }, 0));
                if (ret)
                        bch_err(c, "error creating lost+found: %i", ret);
        }
@@ -187,16 +396,22 @@ create_lostfound:
 }
 
 static int reattach_inode(struct btree_trans *trans,
-                         struct bch_inode_unpacked *inode)
+                         struct bch_inode_unpacked *inode,
+                         u32 inode_snapshot)
 {
        struct bch_hash_info dir_hash;
        struct bch_inode_unpacked lostfound;
        char name_buf[20];
        struct qstr name;
        u64 dir_offset = 0;
+       u32 subvol;
        int ret;
 
-       ret = lookup_lostfound(trans, &lostfound);
+       ret = snapshot_lookup_subvol(trans, inode_snapshot, &subvol);
+       if (ret)
+               return ret;
+
+       ret = lookup_lostfound(trans, subvol, &lostfound);
        if (ret)
                return ret;
 
@@ -214,10 +429,15 @@ static int reattach_inode(struct btree_trans *trans,
        name = (struct qstr) QSTR(name_buf);
 
        ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
-               bch2_dirent_create(trans, lostfound.bi_inum, &dir_hash,
-                                  mode_to_type(inode->bi_mode),
-                                  &name, inode->bi_inum, &dir_offset,
-                                  BCH_HASH_SET_MUST_CREATE));
+                       bch2_dirent_create(trans,
+                                          (subvol_inum) {
+                                               .subvol = subvol,
+                                               .inum = lostfound.bi_inum,
+                                          },
+                                          &dir_hash,
+                                          mode_to_type(inode->bi_mode),
+                                          &name, inode->bi_inum, &dir_offset,
+                                          BCH_HASH_SET_MUST_CREATE));
        if (ret) {
                bch_err(trans->c, "error %i reattaching inode %llu",
                        ret, inode->bi_inum);
@@ -227,7 +447,7 @@ static int reattach_inode(struct btree_trans *trans,
        inode->bi_dir           = lostfound.bi_inum;
        inode->bi_dir_offset    = dir_offset;
 
-       return write_inode(trans, inode, U32_MAX);
+       return write_inode(trans, inode, inode_snapshot);
 }
 
 static int remove_backpointer(struct btree_trans *trans,
@@ -254,45 +474,254 @@ out:
        return ret;
 }
 
+static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, struct bpos pos)
+{
+       pos.snapshot = snapshot_t(c, pos.snapshot)->equiv;
+
+       if (bkey_cmp(s->pos, pos))
+               s->nr = 0;
+       s->pos = pos;
+
+       /* Might get called multiple times due to lock restarts */
+       if (s->nr && s->d[s->nr - 1] == pos.snapshot)
+               return 0;
+
+       return snapshots_seen_add(c, s, pos.snapshot);
+}
+
+/**
+ * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
+ * and @ancestor hasn't been overwritten in @seen
+ *
+ * That is, returns whether key in @ancestor snapshot is visible in @id snapshot
+ */
+static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
+                                   u32 id, u32 ancestor)
+{
+       ssize_t i;
+
+       BUG_ON(id > ancestor);
+
+       id              = snapshot_t(c, id)->equiv;
+       ancestor        = snapshot_t(c, ancestor)->equiv;
+
+       /* @ancestor should be the snapshot most recently added to @seen */
+       BUG_ON(!seen->nr || seen->d[seen->nr - 1] != ancestor);
+       BUG_ON(seen->pos.snapshot != ancestor);
+
+       if (id == ancestor)
+               return true;
+
+       if (!bch2_snapshot_is_ancestor(c, id, ancestor))
+               return false;
+
+       for (i = seen->nr - 2;
+            i >= 0 && seen->d[i] >= id;
+            --i)
+               if (bch2_snapshot_is_ancestor(c, id, seen->d[i]) &&
+                   bch2_snapshot_is_ancestor(c, seen->d[i], ancestor))
+                       return false;
+
+       return true;
+}
+
+/**
+ * ref_visible - given a key with snapshot id @src that points to a key with
+ * snapshot id @dst, test whether there is some snapshot in which @dst is
+ * visible.
+ *
+ * This assumes we're visiting @src keys in natural key order.
+ *
+ * @s  - list of snapshot IDs already seen at @src
+ * @src        - snapshot ID of src key
+ * @dst        - snapshot ID of dst key
+ */
+static int ref_visible(struct bch_fs *c, struct snapshots_seen *s,
+                      u32 src, u32 dst)
+{
+       return dst <= src
+               ? key_visible_in_snapshot(c, s, dst, src)
+               : bch2_snapshot_is_ancestor(c, src, dst);
+}
+
+#define for_each_visible_inode(_c, _s, _w, _snapshot, _i)      \
+       for (_i = (_w)->d; _i < (_w)->d + (_w)->nr && (_i)->snapshot <= (_snapshot); _i++)\
+               if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
+
 struct inode_walker {
-       bool                    first_this_inode;
-       bool                    have_inode;
-       u64                     cur_inum;
-       u32                     snapshot;
-       struct bch_inode_unpacked inode;
+       bool                            first_this_inode;
+       u64                             cur_inum;
+
+       size_t                          nr;
+       size_t                          size;
+       struct inode_walker_entry {
+               struct bch_inode_unpacked inode;
+               u32                     snapshot;
+               u64                     count;
+       } *d;
 };
 
+static void inode_walker_exit(struct inode_walker *w)
+{
+       kfree(w->d);
+       w->d = NULL;
+}
+
 static struct inode_walker inode_walker_init(void)
 {
-       return (struct inode_walker) {
-               .cur_inum       = -1,
-               .have_inode     = false,
+       return (struct inode_walker) { 0, };
+}
+
+static int inode_walker_realloc(struct inode_walker *w)
+{
+       if (w->nr == w->size) {
+               size_t new_size = max_t(size_t, 8UL, w->size * 2);
+               void *d = krealloc(w->d, new_size * sizeof(w->d[0]),
+                                  GFP_KERNEL);
+               if (!d)
+                       return -ENOMEM;
+
+               w->d = d;
+               w->size = new_size;
+       }
+
+       return 0;
+}
+
+static int add_inode(struct bch_fs *c, struct inode_walker *w,
+                    struct bkey_s_c_inode inode)
+{
+       struct bch_inode_unpacked u;
+       int ret;
+
+       ret = inode_walker_realloc(w);
+       if (ret)
+               return ret;
+
+       BUG_ON(bch2_inode_unpack(inode, &u));
+
+       w->d[w->nr++] = (struct inode_walker_entry) {
+               .inode          = u,
+               .snapshot       = snapshot_t(c, inode.k->p.snapshot)->equiv,
        };
+
+       return 0;
 }
 
 static int __walk_inode(struct btree_trans *trans,
-                       struct inode_walker *w, u64 inum)
+                       struct inode_walker *w, struct bpos pos)
 {
-       if (inum != w->cur_inum) {
-               int ret = __lookup_inode(trans, inum, &w->inode, &w->snapshot);
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       unsigned i, ancestor_pos;
+       int ret;
 
-               if (ret && ret != -ENOENT)
-                       return ret;
+       pos.snapshot = snapshot_t(c, pos.snapshot)->equiv;
 
-               w->have_inode   = !ret;
-               w->cur_inum     = inum;
-               w->first_this_inode = true;
-       } else {
+       if (pos.inode == w->cur_inum) {
                w->first_this_inode = false;
+               goto lookup_snapshot;
        }
 
-       return 0;
+       w->nr = 0;
+
+       for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, pos.inode),
+                          BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+               if (k.k->p.offset != pos.inode)
+                       break;
+
+               if (k.k->type == KEY_TYPE_inode)
+                       add_inode(c, w, bkey_s_c_to_inode(k));
+       }
+       bch2_trans_iter_exit(trans, &iter);
+
+       if (ret)
+               return ret;
+
+       w->cur_inum             = pos.inode;
+       w->first_this_inode     = true;
+lookup_snapshot:
+       for (i = 0; i < w->nr; i++)
+               if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->d[i].snapshot))
+                       goto found;
+       return INT_MAX;
+found:
+       BUG_ON(pos.snapshot > w->d[i].snapshot);
+
+       if (pos.snapshot != w->d[i].snapshot) {
+               ancestor_pos = i;
+
+               while (i && w->d[i - 1].snapshot > pos.snapshot)
+                       --i;
+
+               ret = inode_walker_realloc(w);
+               if (ret)
+                       return ret;
+
+               array_insert_item(w->d, w->nr, i, w->d[ancestor_pos]);
+               w->d[i].snapshot = pos.snapshot;
+               w->d[i].count   = 0;
+       }
+
+       return i;
 }
 
 static int walk_inode(struct btree_trans *trans,
-                     struct inode_walker *w, u64 inum)
+                     struct inode_walker *w, struct bpos pos)
+{
+       return lockrestart_do(trans, __walk_inode(trans, w, pos));
+}
+
+static int __get_visible_inodes(struct btree_trans *trans,
+                               struct inode_walker *w,
+                               struct snapshots_seen *s,
+                               u64 inum)
 {
-       return lockrestart_do(trans, __walk_inode(trans, w, inum));
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
+
+       w->nr = 0;
+
+       for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum),
+                          BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+               if (k.k->p.offset != inum)
+                       break;
+
+               if (k.k->type != KEY_TYPE_inode)
+                       continue;
+
+               if (ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) {
+                       add_inode(c, w, bkey_s_c_to_inode(k));
+                       if (k.k->p.snapshot >= s->pos.snapshot)
+                               break;
+               }
+       }
+       bch2_trans_iter_exit(trans, &iter);
+
+       return ret;
+}
+
+static int check_key_has_snapshot(struct btree_trans *trans,
+                                 struct btree_iter *iter,
+                                 struct bkey_s_c k)
+{
+       struct bch_fs *c = trans->c;
+       char buf[200];
+       int ret = 0;
+
+       if (fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c,
+                       "key in missing snapshot: %s",
+                       (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) {
+               ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
+                       bch2_btree_delete_at(trans, iter,
+                                            BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
+               return ret ?: -EINTR;
+       }
+fsck_err:
+       return ret;
 }
 
 static int hash_redo_key(struct btree_trans *trans,
@@ -300,6 +729,9 @@ static int hash_redo_key(struct btree_trans *trans,
                         struct bch_hash_info *hash_info,
                         struct btree_iter *k_iter, struct bkey_s_c k)
 {
+       bch_err(trans->c, "hash_redo_key() not implemented yet");
+       return -EINVAL;
+#if 0
        struct bkey_i *delete;
        struct bkey_i *tmp;
 
@@ -318,6 +750,7 @@ static int hash_redo_key(struct btree_trans *trans,
        return  bch2_btree_iter_traverse(k_iter) ?:
                bch2_trans_update(trans, k_iter, delete, 0) ?:
                bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0);
+#endif
 }
 
 static int fsck_hash_delete_at(struct btree_trans *trans,
@@ -327,7 +760,7 @@ static int fsck_hash_delete_at(struct btree_trans *trans,
 {
        int ret;
 retry:
-       ret   = bch2_hash_delete_at(trans, desc, info, iter) ?:
+       ret   = bch2_hash_delete_at(trans, desc, info, iter, 0) ?:
                bch2_trans_commit(trans, NULL, NULL,
                                  BTREE_INSERT_NOFAIL|
                                  BTREE_INSERT_LAZY_RW);
@@ -409,30 +842,29 @@ fsck_err:
 
 static int check_inode(struct btree_trans *trans,
                       struct btree_iter *iter,
-                      struct bkey_s_c_inode inode)
+                      struct bch_inode_unpacked *prev,
+                      struct bch_inode_unpacked u)
 {
        struct bch_fs *c = trans->c;
-       struct bch_inode_unpacked u;
        bool do_update = false;
        int ret = 0;
 
-       ret = bch2_inode_unpack(inode, &u);
-
-       if (bch2_fs_inconsistent_on(ret, c,
-                        "error unpacking inode %llu in fsck",
-                        inode.k->p.inode))
-               return ret;
+       if (fsck_err_on(prev &&
+                       (prev->bi_hash_seed             != u.bi_hash_seed ||
+                        mode_to_type(prev->bi_mode) != mode_to_type(u.bi_mode)), c,
+                       "inodes in different snapshots don't match")) {
+               bch_err(c, "repair not implemented yet");
+               return -EINVAL;
+       }
 
        if (u.bi_flags & BCH_INODE_UNLINKED &&
            (!c->sb.clean ||
             fsck_err(c, "filesystem marked clean, but inode %llu unlinked",
                      u.bi_inum))) {
-               bch_verbose(c, "deleting inode %llu", u.bi_inum);
-
                bch2_trans_unlock(trans);
                bch2_fs_lazy_rw(c);
 
-               ret = bch2_inode_rm(c, u.bi_inum, false);
+               ret = fsck_inode_rm(trans, u.bi_inum, iter->pos.snapshot);
                if (ret)
                        bch_err(c, "error in fsck: error %i while deleting inode", ret);
                return ret;
@@ -452,9 +884,10 @@ static int check_inode(struct btree_trans *trans,
                 * just switch units to bytes and that issue goes away
                 */
                ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
-                               POS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9),
+                               SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
+                                    iter->pos.snapshot),
                                POS(u.bi_inum, U64_MAX),
-                               NULL);
+                               0, NULL);
                if (ret) {
                        bch_err(c, "error in fsck: error %i truncating inode", ret);
                        return ret;
@@ -479,7 +912,7 @@ static int check_inode(struct btree_trans *trans,
                bch_verbose(c, "recounting sectors for inode %llu",
                            u.bi_inum);
 
-               sectors = bch2_count_inode_sectors(trans, u.bi_inum);
+               sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
                if (sectors < 0) {
                        bch_err(c, "error in fsck: error %i recounting inode sectors",
                                (int) sectors);
@@ -499,11 +932,7 @@ static int check_inode(struct btree_trans *trans,
        }
 
        if (do_update) {
-               ret = __bch2_trans_do(trans, NULL, NULL,
-                                     BTREE_INSERT_NOFAIL|
-                                     BTREE_INSERT_LAZY_RW,
-                               bch2_btree_iter_traverse(iter) ?:
-                               bch2_inode_write(trans, iter, &u));
+               ret = write_inode(trans, &u, iter->pos.snapshot);
                if (ret)
                        bch_err(c, "error in fsck: error %i "
                                "updating inode", ret);
@@ -519,26 +948,49 @@ static int check_inodes(struct bch_fs *c, bool full)
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_s_c_inode inode;
+       struct bch_inode_unpacked prev, u;
        int ret;
 
+       memset(&prev, 0, sizeof(prev));
+
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
        for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
                           BTREE_ITER_INTENT|
-                          BTREE_ITER_PREFETCH, k, ret) {
+                          BTREE_ITER_PREFETCH|
+                          BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+               ret = check_key_has_snapshot(&trans, &iter, k);
+               if (ret)
+                       break;
+
+               /*
+                * if snapshot id isn't a leaf node, skip it - deletion in
+                * particular is not atomic, so on the internal snapshot nodes
+                * we can see inodes marked for deletion after a clean shutdown
+                */
+               if (bch2_snapshot_internal_node(c, k.k->p.snapshot))
+                       continue;
+
                if (k.k->type != KEY_TYPE_inode)
                        continue;
 
                inode = bkey_s_c_to_inode(k);
 
-               if (full ||
-                   (inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY|
-                                         BCH_INODE_I_SECTORS_DIRTY|
-                                         BCH_INODE_UNLINKED))) {
-                       ret = check_inode(&trans, &iter, inode);
-                       if (ret)
-                               break;
-               }
+               if (!full &&
+                   !(inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY|
+                                          BCH_INODE_I_SECTORS_DIRTY|
+                                          BCH_INODE_UNLINKED)))
+                       continue;
+
+               BUG_ON(bch2_inode_unpack(inode, &u));
+
+               ret = check_inode(&trans, &iter,
+                                 full && prev.bi_inum == u.bi_inum
+                                 ? &prev : NULL, u);
+               if (ret)
+                       break;
+
+               prev = u;
        }
        bch2_trans_iter_exit(&trans, &iter);
 
@@ -547,6 +999,29 @@ static int check_inodes(struct bch_fs *c, bool full)
        return bch2_trans_exit(&trans) ?: ret;
 }
 
+noinline_for_stack
+static int check_subvols(struct bch_fs *c)
+{
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
+
+       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
+
+       for_each_btree_key(&trans, iter, BTREE_ID_subvolumes, POS_MIN,
+                          0, k, ret) {
+       }
+       bch2_trans_iter_exit(&trans, &iter);
+
+       bch2_trans_exit(&trans);
+       return ret;
+}
+
+/*
+ * Checking for overlapping extents needs to be reimplemented
+ */
+#if 0
 static int fix_overlapping_extent(struct btree_trans *trans,
                                       struct bkey_s_c k, struct bpos cut_at)
 {
@@ -559,59 +1034,199 @@ static int fix_overlapping_extent(struct btree_trans *trans,
        if (ret)
                return ret;
 
-       bkey_reassemble(u, k);
-       bch2_cut_front(cut_at, u);
+       bkey_reassemble(u, k);
+       bch2_cut_front(cut_at, u);
+
+
+       /*
+        * We don't want to go through the extent_handle_overwrites path:
+        *
+        * XXX: this is going to screw up disk accounting, extent triggers
+        * assume things about extent overwrites - we should be running the
+        * triggers manually here
+        */
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, u->k.p,
+                            BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS);
+
+       BUG_ON(iter.flags & BTREE_ITER_IS_EXTENTS);
+       ret   = bch2_btree_iter_traverse(&iter) ?:
+               bch2_trans_update(trans, &iter, u, BTREE_TRIGGER_NORUN) ?:
+               bch2_trans_commit(trans, NULL, NULL,
+                                 BTREE_INSERT_NOFAIL|
+                                 BTREE_INSERT_LAZY_RW);
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+#endif
+
+static int inode_backpointer_exists(struct btree_trans *trans,
+                                   struct bch_inode_unpacked *inode,
+                                   u32 snapshot)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents,
+                       SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot), 0);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto out;
+       if (k.k->type != KEY_TYPE_dirent)
+               goto out;
+
+       ret = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum) == inode->bi_inum;
+out:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+static bool inode_backpointer_matches(struct bkey_s_c_dirent d,
+                                     struct bch_inode_unpacked *inode)
+{
+       return d.k->p.inode == inode->bi_dir &&
+               d.k->p.offset == inode->bi_dir_offset;
+}
+
+static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
+{
+       struct bch_fs *c = trans->c;
+       struct inode_walker_entry *i;
+       int ret = 0, ret2 = 0;
+       s64 count2;
+
+       for (i = w->d; i < w->d + w->nr; i++) {
+               if (i->inode.bi_sectors == i->count)
+                       continue;
+
+               count2 = lockrestart_do(trans,
+                       bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot));
+
+               if (i->count != count2) {
+                       bch_err(c, "fsck counted i_sectors wrong: got %llu should be %llu",
+                               i->count, count2);
+                       i->count = count2;
+                       if (i->inode.bi_sectors == i->count)
+                               continue;
+               }
+
+               if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c,
+                           "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
+                           w->cur_inum, i->snapshot,
+                           i->inode.bi_sectors, i->count) == FSCK_ERR_IGNORE)
+                       continue;
+
+               i->inode.bi_sectors = i->count;
+               ret = write_inode(trans, &i->inode, i->snapshot);
+               if (ret)
+                       break;
+               ret2 = -EINTR;
+       }
+fsck_err:
+       return ret ?: ret2;
+}
+
+static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
+                       struct inode_walker *inode,
+                       struct snapshots_seen *s)
+{
+       struct bch_fs *c = trans->c;
+       struct bkey_s_c k;
+       struct inode_walker_entry *i;
+       char buf[200];
+       int ret = 0;
+
+       k = bch2_btree_iter_peek(iter);
+       if (!k.k)
+               return 0;
+
+       ret = bkey_err(k);
+       if (ret)
+               return ret;
+
+       ret = check_key_has_snapshot(trans, iter, k);
+       if (ret)
+               return ret;
+
+       ret = snapshots_seen_update(c, s, k.k->p);
+       if (ret)
+               return ret;
+
+       if (k.k->type == KEY_TYPE_whiteout)
+               return 0;
+
+       if (inode->cur_inum != k.k->p.inode) {
+               ret = check_i_sectors(trans, inode);
+               if (ret)
+                       return ret;
+       }
+#if 0
+       if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) {
+               char buf1[200];
+               char buf2[200];
+
+               bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k));
+               bch2_bkey_val_to_text(&PBUF(buf2), c, k);
+
+               if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2))
+                       return fix_overlapping_extent(trans, k, prev.k->k.p) ?: -EINTR;
+       }
+#endif
+       ret = __walk_inode(trans, inode, k.k->p);
+       if (ret < 0)
+               return ret;
 
+       if (fsck_err_on(ret == INT_MAX, c,
+                       "extent in missing inode:\n  %s",
+                       (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
+               return __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
+                       bch2_btree_delete_at(trans, iter,
+                                            BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
 
-       /*
-        * We don't want to go through the extent_handle_overwrites path:
-        *
-        * XXX: this is going to screw up disk accounting, extent triggers
-        * assume things about extent overwrites - we should be running the
-        * triggers manually here
-        */
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, u->k.p,
-                            BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS);
+       if (ret == INT_MAX)
+               return 0;
 
-       BUG_ON(iter.flags & BTREE_ITER_IS_EXTENTS);
-       ret   = bch2_btree_iter_traverse(&iter) ?:
-               bch2_trans_update(trans, &iter, u, BTREE_TRIGGER_NORUN) ?:
-               bch2_trans_commit(trans, NULL, NULL,
-                                 BTREE_INSERT_NOFAIL|
-                                 BTREE_INSERT_LAZY_RW);
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
+       i = inode->d + ret;
+       ret = 0;
 
-static int inode_backpointer_exists(struct btree_trans *trans,
-                                   struct bch_inode_unpacked *inode)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       int ret;
+       if (fsck_err_on(!S_ISREG(i->inode.bi_mode) &&
+                       !S_ISLNK(i->inode.bi_mode), c,
+                       "extent in non regular inode mode %o:\n  %s",
+                       i->inode.bi_mode,
+                       (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
+               return __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
+                        bch2_btree_delete_at(trans, iter,
+                                             BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
+
+       if (!bch2_snapshot_internal_node(c, k.k->p.snapshot)) {
+               for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) {
+                       if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
+                                       k.k->type != KEY_TYPE_reservation &&
+                                       k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9, c,
+                                       "extent type %u offset %llu past end of inode %llu, i_size %llu",
+                                       k.k->type, k.k->p.offset, k.k->p.inode, i->inode.bi_size)) {
+                               bch2_fs_lazy_rw(c);
+                               return bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
+                                               SPOS(k.k->p.inode, round_up(i->inode.bi_size, block_bytes(c)) >> 9,
+                                                    k.k->p.snapshot),
+                                               POS(k.k->p.inode, U64_MAX),
+                                               0, NULL) ?: -EINTR;
+                       }
+               }
+       }
 
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents,
-                            POS(inode->bi_dir, inode->bi_dir_offset), 0);
-       k = bch2_btree_iter_peek_slot(&iter);
-       ret = bkey_err(k);
-       if (ret)
-               goto out;
-       if (k.k->type != KEY_TYPE_dirent)
-               goto out;
+       if (bkey_extent_is_allocation(k.k))
+               for_each_visible_inode(c, s, inode, k.k->p.snapshot, i)
+                       i->count += k.k->size;
+#if 0
+       bch2_bkey_buf_reassemble(&prev, c, k);
+#endif
 
-       ret = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum) == inode->bi_inum;
-out:
-       bch2_trans_iter_exit(trans, &iter);
+fsck_err:
        return ret;
 }
 
-static bool inode_backpointer_matches(struct bkey_s_c_dirent d,
-                                     struct bch_inode_unpacked *inode)
-{
-       return d.k->p.inode == inode->bi_dir &&
-               d.k->p.offset == inode->bi_dir_offset;
-}
-
 /*
  * Walk extents: verify that extents have a corresponding S_ISREG inode, and
  * that i_size an i_sectors are consistent
@@ -620,15 +1235,17 @@ noinline_for_stack
 static int check_extents(struct bch_fs *c)
 {
        struct inode_walker w = inode_walker_init();
+       struct snapshots_seen s;
        struct btree_trans trans;
        struct btree_iter iter;
-       struct bkey_s_c k;
-       struct bkey_buf prev;
-       u64 i_sectors = 0;
        int ret = 0;
 
+#if 0
+       struct bkey_buf prev;
        bch2_bkey_buf_init(&prev);
        prev.k->k = KEY(0, 0, 0);
+#endif
+       snapshots_seen_init(&s);
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
        bch_verbose(c, "checking extents");
@@ -636,95 +1253,172 @@ static int check_extents(struct bch_fs *c)
        bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
                             POS(BCACHEFS_ROOT_INO, 0),
                             BTREE_ITER_INTENT|
-                            BTREE_ITER_PREFETCH);
-retry:
-       while ((k = bch2_btree_iter_peek(&iter)).k &&
-              !(ret = bkey_err(k))) {
-               if (w.have_inode &&
-                   w.cur_inum != k.k->p.inode &&
-                   !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) &&
-                   fsck_err_on(w.inode.bi_sectors != i_sectors, c,
-                               "inode %llu has incorrect i_sectors: got %llu, should be %llu",
-                               w.inode.bi_inum,
-                               w.inode.bi_sectors, i_sectors)) {
-                       w.inode.bi_sectors = i_sectors;
-
-                       ret = write_inode(&trans, &w.inode, w.snapshot);
+                            BTREE_ITER_PREFETCH|
+                            BTREE_ITER_ALL_SNAPSHOTS);
+
+       do {
+               ret = lockrestart_do(&trans,
+                       check_extent(&trans, &iter, &w, &s));
+               if (ret)
+                       break;
+       } while (bch2_btree_iter_advance(&iter));
+       bch2_trans_iter_exit(&trans, &iter);
+#if 0
+       bch2_bkey_buf_exit(&prev, c);
+#endif
+       inode_walker_exit(&w);
+       bch2_trans_exit(&trans);
+       snapshots_seen_exit(&s);
+
+       return ret;
+}
+
+static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
+{
+       struct bch_fs *c = trans->c;
+       struct inode_walker_entry *i;
+       int ret = 0, ret2 = 0;
+       s64 count2;
+
+       for (i = w->d; i < w->d + w->nr; i++) {
+               if (i->inode.bi_nlink == i->count)
+                       continue;
+
+               count2 = lockrestart_do(trans,
+                               bch2_count_subdirs(trans, w->cur_inum, i->snapshot));
+
+               if (i->count != count2) {
+                       bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu",
+                               i->count, count2);
+                       i->count = count2;
+                       if (i->inode.bi_nlink == i->count)
+                               continue;
+               }
+
+               if (fsck_err_on(i->inode.bi_nlink != i->count, c,
+                               "directory %llu:%u with wrong i_nlink: got %u, should be %llu",
+                               w->cur_inum, i->snapshot, i->inode.bi_nlink, i->count)) {
+                       i->inode.bi_nlink = i->count;
+                       ret = write_inode(trans, &i->inode, i->snapshot);
                        if (ret)
                                break;
+                       ret2 = -EINTR;
                }
+       }
+fsck_err:
+       return ret ?: ret2;
+}
+
+static int check_dirent_target(struct btree_trans *trans,
+                              struct btree_iter *iter,
+                              struct bkey_s_c_dirent d,
+                              struct bch_inode_unpacked *target,
+                              u32 target_snapshot)
+{
+       struct bch_fs *c = trans->c;
+       bool backpointer_exists = true;
+       char buf[200];
+       int ret = 0;
+
+       if (!target->bi_dir &&
+           !target->bi_dir_offset) {
+               target->bi_dir          = d.k->p.inode;
+               target->bi_dir_offset   = d.k->p.offset;
 
-               if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) {
-                       char buf1[200];
-                       char buf2[200];
+               ret = write_inode(trans, target, target_snapshot);
+               if (ret)
+                       goto err;
+       }
+
+       if (!inode_backpointer_matches(d, target)) {
+               ret = inode_backpointer_exists(trans, target, d.k->p.snapshot);
+               if (ret < 0)
+                       goto err;
 
-                       bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k));
-                       bch2_bkey_val_to_text(&PBUF(buf2), c, k);
+               backpointer_exists = ret;
+               ret = 0;
 
-                       if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2))
-                               return fix_overlapping_extent(&trans, k, prev.k->k.p) ?: -EINTR;
+               if (fsck_err_on(S_ISDIR(target->bi_mode) &&
+                               backpointer_exists, c,
+                               "directory %llu with multiple links",
+                               target->bi_inum)) {
+                       ret = remove_dirent(trans, d.k->p);
+                       if (ret)
+                               goto err;
+                       return 0;
                }
 
-               ret = walk_inode(&trans, &w, k.k->p.inode);
-               if (ret)
-                       break;
+               if (fsck_err_on(backpointer_exists &&
+                               !target->bi_nlink, c,
+                               "inode %llu has multiple links but i_nlink 0",
+                               target->bi_inum)) {
+                       target->bi_nlink++;
+                       target->bi_flags &= ~BCH_INODE_UNLINKED;
 
-               if (w.first_this_inode)
-                       i_sectors = 0;
-
-               if (fsck_err_on(!w.have_inode, c,
-                               "extent type %u for missing inode %llu",
-                               k.k->type, k.k->p.inode) ||
-                   fsck_err_on(w.have_inode &&
-                               !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c,
-                               "extent type %u for non regular file, inode %llu mode %o",
-                               k.k->type, k.k->p.inode, w.inode.bi_mode)) {
-                       bch2_fs_lazy_rw(c);
-                       return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents,
-                                                      POS(k.k->p.inode, 0),
-                                                      POS(k.k->p.inode, U64_MAX),
-                                                      NULL) ?: -EINTR;
+                       ret = write_inode(trans, target, target_snapshot);
+                       if (ret)
+                               goto err;
+               }
+
+               if (fsck_err_on(!backpointer_exists, c,
+                               "inode %llu has wrong backpointer:\n"
+                               "got       %llu:%llu\n"
+                               "should be %llu:%llu",
+                               target->bi_inum,
+                               target->bi_dir,
+                               target->bi_dir_offset,
+                               d.k->p.inode,
+                               d.k->p.offset)) {
+                       target->bi_dir          = d.k->p.inode;
+                       target->bi_dir_offset   = d.k->p.offset;
+
+                       ret = write_inode(trans, target, target_snapshot);
+                       if (ret)
+                               goto err;
                }
+       }
+
+       if (fsck_err_on(vfs_d_type(d.v->d_type) != mode_to_type(target->bi_mode), c,
+                       "incorrect d_type: should be %u:\n%s",
+                       mode_to_type(target->bi_mode),
+                       (bch2_bkey_val_to_text(&PBUF(buf), c, d.s_c), buf))) {
+               struct bkey_i_dirent *n;
 
-               if (fsck_err_on(w.have_inode &&
-                               !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
-                               k.k->type != KEY_TYPE_reservation &&
-                               k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c,
-                               "extent type %u offset %llu past end of inode %llu, i_size %llu",
-                               k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) {
-                       bch2_fs_lazy_rw(c);
-                       return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents,
-                                       POS(k.k->p.inode, round_up(w.inode.bi_size, block_bytes(c)) >> 9),
-                                       POS(k.k->p.inode, U64_MAX),
-                                       NULL) ?: -EINTR;
+               n = kmalloc(bkey_bytes(d.k), GFP_KERNEL);
+               if (!n) {
+                       ret = -ENOMEM;
+                       goto err;
                }
 
-               if (bkey_extent_is_allocation(k.k))
-                       i_sectors += k.k->size;
-               bch2_bkey_buf_reassemble(&prev, c, k);
+               bkey_reassemble(&n->k_i, d.s_c);
+               n->v.d_type = mode_to_type(target->bi_mode);
 
-               bch2_btree_iter_advance(&iter);
+               ret = __bch2_trans_do(trans, NULL, NULL,
+                                     BTREE_INSERT_NOFAIL|
+                                     BTREE_INSERT_LAZY_RW,
+                       bch2_trans_update(trans, iter, &n->k_i, 0));
+               kfree(n);
+               if (ret)
+                       goto err;
        }
+err:
 fsck_err:
-       if (ret == -EINTR)
-               goto retry;
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_bkey_buf_exit(&prev, c);
-       return bch2_trans_exit(&trans) ?: ret;
+       return ret;
 }
 
 static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
                        struct bch_hash_info *hash_info,
-                       struct inode_walker *w, unsigned *nr_subdirs)
+                       struct inode_walker *dir,
+                       struct inode_walker *target,
+                       struct snapshots_seen *s)
 {
        struct bch_fs *c = trans->c;
        struct bkey_s_c k;
        struct bkey_s_c_dirent d;
-       struct bch_inode_unpacked target;
+       struct inode_walker_entry *i;
        u32 target_snapshot;
-       bool have_target;
-       bool backpointer_exists = true;
-       u64 d_inum;
+       u32 target_subvol;
+       u64 target_inum;
        char buf[200];
        int ret;
 
@@ -736,38 +1430,49 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
        if (ret)
                return ret;
 
-       if (w->have_inode &&
-           w->cur_inum != k.k->p.inode &&
-           fsck_err_on(w->inode.bi_nlink != *nr_subdirs, c,
-                       "directory %llu with wrong i_nlink: got %u, should be %u",
-                       w->inode.bi_inum, w->inode.bi_nlink, *nr_subdirs)) {
-               w->inode.bi_nlink = *nr_subdirs;
-               ret = write_inode(trans, &w->inode, w->snapshot);
-               return ret ?: -EINTR;
-       }
+       ret = check_key_has_snapshot(trans, iter, k);
+       if (ret)
+               return ret;
 
-       ret = __walk_inode(trans, w, k.k->p.inode);
+       ret = snapshots_seen_update(c, s, k.k->p);
        if (ret)
                return ret;
 
-       if (w->first_this_inode)
-               *nr_subdirs = 0;
+       if (k.k->type == KEY_TYPE_whiteout)
+               return 0;
+
+       if (dir->cur_inum != k.k->p.inode) {
+               ret = check_subdir_count(trans, dir);
+               if (ret)
+                       return ret;
+       }
+
+       ret = __walk_inode(trans, dir, k.k->p);
+       if (ret < 0)
+               return ret;
 
-       if (fsck_err_on(!w->have_inode, c,
+       if (fsck_err_on(ret == INT_MAX, c,
                        "dirent in nonexisting directory:\n%s",
-                       (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)) ||
-           fsck_err_on(!S_ISDIR(w->inode.bi_mode), c,
+                       (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
+               return __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
+                               bch2_btree_delete_at(trans, iter,
+                                                    BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
+
+       if (ret == INT_MAX)
+               return 0;
+
+       i = dir->d + ret;
+       ret = 0;
+
+       if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c,
                        "dirent in non directory inode type %u:\n%s",
-                       mode_to_type(w->inode.bi_mode),
+                       mode_to_type(i->inode.bi_mode),
                        (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
                return __bch2_trans_do(trans, NULL, NULL, 0,
                                bch2_btree_delete_at(trans, iter, 0));
 
-       if (!w->have_inode)
-               return 0;
-
-       if (w->first_this_inode)
-               *hash_info = bch2_hash_info_init(c, &w->inode);
+       if (dir->first_this_inode)
+               *hash_info = bch2_hash_info_init(c, &dir->d[0].inode);
 
        ret = hash_check_key(trans, bch2_dirent_hash_desc,
                             hash_info, iter, k);
@@ -780,105 +1485,76 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
                return 0;
 
        d = bkey_s_c_to_dirent(k);
-       d_inum = le64_to_cpu(d.v->d_inum);
 
-       ret = __lookup_inode(trans, d_inum, &target, &target_snapshot);
+       ret = __bch2_dirent_read_target(trans, d,
+                                       &target_subvol,
+                                       &target_snapshot,
+                                       &target_inum,
+                                       true);
        if (ret && ret != -ENOENT)
                return ret;
 
-       have_target = !ret;
-       ret = 0;
-
-       if (fsck_err_on(!have_target, c,
-                       "dirent points to missing inode:\n%s",
-                       (bch2_bkey_val_to_text(&PBUF(buf), c,
-                                              k), buf)))
+       if (fsck_err_on(ret, c,
+                       "dirent points to missing subvolume %llu",
+                       le64_to_cpu(d.v->d_inum)))
                return remove_dirent(trans, d.k->p);
 
-       if (!have_target)
-               return 0;
-
-       if (!target.bi_dir &&
-           !target.bi_dir_offset) {
-               target.bi_dir           = k.k->p.inode;
-               target.bi_dir_offset    = k.k->p.offset;
-
-               ret = __write_inode(trans, &target, target_snapshot) ?:
-                       bch2_trans_commit(trans, NULL, NULL,
-                                         BTREE_INSERT_NOFAIL|
-                                         BTREE_INSERT_LAZY_RW);
-               if (ret)
-                       return ret;
-               return -EINTR;
-       }
+       if (target_subvol) {
+               struct bch_inode_unpacked subvol_root;
 
-       if (!inode_backpointer_matches(d, &target)) {
-               ret = inode_backpointer_exists(trans, &target);
-               if (ret < 0)
+               ret = __lookup_inode(trans, target_inum,
+                                  &subvol_root, &target_snapshot);
+               if (ret && ret != -ENOENT)
                        return ret;
 
-               backpointer_exists = ret;
-               ret = 0;
+               if (fsck_err_on(ret, c,
+                               "subvolume %u points to missing subvolume root %llu",
+                               target_subvol,
+                               target_inum)) {
+                       bch_err(c, "repair not implemented yet");
+                       return -EINVAL;
+               }
 
-               if (fsck_err_on(S_ISDIR(target.bi_mode) &&
-                               backpointer_exists, c,
-                               "directory %llu with multiple links",
-                               target.bi_inum))
-                       return remove_dirent(trans, d.k->p);
+               if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c,
+                               "subvol root %llu has wrong bi_subvol field: got %u, should be %u",
+                               target_inum,
+                               subvol_root.bi_subvol, target_subvol)) {
+                       subvol_root.bi_subvol = target_subvol;
+                       ret = write_inode(trans, &subvol_root, target_snapshot);
+                       if (ret)
+                               return ret;
+               }
 
-               if (fsck_err_on(backpointer_exists &&
-                               !target.bi_nlink, c,
-                               "inode %llu has multiple links but i_nlink 0",
-                               d_inum)) {
-                       target.bi_nlink++;
-                       target.bi_flags &= ~BCH_INODE_UNLINKED;
+               ret = check_dirent_target(trans, iter, d, &subvol_root,
+                                         target_snapshot);
+               if (ret)
+                       return ret;
+       } else {
+               ret = __get_visible_inodes(trans, target, s, target_inum);
+               if (ret)
+                       return ret;
 
-                       ret = write_inode(trans, &target, target_snapshot);
-                       return ret ?: -EINTR;
+               if (fsck_err_on(!target->nr, c,
+                               "dirent points to missing inode:\n%s",
+                               (bch2_bkey_val_to_text(&PBUF(buf), c,
+                                                      k), buf))) {
+                       ret = remove_dirent(trans, d.k->p);
+                       if (ret)
+                               return ret;
                }
 
-               if (fsck_err_on(!backpointer_exists, c,
-                               "inode %llu has wrong backpointer:\n"
-                               "got       %llu:%llu\n"
-                               "should be %llu:%llu",
-                               d_inum,
-                               target.bi_dir,
-                               target.bi_dir_offset,
-                               k.k->p.inode,
-                               k.k->p.offset)) {
-                       target.bi_dir           = k.k->p.inode;
-                       target.bi_dir_offset    = k.k->p.offset;
-
-                       ret = write_inode(trans, &target, target_snapshot);
-                       return ret ?: -EINTR;
+               for (i = target->d; i < target->d + target->nr; i++) {
+                       ret = check_dirent_target(trans, iter, d,
+                                                 &i->inode, i->snapshot);
+                       if (ret)
+                               return ret;
                }
        }
 
-       if (fsck_err_on(d.v->d_type != mode_to_type(target.bi_mode), c,
-                       "incorrect d_type: should be %u:\n%s",
-                       mode_to_type(target.bi_mode),
-                       (bch2_bkey_val_to_text(&PBUF(buf), c,
-                                              k), buf))) {
-               struct bkey_i_dirent *n;
-
-               n = kmalloc(bkey_bytes(d.k), GFP_KERNEL);
-               if (!n)
-                       return -ENOMEM;
-
-               bkey_reassemble(&n->k_i, d.s_c);
-               n->v.d_type = mode_to_type(target.bi_mode);
-
-               ret = __bch2_trans_do(trans, NULL, NULL,
-                                     BTREE_INSERT_NOFAIL|
-                                     BTREE_INSERT_LAZY_RW,
-                       bch2_btree_iter_traverse(iter) ?:
-                       bch2_trans_update(trans, iter, &n->k_i, 0));
-               kfree(n);
-               return ret ?: -EINTR;
-       }
+       if (d.v->d_type == DT_DIR)
+               for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
+                       i->count++;
 
-       *nr_subdirs += d.v->d_type == DT_DIR;
-       return 0;
 fsck_err:
        return ret;
 }
@@ -890,31 +1566,39 @@ fsck_err:
 noinline_for_stack
 static int check_dirents(struct bch_fs *c)
 {
-       struct inode_walker w = inode_walker_init();
+       struct inode_walker dir = inode_walker_init();
+       struct inode_walker target = inode_walker_init();
+       struct snapshots_seen s;
        struct bch_hash_info hash_info;
        struct btree_trans trans;
        struct btree_iter iter;
-       unsigned nr_subdirs = 0;
        int ret = 0;
 
        bch_verbose(c, "checking dirents");
 
+       snapshots_seen_init(&s);
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
        bch2_trans_iter_init(&trans, &iter, BTREE_ID_dirents,
                             POS(BCACHEFS_ROOT_INO, 0),
                             BTREE_ITER_INTENT|
-                            BTREE_ITER_PREFETCH);
+                            BTREE_ITER_PREFETCH|
+                            BTREE_ITER_ALL_SNAPSHOTS);
 
        do {
                ret = lockrestart_do(&trans,
-                               check_dirent(&trans, &iter, &hash_info, &w, &nr_subdirs));
+                       check_dirent(&trans, &iter, &hash_info,
+                                    &dir, &target, &s));
                if (ret)
                        break;
        } while (bch2_btree_iter_advance(&iter));
        bch2_trans_iter_exit(&trans, &iter);
 
-       return bch2_trans_exit(&trans) ?: ret;
+       bch2_trans_exit(&trans);
+       snapshots_seen_exit(&s);
+       inode_walker_exit(&dir);
+       inode_walker_exit(&target);
+       return ret;
 }
 
 /*
@@ -937,15 +1621,22 @@ static int check_xattrs(struct bch_fs *c)
        bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
                             POS(BCACHEFS_ROOT_INO, 0),
                             BTREE_ITER_INTENT|
-                            BTREE_ITER_PREFETCH);
+                            BTREE_ITER_PREFETCH|
+                            BTREE_ITER_ALL_SNAPSHOTS);
 retry:
+       bch2_trans_begin(&trans);
+
        while ((k = bch2_btree_iter_peek(&iter)).k &&
               !(ret = bkey_err(k))) {
-               ret = walk_inode(&trans, &w, k.k->p.inode);
+               ret = check_key_has_snapshot(&trans, &iter, k);
                if (ret)
                        break;
 
-               if (fsck_err_on(!w.have_inode, c,
+               ret = walk_inode(&trans, &w, k.k->p);
+               if (ret < 0)
+                       break;
+
+               if (fsck_err_on(ret == INT_MAX, c,
                                "xattr for missing inode %llu",
                                k.k->p.inode)) {
                        ret = bch2_btree_delete_at(&trans, &iter, 0);
@@ -954,14 +1645,18 @@ retry:
                        continue;
                }
 
-               if (w.first_this_inode && w.have_inode)
-                       hash_info = bch2_hash_info_init(c, &w.inode);
+               if (ret == INT_MAX)
+                       goto next;
+               ret = 0;
+
+               if (w.first_this_inode)
+                       hash_info = bch2_hash_info_init(c, &w.d[0].inode);
 
                ret = hash_check_key(&trans, bch2_xattr_hash_desc,
                                     &hash_info, &iter, k);
                if (ret)
                        break;
-
+next:
                bch2_btree_iter_advance(&iter);
        }
 fsck_err:
@@ -973,40 +1668,63 @@ fsck_err:
 }
 
 /* Get root directory, create if it doesn't exist: */
-static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode)
+static int check_root(struct bch_fs *c)
 {
-       struct bkey_inode_buf packed;
+       struct btree_trans trans;
+       struct bch_inode_unpacked root_inode;
        u32 snapshot;
+       u64 inum;
        int ret;
 
+       bch2_trans_init(&trans, c, 0, 0);
+
        bch_verbose(c, "checking root directory");
 
-       ret = bch2_trans_do(c, NULL, NULL, 0,
-               lookup_inode(&trans, BCACHEFS_ROOT_INO, root_inode, &snapshot));
+       ret = subvol_lookup(&trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
        if (ret && ret != -ENOENT)
                return ret;
 
-       if (fsck_err_on(ret, c, "root directory missing"))
-               goto create_root;
+       if (mustfix_fsck_err_on(ret, c, "root subvol missing")) {
+               struct bkey_i_subvolume root_subvol;
 
-       if (fsck_err_on(!S_ISDIR(root_inode->bi_mode), c,
-                       "root inode not a directory"))
-               goto create_root;
+               snapshot        = U32_MAX;
+               inum            = BCACHEFS_ROOT_INO;
 
-       return 0;
-fsck_err:
-       return ret;
-create_root:
-       bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|0755,
-                       0, NULL);
-       root_inode->bi_inum = BCACHEFS_ROOT_INO;
+               bkey_subvolume_init(&root_subvol.k_i);
+               root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL;
+               root_subvol.v.flags     = 0;
+               root_subvol.v.snapshot  = cpu_to_le32(snapshot);
+               root_subvol.v.inode     = cpu_to_le64(inum);
+               ret = __bch2_trans_do(&trans, NULL, NULL,
+                                     BTREE_INSERT_NOFAIL|
+                                     BTREE_INSERT_LAZY_RW,
+                       __bch2_btree_insert(&trans, BTREE_ID_subvolumes, &root_subvol.k_i));
+               if (ret) {
+                       bch_err(c, "error writing root subvol: %i", ret);
+                       goto err;
+               }
+
+       }
 
-       bch2_inode_pack(c, &packed, root_inode);
+       ret = lookup_inode(&trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
+       if (ret && ret != -ENOENT)
+               return ret;
 
-       return bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i,
-                                NULL, NULL,
-                                BTREE_INSERT_NOFAIL|
-                                BTREE_INSERT_LAZY_RW);
+       if (mustfix_fsck_err_on(ret, c, "root directory missing") ||
+           mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), c,
+                               "root inode not a directory")) {
+               bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
+                               0, NULL);
+               root_inode.bi_inum = inum;
+
+               ret = write_inode(&trans, &root_inode, snapshot);
+               if (ret)
+                       bch_err(c, "error writing root inode: %i", ret);
+       }
+err:
+fsck_err:
+       bch2_trans_exit(&trans);
+       return ret;
 }
 
 struct pathbuf {
@@ -1041,29 +1759,30 @@ static int path_down(struct pathbuf *p, u64 inum)
 
 static int check_path(struct btree_trans *trans,
                      struct pathbuf *p,
-                     struct bch_inode_unpacked *inode)
+                     struct bch_inode_unpacked *inode,
+                     u32 snapshot)
 {
        struct bch_fs *c = trans->c;
-       u32 snapshot;
        size_t i;
        int ret = 0;
 
+       snapshot = snapshot_t(c, snapshot)->equiv;
        p->nr = 0;
 
        while (inode->bi_inum != BCACHEFS_ROOT_INO) {
                ret = lockrestart_do(trans,
-                       inode_backpointer_exists(trans, inode));
+                       inode_backpointer_exists(trans, inode, snapshot));
                if (ret < 0)
                        break;
 
                if (!ret) {
-                       if (fsck_err(c,  "unreachable inode %llu, type %u nlink %u backptr %llu:%llu",
-                                    inode->bi_inum,
+                       if (fsck_err(c,  "unreachable inode %llu:%u, type %u nlink %u backptr %llu:%llu",
+                                    inode->bi_inum, snapshot,
                                     mode_to_type(inode->bi_mode),
                                     inode->bi_nlink,
                                     inode->bi_dir,
                                     inode->bi_dir_offset))
-                               ret = reattach_inode(trans, inode);
+                               ret = reattach_inode(trans, inode, snapshot);
                        break;
                }
                ret = 0;
@@ -1086,13 +1805,13 @@ static int check_path(struct btree_trans *trans,
                                return 0;
 
                        ret = lockrestart_do(trans,
-                                        remove_backpointer(trans, inode));
+                                       remove_backpointer(trans, inode));
                        if (ret) {
                                bch_err(c, "error removing dirent: %i", ret);
                                break;
                        }
 
-                       ret = reattach_inode(trans, inode);
+                       ret = reattach_inode(trans, inode, snapshot);
                        break;
                }
 
@@ -1127,7 +1846,8 @@ static int check_directory_structure(struct bch_fs *c)
 
        for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
                           BTREE_ITER_INTENT|
-                          BTREE_ITER_PREFETCH, k, ret) {
+                          BTREE_ITER_PREFETCH|
+                          BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
                if (k.k->type != KEY_TYPE_inode)
                        continue;
 
@@ -1138,7 +1858,10 @@ static int check_directory_structure(struct bch_fs *c)
                        break;
                }
 
-               ret = check_path(&trans, &path, &u);
+               if (u.bi_flags & BCH_INODE_UNLINKED)
+                       continue;
+
+               ret = check_path(&trans, &path, &u, iter.pos.snapshot);
                if (ret)
                        break;
        }
@@ -1196,8 +1919,9 @@ static int nlink_cmp(const void *_l, const void *_r)
        return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot);
 }
 
-static void inc_link(struct bch_fs *c, struct nlink_table *links,
-                    u64 range_start, u64 range_end, u64 inum)
+static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
+                    struct nlink_table *links,
+                    u64 range_start, u64 range_end, u64 inum, u32 snapshot)
 {
        struct nlink *link, key = {
                .inum = inum, .snapshot = U32_MAX,
@@ -1208,8 +1932,18 @@ static void inc_link(struct bch_fs *c, struct nlink_table *links,
 
        link = __inline_bsearch(&key, links->d, links->nr,
                                sizeof(links->d[0]), nlink_cmp);
-       if (link)
-               link->count++;
+       if (!link)
+               return;
+
+       while (link > links->d && link[0].inum == link[-1].inum)
+               --link;
+
+       for (; link < links->d + links->nr && link->inum == inum; link++)
+               if (ref_visible(c, s, snapshot, link->snapshot)) {
+                       link->count++;
+                       if (link->snapshot >= snapshot)
+                               break;
+               }
 }
 
 noinline_for_stack
@@ -1229,7 +1963,8 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c,
        for_each_btree_key(&trans, iter, BTREE_ID_inodes,
                           POS(0, start),
                           BTREE_ITER_INTENT|
-                          BTREE_ITER_PREFETCH, k, ret) {
+                          BTREE_ITER_PREFETCH|
+                          BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
                if (k.k->type != KEY_TYPE_inode)
                        continue;
 
@@ -1270,23 +2005,33 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links
                                     u64 range_start, u64 range_end)
 {
        struct btree_trans trans;
+       struct snapshots_seen s;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_s_c_dirent d;
        int ret;
 
+       snapshots_seen_init(&s);
+
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
        for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN,
                           BTREE_ITER_INTENT|
-                          BTREE_ITER_PREFETCH, k, ret) {
+                          BTREE_ITER_PREFETCH|
+                          BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+               ret = snapshots_seen_update(c, &s, k.k->p);
+               if (ret)
+                       break;
+
                switch (k.k->type) {
                case KEY_TYPE_dirent:
                        d = bkey_s_c_to_dirent(k);
 
-                       if (d.v->d_type != DT_DIR)
-                               inc_link(c, links, range_start, range_end,
-                                        le64_to_cpu(d.v->d_inum));
+                       if (d.v->d_type != DT_DIR &&
+                           d.v->d_type != DT_SUBVOL)
+                               inc_link(c, &s, links, range_start, range_end,
+                                        le64_to_cpu(d.v->d_inum),
+                                        d.k->p.snapshot);
                        break;
                }
 
@@ -1294,10 +2039,11 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links
        }
        bch2_trans_iter_exit(&trans, &iter);
 
-       ret = bch2_trans_exit(&trans) ?: ret;
        if (ret)
                bch_err(c, "error in fsck: btree error %i while walking dirents", ret);
 
+       bch2_trans_exit(&trans);
+       snapshots_seen_exit(&s);
        return ret;
 }
 
@@ -1319,7 +2065,8 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c,
        for_each_btree_key(&trans, iter, BTREE_ID_inodes,
                           POS(0, range_start),
                           BTREE_ITER_INTENT|
-                          BTREE_ITER_PREFETCH, k, ret) {
+                          BTREE_ITER_PREFETCH|
+                          BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
                if (k.k->p.offset >= range_end)
                        break;
 
@@ -1335,7 +2082,8 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c,
                if (!u.bi_nlink)
                        continue;
 
-               while (link->inum < k.k->p.offset) {
+               while ((cmp_int(link->inum, k.k->p.offset) ?:
+                       cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
                        link++;
                        BUG_ON(link >= links->d + links->nr);
                }
@@ -1408,13 +2156,13 @@ static int check_nlinks(struct bch_fs *c)
  */
 int bch2_fsck_full(struct bch_fs *c)
 {
-       struct bch_inode_unpacked root_inode;
-
-       return  check_inodes(c, true) ?:
+       return  bch2_fs_snapshots_check(c) ?:
+               check_inodes(c, true) ?:
+               check_subvols(c) ?:
                check_extents(c) ?:
                check_dirents(c) ?:
                check_xattrs(c) ?:
-               check_root(c, &root_inode) ?:
+               check_root(c) ?:
                check_directory_structure(c) ?:
                check_nlinks(c);
 }
index 14b0e8c031199aec57938c7224a0542c0c403de7..9130d571e84d8853b02fda35bfc6734ff3d4a802 100644 (file)
@@ -6,8 +6,10 @@
 #include "btree_update.h"
 #include "error.h"
 #include "extents.h"
+#include "extent_update.h"
 #include "inode.h"
 #include "str_hash.h"
+#include "subvolume.h"
 #include "varint.h"
 
 #include <linux/random.h>
@@ -295,15 +297,21 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode,
 int bch2_inode_peek(struct btree_trans *trans,
                    struct btree_iter *iter,
                    struct bch_inode_unpacked *inode,
-                   u64 inum, unsigned flags)
+                   subvol_inum inum, unsigned flags)
 {
        struct bkey_s_c k;
+       u32 snapshot;
        int ret;
 
-       if (trans->c->opts.inodes_use_key_cache)
+       if (0 && trans->c->opts.inodes_use_key_cache)
                flags |= BTREE_ITER_CACHED;
 
-       bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, inum), flags);
+       ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+       if (ret)
+               return ret;
+
+       bch2_trans_iter_init(trans, iter, BTREE_ID_inodes,
+                            SPOS(0, inum.inum, snapshot), flags);
        k = bch2_btree_iter_peek_slot(iter);
        ret = bkey_err(k);
        if (ret)
@@ -340,8 +348,8 @@ int bch2_inode_write(struct btree_trans *trans,
 
 const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
-               struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
-               struct bch_inode_unpacked unpacked;
+       struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
+       struct bch_inode_unpacked unpacked;
 
        if (k.k->p.inode)
                return "nonzero k.p.inode";
@@ -368,6 +376,9 @@ const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k)
            unpacked.bi_nlink != 0)
                return "flagged as unlinked but bi_nlink != 0";
 
+       if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode))
+               return "subvolume root but not a directory";
+
        return NULL;
 }
 
@@ -482,6 +493,9 @@ static inline u32 bkey_generation(struct bkey_s_c k)
        }
 }
 
+/*
+ * This just finds an empty slot:
+ */
 int bch2_inode_create(struct btree_trans *trans,
                      struct btree_iter *iter,
                      struct bch_inode_unpacked *inode_u,
@@ -581,19 +595,77 @@ found_slot:
        return 0;
 }
 
-int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached)
+static int bch2_inode_delete_keys(struct btree_trans *trans,
+                                 subvol_inum inum, enum btree_id id)
+{
+       u64 offset = 0;
+       int ret = 0;
+
+       while (!ret || ret == -EINTR) {
+               struct btree_iter iter;
+               struct bkey_s_c k;
+               struct bkey_i delete;
+               u32 snapshot;
+
+               bch2_trans_begin(trans);
+
+               ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+               if (ret)
+                       continue;
+
+               bch2_trans_iter_init(trans, &iter, id,
+                                    SPOS(inum.inum, offset, snapshot),
+                                    BTREE_ITER_INTENT);
+               k = bch2_btree_iter_peek(&iter);
+
+               if (!k.k || iter.pos.inode != inum.inum) {
+                       bch2_trans_iter_exit(trans, &iter);
+                       break;
+               }
+
+               ret = bkey_err(k);
+               if (ret)
+                       goto err;
+
+               bkey_init(&delete.k);
+               delete.k.p = iter.pos;
+
+               if (btree_node_type_is_extents(iter.btree_id)) {
+                       unsigned max_sectors =
+                               min_t(u64, U64_MAX - iter.pos.offset,
+                                     KEY_SIZE_MAX & (~0 << trans->c->block_bits));
+
+                       /* create the biggest key we can */
+                       bch2_key_resize(&delete.k, max_sectors);
+
+                       ret = bch2_extent_trim_atomic(trans, &iter, &delete);
+                       if (ret)
+                               goto err;
+               }
+
+               ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
+                     bch2_trans_commit(trans, NULL, NULL,
+                                       BTREE_INSERT_NOFAIL);
+err:
+               offset = iter.pos.offset;
+               bch2_trans_iter_exit(trans, &iter);
+       }
+
+       return ret;
+}
+
+int bch2_inode_rm(struct bch_fs *c, subvol_inum inum, bool cached)
 {
        struct btree_trans trans;
        struct btree_iter iter = { NULL };
        struct bkey_i_inode_generation delete;
-       struct bpos start = POS(inode_nr, 0);
-       struct bpos end = POS(inode_nr + 1, 0);
        struct bch_inode_unpacked inode_u;
        struct bkey_s_c k;
        unsigned iter_flags = BTREE_ITER_INTENT;
+       u32 snapshot;
        int ret;
 
-       if (cached && c->opts.inodes_use_key_cache)
+       if (0 && cached && c->opts.inodes_use_key_cache)
                iter_flags |= BTREE_ITER_CACHED;
 
        bch2_trans_init(&trans, c, 0, 1024);
@@ -606,19 +678,20 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached)
         * XXX: the dirent could ideally would delete whiteouts when they're no
         * longer needed
         */
-       ret   = bch2_btree_delete_range_trans(&trans, BTREE_ID_extents,
-                                             start, end, NULL) ?:
-               bch2_btree_delete_range_trans(&trans, BTREE_ID_xattrs,
-                                             start, end, NULL) ?:
-               bch2_btree_delete_range_trans(&trans, BTREE_ID_dirents,
-                                             start, end, NULL);
+       ret   = bch2_inode_delete_keys(&trans, inum, BTREE_ID_extents) ?:
+               bch2_inode_delete_keys(&trans, inum, BTREE_ID_xattrs) ?:
+               bch2_inode_delete_keys(&trans, inum, BTREE_ID_dirents);
        if (ret)
                goto err;
 retry:
        bch2_trans_begin(&trans);
 
+       ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+       if (ret)
+               goto err;
+
        bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes,
-                            POS(0, inode_nr), iter_flags);
+                            SPOS(0, inum.inum, snapshot), iter_flags);
        k = bch2_btree_iter_peek_slot(&iter);
 
        ret = bkey_err(k);
@@ -628,13 +701,20 @@ retry:
        if (k.k->type != KEY_TYPE_inode) {
                bch2_fs_inconsistent(trans.c,
                                     "inode %llu not found when deleting",
-                                    inode_nr);
+                                    inum.inum);
                ret = -EIO;
                goto err;
        }
 
        bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u);
 
+       /* Subvolume root? */
+       if (inode_u.bi_subvol) {
+               ret = bch2_subvolume_delete(&trans, inode_u.bi_subvol, -1);
+               if (ret)
+                       goto err;
+       }
+
        bkey_inode_generation_init(&delete.k_i);
        delete.k.p = iter.pos;
        delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
@@ -651,20 +731,22 @@ err:
        return ret;
 }
 
-static int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr,
+static int bch2_inode_find_by_inum_trans(struct btree_trans *trans,
+                                        subvol_inum inum,
                                         struct bch_inode_unpacked *inode)
 {
-       struct btree_iter iter = { NULL };
+       struct btree_iter iter;
        int ret;
 
-       ret = bch2_inode_peek(trans, &iter, inode, inode_nr, 0);
-       bch2_trans_iter_exit(trans, &iter);
+       ret = bch2_inode_peek(trans, &iter, inode, inum, 0);
+       if (!ret)
+               bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
-int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr,
+int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
                            struct bch_inode_unpacked *inode)
 {
        return bch2_trans_do(c, NULL, NULL, 0,
-               bch2_inode_find_by_inum_trans(&trans, inode_nr, inode));
+               bch2_inode_find_by_inum_trans(&trans, inum, inode));
 }
index 25bef104ebcc5a692a6dcc54c2b105697fadb294..9e84cddcc6cb707461b2810e510849ff0eabd343 100644 (file)
@@ -58,7 +58,7 @@ int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *);
 void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *);
 
 int bch2_inode_peek(struct btree_trans *, struct btree_iter *,
-                   struct bch_inode_unpacked *, u64, unsigned);
+                   struct bch_inode_unpacked *, subvol_inum, unsigned);
 int bch2_inode_write(struct btree_trans *, struct btree_iter *,
                     struct bch_inode_unpacked *);
 
@@ -74,9 +74,10 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
 int bch2_inode_create(struct btree_trans *, struct btree_iter *,
                      struct bch_inode_unpacked *, u32, u64);
 
-int bch2_inode_rm(struct bch_fs *, u64, bool);
+int bch2_inode_rm(struct bch_fs *, subvol_inum, bool);
 
-int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *);
+int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum,
+                           struct bch_inode_unpacked *);
 
 static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode)
 {
index ccde9001aaf72487e8be759d2254267d1c7ecf20..0bc72d2a4dd4cf09bc6b11b9e22a8afcb43977f8 100644 (file)
@@ -27,6 +27,7 @@
 #include "keylist.h"
 #include "move.h"
 #include "rebalance.h"
+#include "subvolume.h"
 #include "super.h"
 #include "super-io.h"
 
@@ -220,7 +221,8 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
                        : 0;
 
                if (!*usage_increasing &&
-                   (new_replicas > bch2_bkey_replicas(c, old) ||
+                   (new->k.p.snapshot != old.k->p.snapshot ||
+                    new_replicas > bch2_bkey_replicas(c, old) ||
                     (!new_compressed && bch2_bkey_sectors_compressed(old))))
                        *usage_increasing = true;
 
@@ -256,6 +258,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
 }
 
 int bch2_extent_update(struct btree_trans *trans,
+                      subvol_inum inum,
                       struct btree_iter *iter,
                       struct bkey_i *k,
                       struct disk_reservation *disk_res,
@@ -314,8 +317,8 @@ int bch2_extent_update(struct btree_trans *trans,
                struct btree_iter inode_iter;
                struct bch_inode_unpacked inode_u;
 
-               ret = bch2_inode_peek(trans, &inode_iter, &inode_u,
-                               k->k.p.inode, BTREE_ITER_INTENT);
+               ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum,
+                                     BTREE_ITER_INTENT);
                if (ret)
                        return ret;
 
@@ -371,22 +374,37 @@ int bch2_extent_update(struct btree_trans *trans,
        return 0;
 }
 
+/*
+ * Returns -EINTR if we had to drop locks:
+ */
 int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
-                  struct bpos end, u64 *journal_seq,
-                  s64 *i_sectors_delta)
+                  subvol_inum inum, u64 end,
+                  u64 *journal_seq, s64 *i_sectors_delta)
 {
        struct bch_fs *c        = trans->c;
        unsigned max_sectors    = KEY_SIZE_MAX & (~0 << c->block_bits);
+       struct bpos end_pos = POS(inum.inum, end);
        struct bkey_s_c k;
        int ret = 0, ret2 = 0;
+       u32 snapshot;
 
-       while ((bch2_trans_begin(trans),
-               (k = bch2_btree_iter_peek(iter)).k) &&
-              bkey_cmp(iter->pos, end) < 0) {
+       while (1) {
                struct disk_reservation disk_res =
                        bch2_disk_reservation_init(c, 0);
                struct bkey_i delete;
 
+               bch2_trans_begin(trans);
+
+               ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+               if (ret)
+                       goto btree_err;
+
+               bch2_btree_iter_set_snapshot(iter, snapshot);
+
+               k = bch2_btree_iter_peek(iter);
+               if (bkey_cmp(iter->pos, end_pos) >= 0)
+                       break;
+
                ret = bkey_err(k);
                if (ret)
                        goto btree_err;
@@ -396,9 +414,9 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
 
                /* create the biggest key we can */
                bch2_key_resize(&delete.k, max_sectors);
-               bch2_cut_back(end, &delete);
+               bch2_cut_back(end_pos, &delete);
 
-               ret = bch2_extent_update(trans, iter, &delete,
+               ret = bch2_extent_update(trans, inum, iter, &delete,
                                &disk_res, journal_seq,
                                0, i_sectors_delta, false);
                bch2_disk_reservation_put(c, &disk_res);
@@ -411,36 +429,31 @@ btree_err:
                        break;
        }
 
-       if (bkey_cmp(iter->pos, end) > 0) {
-               bch2_btree_iter_set_pos(iter, end);
-               ret = bch2_btree_iter_traverse(iter);
-       }
+       if (bkey_cmp(iter->pos, end_pos) > 0)
+               bch2_btree_iter_set_pos(iter, end_pos);
 
        return ret ?: ret2;
 }
 
-int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end,
+int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,
                u64 *journal_seq, s64 *i_sectors_delta)
 {
        struct btree_trans trans;
        struct btree_iter iter;
-       int ret = 0;
+       int ret;
 
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
        bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
-                                  POS(inum, start),
-                                  BTREE_ITER_INTENT);
+                            POS(inum.inum, start),
+                            BTREE_ITER_INTENT);
 
-       ret = bch2_fpunch_at(&trans, &iter, POS(inum, end),
+       ret = bch2_fpunch_at(&trans, &iter, inum, end,
                             journal_seq, i_sectors_delta);
 
        bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
 
-       if (ret == -EINTR)
-               ret = 0;
-
-       return ret;
+       return ret == -EINTR ? 0 : ret;
 }
 
 int bch2_write_index_default(struct bch_write_op *op)
@@ -451,40 +464,51 @@ int bch2_write_index_default(struct bch_write_op *op)
        struct bkey_i *k = bch2_keylist_front(keys);
        struct btree_trans trans;
        struct btree_iter iter;
+       subvol_inum inum = {
+               .subvol = op->subvol,
+               .inum   = k->k.p.inode,
+       };
        int ret;
 
+       BUG_ON(!inum.subvol);
+
        bch2_bkey_buf_init(&sk);
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
 
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
-                            bkey_start_pos(&k->k),
-                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-
        do {
                bch2_trans_begin(&trans);
 
                k = bch2_keylist_front(keys);
+               bch2_bkey_buf_copy(&sk, c, k);
 
-               k->k.p.snapshot = iter.snapshot;
+               ret = bch2_subvolume_get_snapshot(&trans, inum.subvol,
+                                                 &sk.k->k.p.snapshot);
+               if (ret == -EINTR)
+                       continue;
+               if (ret)
+                       break;
 
-               bch2_bkey_buf_realloc(&sk, c, k->k.u64s);
-               bkey_copy(sk.k, k);
-               bch2_cut_front(iter.pos, sk.k);
+               bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+                                    bkey_start_pos(&sk.k->k),
+                                    BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 
-               ret = bch2_extent_update(&trans, &iter, sk.k,
+               ret = bch2_extent_update(&trans, inum, &iter, sk.k,
                                         &op->res, op_journal_seq(op),
                                         op->new_i_size, &op->i_sectors_delta,
                                         op->flags & BCH_WRITE_CHECK_ENOSPC);
+               bch2_trans_iter_exit(&trans, &iter);
+
                if (ret == -EINTR)
                        continue;
                if (ret)
                        break;
 
                if (bkey_cmp(iter.pos, k->k.p) >= 0)
-                       bch2_keylist_pop_front(keys);
+                       bch2_keylist_pop_front(&op->insert_keys);
+               else
+                       bch2_cut_front(iter.pos, k);
        } while (!bch2_keylist_empty(keys));
 
-       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        bch2_bkey_buf_exit(&sk, c);
 
@@ -1645,7 +1669,7 @@ static void bch2_rbio_done(struct bch_read_bio *rbio)
 }
 
 static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
-                                    struct bvec_iter bvec_iter, u64 inode,
+                                    struct bvec_iter bvec_iter,
                                     struct bch_io_failures *failed,
                                     unsigned flags)
 {
@@ -1709,7 +1733,10 @@ static void bch2_rbio_retry(struct work_struct *work)
        struct bch_fs *c        = rbio->c;
        struct bvec_iter iter   = rbio->bvec_iter;
        unsigned flags          = rbio->flags;
-       u64 inode               = rbio->read_pos.inode;
+       subvol_inum inum = {
+               .subvol = rbio->subvol,
+               .inum   = rbio->read_pos.inode,
+       };
        struct bch_io_failures failed = { .nr = 0 };
 
        trace_read_retry(&rbio->bio);
@@ -1725,12 +1752,12 @@ static void bch2_rbio_retry(struct work_struct *work)
        flags &= ~BCH_READ_MAY_PROMOTE;
 
        if (flags & BCH_READ_NODECODE) {
-               bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags);
+               bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
        } else {
                flags &= ~BCH_READ_LAST_FRAGMENT;
                flags |= BCH_READ_MUST_CLONE;
 
-               __bch2_read(c, rbio, iter, inode, &failed, flags);
+               __bch2_read(c, rbio, iter, inum, &failed, flags);
        }
 }
 
@@ -1804,7 +1831,8 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
        if (!bch2_bkey_narrow_crcs(new, new_crc))
                goto out;
 
-       ret = bch2_trans_update(trans, &iter, new, 0);
+       ret = bch2_trans_update(trans, &iter, new,
+                               BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
 out:
        bch2_trans_iter_exit(trans, &iter);
        return ret;
@@ -2172,6 +2200,7 @@ get_bio:
        /* XXX: only initialize this if needed */
        rbio->devs_have         = bch2_bkey_devs(k);
        rbio->pick              = pick;
+       rbio->subvol            = orig->subvol;
        rbio->read_pos          = read_pos;
        rbio->data_btree        = data_btree;
        rbio->data_pos          = data_pos;
@@ -2274,25 +2303,31 @@ out_read_done:
 }
 
 void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
-                struct bvec_iter bvec_iter, u64 inode,
+                struct bvec_iter bvec_iter, subvol_inum inum,
                 struct bch_io_failures *failed, unsigned flags)
 {
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_buf sk;
        struct bkey_s_c k;
+       u32 snapshot;
        int ret;
 
        BUG_ON(flags & BCH_READ_NODECODE);
 
        bch2_bkey_buf_init(&sk);
        bch2_trans_init(&trans, c, 0, 0);
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
-                            POS(inode, bvec_iter.bi_sector),
-                            BTREE_ITER_SLOTS);
 retry:
        bch2_trans_begin(&trans);
+       iter = (struct btree_iter) { NULL };
+
+       ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+       if (ret)
+               goto err;
 
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+                            SPOS(inum.inum, bvec_iter.bi_sector, snapshot),
+                            BTREE_ITER_SLOTS|BTREE_ITER_FILTER_SNAPSHOTS);
        while (1) {
                unsigned bytes, sectors, offset_into_extent;
                enum btree_id data_btree = BTREE_ID_extents;
@@ -2307,7 +2342,7 @@ retry:
                }
 
                bch2_btree_iter_set_pos(&iter,
-                               POS(inode, bvec_iter.bi_sector));
+                               POS(inum.inum, bvec_iter.bi_sector));
 
                k = bch2_btree_iter_peek_slot(&iter);
                ret = bkey_err(k);
@@ -2357,16 +2392,17 @@ retry:
                swap(bvec_iter.bi_size, bytes);
                bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
        }
+err:
+       bch2_trans_iter_exit(&trans, &iter);
 
        if (ret == -EINTR || ret == READ_RETRY || ret == READ_RETRY_AVOID)
                goto retry;
 
-       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        bch2_bkey_buf_exit(&sk, c);
 
        if (ret) {
-               bch_err_inum_ratelimited(c, inode,
+               bch_err_inum_ratelimited(c, inum.inum,
                                         "read error %i from btree lookup", ret);
                rbio->bio.bi_status = BLK_STS_IOERR;
                bch2_rbio_done(rbio);
index bc0a0bd6f849438a82474c7e3ce2b331f6950be7..38efd39c664ec7ae3449e753af9e52c847f75a82 100644 (file)
@@ -63,12 +63,13 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
 
 int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *,
                               struct bkey_i *, bool *, bool *, s64 *, s64 *);
-int bch2_extent_update(struct btree_trans *, struct btree_iter *,
-                      struct bkey_i *, struct disk_reservation *,
-                      u64 *, u64, s64 *, bool);
+int bch2_extent_update(struct btree_trans *, subvol_inum,
+                      struct btree_iter *, struct bkey_i *,
+                      struct disk_reservation *, u64 *, u64, s64 *, bool);
+
 int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
-                  struct bpos, u64 *, s64 *);
-int bch2_fpunch(struct bch_fs *c, u64, u64, u64, u64 *, s64 *);
+                  subvol_inum, u64, u64 *, s64 *);
+int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, u64 *, s64 *);
 
 int bch2_write_index_default(struct bch_write_op *);
 
@@ -90,6 +91,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
        op->devs_have.nr        = 0;
        op->target              = 0;
        op->opts                = opts;
+       op->subvol              = 0;
        op->pos                 = POS_MAX;
        op->version             = ZERO_VERSION;
        op->write_point         = (struct write_point_specifier) { 0 };
@@ -157,10 +159,10 @@ static inline void bch2_read_extent(struct btree_trans *trans,
 }
 
 void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
-                u64, struct bch_io_failures *, unsigned flags);
+                subvol_inum, struct bch_io_failures *, unsigned flags);
 
 static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
-                            u64 inode)
+                            subvol_inum inum)
 {
        struct bch_io_failures failed = { .nr = 0 };
 
@@ -168,8 +170,9 @@ static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
 
        rbio->c = c;
        rbio->start_time = local_clock();
+       rbio->subvol = inum.subvol;
 
-       __bch2_read(c, rbio, rbio->bio.bi_iter, inode, &failed,
+       __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed,
                    BCH_READ_RETRY_IF_STALE|
                    BCH_READ_MAY_PROMOTE|
                    BCH_READ_USER_MAPPED);
index 0aab77951c4c37022899d8a8f40aa4601cc5d7ea..78bff13d36f27cb46c6a28c5bcc9dd65cb0ecac6 100644 (file)
@@ -62,6 +62,7 @@ struct bch_read_bio {
        /*
         * pos we read from - different from data_pos for indirect extents:
         */
+       u32                     subvol;
        struct bpos             read_pos;
 
        /*
@@ -122,6 +123,7 @@ struct bch_write_op {
        u16                     nonce;
        struct bch_io_opts      opts;
 
+       u32                     subvol;
        struct bpos             pos;
        struct bversion         version;
 
index 1899326d9754eeebdf9c850ace262b2eb25fe16a..7c764ee4ea09c2182865081f3e560256725e6aa3 100644 (file)
@@ -48,7 +48,8 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
        bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN,
-                            BTREE_ITER_PREFETCH);
+                            BTREE_ITER_PREFETCH|
+                            BTREE_ITER_ALL_SNAPSHOTS);
 
        while ((k = bch2_btree_iter_peek(&iter)).k &&
               !(ret = bkey_err(k))) {
@@ -74,7 +75,8 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
                bch2_btree_iter_set_pos(&iter, bkey_start_pos(&sk.k->k));
 
                ret   = bch2_btree_iter_traverse(&iter) ?:
-                       bch2_trans_update(&trans, &iter, sk.k, 0) ?:
+                       bch2_trans_update(&trans, &iter, sk.k,
+                                         BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
                        bch2_trans_commit(&trans, NULL, NULL,
                                        BTREE_INSERT_NOFAIL);
 
index 7001e3cda8c5ec06d3f141395430ccff46f20f84..44a61818d9a44861661d72841ee006d1a1cfa845 100644 (file)
@@ -13,6 +13,7 @@
 #include "journal_reclaim.h"
 #include "move.h"
 #include "replicas.h"
+#include "subvolume.h"
 #include "super-io.h"
 #include "keylist.h"
 
@@ -53,6 +54,81 @@ struct moving_context {
        wait_queue_head_t       wait;
 };
 
+static int insert_snapshot_whiteouts(struct btree_trans *trans,
+                                    enum btree_id id,
+                                    struct bpos old_pos,
+                                    struct bpos new_pos)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter, update_iter;
+       struct bkey_s_c k;
+       struct snapshots_seen s;
+       int ret;
+
+       if (!btree_type_has_snapshots(id))
+               return 0;
+
+       snapshots_seen_init(&s);
+
+       if (!bkey_cmp(old_pos, new_pos))
+               return 0;
+
+       if (!snapshot_t(c, old_pos.snapshot)->children[0])
+               return 0;
+
+       bch2_trans_iter_init(trans, &iter, id, old_pos,
+                            BTREE_ITER_NOT_EXTENTS|
+                            BTREE_ITER_ALL_SNAPSHOTS);
+       while (1) {
+next:
+               k = bch2_btree_iter_prev(&iter);
+               ret = bkey_err(k);
+               if (ret)
+                       break;
+
+               if (bkey_cmp(old_pos, k.k->p))
+                       break;
+
+               if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) {
+                       struct bkey_i *update;
+                       size_t i;
+
+                       for (i = 0; i < s.nr; i++)
+                               if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, s.d[i]))
+                                       goto next;
+
+                       update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
+
+                       ret = PTR_ERR_OR_ZERO(update);
+                       if (ret)
+                               break;
+
+                       bkey_init(&update->k);
+                       update->k.p = new_pos;
+                       update->k.p.snapshot = k.k->p.snapshot;
+
+                       bch2_trans_iter_init(trans, &update_iter, id, update->k.p,
+                                            BTREE_ITER_NOT_EXTENTS|
+                                            BTREE_ITER_ALL_SNAPSHOTS|
+                                            BTREE_ITER_INTENT);
+                       ret   = bch2_btree_iter_traverse(&update_iter) ?:
+                               bch2_trans_update(trans, &update_iter, update,
+                                         BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+                       bch2_trans_iter_exit(trans, &update_iter);
+                       if (ret)
+                               break;
+
+                       ret = snapshots_seen_add(c, &s, k.k->p.snapshot);
+                       if (ret)
+                               break;
+               }
+       }
+       bch2_trans_iter_exit(trans, &iter);
+       kfree(s.d);
+
+       return ret;
+}
+
 static int bch2_migrate_index_update(struct bch_write_op *op)
 {
        struct bch_fs *c = op->c;
@@ -166,7 +242,10 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
 
                next_pos = insert->k.p;
 
-               ret   = bch2_trans_update(&trans, &iter, insert, 0) ?:
+               ret   = insert_snapshot_whiteouts(&trans, m->btree_id,
+                                                 k.k->p, insert->k.p) ?:
+                       bch2_trans_update(&trans, &iter, insert,
+                               BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
                        bch2_trans_commit(&trans, &op->res,
                                op_journal_seq(op),
                                BTREE_INSERT_NOFAIL|
@@ -581,7 +660,8 @@ static int __bch2_move_data(struct bch_fs *c,
        stats->pos      = start;
 
        bch2_trans_iter_init(&trans, &iter, btree_id, start,
-                            BTREE_ITER_PREFETCH);
+                            BTREE_ITER_PREFETCH|
+                            BTREE_ITER_ALL_SNAPSHOTS);
 
        if (rate)
                bch2_ratelimit_reset(rate);
index 5de296078219fc42e749193624d4c7fca4a3f25f..ff99c6d24abdd6fb7a54011085ddf962bc03b190 100644 (file)
@@ -63,7 +63,7 @@ const char * const bch2_member_states[] = {
 
 #undef x
 
-const char * const bch2_d_types[DT_MAX] = {
+const char * const bch2_d_types[BCH_DT_MAX] = {
        [DT_UNKNOWN]    = "unknown",
        [DT_FIFO]       = "fifo",
        [DT_CHR]        = "chr",
@@ -73,6 +73,7 @@ const char * const bch2_d_types[DT_MAX] = {
        [DT_LNK]        = "lnk",
        [DT_SOCK]       = "sock",
        [DT_WHT]        = "whiteout",
+       [DT_SUBVOL]     = "subvol",
 };
 
 void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
index 147b4021fdaef0d3816cc6311eb895eff1f10242..d39d6a546ac4b74655a2392c3332edfa0d7bf5b7 100644 (file)
@@ -215,19 +215,19 @@ enum opt_type {
          BCH_SB_POSIX_ACL,             true,                           \
          NULL,         "Enable POSIX acls")                            \
        x(usrquota,                     u8,                             \
-         OPT_FORMAT|OPT_MOUNT,                                         \
+         0,                                                            \
          OPT_BOOL(),                                                   \
-         BCH_SB_USRQUOTA,              false,                          \
+         NO_SB_OPT,            false,                                  \
          NULL,         "Enable user quotas")                           \
        x(grpquota,                     u8,                             \
-         OPT_FORMAT|OPT_MOUNT,                                         \
+         0,                                                            \
          OPT_BOOL(),                                                   \
-         BCH_SB_GRPQUOTA,              false,                          \
+         NO_SB_OPT,            false,                                  \
          NULL,         "Enable group quotas")                          \
        x(prjquota,                     u8,                             \
-         OPT_FORMAT|OPT_MOUNT,                                         \
+         0,                                                            \
          OPT_BOOL(),                                                   \
-         BCH_SB_PRJQUOTA,              false,                          \
+         NO_SB_OPT,            false,                                  \
          NULL,         "Enable project quotas")                        \
        x(degraded,                     u8,                             \
          OPT_MOUNT,                                                    \
index 11208e83fabee044669a6bcc592a1e802d41dd40..64e0b542e7791d53cf0a0b5323e00e3e79656800 100644 (file)
@@ -20,6 +20,7 @@
 #include "quota.h"
 #include "recovery.h"
 #include "replicas.h"
+#include "subvolume.h"
 #include "super-io.h"
 
 #include <linux/sort.h>
@@ -961,6 +962,81 @@ fsck_err:
        return ret;
 }
 
+static int bch2_fs_initialize_subvolumes(struct bch_fs *c)
+{
+       struct bkey_i_snapshot  root_snapshot;
+       struct bkey_i_subvolume root_volume;
+       int ret;
+
+       bkey_snapshot_init(&root_snapshot.k_i);
+       root_snapshot.k.p.offset = U32_MAX;
+       root_snapshot.v.flags   = 0;
+       root_snapshot.v.parent  = 0;
+       root_snapshot.v.subvol  = BCACHEFS_ROOT_SUBVOL;
+       root_snapshot.v.pad     = 0;
+       SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true);
+
+       ret = bch2_btree_insert(c, BTREE_ID_snapshots,
+                               &root_snapshot.k_i,
+                               NULL, NULL, 0);
+       if (ret)
+               return ret;
+
+
+       bkey_subvolume_init(&root_volume.k_i);
+       root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL;
+       root_volume.v.flags     = 0;
+       root_volume.v.snapshot  = cpu_to_le32(U32_MAX);
+       root_volume.v.inode     = cpu_to_le64(BCACHEFS_ROOT_INO);
+
+       ret = bch2_btree_insert(c, BTREE_ID_subvolumes,
+                               &root_volume.k_i,
+                               NULL, NULL, 0);
+       if (ret)
+               return ret;
+
+       return 0;
+}
+
+static int bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bch_inode_unpacked inode;
+       struct bkey_inode_buf *packed;
+       int ret;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
+                            POS(0, BCACHEFS_ROOT_INO), 0);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       if (k.k->type != KEY_TYPE_inode) {
+               bch_err(c, "root inode not found");
+               ret = -ENOENT;
+               goto err;
+       }
+
+       ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &inode);
+       BUG_ON(ret);
+
+       inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
+
+       packed = bch2_trans_kmalloc(trans, sizeof(*packed));
+       ret = PTR_ERR_OR_ZERO(packed);
+       if (ret)
+               goto err;
+
+       bch2_inode_pack(c, packed, &inode);
+       ret = bch2_trans_update(trans, &iter, &packed->inode.k_i, 0);
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
 int bch2_fs_recovery(struct bch_fs *c)
 {
        const char *err = "cannot allocate memory";
@@ -1017,11 +1093,12 @@ int bch2_fs_recovery(struct bch_fs *c)
                c->opts.version_upgrade = true;
                c->opts.fsck            = true;
                c->opts.fix_errors      = FSCK_OPT_YES;
-       }
-
-       if (c->sb.version < bcachefs_metadata_version_btree_ptr_sectors_written) {
+       } else if (c->sb.version < bcachefs_metadata_version_btree_ptr_sectors_written) {
                bch_info(c, "version prior to btree_ptr_sectors_written, upgrade required");
                c->opts.version_upgrade = true;
+       } else if (c->sb.version < bcachefs_metadata_version_snapshot) {
+               bch_info(c, "filesystem version is prior to snapshot field - upgrading");
+               c->opts.version_upgrade = true;
        }
 
        ret = bch2_blacklist_table_initialize(c);
@@ -1190,6 +1267,29 @@ use_clean:
                bch_verbose(c, "alloc write done");
        }
 
+       if (c->sb.version < bcachefs_metadata_version_snapshot) {
+               err = "error creating root snapshot node";
+               ret = bch2_fs_initialize_subvolumes(c);
+               if (ret)
+                       goto err;
+       }
+
+       bch_verbose(c, "reading snapshots table");
+       err = "error reading snapshots table";
+       ret = bch2_fs_snapshots_start(c);
+       if (ret)
+               goto err;
+       bch_verbose(c, "reading snapshots done");
+
+       if (c->sb.version < bcachefs_metadata_version_snapshot) {
+               /* set bi_subvol on root inode */
+               err = "error upgrade root inode for subvolumes";
+               ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW,
+                                   bch2_fs_upgrade_for_subvolumes(&trans));
+               if (ret)
+                       goto err;
+       }
+
        if (c->opts.fsck) {
                bch_info(c, "starting fsck");
                err = "error in fsck";
@@ -1350,9 +1450,22 @@ int bch2_fs_initialize(struct bch_fs *c)
                }
        }
 
+       err = "error creating root snapshot node";
+       ret = bch2_fs_initialize_subvolumes(c);
+       if (ret)
+               goto err;
+
+       bch_verbose(c, "reading snapshots table");
+       err = "error reading snapshots table";
+       ret = bch2_fs_snapshots_start(c);
+       if (ret)
+               goto err;
+       bch_verbose(c, "reading snapshots done");
+
        bch2_inode_init(c, &root_inode, 0, 0,
                        S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
-       root_inode.bi_inum = BCACHEFS_ROOT_INO;
+       root_inode.bi_inum      = BCACHEFS_ROOT_INO;
+       root_inode.bi_subvol    = BCACHEFS_ROOT_SUBVOL;
        bch2_inode_pack(c, &packed_inode, &root_inode);
        packed_inode.inode.k.p.snapshot = U32_MAX;
 
@@ -1367,11 +1480,12 @@ int bch2_fs_initialize(struct bch_fs *c)
 
        err = "error creating lost+found";
        ret = bch2_trans_do(c, NULL, NULL, 0,
-               bch2_create_trans(&trans, BCACHEFS_ROOT_INO,
+               bch2_create_trans(&trans,
+                                 BCACHEFS_ROOT_SUBVOL_INUM,
                                  &root_inode, &lostfound_inode,
                                  &lostfound,
                                  0, 0, S_IFDIR|0700, 0,
-                                 NULL, NULL));
+                                 NULL, NULL, (subvol_inum) { 0 }, 0));
        if (ret) {
                bch_err(c, "error creating lost+found");
                goto err;
index 576cfbccf5b537b2d000935d739476b278fe58b7..92ff609453b8349e3ce6a25277bcaeece461d388 100644 (file)
@@ -7,6 +7,7 @@
 #include "inode.h"
 #include "io.h"
 #include "reflink.h"
+#include "subvolume.h"
 
 #include <linux/sched/signal.h>
 
@@ -197,7 +198,8 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
 }
 
 s64 bch2_remap_range(struct bch_fs *c,
-                    struct bpos dst_start, struct bpos src_start,
+                    subvol_inum dst_inum, u64 dst_offset,
+                    subvol_inum src_inum, u64 src_offset,
                     u64 remap_sectors, u64 *journal_seq,
                     u64 new_i_size, s64 *i_sectors_delta)
 {
@@ -205,9 +207,12 @@ s64 bch2_remap_range(struct bch_fs *c,
        struct btree_iter dst_iter, src_iter;
        struct bkey_s_c src_k;
        struct bkey_buf new_dst, new_src;
+       struct bpos dst_start = POS(dst_inum.inum, dst_offset);
+       struct bpos src_start = POS(src_inum.inum, src_offset);
        struct bpos dst_end = dst_start, src_end = src_start;
        struct bpos src_want;
        u64 dst_done;
+       u32 dst_snapshot, src_snapshot;
        int ret = 0, ret2 = 0;
 
        if (!percpu_ref_tryget(&c->writes))
@@ -238,6 +243,20 @@ s64 bch2_remap_range(struct bch_fs *c,
                        break;
                }
 
+               ret = bch2_subvolume_get_snapshot(&trans, src_inum.subvol,
+                                                 &src_snapshot);
+               if (ret)
+                       continue;
+
+               bch2_btree_iter_set_snapshot(&src_iter, src_snapshot);
+
+               ret = bch2_subvolume_get_snapshot(&trans, dst_inum.subvol,
+                                                 &dst_snapshot);
+               if (ret)
+                       continue;
+
+               bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot);
+
                dst_done = dst_iter.pos.offset - dst_start.offset;
                src_want = POS(src_start.inode, src_start.offset + dst_done);
                bch2_btree_iter_set_pos(&src_iter, src_want);
@@ -248,11 +267,11 @@ s64 bch2_remap_range(struct bch_fs *c,
                        continue;
 
                if (bkey_cmp(src_want, src_iter.pos) < 0) {
-                       ret = bch2_fpunch_at(&trans, &dst_iter,
-                                       bpos_min(dst_end,
-                                                POS(dst_iter.pos.inode, dst_iter.pos.offset +
-                                                    src_iter.pos.offset - src_want.offset)),
-                                                journal_seq, i_sectors_delta);
+                       ret = bch2_fpunch_at(&trans, &dst_iter, dst_inum,
+                                       min(dst_end.offset,
+                                           dst_iter.pos.offset +
+                                           src_iter.pos.offset - src_want.offset),
+                                       journal_seq, i_sectors_delta);
                        continue;
                }
 
@@ -289,8 +308,9 @@ s64 bch2_remap_range(struct bch_fs *c,
                bch2_key_resize(&new_dst.k->k,
                                min(src_k.k->p.offset - src_want.offset,
                                    dst_end.offset - dst_iter.pos.offset));
-               ret = bch2_extent_update(&trans, &dst_iter, new_dst.k,
-                                        &disk_res, journal_seq,
+
+               ret = bch2_extent_update(&trans, dst_inum, &dst_iter,
+                                        new_dst.k, &disk_res, journal_seq,
                                         new_i_size, i_sectors_delta,
                                         true);
                bch2_disk_reservation_put(c, &disk_res);
@@ -311,7 +331,7 @@ s64 bch2_remap_range(struct bch_fs *c,
                bch2_trans_begin(&trans);
 
                ret2 = bch2_inode_peek(&trans, &inode_iter, &inode_u,
-                               dst_start.inode, BTREE_ITER_INTENT);
+                                      dst_inum, BTREE_ITER_INTENT);
 
                if (!ret2 &&
                    inode_u.bi_size < new_i_size) {
index 68c5cb5a2780ddd1552d41d03229e145f1d14d3c..4c1b82860b0b9ca31aae88213d2dcb7916bfcfc1 100644 (file)
@@ -57,7 +57,7 @@ static inline __le64 *bkey_refcount(struct bkey_i *k)
        }
 }
 
-s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos,
-                    u64, u64 *, u64, s64 *);
+s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64,
+                    subvol_inum, u64, u64, u64 *, u64, s64 *);
 
 #endif /* _BCACHEFS_REFLINK_H */
index c6a132b3c5bb2eb24cf112f11fd23d254e01b499..6486e709b700de4f8160334ad54e0b08f7a871d7 100644 (file)
@@ -8,6 +8,7 @@
 #include "error.h"
 #include "inode.h"
 #include "siphash.h"
+#include "subvolume.h"
 #include "super.h"
 
 #include <linux/crc32c.h>
@@ -144,16 +145,21 @@ bch2_hash_lookup(struct btree_trans *trans,
                 struct btree_iter *iter,
                 const struct bch_hash_desc desc,
                 const struct bch_hash_info *info,
-                u64 inode, const void *key,
+                subvol_inum inum, const void *key,
                 unsigned flags)
 {
        struct bkey_s_c k;
+       u32 snapshot;
        int ret;
 
+       ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+       if (ret)
+               return ret;
+
        for_each_btree_key(trans, *iter, desc.btree_id,
-                          POS(inode, desc.hash_key(info, key)),
+                          SPOS(inum.inum, desc.hash_key(info, key), snapshot),
                           BTREE_ITER_SLOTS|flags, k, ret) {
-               if (iter->pos.inode != inode)
+               if (iter->pos.inode != inum.inum)
                        break;
 
                if (k.k->type == desc.key_type) {
@@ -176,15 +182,20 @@ bch2_hash_hole(struct btree_trans *trans,
               struct btree_iter *iter,
               const struct bch_hash_desc desc,
               const struct bch_hash_info *info,
-              u64 inode, const void *key)
+              subvol_inum inum, const void *key)
 {
        struct bkey_s_c k;
+       u32 snapshot;
        int ret;
 
+       ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+       if (ret)
+               return ret;
+
        for_each_btree_key(trans, *iter, desc.btree_id,
-                          POS(inode, desc.hash_key(info, key)),
+                          SPOS(inum.inum, desc.hash_key(info, key), snapshot),
                           BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
-               if (iter->pos.inode != inode)
+               if (iter->pos.inode != inum.inum)
                        break;
 
                if (k.k->type != desc.key_type)
@@ -229,17 +240,25 @@ static __always_inline
 int bch2_hash_set(struct btree_trans *trans,
                  const struct bch_hash_desc desc,
                  const struct bch_hash_info *info,
-                 u64 inode, struct bkey_i *insert, int flags)
+                 subvol_inum inum,
+                 struct bkey_i *insert, int flags)
 {
        struct btree_iter iter, slot = { NULL };
        struct bkey_s_c k;
        bool found = false;
+       u32 snapshot;
        int ret;
 
+       ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+       if (ret)
+               return ret;
+
        for_each_btree_key(trans, iter, desc.btree_id,
-                          POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))),
+                          SPOS(inum.inum,
+                               desc.hash_bkey(info, bkey_i_to_s_c(insert)),
+                               snapshot),
                           BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
-               if (iter.pos.inode != inode)
+               if (iter.pos.inode != inum.inum)
                        break;
 
                if (k.k->type == desc.key_type) {
@@ -288,7 +307,8 @@ static __always_inline
 int bch2_hash_delete_at(struct btree_trans *trans,
                        const struct bch_hash_desc desc,
                        const struct bch_hash_info *info,
-                       struct btree_iter *iter)
+                       struct btree_iter *iter,
+                       unsigned update_flags)
 {
        struct bkey_i *delete;
        int ret;
@@ -306,24 +326,24 @@ int bch2_hash_delete_at(struct btree_trans *trans,
        delete->k.p = iter->pos;
        delete->k.type = ret ? KEY_TYPE_hash_whiteout : KEY_TYPE_deleted;
 
-       return bch2_trans_update(trans, iter, delete, 0);
+       return bch2_trans_update(trans, iter, delete, update_flags);
 }
 
 static __always_inline
 int bch2_hash_delete(struct btree_trans *trans,
                     const struct bch_hash_desc desc,
                     const struct bch_hash_info *info,
-                    u64 inode, const void *key)
+                    subvol_inum inum, const void *key)
 {
        struct btree_iter iter;
        int ret;
 
-       ret = bch2_hash_lookup(trans, &iter, desc, info, inode, key,
+       ret = bch2_hash_lookup(trans, &iter, desc, info, inum, key,
                                BTREE_ITER_INTENT);
        if (ret)
                return ret;
 
-       ret = bch2_hash_delete_at(trans, desc, info, &iter);
+       ret = bch2_hash_delete_at(trans, desc, info, &iter, 0);
        bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
diff --git a/libbcachefs/subvolume.c b/libbcachefs/subvolume.c
new file mode 100644 (file)
index 0000000..ff3b4d2
--- /dev/null
@@ -0,0 +1,981 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_key_cache.h"
+#include "btree_update.h"
+#include "error.h"
+#include "subvolume.h"
+
+/* Snapshot tree: */
+
+static void bch2_delete_dead_snapshots_work(struct work_struct *);
+static void bch2_delete_dead_snapshots(struct bch_fs *);
+
+void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
+                          struct bkey_s_c k)
+{
+       struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
+
+       pr_buf(out, "is_subvol %llu deleted %llu parent %u children %u %u subvol %u",
+              BCH_SNAPSHOT_SUBVOL(s.v),
+              BCH_SNAPSHOT_DELETED(s.v),
+              le32_to_cpu(s.v->parent),
+              le32_to_cpu(s.v->children[0]),
+              le32_to_cpu(s.v->children[1]),
+              le32_to_cpu(s.v->subvol));
+}
+
+const char *bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+       struct bkey_s_c_snapshot s;
+       u32 i, id;
+
+       if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0 ||
+           bkey_cmp(k.k->p, POS(0, 1)) < 0)
+               return "bad pos";
+
+       if (bkey_val_bytes(k.k) != sizeof(struct bch_snapshot))
+               return "bad val size";
+
+       s = bkey_s_c_to_snapshot(k);
+
+       id = le32_to_cpu(s.v->parent);
+       if (id && id <= k.k->p.offset)
+               return "bad parent node";
+
+       if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]))
+               return "children not normalized";
+
+       if (s.v->children[0] &&
+           s.v->children[0] == s.v->children[1])
+               return "duplicate child nodes";
+
+       for (i = 0; i < 2; i++) {
+               id = le32_to_cpu(s.v->children[i]);
+
+               if (id >= k.k->p.offset)
+                       return "bad child node";
+       }
+
+       return NULL;
+}
+
+int bch2_mark_snapshot(struct bch_fs *c,
+                      struct bkey_s_c old, struct bkey_s_c new,
+                      u64 journal_seq, unsigned flags)
+{
+       struct snapshot_t *t;
+
+       t = genradix_ptr_alloc(&c->snapshots,
+                              U32_MAX - new.k->p.offset,
+                              GFP_KERNEL);
+       if (!t)
+               return -ENOMEM;
+
+       if (new.k->type == KEY_TYPE_snapshot) {
+               struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
+
+               t->parent       = le32_to_cpu(s.v->parent);
+               t->children[0]  = le32_to_cpu(s.v->children[0]);
+               t->children[1]  = le32_to_cpu(s.v->children[1]);
+               t->subvol       = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
+       } else {
+               t->parent       = 0;
+               t->children[0]  = 0;
+               t->children[1]  = 0;
+               t->subvol       = 0;
+       }
+
+       return 0;
+}
+
+static int subvol_lookup(struct btree_trans *trans, unsigned id, struct bch_subvolume *s)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, POS(0, id), 0);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k) ?: k.k->type == KEY_TYPE_subvolume ? 0 : -ENOENT;
+
+       if (!ret)
+               *s = *bkey_s_c_to_subvolume(k).v;
+
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+static int snapshot_lookup(struct btree_trans *trans, u32 id,
+                          struct bch_snapshot *s)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id),
+                            BTREE_ITER_WITH_UPDATES);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k) ?: k.k->type == KEY_TYPE_snapshot ? 0 : -ENOENT;
+
+       if (!ret)
+               *s = *bkey_s_c_to_snapshot(k).v;
+
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+static int snapshot_live(struct btree_trans *trans, u32 id)
+{
+       struct bch_snapshot v;
+       int ret;
+
+       if (!id)
+               return 0;
+
+       ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v));
+       if (ret == -ENOENT)
+               bch_err(trans->c, "snapshot node %u not found", id);
+       if (ret)
+               return ret;
+
+       return !BCH_SNAPSHOT_DELETED(&v);
+}
+
+static int bch2_snapshots_set_equiv(struct btree_trans *trans)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_s_c_snapshot snap;
+       unsigned i;
+       int ret;
+
+       for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+                          POS_MIN, 0, k, ret) {
+               u32 id = k.k->p.offset, child[2];
+               unsigned nr_live = 0, live_idx;
+
+               if (k.k->type != KEY_TYPE_snapshot)
+                       continue;
+
+               snap = bkey_s_c_to_snapshot(k);
+               child[0] = le32_to_cpu(snap.v->children[0]);
+               child[1] = le32_to_cpu(snap.v->children[1]);
+
+               for (i = 0; i < 2; i++) {
+                       ret = snapshot_live(trans, child[i]);
+                       if (ret < 0)
+                               break;
+
+                       if (ret)
+                               live_idx = i;
+                       nr_live += ret;
+               }
+
+               snapshot_t(c, id)->equiv = nr_live == 1
+                       ? snapshot_t(c, child[live_idx])->equiv
+                       : id;
+       }
+       bch2_trans_iter_exit(trans, &iter);
+
+       if (ret)
+               bch_err(c, "error walking snapshots: %i", ret);
+
+       return ret;
+}
+
+/* fsck: */
+static int bch2_snapshot_check(struct btree_trans *trans,
+                              struct bkey_s_c_snapshot s)
+{
+       struct bch_subvolume subvol;
+       struct bch_snapshot v;
+       u32 i, id;
+       int ret;
+
+       id = le32_to_cpu(s.v->subvol);
+       ret = lockrestart_do(trans, subvol_lookup(trans, id, &subvol));
+       if (ret == -ENOENT)
+               bch_err(trans->c, "snapshot node %llu has nonexistent subvolume %u",
+                       s.k->p.offset, id);
+       if (ret)
+               return ret;
+
+       if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) {
+               bch_err(trans->c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
+                       s.k->p.offset);
+               return -EINVAL;
+       }
+
+       id = le32_to_cpu(s.v->parent);
+       if (id) {
+               ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v));
+               if (ret == -ENOENT)
+                       bch_err(trans->c, "snapshot node %llu has nonexistent parent %u",
+                               s.k->p.offset, id);
+               if (ret)
+                       return ret;
+
+               if (le32_to_cpu(v.children[0]) != s.k->p.offset &&
+                   le32_to_cpu(v.children[1]) != s.k->p.offset) {
+                       bch_err(trans->c, "snapshot parent %u missing pointer to child %llu",
+                               id, s.k->p.offset);
+                       return -EINVAL;
+               }
+       }
+
+       for (i = 0; i < 2 && s.v->children[i]; i++) {
+               id = le32_to_cpu(s.v->children[i]);
+
+               ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v));
+               if (ret == -ENOENT)
+                       bch_err(trans->c, "snapshot node %llu has nonexistent child %u",
+                               s.k->p.offset, id);
+               if (ret)
+                       return ret;
+
+               if (le32_to_cpu(v.parent) != s.k->p.offset) {
+                       bch_err(trans->c, "snapshot child %u has wrong parent (got %u should be %llu)",
+                               id, le32_to_cpu(v.parent), s.k->p.offset);
+                       return -EINVAL;
+               }
+       }
+
+       return 0;
+}
+
+int bch2_fs_snapshots_check(struct bch_fs *c)
+{
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bch_snapshot s;
+       unsigned id;
+       int ret;
+
+       bch2_trans_init(&trans, c, 0, 0);
+
+       for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
+                          POS_MIN, 0, k, ret) {
+               if (k.k->type != KEY_TYPE_snapshot)
+                       continue;
+
+               ret = bch2_snapshot_check(&trans, bkey_s_c_to_snapshot(k));
+               if (ret)
+                       break;
+       }
+       bch2_trans_iter_exit(&trans, &iter);
+
+       if (ret) {
+               bch_err(c, "error %i checking snapshots", ret);
+               goto err;
+       }
+
+       for_each_btree_key(&trans, iter, BTREE_ID_subvolumes,
+                          POS_MIN, 0, k, ret) {
+               if (k.k->type != KEY_TYPE_subvolume)
+                       continue;
+again_2:
+               id = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot);
+               ret = snapshot_lookup(&trans, id, &s);
+
+               if (ret == -EINTR) {
+                       k = bch2_btree_iter_peek(&iter);
+                       goto again_2;
+               } else if (ret == -ENOENT)
+                       bch_err(c, "subvolume %llu points to nonexistent snapshot %u",
+                               k.k->p.offset, id);
+               else if (ret)
+                       break;
+       }
+       bch2_trans_iter_exit(&trans, &iter);
+err:
+       bch2_trans_exit(&trans);
+       return ret;
+}
+
+void bch2_fs_snapshots_exit(struct bch_fs *c)
+{
+       genradix_free(&c->snapshots);
+}
+
+int bch2_fs_snapshots_start(struct bch_fs *c)
+{
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       bool have_deleted = false;
+       int ret = 0;
+
+       bch2_trans_init(&trans, c, 0, 0);
+
+       for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
+                          POS_MIN, 0, k, ret) {
+              if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0)
+                      break;
+
+               if (k.k->type != KEY_TYPE_snapshot) {
+                       bch_err(c, "found wrong key type %u in snapshot node table",
+                               k.k->type);
+                       continue;
+               }
+
+               if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v))
+                       have_deleted = true;
+
+               ret = bch2_mark_snapshot(c, bkey_s_c_null, k, 0, 0);
+               if (ret)
+                       break;
+       }
+       bch2_trans_iter_exit(&trans, &iter);
+
+       if (ret)
+               goto err;
+
+       ret = bch2_snapshots_set_equiv(&trans);
+       if (ret)
+               goto err;
+err:
+       bch2_trans_exit(&trans);
+
+       if (!ret && have_deleted) {
+               bch_info(c, "restarting deletion of dead snapshots");
+               if (c->opts.fsck) {
+                       bch2_delete_dead_snapshots_work(&c->snapshot_delete_work);
+               } else {
+                       bch2_delete_dead_snapshots(c);
+               }
+       }
+
+       return ret;
+}
+
+/*
+ * Mark a snapshot as deleted, for future cleanup:
+ */
+static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_i_snapshot *s;
+       int ret = 0;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id),
+                            BTREE_ITER_INTENT);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       if (k.k->type != KEY_TYPE_snapshot) {
+               bch2_fs_inconsistent(trans->c, "missing snapshot %u", id);
+               ret = -ENOENT;
+               goto err;
+       }
+
+       /* already deleted? */
+       if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v))
+               goto err;
+
+       s = bch2_trans_kmalloc(trans, sizeof(*s));
+       ret = PTR_ERR_OR_ZERO(s);
+       if (ret)
+               goto err;
+
+       bkey_reassemble(&s->k_i, k);
+
+       SET_BCH_SNAPSHOT_DELETED(&s->v, true);
+       ret = bch2_trans_update(trans, &iter, &s->k_i, 0);
+       if (ret)
+               goto err;
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
+{
+       struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
+       struct bkey_s_c k;
+       struct bkey_s_c_snapshot s;
+       struct bkey_i_snapshot *parent;
+       u32 parent_id;
+       unsigned i;
+       int ret = 0;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id),
+                            BTREE_ITER_INTENT);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       if (k.k->type != KEY_TYPE_snapshot) {
+               bch2_fs_inconsistent(trans->c, "missing snapshot %u", id);
+               ret = -ENOENT;
+               goto err;
+       }
+
+       s = bkey_s_c_to_snapshot(k);
+
+       BUG_ON(!BCH_SNAPSHOT_DELETED(s.v));
+       parent_id = le32_to_cpu(s.v->parent);
+
+       if (parent_id) {
+               bch2_trans_iter_init(trans, &p_iter, BTREE_ID_snapshots,
+                                    POS(0, parent_id),
+                                    BTREE_ITER_INTENT);
+               k = bch2_btree_iter_peek_slot(&p_iter);
+               ret = bkey_err(k);
+               if (ret)
+                       goto err;
+
+               if (k.k->type != KEY_TYPE_snapshot) {
+                       bch2_fs_inconsistent(trans->c, "missing snapshot %u", parent_id);
+                       ret = -ENOENT;
+                       goto err;
+               }
+
+               parent = bch2_trans_kmalloc(trans, sizeof(*parent));
+               ret = PTR_ERR_OR_ZERO(parent);
+               if (ret)
+                       goto err;
+
+               bkey_reassemble(&parent->k_i, k);
+
+               for (i = 0; i < 2; i++)
+                       if (le32_to_cpu(parent->v.children[i]) == id)
+                               break;
+
+               if (i == 2)
+                       bch_err(trans->c, "snapshot %u missing child pointer to %u",
+                               parent_id, id);
+               else
+                       parent->v.children[i] = 0;
+
+               if (le32_to_cpu(parent->v.children[0]) <
+                   le32_to_cpu(parent->v.children[1]))
+                       swap(parent->v.children[0],
+                            parent->v.children[1]);
+
+               ret = bch2_trans_update(trans, &p_iter, &parent->k_i, 0);
+               if (ret)
+                       goto err;
+       }
+
+       ret = bch2_btree_delete_at(trans, &iter, 0);
+err:
+       bch2_trans_iter_exit(trans, &p_iter);
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+static int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
+                                    u32 *new_snapids,
+                                    u32 *snapshot_subvols,
+                                    unsigned nr_snapids)
+{
+       struct btree_iter iter;
+       struct bkey_i_snapshot *n;
+       struct bkey_s_c k;
+       unsigned i;
+       int ret = 0;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
+                            POS_MIN, BTREE_ITER_INTENT);
+       k = bch2_btree_iter_peek(&iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       for (i = 0; i < nr_snapids; i++) {
+               k = bch2_btree_iter_prev_slot(&iter);
+               ret = bkey_err(k);
+               if (ret)
+                       goto err;
+
+               if (!k.k || !k.k->p.offset) {
+                       ret = -ENOSPC;
+                       goto err;
+               }
+
+               n = bch2_trans_kmalloc(trans, sizeof(*n));
+               ret = PTR_ERR_OR_ZERO(n);
+               if (ret)
+                       return ret;
+
+               bkey_snapshot_init(&n->k_i);
+               n->k.p          = iter.pos;
+               n->v.flags      = 0;
+               n->v.parent     = cpu_to_le32(parent);
+               n->v.subvol     = cpu_to_le32(snapshot_subvols[i]);
+               n->v.pad        = 0;
+               SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
+
+               bch2_trans_update(trans, &iter, &n->k_i, 0);
+
+               ret = bch2_mark_snapshot(trans->c, bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0, 0);
+               if (ret)
+                       break;
+
+               new_snapids[i]  = iter.pos.offset;
+       }
+
+       if (parent) {
+               bch2_btree_iter_set_pos(&iter, POS(0, parent));
+               k = bch2_btree_iter_peek(&iter);
+               ret = bkey_err(k);
+               if (ret)
+                       goto err;
+
+               if (k.k->type != KEY_TYPE_snapshot) {
+                       bch_err(trans->c, "snapshot %u not found", parent);
+                       ret = -ENOENT;
+                       goto err;
+               }
+
+               n = bch2_trans_kmalloc(trans, sizeof(*n));
+               ret = PTR_ERR_OR_ZERO(n);
+               if (ret)
+                       return ret;
+
+               bkey_reassemble(&n->k_i, k);
+
+               if (n->v.children[0] || n->v.children[1]) {
+                       bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
+                       ret = -EINVAL;
+                       goto err;
+               }
+
+               n->v.children[0] = cpu_to_le32(new_snapids[0]);
+               n->v.children[1] = cpu_to_le32(new_snapids[1]);
+               SET_BCH_SNAPSHOT_SUBVOL(&n->v, false);
+               bch2_trans_update(trans, &iter, &n->k_i, 0);
+       }
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+/* List of snapshot IDs that are being deleted: */
+struct snapshot_id_list {
+       u32             nr;
+       u32             size;
+       u32             *d;
+};
+
+static bool snapshot_list_has_id(struct snapshot_id_list *s, u32 id)
+{
+       unsigned i;
+
+       for (i = 0; i < s->nr; i++)
+               if (id == s->d[i])
+                       return true;
+       return false;
+}
+
+static int snapshot_id_add(struct snapshot_id_list *s, u32 id)
+{
+       BUG_ON(snapshot_list_has_id(s, id));
+
+       if (s->nr == s->size) {
+               size_t new_size = max(8U, s->size * 2);
+               void *n = krealloc(s->d,
+                                  new_size * sizeof(s->d[0]),
+                                  GFP_KERNEL);
+               if (!n) {
+                       pr_err("error allocating snapshot ID list");
+                       return -ENOMEM;
+               }
+
+               s->d    = n;
+               s->size = new_size;
+       };
+
+       s->d[s->nr++] = id;
+       return 0;
+}
+
+static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans,
+                                          struct snapshot_id_list *deleted,
+                                          enum btree_id btree_id)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct snapshot_id_list equiv_seen = { 0 };
+       struct bpos last_pos = POS_MIN;
+       int ret = 0;
+
+       /*
+        * XXX: We should also delete whiteouts that no longer overwrite
+        * anything
+        */
+
+       bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN,
+                            BTREE_ITER_INTENT|
+                            BTREE_ITER_PREFETCH|
+                            BTREE_ITER_NOT_EXTENTS|
+                            BTREE_ITER_ALL_SNAPSHOTS);
+
+       while ((bch2_trans_begin(trans),
+               (k = bch2_btree_iter_peek(&iter)).k) &&
+              !(ret = bkey_err(k))) {
+               u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv;
+
+               if (bkey_cmp(k.k->p, last_pos))
+                       equiv_seen.nr = 0;
+               last_pos = k.k->p;
+
+               if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
+                   snapshot_list_has_id(&equiv_seen, equiv)) {
+                       if (btree_id == BTREE_ID_inodes &&
+                           bch2_btree_key_cache_flush(trans, btree_id, iter.pos))
+                               continue;
+
+                       ret = __bch2_trans_do(trans, NULL, NULL,
+                                             BTREE_INSERT_NOFAIL,
+                               bch2_btree_iter_traverse(&iter) ?:
+                               bch2_btree_delete_at(trans, &iter,
+                                       BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
+                       if (ret)
+                               break;
+               } else {
+                       ret = snapshot_id_add(&equiv_seen, equiv);
+                       if (ret)
+                               break;
+               }
+
+               bch2_btree_iter_advance(&iter);
+       }
+       bch2_trans_iter_exit(trans, &iter);
+
+       kfree(equiv_seen.d);
+
+       return ret;
+}
+
+static void bch2_delete_dead_snapshots_work(struct work_struct *work)
+{
+       struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_s_c_snapshot snap;
+       struct snapshot_id_list deleted = { 0 };
+       u32 i, id, children[2];
+       int ret = 0;
+
+       bch2_trans_init(&trans, c, 0, 0);
+
+       /*
+        * For every snapshot node: If we have no live children and it's not
+        * pointed to by a subvolume, delete it:
+        */
+       for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
+                          POS_MIN, 0, k, ret) {
+               if (k.k->type != KEY_TYPE_snapshot)
+                       continue;
+
+               snap = bkey_s_c_to_snapshot(k);
+               if (BCH_SNAPSHOT_DELETED(snap.v) ||
+                   BCH_SNAPSHOT_SUBVOL(snap.v))
+                       continue;
+
+               children[0] = le32_to_cpu(snap.v->children[0]);
+               children[1] = le32_to_cpu(snap.v->children[1]);
+
+               ret   = snapshot_live(&trans, children[0]) ?:
+                       snapshot_live(&trans, children[1]);
+               if (ret < 0)
+                       break;
+               if (ret)
+                       continue;
+
+               ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+                       bch2_snapshot_node_set_deleted(&trans, iter.pos.offset));
+               if (ret) {
+                       bch_err(c, "error deleting snapshot %llu: %i", iter.pos.offset, ret);
+                       break;
+               }
+       }
+       bch2_trans_iter_exit(&trans, &iter);
+
+       if (ret) {
+               bch_err(c, "error walking snapshots: %i", ret);
+               goto err;
+       }
+
+       ret = bch2_snapshots_set_equiv(&trans);
+       if (ret)
+               goto err;
+
+       for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
+                          POS_MIN, 0, k, ret) {
+               if (k.k->type != KEY_TYPE_snapshot)
+                       continue;
+
+               snap = bkey_s_c_to_snapshot(k);
+               if (BCH_SNAPSHOT_DELETED(snap.v)) {
+                       ret = snapshot_id_add(&deleted, k.k->p.offset);
+                       if (ret)
+                               break;
+               }
+       }
+       bch2_trans_iter_exit(&trans, &iter);
+
+       if (ret) {
+               bch_err(c, "error walking snapshots: %i", ret);
+               goto err;
+       }
+
+       for (id = 0; id < BTREE_ID_NR; id++) {
+               if (!btree_type_has_snapshots(id))
+                       continue;
+
+               ret = bch2_snapshot_delete_keys_btree(&trans, &deleted, id);
+               if (ret) {
+                       bch_err(c, "error deleting snapshot keys: %i", ret);
+                       goto err;
+               }
+       }
+
+       for (i = 0; i < deleted.nr; i++) {
+               ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+                       bch2_snapshot_node_delete(&trans, deleted.d[i]));
+               if (ret) {
+                       bch_err(c, "error deleting snapshot %u: %i",
+                               deleted.d[i], ret);
+                       goto err;
+               }
+       }
+err:
+       kfree(deleted.d);
+       bch2_trans_exit(&trans);
+       percpu_ref_put(&c->writes);
+}
+
+static void bch2_delete_dead_snapshots(struct bch_fs *c)
+{
+       if (unlikely(!percpu_ref_tryget(&c->writes)))
+               return;
+
+       if (!queue_work(system_long_wq, &c->snapshot_delete_work))
+               percpu_ref_put(&c->writes);
+}
+
+static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans,
+                                          struct btree_trans_commit_hook *h)
+{
+       bch2_delete_dead_snapshots(trans->c);
+       return 0;
+}
+
+/* Subvolumes: */
+
+const char *bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+       if (bkey_cmp(k.k->p, SUBVOL_POS_MIN) < 0)
+               return "invalid pos";
+
+       if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0)
+               return "invalid pos";
+
+       if (bkey_val_bytes(k.k) != sizeof(struct bch_subvolume))
+               return "bad val size";
+
+       return NULL;
+}
+
+void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
+                           struct bkey_s_c k)
+{
+       struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
+
+       pr_buf(out, "root %llu snapshot id %u",
+              le64_to_cpu(s.v->inode),
+              le32_to_cpu(s.v->snapshot));
+}
+
+int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol,
+                               u32 *snapid)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes,
+                            POS(0, subvol),
+                            BTREE_ITER_CACHED|
+                            BTREE_ITER_WITH_UPDATES);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       if (k.k->type != KEY_TYPE_subvolume) {
+               bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvol);
+               ret = -EIO;
+               goto err;
+       }
+
+       *snapid = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot);
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+/* XXX: mark snapshot id for deletion, walk btree and delete: */
+int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid,
+                         int deleting_snapshot)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_s_c_subvolume subvol;
+       struct btree_trans_commit_hook *h;
+       struct bkey_i *delete;
+       u32 snapid;
+       int ret = 0;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes,
+                            POS(0, subvolid),
+                            BTREE_ITER_CACHED|
+                            BTREE_ITER_INTENT);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       if (k.k->type != KEY_TYPE_subvolume) {
+               bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid);
+               ret = -EIO;
+               goto err;
+       }
+
+       subvol = bkey_s_c_to_subvolume(k);
+       snapid = le32_to_cpu(subvol.v->snapshot);
+
+       if (deleting_snapshot >= 0 &&
+           deleting_snapshot != BCH_SUBVOLUME_SNAP(subvol.v)) {
+               ret = -ENOENT;
+               goto err;
+       }
+
+       delete = bch2_trans_kmalloc(trans, sizeof(*delete));
+       ret = PTR_ERR_OR_ZERO(delete);
+       if (ret)
+               goto err;
+
+       bkey_init(&delete->k);
+       delete->k.p = iter.pos;
+       ret = bch2_trans_update(trans, &iter, delete, 0);
+       if (ret)
+               goto err;
+
+       ret = bch2_snapshot_node_set_deleted(trans, snapid);
+
+       h = bch2_trans_kmalloc(trans, sizeof(*h));
+       ret = PTR_ERR_OR_ZERO(h);
+       if (ret)
+               goto err;
+
+       h->fn = bch2_delete_dead_snapshots_hook;
+       bch2_trans_commit_hook(trans, h);
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
+                         u32 src_subvolid,
+                         u32 *new_subvolid,
+                         u32 *new_snapshotid,
+                         bool ro)
+{
+       struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL };
+       struct bkey_i_subvolume *new_subvol = NULL;
+       struct bkey_i_subvolume *src_subvol = NULL;
+       struct bkey_s_c k;
+       u32 parent = 0, new_nodes[2], snapshot_subvols[2];
+       int ret = 0;
+
+       for_each_btree_key(trans, dst_iter, BTREE_ID_subvolumes, SUBVOL_POS_MIN,
+                          BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
+               if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0)
+                       break;
+               if (bkey_deleted(k.k))
+                       goto found_slot;
+       }
+
+       if (!ret)
+               ret = -ENOSPC;
+       goto err;
+found_slot:
+       snapshot_subvols[0] = dst_iter.pos.offset;
+       snapshot_subvols[1] = src_subvolid;
+
+       if (src_subvolid) {
+               /* Creating a snapshot: */
+               src_subvol = bch2_trans_kmalloc(trans, sizeof(*src_subvol));
+               ret = PTR_ERR_OR_ZERO(src_subvol);
+               if (ret)
+                       goto err;
+
+               bch2_trans_iter_init(trans, &src_iter, BTREE_ID_subvolumes,
+                                    POS(0, src_subvolid),
+                                    BTREE_ITER_CACHED|
+                                    BTREE_ITER_INTENT);
+               k = bch2_btree_iter_peek_slot(&src_iter);
+               ret = bkey_err(k);
+               if (ret)
+                       goto err;
+
+               if (k.k->type != KEY_TYPE_subvolume) {
+                       bch_err(trans->c, "subvolume %u not found", src_subvolid);
+                       ret = -ENOENT;
+                       goto err;
+               }
+
+               bkey_reassemble(&src_subvol->k_i, k);
+               parent = le32_to_cpu(src_subvol->v.snapshot);
+       }
+
+       ret = bch2_snapshot_node_create(trans, parent, new_nodes,
+                                       snapshot_subvols,
+                                       src_subvolid ? 2 : 1);
+       if (ret)
+               goto err;
+
+       if (src_subvolid) {
+               src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]);
+               bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0);
+       }
+
+       new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
+       ret = PTR_ERR_OR_ZERO(new_subvol);
+       if (ret)
+               goto err;
+
+       bkey_subvolume_init(&new_subvol->k_i);
+       new_subvol->v.flags     = 0;
+       new_subvol->v.snapshot  = cpu_to_le32(new_nodes[0]);
+       new_subvol->v.inode     = cpu_to_le64(inode);
+       SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro);
+       SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0);
+       new_subvol->k.p         = dst_iter.pos;
+       bch2_trans_update(trans, &dst_iter, &new_subvol->k_i, 0);
+
+       *new_subvolid   = new_subvol->k.p.offset;
+       *new_snapshotid = new_nodes[0];
+err:
+       bch2_trans_iter_exit(trans, &src_iter);
+       bch2_trans_iter_exit(trans, &dst_iter);
+       return ret;
+}
+
+int bch2_fs_subvolumes_init(struct bch_fs *c)
+{
+       INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work);
+       return 0;
+}
diff --git a/libbcachefs/subvolume.h b/libbcachefs/subvolume.h
new file mode 100644 (file)
index 0000000..0740c7b
--- /dev/null
@@ -0,0 +1,115 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUBVOLUME_H
+#define _BCACHEFS_SUBVOLUME_H
+
+void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+const char *bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_snapshot (struct bkey_ops) {             \
+       .key_invalid    = bch2_snapshot_invalid,                \
+       .val_to_text    = bch2_snapshot_to_text,                \
+}
+
+int bch2_mark_snapshot(struct bch_fs *, struct bkey_s_c,
+                      struct bkey_s_c, u64, unsigned);
+
+static inline struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
+{
+       return genradix_ptr(&c->snapshots, U32_MAX - id);
+}
+
+static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id)
+{
+       return snapshot_t(c, id)->parent;
+}
+
+static inline u32 bch2_snapshot_internal_node(struct bch_fs *c, u32 id)
+{
+       struct snapshot_t *s = snapshot_t(c, id);
+
+       return s->children[0] || s->children[1];
+}
+
+static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id)
+{
+       struct snapshot_t *s;
+       u32 parent = bch2_snapshot_parent(c, id);
+
+       if (!parent)
+               return 0;
+
+       s = snapshot_t(c, bch2_snapshot_parent(c, id));
+       if (id == s->children[0])
+               return s->children[1];
+       if (id == s->children[1])
+               return s->children[0];
+       return 0;
+}
+
+static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
+{
+       while (id && id < ancestor)
+               id = bch2_snapshot_parent(c, id);
+
+       return id == ancestor;
+}
+
+struct snapshots_seen {
+       struct bpos                     pos;
+       size_t                          nr;
+       size_t                          size;
+       u32                             *d;
+};
+
+static inline void snapshots_seen_exit(struct snapshots_seen *s)
+{
+       kfree(s->d);
+       s->d = NULL;
+}
+
+static inline void snapshots_seen_init(struct snapshots_seen *s)
+{
+       memset(s, 0, sizeof(*s));
+}
+
+static inline int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id)
+{
+       if (s->nr == s->size) {
+               size_t new_size = max(s->size, 128UL) * 2;
+               u32 *d = krealloc(s->d, new_size * sizeof(s->d[0]), GFP_KERNEL);
+
+               if (!d) {
+                       bch_err(c, "error reallocating snapshots_seen table (new size %zu)",
+                               new_size);
+                       return -ENOMEM;
+               }
+
+               s->size = new_size;
+               s->d    = d;
+       }
+
+       s->d[s->nr++] = id;
+       return 0;
+}
+
+int bch2_fs_snapshots_check(struct bch_fs *);
+void bch2_fs_snapshots_exit(struct bch_fs *);
+int bch2_fs_snapshots_start(struct bch_fs *);
+
+const char *bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_subvolume (struct bkey_ops) {            \
+       .key_invalid    = bch2_subvolume_invalid,               \
+       .val_to_text    = bch2_subvolume_to_text,               \
+}
+
+int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
+
+int bch2_subvolume_delete(struct btree_trans *, u32, int);
+int bch2_subvolume_create(struct btree_trans *, u64, u32,
+                         u32 *, u32 *, bool);
+
+int bch2_fs_subvolumes_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_SUBVOLUME_H */
index 8f847661359498c71cb203faa682b7e87f1ec336..1feb7dee2e0c1ca744683cb9158f8596fdcb4f32 100644 (file)
@@ -39,6 +39,7 @@
 #include "rebalance.h"
 #include "recovery.h"
 #include "replicas.h"
+#include "subvolume.h"
 #include "super.h"
 #include "super-io.h"
 #include "sysfs.h"
@@ -468,6 +469,7 @@ static void __bch2_fs_free(struct bch_fs *c)
        for (i = 0; i < BCH_TIME_STAT_NR; i++)
                bch2_time_stats_exit(&c->times[i]);
 
+       bch2_fs_snapshots_exit(c);
        bch2_fs_quota_exit(c);
        bch2_fs_fsio_exit(c);
        bch2_fs_ec_exit(c);
@@ -686,6 +688,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        mutex_init(&c->usage_scratch_lock);
 
        mutex_init(&c->bio_bounce_pages_lock);
+       mutex_init(&c->snapshot_table_lock);
 
        spin_lock_init(&c->btree_write_error_lock);
 
@@ -789,6 +792,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
            bch2_fs_btree_key_cache_init(&c->btree_key_cache) ||
            bch2_fs_btree_iter_init(c) ||
            bch2_fs_btree_interior_update_init(c) ||
+           bch2_fs_subvolumes_init(c) ||
            bch2_fs_io_init(c) ||
            bch2_fs_encryption_init(c) ||
            bch2_fs_compress_init(c) ||
index ef6ae97e0df58886d89785bfe7cd522e3d1b6409..a182e242a0e817ec3cd9af31081c46a3d05911ce 100644 (file)
@@ -128,7 +128,7 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info
        int ret;
 
        ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash,
-                              inode->v.i_ino,
+                              inode_inum(inode),
                               &X_SEARCH(type, name, strlen(name)),
                               0);
        if (ret)
@@ -160,7 +160,7 @@ int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode,
                bch2_xattr_get_trans(&trans, inode, name, buffer, size, type));
 }
 
-int bch2_xattr_set(struct btree_trans *trans, u64 inum,
+int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
                   const struct bch_hash_info *hash_info,
                   const char *name, const void *value, size_t size,
                   int type, int flags)
@@ -282,13 +282,21 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
        struct btree_iter iter;
        struct bkey_s_c k;
        struct xattr_buf buf = { .buf = buffer, .len = buffer_size };
-       u64 inum = dentry->d_inode->i_ino;
+       u64 offset = 0, inum = inode->ei_inode.bi_inum;
+       u32 snapshot;
        int ret;
 
        bch2_trans_init(&trans, c, 0, 0);
+retry:
+       bch2_trans_begin(&trans);
+       iter = (struct btree_iter) { NULL };
+
+       ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot);
+       if (ret)
+               goto err;
 
        for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
-                          POS(inum, 0), 0, k, ret) {
+                          SPOS(inum, offset, snapshot), 0, k, ret) {
                BUG_ON(k.k->p.inode < inum);
 
                if (k.k->p.inode > inum)
@@ -301,7 +309,12 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
                if (ret)
                        break;
        }
+
+       offset = iter.pos.offset;
        bch2_trans_iter_exit(&trans, &iter);
+err:
+       if (ret == -EINTR)
+               goto retry;
 
        ret = bch2_trans_exit(&trans) ?: ret;
 
@@ -340,7 +353,7 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler,
        struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
 
        return bch2_trans_do(c, NULL, &inode->ei_journal_seq, 0,
-                       bch2_xattr_set(&trans, inode->v.i_ino, &hash,
+                       bch2_xattr_set(&trans, inode_inum(inode), &hash,
                                       name, value, size,
                                       handler->flags, flags));
 }
index 4151065ab853546c3f071a831cfba10d9af03010..f4f896545e1c29f0ff35018263bf6b227250567b 100644 (file)
@@ -39,7 +39,8 @@ struct bch_inode_info;
 int bch2_xattr_get(struct bch_fs *, struct bch_inode_info *,
                  const char *, void *, size_t, int);
 
-int bch2_xattr_set(struct btree_trans *, u64, const struct bch_hash_info *,
+int bch2_xattr_set(struct btree_trans *, subvol_inum,
+                  const struct bch_hash_info *,
                   const char *, const void *, size_t, int, int);
 
 ssize_t bch2_xattr_list(struct dentry *, char *, size_t);