git.sesse.net Git - bcachefs-tools-debian/blobdiff - libbcachefs/fs.c
Update bcachefs sources to a5da815430 bcachefs: Convert constants to consts
[bcachefs-tools-debian] / libbcachefs / fs.c
index 45a2af3f59cabe85ec6ad647fa5bc0e62c6c2470..c23309f1a1aeac7226e0565865bc3f482b8a83ed 100644 (file)
@@ -8,6 +8,7 @@
 #include "buckets.h"
 #include "chardev.h"
 #include "dirent.h"
+#include "errcode.h"
 #include "extents.h"
 #include "fs.h"
 #include "fs-common.h"
 #include <linux/pagemap.h>
 #include <linux/posix_acl.h>
 #include <linux/random.h>
+#include <linux/seq_file.h>
 #include <linux/statfs.h>
 #include <linux/string.h>
 #include <linux/xattr.h>
 
 static struct kmem_cache *bch2_inode_cache;
 
-static void bch2_vfs_inode_init(struct bch_fs *, subvol_inum,
+static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
                                struct bch_inode_info *,
-                               struct bch_inode_unpacked *);
+                               struct bch_inode_unpacked *,
+                               struct bch_subvolume *);
 
-static void journal_seq_copy(struct bch_fs *c,
-                            struct bch_inode_info *dst,
-                            u64 journal_seq)
-{
-       /*
-        * atomic64_cmpxchg has a fallback for archs that don't support it,
-        * cmpxchg does not:
-        */
-       atomic64_t *dst_seq = (void *) &dst->ei_journal_seq;
-       u64 old, v = READ_ONCE(dst->ei_journal_seq);
-
-       do {
-               old = v;
-
-               if (old >= journal_seq)
-                       break;
-       } while ((v = atomic64_cmpxchg(dst_seq, old, journal_seq)) != old);
-
-       bch2_journal_set_has_inum(&c->journal, dst->v.i_ino, journal_seq);
-}
-
-static void __pagecache_lock_put(struct pagecache_lock *lock, long i)
-{
-       BUG_ON(atomic_long_read(&lock->v) == 0);
-
-       if (atomic_long_sub_return_release(i, &lock->v) == 0)
-               wake_up_all(&lock->wait);
-}
-
-static bool __pagecache_lock_tryget(struct pagecache_lock *lock, long i)
-{
-       long v = atomic_long_read(&lock->v), old;
-
-       do {
-               old = v;
-
-               if (i > 0 ? v < 0 : v > 0)
-                       return false;
-       } while ((v = atomic_long_cmpxchg_acquire(&lock->v,
-                                       old, old + i)) != old);
-       return true;
-}
-
-static void __pagecache_lock_get(struct pagecache_lock *lock, long i)
-{
-       wait_event(lock->wait, __pagecache_lock_tryget(lock, i));
-}
-
-void bch2_pagecache_add_put(struct pagecache_lock *lock)
-{
-       __pagecache_lock_put(lock, 1);
-}
-
-bool bch2_pagecache_add_tryget(struct pagecache_lock *lock)
-{
-       return __pagecache_lock_tryget(lock, 1);
-}
-
-void bch2_pagecache_add_get(struct pagecache_lock *lock)
-{
-       __pagecache_lock_get(lock, 1);
-}
-
-void bch2_pagecache_block_put(struct pagecache_lock *lock)
-{
-       __pagecache_lock_put(lock, -1);
-}
-
-void bch2_pagecache_block_get(struct pagecache_lock *lock)
-{
-       __pagecache_lock_get(lock, -1);
-}
-
-void bch2_inode_update_after_write(struct bch_fs *c,
+void bch2_inode_update_after_write(struct btree_trans *trans,
                                   struct bch_inode_info *inode,
                                   struct bch_inode_unpacked *bi,
                                   unsigned fields)
 {
+       struct bch_fs *c = trans->c;
+
+       BUG_ON(bi->bi_inum != inode->v.i_ino);
+
+       bch2_assert_pos_locked(trans, BTREE_ID_inodes,
+                              POS(0, bi->bi_inum),
+                              c->opts.inodes_use_key_cache);
+
        set_nlink(&inode->v, bch2_inode_nlink_get(bi));
        i_uid_write(&inode->v, bi->bi_uid);
        i_gid_write(&inode->v, bi->bi_gid);
@@ -153,20 +91,18 @@ retry:
                                BTREE_ITER_INTENT) ?:
                (set ? set(inode, &inode_u, p) : 0) ?:
                bch2_inode_write(&trans, &iter, &inode_u) ?:
-               bch2_trans_commit(&trans, NULL,
-                                 &inode->ei_journal_seq,
-                                 BTREE_INSERT_NOFAIL);
+               bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL);
 
        /*
         * the btree node lock protects inode->ei_inode, not ei_update_lock;
         * this is important for inode updates via bchfs_write_index_update
         */
        if (!ret)
-               bch2_inode_update_after_write(c, inode, &inode_u, fields);
+               bch2_inode_update_after_write(&trans, inode, &inode_u, fields);
 
        bch2_trans_iter_exit(&trans, &iter);
 
-       if (ret == -EINTR)
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
 
        bch2_trans_exit(&trans);
@@ -237,6 +173,8 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
 {
        struct bch_inode_unpacked inode_u;
        struct bch_inode_info *inode;
+       struct btree_trans trans;
+       struct bch_subvolume subvol;
        int ret;
 
        inode = to_bch_ei(iget5_locked(c->vfs_sb,
@@ -249,16 +187,20 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
        if (!(inode->v.i_state & I_NEW))
                return &inode->v;
 
-       ret = bch2_inode_find_by_inum(c, inum, &inode_u);
+       bch2_trans_init(&trans, c, 8, 0);
+       ret = lockrestart_do(&trans,
+               bch2_subvolume_get(&trans, inum.subvol, true, 0, &subvol) ?:
+               bch2_inode_find_by_inum_trans(&trans, inum, &inode_u));
+
+       if (!ret)
+               bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol);
+       bch2_trans_exit(&trans);
+
        if (ret) {
                iget_failed(&inode->v);
                return ERR_PTR(ret);
        }
 
-       bch2_vfs_inode_init(c, inum, inode, &inode_u);
-
-       inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum.inum);
-
        unlock_new_inode(&inode->v);
 
        return &inode->v;
@@ -277,6 +219,7 @@ __bch2_create(struct user_namespace *mnt_userns,
        struct bch_inode_unpacked inode_u;
        struct posix_acl *default_acl = NULL, *acl = NULL;
        subvol_inum inum;
+       struct bch_subvolume subvol;
        u64 journal_seq = 0;
        int ret;
 
@@ -319,28 +262,29 @@ retry:
        if (unlikely(ret))
                goto err_before_quota;
 
-       ret   = bch2_trans_commit(&trans, NULL, &journal_seq, 0);
+       inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
+       inum.inum = inode_u.bi_inum;
+
+       ret   = bch2_subvolume_get(&trans, inum.subvol, true,
+                                  BTREE_ITER_WITH_UPDATES, &subvol) ?:
+               bch2_trans_commit(&trans, NULL, &journal_seq, 0);
        if (unlikely(ret)) {
                bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
                                KEY_TYPE_QUOTA_WARN);
 err_before_quota:
-               if (ret == -EINTR)
+               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        goto retry;
                goto err_trans;
        }
 
        if (!(flags & BCH_CREATE_TMPFILE)) {
-               bch2_inode_update_after_write(c, dir, &dir_u,
+               bch2_inode_update_after_write(&trans, dir, &dir_u,
                                              ATTR_MTIME|ATTR_CTIME);
-               journal_seq_copy(c, dir, journal_seq);
                mutex_unlock(&dir->ei_update_lock);
        }
 
-       inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
-       inum.inum = inode_u.bi_inum;
-
-       bch2_vfs_inode_init(c, inum, inode, &inode_u);
-       journal_seq_copy(c, inode, journal_seq);
+       bch2_iget5_set(&inode->v, &inum);
+       bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol);
 
        set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
        set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
@@ -365,7 +309,6 @@ err_before_quota:
                 * We raced, another process pulled the new inode into cache
                 * before us:
                 */
-               journal_seq_copy(c, old, journal_seq);
                make_bad_inode(&inode->v);
                iput(&inode->v);
 
@@ -424,7 +367,7 @@ static int bch2_mknod(struct user_namespace *mnt_userns,
                              (subvol_inum) { 0 }, 0);
 
        if (IS_ERR(inode))
-               return PTR_ERR(inode);
+               return bch2_err_class(PTR_ERR(inode));
 
        d_instantiate(dentry, &inode->v);
        return 0;
@@ -449,19 +392,16 @@ static int __bch2_link(struct bch_fs *c,
        mutex_lock(&inode->ei_update_lock);
        bch2_trans_init(&trans, c, 4, 1024);
 
-       ret = __bch2_trans_do(&trans, NULL, &inode->ei_journal_seq, 0,
+       ret = commit_do(&trans, NULL, NULL, 0,
                        bch2_link_trans(&trans,
                                        inode_inum(dir),   &dir_u,
                                        inode_inum(inode), &inode_u,
                                        &dentry->d_name));
 
        if (likely(!ret)) {
-               BUG_ON(inode_u.bi_inum != inode->v.i_ino);
-
-               journal_seq_copy(c, inode, dir->ei_journal_seq);
-               bch2_inode_update_after_write(c, dir, &dir_u,
+               bch2_inode_update_after_write(&trans, dir, &dir_u,
                                              ATTR_MTIME|ATTR_CTIME);
-               bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME);
+               bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME);
        }
 
        bch2_trans_exit(&trans);
@@ -489,7 +429,7 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
 }
 
 int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
-                 int deleting_snapshot)
+                 bool deleting_snapshot)
 {
        struct bch_fs *c = vdir->i_sb->s_fs_info;
        struct bch_inode_info *dir = to_bch_ei(vdir);
@@ -501,7 +441,7 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
        bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
        bch2_trans_init(&trans, c, 4, 1024);
 
-       ret = __bch2_trans_do(&trans, NULL, &dir->ei_journal_seq,
+       ret = commit_do(&trans, NULL, NULL,
                              BTREE_INSERT_NOFAIL,
                        bch2_unlink_trans(&trans,
                                          inode_inum(dir), &dir_u,
@@ -509,12 +449,9 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
                                          deleting_snapshot));
 
        if (likely(!ret)) {
-               BUG_ON(inode_u.bi_inum != inode->v.i_ino);
-
-               journal_seq_copy(c, inode, dir->ei_journal_seq);
-               bch2_inode_update_after_write(c, dir, &dir_u,
+               bch2_inode_update_after_write(&trans, dir, &dir_u,
                                              ATTR_MTIME|ATTR_CTIME);
-               bch2_inode_update_after_write(c, inode, &inode_u,
+               bch2_inode_update_after_write(&trans, inode, &inode_u,
                                              ATTR_MTIME);
        }
 
@@ -526,7 +463,7 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
 
 static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
 {
-       return __bch2_unlink(vdir, dentry, -1);
+       return __bch2_unlink(vdir, dentry, false);
 }
 
 static int bch2_symlink(struct user_namespace *mnt_userns,
@@ -539,8 +476,8 @@ static int bch2_symlink(struct user_namespace *mnt_userns,
 
        inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
                              (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
-       if (unlikely(IS_ERR(inode)))
-               return PTR_ERR(inode);
+       if (IS_ERR(inode))
+               return bch2_err_class(PTR_ERR(inode));
 
        inode_lock(&inode->v);
        ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
@@ -553,8 +490,6 @@ static int bch2_symlink(struct user_namespace *mnt_userns,
        if (unlikely(ret))
                goto err;
 
-       journal_seq_copy(c, dir, inode->ei_journal_seq);
-
        ret = __bch2_link(c, inode, dir, dentry);
        if (unlikely(ret))
                goto err;
@@ -589,7 +524,6 @@ static int bch2_rename2(struct user_namespace *mnt_userns,
                ? BCH_RENAME_EXCHANGE
                : dst_dentry->d_inode
                ? BCH_RENAME_OVERWRITE : BCH_RENAME;
-       u64 journal_seq = 0;
        int ret;
 
        if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
@@ -629,7 +563,7 @@ static int bch2_rename2(struct user_namespace *mnt_userns,
                        goto err;
        }
 
-       ret = __bch2_trans_do(&trans, NULL, &journal_seq, 0,
+       ret = commit_do(&trans, NULL, NULL, 0,
                        bch2_rename_trans(&trans,
                                          inode_inum(src_dir), &src_dir_u,
                                          inode_inum(dst_dir), &dst_dir_u,
@@ -645,25 +579,19 @@ static int bch2_rename2(struct user_namespace *mnt_userns,
        BUG_ON(dst_inode &&
               dst_inode->v.i_ino != dst_inode_u.bi_inum);
 
-       bch2_inode_update_after_write(c, src_dir, &src_dir_u,
+       bch2_inode_update_after_write(&trans, src_dir, &src_dir_u,
                                      ATTR_MTIME|ATTR_CTIME);
-       journal_seq_copy(c, src_dir, journal_seq);
 
-       if (src_dir != dst_dir) {
-               bch2_inode_update_after_write(c, dst_dir, &dst_dir_u,
+       if (src_dir != dst_dir)
+               bch2_inode_update_after_write(&trans, dst_dir, &dst_dir_u,
                                              ATTR_MTIME|ATTR_CTIME);
-               journal_seq_copy(c, dst_dir, journal_seq);
-       }
 
-       bch2_inode_update_after_write(c, src_inode, &src_inode_u,
+       bch2_inode_update_after_write(&trans, src_inode, &src_inode_u,
                                      ATTR_CTIME);
-       journal_seq_copy(c, src_inode, journal_seq);
 
-       if (dst_inode) {
-               bch2_inode_update_after_write(c, dst_inode, &dst_inode_u,
+       if (dst_inode)
+               bch2_inode_update_after_write(&trans, dst_inode, &dst_inode_u,
                                              ATTR_CTIME);
-               journal_seq_copy(c, dst_inode, journal_seq);
-       }
 err:
        bch2_trans_exit(&trans);
 
@@ -739,10 +667,10 @@ int bch2_setattr_nonsize(struct user_namespace *mnt_userns,
        qid = inode->ei_qid;
 
        if (attr->ia_valid & ATTR_UID)
-               qid.q[QTYP_USR] = from_kuid(&init_user_ns, attr->ia_uid);
+               qid.q[QTYP_USR] = from_kuid(mnt_userns, attr->ia_uid);
 
        if (attr->ia_valid & ATTR_GID)
-               qid.q[QTYP_GRP] = from_kgid(&init_user_ns, attr->ia_gid);
+               qid.q[QTYP_GRP] = from_kgid(mnt_userns, attr->ia_gid);
 
        ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
                                     KEY_TYPE_QUOTA_PREALLOC);
@@ -770,18 +698,17 @@ retry:
        }
 
        ret =   bch2_inode_write(&trans, &inode_iter, &inode_u) ?:
-               bch2_trans_commit(&trans, NULL,
-                                 &inode->ei_journal_seq,
+               bch2_trans_commit(&trans, NULL, NULL,
                                  BTREE_INSERT_NOFAIL);
 btree_err:
        bch2_trans_iter_exit(&trans, &inode_iter);
 
-       if (ret == -EINTR)
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
        if (unlikely(ret))
                goto err_trans;
 
-       bch2_inode_update_after_write(c, inode, &inode_u, attr->ia_valid);
+       bch2_inode_update_after_write(&trans, inode, &inode_u, attr->ia_valid);
 
        if (acl)
                set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
@@ -790,7 +717,7 @@ err_trans:
 err:
        mutex_unlock(&inode->ei_update_lock);
 
-       return ret;
+       return bch2_err_class(ret);
 }
 
 static int bch2_getattr(struct user_namespace *mnt_userns,
@@ -852,18 +779,19 @@ static int bch2_setattr(struct user_namespace *mnt_userns,
 }
 
 static int bch2_tmpfile(struct user_namespace *mnt_userns,
-                       struct inode *vdir, struct dentry *dentry, umode_t mode)
+                       struct inode *vdir, struct file *file, umode_t mode)
 {
        struct bch_inode_info *inode =
-               __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, 0,
+               __bch2_create(mnt_userns, to_bch_ei(vdir),
+                             file->f_path.dentry, mode, 0,
                              (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
 
        if (IS_ERR(inode))
-               return PTR_ERR(inode);
+               return bch2_err_class(PTR_ERR(inode));
 
-       d_mark_tmpfile(dentry, &inode->v);
-       d_instantiate(dentry, &inode->v);
-       return 0;
+       d_mark_tmpfile(file, &inode->v);
+       d_instantiate(file->f_path.dentry, &inode->v);
+       return finish_open_simple(file, 0);
 }
 
 static int bch2_fill_extent(struct bch_fs *c,
@@ -883,13 +811,16 @@ static int bch2_fill_extent(struct bch_fs *c,
                        int flags2 = 0;
                        u64 offset = p.ptr.offset;
 
+                       if (p.ptr.unwritten)
+                               flags2 |= FIEMAP_EXTENT_UNWRITTEN;
+
                        if (p.crc.compression_type)
                                flags2 |= FIEMAP_EXTENT_ENCODED;
                        else
                                offset += p.crc.offset;
 
-                       if ((offset & (c->opts.block_size - 1)) ||
-                           (k.k->size & (c->opts.block_size - 1)))
+                       if ((offset & (block_sectors(c) - 1)) ||
+                           (k.k->size & (block_sectors(c) - 1)))
                                flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
 
                        ret = fiemap_fill_next_extent(info,
@@ -956,9 +887,9 @@ retry:
        bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
                             SPOS(ei->v.i_ino, start, snapshot), 0);
 
-       while ((k = bch2_btree_iter_peek(&iter)).k &&
-              !(ret = bkey_err(k)) &&
-              bkey_cmp(iter.pos, end) < 0) {
+       while (!(ret = btree_trans_too_many_iters(&trans)) &&
+              (k = bch2_btree_iter_peek_upto(&iter, end)).k &&
+              !(ret = bkey_err(k))) {
                enum btree_id data_btree = BTREE_ID_extents;
 
                if (!bkey_extent_is_data(k.k) &&
@@ -1007,7 +938,7 @@ retry:
        start = iter.pos.offset;
        bch2_trans_iter_exit(&trans, &iter);
 err:
-       if (ret == -EINTR)
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
 
        if (!ret && have_extent)
@@ -1134,69 +1065,258 @@ static const struct inode_operations bch_special_inode_operations = {
 };
 
 static const struct address_space_operations bch_address_space_operations = {
-       .writepage      = bch2_writepage,
-       .readpage       = bch2_readpage,
+       .read_folio     = bch2_read_folio,
        .writepages     = bch2_writepages,
        .readahead      = bch2_readahead,
-       .set_page_dirty = __set_page_dirty_nobuffers,
+       .dirty_folio    = filemap_dirty_folio,
        .write_begin    = bch2_write_begin,
        .write_end      = bch2_write_end,
-       .invalidatepage = bch2_invalidatepage,
-       .releasepage    = bch2_releasepage,
+       .invalidate_folio = bch2_invalidate_folio,
+       .release_folio  = bch2_release_folio,
        .direct_IO      = noop_direct_IO,
 #ifdef CONFIG_MIGRATION
-       .migratepage    = bch2_migrate_page,
+       .migrate_folio  = filemap_migrate_folio,
 #endif
        .error_remove_page = generic_error_remove_page,
 };
 
-#if 0
-static struct inode *bch2_nfs_get_inode(struct super_block *sb,
-               u64 ino, u32 generation)
+struct bcachefs_fid {
+       u64             inum;
+       u32             subvol;
+       u32             gen;
+} __packed;
+
+struct bcachefs_fid_with_parent {
+       struct bcachefs_fid     fid;
+       struct bcachefs_fid     dir;
+} __packed;
+
+static int bcachefs_fid_valid(int fh_len, int fh_type)
 {
-       struct bch_fs *c = sb->s_fs_info;
-       struct inode *vinode;
+       switch (fh_type) {
+       case FILEID_BCACHEFS_WITHOUT_PARENT:
+               return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32);
+       case FILEID_BCACHEFS_WITH_PARENT:
+               return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32);
+       default:
+               return false;
+       }
+}
+
+static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
+{
+       return (struct bcachefs_fid) {
+               .inum   = inode->ei_inode.bi_inum,
+               .subvol = inode->ei_subvol,
+               .gen    = inode->ei_inode.bi_generation,
+       };
+}
+
+static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len,
+                         struct inode *vdir)
+{
+       struct bch_inode_info *inode    = to_bch_ei(vinode);
+       struct bch_inode_info *dir      = to_bch_ei(vdir);
+
+       if (*len < sizeof(struct bcachefs_fid_with_parent) / sizeof(u32))
+               return FILEID_INVALID;
+
+       if (!S_ISDIR(inode->v.i_mode) && dir) {
+               struct bcachefs_fid_with_parent *fid = (void *) fh;
 
-       if (ino < BCACHEFS_ROOT_INO)
-               return ERR_PTR(-ESTALE);
+               fid->fid = bch2_inode_to_fid(inode);
+               fid->dir = bch2_inode_to_fid(dir);
 
-       vinode = bch2_vfs_inode_get(c, ino);
-       if (IS_ERR(vinode))
-               return ERR_CAST(vinode);
-       if (generation && vinode->i_generation != generation) {
-               /* we didn't find the right inode.. */
+               *len = sizeof(*fid) / sizeof(u32);
+               return FILEID_BCACHEFS_WITH_PARENT;
+       } else {
+               struct bcachefs_fid *fid = (void *) fh;
+
+               *fid = bch2_inode_to_fid(inode);
+
+               *len = sizeof(*fid) / sizeof(u32);
+               return FILEID_BCACHEFS_WITHOUT_PARENT;
+       }
+}
+
+static struct inode *bch2_nfs_get_inode(struct super_block *sb,
+                                       struct bcachefs_fid fid)
+{
+       struct bch_fs *c = sb->s_fs_info;
+       struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) {
+                                   .subvol = fid.subvol,
+                                   .inum = fid.inum,
+       });
+       if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) {
                iput(vinode);
-               return ERR_PTR(-ESTALE);
+               vinode = ERR_PTR(-ESTALE);
        }
        return vinode;
 }
 
-static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *fid,
+static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid,
                int fh_len, int fh_type)
 {
-       return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
-                                   bch2_nfs_get_inode);
+       struct bcachefs_fid *fid = (void *) _fid;
+
+       if (!bcachefs_fid_valid(fh_len, fh_type))
+               return NULL;
+
+       return d_obtain_alias(bch2_nfs_get_inode(sb, *fid));
 }
 
-static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid,
+static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid,
                int fh_len, int fh_type)
 {
-       return generic_fh_to_parent(sb, fid, fh_len, fh_type,
-                                   bch2_nfs_get_inode);
+       struct bcachefs_fid_with_parent *fid = (void *) _fid;
+
+       if (!bcachefs_fid_valid(fh_len, fh_type) ||
+           fh_type != FILEID_BCACHEFS_WITH_PARENT)
+               return NULL;
+
+       return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir));
+}
+
+static struct dentry *bch2_get_parent(struct dentry *child)
+{
+       struct bch_inode_info *inode = to_bch_ei(child->d_inode);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       subvol_inum parent_inum = {
+               .subvol = inode->ei_inode.bi_parent_subvol ?:
+                       inode->ei_subvol,
+               .inum = inode->ei_inode.bi_dir,
+       };
+
+       if (!parent_inum.inum)
+               return NULL;
+
+       return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum));
+}
+
+static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child)
+{
+       struct bch_inode_info *inode    = to_bch_ei(child->d_inode);
+       struct bch_inode_info *dir      = to_bch_ei(parent->d_inode);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct btree_trans trans;
+       struct btree_iter iter1;
+       struct btree_iter iter2;
+       struct bkey_s_c k;
+       struct bkey_s_c_dirent d;
+       struct bch_inode_unpacked inode_u;
+       subvol_inum target;
+       u32 snapshot;
+       unsigned name_len;
+       int ret;
+
+       if (!S_ISDIR(dir->v.i_mode))
+               return -EINVAL;
+
+       bch2_trans_init(&trans, c, 0, 0);
+
+       bch2_trans_iter_init(&trans, &iter1, BTREE_ID_dirents,
+                            POS(dir->ei_inode.bi_inum, 0), 0);
+       bch2_trans_iter_init(&trans, &iter2, BTREE_ID_dirents,
+                            POS(dir->ei_inode.bi_inum, 0), 0);
+retry:
+       bch2_trans_begin(&trans);
+
+       ret = bch2_subvolume_get_snapshot(&trans, dir->ei_subvol, &snapshot);
+       if (ret)
+               goto err;
+
+       bch2_btree_iter_set_snapshot(&iter1, snapshot);
+       bch2_btree_iter_set_snapshot(&iter2, snapshot);
+
+       ret = bch2_inode_find_by_inum_trans(&trans, inode_inum(inode), &inode_u);
+       if (ret)
+               goto err;
+
+       if (inode_u.bi_dir == dir->ei_inode.bi_inum) {
+               bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));
+
+               k = bch2_btree_iter_peek_slot(&iter1);
+               ret = bkey_err(k);
+               if (ret)
+                       goto err;
+
+               if (k.k->type != KEY_TYPE_dirent) {
+                       ret = -ENOENT;
+                       goto err;
+               }
+
+               d = bkey_s_c_to_dirent(k);
+               ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target);
+               if (ret > 0)
+                       ret = -ENOENT;
+               if (ret)
+                       goto err;
+
+               if (target.subvol       == inode->ei_subvol &&
+                   target.inum         == inode->ei_inode.bi_inum)
+                       goto found;
+       } else {
+               /*
+                * File with multiple hardlinks and our backref is to the wrong
+                * directory - linear search:
+                */
+               for_each_btree_key_continue_norestart(iter2, 0, k, ret) {
+                       if (k.k->p.inode > dir->ei_inode.bi_inum)
+                               break;
+
+                       if (k.k->type != KEY_TYPE_dirent)
+                               continue;
+
+                       d = bkey_s_c_to_dirent(k);
+                       ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target);
+                       if (ret < 0)
+                               break;
+                       if (ret)
+                               continue;
+
+                       if (target.subvol       == inode->ei_subvol &&
+                           target.inum         == inode->ei_inode.bi_inum)
+                               goto found;
+               }
+       }
+
+       ret = -ENOENT;
+       goto err;
+found:
+       name_len = min_t(unsigned, bch2_dirent_name_bytes(d), NAME_MAX);
+
+       memcpy(name, d.v->d_name, name_len);
+       name[name_len] = '\0';
+err:
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+               goto retry;
+
+       bch2_trans_iter_exit(&trans, &iter1);
+       bch2_trans_iter_exit(&trans, &iter2);
+       bch2_trans_exit(&trans);
+
+       return ret;
 }
-#endif
 
 static const struct export_operations bch_export_ops = {
-       //.fh_to_dentry = bch2_fh_to_dentry,
-       //.fh_to_parent = bch2_fh_to_parent,
-       //.get_parent   = bch2_get_parent,
+       .encode_fh      = bch2_encode_fh,
+       .fh_to_dentry   = bch2_fh_to_dentry,
+       .fh_to_parent   = bch2_fh_to_parent,
+       .get_parent     = bch2_get_parent,
+       .get_name       = bch2_get_name,
 };
 
-static void bch2_vfs_inode_init(struct bch_fs *c, subvol_inum inum,
+static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
                                struct bch_inode_info *inode,
-                               struct bch_inode_unpacked *bi)
+                               struct bch_inode_unpacked *bi,
+                               struct bch_subvolume *subvol)
 {
-       bch2_inode_update_after_write(c, inode, bi, ~0);
+       bch2_inode_update_after_write(trans, inode, bi, ~0);
+
+       if (BCH_SUBVOLUME_SNAP(subvol))
+               set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
+       else
+               clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
 
        inode->v.i_blocks       = bi->bi_sectors;
        inode->v.i_ino          = bi->bi_inum;
@@ -1205,7 +1325,6 @@ static void bch2_vfs_inode_init(struct bch_fs *c, subvol_inum inum,
        inode->v.i_size         = bi->bi_size;
 
        inode->ei_flags         = 0;
-       inode->ei_journal_seq   = 0;
        inode->ei_quota_reserved = 0;
        inode->ei_qid           = bch_qid(bi);
        inode->ei_subvol        = inum.subvol;
@@ -1242,9 +1361,8 @@ static struct inode *bch2_alloc_inode(struct super_block *sb)
 
        inode_init_once(&inode->v);
        mutex_init(&inode->ei_update_lock);
-       pagecache_lock_init(&inode->ei_pagecache_lock);
+       two_state_lock_init(&inode->ei_pagecache_lock);
        mutex_init(&inode->ei_quota_lock);
-       inode->ei_journal_seq = 0;
 
        return &inode->v;
 }
@@ -1287,7 +1405,7 @@ static int bch2_vfs_write_inode(struct inode *vinode,
                               ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
        mutex_unlock(&inode->ei_update_lock);
 
-       return ret;
+       return bch2_err_class(ret);
 }
 
 static void bch2_evict_inode(struct inode *vinode)
@@ -1306,8 +1424,55 @@ static void bch2_evict_inode(struct inode *vinode)
                                KEY_TYPE_QUOTA_WARN);
                bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
                                KEY_TYPE_QUOTA_WARN);
-               bch2_inode_rm(c, inode_inum(inode), true);
+               bch2_inode_rm(c, inode_inum(inode));
+       }
+}
+
+void bch2_evict_subvolume_inodes(struct bch_fs *c,
+                                snapshot_id_list *s)
+{
+       struct super_block *sb = c->vfs_sb;
+       struct inode *inode;
+
+       spin_lock(&sb->s_inode_list_lock);
+       list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+               if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) ||
+                   (inode->i_state & I_FREEING))
+                       continue;
+
+               d_mark_dontcache(inode);
+               d_prune_aliases(inode);
+       }
+       spin_unlock(&sb->s_inode_list_lock);
+again:
+       cond_resched();
+       spin_lock(&sb->s_inode_list_lock);
+       list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+               if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) ||
+                   (inode->i_state & I_FREEING))
+                       continue;
+
+               if (!(inode->i_state & I_DONTCACHE)) {
+                       d_mark_dontcache(inode);
+                       d_prune_aliases(inode);
+               }
+
+               spin_lock(&inode->i_lock);
+               if (snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) &&
+                   !(inode->i_state & I_FREEING)) {
+                       wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_NEW);
+                       DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
+                       prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+                       spin_unlock(&inode->i_lock);
+                       spin_unlock(&sb->s_inode_list_lock);
+                       schedule();
+                       finish_wait(wq, &wait.wq_entry);
+                       goto again;
+               }
+
+               spin_unlock(&inode->i_lock);
        }
+       spin_unlock(&sb->s_inode_list_lock);
 }
 
 static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -1344,6 +1509,7 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
 static int bch2_sync_fs(struct super_block *sb, int wait)
 {
        struct bch_fs *c = sb->s_fs_info;
+       int ret;
 
        if (c->opts.journal_flush_disabled)
                return 0;
@@ -1353,7 +1519,8 @@ static int bch2_sync_fs(struct super_block *sb, int wait)
                return 0;
        }
 
-       return bch2_journal_flush(&c->journal);
+       ret = bch2_journal_flush(&c->journal);
+       return bch2_err_class(ret);
 }
 
 static struct bch_fs *bch2_path_to_fs(const char *path)
@@ -1409,7 +1576,7 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data)
 
        ret = bch2_parse_mount_opts(c, &opts, data);
        if (ret)
-               return ret;
+               goto err;
 
        if (opts.read_only != c->opts.read_only) {
                down_write(&c->state_lock);
@@ -1423,7 +1590,8 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data)
                        if (ret) {
                                bch_err(c, "error going rw: %i", ret);
                                up_write(&c->state_lock);
-                               return -EINVAL;
+                               ret = -EINVAL;
+                               goto err;
                        }
 
                        sb->s_flags &= ~SB_RDONLY;
@@ -1436,8 +1604,8 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data)
 
        if (opts.errors >= 0)
                c->opts.errors = opts.errors;
-
-       return ret;
+err:
+       return bch2_err_class(ret);
 }
 
 static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
@@ -1462,25 +1630,30 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root)
 {
        struct bch_fs *c = root->d_sb->s_fs_info;
        enum bch_opt_id i;
-       char buf[512];
+       struct printbuf buf = PRINTBUF;
+       int ret = 0;
 
        for (i = 0; i < bch2_opts_nr; i++) {
                const struct bch_option *opt = &bch2_opt_table[i];
                u64 v = bch2_opt_get_by_id(&c->opts, i);
 
-               if (!(opt->mode & OPT_MOUNT))
+               if (!(opt->flags & OPT_MOUNT))
                        continue;
 
                if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
                        continue;
 
-               bch2_opt_to_text(&PBUF(buf), c, opt, v,
+               printbuf_reset(&buf);
+               bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v,
                                 OPT_SHOW_MOUNT_STYLE);
                seq_putc(seq, ',');
-               seq_puts(seq, buf);
+               seq_puts(seq, buf.buf);
        }
 
-       return 0;
+       if (buf.allocation_failure)
+               ret = -ENOMEM;
+       printbuf_exit(&buf);
+       return ret;
 }
 
 static void bch2_put_super(struct super_block *sb)
@@ -1597,8 +1770,11 @@ got_sb:
        kfree(devs[0]);
        kfree(devs);
 
-       if (IS_ERR(sb))
-               return ERR_CAST(sb);
+       if (IS_ERR(sb)) {
+               ret = PTR_ERR(sb);
+               ret = bch2_err_class(ret);
+               return ERR_PTR(ret);
+       }
 
        c = sb->s_fs_info;
 
@@ -1625,7 +1801,7 @@ got_sb:
        sb->s_time_min          = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
        sb->s_time_max          = div_s64(S64_MAX, c->sb.time_units_per_sec);
        c->vfs_sb               = sb;
-       strlcpy(sb->s_id, c->name, sizeof(sb->s_id));
+       strscpy(sb->s_id, c->name, sizeof(sb->s_id));
 
        ret = super_setup_bdi(sb);
        if (ret)
@@ -1650,11 +1826,12 @@ got_sb:
                sb->s_flags     |= SB_POSIXACL;
 #endif
 
+       sb->s_shrink.seeks = 0;
+
        vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
-       if (IS_ERR(vinode)) {
-               bch_err(c, "error mounting: error getting root inode %i",
-                       (int) PTR_ERR(vinode));
-               ret = PTR_ERR(vinode);
+       ret = PTR_ERR_OR_ZERO(vinode);
+       if (ret) {
+               bch_err(c, "error mounting: error getting root inode: %s", bch2_err_str(ret));
                goto err_put_super;
        }
 
@@ -1695,8 +1872,7 @@ MODULE_ALIAS_FS("bcachefs");
 void bch2_vfs_exit(void)
 {
        unregister_filesystem(&bcache_fs_type);
-       if (bch2_inode_cache)
-               kmem_cache_destroy(bch2_inode_cache);
+       kmem_cache_destroy(bch2_inode_cache); /* NOTE(review): guard removed; relies on kmem_cache_destroy() accepting NULL */
 }
 
 int __init bch2_vfs_init(void)