]> git.sesse.net Git - bcachefs-tools-debian/blobdiff - libbcachefs/fs.c
Update bcachefs sources to 7227ff07f14b Merge pull request #10 from modelrockettier...
[bcachefs-tools-debian] / libbcachefs / fs.c
index 081ae140c70546621df6cf57fa47d5556b5107c0..1cf729107b1ec49e971172f57cfea37277faa4de 100644 (file)
 #include "fs-ioctl.h"
 #include "fsck.h"
 #include "inode.h"
+#include "io.h"
 #include "journal.h"
 #include "keylist.h"
+#include "quota.h"
 #include "super.h"
 #include "xattr.h"
 
@@ -32,6 +34,43 @@ static void bch2_vfs_inode_init(struct bch_fs *,
                                struct bch_inode_info *,
                                struct bch_inode_unpacked *);
 
+static void journal_seq_copy(struct bch_inode_info *dst,
+                            u64 journal_seq)
+{
+       u64 old, v = READ_ONCE(dst->ei_journal_seq);
+
+       do {
+               old = v;
+
+               if (old >= journal_seq)
+                       break;
+       } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old);
+}
+
+static inline int ptrcmp(void *l, void *r)
+{
+       return (l > r) - (l < r);
+}
+
+#define __bch2_lock_inodes(_lock, ...)                                 \
+do {                                                                   \
+       struct bch_inode_info *a[] = { NULL, __VA_ARGS__ };             \
+       unsigned i;                                                     \
+                                                                       \
+       bubble_sort(&a[1], ARRAY_SIZE(a) - 1 , ptrcmp);                 \
+                                                                       \
+       for (i = ARRAY_SIZE(a) - 1; a[i]; --i)                          \
+               if (a[i] != a[i - 1]) {                                 \
+                       if (_lock)                                      \
+                               mutex_lock_nested(&a[i]->ei_update_lock, i);\
+                       else                                            \
+                               mutex_unlock(&a[i]->ei_update_lock);    \
+               }                                                       \
+} while (0)
+
+#define bch2_lock_inodes(...)  __bch2_lock_inodes(true, __VA_ARGS__)
+#define bch2_unlock_inodes(...)        __bch2_lock_inodes(false, __VA_ARGS__)
+
 /*
  * I_SIZE_DIRTY requires special handling:
  *
@@ -60,114 +99,102 @@ static void bch2_vfs_inode_init(struct bch_fs *,
  * be set explicitly.
  */
 
-int __must_check __bch2_write_inode(struct bch_fs *c,
-                                   struct bch_inode_info *inode,
-                                   inode_set_fn set,
-                                   void *p)
+void bch2_inode_update_after_write(struct bch_fs *c,
+                                  struct bch_inode_info *inode,
+                                  struct bch_inode_unpacked *bi,
+                                  unsigned fields)
 {
-       struct btree_iter iter;
-       struct bch_inode_unpacked inode_u;
-       struct bkey_inode_buf inode_p;
-       u64 inum = inode->v.i_ino;
-       unsigned i_nlink = READ_ONCE(inode->v.i_nlink);
-       int ret;
+       set_nlink(&inode->v, bi->bi_flags & BCH_INODE_UNLINKED
+                 ? 0
+                 : bi->bi_nlink + nlink_bias(inode->v.i_mode));
+       i_uid_write(&inode->v, bi->bi_uid);
+       i_gid_write(&inode->v, bi->bi_gid);
+       inode->v.i_mode = bi->bi_mode;
 
-       /*
-        * We can't write an inode with i_nlink == 0 because it's stored biased;
-        * however, we don't need to because if i_nlink is 0 the inode is
-        * getting deleted when it's evicted.
-        */
-       if (!i_nlink)
-               return 0;
+       if (fields & ATTR_ATIME)
+               inode->v.i_atime = bch2_time_to_timespec(c, bi->bi_atime);
+       if (fields & ATTR_MTIME)
+               inode->v.i_mtime = bch2_time_to_timespec(c, bi->bi_mtime);
+       if (fields & ATTR_CTIME)
+               inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime);
 
-       lockdep_assert_held(&inode->ei_update_lock);
+       inode->ei_inode         = *bi;
+       inode->ei_qid           = bch_qid(bi);
 
-       bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(inum, 0),
-                            BTREE_ITER_INTENT);
+       bch2_inode_flags_to_vfs(inode);
+}
 
-       do {
-               struct bkey_s_c k = bch2_btree_iter_peek_with_holes(&iter);
+int __must_check bch2_write_inode_trans(struct btree_trans *trans,
+                               struct bch_inode_info *inode,
+                               struct bch_inode_unpacked *inode_u,
+                               inode_set_fn set,
+                               void *p)
+{
+       struct btree_iter *iter;
+       struct bkey_inode_buf *inode_p;
+       int ret;
 
-               if ((ret = btree_iter_err(k)))
-                       goto out;
+       lockdep_assert_held(&inode->ei_update_lock);
 
-               if (WARN_ONCE(k.k->type != BCH_INODE_FS,
-                             "inode %llu not found when updating", inum)) {
-                       bch2_btree_iter_unlock(&iter);
-                       return -ENOENT;
-               }
+       iter = bch2_trans_get_iter(trans, BTREE_ID_INODES,
+                       POS(inode->v.i_ino, 0),
+                       BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+       if (IS_ERR(iter))
+               return PTR_ERR(iter);
 
-               ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u);
-               if (WARN_ONCE(ret,
-                             "error %i unpacking inode %llu", ret, inum)) {
-                       ret = -ENOENT;
-                       break;
-               }
+       /* The btree node lock is our lock on the inode: */
+       ret = bch2_btree_iter_traverse(iter);
+       if (ret)
+               return ret;
 
-               if (set) {
-                       ret = set(inode, &inode_u, p);
-                       if (ret)
-                               goto out;
-               }
+       *inode_u = inode->ei_inode;
 
-               BUG_ON(i_nlink < nlink_bias(inode->v.i_mode));
-
-               inode_u.bi_mode = inode->v.i_mode;
-               inode_u.bi_uid  = i_uid_read(&inode->v);
-               inode_u.bi_gid  = i_gid_read(&inode->v);
-               inode_u.bi_nlink= i_nlink - nlink_bias(inode->v.i_mode);
-               inode_u.bi_dev  = inode->v.i_rdev;
-               inode_u.bi_atime= timespec_to_bch2_time(c, inode->v.i_atime);
-               inode_u.bi_mtime= timespec_to_bch2_time(c, inode->v.i_mtime);
-               inode_u.bi_ctime= timespec_to_bch2_time(c, inode->v.i_ctime);
-
-               bch2_inode_pack(&inode_p, &inode_u);
-
-               ret = bch2_btree_insert_at(c, NULL, NULL,
-                               &inode->ei_journal_seq,
-                               BTREE_INSERT_ATOMIC|
-                               BTREE_INSERT_NOFAIL,
-                               BTREE_INSERT_ENTRY(&iter, &inode_p.inode.k_i));
-       } while (ret == -EINTR);
-
-       if (!ret) {
-               inode->ei_size  = inode_u.bi_size;
-               inode->ei_flags = inode_u.bi_flags;
+       if (set) {
+               ret = set(inode, inode_u, p);
+               if (ret)
+                       return ret;
        }
-out:
-       bch2_btree_iter_unlock(&iter);
 
-       return ret < 0 ? ret : 0;
-}
+       inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
+       if (IS_ERR(inode_p))
+               return PTR_ERR(inode_p);
 
-int __must_check bch2_write_inode(struct bch_fs *c,
-                                 struct bch_inode_info *inode)
-{
-       return __bch2_write_inode(c, inode, NULL, NULL);
+       bch2_inode_pack(inode_p, inode_u);
+       bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &inode_p->inode.k_i));
+       return 0;
 }
 
-int bch2_inc_nlink(struct bch_fs *c, struct bch_inode_info *inode)
+int __must_check bch2_write_inode(struct bch_fs *c,
+                                 struct bch_inode_info *inode,
+                                 inode_set_fn set,
+                                 void *p, unsigned fields)
 {
+       struct btree_trans trans;
+       struct bch_inode_unpacked inode_u;
        int ret;
 
-       mutex_lock(&inode->ei_update_lock);
-       inc_nlink(&inode->v);
-       ret = bch2_write_inode(c, inode);
-       mutex_unlock(&inode->ei_update_lock);
+       bch2_trans_init(&trans, c);
+retry:
+       bch2_trans_begin(&trans);
 
-       return ret;
-}
-
-int bch2_dec_nlink(struct bch_fs *c, struct bch_inode_info *inode)
-{
-       int ret = 0;
+       ret = bch2_write_inode_trans(&trans, inode, &inode_u, set, p) ?:
+               bch2_trans_commit(&trans, NULL,
+                                 &inode->ei_journal_seq,
+                                 BTREE_INSERT_ATOMIC|
+                                 BTREE_INSERT_NOUNLOCK|
+                                 BTREE_INSERT_NOFAIL);
+       if (ret == -EINTR)
+               goto retry;
 
-       mutex_lock(&inode->ei_update_lock);
-       drop_nlink(&inode->v);
-       ret = bch2_write_inode(c, inode);
-       mutex_unlock(&inode->ei_update_lock);
+       /*
+        * the btree node lock protects inode->ei_inode, not ei_update_lock;
+        * this is important for inode updates via bchfs_write_index_update
+        */
+       if (!ret)
+               bch2_inode_update_after_write(c, inode, &inode_u, fields);
 
-       return ret;
+       bch2_trans_exit(&trans);
+       return ret < 0 ? ret : 0;
 }
 
 static struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum)
@@ -197,117 +224,181 @@ static struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum)
        return &inode->v;
 }
 
-static struct bch_inode_info *bch2_vfs_inode_create(struct bch_fs *c,
-                                                   struct bch_inode_info *dir,
-                                                   umode_t mode, dev_t rdev)
+static void bch2_inode_init_owner(struct bch_inode_unpacked *inode_u,
+                                 const struct inode *dir, umode_t mode)
 {
-       struct posix_acl *default_acl = NULL, *acl = NULL;
-       struct bch_inode_info *inode;
+       kuid_t uid = current_fsuid();
+       kgid_t gid;
+
+       if (dir && dir->i_mode & S_ISGID) {
+               gid = dir->i_gid;
+               if (S_ISDIR(mode))
+                       mode |= S_ISGID;
+       } else
+               gid = current_fsgid();
+
+       inode_u->bi_uid         = from_kuid(dir->i_sb->s_user_ns, uid);
+       inode_u->bi_gid         = from_kgid(dir->i_sb->s_user_ns, gid);
+       inode_u->bi_mode        = mode;
+}
+
+static int inode_update_for_create_fn(struct bch_inode_info *inode,
+                                     struct bch_inode_unpacked *bi,
+                                     void *p)
+{
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch_inode_unpacked *new_inode = p;
+
+       bi->bi_mtime = bi->bi_ctime = bch2_current_time(c);
+
+       if (S_ISDIR(new_inode->bi_mode))
+               bi->bi_nlink++;
+
+       return 0;
+}
+
+static struct bch_inode_info *
+__bch2_create(struct bch_inode_info *dir, struct dentry *dentry,
+             umode_t mode, dev_t rdev, bool tmpfile)
+{
+       struct bch_fs *c = dir->v.i_sb->s_fs_info;
+       struct btree_trans trans;
+       struct bch_inode_unpacked dir_u;
+       struct bch_inode_info *inode, *old;
        struct bch_inode_unpacked inode_u;
+       struct bch_hash_info hash_info;
+       struct posix_acl *default_acl = NULL, *acl = NULL;
+       u64 journal_seq = 0;
        int ret;
 
-       inode = to_bch_ei(new_inode(c->vfs_sb));
-       if (unlikely(!inode))
-               return ERR_PTR(-ENOMEM);
+       bch2_inode_init(c, &inode_u, 0, 0, 0, rdev, &dir->ei_inode);
+       bch2_inode_init_owner(&inode_u, &dir->v, mode);
 
-       inode_init_owner(&inode->v, &dir->v, mode);
+       inode_u.bi_project = dir->ei_qid.q[QTYP_PRJ];
+
+       hash_info = bch2_hash_info_init(c, &inode_u);
+
+       if (tmpfile)
+               inode_u.bi_flags |= BCH_INODE_UNLINKED;
+
+       ret = bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, BCH_QUOTA_PREALLOC);
+       if (ret)
+               return ERR_PTR(ret);
 
 #ifdef CONFIG_BCACHEFS_POSIX_ACL
-       ret = posix_acl_create(&dir->v, &inode->v.i_mode, &default_acl, &acl);
-       if (ret) {
-               make_bad_inode(&inode->v);
+       ret = posix_acl_create(&dir->v, &inode_u.bi_mode, &default_acl, &acl);
+       if (ret)
                goto err;
-       }
 #endif
 
-       bch2_inode_init(c, &inode_u,
-                       i_uid_read(&inode->v),
-                       i_gid_read(&inode->v),
-                       inode->v.i_mode, rdev);
-       ret = bch2_inode_create(c, &inode_u,
-                               BLOCKDEV_INODE_MAX, 0,
-                               &c->unused_inode_hint);
-       if (unlikely(ret)) {
-               /*
-                * indicate to bch_evict_inode that the inode was never actually
-                * created:
-                */
-               make_bad_inode(&inode->v);
+       /*
+        * preallocate vfs inode before btree transaction, so that nothing can
+        * fail after the transaction succeeds:
+        */
+       inode = to_bch_ei(new_inode(c->vfs_sb));
+       if (unlikely(!inode)) {
+               ret = -ENOMEM;
                goto err;
        }
 
-       bch2_vfs_inode_init(c, inode, &inode_u);
+       if (!tmpfile)
+               mutex_lock(&dir->ei_update_lock);
+
+       bch2_trans_init(&trans, c);
+retry:
+       bch2_trans_begin(&trans);
+
+       ret   = __bch2_inode_create(&trans, &inode_u,
+                                   BLOCKDEV_INODE_MAX, 0,
+                                   &c->unused_inode_hint) ?:
+               (default_acl
+                ? bch2_set_acl_trans(&trans, &inode_u, &hash_info,
+                                     default_acl, ACL_TYPE_DEFAULT)
+                : 0) ?:
+               (acl
+                ? bch2_set_acl_trans(&trans, &inode_u, &hash_info,
+                                     acl, ACL_TYPE_ACCESS)
+                : 0) ?:
+               (!tmpfile
+                ? __bch2_dirent_create(&trans, dir->v.i_ino,
+                                       &dir->ei_str_hash,
+                                       mode_to_type(mode),
+                                       &dentry->d_name,
+                                       inode_u.bi_inum,
+                                       BCH_HASH_SET_MUST_CREATE)
+               : 0) ?:
+               (!tmpfile
+                ? bch2_write_inode_trans(&trans, dir, &dir_u,
+                                         inode_update_for_create_fn,
+                                         &inode_u)
+                : 0) ?:
+               bch2_trans_commit(&trans, NULL,
+                                 &journal_seq,
+                                 BTREE_INSERT_ATOMIC|
+                                 BTREE_INSERT_NOUNLOCK);
+       if (ret == -EINTR)
+               goto retry;
+       if (unlikely(ret))
+               goto err_trans;
 
-       if (default_acl) {
-               ret = bch2_set_acl(&inode->v, default_acl, ACL_TYPE_DEFAULT);
-               if (unlikely(ret))
-                       goto err;
+       atomic_long_inc(&c->nr_inodes);
+
+       if (!tmpfile) {
+               bch2_inode_update_after_write(c, dir, &dir_u,
+                                             ATTR_MTIME|ATTR_CTIME);
+               journal_seq_copy(dir, inode->ei_journal_seq);
+               mutex_unlock(&dir->ei_update_lock);
        }
 
-       if (acl) {
-               ret = bch2_set_acl(&inode->v, acl, ACL_TYPE_ACCESS);
-               if (unlikely(ret))
-                       goto err;
+       bch2_vfs_inode_init(c, inode, &inode_u);
+       journal_seq_copy(inode, journal_seq);
+
+       set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
+       set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
+
+       /*
+        * we must insert the new inode into the inode cache before calling
+        * bch2_trans_exit() and dropping locks, else we could race with another
+        * thread pulling the inode in and modifying it:
+        */
+
+       old = to_bch_ei(insert_inode_locked2(&inode->v));
+       if (unlikely(old)) {
+               /*
+                * We raced, another process pulled the new inode into cache
+                * before us:
+                */
+               old->ei_journal_seq = inode->ei_journal_seq;
+               make_bad_inode(&inode->v);
+               iput(&inode->v);
+
+               inode = old;
+       } else {
+               /*
+                * we really don't want insert_inode_locked2() to be setting
+                * I_NEW...
+                */
+               unlock_new_inode(&inode->v);
        }
 
-       insert_inode_hash(&inode->v);
-       atomic_long_inc(&c->nr_inodes);
+       bch2_trans_exit(&trans);
 out:
        posix_acl_release(default_acl);
        posix_acl_release(acl);
        return inode;
-err:
-       clear_nlink(&inode->v);
+err_trans:
+       if (!tmpfile)
+               mutex_unlock(&dir->ei_update_lock);
+
+       bch2_trans_exit(&trans);
+       make_bad_inode(&inode->v);
        iput(&inode->v);
+err:
+       bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, BCH_QUOTA_WARN);
        inode = ERR_PTR(ret);
        goto out;
 }
 
-static int bch2_vfs_dirent_create(struct bch_fs *c,
-                                 struct bch_inode_info *dir,
-                                 u8 type, const struct qstr *name,
-                                 u64 dst)
-{
-       int ret;
-
-       ret = bch2_dirent_create(c, dir->v.i_ino, &dir->ei_str_hash,
-                               type, name, dst,
-                               &dir->ei_journal_seq,
-                               BCH_HASH_SET_MUST_CREATE);
-       if (unlikely(ret))
-               return ret;
-
-       dir->v.i_mtime = dir->v.i_ctime = current_fs_time(c->vfs_sb);
-       mark_inode_dirty_sync(&dir->v);
-       return 0;
-}
-
-static int __bch2_create(struct bch_inode_info *dir, struct dentry *dentry,
-                        umode_t mode, dev_t rdev)
-{
-       struct bch_fs *c = dir->v.i_sb->s_fs_info;
-       struct bch_inode_info *inode;
-       int ret;
-
-       inode = bch2_vfs_inode_create(c, dir, mode, rdev);
-       if (unlikely(IS_ERR(inode)))
-               return PTR_ERR(inode);
-
-       ret = bch2_vfs_dirent_create(c, dir, mode_to_type(mode),
-                                    &dentry->d_name, inode->v.i_ino);
-       if (unlikely(ret)) {
-               clear_nlink(&inode->v);
-               iput(&inode->v);
-               return ret;
-       }
-
-       if (dir->ei_journal_seq > inode->ei_journal_seq)
-               inode->ei_journal_seq = dir->ei_journal_seq;
-
-       d_instantiate(dentry, &inode->v);
-       return 0;
-}
-
 /* methods */
 
 static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
@@ -331,7 +422,69 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
 static int bch2_create(struct inode *vdir, struct dentry *dentry,
                       umode_t mode, bool excl)
 {
-       return __bch2_create(to_bch_ei(vdir), dentry, mode|S_IFREG, 0);
+       struct bch_inode_info *inode =
+               __bch2_create(to_bch_ei(vdir), dentry, mode|S_IFREG, 0, false);
+
+       if (IS_ERR(inode))
+               return PTR_ERR(inode);
+
+       d_instantiate(dentry, &inode->v);
+       return 0;
+}
+
+static int inode_update_for_link_fn(struct bch_inode_info *inode,
+                                   struct bch_inode_unpacked *bi,
+                                   void *p)
+{
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+
+       bi->bi_ctime = bch2_current_time(c);
+
+       if (bi->bi_flags & BCH_INODE_UNLINKED)
+               bi->bi_flags &= ~BCH_INODE_UNLINKED;
+       else
+               bi->bi_nlink++;
+
+       return 0;
+}
+
+static int __bch2_link(struct bch_fs *c,
+                      struct bch_inode_info *inode,
+                      struct bch_inode_info *dir,
+                      struct dentry *dentry)
+{
+       struct btree_trans trans;
+       struct bch_inode_unpacked inode_u;
+       int ret;
+
+       mutex_lock(&inode->ei_update_lock);
+       bch2_trans_init(&trans, c);
+retry:
+       bch2_trans_begin(&trans);
+
+       ret   = __bch2_dirent_create(&trans, dir->v.i_ino,
+                                    &dir->ei_str_hash,
+                                    mode_to_type(inode->v.i_mode),
+                                    &dentry->d_name,
+                                    inode->v.i_ino,
+                                    BCH_HASH_SET_MUST_CREATE) ?:
+               bch2_write_inode_trans(&trans, inode, &inode_u,
+                                      inode_update_for_link_fn,
+                                      NULL) ?:
+               bch2_trans_commit(&trans, NULL,
+                                 &inode->ei_journal_seq,
+                                 BTREE_INSERT_ATOMIC|
+                                 BTREE_INSERT_NOUNLOCK);
+
+       if (ret == -EINTR)
+               goto retry;
+
+       if (likely(!ret))
+               bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME);
+
+       bch2_trans_exit(&trans);
+       mutex_unlock(&inode->ei_update_lock);
+       return ret;
 }
 
 static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
@@ -344,23 +497,41 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
 
        lockdep_assert_held(&inode->v.i_rwsem);
 
-       inode->v.i_ctime = current_fs_time(dir->v.i_sb);
-
-       ret = bch2_inc_nlink(c, inode);
-       if (ret)
+       ret = __bch2_link(c, inode, dir, dentry);
+       if (unlikely(ret))
                return ret;
 
        ihold(&inode->v);
+       d_instantiate(dentry, &inode->v);
+       return 0;
+}
 
-       ret = bch2_vfs_dirent_create(c, dir, mode_to_type(inode->v.i_mode),
-                                    &dentry->d_name, inode->v.i_ino);
-       if (unlikely(ret)) {
-               bch2_dec_nlink(c, inode);
-               iput(&inode->v);
-               return ret;
-       }
+static int inode_update_dir_for_unlink_fn(struct bch_inode_info *inode,
+                                         struct bch_inode_unpacked *bi,
+                                         void *p)
+{
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch_inode_info *unlink_inode = p;
+
+       bi->bi_mtime = bi->bi_ctime = bch2_current_time(c);
+
+       bi->bi_nlink -= S_ISDIR(unlink_inode->v.i_mode);
+
+       return 0;
+}
+
+static int inode_update_for_unlink_fn(struct bch_inode_info *inode,
+                                     struct bch_inode_unpacked *bi,
+                                     void *p)
+{
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+
+       bi->bi_ctime = bch2_current_time(c);
+       if (bi->bi_nlink)
+               bi->bi_nlink--;
+       else
+               bi->bi_flags |= BCH_INODE_UNLINKED;
 
-       d_instantiate(dentry, &inode->v);
        return 0;
 }
 
@@ -369,28 +540,46 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
        struct bch_fs *c = vdir->i_sb->s_fs_info;
        struct bch_inode_info *dir = to_bch_ei(vdir);
        struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
+       struct bch_inode_unpacked dir_u, inode_u;
+       struct btree_trans trans;
        int ret;
 
-       lockdep_assert_held(&inode->v.i_rwsem);
-
-       ret = bch2_dirent_delete(c, dir->v.i_ino, &dir->ei_str_hash,
-                                &dentry->d_name, &dir->ei_journal_seq);
+       bch2_lock_inodes(dir, inode);
+       bch2_trans_init(&trans, c);
+retry:
+       bch2_trans_begin(&trans);
+
+       ret   = __bch2_dirent_delete(&trans, dir->v.i_ino,
+                                    &dir->ei_str_hash,
+                                    &dentry->d_name) ?:
+               bch2_write_inode_trans(&trans, dir, &dir_u,
+                                      inode_update_dir_for_unlink_fn,
+                                      inode) ?:
+               bch2_write_inode_trans(&trans, inode, &inode_u,
+                                      inode_update_for_unlink_fn,
+                                      NULL) ?:
+               bch2_trans_commit(&trans, NULL,
+                                 &dir->ei_journal_seq,
+                                 BTREE_INSERT_ATOMIC|
+                                 BTREE_INSERT_NOUNLOCK|
+                                 BTREE_INSERT_NOFAIL);
+       if (ret == -EINTR)
+               goto retry;
        if (ret)
-               return ret;
+               goto err;
 
        if (dir->ei_journal_seq > inode->ei_journal_seq)
                inode->ei_journal_seq = dir->ei_journal_seq;
 
-       inode->v.i_ctime = dir->v.i_ctime;
-
-       if (S_ISDIR(inode->v.i_mode)) {
-               bch2_dec_nlink(c, dir);
-               drop_nlink(&inode->v);
-       }
-
-       bch2_dec_nlink(c, inode);
+       bch2_inode_update_after_write(c, dir, &dir_u,
+                                     ATTR_MTIME|ATTR_CTIME);
+       bch2_inode_update_after_write(c, inode, &inode_u,
+                                     ATTR_MTIME);
+err:
+       bch2_trans_exit(&trans);
+       bch2_unlock_inodes(dir, inode);
 
-       return 0;
+       return ret;
 }
 
 static int bch2_symlink(struct inode *vdir, struct dentry *dentry,
@@ -400,7 +589,7 @@ static int bch2_symlink(struct inode *vdir, struct dentry *dentry,
        struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
        int ret;
 
-       inode = bch2_vfs_inode_create(c, dir, S_IFLNK|S_IRWXUGO, 0);
+       inode = __bch2_create(dir, dentry, S_IFLNK|S_IRWXUGO, 0, true);
        if (unlikely(IS_ERR(inode)))
                return PTR_ERR(inode);
 
@@ -415,37 +604,28 @@ static int bch2_symlink(struct inode *vdir, struct dentry *dentry,
        if (unlikely(ret))
                goto err;
 
-       /* XXX: racy */
-       if (dir->ei_journal_seq < inode->ei_journal_seq)
-               dir->ei_journal_seq = inode->ei_journal_seq;
+       journal_seq_copy(dir, inode->ei_journal_seq);
 
-       ret = bch2_vfs_dirent_create(c, dir, DT_LNK, &dentry->d_name,
-                                    inode->v.i_ino);
+       ret = __bch2_link(c, inode, dir, dentry);
        if (unlikely(ret))
                goto err;
 
        d_instantiate(dentry, &inode->v);
        return 0;
 err:
-       clear_nlink(&inode->v);
        iput(&inode->v);
        return ret;
 }
 
 static int bch2_mkdir(struct inode *vdir, struct dentry *dentry, umode_t mode)
 {
-       struct bch_fs *c = vdir->i_sb->s_fs_info;
-       struct bch_inode_info *dir = to_bch_ei(vdir);
-       int ret;
+       struct bch_inode_info *inode =
+               __bch2_create(to_bch_ei(vdir), dentry, mode|S_IFDIR, 0, false);
 
-       lockdep_assert_held(&dir->v.i_rwsem);
-
-       ret = __bch2_create(dir, dentry, mode|S_IFDIR, 0);
-       if (unlikely(ret))
-               return ret;
-
-       bch2_inc_nlink(c, dir);
+       if (IS_ERR(inode))
+               return PTR_ERR(inode);
 
+       d_instantiate(dentry, &inode->v);
        return 0;
 }
 
@@ -462,152 +642,307 @@ static int bch2_rmdir(struct inode *vdir, struct dentry *dentry)
 static int bch2_mknod(struct inode *vdir, struct dentry *dentry,
                      umode_t mode, dev_t rdev)
 {
-       return __bch2_create(to_bch_ei(vdir), dentry, mode, rdev);
+       struct bch_inode_info *inode =
+               __bch2_create(to_bch_ei(vdir), dentry, mode, rdev, false);
+
+       if (IS_ERR(inode))
+               return PTR_ERR(inode);
+
+       d_instantiate(dentry, &inode->v);
+       return 0;
 }
 
-static int bch2_rename(struct bch_fs *c,
-                      struct bch_inode_info *old_dir,
-                      struct dentry *old_dentry,
-                      struct bch_inode_info *new_dir,
-                      struct dentry *new_dentry)
+struct rename_info {
+       u64                     now;
+       struct bch_inode_info   *src_dir;
+       struct bch_inode_info   *dst_dir;
+       struct bch_inode_info   *src_inode;
+       struct bch_inode_info   *dst_inode;
+       enum bch_rename_mode    mode;
+};
+
+static int inode_update_for_rename_fn(struct bch_inode_info *inode,
+                                     struct bch_inode_unpacked *bi,
+                                     void *p)
 {
-       struct bch_inode_info *old_inode = to_bch_ei(old_dentry->d_inode);
-       struct bch_inode_info *new_inode = to_bch_ei(new_dentry->d_inode);
-       struct timespec now = current_fs_time(old_dir->v.i_sb);
-       int ret;
+       struct rename_info *info = p;
+
+       if (inode == info->src_dir) {
+               bi->bi_nlink -= S_ISDIR(info->src_inode->v.i_mode);
+               bi->bi_nlink += info->dst_inode &&
+                       S_ISDIR(info->dst_inode->v.i_mode) &&
+                       info->mode == BCH_RENAME_EXCHANGE;
+       }
 
-       lockdep_assert_held(&old_dir->v.i_rwsem);
-       lockdep_assert_held(&new_dir->v.i_rwsem);
+       if (inode == info->dst_dir) {
+               bi->bi_nlink += S_ISDIR(info->src_inode->v.i_mode);
+               bi->bi_nlink -= info->dst_inode &&
+                       S_ISDIR(info->dst_inode->v.i_mode);
+       }
 
-       if (new_inode)
-               filemap_write_and_wait_range(old_inode->v.i_mapping,
-                                            0, LLONG_MAX);
+       if (inode == info->dst_inode &&
+           info->mode == BCH_RENAME_OVERWRITE) {
+               BUG_ON(bi->bi_nlink &&
+                      S_ISDIR(info->dst_inode->v.i_mode));
 
-       if (new_inode && S_ISDIR(old_inode->v.i_mode)) {
-               lockdep_assert_held(&new_inode->v.i_rwsem);
+               if (bi->bi_nlink)
+                       bi->bi_nlink--;
+               else
+                       bi->bi_flags |= BCH_INODE_UNLINKED;
+       }
+
+       if (inode == info->src_dir ||
+           inode == info->dst_dir)
+               bi->bi_mtime = info->now;
+       bi->bi_ctime = info->now;
+
+       return 0;
+}
+
+static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry,
+                       struct inode *dst_vdir, struct dentry *dst_dentry,
+                       unsigned flags)
+{
+       struct bch_fs *c = src_vdir->i_sb->s_fs_info;
+       struct rename_info i = {
+               .src_dir        = to_bch_ei(src_vdir),
+               .dst_dir        = to_bch_ei(dst_vdir),
+               .src_inode      = to_bch_ei(src_dentry->d_inode),
+               .dst_inode      = to_bch_ei(dst_dentry->d_inode),
+               .mode           = flags & RENAME_EXCHANGE
+                               ? BCH_RENAME_EXCHANGE
+                       : dst_dentry->d_inode
+                               ? BCH_RENAME_OVERWRITE : BCH_RENAME,
+       };
+       struct btree_trans trans;
+       struct bch_inode_unpacked dst_dir_u, src_dir_u;
+       struct bch_inode_unpacked src_inode_u, dst_inode_u;
+       u64 journal_seq = 0;
+       int ret;
+
+       if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
+               return -EINVAL;
 
-               if (!S_ISDIR(new_inode->v.i_mode))
+       if (i.mode == BCH_RENAME_OVERWRITE) {
+               if (S_ISDIR(i.src_inode->v.i_mode) !=
+                   S_ISDIR(i.dst_inode->v.i_mode))
                        return -ENOTDIR;
 
-               if (bch2_empty_dir(c, new_inode->v.i_ino))
+               if (S_ISDIR(i.src_inode->v.i_mode) &&
+                   bch2_empty_dir(c, i.dst_inode->v.i_ino))
                        return -ENOTEMPTY;
 
-               ret = bch2_dirent_rename(c,
-                               old_dir, &old_dentry->d_name,
-                               new_dir, &new_dentry->d_name,
-                               &old_inode->ei_journal_seq, BCH_RENAME_OVERWRITE);
-               if (unlikely(ret))
+               ret = filemap_write_and_wait_range(i.src_inode->v.i_mapping,
+                                                  0, LLONG_MAX);
+               if (ret)
                        return ret;
+       }
 
-               clear_nlink(&new_inode->v);
-               bch2_dec_nlink(c, old_dir);
-       } else if (new_inode) {
-               lockdep_assert_held(&new_inode->v.i_rwsem);
-
-               ret = bch2_dirent_rename(c,
-                               old_dir, &old_dentry->d_name,
-                               new_dir, &new_dentry->d_name,
-                               &old_inode->ei_journal_seq, BCH_RENAME_OVERWRITE);
-               if (unlikely(ret))
-                       return ret;
+       bch2_lock_inodes(i.src_dir,
+                        i.dst_dir,
+                        i.src_inode,
+                        i.dst_inode);
+
+       bch2_trans_init(&trans, c);
+retry:
+       bch2_trans_begin(&trans);
+       i.now = bch2_current_time(c);
+
+       ret   = bch2_dirent_rename(&trans,
+                                  i.src_dir, &src_dentry->d_name,
+                                  i.dst_dir, &dst_dentry->d_name,
+                                  i.mode) ?:
+               bch2_write_inode_trans(&trans, i.src_dir, &src_dir_u,
+                                      inode_update_for_rename_fn, &i) ?:
+               (i.src_dir != i.dst_dir
+                ? bch2_write_inode_trans(&trans, i.dst_dir, &dst_dir_u,
+                                      inode_update_for_rename_fn, &i)
+                : 0 ) ?:
+               bch2_write_inode_trans(&trans, i.src_inode, &src_inode_u,
+                                      inode_update_for_rename_fn, &i) ?:
+               (i.dst_inode
+                ? bch2_write_inode_trans(&trans, i.dst_inode, &dst_inode_u,
+                                      inode_update_for_rename_fn, &i)
+                : 0 ) ?:
+               bch2_trans_commit(&trans, NULL,
+                                 &journal_seq,
+                                 BTREE_INSERT_ATOMIC|
+                                 BTREE_INSERT_NOUNLOCK);
+       if (ret == -EINTR)
+               goto retry;
+       if (unlikely(ret))
+               goto err;
 
-               new_inode->v.i_ctime = now;
-               bch2_dec_nlink(c, new_inode);
-       } else if (S_ISDIR(old_inode->v.i_mode)) {
-               ret = bch2_dirent_rename(c,
-                               old_dir, &old_dentry->d_name,
-                               new_dir, &new_dentry->d_name,
-                               &old_inode->ei_journal_seq, BCH_RENAME);
-               if (unlikely(ret))
-                       return ret;
+       bch2_inode_update_after_write(c, i.src_dir, &src_dir_u,
+                                     ATTR_MTIME|ATTR_CTIME);
+       journal_seq_copy(i.src_dir, journal_seq);
 
-               bch2_inc_nlink(c, new_dir);
-               bch2_dec_nlink(c, old_dir);
-       } else {
-               ret = bch2_dirent_rename(c,
-                               old_dir, &old_dentry->d_name,
-                               new_dir, &new_dentry->d_name,
-                               &old_inode->ei_journal_seq, BCH_RENAME);
-               if (unlikely(ret))
-                       return ret;
+       if (i.src_dir != i.dst_dir) {
+               bch2_inode_update_after_write(c, i.dst_dir, &dst_dir_u,
+                                             ATTR_MTIME|ATTR_CTIME);
+               journal_seq_copy(i.dst_dir, journal_seq);
        }
 
-       old_dir->v.i_ctime = old_dir->v.i_mtime = now;
-       new_dir->v.i_ctime = new_dir->v.i_mtime = now;
-       mark_inode_dirty_sync(&old_dir->v);
-       mark_inode_dirty_sync(&new_dir->v);
+       journal_seq_copy(i.src_inode, journal_seq);
+       if (i.dst_inode)
+               journal_seq_copy(i.dst_inode, journal_seq);
+
+       bch2_inode_update_after_write(c, i.src_inode, &src_inode_u,
+                                     ATTR_CTIME);
+       if (i.dst_inode)
+               bch2_inode_update_after_write(c, i.dst_inode, &dst_inode_u,
+                                             ATTR_CTIME);
+err:
+       bch2_trans_exit(&trans);
+       bch2_unlock_inodes(i.src_dir,
+                          i.dst_dir,
+                          i.src_inode,
+                          i.dst_inode);
 
-       old_inode->v.i_ctime = now;
-       mark_inode_dirty_sync(&old_inode->v);
+       return ret;
+}
+
+static int inode_update_for_setattr_fn(struct bch_inode_info *inode,
+                                      struct bch_inode_unpacked *bi,
+                                      void *p)
+{
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct iattr *attr = p;
+       unsigned int ia_valid = attr->ia_valid;
+
+       if (ia_valid & ATTR_UID)
+               bi->bi_uid = from_kuid(inode->v.i_sb->s_user_ns, attr->ia_uid);
+       if (ia_valid & ATTR_GID)
+               bi->bi_gid = from_kgid(inode->v.i_sb->s_user_ns, attr->ia_gid);
+
+       if (ia_valid & ATTR_ATIME)
+               bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
+       if (ia_valid & ATTR_MTIME)
+               bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime);
+       if (ia_valid & ATTR_CTIME)
+               bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime);
+
+       if (ia_valid & ATTR_MODE) {
+               umode_t mode = attr->ia_mode;
+               kgid_t gid = ia_valid & ATTR_GID
+                       ? attr->ia_gid
+                       : inode->v.i_gid;
+
+               if (!in_group_p(gid) &&
+                   !capable_wrt_inode_uidgid(&inode->v, CAP_FSETID))
+                       mode &= ~S_ISGID;
+               bi->bi_mode = mode;
+       }
 
        return 0;
 }
 
-static int bch2_rename_exchange(struct bch_fs *c,
-                               struct bch_inode_info *old_dir,
-                               struct dentry *old_dentry,
-                               struct bch_inode_info *new_dir,
-                               struct dentry *new_dentry)
+static int bch2_setattr_nonsize(struct bch_inode_info *inode, struct iattr *iattr)
 {
-       struct bch_inode_info *old_inode = to_bch_ei(old_dentry->d_inode);
-       struct bch_inode_info *new_inode = to_bch_ei(new_dentry->d_inode);
-       struct timespec now = current_fs_time(old_dir->v.i_sb);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch_qid qid = inode->ei_qid;
+       struct btree_trans trans;
+       struct bch_inode_unpacked inode_u;
+       struct posix_acl *acl = NULL;
+       unsigned qtypes = 0;
        int ret;
 
-       ret = bch2_dirent_rename(c,
-                                old_dir, &old_dentry->d_name,
-                                new_dir, &new_dentry->d_name,
-                                &old_inode->ei_journal_seq, BCH_RENAME_EXCHANGE);
-       if (unlikely(ret))
-               return ret;
+       mutex_lock(&inode->ei_update_lock);
 
-       if (S_ISDIR(old_inode->v.i_mode) !=
-           S_ISDIR(new_inode->v.i_mode)) {
-               if (S_ISDIR(old_inode->v.i_mode)) {
-                       bch2_inc_nlink(c, new_dir);
-                       bch2_dec_nlink(c, old_dir);
-               } else {
-                       bch2_dec_nlink(c, new_dir);
-                       bch2_inc_nlink(c, old_dir);
-               }
+       if (c->opts.usrquota &&
+           (iattr->ia_valid & ATTR_UID) &&
+           !uid_eq(iattr->ia_uid, inode->v.i_uid)) {
+               qid.q[QTYP_USR] = from_kuid(&init_user_ns, iattr->ia_uid),
+               qtypes |= 1 << QTYP_USR;
        }
 
-       old_dir->v.i_ctime = old_dir->v.i_mtime = now;
-       new_dir->v.i_ctime = new_dir->v.i_mtime = now;
-       mark_inode_dirty_sync(&old_dir->v);
-       mark_inode_dirty_sync(&new_dir->v);
+       if (c->opts.grpquota &&
+           (iattr->ia_valid & ATTR_GID) &&
+           !gid_eq(iattr->ia_gid, inode->v.i_gid)) {
+               qid.q[QTYP_GRP] = from_kgid(&init_user_ns, iattr->ia_gid);
+               qtypes |= 1 << QTYP_GRP;
+       }
+
+       if (qtypes) {
+               ret = bch2_quota_transfer(c, qtypes, qid, inode->ei_qid,
+                                         inode->v.i_blocks +
+                                         inode->ei_quota_reserved);
+               if (ret)
+                       goto err;
+       }
 
-       old_inode->v.i_ctime = now;
-       new_inode->v.i_ctime = now;
-       mark_inode_dirty_sync(&old_inode->v);
-       mark_inode_dirty_sync(&new_inode->v);
+       bch2_trans_init(&trans, c);
+retry:
+       bch2_trans_begin(&trans);
+       kfree(acl);
+       acl = NULL;
+
+       ret = bch2_write_inode_trans(&trans, inode, &inode_u,
+                               inode_update_for_setattr_fn, iattr) ?:
+               (iattr->ia_valid & ATTR_MODE
+                ? bch2_acl_chmod(&trans, inode, iattr->ia_mode, &acl)
+                : 0) ?:
+               bch2_trans_commit(&trans, NULL,
+                                 &inode->ei_journal_seq,
+                                 BTREE_INSERT_ATOMIC|
+                                 BTREE_INSERT_NOUNLOCK|
+                                 BTREE_INSERT_NOFAIL);
+       if (ret == -EINTR)
+               goto retry;
+       if (unlikely(ret))
+               goto err_trans;
 
-       return 0;
+       bch2_inode_update_after_write(c, inode, &inode_u, iattr->ia_valid);
+
+       if (acl)
+               set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
+err_trans:
+       bch2_trans_exit(&trans);
+err:
+       mutex_unlock(&inode->ei_update_lock);
+
+       return ret;
 }
 
-static int bch2_rename2(struct inode *old_vdir, struct dentry *old_dentry,
-                       struct inode *new_vdir, struct dentry *new_dentry,
-                       unsigned flags)
+static int bch2_getattr(const struct path *path, struct kstat *stat,
+                       u32 request_mask, unsigned query_flags)
 {
-       struct bch_fs *c = old_vdir->i_sb->s_fs_info;
-       struct bch_inode_info *old_dir = to_bch_ei(old_vdir);
-       struct bch_inode_info *new_dir = to_bch_ei(new_vdir);
+       struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
 
-       if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
-               return -EINVAL;
+       stat->dev       = inode->v.i_sb->s_dev;
+       stat->ino       = inode->v.i_ino;
+       stat->mode      = inode->v.i_mode;
+       stat->nlink     = inode->v.i_nlink;
+       stat->uid       = inode->v.i_uid;
+       stat->gid       = inode->v.i_gid;
+       stat->rdev      = inode->v.i_rdev;
+       stat->size      = i_size_read(&inode->v);
+       stat->atime     = inode->v.i_atime;
+       stat->mtime     = inode->v.i_mtime;
+       stat->ctime     = inode->v.i_ctime;
+       stat->blksize   = block_bytes(c);
+       stat->blocks    = inode->v.i_blocks;
+
+       if (request_mask & STATX_BTIME) {
+               stat->result_mask |= STATX_BTIME;
+               stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
+       }
 
-       if (flags & RENAME_EXCHANGE)
-               return bch2_rename_exchange(c, old_dir, old_dentry,
-                                           new_dir, new_dentry);
+       if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE)
+               stat->attributes |= STATX_ATTR_IMMUTABLE;
+       if (inode->ei_inode.bi_flags & BCH_INODE_APPEND)
+               stat->attributes |= STATX_ATTR_APPEND;
+       if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP)
+               stat->attributes |= STATX_ATTR_NODUMP;
 
-       return bch2_rename(c, old_dir, old_dentry, new_dir, new_dentry);
+       return 0;
 }
 
 static int bch2_setattr(struct dentry *dentry, struct iattr *iattr)
 {
        struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       int ret = 0;
+       int ret;
 
        lockdep_assert_held(&inode->v.i_rwsem);
 
@@ -615,36 +950,21 @@ static int bch2_setattr(struct dentry *dentry, struct iattr *iattr)
        if (ret)
                return ret;
 
-       if (iattr->ia_valid & ATTR_SIZE) {
-               ret = bch2_truncate(inode, iattr);
-       } else {
-               mutex_lock(&inode->ei_update_lock);
-               setattr_copy(&inode->v, iattr);
-               ret = bch2_write_inode(c, inode);
-               mutex_unlock(&inode->ei_update_lock);
-       }
-
-       if (unlikely(ret))
-               return ret;
-
-       if (iattr->ia_valid & ATTR_MODE)
-               ret = posix_acl_chmod(&inode->v, inode->v.i_mode);
-
-       return ret;
+       return iattr->ia_valid & ATTR_SIZE
+               ? bch2_truncate(inode, iattr)
+               : bch2_setattr_nonsize(inode, iattr);
 }
 
 static int bch2_tmpfile(struct inode *vdir, struct dentry *dentry, umode_t mode)
 {
-       struct bch_fs *c = vdir->i_sb->s_fs_info;
-       struct bch_inode_info *dir = to_bch_ei(vdir);
-       struct bch_inode_info *inode;
+       struct bch_inode_info *inode =
+               __bch2_create(to_bch_ei(vdir), dentry, mode, 0, true);
 
-       /* XXX: i_nlink should be 0? */
-       inode = bch2_vfs_inode_create(c, dir, mode, 0);
-       if (unlikely(IS_ERR(inode)))
+       if (IS_ERR(inode))
                return PTR_ERR(inode);
 
-       d_tmpfile(dentry, &inode->v);
+       d_mark_tmpfile(dentry, &inode->v);
+       d_instantiate(dentry, &inode->v);
        return 0;
 }
 
@@ -653,27 +973,27 @@ static int bch2_fill_extent(struct fiemap_extent_info *info,
 {
        if (bkey_extent_is_data(&k->k)) {
                struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
-               const struct bch_extent_ptr *ptr;
-               const union bch_extent_crc *crc;
+               const union bch_extent_entry *entry;
+               struct extent_ptr_decoded p;
                int ret;
 
-               extent_for_each_ptr_crc(e, ptr, crc) {
+               extent_for_each_ptr_decode(e, p, entry) {
                        int flags2 = 0;
-                       u64 offset = ptr->offset;
+                       u64 offset = p.ptr.offset;
 
-                       if (crc_compression_type(crc))
+                       if (p.crc.compression_type)
                                flags2 |= FIEMAP_EXTENT_ENCODED;
                        else
-                               offset += crc_offset(crc);
+                               offset += p.crc.offset;
 
                        if ((offset & (PAGE_SECTORS - 1)) ||
                            (e.k->size & (PAGE_SECTORS - 1)))
                                flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
 
                        ret = fiemap_fill_next_extent(info,
-                                                     bkey_start_offset(e.k) << 9,
-                                                     offset << 9,
-                                                     e.k->size << 9, flags|flags2);
+                                               bkey_start_offset(e.k) << 9,
+                                               offset << 9,
+                                               e.k->size << 9, flags|flags2);
                        if (ret)
                                return ret;
                }
@@ -776,6 +1096,7 @@ static const struct file_operations bch_file_operations = {
 };
 
 static const struct inode_operations bch_file_inode_operations = {
+       .getattr        = bch2_getattr,
        .setattr        = bch2_setattr,
        .fiemap         = bch2_fiemap,
        .listxattr      = bch2_xattr_list,
@@ -795,6 +1116,7 @@ static const struct inode_operations bch_dir_inode_operations = {
        .rmdir          = bch2_rmdir,
        .mknod          = bch2_mknod,
        .rename         = bch2_rename2,
+       .getattr        = bch2_getattr,
        .setattr        = bch2_setattr,
        .tmpfile        = bch2_tmpfile,
        .listxattr      = bch2_xattr_list,
@@ -817,6 +1139,7 @@ static const struct file_operations bch_dir_file_operations = {
 
 static const struct inode_operations bch_symlink_inode_operations = {
        .get_link       = page_get_link,
+       .getattr        = bch2_getattr,
        .setattr        = bch2_setattr,
        .listxattr      = bch2_xattr_list,
 #ifdef CONFIG_BCACHEFS_POSIX_ACL
@@ -826,6 +1149,7 @@ static const struct inode_operations bch_symlink_inode_operations = {
 };
 
 static const struct inode_operations bch_special_inode_operations = {
+       .getattr        = bch2_getattr,
        .setattr        = bch2_setattr,
        .listxattr      = bch2_xattr_list,
 #ifdef CONFIG_BCACHEFS_POSIX_ACL
@@ -895,27 +1219,18 @@ static void bch2_vfs_inode_init(struct bch_fs *c,
                                struct bch_inode_info *inode,
                                struct bch_inode_unpacked *bi)
 {
-       inode->v.i_mode         = bi->bi_mode;
-       i_uid_write(&inode->v, bi->bi_uid);
-       i_gid_write(&inode->v, bi->bi_gid);
+       bch2_inode_update_after_write(c, inode, bi, ~0);
+
        inode->v.i_blocks       = bi->bi_sectors;
        inode->v.i_ino          = bi->bi_inum;
-       set_nlink(&inode->v, bi->bi_nlink + nlink_bias(inode->v.i_mode));
        inode->v.i_rdev         = bi->bi_dev;
        inode->v.i_generation   = bi->bi_generation;
        inode->v.i_size         = bi->bi_size;
-       inode->v.i_atime        = bch2_time_to_timespec(c, bi->bi_atime);
-       inode->v.i_mtime        = bch2_time_to_timespec(c, bi->bi_mtime);
-       inode->v.i_ctime        = bch2_time_to_timespec(c, bi->bi_ctime);
 
        inode->ei_journal_seq   = 0;
-       inode->ei_size          = bi->bi_size;
-       inode->ei_flags         = bi->bi_flags;
-       atomic64_set(&inode->ei_sectors, bi->bi_sectors);
+       inode->ei_quota_reserved = 0;
        inode->ei_str_hash      = bch2_hash_info_init(c, bi);
 
-       bch2_inode_flags_to_vfs(inode);
-
        inode->v.i_mapping->a_ops = &bch_address_space_operations;
 
        switch (inode->v.i_mode & S_IFMT) {
@@ -948,9 +1263,8 @@ static struct inode *bch2_alloc_inode(struct super_block *sb)
 
        inode_init_once(&inode->v);
        mutex_init(&inode->ei_update_lock);
+       mutex_init(&inode->ei_quota_lock);
        inode->ei_journal_seq = 0;
-       atomic_long_set(&inode->ei_size_dirty_count, 0);
-       atomic_long_set(&inode->ei_sectors_dirty_count, 0);
 
        return &inode->v;
 }
@@ -968,6 +1282,19 @@ static void bch2_destroy_inode(struct inode *vinode)
        call_rcu(&vinode->i_rcu, bch2_i_callback);
 }
 
+static int inode_update_times_fn(struct bch_inode_info *inode,
+                                struct bch_inode_unpacked *bi,
+                                void *p)
+{
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+
+       bi->bi_atime    = timespec_to_bch2_time(c, inode->v.i_atime);
+       bi->bi_mtime    = timespec_to_bch2_time(c, inode->v.i_mtime);
+       bi->bi_ctime    = timespec_to_bch2_time(c, inode->v.i_ctime);
+
+       return 0;
+}
+
 static int bch2_vfs_write_inode(struct inode *vinode,
                                struct writeback_control *wbc)
 {
@@ -976,7 +1303,8 @@ static int bch2_vfs_write_inode(struct inode *vinode,
        int ret;
 
        mutex_lock(&inode->ei_update_lock);
-       ret = bch2_write_inode(c, inode);
+       ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
+                              ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
        mutex_unlock(&inode->ei_update_lock);
 
        if (c->opts.journal_flush_disabled)
@@ -995,17 +1323,19 @@ static void bch2_evict_inode(struct inode *vinode)
 
        truncate_inode_pages_final(&inode->v.i_data);
 
-       if (!bch2_journal_error(&c->journal) && !is_bad_inode(&inode->v)) {
-               /* XXX - we want to check this stuff iff there weren't IO errors: */
-               BUG_ON(atomic_long_read(&inode->ei_sectors_dirty_count));
-               BUG_ON(atomic64_read(&inode->ei_sectors) != inode->v.i_blocks);
-       }
-
        clear_inode(&inode->v);
 
+       BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);
+
        if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
+               bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
+                               BCH_QUOTA_WARN);
+               bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
+                               BCH_QUOTA_WARN);
                bch2_inode_rm(c, inode->v.i_ino);
-               atomic_long_dec(&c->nr_inodes);
+
+               WARN_ONCE(atomic_long_dec_return(&c->nr_inodes) < 0,
+                         "nr_inodes < 0");
        }
 }
 
@@ -1013,12 +1343,16 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
        struct super_block *sb = dentry->d_sb;
        struct bch_fs *c = sb->s_fs_info;
+       struct bch_fs_usage usage = bch2_fs_usage_read(c);
+       u64 hidden_metadata = usage.buckets[BCH_DATA_SB] +
+               usage.buckets[BCH_DATA_JOURNAL];
+       unsigned shift = sb->s_blocksize_bits - 9;
        u64 fsid;
 
        buf->f_type     = BCACHEFS_STATFS_MAGIC;
        buf->f_bsize    = sb->s_blocksize;
-       buf->f_blocks   = c->capacity >> PAGE_SECTOR_SHIFT;
-       buf->f_bfree    = (c->capacity - bch2_fs_sectors_used(c)) >> PAGE_SECTOR_SHIFT;
+       buf->f_blocks   = (c->capacity - hidden_metadata) >> shift;
+       buf->f_bfree    = (c->capacity - bch2_fs_sectors_used(c, usage)) >> shift;
        buf->f_bavail   = buf->f_bfree;
        buf->f_files    = atomic_long_read(&c->nr_inodes);
        buf->f_ffree    = U64_MAX;
@@ -1027,7 +1361,7 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
               le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
        buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
        buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
-       buf->f_namelen  = NAME_MAX;
+       buf->f_namelen  = BCH_NAME_MAX;
 
        return 0;
 }
@@ -1044,81 +1378,100 @@ static int bch2_sync_fs(struct super_block *sb, int wait)
        return bch2_journal_flush(&c->journal);
 }
 
-static struct bch_fs *bch2_open_as_blockdevs(const char *_dev_name,
-                                            struct bch_opts opts)
+static struct bch_fs *bch2_path_to_fs(const char *dev)
 {
-       size_t nr_devs = 0, i = 0;
-       char *dev_name, *s, **devs;
-       struct bch_fs *c = NULL;
-       const char *err = "cannot allocate memory";
+       struct bch_fs *c;
+       struct block_device *bdev = lookup_bdev(dev);
 
-       dev_name = kstrdup(_dev_name, GFP_KERNEL);
-       if (!dev_name)
-               return NULL;
+       if (IS_ERR(bdev))
+               return ERR_CAST(bdev);
 
-       for (s = dev_name; s; s = strchr(s + 1, ':'))
-               nr_devs++;
+       c = bch2_bdev_to_fs(bdev);
+       bdput(bdev);
+       return c ?: ERR_PTR(-ENOENT);
+}
 
-       devs = kcalloc(nr_devs, sizeof(const char *), GFP_KERNEL);
-       if (!devs)
-               goto err;
+static struct bch_fs *__bch2_open_as_blockdevs(const char *dev_name, char * const *devs,
+                                              unsigned nr_devs, struct bch_opts opts)
+{
+       struct bch_fs *c, *c1, *c2;
+       size_t i;
 
-       for (i = 0, s = dev_name;
-            s;
-            (s = strchr(s, ':')) && (*s++ = '\0'))
-               devs[i++] = s;
+       if (!nr_devs)
+               return ERR_PTR(-EINVAL);
+
+       c = bch2_fs_open(devs, nr_devs, opts);
 
-       err = bch2_fs_open(devs, nr_devs, opts, &c);
-       if (err) {
+       if (IS_ERR(c) && PTR_ERR(c) == -EBUSY) {
                /*
                 * Already open?
                 * Look up each block device, make sure they all belong to a
                 * filesystem and they all belong to the _same_ filesystem
                 */
 
-               for (i = 0; i < nr_devs; i++) {
-                       struct block_device *bdev = lookup_bdev(devs[i]);
-                       struct bch_fs *c2;
-
-                       if (IS_ERR(bdev))
-                               goto err;
-
-                       c2 = bch2_bdev_to_fs(bdev);
-                       bdput(bdev);
+               c1 = bch2_path_to_fs(devs[0]);
+               if (!c1)
+                       return c;
 
-                       if (!c)
-                               c = c2;
-                       else if (c2)
+               for (i = 1; i < nr_devs; i++) {
+                       c2 = bch2_path_to_fs(devs[i]);
+                       if (!IS_ERR(c2))
                                closure_put(&c2->cl);
 
-                       if (!c)
-                               goto err;
-                       if (c != c2) {
-                               closure_put(&c->cl);
-                               goto err;
+                       if (c1 != c2) {
+                               closure_put(&c1->cl);
+                               return c;
                        }
                }
 
-               mutex_lock(&c->state_lock);
+               c = c1;
+       }
 
-               if (!bch2_fs_running(c)) {
-                       mutex_unlock(&c->state_lock);
-                       closure_put(&c->cl);
-                       err = "incomplete filesystem";
-                       c = NULL;
-                       goto err;
-               }
+       if (IS_ERR(c))
+               return c;
+
+       mutex_lock(&c->state_lock);
 
+       if (!bch2_fs_running(c)) {
                mutex_unlock(&c->state_lock);
+               closure_put(&c->cl);
+               pr_err("err mounting %s: incomplete filesystem", dev_name);
+               return ERR_PTR(-EINVAL);
        }
 
+       mutex_unlock(&c->state_lock);
+
        set_bit(BCH_FS_BDEV_MOUNTED, &c->flags);
+       return c;
+}
+
+static struct bch_fs *bch2_open_as_blockdevs(const char *_dev_name,
+                                            struct bch_opts opts)
+{
+       char *dev_name = NULL, **devs = NULL, *s;
+       struct bch_fs *c = ERR_PTR(-ENOMEM);
+       size_t i, nr_devs = 0;
+
+       dev_name = kstrdup(_dev_name, GFP_KERNEL);
+       if (!dev_name)
+               goto err;
+
+       for (s = dev_name; s; s = strchr(s + 1, ':'))
+               nr_devs++;
+
+       devs = kcalloc(nr_devs, sizeof(const char *), GFP_KERNEL);
+       if (!devs)
+               goto err;
+
+       for (i = 0, s = dev_name;
+            s;
+            (s = strchr(s, ':')) && (*s++ = '\0'))
+               devs[i++] = s;
+
+       c = __bch2_open_as_blockdevs(_dev_name, devs, nr_devs, opts);
 err:
        kfree(devs);
        kfree(dev_name);
-
-       if (!c)
-               pr_err("bch_fs_open err %s", err);
        return c;
 }
 
@@ -1168,6 +1521,7 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root)
 {
        struct bch_fs *c = root->d_sb->s_fs_info;
        enum bch_opt_id i;
+       char buf[512];
 
        for (i = 0; i < bch2_opts_nr; i++) {
                const struct bch_option *opt = &bch2_opt_table[i];
@@ -1179,17 +1533,10 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root)
                if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
                        continue;
 
-               switch (opt->type) {
-               case BCH_OPT_BOOL:
-                       seq_printf(seq, ",%s%s", v ? "" : "no", opt->attr.name);
-                       break;
-               case BCH_OPT_UINT:
-                       seq_printf(seq, ",%s=%llu", opt->attr.name, v);
-                       break;
-               case BCH_OPT_STR:
-                       seq_printf(seq, ",%s=%s", opt->attr.name, opt->choices[v]);
-                       break;
-               }
+               bch2_opt_to_text(c, buf, sizeof(buf), opt, v,
+                                OPT_SHOW_MOUNT_STYLE);
+               seq_putc(seq, ',');
+               seq_puts(seq, buf);
        }
 
        return 0;
@@ -1241,8 +1588,8 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
                return ERR_PTR(ret);
 
        c = bch2_open_as_blockdevs(dev_name, opts);
-       if (!c)
-               return ERR_PTR(-ENOENT);
+       if (IS_ERR(c))
+               return ERR_CAST(c);
 
        sb = sget(fs_type, bch2_test_super, bch2_set_super, flags|MS_NOSEC, c);
        if (IS_ERR(sb)) {
@@ -1268,13 +1615,24 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
        sb->s_maxbytes          = MAX_LFS_FILESIZE;
        sb->s_op                = &bch_super_operations;
        sb->s_export_op         = &bch_export_ops;
+#ifdef CONFIG_BCACHEFS_QUOTA
+       sb->s_qcop              = &bch2_quotactl_operations;
+       sb->s_quota_types       = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
+#endif
        sb->s_xattr             = bch2_xattr_handlers;
        sb->s_magic             = BCACHEFS_STATFS_MAGIC;
        sb->s_time_gran         = c->sb.time_precision;
        c->vfs_sb               = sb;
-       sb->s_bdi               = &c->bdi;
        strlcpy(sb->s_id, c->name, sizeof(sb->s_id));
 
+       ret = super_setup_bdi(sb);
+       if (ret)
+               goto err_put_super;
+
+       sb->s_bdi->congested_fn         = bch2_congested;
+       sb->s_bdi->congested_data       = c;
+       sb->s_bdi->ra_pages             = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+
        for_each_online_member(ca, c, i) {
                struct block_device *bdev = ca->disk_sb.bdev;
 
@@ -1336,12 +1694,6 @@ MODULE_ALIAS_FS("bcachefs");
 void bch2_vfs_exit(void)
 {
        unregister_filesystem(&bcache_fs_type);
-       if (bch2_dio_write_bioset)
-               bioset_free(bch2_dio_write_bioset);
-       if (bch2_dio_read_bioset)
-               bioset_free(bch2_dio_read_bioset);
-       if (bch2_writepage_bioset)
-               bioset_free(bch2_writepage_bioset);
        if (bch2_inode_cache)
                kmem_cache_destroy(bch2_inode_cache);
 }
@@ -1354,20 +1706,6 @@ int __init bch2_vfs_init(void)
        if (!bch2_inode_cache)
                goto err;
 
-       bch2_writepage_bioset =
-               bioset_create(4, offsetof(struct bch_writepage_io, op.op.wbio.bio));
-       if (!bch2_writepage_bioset)
-               goto err;
-
-       bch2_dio_read_bioset = bioset_create(4, offsetof(struct dio_read, rbio.bio));
-       if (!bch2_dio_read_bioset)
-               goto err;
-
-       bch2_dio_write_bioset =
-               bioset_create(4, offsetof(struct dio_write, iop.op.wbio.bio));
-       if (!bch2_dio_write_bioset)
-               goto err;
-
        ret = register_filesystem(&bcache_fs_type);
        if (ret)
                goto err;