git.sesse.net Git - bcachefs-tools-debian/commitdiff
Update bcachefs sources to eab3b355cf bcachefs: trace transaction restarts
author Kent Overstreet <kent.overstreet@gmail.com>
Mon, 16 Jul 2018 07:58:54 +0000 (03:58 -0400)
committer Kent Overstreet <kent.overstreet@gmail.com>
Mon, 16 Jul 2018 08:00:44 +0000 (04:00 -0400)
32 files changed:
.bcachefs_revision
cmd_migrate.c
libbcachefs/acl.c
libbcachefs/acl.h
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/bkey.h
libbcachefs/bset.c
libbcachefs/btree_cache.c
libbcachefs/btree_io.c
libbcachefs/btree_iter.c
libbcachefs/btree_iter.h
libbcachefs/btree_types.h
libbcachefs/btree_update.h
libbcachefs/btree_update_leaf.c
libbcachefs/dirent.c
libbcachefs/dirent.h
libbcachefs/error.c
libbcachefs/error.h
libbcachefs/fs-io.c
libbcachefs/fs-ioctl.c
libbcachefs/fs.c
libbcachefs/fs.h
libbcachefs/fsck.c
libbcachefs/fsck.h
libbcachefs/inode.c
libbcachefs/inode.h
libbcachefs/recovery.c
libbcachefs/str_hash.h
libbcachefs/util.c
libbcachefs/xattr.c
libbcachefs/xattr.h

index f1807172b5f87fde0f5116607fa728cb1b60b0ae..dddb04437eb6f5c534372f169fbe4c4149bbd346 100644 (file)
@@ -1 +1 @@
-940d6ca657ea70758f3f43323bfd531019a40d3c
+eab3b355cf6fcabbf07d7a9032c68e95cab37ad0
index 6186653427e7449a18cce66d9c3e77b7aef31297..44283c3cc3f84d5871d7cca00fe577dc32ed3bb9 100644 (file)
@@ -239,8 +239,9 @@ static void copy_xattrs(struct bch_fs *c, struct bch_inode_unpacked *dst,
 
                const struct xattr_handler *h = xattr_resolve_name(&attr);
 
-               int ret = bch2_xattr_set(c, dst->bi_inum, &hash_info, attr,
-                                        val, val_size, 0, h->flags, NULL);
+               int ret = bch2_trans_do(c, NULL, BTREE_INSERT_ATOMIC,
+                               bch2_xattr_set(&trans, dst->bi_inum, &hash_info, attr,
+                                              val, val_size, h->flags, 0));
                if (ret < 0)
                        die("error creating xattr: %s", strerror(-ret));
        }
index a8735bc04b4d04e4c3508d4e93792f47de2d4123..534ea94e545b66350623d3fc1a73d6ef8a68e1c4 100644 (file)
@@ -132,7 +132,8 @@ invalid:
  * Convert from in-memory to filesystem representation.
  */
 static struct bkey_i_xattr *
-bch2_acl_to_xattr(const struct posix_acl *acl,
+bch2_acl_to_xattr(struct btree_trans *trans,
+                 const struct posix_acl *acl,
                  int type)
 {
        struct bkey_i_xattr *xattr;
@@ -164,7 +165,7 @@ bch2_acl_to_xattr(const struct posix_acl *acl,
        if (u64s > U8_MAX)
                return ERR_PTR(-E2BIG);
 
-       xattr = kmalloc(u64s * sizeof(u64), GFP_KERNEL);
+       xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
        if (IS_ERR(xattr))
                return xattr;
 
@@ -214,20 +215,29 @@ struct posix_acl *bch2_get_acl(struct inode *vinode, int type)
 {
        struct bch_inode_info *inode = to_bch_ei(vinode);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct btree_iter iter;
+       struct btree_trans trans;
+       struct btree_iter *iter;
        struct bkey_s_c_xattr xattr;
-       struct bkey_s_c k;
        struct posix_acl *acl = NULL;
-       int name_index = acl_to_xattr_type(type);
 
-       k = bch2_xattr_get_iter(c, &iter, inode, "", name_index);
-       if (IS_ERR(k.k)) {
-               if (PTR_ERR(k.k) != -ENOENT)
-                       acl = ERR_CAST(k.k);
+       bch2_trans_init(&trans, c);
+retry:
+       bch2_trans_begin(&trans);
+
+       iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc,
+                       &inode->ei_str_hash, inode->v.i_ino,
+                       &X_SEARCH(acl_to_xattr_type(type), "", 0),
+                       0);
+       if (IS_ERR(iter)) {
+               if (PTR_ERR(iter) == -EINTR)
+                       goto retry;
+
+               if (PTR_ERR(iter) != -ENOENT)
+                       acl = ERR_CAST(iter);
                goto out;
        }
 
-       xattr = bkey_s_c_to_xattr(k);
+       xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));
 
        acl = bch2_acl_from_disk(xattr_val(xattr.v),
                        le16_to_cpu(xattr.v->x_val_len));
@@ -235,49 +245,59 @@ struct posix_acl *bch2_get_acl(struct inode *vinode, int type)
        if (!IS_ERR(acl))
                set_cached_acl(&inode->v, type, acl);
 out:
-       bch2_btree_iter_unlock(&iter);
+       bch2_trans_exit(&trans);
        return acl;
 }
 
-int __bch2_set_acl(struct inode *vinode, struct posix_acl *acl, int type)
+int bch2_set_acl_trans(struct btree_trans *trans,
+                      struct bch_inode_unpacked *inode_u,
+                      const struct bch_hash_info *hash_info,
+                      struct posix_acl *acl, int type)
 {
-       struct bch_inode_info *inode = to_bch_ei(vinode);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
        int ret;
 
        if (type == ACL_TYPE_DEFAULT &&
-           !S_ISDIR(inode->v.i_mode))
+           !S_ISDIR(inode_u->bi_mode))
                return acl ? -EACCES : 0;
 
        if (acl) {
                struct bkey_i_xattr *xattr =
-                       bch2_acl_to_xattr(acl, type);
+                       bch2_acl_to_xattr(trans, acl, type);
                if (IS_ERR(xattr))
                        return PTR_ERR(xattr);
 
-               ret = bch2_hash_set(bch2_xattr_hash_desc, &inode->ei_str_hash,
-                                   c, inode->v.i_ino, &inode->ei_journal_seq,
-                                   &xattr->k_i, 0);
-               kfree(xattr);
+               ret = __bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
+                                     inode_u->bi_inum, &xattr->k_i, 0);
        } else {
                struct xattr_search_key search =
                        X_SEARCH(acl_to_xattr_type(type), "", 0);
 
-               ret = bch2_hash_delete(bch2_xattr_hash_desc, &inode->ei_str_hash,
-                                      c, inode->v.i_ino, &inode->ei_journal_seq,
-                                      &search);
+               ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, hash_info,
+                                      inode_u->bi_inum, &search);
        }
 
-       if (!ret)
-               set_cached_acl(&inode->v, type, acl);
+       return ret == -ENOENT ? 0 : ret;
+}
 
-       return ret;
+static int inode_update_for_set_acl_fn(struct bch_inode_info *inode,
+                                      struct bch_inode_unpacked *bi,
+                                      void *p)
+{
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct timespec now = current_time(&inode->v);
+       umode_t mode = (unsigned long) p;
+
+       bi->bi_ctime    = timespec_to_bch2_time(c, now);
+       bi->bi_mode     = mode;
+       return 0;
 }
 
 int bch2_set_acl(struct inode *vinode, struct posix_acl *acl, int type)
 {
        struct bch_inode_info *inode = to_bch_ei(vinode);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct btree_trans trans;
+       struct bch_inode_unpacked inode_u;
        umode_t mode = inode->v.i_mode;
        int ret;
 
@@ -287,19 +307,76 @@ int bch2_set_acl(struct inode *vinode, struct posix_acl *acl, int type)
                        return ret;
        }
 
-       ret = __bch2_set_acl(vinode, acl, type);
-       if (ret)
-               return ret;
+       bch2_trans_init(&trans, c);
+retry:
+       bch2_trans_begin(&trans);
+
+       ret   = bch2_set_acl_trans(&trans,
+                                  &inode->ei_inode,
+                                  &inode->ei_str_hash,
+                                  acl, type) ?:
+               bch2_write_inode_trans(&trans, inode, &inode_u,
+                                      inode_update_for_set_acl_fn,
+                                      (void *)(unsigned long) mode) ?:
+               bch2_trans_commit(&trans, NULL, NULL,
+                                 &inode->ei_journal_seq,
+                                 BTREE_INSERT_ATOMIC|
+                                 BTREE_INSERT_NOUNLOCK);
+       if (ret == -EINTR)
+               goto retry;
+       if (unlikely(ret))
+               goto err;
+
+       bch2_inode_update_after_write(c, inode, &inode_u,
+                                     ATTR_CTIME|ATTR_MODE);
+
+       set_cached_acl(&inode->v, type, acl);
+err:
+       bch2_trans_exit(&trans);
+
+       return ret;
+}
+
+int bch2_acl_chmod(struct btree_trans *trans,
+                  struct bch_inode_info *inode,
+                  umode_t mode,
+                  struct posix_acl **new_acl)
+{
+       struct btree_iter *iter;
+       struct bkey_s_c_xattr xattr;
+       struct bkey_i_xattr *new;
+       struct posix_acl *acl;
+       int ret = 0;
 
-       if (mode != inode->v.i_mode) {
-               mutex_lock(&inode->ei_update_lock);
-               inode->v.i_mode = mode;
-               inode->v.i_ctime = current_time(&inode->v);
+       iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc,
+                       &inode->ei_str_hash, inode->v.i_ino,
+                       &X_SEARCH(BCH_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0),
+                       BTREE_ITER_INTENT);
+       if (IS_ERR(iter))
+               return PTR_ERR(iter) != -ENOENT ? PTR_ERR(iter) : 0;
+
+       xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));
+
+       acl = bch2_acl_from_disk(xattr_val(xattr.v),
+                       le16_to_cpu(xattr.v->x_val_len));
+       if (IS_ERR_OR_NULL(acl))
+               return PTR_ERR(acl);
+
+       ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode);
+       if (ret)
+               goto err;
 
-               ret = bch2_write_inode(c, inode);
-               mutex_unlock(&inode->ei_update_lock);
+       new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS);
+       if (IS_ERR(new)) {
+               ret = PTR_ERR(new);
+               goto err;
        }
 
+       bch2_trans_update(trans, iter, &new->k_i, 0);
+       *new_acl = acl;
+       acl = NULL;
+err:
+       kfree(acl);
        return ret;
 }
 
index 0be31ee9e59d3270f0e563d403b66e242954c3ab..e06724309ff8fa9a1b51e47bad5d30c33e10284b 100644 (file)
@@ -1,6 +1,11 @@
 #ifndef _BCACHEFS_ACL_H
 #define _BCACHEFS_ACL_H
 
+struct bch_inode_unpacked;
+struct bch_hash_info;
+struct bch_inode_info;
+struct posix_acl;
+
 #ifdef CONFIG_BCACHEFS_POSIX_ACL
 
 #define BCH_ACL_VERSION        0x0001
@@ -20,20 +25,30 @@ typedef struct {
        __le32          a_version;
 } bch_acl_header;
 
-struct posix_acl;
+struct posix_acl *bch2_get_acl(struct inode *, int);
 
-extern struct posix_acl *bch2_get_acl(struct inode *, int);
-extern int __bch2_set_acl(struct inode *, struct posix_acl *, int);
-extern int bch2_set_acl(struct inode *, struct posix_acl *, int);
+int bch2_set_acl_trans(struct btree_trans *,
+                      struct bch_inode_unpacked *,
+                      const struct bch_hash_info *,
+                      struct posix_acl *, int);
+int bch2_set_acl(struct inode *, struct posix_acl *, int);
+int bch2_acl_chmod(struct btree_trans *, struct bch_inode_info *,
+                  umode_t, struct posix_acl **);
 
 #else
 
-static inline int __bch2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+static inline int bch2_set_acl_trans(struct btree_trans *trans,
+                                    struct bch_inode_unpacked *inode_u,
+                                    const struct bch_hash_info *hash_info,
+                                    struct posix_acl *acl, int type)
 {
        return 0;
 }
 
-static inline int bch2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+static inline int bch2_acl_chmod(struct btree_trans *trans,
+                                struct bch_inode_info *inode,
+                                umode_t mode,
+                                struct posix_acl **new_acl)
 {
        return 0;
 }
index 1482b80a8672655edf2f37e0a42ccbaa176f8fcf..bd5ea6fc59d7d8bf0180adc25f2d92f86d57e743 100644 (file)
@@ -262,7 +262,11 @@ do {                                                                       \
        BCH_DEBUG_PARAM(journal_seq_verify,                             \
                "Store the journal sequence number in the version "     \
                "number of every btree key, and verify that btree "     \
-               "update ordering is preserved during recovery")
+               "update ordering is preserved during recovery")         \
+       BCH_DEBUG_PARAM(inject_invalid_keys,                            \
+               "Store the journal sequence number in the version "     \
+               "number of every btree key, and verify that btree "     \
+               "update ordering is preserved during recovery")         \
 
 #define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
 
@@ -465,6 +469,7 @@ enum {
        /* misc: */
        BCH_FS_BDEV_MOUNTED,
        BCH_FS_FSCK_FIXED_ERRORS,
+       BCH_FS_FSCK_UNFIXED_ERRORS,
        BCH_FS_FIXED_GENS,
        BCH_FS_REBUILD_REPLICAS,
        BCH_FS_HOLD_BTREE_WRITES,
index b6e7b983bc5bf76297ff9de1d209d204cec5c149..e300738d6c61b6eb21bdbe86d6807035d98b0112 100644 (file)
@@ -722,9 +722,7 @@ enum {
 
        __BCH_INODE_I_SIZE_DIRTY= 5,
        __BCH_INODE_I_SECTORS_DIRTY= 6,
-
-       /* not implemented yet: */
-       __BCH_INODE_HAS_XATTRS  = 7, /* has xattrs in xattr btree */
+       __BCH_INODE_UNLINKED    = 7,
 
        /* bits 20+ reserved for packed fields below: */
 };
@@ -736,7 +734,7 @@ enum {
 #define BCH_INODE_NOATIME      (1 << __BCH_INODE_NOATIME)
 #define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY)
 #define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY)
-#define BCH_INODE_HAS_XATTRS   (1 << __BCH_INODE_HAS_XATTRS)
+#define BCH_INODE_UNLINKED     (1 << __BCH_INODE_UNLINKED)
 
 LE32_BITMASK(INODE_STR_HASH,   struct bch_inode, bi_flags, 20, 24);
 LE32_BITMASK(INODE_NR_FIELDS,  struct bch_inode, bi_flags, 24, 32);
@@ -1222,6 +1220,7 @@ enum bch_sb_features {
        BCH_FEATURE_LZ4                 = 0,
        BCH_FEATURE_GZIP                = 1,
        BCH_FEATURE_ZSTD                = 2,
+       BCH_FEATURE_ATOMIC_NLINK        = 3,
 };
 
 /* options: */
index 2f62bd8e32582c570df0bd4fcb87d4cd9494ed09..bd1d21b0e49b17f609757091526add4826e0b71d 100644 (file)
@@ -206,14 +206,12 @@ void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *);
 
 static __always_inline int bversion_cmp(struct bversion l, struct bversion r)
 {
-       if (l.hi != r.hi)
-               return l.hi < r.hi ? -1 : 1;
-       if (l.lo != r.lo)
-               return l.lo < r.lo ? -1 : 1;
-       return 0;
+       return  (l.hi > r.hi) - (l.hi < r.hi) ?:
+               (l.lo > r.lo) - (l.lo < r.lo);
 }
 
 #define ZERO_VERSION   ((struct bversion) { .hi = 0, .lo = 0 })
+#define MAX_VERSION    ((struct bversion) { .hi = ~0, .lo = ~0ULL })
 
 static __always_inline int bversion_zero(struct bversion v)
 {
index 5c77787214c77e145ce1a62d6c6e00d1c81171ee..8c77fc509b55fdef2f005b254ab9d87ae1a5f26b 100644 (file)
@@ -1449,7 +1449,7 @@ static struct bkey_packed *bch2_bset_search(struct btree *b,
                       !btree_iter_pos_cmp_packed(b, &search, m, strictly_greater))
                        m = bkey_next(m);
 
-       if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
+       if (btree_keys_expensive_checks(b)) {
                struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
 
                BUG_ON(prev &&
index f15a415e37e359951c64649d4bfdde335ec0b161..db3712a83dc1453bfcfd42f7f6ae0609b2be0524 100644 (file)
@@ -730,6 +730,7 @@ retry:
                        if (bch2_btree_node_relock(iter, level + 1))
                                goto retry;
 
+                       trans_restart();
                        return ERR_PTR(-EINTR);
                }
        }
index 847dfd685eacda5af224a19b98ac7f10c272b767..94f56dbbeac33326c0b86ea3bcc6dda6f40d61ef 100644 (file)
@@ -1298,7 +1298,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
                struct bkey_s_c u = bkey_disassemble(b, k, &tmp);
                const char *invalid = bch2_bkey_val_invalid(c, type, u);
 
-               if (invalid) {
+               if (invalid ||
+                   (inject_invalid_keys(c) &&
+                    !bversion_cmp(u.k->version, MAX_VERSION))) {
                        char buf[160];
 
                        bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u);
@@ -1310,6 +1312,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
                        i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
                        memmove_u64s_down(k, bkey_next(k),
                                          (u64 *) vstruct_end(i) - (u64 *) k);
+                       set_btree_bset_end(b, b->set);
                        continue;
                }
 
index 097b68e073992dc7887508bf98f21e2e657e8d09..a52ec12e9058d88b7c64cfdfd8f1498e6ef9b1a2 100644 (file)
@@ -262,6 +262,9 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
 
        if (ret)
                __btree_node_lock_type(c, b, type);
+       else
+               trans_restart();
+
        return ret;
 }
 
@@ -1555,6 +1558,7 @@ void bch2_btree_iter_unlink(struct btree_iter *iter)
        for_each_linked_btree_iter(iter, linked)
                if (linked->next == iter) {
                        linked->next = iter->next;
+                       iter->next = iter;
                        return;
                }
 
@@ -1571,8 +1575,9 @@ void bch2_btree_iter_link(struct btree_iter *iter, struct btree_iter *new)
        if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
                unsigned nr_iters = 0;
 
-               for_each_btree_iter(iter, new)
-                       nr_iters++;
+               for_each_btree_iter(new, iter)
+                       if (iter->btree_id == new->btree_id)
+                               nr_iters++;
 
                BUG_ON(nr_iters > SIX_LOCK_MAX_RECURSE);
        }
@@ -1580,8 +1585,278 @@ void bch2_btree_iter_link(struct btree_iter *iter, struct btree_iter *new)
 
 void bch2_btree_iter_copy(struct btree_iter *dst, struct btree_iter *src)
 {
+       unsigned i;
+
        __bch2_btree_iter_unlock(dst);
        memcpy(dst, src, offsetof(struct btree_iter, next));
-       dst->nodes_locked = dst->nodes_intent_locked = 0;
-       dst->uptodate = BTREE_ITER_NEED_RELOCK;
+
+       for (i = 0; i < BTREE_MAX_DEPTH; i++)
+               if (btree_node_locked(dst, i))
+                       six_lock_increment(&dst->l[i].b->lock,
+                                          __btree_lock_want(dst, i));
+}
+
+/* new transactional stuff: */
+
+static void btree_trans_verify(struct btree_trans *trans)
+{
+       unsigned i;
+
+       for (i = 0; i < trans->nr_iters; i++) {
+               struct btree_iter *iter = &trans->iters[i];
+
+               BUG_ON(btree_iter_linked(iter) !=
+                      ((trans->iters_linked & (1 << i)) &&
+                       !is_power_of_2(trans->iters_linked)));
+       }
+}
+
+void bch2_trans_iter_free(struct btree_trans *trans,
+                         struct btree_iter *iter)
+{
+       unsigned idx;
+
+       for (idx = 0; idx < trans->nr_iters; idx++)
+               if (&trans->iters[idx] == iter)
+                       goto found;
+       BUG();
+found:
+       BUG_ON(!(trans->iters_linked & (1U << idx)));
+
+       trans->iters_live       &= ~(1U << idx);
+       trans->iters_linked     &= ~(1U << idx);
+       bch2_btree_iter_unlink(iter);
+}
+
+static int btree_trans_realloc_iters(struct btree_trans *trans)
+{
+       struct btree_iter *new_iters;
+       unsigned i;
+
+       bch2_trans_unlock(trans);
+
+       new_iters = kmalloc(sizeof(struct btree_iter) * BTREE_ITER_MAX,
+                           GFP_NOFS);
+       if (!new_iters)
+               return -ENOMEM;
+
+       memcpy(new_iters, trans->iters,
+              sizeof(struct btree_iter) * trans->nr_iters);
+       trans->iters = new_iters;
+
+       for (i = 0; i < trans->nr_iters; i++)
+               trans->iters[i].next = &trans->iters[i];
+
+       if (trans->iters_linked) {
+               unsigned first_linked = __ffs(trans->iters_linked);
+
+               for (i = first_linked + 1; i < trans->nr_iters; i++)
+                       if (trans->iters_linked & (1 << i))
+                               bch2_btree_iter_link(&trans->iters[first_linked],
+                                                    &trans->iters[i]);
+       }
+
+       btree_trans_verify(trans);
+
+       if (trans->iters_live) {
+               trans_restart();
+               return -EINTR;
+       }
+
+       return 0;
+}
+
+int bch2_trans_preload_iters(struct btree_trans *trans)
+{
+       if (trans->iters != trans->iters_onstack)
+               return 0;
+
+       return btree_trans_realloc_iters(trans);
+}
+
+static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans,
+                                                unsigned btree_id,
+                                                unsigned flags, u64 iter_id)
+{
+       struct btree_iter *iter;
+       int idx;
+
+       BUG_ON(trans->nr_iters > BTREE_ITER_MAX);
+
+       for (idx = 0; idx < trans->nr_iters; idx++)
+               if (trans->iter_ids[idx] == iter_id)
+                       goto found;
+       idx = -1;
+found:
+       if (idx < 0) {
+               idx = ffz(trans->iters_linked);
+               if (idx < trans->nr_iters)
+                       goto got_slot;
+
+               BUG_ON(trans->nr_iters == BTREE_ITER_MAX);
+
+               if (trans->iters == trans->iters_onstack &&
+                   trans->nr_iters == ARRAY_SIZE(trans->iters_onstack)) {
+                       int ret = btree_trans_realloc_iters(trans);
+                       if (ret)
+                               return ERR_PTR(ret);
+               }
+
+               idx = trans->nr_iters++;
+got_slot:
+               trans->iter_ids[idx] = iter_id;
+               iter = &trans->iters[idx];
+
+               bch2_btree_iter_init(iter, trans->c, btree_id, POS_MIN, flags);
+       } else {
+               iter = &trans->iters[idx];
+
+               BUG_ON(iter->btree_id != btree_id);
+               BUG_ON((iter->flags ^ flags) &
+                      (BTREE_ITER_SLOTS|BTREE_ITER_IS_EXTENTS));
+
+               iter->flags &= ~(BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
+               iter->flags |= flags & (BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
+       }
+
+       BUG_ON(trans->iters_live & (1 << idx));
+       trans->iters_live |= 1 << idx;
+
+       if (trans->iters_linked &&
+           !(trans->iters_linked & (1 << idx)))
+               bch2_btree_iter_link(&trans->iters[__ffs(trans->iters_linked)],
+                                    iter);
+
+       trans->iters_linked |= 1 << idx;
+
+       btree_trans_verify(trans);
+
+       return iter;
+}
+
+struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
+                                        enum btree_id btree_id,
+                                        struct bpos pos, unsigned flags,
+                                        u64 iter_id)
+{
+       struct btree_iter *iter =
+               __btree_trans_get_iter(trans, btree_id, flags, iter_id);
+
+       if (!IS_ERR(iter))
+               bch2_btree_iter_set_pos(iter, pos);
+       return iter;
+}
+
+struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans,
+                                         struct btree_iter *src,
+                                         u64 iter_id)
+{
+       struct btree_iter *iter =
+               __btree_trans_get_iter(trans, src->btree_id,
+                                      src->flags, iter_id);
+
+       if (!IS_ERR(iter))
+               bch2_btree_iter_copy(iter, src);
+       return iter;
+}
+
+void *bch2_trans_kmalloc(struct btree_trans *trans,
+                        size_t size)
+{
+       void *ret;
+
+       if (trans->mem_top + size > trans->mem_bytes) {
+               size_t old_bytes = trans->mem_bytes;
+               size_t new_bytes = roundup_pow_of_two(trans->mem_top + size);
+               void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);
+
+               if (!new_mem)
+                       return ERR_PTR(-ENOMEM);
+
+               trans->mem = new_mem;
+               trans->mem_bytes = new_bytes;
+
+               if (old_bytes) {
+                       trans_restart();
+                       return ERR_PTR(-EINTR);
+               }
+       }
+
+       ret = trans->mem + trans->mem_top;
+       trans->mem_top += size;
+       return ret;
+}
+
+int bch2_trans_unlock(struct btree_trans *trans)
+{
+       unsigned iters = trans->iters_linked;
+       int ret = 0;
+
+       while (iters) {
+               unsigned idx = __ffs(iters);
+               struct btree_iter *iter = &trans->iters[idx];
+
+               if (iter->flags & BTREE_ITER_ERROR)
+                       ret = -EIO;
+
+               __bch2_btree_iter_unlock(iter);
+               iters ^= 1 << idx;
+       }
+
+       return ret;
+}
+
+void __bch2_trans_begin(struct btree_trans *trans)
+{
+       unsigned idx;
+
+       btree_trans_verify(trans);
+
+       /*
+        * On transaction restart, the transaction isn't required to allocate
+        * all the same iterators it did on the last iteration:
+        *
+        * Unlink any iterators it didn't use this iteration, assuming it got
+        * further (allocated an iter with a higher idx) than where the iter
+        * was originally allocated:
+        */
+       while (trans->iters_linked &&
+              trans->iters_live &&
+              (idx = __fls(trans->iters_linked)) >
+              __fls(trans->iters_live)) {
+               trans->iters_linked ^= 1 << idx;
+               bch2_btree_iter_unlink(&trans->iters[idx]);
+       }
+
+       trans->iters_live       = 0;
+       trans->nr_updates       = 0;
+       trans->mem_top          = 0;
+
+       btree_trans_verify(trans);
+}
+
+void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c)
+{
+       trans->c                = c;
+       trans->nr_restarts      = 0;
+       trans->nr_iters         = 0;
+       trans->iters_live       = 0;
+       trans->iters_linked     = 0;
+       trans->nr_updates       = 0;
+       trans->mem_top          = 0;
+       trans->mem_bytes        = 0;
+       trans->mem              = NULL;
+       trans->iters            = trans->iters_onstack;
+}
+
+int bch2_trans_exit(struct btree_trans *trans)
+{
+       int ret = bch2_trans_unlock(trans);
+
+       kfree(trans->mem);
+       if (trans->iters != trans->iters_onstack)
+               kfree(trans->iters);
+       trans->mem      = (void *) 0x1;
+       trans->iters    = (void *) 0x1;
+       return ret;
 }
index 5db1cc581f56a91f79bf2009c5c17ff5e18b9c60..d046ad71a7ba21213b283709a58b6f2380bd8176 100644 (file)
@@ -269,4 +269,68 @@ static inline int btree_iter_err(struct bkey_s_c k)
        return PTR_ERR_OR_ZERO(k.k);
 }
 
+/* new multiple iterator interface: */
+
+int bch2_trans_preload_iters(struct btree_trans *);
+void bch2_trans_iter_free(struct btree_trans *,
+                               struct btree_iter *);
+
+struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id,
+                                        struct bpos, unsigned, u64);
+struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *,
+                                         struct btree_iter *, u64);
+
+static __always_inline u64 __btree_iter_id(void)
+{
+       u64 ret = 0;
+
+       ret <<= 32;
+       ret |= _RET_IP_ & U32_MAX;
+       ret <<= 32;
+       ret |= _THIS_IP_ & U32_MAX;
+       return ret;
+}
+
+static __always_inline struct btree_iter *
+bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id,
+                   struct bpos pos, unsigned flags)
+{
+       return __bch2_trans_get_iter(trans, btree_id, pos, flags,
+                                    __btree_iter_id());
+}
+
+static __always_inline struct btree_iter *
+bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src)
+{
+
+       return __bch2_trans_copy_iter(trans, src, __btree_iter_id());
+}
+
+void __bch2_trans_begin(struct btree_trans *);
+
+void *bch2_trans_kmalloc(struct btree_trans *, size_t);
+int bch2_trans_unlock(struct btree_trans *);
+void bch2_trans_init(struct btree_trans *, struct bch_fs *);
+int bch2_trans_exit(struct btree_trans *);
+
+#ifdef TRACE_TRANSACTION_RESTARTS
+#define bch2_trans_begin(_trans)                                       \
+do {                                                                   \
+       if (is_power_of_2((_trans)->nr_restarts) &&                     \
+           (_trans)->nr_restarts >= 8)                                 \
+               pr_info("nr restarts: %zu", (_trans)->nr_restarts);     \
+                                                                       \
+       (_trans)->nr_restarts++;                                        \
+       __bch2_trans_begin(_trans);                                     \
+} while (0)
+#else
+#define bch2_trans_begin(_trans)       __bch2_trans_begin(_trans)
+#endif
+
+#ifdef TRACE_TRANSACTION_RESTARTS_ALL
+#define trans_restart(...) pr_info("transaction restart" __VA_ARGS__)
+#else
+#define trans_restart(...) no_printk("transaction restart" __VA_ARGS__)
+#endif
+
 #endif /* _BCACHEFS_BTREE_ITER_H */
index daa648c639d359fa3cd12727adbbed765f51cbc5..39e2db757f9a92f7555faa5f953a72528059d600 100644 (file)
@@ -253,6 +253,40 @@ struct btree_iter {
        struct btree_iter       *next;
 };
 
+#define BTREE_ITER_MAX         8
+
+struct btree_insert_entry {
+       struct btree_iter *iter;
+       struct bkey_i   *k;
+       unsigned        extra_res;
+       /*
+        * true if entire key was inserted - can only be false for
+        * extents
+        */
+       bool            done;
+};
+
+struct btree_trans {
+       struct bch_fs           *c;
+       size_t                  nr_restarts;
+
+       u8                      nr_iters;
+       u8                      iters_live;
+       u8                      iters_linked;
+       u8                      nr_updates;
+
+       unsigned                mem_top;
+       unsigned                mem_bytes;
+       void                    *mem;
+
+       struct btree_iter       *iters;
+       u64                     iter_ids[BTREE_ITER_MAX];
+
+       struct btree_insert_entry updates[BTREE_ITER_MAX];
+
+       struct btree_iter       iters_onstack[2];
+};
+
 #define BTREE_FLAG(flag)                                               \
 static inline bool btree_node_ ## flag(struct btree *b)                        \
 {      return test_bit(BTREE_NODE_ ## flag, &b->flags); }              \
index aac97958cc3b2b90f7a0a866e08648acc74bfd30..5e47d4cd7c48727131437ff035a684a8e8a09b52 100644 (file)
@@ -27,16 +27,7 @@ struct btree_insert {
        bool                    did_work;
 
        unsigned short          nr;
-       struct btree_insert_entry {
-               struct btree_iter *iter;
-               struct bkey_i   *k;
-               unsigned        extra_res;
-               /*
-                * true if entire key was inserted - can only be false for
-                * extents
-                */
-               bool            done;
-       }                       *entries;
+       struct btree_insert_entry  *entries;
 };
 
 int __bch2_btree_insert_at(struct btree_insert *);
@@ -149,4 +140,31 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *,
 int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *,
                               struct btree *, struct bkey_i_extent *);
 
+/* new transactional interface: */
+
+void bch2_trans_update(struct btree_trans *, struct btree_iter *,
+                            struct bkey_i *, unsigned);
+int bch2_trans_commit(struct btree_trans *,
+                     struct disk_reservation *,
+                     struct extent_insert_hook *,
+                     u64 *, unsigned);
+
+#define bch2_trans_do(_c, _journal_seq, _flags, _do)                   \
+({                                                                     \
+       struct btree_trans trans;                                       \
+       int _ret;                                                       \
+                                                                       \
+       bch2_trans_init(&trans, (_c));                                  \
+                                                                       \
+       do {                                                            \
+               bch2_trans_begin(&trans);                               \
+                                                                       \
+               _ret = (_do) ?: bch2_trans_commit(&trans, NULL, NULL,   \
+                                       (_journal_seq), (_flags));      \
+       } while (_ret == -EINTR);                                       \
+                                                                       \
+       bch2_trans_exit(&trans);                                        \
+       _ret;                                                           \
+})
+
 #endif /* _BCACHEFS_BTREE_UPDATE_H */
index 588a1997e5eee1c9ad441034ccad782ad9ff211c..a481b0d632d9c3af249171fbd7231b7c0bfe6a72 100644 (file)
@@ -309,8 +309,10 @@ static inline int do_btree_insert_at(struct btree_insert *trans,
        unsigned u64s;
        int ret;
 
-       trans_for_each_entry(trans, i)
+       trans_for_each_entry(trans, i) {
                BUG_ON(i->done);
+               BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK);
+       }
 
        u64s = 0;
        trans_for_each_entry(trans, i)
@@ -330,6 +332,7 @@ static inline int do_btree_insert_at(struct btree_insert *trans,
 
        if (race_fault()) {
                ret = -EINTR;
+               trans_restart(" (race)");
                goto out;
        }
 
@@ -354,10 +357,14 @@ static inline int do_btree_insert_at(struct btree_insert *trans,
                }
        }
 
-       if (journal_seq_verify(c) &&
-           !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
-               trans_for_each_entry(trans, i)
-                       i->k->k.version.lo = trans->journal_res.seq;
+       if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) {
+               if (journal_seq_verify(c))
+                       trans_for_each_entry(trans, i)
+                               i->k->k.version.lo = trans->journal_res.seq;
+               else if (inject_invalid_keys(c))
+                       trans_for_each_entry(trans, i)
+                               i->k->k.version = MAX_VERSION;
+       }
 
        trans_for_each_entry(trans, i) {
                switch (btree_insert_key_leaf(trans, i)) {
@@ -398,6 +405,17 @@ out:
        return ret;
 }
 
+static inline void btree_insert_entry_checks(struct bch_fs *c,
+                                            struct btree_insert_entry *i)
+{
+       BUG_ON(i->iter->level);
+       BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
+       BUG_ON(debug_check_bkeys(c) &&
+              !bkey_deleted(&i->k->k) &&
+              bch2_bkey_invalid(c, i->iter->btree_id,
+                                bkey_i_to_s_c(i->k)));
+}
+
 /**
  * __bch_btree_insert_at - insert keys at given iterator positions
  *
@@ -418,20 +436,16 @@ int __bch2_btree_insert_at(struct btree_insert *trans)
        unsigned flags;
        int ret;
 
+       BUG_ON(!trans->nr);
+
        for_each_btree_iter(trans->entries[0].iter, linked)
                bch2_btree_iter_verify_locks(linked);
 
        /* for the sake of sanity: */
        BUG_ON(trans->nr > 1 && !(trans->flags & BTREE_INSERT_ATOMIC));
 
-       trans_for_each_entry(trans, i) {
-               BUG_ON(i->iter->level);
-               BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
-               BUG_ON(debug_check_bkeys(c) &&
-                      !bkey_deleted(&i->k->k) &&
-                      bch2_bkey_invalid(c, i->iter->btree_id,
-                                        bkey_i_to_s_c(i->k)));
-       }
+       trans_for_each_entry(trans, i)
+               btree_insert_entry_checks(c, i);
 
        bubble_sort(trans->entries, trans->nr, btree_trans_cmp);
 
@@ -442,7 +456,12 @@ retry:
        cycle_gc_lock = false;
 
        trans_for_each_entry(trans, i) {
+               unsigned old_locks_want = i->iter->locks_want;
+               unsigned old_uptodate = i->iter->uptodate;
+
                if (!bch2_btree_iter_upgrade(i->iter, 1, true)) {
+                       trans_restart(" (failed upgrade, locks_want %u uptodate %u)",
+                                     old_locks_want, old_uptodate);
                        ret = -EINTR;
                        goto err;
                }
@@ -515,8 +534,10 @@ err:
                 * don't care if we got ENOSPC because we told split it
                 * couldn't block:
                 */
-               if (!ret || (flags & BTREE_INSERT_NOUNLOCK))
+               if (!ret || (flags & BTREE_INSERT_NOUNLOCK)) {
+                       trans_restart(" (split)");
                        ret = -EINTR;
+               }
        }
 
        if (cycle_gc_lock) {
@@ -531,13 +552,16 @@ err:
        }
 
        if (ret == -EINTR) {
-               if (flags & BTREE_INSERT_NOUNLOCK)
+               if (flags & BTREE_INSERT_NOUNLOCK) {
+                       trans_restart(" (can't unlock)");
                        goto out;
+               }
 
                trans_for_each_entry(trans, i) {
                        int ret2 = bch2_btree_iter_traverse(i->iter);
                        if (ret2) {
                                ret = ret2;
+                               trans_restart(" (traverse)");
                                goto out;
                        }
 
@@ -550,11 +574,56 @@ err:
                 */
                if (!(flags & BTREE_INSERT_ATOMIC))
                        goto retry;
+
+               trans_restart(" (atomic)");
        }
 
        goto out;
 }
 
+void bch2_trans_update(struct btree_trans *trans,
+                      struct btree_iter *iter,
+                      struct bkey_i *k,
+                      unsigned extra_journal_res)
+{
+       struct btree_insert_entry *i;
+
+       BUG_ON(trans->nr_updates >= ARRAY_SIZE(trans->updates));
+
+       i = &trans->updates[trans->nr_updates++];
+
+       *i = (struct btree_insert_entry) {
+               .iter   = iter,
+               .k              = k,
+               .extra_res      = extra_journal_res,
+       };
+
+       btree_insert_entry_checks(trans->c, i);
+}
+
+int bch2_trans_commit(struct btree_trans *trans,
+                     struct disk_reservation *disk_res,
+                     struct extent_insert_hook *hook,
+                     u64 *journal_seq,
+                     unsigned flags)
+{
+       struct btree_insert insert = {
+               .c              = trans->c,
+               .disk_res       = disk_res,
+               .journal_seq    = journal_seq,
+               .flags          = flags,
+               .nr             = trans->nr_updates,
+               .entries        = trans->updates,
+       };
+
+       if (!trans->nr_updates)
+               return 0;
+
+       trans->nr_updates = 0;
+
+       return __bch2_btree_insert_at(&insert);
+}
+
 int bch2_btree_delete_at(struct btree_iter *iter, unsigned flags)
 {
        struct bkey_i k;
index d3dd3eb71837062b9d2f44e1b3d026efe35e540c..d979ae0eaa17ec4de6d6270428e65f01e6191c74 100644 (file)
@@ -141,8 +141,8 @@ void bch2_dirent_to_text(struct bch_fs *c, char *buf,
        }
 }
 
-static struct bkey_i_dirent *dirent_create_key(u8 type,
-                               const struct qstr *name, u64 dst)
+static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
+                               u8 type, const struct qstr *name, u64 dst)
 {
        struct bkey_i_dirent *dirent;
        unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len);
@@ -152,9 +152,9 @@ static struct bkey_i_dirent *dirent_create_key(u8 type,
 
        BUG_ON(u64s > U8_MAX);
 
-       dirent = kmalloc(u64s * sizeof(u64), GFP_NOFS);
-       if (!dirent)
-               return ERR_PTR(-ENOMEM);
+       dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
+       if (IS_ERR(dirent))
+               return dirent;
 
        bkey_dirent_init(&dirent->k_i);
        dirent->k.u64s = u64s;
@@ -172,23 +172,31 @@ static struct bkey_i_dirent *dirent_create_key(u8 type,
        return dirent;
 }
 
-int bch2_dirent_create(struct bch_fs *c, u64 dir_inum,
-                      const struct bch_hash_info *hash_info,
-                      u8 type, const struct qstr *name, u64 dst_inum,
-                      u64 *journal_seq, int flags)
+int __bch2_dirent_create(struct btree_trans *trans,
+                        u64 dir_inum, const struct bch_hash_info *hash_info,
+                        u8 type, const struct qstr *name, u64 dst_inum,
+                        int flags)
 {
        struct bkey_i_dirent *dirent;
        int ret;
 
-       dirent = dirent_create_key(type, name, dst_inum);
-       if (IS_ERR(dirent))
-               return PTR_ERR(dirent);
+       dirent = dirent_create_key(trans, type, name, dst_inum);
+       ret = PTR_ERR_OR_ZERO(dirent);
+       if (ret)
+               return ret;
 
-       ret = bch2_hash_set(bch2_dirent_hash_desc, hash_info, c, dir_inum,
-                          journal_seq, &dirent->k_i, flags);
-       kfree(dirent);
+       return __bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
+                              dir_inum, &dirent->k_i, flags);
+}
 
-       return ret;
+int bch2_dirent_create(struct bch_fs *c, u64 dir_inum,
+                      const struct bch_hash_info *hash_info,
+                      u8 type, const struct qstr *name, u64 dst_inum,
+                      u64 *journal_seq, int flags)
+{
+       return bch2_trans_do(c, journal_seq, flags,
+               __bch2_dirent_create(&trans, dir_inum, hash_info,
+                                    type, name, dst_inum, flags));
 }
 
 static void dirent_copy_target(struct bkey_i_dirent *dst,
@@ -204,151 +212,117 @@ static struct bpos bch2_dirent_pos(struct bch_inode_info *inode,
        return POS(inode->v.i_ino, bch2_dirent_hash(&inode->ei_str_hash, name));
 }
 
-int bch2_dirent_rename(struct bch_fs *c,
+int bch2_dirent_rename(struct btree_trans *trans,
                struct bch_inode_info *src_dir, const struct qstr *src_name,
                struct bch_inode_info *dst_dir, const struct qstr *dst_name,
-               u64 *journal_seq, enum bch_rename_mode mode)
+               enum bch_rename_mode mode)
 {
-       struct btree_iter src_iter, dst_iter, whiteout_iter;
+       struct btree_iter *src_iter, *dst_iter;
        struct bkey_s_c old_src, old_dst;
-       struct bkey delete;
        struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
-       struct bpos src_pos = bch2_dirent_pos(src_dir, src_name);
        struct bpos dst_pos = bch2_dirent_pos(dst_dir, dst_name);
-       bool need_whiteout;
        int ret;
 
-       bch2_btree_iter_init(&src_iter, c, BTREE_ID_DIRENTS, src_pos,
-                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-       bch2_btree_iter_init(&dst_iter, c, BTREE_ID_DIRENTS, dst_pos,
-                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-       bch2_btree_iter_link(&src_iter, &dst_iter);
-
-       bch2_btree_iter_init(&whiteout_iter, c, BTREE_ID_DIRENTS, src_pos,
-                            BTREE_ITER_SLOTS);
-       bch2_btree_iter_link(&src_iter, &whiteout_iter);
-
-       if (mode == BCH_RENAME_EXCHANGE) {
-               new_src = dirent_create_key(0, src_name, 0);
-               if (IS_ERR(new_src)) {
-                       ret = PTR_ERR(new_src);
-                       goto err;
-               }
-       } else {
-               new_src = (void *) &delete;
-       }
-
-       new_dst = dirent_create_key(0, dst_name, 0);
-       if (IS_ERR(new_dst)) {
-               ret = PTR_ERR(new_dst);
-               goto err;
-       }
-retry:
-       /*
-        * Note that on -EINTR/dropped locks we're not restarting the lookup
-        * from the original hashed position (like we do when creating dirents,
-        * in bch_hash_set) -  we never move existing dirents to different slot:
-        */
-       old_src = bch2_hash_lookup_at(bch2_dirent_hash_desc,
-                                    &src_dir->ei_str_hash,
-                                    &src_iter, src_name);
-       if ((ret = btree_iter_err(old_src)))
-               goto err;
-
-       ret = bch2_hash_needs_whiteout(bch2_dirent_hash_desc,
-                               &src_dir->ei_str_hash,
-                               &whiteout_iter, &src_iter);
-       if (ret < 0)
-               goto err;
-       need_whiteout = ret;
-
        /*
+        * Lookup dst:
+        *
         * Note that in BCH_RENAME mode, we're _not_ checking if
         * the target already exists - we're relying on the VFS
         * to do that check for us for correctness:
         */
-       old_dst = mode == BCH_RENAME
-               ? bch2_hash_hole_at(bch2_dirent_hash_desc, &dst_iter)
-               : bch2_hash_lookup_at(bch2_dirent_hash_desc,
-                                    &dst_dir->ei_str_hash,
-                                    &dst_iter, dst_name);
-       if ((ret = btree_iter_err(old_dst)))
-               goto err;
-
-       switch (mode) {
-       case BCH_RENAME:
-               bkey_init(&new_src->k);
-               dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
-
-               if (bkey_cmp(dst_pos, src_iter.pos) <= 0 &&
-                   bkey_cmp(src_iter.pos, dst_iter.pos) < 0) {
-                       /*
-                        * If we couldn't insert new_dst at its hashed
-                        * position (dst_pos) due to a hash collision,
-                        * and we're going to be deleting in
-                        * between the hashed position and first empty
-                        * slot we found - just overwrite the pos we
-                        * were going to delete:
-                        *
-                        * Note: this is a correctness issue, in this
-                        * situation bch2_hash_needs_whiteout() could
-                        * return false when the whiteout would have
-                        * been needed if we inserted at the pos
-                        * __dirent_find_hole() found
-                        */
-                       new_dst->k.p = src_iter.pos;
-                       ret = bch2_btree_insert_at(c, NULL, NULL,
-                                       journal_seq,
-                                       BTREE_INSERT_ATOMIC,
-                                       BTREE_INSERT_ENTRY(&src_iter,
-                                                          &new_dst->k_i));
-                       goto err;
-               }
+       dst_iter = mode == BCH_RENAME
+               ? bch2_hash_hole(trans, bch2_dirent_hash_desc,
+                                &dst_dir->ei_str_hash,
+                                dst_dir->v.i_ino, dst_name)
+               : bch2_hash_lookup(trans, bch2_dirent_hash_desc,
+                                  &dst_dir->ei_str_hash,
+                                  dst_dir->v.i_ino, dst_name,
+                                  BTREE_ITER_INTENT);
+       if (IS_ERR(dst_iter))
+               return PTR_ERR(dst_iter);
+       old_dst = bch2_btree_iter_peek_slot(dst_iter);
+
+       /* Lookup src: */
+       src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc,
+                                   &src_dir->ei_str_hash,
+                                   src_dir->v.i_ino, src_name,
+                                   BTREE_ITER_INTENT);
+       if (IS_ERR(src_iter))
+               return PTR_ERR(src_iter);
+       old_src = bch2_btree_iter_peek_slot(src_iter);
+
+       /* Create new dst key: */
+       new_dst = dirent_create_key(trans, 0, dst_name, 0);
+       if (IS_ERR(new_dst))
+               return PTR_ERR(new_dst);
+
+       dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
+       new_dst->k.p = dst_iter->pos;
+
+       /* Create new src key: */
+       if (mode == BCH_RENAME_EXCHANGE) {
+               new_src = dirent_create_key(trans, 0, src_name, 0);
+               if (IS_ERR(new_src))
+                       return PTR_ERR(new_src);
 
-               if (need_whiteout)
-                       new_src->k.type = BCH_DIRENT_WHITEOUT;
-               break;
-       case BCH_RENAME_OVERWRITE:
+               dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst));
+               new_src->k.p = src_iter->pos;
+       } else {
+               new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
+               if (IS_ERR(new_src))
+                       return PTR_ERR(new_src);
                bkey_init(&new_src->k);
-               dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
+               new_src->k.p = src_iter->pos;
 
-               if (bkey_cmp(dst_pos, src_iter.pos) <= 0 &&
-                   bkey_cmp(src_iter.pos, dst_iter.pos) < 0) {
+               if (bkey_cmp(dst_pos, src_iter->pos) <= 0 &&
+                   bkey_cmp(src_iter->pos, dst_iter->pos) < 0) {
                        /*
-                        * Same case described above -
-                        * bch_hash_needs_whiteout could spuriously
-                        * return false, but we have to insert at
-                        * dst_iter.pos because we're overwriting
-                        * another dirent:
+                        * We have a hash collision for the new dst key,
+                        * and new_src - the key we're deleting - is between
+                        * new_dst's hashed slot and the slot we're going to be
+                        * inserting it into - oops.  This will break the hash
+                        * table if we don't deal with it:
                         */
-                       new_src->k.type = BCH_DIRENT_WHITEOUT;
-               } else if (need_whiteout)
-                       new_src->k.type = BCH_DIRENT_WHITEOUT;
-               break;
-       case BCH_RENAME_EXCHANGE:
-               dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst));
-               dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
-               break;
+                       if (mode == BCH_RENAME) {
+                               /*
+                                * If we're not overwriting, we can just insert
+                                * new_dst at the src position:
+                                */
+                               new_dst->k.p = src_iter->pos;
+                               bch2_trans_update(trans, src_iter, &new_dst->k_i, 0);
+                               return 0;
+                       } else {
+                               /* If we're overwriting, we can't insert new_dst
+                                * at a different slot because it has to
+                                * overwrite old_dst - just make sure to use a
+                                * whiteout when deleting src:
+                                */
+                               new_src->k.type = BCH_DIRENT_WHITEOUT;
+                       }
+               } else {
+                       /* Check if we need a whiteout to delete src: */
+                       ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc,
+                                                      &src_dir->ei_str_hash,
+                                                      src_iter);
+                       if (ret < 0)
+                               return ret;
+
+                       if (ret)
+                               new_src->k.type = BCH_DIRENT_WHITEOUT;
+               }
        }
 
-       new_src->k.p = src_iter.pos;
-       new_dst->k.p = dst_iter.pos;
-       ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq,
-                       BTREE_INSERT_ATOMIC,
-                       BTREE_INSERT_ENTRY(&src_iter, &new_src->k_i),
-                       BTREE_INSERT_ENTRY(&dst_iter, &new_dst->k_i));
-err:
-       if (ret == -EINTR)
-               goto retry;
-
-       bch2_btree_iter_unlock(&whiteout_iter);
-       bch2_btree_iter_unlock(&dst_iter);
-       bch2_btree_iter_unlock(&src_iter);
-
-       if (new_src != (void *) &delete)
-               kfree(new_src);
-       kfree(new_dst);
-       return ret;
+       bch2_trans_update(trans, src_iter, &new_src->k_i, 0);
+       bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0);
+       return 0;
+}
+
+int __bch2_dirent_delete(struct btree_trans *trans, u64 dir_inum,
+                        const struct bch_hash_info *hash_info,
+                        const struct qstr *name)
+{
+       return bch2_hash_delete(trans, bch2_dirent_hash_desc, hash_info,
+                               dir_inum, name);
 }
 
 int bch2_dirent_delete(struct bch_fs *c, u64 dir_inum,
@@ -356,28 +330,34 @@ int bch2_dirent_delete(struct bch_fs *c, u64 dir_inum,
                       const struct qstr *name,
                       u64 *journal_seq)
 {
-       return bch2_hash_delete(bch2_dirent_hash_desc, hash_info,
-                              c, dir_inum, journal_seq, name);
+       return bch2_trans_do(c, journal_seq,
+                            BTREE_INSERT_ATOMIC|
+                            BTREE_INSERT_NOFAIL,
+               __bch2_dirent_delete(&trans, dir_inum, hash_info, name));
 }
 
 u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum,
                       const struct bch_hash_info *hash_info,
                       const struct qstr *name)
 {
-       struct btree_iter iter;
+       struct btree_trans trans;
+       struct btree_iter *iter;
        struct bkey_s_c k;
-       u64 inum;
+       u64 inum = 0;
 
-       k = bch2_hash_lookup(bch2_dirent_hash_desc, hash_info, c,
-                           dir_inum, &iter, name);
-       if (IS_ERR(k.k)) {
-               bch2_btree_iter_unlock(&iter);
-               return 0;
+       bch2_trans_init(&trans, c);
+
+       iter = bch2_hash_lookup(&trans, bch2_dirent_hash_desc,
+                               hash_info, dir_inum, name, 0);
+       if (IS_ERR(iter)) {
+               BUG_ON(PTR_ERR(iter) == -EINTR);
+               goto out;
        }
 
+       k = bch2_btree_iter_peek_slot(iter);
        inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum);
-       bch2_btree_iter_unlock(&iter);
-
+out:
+       bch2_trans_exit(&trans);
        return inum;
 }
 
index 5d066af18f9533d6548c241b589f23c0bbc5537d..4d92ffba144ee13b345513c2887a5f60e0f352d1 100644 (file)
@@ -21,8 +21,16 @@ struct bch_hash_info;
 struct bch_inode_info;
 
 unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent);
+
+int __bch2_dirent_create(struct btree_trans *, u64,
+                        const struct bch_hash_info *, u8,
+                        const struct qstr *, u64, int);
 int bch2_dirent_create(struct bch_fs *c, u64, const struct bch_hash_info *,
                       u8, const struct qstr *, u64, u64 *, int);
+
+int __bch2_dirent_delete(struct btree_trans *, u64,
+                        const struct bch_hash_info *,
+                        const struct qstr *);
 int bch2_dirent_delete(struct bch_fs *, u64, const struct bch_hash_info *,
                       const struct qstr *, u64 *);
 
@@ -32,10 +40,10 @@ enum bch_rename_mode {
        BCH_RENAME_EXCHANGE,
 };
 
-int bch2_dirent_rename(struct bch_fs *,
+int bch2_dirent_rename(struct btree_trans *,
                       struct bch_inode_info *, const struct qstr *,
                       struct bch_inode_info *, const struct qstr *,
-                      u64 *, enum bch_rename_mode);
+                      enum bch_rename_mode);
 
 u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *,
                       const struct qstr *);
index 2a357fc33ef71d689ca7a7efff013d4f6c841e63..9505b6e6d3752d50ed29703ccef53b75429dc3aa 100644 (file)
@@ -131,8 +131,9 @@ print:
 
        mutex_unlock(&c->fsck_error_lock);
 
-       if (fix)
-               set_bit(BCH_FS_FSCK_FIXED_ERRORS, &c->flags);
+       set_bit(fix
+               ? BCH_FS_FSCK_FIXED_ERRORS
+               : BCH_FS_FSCK_UNFIXED_ERRORS, &c->flags);
 
        return fix                              ? FSCK_ERR_FIX
                : flags & FSCK_CAN_IGNORE       ? FSCK_ERR_IGNORE
index f65ef132461e78afdcce484f19d38bd37fb64d94..588e763f0440686d9af7999c1edb5df4aa132764 100644 (file)
@@ -147,12 +147,18 @@ void bch2_flush_fsck_errs(struct bch_fs *);
 #define need_fsck_err_on(cond, c, ...)                                 \
        __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)
 
+#define need_fsck_err(c, ...)                                          \
+       __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)
+
 #define mustfix_fsck_err(c, ...)                                       \
        __fsck_err(c, FSCK_CAN_FIX, ##__VA_ARGS__)
 
 #define mustfix_fsck_err_on(cond, c, ...)                              \
        __fsck_err_on(cond, c, FSCK_CAN_FIX, ##__VA_ARGS__)
 
+#define fsck_err(c, ...)                                               \
+       __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__)
+
 #define fsck_err_on(cond, c, ...)                                      \
        __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__)
 
index 9e78798a4d08de7f7b58eb3d52723fff80e6e8e6..e4d2b39e0d8271a69bf88795c99023ed497565f4 100644 (file)
@@ -193,7 +193,7 @@ static int __must_check bch2_write_inode_size(struct bch_fs *c,
                                              struct bch_inode_info *inode,
                                              loff_t new_size)
 {
-       return __bch2_write_inode(c, inode, inode_set_size, &new_size);
+       return __bch2_write_inode(c, inode, inode_set_size, &new_size, 0);
 }
 
 static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
@@ -259,7 +259,7 @@ static int i_sectors_dirty_finish(struct bch_fs *c, struct i_sectors_hook *h)
        mutex_lock(&h->inode->ei_update_lock);
        i_sectors_acct(c, h->inode, &h->quota_res, h->sectors);
 
-       ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_finish_fn, h);
+       ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_finish_fn, h, 0);
 
        if (!ret && h->new_i_size != U64_MAX)
                i_size_write(&h->inode->v, h->new_i_size);
@@ -289,7 +289,7 @@ static int i_sectors_dirty_start(struct bch_fs *c, struct i_sectors_hook *h)
        int ret;
 
        mutex_lock(&h->inode->ei_update_lock);
-       ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_start_fn, h);
+       ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_start_fn, h, 0);
        mutex_unlock(&h->inode->ei_update_lock);
 
        return ret;
@@ -390,7 +390,8 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
        struct bchfs_write_op *op = container_of(wop,
                                struct bchfs_write_op, op);
        struct keylist *keys = &op->op.insert_keys;
-       struct btree_iter extent_iter, inode_iter;
+       struct btree_trans trans;
+       struct btree_iter *extent_iter, *inode_iter = NULL;
        struct bchfs_extent_trans_hook hook;
        struct bkey_i *k = bch2_keylist_front(keys);
        s64 orig_sectors_added = op->sectors_added;
@@ -398,12 +399,13 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
 
        BUG_ON(k->k.p.inode != op->inode->v.i_ino);
 
-       bch2_btree_iter_init(&extent_iter, wop->c, BTREE_ID_EXTENTS,
-                            bkey_start_pos(&bch2_keylist_front(keys)->k),
-                            BTREE_ITER_INTENT);
-       bch2_btree_iter_init(&inode_iter, wop->c, BTREE_ID_INODES,
-                            POS(extent_iter.pos.inode, 0),
-                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+       bch2_trans_init(&trans, wop->c);
+
+       extent_iter = bch2_trans_get_iter(&trans,
+                               BTREE_ID_EXTENTS,
+                               bkey_start_pos(&bch2_keylist_front(keys)->k),
+                               BTREE_ITER_INTENT);
+       BUG_ON(IS_ERR(extent_iter));
 
        hook.op                 = op;
        hook.hook.fn            = bchfs_extent_update_hook;
@@ -416,19 +418,29 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
                    op->inode->ei_inode.bi_size)
                        hook.need_inode_update = true;
 
+               /* optimization for fewer transaction restarts: */
+               ret = bch2_btree_iter_traverse(extent_iter);
+               if (ret)
+                       goto err;
+
                if (hook.need_inode_update) {
                        struct bkey_s_c inode;
 
-                       if (!btree_iter_linked(&inode_iter))
-                               bch2_btree_iter_link(&extent_iter, &inode_iter);
+                       if (!inode_iter) {
+                               inode_iter = bch2_trans_get_iter(&trans,
+                                       BTREE_ID_INODES,
+                                       POS(extent_iter->pos.inode, 0),
+                                       BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+                               BUG_ON(IS_ERR(inode_iter));
+                       }
 
-                       inode = bch2_btree_iter_peek_slot(&inode_iter);
+                       inode = bch2_btree_iter_peek_slot(inode_iter);
                        if ((ret = btree_iter_err(inode)))
                                goto err;
 
                        if (WARN_ONCE(inode.k->type != BCH_INODE_FS,
                                      "inode %llu not found when updating",
-                                     extent_iter.pos.inode)) {
+                                     extent_iter->pos.inode)) {
                                ret = -ENOENT;
                                break;
                        }
@@ -436,7 +448,7 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
                        if (WARN_ONCE(bkey_bytes(inode.k) >
                                      sizeof(hook.inode_p),
                                      "inode %llu too big (%zu bytes, buf %zu)",
-                                     extent_iter.pos.inode,
+                                     extent_iter->pos.inode,
                                      bkey_bytes(inode.k),
                                      sizeof(hook.inode_p))) {
                                ret = -ENOENT;
@@ -448,7 +460,7 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
                                               &hook.inode_u);
                        if (WARN_ONCE(ret,
                                      "error %i unpacking inode %llu",
-                                     ret, extent_iter.pos.inode)) {
+                                     ret, extent_iter->pos.inode)) {
                                ret = -ENOENT;
                                break;
                        }
@@ -458,8 +470,8 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
                                        BTREE_INSERT_NOFAIL|
                                        BTREE_INSERT_ATOMIC|
                                        BTREE_INSERT_USE_RESERVE,
-                                       BTREE_INSERT_ENTRY(&extent_iter, k),
-                                       BTREE_INSERT_ENTRY_EXTRA_RES(&inode_iter,
+                                       BTREE_INSERT_ENTRY(extent_iter, k),
+                                       BTREE_INSERT_ENTRY_EXTRA_RES(inode_iter,
                                                        &hook.inode_p.inode.k_i, 2));
                } else {
                        ret = bch2_btree_insert_at(wop->c, &wop->res,
@@ -467,10 +479,10 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
                                        BTREE_INSERT_NOFAIL|
                                        BTREE_INSERT_ATOMIC|
                                        BTREE_INSERT_USE_RESERVE,
-                                       BTREE_INSERT_ENTRY(&extent_iter, k));
+                                       BTREE_INSERT_ENTRY(extent_iter, k));
                }
 
-               BUG_ON(bkey_cmp(extent_iter.pos, bkey_start_pos(&k->k)));
+               BUG_ON(bkey_cmp(extent_iter->pos, bkey_start_pos(&k->k)));
 
                if (WARN_ONCE(!ret != !k->k.size,
                              "ret %i k->size %u", ret, k->k.size))
@@ -481,12 +493,11 @@ err:
                if (ret)
                        break;
 
-               BUG_ON(bkey_cmp(extent_iter.pos, k->k.p) < 0);
+               BUG_ON(bkey_cmp(extent_iter->pos, k->k.p) < 0);
                bch2_keylist_pop_front(keys);
        } while (!bch2_keylist_empty(keys));
 
-       bch2_btree_iter_unlock(&extent_iter);
-       bch2_btree_iter_unlock(&inode_iter);
+       bch2_trans_exit(&trans);
 
        if (op->is_dio) {
                struct dio_write *dio = container_of(op, struct dio_write, iop);
@@ -2338,8 +2349,8 @@ static long bch2_fcollapse(struct bch_inode_info *inode,
 {
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct address_space *mapping = inode->v.i_mapping;
-       struct btree_iter src;
-       struct btree_iter dst;
+       struct btree_trans trans;
+       struct btree_iter *src, *dst;
        BKEY_PADDED(k) copy;
        struct bkey_s_c k;
        struct i_sectors_hook i_sectors_hook = i_sectors_hook_init(inode, 0);
@@ -2349,13 +2360,17 @@ static long bch2_fcollapse(struct bch_inode_info *inode,
        if ((offset | len) & (block_bytes(c) - 1))
                return -EINVAL;
 
-       bch2_btree_iter_init(&dst, c, BTREE_ID_EXTENTS,
+       bch2_trans_init(&trans, c);
+
+       dst = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
                             POS(inode->v.i_ino, offset >> 9),
                             BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+       BUG_ON(IS_ERR(dst));
+
        /* position will be set from dst iter's position: */
-       bch2_btree_iter_init(&src, c, BTREE_ID_EXTENTS, POS_MIN,
+       src = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN,
                             BTREE_ITER_SLOTS);
-       bch2_btree_iter_link(&src, &dst);
+       BUG_ON(IS_ERR(src));
 
        /*
         * We need i_mutex to keep the page cache consistent with the extents
@@ -2384,24 +2399,24 @@ static long bch2_fcollapse(struct bch_inode_info *inode,
        if (ret)
                goto err;
 
-       while (bkey_cmp(dst.pos,
+       while (bkey_cmp(dst->pos,
                        POS(inode->v.i_ino,
                            round_up(new_size, PAGE_SIZE) >> 9)) < 0) {
                struct disk_reservation disk_res;
 
-               bch2_btree_iter_set_pos(&src,
-                       POS(dst.pos.inode, dst.pos.offset + (len >> 9)));
+               bch2_btree_iter_set_pos(src,
+                       POS(dst->pos.inode, dst->pos.offset + (len >> 9)));
 
-               k = bch2_btree_iter_peek_slot(&src);
+               k = bch2_btree_iter_peek_slot(src);
                if ((ret = btree_iter_err(k)))
                        goto btree_iter_err;
 
                bkey_reassemble(&copy.k, k);
 
-               bch2_cut_front(src.pos, &copy.k);
+               bch2_cut_front(src->pos, &copy.k);
                copy.k.k.p.offset -= len >> 9;
 
-               BUG_ON(bkey_cmp(dst.pos, bkey_start_pos(&copy.k.k)));
+               BUG_ON(bkey_cmp(dst->pos, bkey_start_pos(&copy.k.k)));
 
                ret = bch2_disk_reservation_get(c, &disk_res, copy.k.k.size,
                                bch2_extent_nr_dirty_ptrs(bkey_i_to_s_c(&copy.k)),
@@ -2412,14 +2427,13 @@ static long bch2_fcollapse(struct bch_inode_info *inode,
                                           &inode->ei_journal_seq,
                                           BTREE_INSERT_ATOMIC|
                                           BTREE_INSERT_NOFAIL,
-                                          BTREE_INSERT_ENTRY(&dst, &copy.k));
+                                          BTREE_INSERT_ENTRY(dst, &copy.k));
                bch2_disk_reservation_put(c, &disk_res);
 btree_iter_err:
                if (ret == -EINTR)
                        ret = 0;
                if (ret) {
-                       bch2_btree_iter_unlock(&src);
-                       bch2_btree_iter_unlock(&dst);
+                       bch2_trans_exit(&trans);
                        goto err_put_sectors_dirty;
                }
                /*
@@ -2427,11 +2441,10 @@ btree_iter_err:
                 * pointers... which isn't a _super_ serious problem...
                 */
 
-               bch2_btree_iter_cond_resched(&src);
+               bch2_btree_iter_cond_resched(src);
        }
 
-       bch2_btree_iter_unlock(&src);
-       bch2_btree_iter_unlock(&dst);
+       bch2_trans_exit(&trans);
 
        ret = bch2_inode_truncate(c, inode->v.i_ino,
                                 round_up(new_size, block_bytes(c)) >> 9,
index 2c1ecf7732cd29a1568a1e9191ebf412e619ab1b..336dbd4ba8d6b8554328e6c6f01ff9754978779c 100644 (file)
@@ -87,6 +87,8 @@ void bch2_inode_flags_to_vfs(struct bch_inode_info *inode)
 struct flags_set {
+       /*
+        * NOTE(review): mask and flags are interpreted by
+        * bch2_inode_flags_set(), which receives this struct as its opaque
+        * argument; confirm their exact meaning against that function.
+        */
        unsigned                mask;
        unsigned                flags;
+
+       /*
+        * Project ID requested via bch2_ioc_fssetxattr(); copied into
+        * bi_project by fssetxattr_inode_update_fn().
+        */
+       unsigned                projid;
 };
 
 static int bch2_inode_flags_set(struct bch_inode_info *inode,
@@ -150,7 +152,7 @@ static int bch2_ioc_setflags(struct bch_fs *c,
        }
 
        mutex_lock(&inode->ei_update_lock);
-       ret = __bch2_write_inode(c, inode, bch2_inode_flags_set, &s);
+       ret = __bch2_write_inode(c, inode, bch2_inode_flags_set, &s, 0);
 
        if (!ret)
                bch2_inode_flags_to_vfs(inode);
@@ -185,9 +187,9 @@ static int bch2_set_projid(struct bch_fs *c,
 
        qid.q[QTYP_PRJ] = projid;
 
-       ret = bch2_quota_transfer(c, 1 << QTYP_PRJ, qid, inode->ei_qid,
-                                 inode->v.i_blocks +
-                                 inode->ei_quota_reserved);
+       return bch2_quota_transfer(c, 1 << QTYP_PRJ, qid, inode->ei_qid,
+                                  inode->v.i_blocks +
+                                  inode->ei_quota_reserved);
        if (ret)
                return ret;
 
@@ -195,6 +197,17 @@ static int bch2_set_projid(struct bch_fs *c,
        return 0;
 }
 
+/*
+ * Inode update callback used by bch2_ioc_fssetxattr().
+ *
+ * @p points at a struct flags_set: its projid is stored in the unpacked
+ * inode's bi_project, then the same struct is handed on to
+ * bch2_inode_flags_set(), whose return value is passed back to the caller.
+ */
+static int fssetxattr_inode_update_fn(struct bch_inode_info *inode,
+                                     struct bch_inode_unpacked *bi,
+                                     void *p)
+{
+       const struct flags_set *req = p;
+       unsigned projid = req->projid;
+
+       bi->bi_project = projid;
+       return bch2_inode_flags_set(inode, bi, p);
+}
+
 static int bch2_ioc_fssetxattr(struct bch_fs *c,
                               struct file *file,
                               struct bch_inode_info *inode,
@@ -211,6 +224,8 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c,
        if (fa.fsx_xflags)
                return -EOPNOTSUPP;
 
+       s.projid = fa.fsx_projid;
+
        ret = mnt_want_write_file(file);
        if (ret)
                return ret;
@@ -226,7 +241,7 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c,
        if (ret)
                goto err_unlock;
 
-       ret = __bch2_write_inode(c, inode, bch2_inode_flags_set, &s);
+       ret = __bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, 0);
        if (!ret)
                bch2_inode_flags_to_vfs(inode);
 err_unlock:
index 3b7f78e731b8c1381e62e91fd3bdb77ed0252dd4..c51a65da0fb38fc79d285913ee32c9149c6815e4 100644 (file)
@@ -34,6 +34,19 @@ static void bch2_vfs_inode_init(struct bch_fs *,
                                struct bch_inode_info *,
                                struct bch_inode_unpacked *);
 
+/*
+ * Raise dst->ei_journal_seq to @journal_seq if the stored value is currently
+ * smaller; a stored value that is already >= @journal_seq is left untouched,
+ * so this never moves the sequence number backwards.
+ *
+ * The update is done with a cmpxchg() retry loop rather than a plain store.
+ */
+static void journal_seq_copy(struct bch_inode_info *dst,
+                            u64 journal_seq)
+{
+       u64 old, v = READ_ONCE(dst->ei_journal_seq);
+
+       do {
+               old = v;
+
+               /* stored value is already at least as new: nothing to do */
+               if (old >= journal_seq)
+                       break;
+               /*
+                * cmpxchg() hands back the value it found; if that differs
+                * from old, go around again with the value actually seen:
+                */
+       } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old);
+}
+
 /*
  * I_SIZE_DIRTY requires special handling:
  *
@@ -62,127 +75,113 @@ static void bch2_vfs_inode_init(struct bch_fs *,
  * be set explicitly.
  */
 
-int __must_check __bch2_write_inode(struct bch_fs *c,
-                                   struct bch_inode_info *inode,
-                                   inode_set_fn set,
-                                   void *p)
+void bch2_inode_update_after_write(struct bch_fs *c,
+                                  struct bch_inode_info *inode,
+                                  struct bch_inode_unpacked *bi,
+                                  unsigned fields)
 {
-       struct btree_iter iter;
-       struct bch_inode_unpacked inode_u;
-       struct bkey_inode_buf inode_p;
+       set_nlink(&inode->v, bi->bi_flags & BCH_INODE_UNLINKED
+                 ? 0
+                 : bi->bi_nlink + nlink_bias(inode->v.i_mode));
+       i_uid_write(&inode->v, bi->bi_uid);
+       i_gid_write(&inode->v, bi->bi_gid);
+       inode->v.i_mode = bi->bi_mode;
+
+       if (fields & ATTR_ATIME)
+               inode->v.i_atime = bch2_time_to_timespec(c, bi->bi_atime);
+       if (fields & ATTR_MTIME)
+               inode->v.i_mtime = bch2_time_to_timespec(c, bi->bi_mtime);
+       if (fields & ATTR_CTIME)
+               inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime);
+
+       inode->ei_inode         = *bi;
+       inode->ei_qid           = bch_qid(bi);
+}
+
+int __must_check bch2_write_inode_trans(struct btree_trans *trans,
+                               struct bch_inode_info *inode,
+                               struct bch_inode_unpacked *inode_u,
+                               inode_set_fn set,
+                               void *p)
+{
+       struct btree_iter *iter;
+       struct bkey_inode_buf *inode_p;
+       struct bkey_s_c k;
        u64 inum = inode->v.i_ino;
-       unsigned i_nlink = READ_ONCE(inode->v.i_nlink);
        int ret;
 
-       /*
-        * We can't write an inode with i_nlink == 0 because it's stored biased;
-        * however, we don't need to because if i_nlink is 0 the inode is
-        * getting deleted when it's evicted.
-        */
-       if (!i_nlink)
-               return 0;
-
        lockdep_assert_held(&inode->ei_update_lock);
 
-       bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(inum, 0),
-                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+       iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(inum, 0),
+                                  BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+       if (IS_ERR(iter))
+               return PTR_ERR(iter);
 
-       do {
-               struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
+       k = bch2_btree_iter_peek_slot(iter);
+       if ((ret = btree_iter_err(k)))
+               return ret;
 
-               if ((ret = btree_iter_err(k)))
-                       goto out;
+       if (WARN_ONCE(k.k->type != BCH_INODE_FS,
+                     "inode %llu not found when updating", inum))
+               return -ENOENT;
 
-               if (WARN_ONCE(k.k->type != BCH_INODE_FS,
-                             "inode %llu not found when updating", inum)) {
-                       bch2_btree_iter_unlock(&iter);
-                       return -ENOENT;
-               }
+       ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode_u);
+       if (WARN_ONCE(ret,
+                     "error %i unpacking inode %llu", ret, inum))
+               return -ENOENT;
 
-               ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u);
-               if (WARN_ONCE(ret,
-                             "error %i unpacking inode %llu", ret, inum)) {
-                       ret = -ENOENT;
-                       break;
-               }
+       BUG_ON(inode_u->bi_size != inode->ei_inode.bi_size);
 
-               BUG_ON(inode_u.bi_size != inode->ei_inode.bi_size);
+       BUG_ON(inode_u->bi_size != inode->ei_inode.bi_size &&
+              !(inode_u->bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
+              inode_u->bi_size > i_size_read(&inode->v));
 
-               if (set) {
-                       ret = set(inode, &inode_u, p);
-                       if (ret)
-                               goto out;
-               }
-
-               BUG_ON(i_nlink < nlink_bias(inode->v.i_mode));
-
-               BUG_ON(inode_u.bi_size != inode->ei_inode.bi_size &&
-                      !(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
-                      inode_u.bi_size > i_size_read(&inode->v));
-
-               inode_u.bi_mode = inode->v.i_mode;
-               inode_u.bi_uid  = i_uid_read(&inode->v);
-               inode_u.bi_gid  = i_gid_read(&inode->v);
-               inode_u.bi_project = inode->ei_qid.q[QTYP_PRJ];
-               inode_u.bi_nlink= i_nlink - nlink_bias(inode->v.i_mode);
-               inode_u.bi_dev  = inode->v.i_rdev;
-               inode_u.bi_atime= timespec_to_bch2_time(c, inode->v.i_atime);
-               inode_u.bi_mtime= timespec_to_bch2_time(c, inode->v.i_mtime);
-               inode_u.bi_ctime= timespec_to_bch2_time(c, inode->v.i_ctime);
-
-               bch2_inode_pack(&inode_p, &inode_u);
-
-               ret = bch2_btree_insert_at(c, NULL, NULL,
-                               &inode->ei_journal_seq,
-                               BTREE_INSERT_ATOMIC|
-                               BTREE_INSERT_NOUNLOCK|
-                               BTREE_INSERT_NOFAIL,
-                               BTREE_INSERT_ENTRY(&iter, &inode_p.inode.k_i));
-       } while (ret == -EINTR);
-
-       if (!ret) {
-               /*
-                * the btree node lock protects inode->ei_inode, not
-                * ei_update_lock; this is important for inode updates via
-                * bchfs_write_index_update
-                */
-               inode->ei_inode = inode_u;
-               inode->ei_qid   = bch_qid(&inode_u);
+       if (set) {
+               ret = set(inode, inode_u, p);
+               if (ret)
+                       return ret;
        }
-out:
-       bch2_btree_iter_unlock(&iter);
 
-       return ret < 0 ? ret : 0;
-}
+       inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
+       if (IS_ERR(inode_p))
+               return PTR_ERR(inode_p);
 
-int __must_check bch2_write_inode(struct bch_fs *c,
-                                 struct bch_inode_info *inode)
-{
-       return __bch2_write_inode(c, inode, NULL, NULL);
+       bch2_inode_pack(inode_p, inode_u);
+       bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
+       return 0;
 }
 
-static int bch2_inc_nlink(struct bch_fs *c, struct bch_inode_info *inode)
+int __must_check __bch2_write_inode(struct bch_fs *c,
+                                   struct bch_inode_info *inode,
+                                   inode_set_fn set,
+                                   void *p, unsigned fields)
 {
+       struct btree_trans trans;
+       struct bch_inode_unpacked inode_u;
        int ret;
 
-       mutex_lock(&inode->ei_update_lock);
-       inc_nlink(&inode->v);
-       ret = bch2_write_inode(c, inode);
-       mutex_unlock(&inode->ei_update_lock);
-
-       return ret;
-}
+       bch2_trans_init(&trans, c);
+retry:
+       bch2_trans_begin(&trans);
 
-static int bch2_dec_nlink(struct bch_fs *c, struct bch_inode_info *inode)
-{
-       int ret = 0;
+       ret = bch2_write_inode_trans(&trans, inode, &inode_u, set, p) ?:
+               bch2_trans_commit(&trans, NULL, NULL,
+                                 &inode->ei_journal_seq,
+                                 BTREE_INSERT_ATOMIC|
+                                 BTREE_INSERT_NOUNLOCK|
+                                 BTREE_INSERT_NOFAIL);
+       if (ret == -EINTR)
+               goto retry;
 
-       mutex_lock(&inode->ei_update_lock);
-       drop_nlink(&inode->v);
-       ret = bch2_write_inode(c, inode);
-       mutex_unlock(&inode->ei_update_lock);
+       /*
+        * the btree node lock protects inode->ei_inode, not ei_update_lock;
+        * this is important for inode updates via bchfs_write_index_update
+        */
+       if (!ret)
+               bch2_inode_update_after_write(c, inode, &inode_u, fields);
 
-       return ret;
+       bch2_trans_exit(&trans);
+       return ret < 0 ? ret : 0;
 }
 
 static struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum)
@@ -212,125 +211,173 @@ static struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum)
        return &inode->v;
 }
 
-static struct bch_inode_info *bch2_vfs_inode_create(struct bch_fs *c,
-                                                   struct bch_inode_info *dir,
-                                                   umode_t mode, dev_t rdev)
+static void bch2_inode_init_owner(struct bch_inode_unpacked *inode_u,
+                                 const struct inode *dir, umode_t mode)
 {
-       struct posix_acl *default_acl = NULL, *acl = NULL;
-       struct bch_inode_info *inode;
-       struct bch_inode_unpacked inode_u;
-       int ret;
+       kuid_t uid = current_fsuid();
+       kgid_t gid;
+
+       if (dir && dir->i_mode & S_ISGID) {
+               gid = dir->i_gid;
+               if (S_ISDIR(mode))
+                       mode |= S_ISGID;
+       } else
+               gid = current_fsgid();
+
+       inode_u->bi_uid         = from_kuid(dir->i_sb->s_user_ns, uid);
+       inode_u->bi_gid         = from_kgid(dir->i_sb->s_user_ns, gid);
+       inode_u->bi_mode        = mode;
+}
 
-       inode = to_bch_ei(new_inode(c->vfs_sb));
-       if (unlikely(!inode))
-               return ERR_PTR(-ENOMEM);
+static int inode_update_for_create_fn(struct bch_inode_info *inode,
+                                     struct bch_inode_unpacked *bi,
+                                     void *p)
+{
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch_inode_unpacked *new_inode = p;
+       struct timespec now = current_time(&inode->v);
 
-       inode_init_owner(&inode->v, &dir->v, mode);
+       bi->bi_mtime = bi->bi_ctime = timespec_to_bch2_time(c, now);
 
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
-       ret = posix_acl_create(&dir->v, &inode->v.i_mode, &default_acl, &acl);
-       if (ret)
-               goto err_make_bad;
-#endif
+       if (S_ISDIR(new_inode->bi_mode))
+               bi->bi_nlink++;
 
-       bch2_inode_init(c, &inode_u,
-                       i_uid_read(&inode->v),
-                       i_gid_read(&inode->v),
-                       inode->v.i_mode, rdev,
-                       &dir->ei_inode);
+       return 0;
+}
+
+static struct bch_inode_info *
+__bch2_create(struct bch_inode_info *dir, struct dentry *dentry,
+             umode_t mode, dev_t rdev, bool tmpfile)
+{
+       struct bch_fs *c = dir->v.i_sb->s_fs_info;
+       struct btree_trans trans;
+       struct bch_inode_unpacked dir_u;
+       struct bch_inode_info *inode, *old;
+       struct bch_inode_unpacked inode_u;
+       struct bch_hash_info hash_info;
+       struct posix_acl *default_acl = NULL, *acl = NULL;
+       int ret;
+
+       bch2_inode_init(c, &inode_u, 0, 0, 0, rdev, &dir->ei_inode);
+       bch2_inode_init_owner(&inode_u, &dir->v, mode);
 
        inode_u.bi_project = dir->ei_qid.q[QTYP_PRJ];
 
+       hash_info = bch2_hash_info_init(c, &inode_u);
+
+       if (tmpfile)
+               inode_u.bi_flags |= BCH_INODE_UNLINKED;
+
        ret = bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, BCH_QUOTA_PREALLOC);
        if (ret)
-               goto err_make_bad;
+               return ERR_PTR(ret);
 
-       ret = bch2_inode_create(c, &inode_u,
-                               BLOCKDEV_INODE_MAX, 0,
-                               &c->unused_inode_hint);
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+       ret = posix_acl_create(&dir->v, &inode_u.bi_mode, &default_acl, &acl);
+       if (ret)
+               goto err;
+#endif
+
+       /*
+        * preallocate vfs inode before btree transaction, so that nothing can
+        * fail after the transaction succeeds:
+        */
+       inode = to_bch_ei(new_inode(c->vfs_sb));
+       if (unlikely(!inode)) {
+               ret = -ENOMEM;
+               goto err;
+       }
+
+       bch2_trans_init(&trans, c);
+retry:
+       bch2_trans_begin(&trans);
+
+       ret   = __bch2_inode_create(&trans, &inode_u,
+                                   BLOCKDEV_INODE_MAX, 0,
+                                   &c->unused_inode_hint) ?:
+               (default_acl
+                ? bch2_set_acl_trans(&trans, &inode_u, &hash_info,
+                                     default_acl, ACL_TYPE_DEFAULT)
+                : 0) ?:
+               (acl
+                ? bch2_set_acl_trans(&trans, &inode_u, &hash_info,
+                                     acl, ACL_TYPE_ACCESS)
+                : 0) ?:
+               (!tmpfile
+                ? __bch2_dirent_create(&trans, dir->v.i_ino,
+                                       &dir->ei_str_hash,
+                                       mode_to_type(mode),
+                                       &dentry->d_name,
+                                       inode_u.bi_inum,
+                                       BCH_HASH_SET_MUST_CREATE)
+               : 0) ?:
+               (!tmpfile
+                ? bch2_write_inode_trans(&trans, dir, &dir_u,
+                                         inode_update_for_create_fn,
+                                         &inode_u)
+                : 0) ?:
+               bch2_trans_commit(&trans, NULL, NULL,
+                                 &inode->ei_journal_seq,
+                                 BTREE_INSERT_ATOMIC|
+                                 BTREE_INSERT_NOUNLOCK);
+       if (ret == -EINTR)
+               goto retry;
        if (unlikely(ret))
-               goto err_acct_quota;
+               goto err_trans;
 
-       bch2_vfs_inode_init(c, inode, &inode_u);
        atomic_long_inc(&c->nr_inodes);
 
-       if (default_acl) {
-               ret = __bch2_set_acl(&inode->v, default_acl, ACL_TYPE_DEFAULT);
-               if (unlikely(ret))
-                       goto err;
+       if (!tmpfile) {
+               bch2_inode_update_after_write(c, dir, &dir_u,
+                                             ATTR_MTIME|ATTR_CTIME);
+               journal_seq_copy(dir, inode->ei_journal_seq);
        }
 
-       if (acl) {
-               ret = __bch2_set_acl(&inode->v, acl, ACL_TYPE_ACCESS);
-               if (unlikely(ret))
-                       goto err;
+       bch2_vfs_inode_init(c, inode, &inode_u);
+
+       set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
+       set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
+
+       /*
+        * we must insert the new inode into the inode cache before calling
+        * bch2_trans_exit() and dropping locks, else we could race with another
+        * thread pulling the inode in and modifying it:
+        */
+
+       old = to_bch_ei(insert_inode_locked2(&inode->v));
+       if (unlikely(old)) {
+               /*
+                * We raced, another process pulled the new inode into cache
+                * before us:
+                */
+               old->ei_journal_seq = inode->ei_journal_seq;
+               make_bad_inode(&inode->v);
+               iput(&inode->v);
+
+               inode = old;
+       } else {
+               /*
+                * we really don't want insert_inode_locked2() to be setting
+                * I_NEW...
+                */
+               unlock_new_inode(&inode->v);
        }
 
-       insert_inode_hash(&inode->v);
+       bch2_trans_exit(&trans);
 out:
        posix_acl_release(default_acl);
        posix_acl_release(acl);
        return inode;
-err_acct_quota:
-       bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, BCH_QUOTA_WARN);
-err_make_bad:
-       /*
-        * indicate to bch_evict_inode that the inode was never actually
-        * created:
-        */
+err_trans:
+       bch2_trans_exit(&trans);
        make_bad_inode(&inode->v);
-err:
-       clear_nlink(&inode->v);
        iput(&inode->v);
+err:
+       bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, BCH_QUOTA_WARN);
        inode = ERR_PTR(ret);
        goto out;
 }
 
-static int bch2_vfs_dirent_create(struct bch_fs *c,
-                                 struct bch_inode_info *dir,
-                                 u8 type, const struct qstr *name,
-                                 u64 dst)
-{
-       int ret;
-
-       ret = bch2_dirent_create(c, dir->v.i_ino, &dir->ei_str_hash,
-                               type, name, dst,
-                               &dir->ei_journal_seq,
-                               BCH_HASH_SET_MUST_CREATE);
-       if (unlikely(ret))
-               return ret;
-
-       dir->v.i_mtime = dir->v.i_ctime = current_time(&dir->v);
-       mark_inode_dirty_sync(&dir->v);
-       return 0;
-}
-
-static int __bch2_create(struct bch_inode_info *dir, struct dentry *dentry,
-                        umode_t mode, dev_t rdev)
-{
-       struct bch_fs *c = dir->v.i_sb->s_fs_info;
-       struct bch_inode_info *inode;
-       int ret;
-
-       inode = bch2_vfs_inode_create(c, dir, mode, rdev);
-       if (unlikely(IS_ERR(inode)))
-               return PTR_ERR(inode);
-
-       ret = bch2_vfs_dirent_create(c, dir, mode_to_type(mode),
-                                    &dentry->d_name, inode->v.i_ino);
-       if (unlikely(ret)) {
-               clear_nlink(&inode->v);
-               iput(&inode->v);
-               return ret;
-       }
-
-       if (dir->ei_journal_seq > inode->ei_journal_seq)
-               inode->ei_journal_seq = dir->ei_journal_seq;
-
-       d_instantiate(dentry, &inode->v);
-       return 0;
-}
-
 /* methods */
 
 static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
@@ -354,7 +401,70 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
 static int bch2_create(struct inode *vdir, struct dentry *dentry,
                       umode_t mode, bool excl)
 {
-       return __bch2_create(to_bch_ei(vdir), dentry, mode|S_IFREG, 0);
+       struct bch_inode_info *inode =
+               __bch2_create(to_bch_ei(vdir), dentry, mode|S_IFREG, 0, false);
+
+       if (IS_ERR(inode))
+               return PTR_ERR(inode);
+
+       d_instantiate(dentry, &inode->v);
+       return 0;
+}
+
+/*
+ * Inode update callback used by __bch2_link() for the inode gaining a link.
+ *
+ * Stamps bi_ctime with the current time, then accounts for the new link:
+ * an inode flagged BCH_INODE_UNLINKED just has that flag cleared, any other
+ * inode gets bi_nlink incremented. @p is unused. Always returns 0.
+ */
+static int inode_update_for_link_fn(struct bch_inode_info *inode,
+                                   struct bch_inode_unpacked *bi,
+                                   void *p)
+{
+       struct bch_fs *fs = inode->v.i_sb->s_fs_info;
+
+       bi->bi_ctime = timespec_to_bch2_time(fs, current_time(&inode->v));
+
+       if (!(bi->bi_flags & BCH_INODE_UNLINKED)) {
+               bi->bi_nlink++;
+               return 0;
+       }
+
+       bi->bi_flags &= ~BCH_INODE_UNLINKED;
+       return 0;
+}
+
+/*
+ * Create a new directory entry in @dir, named by @dentry, that points at
+ * @inode, and update @inode for the extra link, all in one btree
+ * transaction.
+ *
+ * The caller must hold inode->v.i_rwsem. The transaction is restarted from
+ * the top whenever a step returns -EINTR. On success the VFS inode is
+ * refreshed from the freshly written inode (ctime included).
+ *
+ * Returns 0 on success, or the error from the first step that failed.
+ */
+static int __bch2_link(struct bch_fs *c,
+                      struct bch_inode_info *inode,
+                      struct bch_inode_info *dir,
+                      struct dentry *dentry)
+{
+       struct btree_trans trans;
+       struct bch_inode_unpacked inode_u;
+       int ret;
+
+       lockdep_assert_held(&inode->v.i_rwsem);
+
+       /*
+        * bch2_write_inode_trans() asserts that ei_update_lock is held on the
+        * inode it writes, so hold it across the whole transaction:
+        */
+       mutex_lock(&inode->ei_update_lock);
+
+       bch2_trans_init(&trans, c);
+retry:
+       bch2_trans_begin(&trans);
+
+       ret   = __bch2_dirent_create(&trans, dir->v.i_ino,
+                                    &dir->ei_str_hash,
+                                    mode_to_type(inode->v.i_mode),
+                                    &dentry->d_name,
+                                    inode->v.i_ino,
+                                    BCH_HASH_SET_MUST_CREATE) ?:
+               bch2_write_inode_trans(&trans, inode, &inode_u,
+                                      inode_update_for_link_fn,
+                                      NULL) ?:
+               bch2_trans_commit(&trans, NULL, NULL,
+                                 &inode->ei_journal_seq,
+                                 BTREE_INSERT_ATOMIC|
+                                 BTREE_INSERT_NOUNLOCK);
+
+       if (ret == -EINTR)
+               goto retry;
+
+       if (likely(!ret))
+               bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME);
+
+       bch2_trans_exit(&trans);
+       mutex_unlock(&inode->ei_update_lock);
+       return ret;
 }
 
 static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
@@ -365,25 +475,43 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
        struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
        int ret;
 
-       lockdep_assert_held(&inode->v.i_rwsem);
-
-       inode->v.i_ctime = current_time(&dir->v);
-
-       ret = bch2_inc_nlink(c, inode);
-       if (ret)
+       ret = __bch2_link(c, inode, dir, dentry);
+       if (unlikely(ret))
                return ret;
 
        ihold(&inode->v);
+       d_instantiate(dentry, &inode->v);
+       return 0;
+}
 
-       ret = bch2_vfs_dirent_create(c, dir, mode_to_type(inode->v.i_mode),
-                                    &dentry->d_name, inode->v.i_ino);
-       if (unlikely(ret)) {
-               bch2_dec_nlink(c, inode);
-               iput(&inode->v);
-               return ret;
-       }
+/*
+ * Inode update callback used by bch2_unlink() for the parent directory.
+ *
+ * @p is the bch_inode_info of the entry being removed. The directory's
+ * bi_ctime and bi_mtime are both set to the current time, and its bi_nlink
+ * is decremented when the removed entry is itself a directory. Always
+ * returns 0.
+ */
+static int inode_update_dir_for_unlink_fn(struct bch_inode_info *inode,
+                                         struct bch_inode_unpacked *bi,
+                                         void *p)
+{
+       struct bch_fs *fs = inode->v.i_sb->s_fs_info;
+       struct bch_inode_info *victim = p;
+
+       bi->bi_ctime = timespec_to_bch2_time(fs, current_time(&inode->v));
+       bi->bi_mtime = bi->bi_ctime;
+
+       if (S_ISDIR(victim->v.i_mode))
+               bi->bi_nlink--;
+
+       return 0;
+}
+
+static int inode_update_for_unlink_fn(struct bch_inode_info *inode,
+                                     struct bch_inode_unpacked *bi,
+                                     void *p)
+{
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct timespec now = current_time(&inode->v);
+
+       bi->bi_ctime = timespec_to_bch2_time(c, now);
+       if (bi->bi_nlink)
+               bi->bi_nlink--;
+       else
+               bi->bi_flags |= BCH_INODE_UNLINKED;
 
-       d_instantiate(dentry, &inode->v);
        return 0;
 }
 
@@ -392,28 +520,44 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
        struct bch_fs *c = vdir->i_sb->s_fs_info;
        struct bch_inode_info *dir = to_bch_ei(vdir);
        struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
+       struct bch_inode_unpacked dir_u, inode_u;
+       struct btree_trans trans;
        int ret;
 
-       lockdep_assert_held(&inode->v.i_rwsem);
-
-       ret = bch2_dirent_delete(c, dir->v.i_ino, &dir->ei_str_hash,
-                                &dentry->d_name, &dir->ei_journal_seq);
+       bch2_trans_init(&trans, c);
+retry:
+       bch2_trans_begin(&trans);
+
+       ret   = __bch2_dirent_delete(&trans, dir->v.i_ino,
+                                    &dir->ei_str_hash,
+                                    &dentry->d_name) ?:
+               bch2_write_inode_trans(&trans, dir, &dir_u,
+                                      inode_update_dir_for_unlink_fn,
+                                      inode) ?:
+               bch2_write_inode_trans(&trans, inode, &inode_u,
+                                      inode_update_for_unlink_fn,
+                                      NULL) ?:
+               bch2_trans_commit(&trans, NULL, NULL,
+                                 &dir->ei_journal_seq,
+                                 BTREE_INSERT_ATOMIC|
+                                 BTREE_INSERT_NOUNLOCK|
+                                 BTREE_INSERT_NOFAIL);
+       if (ret == -EINTR)
+               goto retry;
        if (ret)
-               return ret;
+               goto err;
 
        if (dir->ei_journal_seq > inode->ei_journal_seq)
                inode->ei_journal_seq = dir->ei_journal_seq;
 
-       inode->v.i_ctime = dir->v.i_ctime;
-
-       if (S_ISDIR(inode->v.i_mode)) {
-               bch2_dec_nlink(c, dir);
-               drop_nlink(&inode->v);
-       }
-
-       bch2_dec_nlink(c, inode);
+       bch2_inode_update_after_write(c, dir, &dir_u,
+                                     ATTR_MTIME|ATTR_CTIME);
+       bch2_inode_update_after_write(c, inode, &inode_u,
+                                     ATTR_MTIME);
+err:
+       bch2_trans_exit(&trans);
 
-       return 0;
+       return ret;
 }
 
 static int bch2_symlink(struct inode *vdir, struct dentry *dentry,
@@ -423,7 +567,7 @@ static int bch2_symlink(struct inode *vdir, struct dentry *dentry,
        struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
        int ret;
 
-       inode = bch2_vfs_inode_create(c, dir, S_IFLNK|S_IRWXUGO, 0);
+       inode = __bch2_create(dir, dentry, S_IFLNK|S_IRWXUGO, 0, true);
        if (unlikely(IS_ERR(inode)))
                return PTR_ERR(inode);
 
@@ -438,37 +582,28 @@ static int bch2_symlink(struct inode *vdir, struct dentry *dentry,
        if (unlikely(ret))
                goto err;
 
-       /* XXX: racy */
-       if (dir->ei_journal_seq < inode->ei_journal_seq)
-               dir->ei_journal_seq = inode->ei_journal_seq;
+       journal_seq_copy(dir, inode->ei_journal_seq);
 
-       ret = bch2_vfs_dirent_create(c, dir, DT_LNK, &dentry->d_name,
-                                    inode->v.i_ino);
+       ret = __bch2_link(c, inode, dir, dentry);
        if (unlikely(ret))
                goto err;
 
        d_instantiate(dentry, &inode->v);
        return 0;
 err:
-       clear_nlink(&inode->v);
        iput(&inode->v);
        return ret;
 }
 
 static int bch2_mkdir(struct inode *vdir, struct dentry *dentry, umode_t mode)
 {
-       struct bch_fs *c = vdir->i_sb->s_fs_info;
-       struct bch_inode_info *dir = to_bch_ei(vdir);
-       int ret;
-
-       lockdep_assert_held(&dir->v.i_rwsem);
-
-       ret = __bch2_create(dir, dentry, mode|S_IFDIR, 0);
-       if (unlikely(ret))
-               return ret;
+       struct bch_inode_info *inode =
+               __bch2_create(to_bch_ei(vdir), dentry, mode|S_IFDIR, 0, false); /* last arg: bch2_tmpfile/bch2_symlink pass true - TODO confirm meaning */
 
-       bch2_inc_nlink(c, dir);
+       if (IS_ERR(inode))
+               return PTR_ERR(inode);  /* __bch2_create now returns the inode or an ERR_PTR */
 
+       d_instantiate(dentry, &inode->v);       /* attach the new inode to the dentry */
        return 0;
 }
 
@@ -485,151 +620,197 @@ static int bch2_rmdir(struct inode *vdir, struct dentry *dentry)
 static int bch2_mknod(struct inode *vdir, struct dentry *dentry,
                       umode_t mode, dev_t rdev)
 {
-       return __bch2_create(to_bch_ei(vdir), dentry, mode, rdev);
-}
-
-static int bch2_rename(struct bch_fs *c,
-                      struct bch_inode_info *old_dir,
-                      struct dentry *old_dentry,
-                      struct bch_inode_info *new_dir,
-                      struct dentry *new_dentry)
-{
-       struct bch_inode_info *old_inode = to_bch_ei(old_dentry->d_inode);
-       struct bch_inode_info *new_inode = to_bch_ei(new_dentry->d_inode);
-       struct timespec now = current_time(&old_dir->v);
-       int ret;
-
-       lockdep_assert_held(&old_dir->v.i_rwsem);
-       lockdep_assert_held(&new_dir->v.i_rwsem);
+       struct bch_inode_info *inode =
+               __bch2_create(to_bch_ei(vdir), dentry, mode, rdev, false);      /* last arg: bch2_tmpfile/bch2_symlink pass true - TODO confirm meaning */
 
-       if (new_inode)
-               filemap_write_and_wait_range(old_inode->v.i_mapping,
-                                            0, LLONG_MAX);
-
-       if (new_inode && S_ISDIR(old_inode->v.i_mode)) {
-               lockdep_assert_held(&new_inode->v.i_rwsem);
+       if (IS_ERR(inode))
+               return PTR_ERR(inode);  /* __bch2_create now returns the inode or an ERR_PTR */
 
-               if (!S_ISDIR(new_inode->v.i_mode))
-                       return -ENOTDIR;
+       d_instantiate(dentry, &inode->v);       /* attach the new inode to the dentry */
+       return 0;
+}
 
-               if (bch2_empty_dir(c, new_inode->v.i_ino))
-                       return -ENOTEMPTY;
+struct rename_info {   /* argument bundle handed to inode_update_for_rename_fn() via its void *p */
+       u64                     now;            /* timestamp written to bi_ctime/bi_mtime */
+       struct bch_inode_info   *src_dir;       /* directory holding the source name */
+       struct bch_inode_info   *dst_dir;       /* directory holding the destination name */
+       struct bch_inode_info   *src_inode;     /* inode being renamed */
+       struct bch_inode_info   *dst_inode;     /* inode at the destination name; tested for NULL by users */
+       enum bch_rename_mode    mode;           /* BCH_RENAME, BCH_RENAME_OVERWRITE or BCH_RENAME_EXCHANGE */
+};
 
-               ret = bch2_dirent_rename(c,
-                               old_dir, &old_dentry->d_name,
-                               new_dir, &new_dentry->d_name,
-                               &old_inode->ei_journal_seq, BCH_RENAME_OVERWRITE);
-               if (unlikely(ret))
-                       return ret;
+static int inode_update_for_rename_fn(struct bch_inode_info *inode,
+                                     struct bch_inode_unpacked *bi,
+                                     void *p)   /* p: struct rename_info describing the whole rename */
+{      /* one callback for all inodes in a rename; branches on which role @inode plays */
+       struct rename_info *info = p;
 
-               clear_nlink(&new_inode->v);
-               bch2_dec_nlink(c, old_dir);
-       } else if (new_inode) {
-               lockdep_assert_held(&new_inode->v.i_rwsem);
+       if (inode == info->src_dir) {   /* source dir: loses src if it is a directory, gains a directory dst on exchange */
+               bi->bi_nlink -= S_ISDIR(info->src_inode->v.i_mode);
+               bi->bi_nlink += info->dst_inode &&
+                       S_ISDIR(info->dst_inode->v.i_mode) &&
+                       info->mode == BCH_RENAME_EXCHANGE;
+       }
 
-               ret = bch2_dirent_rename(c,
-                               old_dir, &old_dentry->d_name,
-                               new_dir, &new_dentry->d_name,
-                               &old_inode->ei_journal_seq, BCH_RENAME_OVERWRITE);
-               if (unlikely(ret))
-                       return ret;
+       if (inode == info->dst_dir) {   /* destination dir: gains src if it is a directory, loses a directory dst */
+               bi->bi_nlink += S_ISDIR(info->src_inode->v.i_mode);
+               bi->bi_nlink -= info->dst_inode &&
+                       S_ISDIR(info->dst_inode->v.i_mode);
+       }
 
-               new_inode->v.i_ctime = now;
-               bch2_dec_nlink(c, new_inode);
-       } else if (S_ISDIR(old_inode->v.i_mode)) {
-               ret = bch2_dirent_rename(c,
-                               old_dir, &old_dentry->d_name,
-                               new_dir, &new_dentry->d_name,
-                               &old_inode->ei_journal_seq, BCH_RENAME);
-               if (unlikely(ret))
-                       return ret;
+       if (inode == info->dst_inode &&
+           info->mode == BCH_RENAME_OVERWRITE) {       /* overwritten target loses a link */
+               BUG_ON(bi->bi_nlink &&
+                      S_ISDIR(info->dst_inode->v.i_mode));     /* an overwritten directory must have a stored nlink of 0 */
 
-               bch2_inc_nlink(c, new_dir);
-               bch2_dec_nlink(c, old_dir);
-       } else {
-               ret = bch2_dirent_rename(c,
-                               old_dir, &old_dentry->d_name,
-                               new_dir, &new_dentry->d_name,
-                               &old_inode->ei_journal_seq, BCH_RENAME);
-               if (unlikely(ret))
-                       return ret;
+               if (bi->bi_nlink)
+                       bi->bi_nlink--;
+               else
+                       bi->bi_flags |= BCH_INODE_UNLINKED;     /* stored count already 0: flag the inode instead */
        }
 
-       old_dir->v.i_ctime = old_dir->v.i_mtime = now;
-       new_dir->v.i_ctime = new_dir->v.i_mtime = now;
-       mark_inode_dirty_sync(&old_dir->v);
-       mark_inode_dirty_sync(&new_dir->v);
-
-       old_inode->v.i_ctime = now;
-       mark_inode_dirty_sync(&old_inode->v);
+       if (inode == info->src_dir ||
+           inode == info->dst_dir)
+               bi->bi_mtime = info->now;       /* only the two directories get a new mtime */
+       bi->bi_ctime = info->now;       /* every inode passed through here gets a new ctime */
 
        return 0;
 }
 
-static int bch2_rename_exchange(struct bch_fs *c,
-                               struct bch_inode_info *old_dir,
-                               struct dentry *old_dentry,
-                               struct bch_inode_info *new_dir,
-                               struct dentry *new_dentry)
+static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry,
+                       struct inode *dst_vdir, struct dentry *dst_dentry,
+                       unsigned flags) /* flags: only RENAME_NOREPLACE and RENAME_EXCHANGE are accepted */
 {
-       struct bch_inode_info *old_inode = to_bch_ei(old_dentry->d_inode);
-       struct bch_inode_info *new_inode = to_bch_ei(new_dentry->d_inode);
-       struct timespec now = current_time(&old_dir->v);
+       struct bch_fs *c = src_vdir->i_sb->s_fs_info;
+       struct rename_info i = {        /* shared argument for every inode_update_for_rename_fn() call below */
+               .now            = timespec_to_bch2_time(c,
+                                               current_time(src_vdir)),
+               .src_dir        = to_bch_ei(src_vdir),
+               .dst_dir        = to_bch_ei(dst_vdir),
+               .src_inode      = to_bch_ei(src_dentry->d_inode),
+               .dst_inode      = to_bch_ei(dst_dentry->d_inode),
+               .mode           = flags & RENAME_EXCHANGE
+                               ? BCH_RENAME_EXCHANGE
+                       : dst_dentry->d_inode
+                               ? BCH_RENAME_OVERWRITE : BCH_RENAME,
+       };
+       struct btree_trans trans;
+       struct bch_inode_unpacked dst_dir_u, src_dir_u;
+       struct bch_inode_unpacked src_inode_u, dst_inode_u;
+       u64 journal_seq = 0;
        int ret;
 
-       ret = bch2_dirent_rename(c,
-                                old_dir, &old_dentry->d_name,
-                                new_dir, &new_dentry->d_name,
-                                &old_inode->ei_journal_seq, BCH_RENAME_EXCHANGE);
-       if (unlikely(ret))
-               return ret;
+       if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
+               return -EINVAL;
 
-       if (S_ISDIR(old_inode->v.i_mode) !=
-           S_ISDIR(new_inode->v.i_mode)) {
-               if (S_ISDIR(old_inode->v.i_mode)) {
-                       bch2_inc_nlink(c, new_dir);
-                       bch2_dec_nlink(c, old_dir);
-               } else {
-                       bch2_dec_nlink(c, new_dir);
-                       bch2_inc_nlink(c, old_dir);
-               }
+       if (i.mode == BCH_RENAME_OVERWRITE) {   /* checks done before any transaction is started */
+               if (S_ISDIR(i.src_inode->v.i_mode) !=
+                   S_ISDIR(i.dst_inode->v.i_mode))
+                       return -ENOTDIR;
+
+               if (S_ISDIR(i.src_inode->v.i_mode) &&
+                   bch2_empty_dir(c, i.dst_inode->v.i_ino))
+                       return -ENOTEMPTY;
+
+               ret = filemap_write_and_wait_range(i.src_inode->v.i_mapping,
+                                                  0, LLONG_MAX);
+               if (ret)
+                       return ret;
        }
 
-       old_dir->v.i_ctime = old_dir->v.i_mtime = now;
-       new_dir->v.i_ctime = new_dir->v.i_mtime = now;
-       mark_inode_dirty_sync(&old_dir->v);
-       mark_inode_dirty_sync(&new_dir->v);
+       bch2_trans_init(&trans, c);
+retry:
+       bch2_trans_begin(&trans);
+       i.now = timespec_to_bch2_time(c, current_time(src_vdir));       /* re-read the clock on every attempt */
+
+       ret   = bch2_dirent_rename(&trans,
+                                  i.src_dir, &src_dentry->d_name,
+                                  i.dst_dir, &dst_dentry->d_name,
+                                  i.mode) ?:
+               bch2_write_inode_trans(&trans, i.src_dir, &src_dir_u,
+                                      inode_update_for_rename_fn, &i) ?:
+               (i.src_dir != i.dst_dir
+                ? bch2_write_inode_trans(&trans, i.dst_dir, &dst_dir_u,
+                                      inode_update_for_rename_fn, &i)
+                : 0 ) ?:
+               bch2_write_inode_trans(&trans, i.src_inode, &src_inode_u,
+                                      inode_update_for_rename_fn, &i) ?:
+               (i.dst_inode
+                ? bch2_write_inode_trans(&trans, i.dst_inode, &dst_inode_u,
+                                      inode_update_for_rename_fn, &i)
+                : 0 ) ?:
+               bch2_trans_commit(&trans, NULL, NULL,
+                                 &journal_seq,
+                                 BTREE_INSERT_ATOMIC|
+                                 BTREE_INSERT_NOUNLOCK);
+       if (ret == -EINTR)      /* -EINTR from any step restarts the whole sequence */
+               goto retry;
+       if (unlikely(ret))
+               goto err;
 
-       old_inode->v.i_ctime = now;
-       new_inode->v.i_ctime = now;
-       mark_inode_dirty_sync(&old_inode->v);
-       mark_inode_dirty_sync(&new_inode->v);
+       bch2_inode_update_after_write(c, i.src_dir, &src_dir_u,
+                                     ATTR_MTIME|ATTR_CTIME);
+       journal_seq_copy(i.src_dir, journal_seq);
 
-       return 0;
-}
+       if (i.src_dir != i.dst_dir) {   /* same-directory rename: the directory was written only once above */
+               bch2_inode_update_after_write(c, i.dst_dir, &dst_dir_u,
+                                             ATTR_MTIME|ATTR_CTIME);
+               journal_seq_copy(i.dst_dir, journal_seq);
+       }
 
-static int bch2_rename2(struct inode *old_vdir, struct dentry *old_dentry,
-                       struct inode *new_vdir, struct dentry *new_dentry,
-                       unsigned flags)
-{
-       struct bch_fs *c = old_vdir->i_sb->s_fs_info;
-       struct bch_inode_info *old_dir = to_bch_ei(old_vdir);
-       struct bch_inode_info *new_dir = to_bch_ei(new_vdir);
+       bch2_inode_update_after_write(c, i.src_inode, &src_inode_u,
+                                     ATTR_CTIME);
+       if (i.dst_inode)
+               bch2_inode_update_after_write(c, i.dst_inode, &dst_inode_u,
+                                             ATTR_CTIME);
+err:
+       bch2_trans_exit(&trans);        /* reached on success and on failure */
 
-       if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
-               return -EINVAL;
+       return ret;
+}
 
-       if (flags & RENAME_EXCHANGE)
-               return bch2_rename_exchange(c, old_dir, old_dentry,
-                                           new_dir, new_dentry);
+static int inode_update_for_setattr_fn(struct bch_inode_info *inode,
+                                      struct bch_inode_unpacked *bi,
+                                      void *p)  /* p: the struct iattr being applied */
+{      /* copies the fields selected by attr->ia_valid into the unpacked inode */
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct iattr *attr = p;
+       unsigned int ia_valid = attr->ia_valid;
+
+       if (ia_valid & ATTR_UID)
+               bi->bi_uid = from_kuid(inode->v.i_sb->s_user_ns, attr->ia_uid);
+       if (ia_valid & ATTR_GID)
+               bi->bi_gid = from_kgid(inode->v.i_sb->s_user_ns, attr->ia_gid);
+
+       if (ia_valid & ATTR_ATIME)
+               bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
+       if (ia_valid & ATTR_MTIME)
+               bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime);
+       if (ia_valid & ATTR_CTIME)
+               bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime);
+
+       if (ia_valid & ATTR_MODE) {
+               umode_t mode = attr->ia_mode;
+               kgid_t gid = ia_valid & ATTR_GID        /* use the new gid when this same call changes it */
+                       ? attr->ia_gid
+                       : inode->v.i_gid;
+
+               if (!in_group_p(gid) &&
+                   !capable_wrt_inode_uidgid(&inode->v, CAP_FSETID))
+                       mode &= ~S_ISGID;       /* strip setgid unless caller is in the group or has CAP_FSETID */
+               bi->bi_mode = mode;
+       }
 
-       return bch2_rename(c, old_dir, old_dentry, new_dir, new_dentry);
+       return 0;
 }
 
 static int bch2_setattr_nonsize(struct bch_inode_info *inode, struct iattr *iattr)
 {
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch_qid qid = inode->ei_qid;
+       struct btree_trans trans;
+       struct bch_inode_unpacked inode_u;
+       struct posix_acl *acl = NULL;
        unsigned qtypes = 0;
        int ret;
 
@@ -654,18 +835,38 @@ static int bch2_setattr_nonsize(struct bch_inode_info *inode, struct iattr *iatt
                                          inode->v.i_blocks +
                                          inode->ei_quota_reserved);
                if (ret)
-                       goto out_unlock;
+                       goto err;
        }
 
-       setattr_copy(&inode->v, iattr);
+       bch2_trans_init(&trans, c);
+retry:
+       bch2_trans_begin(&trans);
+       kfree(acl);
+       acl = NULL;
+
+       ret = bch2_write_inode_trans(&trans, inode, &inode_u,
+                               inode_update_for_setattr_fn, iattr) ?:
+               (iattr->ia_valid & ATTR_MODE
+                ? bch2_acl_chmod(&trans, inode, iattr->ia_mode, &acl)
+                : 0) ?:
+               bch2_trans_commit(&trans, NULL, NULL,
+                                 &inode->ei_journal_seq,
+                                 BTREE_INSERT_ATOMIC|
+                                 BTREE_INSERT_NOUNLOCK|
+                                 BTREE_INSERT_NOFAIL);
+       if (ret == -EINTR)
+               goto retry;
+       if (unlikely(ret))
+               goto err_trans;
 
-       ret = bch2_write_inode(c, inode);
-out_unlock:
-       mutex_unlock(&inode->ei_update_lock);
+       bch2_inode_update_after_write(c, inode, &inode_u, iattr->ia_valid);
 
-       if (!ret &&
-           iattr->ia_valid & ATTR_MODE)
-               ret = posix_acl_chmod(&inode->v, inode->v.i_mode);
+       if (acl)
+               set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
+err_trans:
+       bch2_trans_exit(&trans);
+err:
+       mutex_unlock(&inode->ei_update_lock);
 
        return ret;
 }
@@ -723,16 +924,14 @@ static int bch2_setattr(struct dentry *dentry, struct iattr *iattr)
 
 static int bch2_tmpfile(struct inode *vdir, struct dentry *dentry, umode_t mode)
 {
-       struct bch_fs *c = vdir->i_sb->s_fs_info;
-       struct bch_inode_info *dir = to_bch_ei(vdir);
-       struct bch_inode_info *inode;
+       struct bch_inode_info *inode =
+               __bch2_create(to_bch_ei(vdir), dentry, mode, 0, true);  /* last arg true, as in bch2_symlink; mkdir/mknod pass false - TODO confirm meaning */
 
-       /* XXX: i_nlink should be 0? */
-       inode = bch2_vfs_inode_create(c, dir, mode, 0);
-       if (unlikely(IS_ERR(inode)))
+       if (IS_ERR(inode))
                return PTR_ERR(inode);
 
-       d_tmpfile(dentry, &inode->v);
+       d_mark_tmpfile(dentry, &inode->v);      /* replaces the single d_tmpfile() call with two steps */
+       d_instantiate(dentry, &inode->v);
        return 0;
 }
 
@@ -987,24 +1186,17 @@ static void bch2_vfs_inode_init(struct bch_fs *c,
                                struct bch_inode_info *inode,
                                struct bch_inode_unpacked *bi)
 {
-       inode->v.i_mode         = bi->bi_mode;
-       i_uid_write(&inode->v, bi->bi_uid);
-       i_gid_write(&inode->v, bi->bi_gid);
+       bch2_inode_update_after_write(c, inode, bi, ~0);
+
        inode->v.i_blocks       = bi->bi_sectors;
        inode->v.i_ino          = bi->bi_inum;
-       set_nlink(&inode->v, bi->bi_nlink + nlink_bias(inode->v.i_mode));
        inode->v.i_rdev         = bi->bi_dev;
        inode->v.i_generation   = bi->bi_generation;
        inode->v.i_size         = bi->bi_size;
-       inode->v.i_atime        = bch2_time_to_timespec(c, bi->bi_atime);
-       inode->v.i_mtime        = bch2_time_to_timespec(c, bi->bi_mtime);
-       inode->v.i_ctime        = bch2_time_to_timespec(c, bi->bi_ctime);
 
        inode->ei_journal_seq   = 0;
        inode->ei_quota_reserved = 0;
-       inode->ei_qid           = bch_qid(bi);
        inode->ei_str_hash      = bch2_hash_info_init(c, bi);
-       inode->ei_inode         = *bi;
 
        bch2_inode_flags_to_vfs(inode);
 
@@ -1059,6 +1251,19 @@ static void bch2_destroy_inode(struct inode *vinode)
        call_rcu(&vinode->i_rcu, bch2_i_callback);
 }
 
+static int inode_update_times_fn(struct bch_inode_info *inode,
+                                struct bch_inode_unpacked *bi,
+                                void *p)        /* p: unused by this callback */
+{      /* copies the three VFS timestamps into the unpacked inode */
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+
+       bi->bi_atime    = timespec_to_bch2_time(c, inode->v.i_atime);
+       bi->bi_mtime    = timespec_to_bch2_time(c, inode->v.i_mtime);
+       bi->bi_ctime    = timespec_to_bch2_time(c, inode->v.i_ctime);
+
+       return 0;
+}
+
 static int bch2_vfs_write_inode(struct inode *vinode,
                                struct writeback_control *wbc)
 {
@@ -1067,7 +1272,8 @@ static int bch2_vfs_write_inode(struct inode *vinode,
        int ret;
 
        mutex_lock(&inode->ei_update_lock);
-       ret = bch2_write_inode(c, inode);
+       ret = __bch2_write_inode(c, inode, inode_update_times_fn, NULL,
+                                ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
        mutex_unlock(&inode->ei_update_lock);
 
        if (c->opts.journal_flush_disabled)
@@ -1096,7 +1302,9 @@ static void bch2_evict_inode(struct inode *vinode)
                bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
                                BCH_QUOTA_WARN);
                bch2_inode_rm(c, inode->v.i_ino);
-               atomic_long_dec(&c->nr_inodes);
+
+               WARN_ONCE(atomic_long_dec_return(&c->nr_inodes) < 0,
+                         "nr_inodes < 0");
        }
 }
 
index fbbc7a3a3cb7f555edcfffe77d398f5060c615bf..e2fc2706da44b31b8eca0806b9f6b082433acb9d 100644 (file)
@@ -51,8 +51,16 @@ struct bch_inode_unpacked;
 typedef int (*inode_set_fn)(struct bch_inode_info *,
                            struct bch_inode_unpacked *, void *);
 
+void bch2_inode_update_after_write(struct bch_fs *,
+                                  struct bch_inode_info *,
+                                  struct bch_inode_unpacked *,
+                                  unsigned);
+int __must_check bch2_write_inode_trans(struct btree_trans *,
+                               struct bch_inode_info *,
+                               struct bch_inode_unpacked *,
+                               inode_set_fn, void *);
 int __must_check __bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
-                                   inode_set_fn, void *);
+                                   inode_set_fn, void *, unsigned);
 int __must_check bch2_write_inode(struct bch_fs *,
                                  struct bch_inode_info *);
 
index edf714f7b98d864698c2c59b76dbdcd4c8217044..f6035cc7859a2568e1831d4da43fa5a5844df443 100644 (file)
@@ -126,16 +126,22 @@ static int walk_inode(struct bch_fs *c, struct inode_walker *w, u64 inum)
 
 struct hash_check {
        struct bch_hash_info    info;
-       struct btree_iter       chain;
-       struct btree_iter       iter;
+       struct btree_trans      *trans; /* transaction @chain is obtained from; set by hash_check_init() */
+
+       /* start of current chain of hash collisions: */
+       struct btree_iter       *chain;
+
+       /* next offset in current chain of hash collisions: */
        u64                     next;
 };
 
 static void hash_check_init(const struct bch_hash_desc desc,
-                           struct hash_check *h, struct bch_fs *c)
+                           struct btree_trans *trans,  /* caller-owned transaction the chain iterator is taken from */
+                           struct hash_check *h)
 {
-       bch2_btree_iter_init(&h->chain, c, desc.btree_id, POS_MIN, 0);
-       bch2_btree_iter_init(&h->iter, c, desc.btree_id, POS_MIN, 0);
+       h->trans = trans;
+       h->chain = bch2_trans_get_iter(trans, desc.btree_id, POS_MIN, 0);       /* NOTE(review): result is not checked here - confirm it cannot fail */
+       h->next = -1;   /* next is u64, so this stores all-ones; presumably a "no chain yet" sentinel - confirm */
 }
 
 static void hash_check_set_inode(struct hash_check *h, struct bch_fs *c,
@@ -173,6 +179,75 @@ err:
        return ret;
 }
 
+/* fsck hasn't been converted to new transactions yet: */
+static int fsck_hash_delete_at(const struct bch_hash_desc desc,
+                              struct bch_hash_info *info,
+                              struct btree_iter *orig_iter)     /* position of the key to delete */
+{      /* deletes the hash-table key at @orig_iter inside a private transaction */
+       struct btree_trans trans;
+       struct btree_iter *iter;
+       int ret;
+
+       bch2_btree_iter_unlock(orig_iter);      /* unlock the caller's iterator before starting a separate transaction */
+
+       bch2_trans_init(&trans, orig_iter->c);
+retry:
+       bch2_trans_begin(&trans);
+
+       iter = bch2_trans_copy_iter(&trans, orig_iter); /* work on a copy that belongs to the local transaction */
+       if (IS_ERR(iter)) {
+               ret = PTR_ERR(iter);
+               goto err;
+       }
+
+       ret   = bch2_hash_delete_at(&trans, desc, info, iter) ?:
+               bch2_trans_commit(&trans, NULL, NULL, NULL,
+                                 BTREE_INSERT_ATOMIC|
+                                 BTREE_INSERT_NOFAIL);
+err:   /* reached on success and on failure */
+       if (ret == -EINTR)      /* -EINTR restarts from bch2_trans_begin() */
+               goto retry;
+
+       bch2_trans_exit(&trans);
+       return ret;
+}
+
+static int hash_check_duplicates(const struct bch_hash_desc desc,
+                                struct hash_check *h, struct bch_fs *c,
+                                struct btree_iter *k_iter, struct bkey_s_c k)
+{      /* returns 0 if no duplicate, 1 if @k was a duplicate and got deleted, <0 on error */
+       struct btree_iter *iter;
+       struct bkey_s_c k2;
+       char buf[200];
+       int ret = 0;
+
+       if (!bkey_cmp(h->chain->pos, k_iter->pos))      /* @k is the first key of its chain: nothing earlier to compare */
+               return 0;
+
+       iter = bch2_trans_copy_iter(h->trans, h->chain);        /* scan from the chain start with a scratch iterator */
+       BUG_ON(IS_ERR(iter));
+
+       for_each_btree_key_continue(iter, 0, k2) {
+               if (bkey_cmp(k2.k->p, k.k->p) >= 0)     /* stop once the scan reaches @k itself */
+                       break;
+
+               if (fsck_err_on(k2.k->type == desc.key_type &&
+                               !desc.cmp_bkey(k, k2), c,
+                               "duplicate hash table keys:\n%s",
+                               (bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id),
+                                                      buf, sizeof(buf), k), buf))) {
+                       ret = fsck_hash_delete_at(desc, &h->info, k_iter);
+                       if (ret)
+                               break;  /* error: leave via the label below so the scratch iterator is freed */
+                       ret = 1;
+                       break;
+               }
+       }
+fsck_err:      /* common exit: always releases the scratch iterator */
+       bch2_trans_iter_free(h->trans, iter);
+       return ret;
+}
+
 static int hash_check_key(const struct bch_hash_desc desc,
                          struct hash_check *h, struct bch_fs *c,
                          struct btree_iter *k_iter, struct bkey_s_c k)
@@ -185,13 +260,8 @@ static int hash_check_key(const struct bch_hash_desc desc,
            k.k->type != desc.key_type)
                return 0;
 
-       if (k.k->p.offset != h->next) {
-               if (!btree_iter_linked(&h->chain)) {
-                       bch2_btree_iter_link(k_iter, &h->chain);
-                       bch2_btree_iter_link(k_iter, &h->iter);
-               }
-               bch2_btree_iter_copy(&h->chain, k_iter);
-       }
+       if (k.k->p.offset != h->next)
+               bch2_btree_iter_copy(h->chain, k_iter);
        h->next = k.k->p.offset + 1;
 
        if (k.k->type != desc.key_type)
@@ -199,11 +269,11 @@ static int hash_check_key(const struct bch_hash_desc desc,
 
        hashed = desc.hash_bkey(&h->info, k);
 
-       if (fsck_err_on(hashed < h->chain.pos.offset ||
+       if (fsck_err_on(hashed < h->chain->pos.offset ||
                        hashed > k.k->p.offset, c,
                        "hash table key at wrong offset: %llu, "
                        "hashed to %llu chain starts at %llu\n%s",
-                       k.k->p.offset, hashed, h->chain.pos.offset,
+                       k.k->p.offset, hashed, h->chain->pos.offset,
                        (bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id),
                                               buf, sizeof(buf), k), buf))) {
                ret = hash_redo_key(desc, h, c, k_iter, k, hashed);
@@ -214,25 +284,7 @@ static int hash_check_key(const struct bch_hash_desc desc,
                return 1;
        }
 
-       if (!bkey_cmp(h->chain.pos, k_iter->pos))
-               return 0;
-
-       bch2_btree_iter_copy(&h->iter, &h->chain);
-       while (bkey_cmp(h->iter.pos, k_iter->pos) < 0) {
-               struct bkey_s_c k2 = bch2_btree_iter_peek(&h->iter);
-
-               if (fsck_err_on(k2.k->type == desc.key_type &&
-                               !desc.cmp_bkey(k, k2), c,
-                               "duplicate hash table keys:\n%s",
-                               (bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id),
-                                                      buf, sizeof(buf), k), buf))) {
-                       ret = bch2_hash_delete_at(desc, &h->info, &h->iter, NULL);
-                       if (ret)
-                               return ret;
-                       return 1;
-               }
-               bch2_btree_iter_next(&h->iter);
-       }
+       ret = hash_check_duplicates(desc, h, c, k_iter, k);
 fsck_err:
        return ret;
 }
@@ -250,6 +302,8 @@ static int check_extents(struct bch_fs *c)
        u64 i_sectors;
        int ret = 0;
 
+       bch_verbose(c, "checking extents");
+
        for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
                           POS(BCACHEFS_ROOT_INO, 0), 0, k) {
                ret = walk_inode(c, &w, k.k->p.inode);
@@ -332,16 +386,25 @@ static int check_dirents(struct bch_fs *c)
 {
        struct inode_walker w = inode_walker_init();
        struct hash_check h;
-       struct btree_iter iter;
+       struct btree_trans trans;
+       struct btree_iter *iter;
        struct bkey_s_c k;
        unsigned name_len;
        char buf[200];
        int ret = 0;
 
-       hash_check_init(bch2_dirent_hash_desc, &h, c);
+       bch_verbose(c, "checking dirents");
 
-       for_each_btree_key(&iter, c, BTREE_ID_DIRENTS,
-                          POS(BCACHEFS_ROOT_INO, 0), 0, k) {
+       bch2_trans_init(&trans, c);
+
+       BUG_ON(bch2_trans_preload_iters(&trans));
+
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS,
+                                  POS(BCACHEFS_ROOT_INO, 0), 0);
+
+       hash_check_init(bch2_dirent_hash_desc, &trans, &h);
+
+       for_each_btree_key_continue(iter, 0, k) {
                struct bkey_s_c_dirent d;
                struct bch_inode_unpacked target;
                bool have_target;
@@ -360,7 +423,7 @@ static int check_dirents(struct bch_fs *c)
                                mode_to_type(w.inode.bi_mode),
                                (bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS,
                                                       buf, sizeof(buf), k), buf))) {
-                       ret = bch2_btree_delete_at(&iter, 0);
+                       ret = bch2_btree_delete_at(iter, 0);
                        if (ret)
                                goto err;
                        continue;
@@ -369,7 +432,7 @@ static int check_dirents(struct bch_fs *c)
                if (w.first_this_inode && w.have_inode)
                        hash_check_set_inode(&h, c, &w.inode);
 
-               ret = hash_check_key(bch2_dirent_hash_desc, &h, c, &iter, k);
+               ret = hash_check_key(bch2_dirent_hash_desc, &h, c, iter, k);
                if (ret > 0) {
                        ret = 0;
                        continue;
@@ -393,7 +456,7 @@ static int check_dirents(struct bch_fs *c)
                    fsck_err_on(name_len == 2 &&
                                !memcmp(d.v->d_name, "..", 2), c,
                                ".. dirent")) {
-                       ret = remove_dirent(c, &iter, d);
+                       ret = remove_dirent(c, iter, d);
                        if (ret)
                                goto err;
                        continue;
@@ -403,7 +466,7 @@ static int check_dirents(struct bch_fs *c)
                                "dirent points to own directory:\n%s",
                                (bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS,
                                                       buf, sizeof(buf), k), buf))) {
-                       ret = remove_dirent(c, &iter, d);
+                       ret = remove_dirent(c, iter, d);
                        if (ret)
                                goto err;
                        continue;
@@ -420,7 +483,7 @@ static int check_dirents(struct bch_fs *c)
                                "dirent points to missing inode:\n%s",
                                (bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS,
                                                       buf, sizeof(buf), k), buf))) {
-                       ret = remove_dirent(c, &iter, d);
+                       ret = remove_dirent(c, iter, d);
                        if (ret)
                                goto err;
                        continue;
@@ -446,7 +509,7 @@ static int check_dirents(struct bch_fs *c)
 
                        ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
                                        BTREE_INSERT_NOFAIL,
-                                       BTREE_INSERT_ENTRY(&iter, &n->k_i));
+                                       BTREE_INSERT_ENTRY(iter, &n->k_i));
                        kfree(n);
                        if (ret)
                                goto err;
@@ -455,9 +518,7 @@ static int check_dirents(struct bch_fs *c)
        }
 err:
 fsck_err:
-       bch2_btree_iter_unlock(&h.chain);
-       bch2_btree_iter_unlock(&h.iter);
-       return bch2_btree_iter_unlock(&iter) ?: ret;
+       return bch2_trans_exit(&trans) ?: ret;
 }
 
 /*
@@ -468,14 +529,23 @@ static int check_xattrs(struct bch_fs *c)
 {
        struct inode_walker w = inode_walker_init();
        struct hash_check h;
-       struct btree_iter iter;
+       struct btree_trans trans;
+       struct btree_iter *iter;
        struct bkey_s_c k;
        int ret = 0;
 
-       hash_check_init(bch2_xattr_hash_desc, &h, c);
+       bch_verbose(c, "checking xattrs");
 
-       for_each_btree_key(&iter, c, BTREE_ID_XATTRS,
-                          POS(BCACHEFS_ROOT_INO, 0), 0, k) {
+       bch2_trans_init(&trans, c);
+
+       BUG_ON(bch2_trans_preload_iters(&trans));
+
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS,
+                                  POS(BCACHEFS_ROOT_INO, 0), 0);
+
+       hash_check_init(bch2_xattr_hash_desc, &trans, &h);
+
+       for_each_btree_key_continue(iter, 0, k) {
                ret = walk_inode(c, &w, k.k->p.inode);
                if (ret)
                        break;
@@ -483,7 +553,7 @@ static int check_xattrs(struct bch_fs *c)
                if (fsck_err_on(!w.have_inode, c,
                                "xattr for missing inode %llu",
                                k.k->p.inode)) {
-                       ret = bch2_btree_delete_at(&iter, 0);
+                       ret = bch2_btree_delete_at(iter, 0);
                        if (ret)
                                goto err;
                        continue;
@@ -492,15 +562,13 @@ static int check_xattrs(struct bch_fs *c)
                if (w.first_this_inode && w.have_inode)
                        hash_check_set_inode(&h, c, &w.inode);
 
-               ret = hash_check_key(bch2_xattr_hash_desc, &h, c, &iter, k);
+               ret = hash_check_key(bch2_xattr_hash_desc, &h, c, iter, k);
                if (ret)
                        goto fsck_err;
        }
 err:
 fsck_err:
-       bch2_btree_iter_unlock(&h.chain);
-       bch2_btree_iter_unlock(&h.iter);
-       return bch2_btree_iter_unlock(&iter) ?: ret;
+       return bch2_trans_exit(&trans) ?: ret;
 }
 
 /* Get root directory, create if it doesn't exist: */
@@ -509,6 +577,8 @@ static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode)
        struct bkey_inode_buf packed;
        int ret;
 
+       bch_verbose(c, "checking root directory");
+
        ret = bch2_inode_find_by_inum(c, BCACHEFS_ROOT_INO, root_inode);
        if (ret && ret != -ENOENT)
                return ret;
@@ -546,6 +616,8 @@ static int check_lostfound(struct bch_fs *c,
        u64 inum;
        int ret;
 
+       bch_verbose(c, "checking lost+found");
+
        inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info,
                                 &lostfound);
        if (!inum) {
@@ -672,6 +744,8 @@ static int check_directory_structure(struct bch_fs *c,
        u64 d_inum;
        int ret = 0;
 
+       bch_verbose(c, "checking directory structure");
+
        /* DFS: */
 restart_dfs:
        had_unreachable = false;
@@ -872,64 +946,134 @@ s64 bch2_count_inode_sectors(struct bch_fs *c, u64 inum)
        return bch2_btree_iter_unlock(&iter) ?: sectors;
 }
 
-static int bch2_gc_do_inode(struct bch_fs *c,
-                          struct bch_inode_unpacked *lostfound_inode,
-                          struct btree_iter *iter,
-                          struct bkey_s_c_inode inode, struct nlink link)
+static int check_inode_nlink(struct bch_fs *c,
+                            struct bch_inode_unpacked *lostfound_inode,
+                            struct bch_inode_unpacked *u,
+                            struct nlink *link,
+                            bool *do_update)
 {
-       struct bch_inode_unpacked u;
+       u32 i_nlink = u->bi_flags & BCH_INODE_UNLINKED
+               ? 0
+               : u->bi_nlink + nlink_bias(u->bi_mode);
+       u32 real_i_nlink =
+               link->count * nlink_bias(u->bi_mode) +
+               link->dir_count;
        int ret = 0;
-       u32 i_nlink, real_i_nlink;
-       bool do_update = false;
 
-       ret = bch2_inode_unpack(inode, &u);
-       if (bch2_fs_inconsistent_on(ret, c,
-                        "error unpacking inode %llu in fsck",
-                        inode.k->p.inode))
-               return ret;
+       /*
+        * These should have been caught/fixed by earlier passes, we don't
+        * repair them here:
+        */
+       if (S_ISDIR(u->bi_mode) && link->count > 1) {
+               need_fsck_err(c, "directory %llu with multiple hardlinks: %u",
+                             u->bi_inum, link->count);
+               return 0;
+       }
 
-       i_nlink = u.bi_nlink + nlink_bias(u.bi_mode);
+       if (S_ISDIR(u->bi_mode) && !link->count) {
+               need_fsck_err(c, "unreachable directory found (inum %llu)",
+                             u->bi_inum);
+               return 0;
+       }
 
-       fsck_err_on(i_nlink < link.count, c,
-                   "inode %llu i_link too small (%u < %u, type %i)",
-                   inode.k->p.inode, i_nlink,
-                   link.count, mode_to_type(u.bi_mode));
+       if (!S_ISDIR(u->bi_mode) && link->dir_count) {
+               need_fsck_err(c, "non directory with subdirectories",
+                             u->bi_inum);
+               return 0;
+       }
 
-       /* These should have been caught/fixed by earlier passes: */
-       if (S_ISDIR(u.bi_mode)) {
-               need_fsck_err_on(link.count > 1, c,
-                       "directory %llu with multiple hardlinks: %u",
-                       inode.k->p.inode, link.count);
+       if (!link->count &&
+           !(u->bi_flags & BCH_INODE_UNLINKED) &&
+           (c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK))) {
+               if (fsck_err(c, "unreachable inode %llu not marked as unlinked (type %u)",
+                            u->bi_inum, mode_to_type(u->bi_mode)) ==
+                   FSCK_ERR_IGNORE)
+                       return 0;
 
-               real_i_nlink = link.count * 2 + link.dir_count;
-       } else {
-               need_fsck_err_on(link.dir_count, c,
-                       "found dirents for non directory %llu",
-                       inode.k->p.inode);
+               ret = reattach_inode(c, lostfound_inode, u->bi_inum);
+               if (ret)
+                       return ret;
 
-               real_i_nlink = link.count + link.dir_count;
+               link->count = 1;
+               real_i_nlink = nlink_bias(u->bi_mode) + link->dir_count;
+               goto set_i_nlink;
        }
 
-       if (!link.count) {
-               fsck_err_on(c->sb.clean, c,
-                           "filesystem marked clean, "
-                           "but found orphaned inode %llu",
-                           inode.k->p.inode);
-
-               if (fsck_err_on(S_ISDIR(u.bi_mode) &&
-                               bch2_empty_dir(c, inode.k->p.inode), c,
-                               "non empty directory with link count 0, "
-                               "inode nlink %u, dir links found %u",
-                               i_nlink, link.dir_count)) {
-                       ret = reattach_inode(c, lostfound_inode,
-                                            inode.k->p.inode);
-                       if (ret)
-                               return ret;
+       if (i_nlink < link->count) {
+               if (fsck_err(c, "inode %llu i_link too small (%u < %u, type %i)",
+                            u->bi_inum, i_nlink, link->count,
+                            mode_to_type(u->bi_mode)) == FSCK_ERR_IGNORE)
+                       return 0;
+               goto set_i_nlink;
+       }
+
+       if (i_nlink != real_i_nlink &&
+           c->sb.clean) {
+               if (fsck_err(c, "filesystem marked clean, "
+                            "but inode %llu has wrong i_nlink "
+                            "(type %u i_nlink %u, should be %u)",
+                            u->bi_inum, mode_to_type(u->bi_mode),
+                            i_nlink, real_i_nlink) == FSCK_ERR_IGNORE)
+                       return 0;
+               goto set_i_nlink;
+       }
+
+       if (i_nlink != real_i_nlink &&
+           (c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK))) {
+               if (fsck_err(c, "inode %llu has wrong i_nlink "
+                            "(type %u i_nlink %u, should be %u)",
+                            u->bi_inum, mode_to_type(u->bi_mode),
+                            i_nlink, real_i_nlink) == FSCK_ERR_IGNORE)
+                       return 0;
+               goto set_i_nlink;
+       }
+
+       if (real_i_nlink && i_nlink != real_i_nlink)
+               bch_verbose(c, "setting inode %llu nlink from %u to %u",
+                           u->bi_inum, i_nlink, real_i_nlink);
+set_i_nlink:
+       if (i_nlink != real_i_nlink) {
+               if (real_i_nlink) {
+                       u->bi_nlink = real_i_nlink - nlink_bias(u->bi_mode);
+                       u->bi_flags &= ~BCH_INODE_UNLINKED;
+               } else {
+                       u->bi_nlink = 0;
+                       u->bi_flags |= BCH_INODE_UNLINKED;
                }
 
-               bch_verbose(c, "deleting inode %llu", inode.k->p.inode);
+               *do_update = true;
+       }
+fsck_err:
+       return ret;
+}
+
+static int check_inode(struct bch_fs *c,
+                      struct bch_inode_unpacked *lostfound_inode,
+                      struct btree_iter *iter,
+                      struct bkey_s_c_inode inode,
+                      struct nlink *link)
+{
+       struct bch_inode_unpacked u;
+       bool do_update = false;
+       int ret = 0;
+
+       ret = bch2_inode_unpack(inode, &u);
+       if (bch2_fs_inconsistent_on(ret, c,
+                        "error unpacking inode %llu in fsck",
+                        inode.k->p.inode))
+               return ret;
+
+       if (link) {
+               ret = check_inode_nlink(c, lostfound_inode, &u, link,
+                                       &do_update);
+               if (ret)
+                       return ret;
+       }
+
+       if (u.bi_flags & BCH_INODE_UNLINKED) {
+               bch_verbose(c, "deleting inode %llu", u.bi_inum);
 
-               ret = bch2_inode_rm(c, inode.k->p.inode);
+               ret = bch2_inode_rm(c, u.bi_inum);
                if (ret)
                        bch_err(c, "error in fs gc: error %i "
                                "while deleting inode", ret);
@@ -940,16 +1084,16 @@ static int bch2_gc_do_inode(struct bch_fs *c,
                fsck_err_on(c->sb.clean, c,
                            "filesystem marked clean, "
                            "but inode %llu has i_size dirty",
-                           inode.k->p.inode);
+                           u.bi_inum);
 
-               bch_verbose(c, "truncating inode %llu", inode.k->p.inode);
+               bch_verbose(c, "truncating inode %llu", u.bi_inum);
 
                /*
                 * XXX: need to truncate partial blocks too here - or ideally
                 * just switch units to bytes and that issue goes away
                 */
 
-               ret = bch2_inode_truncate(c, inode.k->p.inode,
+               ret = bch2_inode_truncate(c, u.bi_inum,
                                round_up(u.bi_size, PAGE_SIZE) >> 9,
                                NULL, NULL);
                if (ret) {
@@ -974,12 +1118,12 @@ static int bch2_gc_do_inode(struct bch_fs *c,
                fsck_err_on(c->sb.clean, c,
                            "filesystem marked clean, "
                            "but inode %llu has i_sectors dirty",
-                           inode.k->p.inode);
+                           u.bi_inum);
 
                bch_verbose(c, "recounting sectors for inode %llu",
-                           inode.k->p.inode);
+                           u.bi_inum);
 
-               sectors = bch2_count_inode_sectors(c, inode.k->p.inode);
+               sectors = bch2_count_inode_sectors(c, u.bi_inum);
                if (sectors < 0) {
                        bch_err(c, "error in fs gc: error %i "
                                "recounting inode sectors",
@@ -992,20 +1136,6 @@ static int bch2_gc_do_inode(struct bch_fs *c,
                do_update = true;
        }
 
-       if (i_nlink != real_i_nlink) {
-               fsck_err_on(c->sb.clean, c,
-                           "filesystem marked clean, "
-                           "but inode %llu has wrong i_nlink "
-                           "(type %u i_nlink %u, should be %u)",
-                           inode.k->p.inode, mode_to_type(u.bi_mode),
-                           i_nlink, real_i_nlink);
-
-               bch_verbose(c, "setting inode %llu nlinks from %u to %u",
-                           inode.k->p.inode, i_nlink, real_i_nlink);
-               u.bi_nlink = real_i_nlink - nlink_bias(u.bi_mode);
-               do_update = true;
-       }
-
        if (do_update) {
                struct bkey_inode_buf p;
 
@@ -1024,9 +1154,9 @@ fsck_err:
 
 noinline_for_stack
 static int bch2_gc_walk_inodes(struct bch_fs *c,
-                             struct bch_inode_unpacked *lostfound_inode,
-                             nlink_table *links,
-                             u64 range_start, u64 range_end)
+                              struct bch_inode_unpacked *lostfound_inode,
+                              nlink_table *links,
+                              u64 range_start, u64 range_end)
 {
        struct btree_iter iter;
        struct bkey_s_c k;
@@ -1065,10 +1195,9 @@ peek_nlinks:     link = genradix_iter_peek(&nlinks_iter, links);
                         */
                        bch2_btree_iter_unlock(&iter);
 
-                       ret = bch2_gc_do_inode(c, lostfound_inode, &iter,
-                                             bkey_s_c_to_inode(k), *link);
-                       if (ret == -EINTR)
-                               continue;
+                       ret = check_inode(c, lostfound_inode, &iter,
+                                         bkey_s_c_to_inode(k), link);
+                       BUG_ON(ret == -EINTR);
                        if (ret)
                                break;
 
@@ -1103,6 +1232,8 @@ static int check_inode_nlinks(struct bch_fs *c,
        u64 this_iter_range_start, next_iter_range_start = 0;
        int ret = 0;
 
+       bch_verbose(c, "checking inode nlinks");
+
        genradix_init(&links);
 
        do {
@@ -1129,68 +1260,103 @@ static int check_inode_nlinks(struct bch_fs *c,
        return ret;
 }
 
+noinline_for_stack
+static int check_inodes_fast(struct bch_fs *c)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_s_c_inode inode;
+       unsigned long nr_inodes = 0;
+       int ret = 0;
+
+       for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, 0, k) {
+               if (k.k->type != BCH_INODE_FS)
+                       continue;
+
+               inode = bkey_s_c_to_inode(k);
+
+               if (!(inode.v->bi_flags & BCH_INODE_UNLINKED))
+                       nr_inodes++;
+
+               if (inode.v->bi_flags &
+                   (BCH_INODE_I_SIZE_DIRTY|
+                    BCH_INODE_I_SECTORS_DIRTY|
+                    BCH_INODE_UNLINKED)) {
+                       fsck_err_on(c->sb.clean, c,
+                               "filesystem marked clean but found inode %llu with flags %x",
+                               inode.k->p.inode, inode.v->bi_flags);
+                       ret = check_inode(c, NULL, &iter, inode, NULL);
+                       BUG_ON(ret == -EINTR);
+                       if (ret)
+                               break;
+               }
+       }
+       atomic_long_set(&c->nr_inodes, nr_inodes);
+fsck_err:
+       return bch2_btree_iter_unlock(&iter) ?: ret;
+}
+
 /*
  * Checks for inconsistencies that shouldn't happen, unless we have a bug.
  * Doesn't fix them yet, mainly because they haven't yet been observed:
  */
-int bch2_fsck(struct bch_fs *c, bool full_fsck)
+static int bch2_fsck_full(struct bch_fs *c)
 {
        struct bch_inode_unpacked root_inode, lostfound_inode;
        int ret;
 
-       if (full_fsck) {
-               bch_verbose(c, "checking extents");
-               ret = check_extents(c);
-               if (ret)
-                       return ret;
+       bch_verbose(c, "starting fsck:");
+       ret =   check_extents(c) ?:
+               check_dirents(c) ?:
+               check_xattrs(c) ?:
+               check_root(c, &root_inode) ?:
+               check_lostfound(c, &root_inode, &lostfound_inode) ?:
+               check_directory_structure(c, &lostfound_inode) ?:
+               check_inode_nlinks(c, &lostfound_inode);
 
-               bch_verbose(c, "checking dirents");
-               ret = check_dirents(c);
-               if (ret)
-                       return ret;
+       bch2_flush_fsck_errs(c);
+       bch_verbose(c, "fsck done");
 
-               bch_verbose(c, "checking xattrs");
-               ret = check_xattrs(c);
-               if (ret)
-                       return ret;
+       return ret;
+}
 
-               bch_verbose(c, "checking root directory");
-               ret = check_root(c, &root_inode);
-               if (ret)
-                       return ret;
+static int bch2_fsck_inode_nlink(struct bch_fs *c)
+{
+       struct bch_inode_unpacked root_inode, lostfound_inode;
+       int ret;
 
-               bch_verbose(c, "checking lost+found");
-               ret = check_lostfound(c, &root_inode, &lostfound_inode);
-               if (ret)
-                       return ret;
+       bch_verbose(c, "checking inode link counts:");
+       ret =   check_root(c, &root_inode) ?:
+               check_lostfound(c, &root_inode, &lostfound_inode) ?:
+               check_inode_nlinks(c, &lostfound_inode);
 
-               bch_verbose(c, "checking directory structure");
-               ret = check_directory_structure(c, &lostfound_inode);
-               if (ret)
-                       return ret;
+       bch2_flush_fsck_errs(c);
+       bch_verbose(c, "done");
 
-               bch_verbose(c, "checking inode nlinks");
-               ret = check_inode_nlinks(c, &lostfound_inode);
-               if (ret)
-                       return ret;
-       } else {
-               bch_verbose(c, "checking root directory");
-               ret = check_root(c, &root_inode);
-               if (ret)
-                       return ret;
+       return ret;
+}
 
-               bch_verbose(c, "checking lost+found");
-               ret = check_lostfound(c, &root_inode, &lostfound_inode);
-               if (ret)
-                       return ret;
+static int bch2_fsck_walk_inodes_only(struct bch_fs *c)
+{
+       int ret;
 
-               bch_verbose(c, "checking inode nlinks");
-               ret = check_inode_nlinks(c, &lostfound_inode);
-               if (ret)
-                       return ret;
-       }
+       bch_verbose(c, "walking inodes:");
+       ret = check_inodes_fast(c);
 
        bch2_flush_fsck_errs(c);
+       bch_verbose(c, "done");
 
-       return 0;
+       return ret;
+}
+
+int bch2_fsck(struct bch_fs *c)
+{
+       if (!c->opts.nofsck)
+               return bch2_fsck_full(c);
+
+       if (!c->sb.clean &&
+           !(c->sb.features & (1 << BCH_FEATURE_ATOMIC_NLINK)))
+               return bch2_fsck_inode_nlink(c);
+
+       return bch2_fsck_walk_inodes_only(c);
 }
index f9af1305dc2a1bdd2a806e918d8c856d54891eb3..bc9caaf237978353f75d4aabb0d3670e07f997de 100644 (file)
@@ -2,6 +2,6 @@
 #define _BCACHEFS_FSCK_H
 
 s64 bch2_count_inode_sectors(struct bch_fs *, u64);
-int bch2_fsck(struct bch_fs *, bool);
+int bch2_fsck(struct bch_fs *);
 
 #endif /* _BCACHEFS_FSCK_H */
index 3ae5ac975dfb0aec9d62e6865daf6ad3e786eb77..d4139faa341a088a46bd744a8bab634c2fa7eb6b 100644 (file)
@@ -203,6 +203,10 @@ const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k)
                if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1)
                        return "invalid data checksum type";
 
+               if ((unpacked.bi_flags & BCH_INODE_UNLINKED) &&
+                   unpacked.bi_nlink != 0)
+                       return "flagged as unlinked but bi_nlink != 0";
+
                return NULL;
        }
        case BCH_INODE_BLOCKDEV:
@@ -276,12 +280,27 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
        }
 }
 
-int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
-                     u64 min, u64 max, u64 *hint)
+static inline u32 bkey_generation(struct bkey_s_c k)
 {
-       struct bkey_inode_buf inode_p;
-       struct btree_iter iter;
-       bool searched_from_start = false;
+       switch (k.k->type) {
+       case BCH_INODE_BLOCKDEV:
+       case BCH_INODE_FS:
+               BUG();
+       case BCH_INODE_GENERATION:
+               return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation);
+       default:
+               return 0;
+       }
+}
+
+int __bch2_inode_create(struct btree_trans *trans,
+                       struct bch_inode_unpacked *inode_u,
+                       u64 min, u64 max, u64 *hint)
+{
+       struct bch_fs *c = trans->c;
+       struct bkey_inode_buf *inode_p;
+       struct btree_iter *iter;
+       u64 start;
        int ret;
 
        if (!max)
@@ -290,82 +309,66 @@ int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
        if (c->opts.inodes_32bit)
                max = min_t(u64, max, U32_MAX);
 
-       if (*hint >= max || *hint < min)
-               *hint = min;
+       start = READ_ONCE(*hint);
 
-       if (*hint == min)
-               searched_from_start = true;
-again:
-       bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(*hint, 0),
-                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+       if (start >= max || start < min)
+               start = min;
 
+       inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
+       if (IS_ERR(inode_p))
+               return PTR_ERR(inode_p);
+
+       iter = bch2_trans_get_iter(trans,
+                       BTREE_ID_INODES, POS(start, 0),
+                       BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+       if (IS_ERR(iter))
+               return PTR_ERR(iter);
+again:
        while (1) {
-               struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
-               u32 bi_generation = 0;
+               struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
 
                ret = btree_iter_err(k);
-               if (ret) {
-                       bch2_btree_iter_unlock(&iter);
+               if (ret)
                        return ret;
-               }
 
                switch (k.k->type) {
                case BCH_INODE_BLOCKDEV:
                case BCH_INODE_FS:
                        /* slot used */
-                       if (iter.pos.inode == max)
+                       if (iter->pos.inode >= max)
                                goto out;
 
-                       bch2_btree_iter_next_slot(&iter);
+                       bch2_btree_iter_next_slot(iter);
                        break;
 
-               case BCH_INODE_GENERATION: {
-                       struct bkey_s_c_inode_generation g =
-                               bkey_s_c_to_inode_generation(k);
-                       bi_generation = le32_to_cpu(g.v->bi_generation);
-                       /* fallthrough: */
-               }
                default:
-                       inode_u->bi_generation = bi_generation;
-
-                       bch2_inode_pack(&inode_p, inode_u);
-                       inode_p.inode.k.p = k.k->p;
-
-                       ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
-                                       BTREE_INSERT_ATOMIC,
-                                       BTREE_INSERT_ENTRY(&iter,
-                                                          &inode_p.inode.k_i));
-
-                       if (ret != -EINTR) {
-                               bch2_btree_iter_unlock(&iter);
-
-                               if (!ret) {
-                                       inode_u->bi_inum =
-                                               inode_p.inode.k.p.inode;
-                                       *hint = inode_p.inode.k.p.inode + 1;
-                               }
-
-                               return ret;
-                       }
-
-                       if (ret == -EINTR)
-                               continue;
+                       *hint                   = k.k->p.inode;
+                       inode_u->bi_inum        = k.k->p.inode;
+                       inode_u->bi_generation  = bkey_generation(k);
 
+                       bch2_inode_pack(inode_p, inode_u);
+                       bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
+                       return 0;
                }
        }
 out:
-       bch2_btree_iter_unlock(&iter);
-
-       if (!searched_from_start) {
+       if (start != min) {
                /* Retry from start */
-               *hint = min;
-               searched_from_start = true;
+               start = min;
+               bch2_btree_iter_set_pos(iter, POS(start, 0));
                goto again;
        }
 
        return -ENOSPC;
 }
 
+int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
+                     u64 min, u64 max, u64 *hint)
+{
+       return bch2_trans_do(c, NULL, BTREE_INSERT_ATOMIC,
+                       __bch2_inode_create(&trans, inode_u, min, max, hint));
+}
+
 int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size,
                        struct extent_insert_hook *hook, u64 *journal_seq)
 {
index 26461063f774d35581f0c78ad795ab73d44ccef1..a47194ab93e3f1df5679b26d8f97ffbb4c1929ee 100644 (file)
@@ -38,8 +38,13 @@ int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *);
 void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
                     uid_t, gid_t, umode_t, dev_t,
                     struct bch_inode_unpacked *);
+
+int __bch2_inode_create(struct btree_trans *,
+                       struct bch_inode_unpacked *,
+                       u64, u64, u64 *);
 int bch2_inode_create(struct bch_fs *, struct bch_inode_unpacked *,
                      u64, u64, u64 *);
+
 int bch2_inode_truncate(struct bch_fs *, u64, u64,
                       struct extent_insert_hook *, u64 *);
 int bch2_inode_rm(struct bch_fs *, u64);
index 58aee7aeef82f53cd84bbc27e4f812e53270e10c..0af136d674c42e8328246757c6ed924c7d476a1d 100644 (file)
@@ -5,6 +5,7 @@
 #include "btree_update.h"
 #include "btree_update_interior.h"
 #include "btree_io.h"
+#include "dirent.h"
 #include "error.h"
 #include "fsck.h"
 #include "journal_io.h"
@@ -14,6 +15,8 @@
 
 #include <linux/stat.h>
 
+#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
+
 struct bkey_i *btree_root_find(struct bch_fs *c,
                               struct bch_sb_field_clean *clean,
                               struct jset *j,
@@ -233,7 +236,8 @@ int bch2_fs_recovery(struct bch_fs *c)
        bch2_fs_journal_start(&c->journal);
 
        err = "error starting allocator";
-       if (bch2_fs_allocator_start(c))
+       ret = bch2_fs_allocator_start(c);
+       if (ret)
                goto err;
 
        bch_verbose(c, "starting journal replay:");
@@ -246,12 +250,16 @@ int bch2_fs_recovery(struct bch_fs *c)
        if (c->opts.norecovery)
                goto out;
 
-       bch_verbose(c, "starting fsck:");
        err = "error in fsck";
-       ret = bch2_fsck(c, !c->opts.nofsck);
+       ret = bch2_fsck(c);
        if (ret)
                goto err;
-       bch_verbose(c, "fsck done");
+
+       if (!test_bit(BCH_FS_FSCK_UNFIXED_ERRORS, &c->flags)) {
+               mutex_lock(&c->sb_lock);
+               c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK;
+               mutex_unlock(&c->sb_lock);
+       }
 
        if (enabled_qtypes(c)) {
                bch_verbose(c, "reading quotas:");
@@ -273,8 +281,10 @@ fsck_err:
 
 int bch2_fs_initialize(struct bch_fs *c)
 {
-       struct bch_inode_unpacked inode;
+       struct bch_inode_unpacked root_inode, lostfound_inode;
        struct bkey_inode_buf packed_inode;
+       struct bch_hash_info root_hash_info;
+       struct qstr lostfound = QSTR("lost+found");
        const char *err = "cannot allocate memory";
        struct bch_dev *ca;
        LIST_HEAD(journal);
@@ -307,21 +317,46 @@ int bch2_fs_initialize(struct bch_fs *c)
        bch2_journal_set_replay_done(&c->journal);
 
        err = "error starting allocator";
-       if (bch2_fs_allocator_start(c))
+       ret = bch2_fs_allocator_start(c);
+       if (ret)
                goto err;
 
-       bch2_inode_init(c, &inode, 0, 0,
+       bch2_inode_init(c, &root_inode, 0, 0,
                        S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
-       inode.bi_inum = BCACHEFS_ROOT_INO;
-
-       bch2_inode_pack(&packed_inode, &inode);
+       root_inode.bi_inum = BCACHEFS_ROOT_INO;
+       root_inode.bi_nlink++; /* lost+found */
+       bch2_inode_pack(&packed_inode, &root_inode);
 
        err = "error creating root directory";
-       if (bch2_btree_insert(c, BTREE_ID_INODES,
-                             &packed_inode.inode.k_i,
-                             NULL, NULL, NULL, 0))
+       ret = bch2_btree_insert(c, BTREE_ID_INODES,
+                               &packed_inode.inode.k_i,
+                               NULL, NULL, NULL, 0);
+       if (ret)
                goto err;
 
+       bch2_inode_init(c, &lostfound_inode, 0, 0,
+                       S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0,
+                       &root_inode);
+       lostfound_inode.bi_inum = BCACHEFS_ROOT_INO + 1;
+       bch2_inode_pack(&packed_inode, &lostfound_inode);
+
+       err = "error creating lost+found";
+       ret = bch2_btree_insert(c, BTREE_ID_INODES,
+                               &packed_inode.inode.k_i,
+                               NULL, NULL, NULL, 0);
+       if (ret)
+               goto err;
+
+       root_hash_info = bch2_hash_info_init(c, &root_inode);
+
+       ret = bch2_dirent_create(c, BCACHEFS_ROOT_INO, &root_hash_info, DT_DIR,
+                                &lostfound, lostfound_inode.bi_inum, NULL,
+                                BTREE_INSERT_NOFAIL);
+       if (ret)
+               goto err;
+
+       atomic_long_set(&c->nr_inodes, 2);
+
        if (enabled_qtypes(c)) {
                ret = bch2_fs_quota_read(c);
                if (ret)
@@ -329,12 +364,14 @@ int bch2_fs_initialize(struct bch_fs *c)
        }
 
        err = "error writing first journal entry";
-       if (bch2_journal_meta(&c->journal))
+       ret = bch2_journal_meta(&c->journal);
+       if (ret)
                goto err;
 
        mutex_lock(&c->sb_lock);
        SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
        SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
+       c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK;
 
        bch2_write_super(c);
        mutex_unlock(&c->sb_lock);
index c80510952b71c0224c56d53e1ef3f438da10a2d5..99f1fe87329987100481a483b28944fe52e524d4 100644 (file)
@@ -125,46 +125,29 @@ struct bch_hash_desc {
        bool            (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c);
 };
 
-static inline struct bkey_s_c
-bch2_hash_lookup_at(const struct bch_hash_desc desc,
-                  const struct bch_hash_info *info,
-                  struct btree_iter *iter, const void *search)
+static inline struct btree_iter *
+bch2_hash_lookup(struct btree_trans *trans,
+                const struct bch_hash_desc desc,
+                const struct bch_hash_info *info,
+                u64 inode, const void *key,
+                unsigned flags)
 {
-       u64 inode = iter->pos.inode;
+       struct btree_iter *iter;
        struct bkey_s_c k;
 
-       for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) {
-               if (iter->pos.inode != inode)
-                       break;
-
-               if (k.k->type == desc.key_type) {
-                       if (!desc.cmp_key(k, search))
-                               return k;
-               } else if (k.k->type == desc.whiteout_type) {
-                       ;
-               } else {
-                       /* hole, not found */
-                       break;
-               }
-       }
-       return btree_iter_err(k) ? k : bkey_s_c_err(-ENOENT);
-}
-
-static inline struct bkey_s_c
-bch2_hash_lookup_bkey_at(const struct bch_hash_desc desc,
-                       const struct bch_hash_info *info,
-                       struct btree_iter *iter, struct bkey_s_c search)
-{
-       u64 inode = iter->pos.inode;
-       struct bkey_s_c k;
+       iter = bch2_trans_get_iter(trans, desc.btree_id,
+                                  POS(inode, desc.hash_key(info, key)),
+                                  BTREE_ITER_SLOTS|flags);
+       if (IS_ERR(iter))
+               return iter;
 
        for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) {
                if (iter->pos.inode != inode)
                        break;
 
                if (k.k->type == desc.key_type) {
-                       if (!desc.cmp_bkey(k, search))
-                               return k;
+                       if (!desc.cmp_key(k, key))
+                               return iter;
                } else if (k.k->type == desc.whiteout_type) {
                        ;
                } else {
@@ -172,72 +155,48 @@ bch2_hash_lookup_bkey_at(const struct bch_hash_desc desc,
                        break;
                }
        }
-       return btree_iter_err(k) ? k : bkey_s_c_err(-ENOENT);
-}
-
-static inline struct bkey_s_c
-bch2_hash_lookup(const struct bch_hash_desc desc,
-               const struct bch_hash_info *info,
-               struct bch_fs *c, u64 inode,
-               struct btree_iter *iter, const void *key)
-{
-       bch2_btree_iter_init(iter, c, desc.btree_id,
-                           POS(inode, desc.hash_key(info, key)),
-                           BTREE_ITER_SLOTS);
-
-       return bch2_hash_lookup_at(desc, info, iter, key);
-}
-
-static inline struct bkey_s_c
-bch2_hash_lookup_intent(const struct bch_hash_desc desc,
-                      const struct bch_hash_info *info,
-                      struct bch_fs *c, u64 inode,
-                      struct btree_iter *iter, const void *key)
-{
-       bch2_btree_iter_init(iter, c, desc.btree_id,
-                            POS(inode, desc.hash_key(info, key)),
-                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 
-       return bch2_hash_lookup_at(desc, info, iter, key);
+       return IS_ERR(k.k) ? ERR_CAST(k.k) : ERR_PTR(-ENOENT);
 }
 
-static inline struct bkey_s_c
-bch2_hash_hole_at(const struct bch_hash_desc desc, struct btree_iter *iter)
+static inline struct btree_iter *
+bch2_hash_hole(struct btree_trans *trans,
+              const struct bch_hash_desc desc,
+              const struct bch_hash_info *info,
+              u64 inode, const void *key)
 {
-       u64 inode = iter->pos.inode;
+       struct btree_iter *iter;
        struct bkey_s_c k;
 
+       iter = bch2_trans_get_iter(trans, desc.btree_id,
+                                  POS(inode, desc.hash_key(info, key)),
+                                  BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+       if (IS_ERR(iter))
+               return iter;
+
        for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) {
                if (iter->pos.inode != inode)
                        break;
 
                if (k.k->type != desc.key_type)
-                       return k;
+                       return iter;
        }
-       return btree_iter_err(k) ? k : bkey_s_c_err(-ENOENT);
-}
-
-static inline struct bkey_s_c bch2_hash_hole(const struct bch_hash_desc desc,
-                                           const struct bch_hash_info *info,
-                                           struct bch_fs *c, u64 inode,
-                                           struct btree_iter *iter,
-                                           const void *key)
-{
-       bch2_btree_iter_init(iter, c, desc.btree_id,
-                            POS(inode, desc.hash_key(info, key)),
-                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 
-       return bch2_hash_hole_at(desc, iter);
+       return IS_ERR(k.k) ? ERR_CAST(k.k) : ERR_PTR(-ENOSPC);
 }
 
-static inline int bch2_hash_needs_whiteout(const struct bch_hash_desc desc,
+static inline int bch2_hash_needs_whiteout(struct btree_trans *trans,
+                                          const struct bch_hash_desc desc,
                                           const struct bch_hash_info *info,
-                                          struct btree_iter *iter,
                                           struct btree_iter *start)
 {
+       struct btree_iter *iter;
        struct bkey_s_c k;
 
-       bch2_btree_iter_copy(iter, start);
+       iter = bch2_trans_copy_iter(trans, start);
+       if (IS_ERR(iter))
+               return PTR_ERR(iter);
+
        bch2_btree_iter_next_slot(iter);
 
        for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) {
@@ -252,142 +211,108 @@ static inline int bch2_hash_needs_whiteout(const struct bch_hash_desc desc,
        return btree_iter_err(k);
 }
 
-static inline int bch2_hash_set(const struct bch_hash_desc desc,
-                              const struct bch_hash_info *info,
-                              struct bch_fs *c, u64 inode,
-                              u64 *journal_seq,
-                              struct bkey_i *insert, int flags)
+static inline int __bch2_hash_set(struct btree_trans *trans,
+                                 const struct bch_hash_desc desc,
+                                 const struct bch_hash_info *info,
+                                 u64 inode, struct bkey_i *insert, int flags)
 {
-       struct btree_iter iter, hashed_slot;
+       struct btree_iter *iter, *slot = NULL;
        struct bkey_s_c k;
-       int ret;
 
-       bch2_btree_iter_init(&hashed_slot, c, desc.btree_id,
-               POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))),
-               BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-       bch2_btree_iter_init(&iter, c, desc.btree_id, hashed_slot.pos,
-                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-       bch2_btree_iter_link(&hashed_slot, &iter);
-retry:
-       /*
-        * On hash collision, we have to keep the slot we hashed to locked while
-        * we do the insert - to avoid racing with another thread deleting
-        * whatever's in the slot we hashed to:
-        */
-       ret = bch2_btree_iter_traverse(&hashed_slot);
-       if (ret)
-               goto err;
-
-       /*
-        * On -EINTR/retry, we dropped locks - always restart from the slot we
-        * hashed to:
-        */
-       bch2_btree_iter_copy(&iter, &hashed_slot);
-
-       k = bch2_hash_lookup_bkey_at(desc, info, &iter, bkey_i_to_s_c(insert));
-
-       ret = btree_iter_err(k);
-       if (ret == -ENOENT) {
-               if (flags & BCH_HASH_SET_MUST_REPLACE) {
-                       ret = -ENOENT;
-                       goto err;
+       iter = bch2_trans_get_iter(trans, desc.btree_id,
+                       POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))),
+                       BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+       if (IS_ERR(iter))
+               return PTR_ERR(iter);
+
+       for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) {
+               if (iter->pos.inode != inode)
+                       break;
+
+               if (k.k->type == desc.key_type) {
+                       if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert)))
+                               goto found;
+
+                       /* hash collision: */
+                       continue;
                }
 
-               /*
-                * Not found, so we're now looking for any open
-                * slot - we might have skipped over a whiteout
-                * that we could have used, so restart from the
-                * slot we hashed to:
-                */
-               bch2_btree_iter_copy(&iter, &hashed_slot);
-               k = bch2_hash_hole_at(desc, &iter);
-               if ((ret = btree_iter_err(k)))
-                       goto err;
-       } else if (!ret) {
-               if (flags & BCH_HASH_SET_MUST_CREATE) {
-                       ret = -EEXIST;
-                       goto err;
+               if (!slot &&
+                   !(flags & BCH_HASH_SET_MUST_REPLACE)) {
+                       slot = bch2_trans_copy_iter(trans, iter);
+                       if (IS_ERR(slot))
+                               return PTR_ERR(slot);
                }
-       } else {
-               goto err;
+
+               if (k.k->type != desc.whiteout_type)
+                       goto not_found;
        }
 
-       insert->k.p = iter.pos;
-       ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq,
-                                 BTREE_INSERT_ATOMIC|flags,
-                                 BTREE_INSERT_ENTRY(&iter, insert));
-err:
-       if (ret == -EINTR)
-               goto retry;
-
-       /*
-        * On successful insert, we don't want to clobber ret with error from
-        * iter:
-        */
-       bch2_btree_iter_unlock(&iter);
-       bch2_btree_iter_unlock(&hashed_slot);
-       return ret;
+       return btree_iter_err(k) ?: -ENOSPC;
+not_found:
+       if (flags & BCH_HASH_SET_MUST_REPLACE)
+               return -ENOENT;
+
+       insert->k.p = slot->pos;
+       bch2_trans_update(trans, slot, insert, 0);
+       return 0;
+found:
+       if (flags & BCH_HASH_SET_MUST_CREATE)
+               return -EEXIST;
+
+       insert->k.p = iter->pos;
+       bch2_trans_update(trans, iter, insert, 0);
+       return 0;
 }
 
-static inline int bch2_hash_delete_at(const struct bch_hash_desc desc,
-                                     const struct bch_hash_info *info,
-                                     struct btree_iter *iter,
-                                     u64 *journal_seq)
+static inline int bch2_hash_set(const struct bch_hash_desc desc,
+                              const struct bch_hash_info *info,
+                              struct bch_fs *c, u64 inode,
+                              u64 *journal_seq,
+                              struct bkey_i *insert, int flags)
 {
-       struct btree_iter whiteout_iter;
-       struct bkey_i delete;
-       int ret = -ENOENT;
+       return bch2_trans_do(c, journal_seq, flags|BTREE_INSERT_ATOMIC,
+                       __bch2_hash_set(&trans, desc, info,
+                                       inode, insert, flags));
+}
 
-       bch2_btree_iter_init(&whiteout_iter, iter->c, desc.btree_id,
-                            iter->pos, BTREE_ITER_SLOTS);
-       bch2_btree_iter_link(iter, &whiteout_iter);
+static inline int bch2_hash_delete_at(struct btree_trans *trans,
+                                     const struct bch_hash_desc desc,
+                                     const struct bch_hash_info *info,
+                                     struct btree_iter *iter)
+{
+       struct bkey_i *delete;
+       int ret;
 
-       ret = bch2_hash_needs_whiteout(desc, info, &whiteout_iter, iter);
+       ret = bch2_hash_needs_whiteout(trans, desc, info, iter);
        if (ret < 0)
-               goto err;
-
-       bkey_init(&delete.k);
-       delete.k.p = iter->pos;
-       delete.k.type = ret ? desc.whiteout_type : KEY_TYPE_DELETED;
-
-       ret = bch2_btree_insert_at(iter->c, NULL, NULL, journal_seq,
-                                 BTREE_INSERT_NOFAIL|
-                                 BTREE_INSERT_ATOMIC,
-                                 BTREE_INSERT_ENTRY(iter, &delete));
-err:
-       bch2_btree_iter_unlink(&whiteout_iter);
-       return ret;
+               return ret;
+
+       delete = bch2_trans_kmalloc(trans, sizeof(*delete));
+       if (IS_ERR(delete))
+               return PTR_ERR(delete);
+
+       bkey_init(&delete->k);
+       delete->k.p = iter->pos;
+       delete->k.type = ret ? desc.whiteout_type : KEY_TYPE_DELETED;
+
+       bch2_trans_update(trans, iter, delete, 0);
+       return 0;
 }
 
-static inline int bch2_hash_delete(const struct bch_hash_desc desc,
-                                 const struct bch_hash_info *info,
-                                 struct bch_fs *c, u64 inode,
-                                 u64 *journal_seq, const void *key)
+static inline int bch2_hash_delete(struct btree_trans *trans,
+                                  const struct bch_hash_desc desc,
+                                  const struct bch_hash_info *info,
+                                  u64 inode, const void *key)
 {
-       struct btree_iter iter, whiteout_iter;
-       struct bkey_s_c k;
-       int ret = -ENOENT;
-
-       bch2_btree_iter_init(&iter, c, desc.btree_id,
-                            POS(inode, desc.hash_key(info, key)),
-                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-       bch2_btree_iter_init(&whiteout_iter, c, desc.btree_id,
-                           POS(inode, desc.hash_key(info, key)),
-                           BTREE_ITER_SLOTS);
-       bch2_btree_iter_link(&iter, &whiteout_iter);
-retry:
-       k = bch2_hash_lookup_at(desc, info, &iter, key);
-       if ((ret = btree_iter_err(k)))
-               goto err;
-
-       ret = bch2_hash_delete_at(desc, info, &iter, journal_seq);
-err:
-       if (ret == -EINTR)
-               goto retry;
-
-       bch2_btree_iter_unlock(&whiteout_iter);
-       bch2_btree_iter_unlock(&iter);
-       return ret;
+       struct btree_iter *iter;
+
+       iter = bch2_hash_lookup(trans, desc, info, inode, key,
+                               BTREE_ITER_INTENT);
+       if (IS_ERR(iter))
+               return PTR_ERR(iter);
+
+       return bch2_hash_delete_at(trans, desc, info, iter);
 }
 
 #endif /* _BCACHEFS_STR_HASH_H */
index 24c6cc568762e969521409cdc452da6b85a5ff20..1272ea7a7a2864d7f55c989f07b4f85a2ece8144 100644 (file)
@@ -52,7 +52,7 @@ static int __bch2_strtoh(const char *cp, u64 *res,
                cp++;
        } while (isdigit(*cp));
 
-       for (u = 1; u < ARRAY_SIZE(si_units); u++)
+       for (u = 1; u < strlen(si_units); u++)
                if (*cp == si_units[u]) {
                        cp++;
                        goto got_unit;
index c6b5015a0087c145227365bad2e0b4667eaf101e..7d0fee3a8c0495994aa6cbb6da0f4f7e348daeef 100644 (file)
@@ -74,7 +74,6 @@ const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
        const struct xattr_handler *handler;
        struct bkey_s_c_xattr xattr;
-       unsigned u64s;
 
        switch (k.k->type) {
        case BCH_XATTR:
@@ -82,13 +81,15 @@ const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k)
                        return "value too small";
 
                xattr = bkey_s_c_to_xattr(k);
-               u64s = xattr_val_u64s(xattr.v->x_name_len,
-                                     le16_to_cpu(xattr.v->x_val_len));
 
-               if (bkey_val_u64s(k.k) < u64s)
+               if (bkey_val_u64s(k.k) <
+                       xattr_val_u64s(xattr.v->x_name_len,
+                                      le16_to_cpu(xattr.v->x_val_len)))
                        return "value too small";
 
-               if (bkey_val_u64s(k.k) > u64s)
+               if (bkey_val_u64s(k.k) >
+                       xattr_val_u64s(xattr.v->x_name_len,
+                                      le16_to_cpu(xattr.v->x_val_len) + 4))
                        return "value too big";
 
                handler = bch2_xattr_type_to_handler(xattr.v->x_type);
@@ -142,32 +143,28 @@ void bch2_xattr_to_text(struct bch_fs *c, char *buf,
        }
 }
 
-struct bkey_s_c bch2_xattr_get_iter(struct bch_fs *c,
-                                   struct btree_iter *iter,
-                                   struct bch_inode_info *inode,
-                                   const char *name, int type)
-{
-       return bch2_hash_lookup(bch2_xattr_hash_desc,
-                               &inode->ei_str_hash,
-                               c, inode->v.i_ino, iter,
-                               &X_SEARCH(type, name, strlen(name)));
-}
-
 int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode,
-                 const char *name, void *buffer, size_t size, int type)
+                  const char *name, void *buffer, size_t size, int type)
 {
-       struct btree_iter iter;
-       struct bkey_s_c k;
+       struct btree_trans trans;
+       struct btree_iter *iter;
        struct bkey_s_c_xattr xattr;
        int ret;
 
-       k = bch2_hash_lookup(bch2_xattr_hash_desc, &inode->ei_str_hash, c,
-                            inode->v.i_ino, &iter,
-                            &X_SEARCH(type, name, strlen(name)));
-       if (IS_ERR(k.k))
-               return bch2_btree_iter_unlock(&iter) ?: -ENODATA;
+       bch2_trans_init(&trans, c);
+
+       iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc,
+                               &inode->ei_str_hash, inode->v.i_ino,
+                               &X_SEARCH(type, name, strlen(name)),
+                               0);
+       if (IS_ERR(iter)) {
+               bch2_trans_exit(&trans);
+               BUG_ON(PTR_ERR(iter) == -EINTR);
 
-       xattr = bkey_s_c_to_xattr(k);
+               return PTR_ERR(iter) == -ENOENT ? -ENODATA : PTR_ERR(iter);
+       }
+
+       xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));
        ret = le16_to_cpu(xattr.v->x_val_len);
        if (buffer) {
                if (ret > size)
@@ -176,47 +173,48 @@ int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode,
                        memcpy(buffer, xattr_val(xattr.v), ret);
        }
 
-       bch2_btree_iter_unlock(&iter);
+       bch2_trans_exit(&trans);
        return ret;
 }
 
-int bch2_xattr_set(struct bch_fs *c, u64 inum,
+int bch2_xattr_set(struct btree_trans *trans, u64 inum,
                   const struct bch_hash_info *hash_info,
                   const char *name, const void *value, size_t size,
-                  int flags, int type, u64 *journal_seq)
+                  int type, int flags)
 {
-       struct xattr_search_key search = X_SEARCH(type, name, strlen(name));
        int ret;
 
        if (value) {
                struct bkey_i_xattr *xattr;
+               unsigned namelen = strlen(name);
                unsigned u64s = BKEY_U64s +
-                       xattr_val_u64s(search.name.len, size);
+                       xattr_val_u64s(namelen, size);
 
                if (u64s > U8_MAX)
                        return -ERANGE;
 
-               xattr = kmalloc(u64s * sizeof(u64), GFP_NOFS);
-               if (!xattr)
-                       return -ENOMEM;
+               xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
+               if (IS_ERR(xattr))
+                       return PTR_ERR(xattr);
 
                bkey_xattr_init(&xattr->k_i);
                xattr->k.u64s           = u64s;
                xattr->v.x_type         = type;
-               xattr->v.x_name_len     = search.name.len;
+               xattr->v.x_name_len     = namelen;
                xattr->v.x_val_len      = cpu_to_le16(size);
-               memcpy(xattr->v.x_name, search.name.name, search.name.len);
+               memcpy(xattr->v.x_name, name, namelen);
                memcpy(xattr_val(&xattr->v), value, size);
 
-               ret = bch2_hash_set(bch2_xattr_hash_desc, hash_info, c,
-                               inum, journal_seq,
-                               &xattr->k_i,
-                               (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)|
-                               (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0));
-               kfree(xattr);
+               ret = __bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
+                             inum, &xattr->k_i,
+                             (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)|
+                             (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0));
        } else {
-               ret = bch2_hash_delete(bch2_xattr_hash_desc, hash_info,
-                                      c, inum, journal_seq, &search);
+               struct xattr_search_key search =
+                       X_SEARCH(type, name, strlen(name));
+
+               ret = bch2_hash_delete(trans, bch2_xattr_hash_desc,
+                                      hash_info, inum, &search);
        }
 
        if (ret == -ENOENT)
@@ -308,9 +306,11 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler,
        struct bch_inode_info *inode = to_bch_ei(vinode);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
 
-       return bch2_xattr_set(c, inode->v.i_ino, &inode->ei_str_hash,
-                             name, value, size, flags, handler->flags,
-                             &inode->ei_journal_seq);
+       return bch2_trans_do(c, &inode->ei_journal_seq, BTREE_INSERT_ATOMIC,
+                       bch2_xattr_set(&trans, inode->v.i_ino,
+                                      &inode->ei_str_hash,
+                                      name, value, size,
+                                      handler->flags, flags));
 }
 
 static const struct xattr_handler bch_xattr_user_handler = {
@@ -433,7 +433,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
        }
 
        mutex_lock(&inode->ei_update_lock);
-       ret = __bch2_write_inode(c, inode, inode_opt_set_fn, &s);
+       ret = __bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0);
        mutex_unlock(&inode->ei_update_lock);
 
        if (value &&
index 1365032d56c39aa19e298d48e78f06562ffbdb7d..0689d327cdc4ea91b9969f8f847bcf746c318223 100644 (file)
@@ -35,15 +35,12 @@ struct xattr_handler;
 struct bch_hash_info;
 struct bch_inode_info;
 
-struct bkey_s_c bch2_xattr_get_iter(struct bch_fs *,
-                                   struct btree_iter *,
-                                   struct bch_inode_info *,
-                                   const char *, int);
 int bch2_xattr_get(struct bch_fs *, struct bch_inode_info *,
                  const char *, void *, size_t, int);
 
-int bch2_xattr_set(struct bch_fs *, u64, const struct bch_hash_info *,
-                  const char *, const void *, size_t, int, int, u64 *);
+int bch2_xattr_set(struct btree_trans *, u64, const struct bch_hash_info *,
+                  const char *, const void *, size_t, int, int);
+
 ssize_t bch2_xattr_list(struct dentry *, char *, size_t);
 
 extern const struct xattr_handler *bch2_xattr_handlers[];