git.sesse.net Git - bcachefs-tools-debian/commitdiff
Update bcachefs sources to f9c612bbf82d bcachefs: Fixes for building in userspace
author Kent Overstreet <kent.overstreet@linux.dev>
Sat, 23 Sep 2023 22:42:30 +0000 (18:42 -0400)
committer Kent Overstreet <kent.overstreet@linux.dev>
Sun, 24 Sep 2023 00:03:23 +0000 (20:03 -0400)
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
114 files changed:
.bcachefs_revision
Makefile.compiler
cmd_dump.c
cmd_kill_btree_node.c
cmd_migrate.c
include/linux/blkdev.h
include/linux/compiler.h
include/linux/rcupdate.h
libbcachefs/acl.c
libbcachefs/acl.h
libbcachefs/alloc_background.c
libbcachefs/alloc_foreground.c
libbcachefs/backpointers.c
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/bkey.c
libbcachefs/bkey.h
libbcachefs/bkey_methods.c
libbcachefs/bkey_sort.h
libbcachefs/bset.c
libbcachefs/btree_cache.c
libbcachefs/btree_gc.c
libbcachefs/btree_io.c
libbcachefs/btree_io.h
libbcachefs/btree_iter.c
libbcachefs/btree_iter.h
libbcachefs/btree_key_cache.c
libbcachefs/btree_locking.h
libbcachefs/btree_trans_commit.c
libbcachefs/btree_types.h
libbcachefs/btree_update.c
libbcachefs/btree_update.h
libbcachefs/btree_update_interior.c
libbcachefs/btree_write_buffer.c
libbcachefs/buckets.c
libbcachefs/buckets.h
libbcachefs/buckets_waiting_for_journal.c
libbcachefs/chardev.c
libbcachefs/checksum.c
libbcachefs/checksum.h
libbcachefs/compress.c
libbcachefs/counters.c
libbcachefs/data_update.c
libbcachefs/data_update.h
libbcachefs/debug.c
libbcachefs/dirent.c
libbcachefs/disk_groups.c
libbcachefs/ec.c
libbcachefs/ec.h
libbcachefs/errcode.c
libbcachefs/errcode.h
libbcachefs/error.c
libbcachefs/fs-io-buffered.c
libbcachefs/fs-io-direct.c
libbcachefs/fs-io-pagecache.c
libbcachefs/fs-io.c
libbcachefs/fs-io.h
libbcachefs/fs-ioctl.c
libbcachefs/fs-ioctl.h
libbcachefs/fs.c
libbcachefs/fs.h
libbcachefs/fsck.c
libbcachefs/inode.c
libbcachefs/inode.h
libbcachefs/io.h [deleted file]
libbcachefs/io_misc.c [new file with mode: 0644]
libbcachefs/io_misc.h [new file with mode: 0644]
libbcachefs/io_read.c [new file with mode: 0644]
libbcachefs/io_read.h [new file with mode: 0644]
libbcachefs/io_write.c [moved from libbcachefs/io.c with 53% similarity]
libbcachefs/io_write.h [new file with mode: 0644]
libbcachefs/io_write_types.h [moved from libbcachefs/io_types.h with 54% similarity]
libbcachefs/journal.c
libbcachefs/journal.h
libbcachefs/journal_io.c
libbcachefs/journal_reclaim.c
libbcachefs/journal_reclaim.h
libbcachefs/journal_seq_blacklist.c
libbcachefs/logged_ops.c [new file with mode: 0644]
libbcachefs/logged_ops.h [new file with mode: 0644]
libbcachefs/lru.c
libbcachefs/migrate.c
libbcachefs/move.c
libbcachefs/move.h
libbcachefs/movinggc.c
libbcachefs/opts.c
libbcachefs/opts.h
libbcachefs/printbuf.c
libbcachefs/quota.c
libbcachefs/rebalance.c
libbcachefs/recovery.c
libbcachefs/recovery_types.h
libbcachefs/reflink.c
libbcachefs/replicas.c
libbcachefs/six.c
libbcachefs/snapshot.c
libbcachefs/snapshot.h
libbcachefs/subvolume.c
libbcachefs/subvolume.h
libbcachefs/super-io.c
libbcachefs/super.c
libbcachefs/super_types.h
libbcachefs/sysfs.c
libbcachefs/tests.c
libbcachefs/trace.h
libbcachefs/util.c
libbcachefs/util.h
libbcachefs/varint.c
libbcachefs/vstructs.h
libbcachefs/xattr.c
linux/blkdev.c
rust-src/bch_bindgen/src/bkey.rs
rust-src/bch_bindgen/src/btree.rs
rust-src/bch_bindgen/src/libbcachefs_wrapper.h

index 57c74ea23e1e8ab4e8dbd9e827011b37f9c03cdd..0c7b8559f0a842eedb60f3606ec9e76814468758 100644 (file)
@@ -1 +1 @@
-e7f62157681d96386dc500609149b9685358a2b0
+f9c612bbf82da87d7d4a005310c5213db00e22de
index 7aa1fbc4aafef69327bf3b0ca51a80c1d46492a0..8fcb427405a6f17f61655a6d0881c433f22e1dd6 100644 (file)
@@ -32,13 +32,13 @@ try-run = $(shell set -e;           \
 # Usage: aflags-y += $(call as-option,-Wa$(comma)-isa=foo,)
 
 as-option = $(call try-run,\
-       $(CC) -Werror $(KBUILD_AFLAGS) $(1) -c -x assembler-with-cpp /dev/null -o "$$TMP",$(1),$(2))
+       $(CC) -Werror $(KBUILD_CPPFLAGS) $(KBUILD_AFLAGS) $(1) -c -x assembler-with-cpp /dev/null -o "$$TMP",$(1),$(2))
 
 # as-instr
 # Usage: aflags-y += $(call as-instr,instr,option1,option2)
 
 as-instr = $(call try-run,\
-       printf "%b\n" "$(1)" | $(CC) -Werror $(KBUILD_AFLAGS) -c -x assembler-with-cpp -o "$$TMP" -,$(2),$(3))
+       printf "%b\n" "$(1)" | $(CC) -Werror $(CLANG_FLAGS) $(KBUILD_AFLAGS) -c -x assembler-with-cpp -o "$$TMP" -,$(2),$(3))
 
 # __cc-option
 # Usage: MY_CFLAGS += $(call __cc-option,$(CC),$(MY_CFLAGS),-march=winchip-c6,-march=i586)
@@ -72,7 +72,3 @@ clang-min-version = $(call test-ge, $(CONFIG_CLANG_VERSION), $1)
 # ld-option
 # Usage: KBUILD_LDFLAGS += $(call ld-option, -X, -Y)
 ld-option = $(call try-run, $(LD) $(KBUILD_LDFLAGS) $(1) -v,$(1),$(2),$(3))
-
-# ld-ifversion
-# Usage:  $(call ld-ifversion, -ge, 22252, y)
-ld-ifversion = $(shell [ $(CONFIG_LD_VERSION)0 $(1) $(2)0 ] && echo $(3) || echo $(4))
index bf570dc68c87049cdf9a24888b04dd01e135fe3b..0d34923360fbe2977a8e9a3fc0f812367e716acb 100644 (file)
@@ -61,13 +61,11 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd,
        for (i = 0; i < BTREE_ID_NR; i++) {
                const struct bch_extent_ptr *ptr;
                struct bkey_ptrs_c ptrs;
-               struct btree_trans trans;
+               struct btree_trans *trans = bch2_trans_get(c);
                struct btree_iter iter;
                struct btree *b;
 
-               bch2_trans_init(&trans, c, 0, 0);
-
-               __for_each_btree_node(&trans, iter, i, POS_MIN, 0, 1, 0, b, ret) {
+               __for_each_btree_node(trans, iter, i, POS_MIN, 0, 1, 0, b, ret) {
                        struct btree_node_iter iter;
                        struct bkey u;
                        struct bkey_s_c k;
@@ -97,8 +95,8 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd,
                                                  btree_bytes(c));
                }
 
-               bch2_trans_iter_exit(&trans, &iter);
-               bch2_trans_exit(&trans);
+               bch2_trans_iter_exit(trans, &iter);
+               bch2_trans_put(trans);
        }
 
        qcow2_write_image(ca->disk_sb.bdev->bd_buffered_fd, fd, &data,
index e9b8265d40e8016c76693ee19f8e398c84490a5c..83389bc4df8974f38b63bc4b6de624547b6e5ea1 100644 (file)
@@ -64,7 +64,7 @@ int cmd_kill_btree_node(int argc, char *argv[])
        if (IS_ERR(c))
                die("error opening %s: %s", argv[0], bch2_err_str(PTR_ERR(c)));
 
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct btree *b;
        int ret;
@@ -74,9 +74,7 @@ int cmd_kill_btree_node(int argc, char *argv[])
        if (ret)
                die("error %s from posix_memalign", bch2_err_str(ret));
 
-       bch2_trans_init(&trans, c, 0, 0);
-
-       __for_each_btree_node(&trans, iter, btree_id, POS_MIN, 0, level, 0, b, ret) {
+       __for_each_btree_node(trans, iter, btree_id, POS_MIN, 0, level, 0, b, ret) {
                if (b->c.level != level)
                        continue;
 
@@ -113,8 +111,8 @@ int cmd_kill_btree_node(int argc, char *argv[])
                bch_err(c, "node at specified index not found");
        ret = EXIT_FAILURE;
 done:
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
+       bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
 
        bch2_fs_stop(c);
        return ret;
index 3958ba6bdd23ee8c8d42c51c612767cde06c9a74..85ab96c09ce763343f41df2b8037b319e2223b36 100644 (file)
@@ -33,7 +33,7 @@
 #include "libbcachefs/errcode.h"
 #include "libbcachefs/fs-common.h"
 #include "libbcachefs/inode.h"
-#include "libbcachefs/io.h"
+#include "libbcachefs/io_write.h"
 #include "libbcachefs/replicas.h"
 #include "libbcachefs/str_hash.h"
 #include "libbcachefs/super.h"
@@ -126,7 +126,7 @@ static void update_inode(struct bch_fs *c,
        bch2_inode_pack(&packed, inode);
        packed.inode.k.p.snapshot = U32_MAX;
        ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i,
-                               NULL, NULL, 0);
+                               NULL, 0);
        if (ret)
                die("error updating inode: %s", bch2_err_str(ret));
 }
@@ -140,7 +140,7 @@ static void create_link(struct bch_fs *c,
        struct bch_inode_unpacked inode;
 
        int ret = bch2_trans_do(c, NULL, NULL, 0,
-               bch2_link_trans(&trans,
+               bch2_link_trans(trans,
                                (subvol_inum) { 1, parent->bi_inum }, &parent_u,
                                (subvol_inum) { 1, inum }, &inode, &qstr));
        if (ret)
@@ -159,7 +159,7 @@ static struct bch_inode_unpacked create_file(struct bch_fs *c,
        bch2_inode_init_early(c, &new_inode);
 
        int ret = bch2_trans_do(c, NULL, NULL, 0,
-               bch2_create_trans(&trans,
+               bch2_create_trans(trans,
                                  (subvol_inum) { 1, parent->bi_inum }, parent,
                                  &new_inode, &qstr,
                                  uid, gid, mode, rdev, NULL, NULL,
@@ -232,7 +232,7 @@ static void copy_xattrs(struct bch_fs *c, struct bch_inode_unpacked *dst,
                struct bch_inode_unpacked inode_u;
 
                int ret = bch2_trans_do(c, NULL, NULL, 0,
-                               bch2_xattr_set(&trans,
+                               bch2_xattr_set(trans,
                                               (subvol_inum) { 1, dst->bi_inum },
                                               &inode_u, &hash_info, attr,
                                               val, val_size, h->flags, 0));
@@ -339,8 +339,7 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst,
                        die("error reserving space in new filesystem: %s",
                            bch2_err_str(ret));
 
-               ret = bch2_btree_insert(c, BTREE_ID_extents, &e->k_i,
-                                       &res, NULL, 0);
+               ret = bch2_btree_insert(c, BTREE_ID_extents, &e->k_i, &res, 0);
                if (ret)
                        die("btree insert error %s", bch2_err_str(ret));
 
index 7d378ab2cdf0a16587de7de7e74acf03e2e0cfff..39143117c1a9bf1e22a8cec790eb1e88d426257d 100644 (file)
@@ -6,6 +6,8 @@
 #include <linux/kobject.h>
 #include <linux/types.h>
 
+#define MAX_LFS_FILESIZE       ((loff_t)LLONG_MAX)
+
 #define BIO_MAX_VECS   256U
 
 typedef unsigned fmode_t;
@@ -21,30 +23,20 @@ struct user_namespace;
 #define MINOR(dev)     ((unsigned int) ((dev) & MINORMASK))
 #define MKDEV(ma,mi)   (((ma) << MINORBITS) | (mi))
 
-/* file is open for reading */
-#define FMODE_READ             ((__force fmode_t)0x1)
-/* file is open for writing */
-#define FMODE_WRITE            ((__force fmode_t)0x2)
-/* file is seekable */
-#define FMODE_LSEEK            ((__force fmode_t)0x4)
-/* file can be accessed using pread */
-#define FMODE_PREAD            ((__force fmode_t)0x8)
-/* file can be accessed using pwrite */
-#define FMODE_PWRITE           ((__force fmode_t)0x10)
-/* File is opened for execution with sys_execve / sys_uselib */
-#define FMODE_EXEC             ((__force fmode_t)0x20)
-/* File is opened with O_NDELAY (only set for block devices) */
-#define FMODE_NDELAY           ((__force fmode_t)0x40)
-/* File is opened with O_EXCL (only set for block devices) */
-#define FMODE_EXCL             ((__force fmode_t)0x80)
-/* File is opened using open(.., 3, ..) and is writeable only for ioctls
-   (specialy hack for floppy.c) */
-#define FMODE_WRITE_IOCTL      ((__force fmode_t)0x100)
-/* 32bit hashes as llseek() offset (for directories) */
-#define FMODE_32BITHASH         ((__force fmode_t)0x200)
-/* 64bit hashes as llseek() offset (for directories) */
-#define FMODE_64BITHASH         ((__force fmode_t)0x400)
-#define FMODE_BUFFERED         ((__force fmode_t)0x800)
+typedef unsigned int __bitwise blk_mode_t;
+
+/* open for reading */
+#define BLK_OPEN_READ          ((__force blk_mode_t)(1 << 0))
+/* open for writing */
+#define BLK_OPEN_WRITE         ((__force blk_mode_t)(1 << 1))
+/* open exclusively (vs other exclusive openers */
+#define BLK_OPEN_EXCL          ((__force blk_mode_t)(1 << 2))
+/* opened with O_NDELAY */
+#define BLK_OPEN_NDELAY                ((__force blk_mode_t)(1 << 3))
+/* open for "writes" only for ioctls (specialy hack for floppy.c) */
+#define BLK_OPEN_WRITE_IOCTL   ((__force blk_mode_t)(1 << 4))
+
+#define BLK_OPEN_BUFFERED      ((__force blk_mode_t)(1 << 5))
 
 struct inode {
        unsigned long           i_ino;
@@ -93,9 +85,14 @@ int blkdev_issue_zeroout(struct block_device *, sector_t, sector_t, gfp_t, unsig
 unsigned bdev_logical_block_size(struct block_device *bdev);
 sector_t get_capacity(struct gendisk *disk);
 
-void blkdev_put(struct block_device *bdev, fmode_t mode);
+struct blk_holder_ops {
+        void (*mark_dead)(struct block_device *bdev);
+};
+
+void blkdev_put(struct block_device *bdev, void *holder);
 void bdput(struct block_device *bdev);
-struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, void *holder);
+struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode,
+                                       void *holder, const struct blk_holder_ops *hop);
 int lookup_bdev(const char *path, dev_t *);
 
 struct super_block {
index 39df1f16ffc0250cce32a425f11125190299d886..b9486dbe38a24708eeb4159b0cf61c3f1f76ccd6 100644 (file)
@@ -65,6 +65,7 @@
 #define unreachable()          __builtin_unreachable()
 #define __same_type(a, b)      __builtin_types_compatible_p(typeof(a), typeof(b))
 #define fallthrough            __attribute__((__fallthrough__))
+#define __noreturn             __attribute__((__noreturn__))
 
 #define ___PASTE(a,b) a##b
 #define __PASTE(a,b) ___PASTE(a,b)
index ef03253158ed3f37d78b7357378a462c609746df..ec5f478f2a78ace640da4fe1cadc58222bfa3fe9 100644 (file)
@@ -12,7 +12,7 @@
 #define rcu_access_pointer(p)          READ_ONCE(p)
 
 #define kfree_rcu(ptr, rcu_head)       kfree(ptr) /* XXX */
-#define kvfree_rcu(ptr)                        kfree(ptr) /* XXX */
+#define kvfree_rcu_mightsleep(ptr)     kfree(ptr) /* XXX */
 
 #define RCU_INIT_POINTER(p, v)         WRITE_ONCE(p, v)
 
index b1a488860678cd9242f84e7ab5ddd2b1724cd26f..f3809897f00a7d5c98c7f33f36bc2fd587939dcc 100644 (file)
@@ -1,18 +1,71 @@
 // SPDX-License-Identifier: GPL-2.0
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
 
 #include "bcachefs.h"
 
-#include <linux/fs.h>
+#include "acl.h"
+#include "xattr.h"
+
 #include <linux/posix_acl.h>
+
+static const char * const acl_types[] = {
+       [ACL_USER_OBJ]  = "user_obj",
+       [ACL_USER]      = "user",
+       [ACL_GROUP_OBJ] = "group_obj",
+       [ACL_GROUP]     = "group",
+       [ACL_MASK]      = "mask",
+       [ACL_OTHER]     = "other",
+       NULL,
+};
+
+void bch2_acl_to_text(struct printbuf *out, const void *value, size_t size)
+{
+       const void *p, *end = value + size;
+
+       if (!value ||
+           size < sizeof(bch_acl_header) ||
+           ((bch_acl_header *)value)->a_version != cpu_to_le32(BCH_ACL_VERSION))
+               return;
+
+       p = value + sizeof(bch_acl_header);
+       while (p < end) {
+               const bch_acl_entry *in = p;
+               unsigned tag = le16_to_cpu(in->e_tag);
+
+               prt_str(out, acl_types[tag]);
+
+               switch (tag) {
+               case ACL_USER_OBJ:
+               case ACL_GROUP_OBJ:
+               case ACL_MASK:
+               case ACL_OTHER:
+                       p += sizeof(bch_acl_entry_short);
+                       break;
+               case ACL_USER:
+                       prt_printf(out, " uid %u", le32_to_cpu(in->e_id));
+                       p += sizeof(bch_acl_entry);
+                       break;
+               case ACL_GROUP:
+                       prt_printf(out, " gid %u", le32_to_cpu(in->e_id));
+                       p += sizeof(bch_acl_entry);
+                       break;
+               }
+
+               prt_printf(out, " %o", le16_to_cpu(in->e_perm));
+
+               if (p != end)
+                       prt_char(out, ' ');
+       }
+}
+
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+
+#include "fs.h"
+
+#include <linux/fs.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 
-#include "acl.h"
-#include "fs.h"
-#include "xattr.h"
-
 static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long)
 {
        return sizeof(bch_acl_header) +
@@ -226,18 +279,16 @@ struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap,
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
        struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0);
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter = { NULL };
        struct bkey_s_c_xattr xattr;
        struct posix_acl *acl = NULL;
        struct bkey_s_c k;
        int ret;
-
-       bch2_trans_init(&trans, c, 0, 0);
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
 
-       ret = bch2_hash_lookup(&trans, &iter, bch2_xattr_hash_desc,
+       ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
                        &hash, inode_inum(inode), &search, 0);
        if (ret) {
                if (!bch2_err_matches(ret, ENOENT))
@@ -253,7 +304,7 @@ retry:
        }
 
        xattr = bkey_s_c_to_xattr(k);
-       acl = bch2_acl_from_disk(&trans, xattr_val(xattr.v),
+       acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
                        le16_to_cpu(xattr.v->x_val_len));
 
        if (!IS_ERR(acl))
@@ -262,8 +313,8 @@ out:
        if (bch2_err_matches(PTR_ERR_OR_ZERO(acl), BCH_ERR_transaction_restart))
                goto retry;
 
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
+       bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
        return acl;
 }
 
@@ -303,7 +354,7 @@ int bch2_set_acl(struct mnt_idmap *idmap,
 {
        struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter inode_iter = { NULL };
        struct bch_inode_unpacked inode_u;
        struct posix_acl *acl;
@@ -311,12 +362,11 @@ int bch2_set_acl(struct mnt_idmap *idmap,
        int ret;
 
        mutex_lock(&inode->ei_update_lock);
-       bch2_trans_init(&trans, c, 0, 0);
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
        acl = _acl;
 
-       ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode),
+       ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
                              BTREE_ITER_INTENT);
        if (ret)
                goto btree_err;
@@ -329,30 +379,30 @@ retry:
                        goto btree_err;
        }
 
-       ret = bch2_set_acl_trans(&trans, inode_inum(inode), &inode_u, acl, type);
+       ret = bch2_set_acl_trans(trans, inode_inum(inode), &inode_u, acl, type);
        if (ret)
                goto btree_err;
 
        inode_u.bi_ctime        = bch2_current_time(c);
        inode_u.bi_mode         = mode;
 
-       ret =   bch2_inode_write(&trans, &inode_iter, &inode_u) ?:
-               bch2_trans_commit(&trans, NULL, NULL, 0);
+       ret =   bch2_inode_write(trans, &inode_iter, &inode_u) ?:
+               bch2_trans_commit(trans, NULL, NULL, 0);
 btree_err:
-       bch2_trans_iter_exit(&trans, &inode_iter);
+       bch2_trans_iter_exit(trans, &inode_iter);
 
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
        if (unlikely(ret))
                goto err;
 
-       bch2_inode_update_after_write(&trans, inode, &inode_u,
+       bch2_inode_update_after_write(trans, inode, &inode_u,
                                      ATTR_CTIME|ATTR_MODE);
 
        set_cached_acl(&inode->v, type, acl);
 err:
-       bch2_trans_exit(&trans);
        mutex_unlock(&inode->ei_update_lock);
+       bch2_trans_put(trans);
 
        return ret;
 }
@@ -367,7 +417,7 @@ int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
        struct btree_iter iter;
        struct bkey_s_c_xattr xattr;
        struct bkey_i_xattr *new;
-       struct posix_acl *acl;
+       struct posix_acl *acl = NULL;
        struct bkey_s_c k;
        int ret;
 
@@ -377,9 +427,10 @@ int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
                return bch2_err_matches(ret, ENOENT) ? 0 : ret;
 
        k = bch2_btree_iter_peek_slot(&iter);
-       xattr = bkey_s_c_to_xattr(k);
+       ret = bkey_err(k);
        if (ret)
                goto err;
+       xattr = bkey_s_c_to_xattr(k);
 
        acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
                        le16_to_cpu(xattr.v->x_val_len));
index bb21d8d696a2fc3806d9ee1e353999ccd424d99b..27e7eec0f278c63784ec9520ffc31c7fad8cd7eb 100644 (file)
@@ -7,8 +7,6 @@ struct bch_hash_info;
 struct bch_inode_info;
 struct posix_acl;
 
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
-
 #define BCH_ACL_VERSION        0x0001
 
 typedef struct {
@@ -26,6 +24,10 @@ typedef struct {
        __le32          a_version;
 } bch_acl_header;
 
+void bch2_acl_to_text(struct printbuf *, const void *, size_t);
+
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+
 struct posix_acl *bch2_get_acl(struct mnt_idmap *, struct dentry *, int);
 
 int bch2_set_acl_trans(struct btree_trans *, subvol_inum,
index 069d98a88232d2f78c4e030e302d7c9a18e770c7..19ef7a444c23bd43b82bf35fe18f372e9157f944 100644 (file)
@@ -237,13 +237,12 @@ int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k,
 }
 
 int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                         enum bkey_invalid_flags flags,
-                         struct printbuf *err)
+                         enum bkey_invalid_flags flags, struct printbuf *err)
 {
        struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
 
        if (alloc_v4_u64s(a.v) > bkey_val_u64s(k.k)) {
-               prt_printf(err, "bad val size (%u > %lu)",
+               prt_printf(err, "bad val size (%u > %zu)",
                       alloc_v4_u64s(a.v), bkey_val_u64s(k.k));
                return -BCH_ERR_invalid_bkey;
        }
@@ -527,7 +526,7 @@ int bch2_bucket_gens_invalid(const struct bch_fs *c, struct bkey_s_c k,
                             struct printbuf *err)
 {
        if (bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens)) {
-               prt_printf(err, "bad val size (%lu != %zu)",
+               prt_printf(err, "bad val size (%zu != %zu)",
                       bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens));
                return -BCH_ERR_invalid_bkey;
        }
@@ -549,7 +548,7 @@ void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bke
 
 int bch2_bucket_gens_init(struct bch_fs *c)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bch_alloc_v4 a;
@@ -560,9 +559,7 @@ int bch2_bucket_gens_init(struct bch_fs *c)
        u8 gen;
        int ret;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
-       for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+       for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
                           BTREE_ITER_PREFETCH, k, ret) {
                /*
                 * Not a fsck error because this is checked/repaired by
@@ -575,10 +572,10 @@ int bch2_bucket_gens_init(struct bch_fs *c)
                pos = alloc_gens_pos(iter.pos, &offset);
 
                if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) {
-                       ret = commit_do(&trans, NULL, NULL,
+                       ret = commit_do(trans, NULL, NULL,
                                        BTREE_INSERT_NOFAIL|
                                        BTREE_INSERT_LAZY_RW,
-                               __bch2_btree_insert(&trans, BTREE_ID_bucket_gens, &g.k_i, 0));
+                               bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
                        if (ret)
                                break;
                        have_bucket_gens_key = false;
@@ -592,15 +589,15 @@ int bch2_bucket_gens_init(struct bch_fs *c)
 
                g.v.gens[offset] = gen;
        }
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
 
        if (have_bucket_gens_key && !ret)
-               ret = commit_do(&trans, NULL, NULL,
+               ret = commit_do(trans, NULL, NULL,
                                BTREE_INSERT_NOFAIL|
                                BTREE_INSERT_LAZY_RW,
-                       __bch2_btree_insert(&trans, BTREE_ID_bucket_gens, &g.k_i, 0));
+                       bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        if (ret)
                bch_err_fn(c, ret);
@@ -609,20 +606,19 @@ int bch2_bucket_gens_init(struct bch_fs *c)
 
 int bch2_alloc_read(struct bch_fs *c)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bch_dev *ca;
        int ret;
 
        down_read(&c->gc_lock);
-       bch2_trans_init(&trans, c, 0, 0);
 
        if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) {
                const struct bch_bucket_gens *g;
                u64 b;
 
-               for_each_btree_key(&trans, iter, BTREE_ID_bucket_gens, POS_MIN,
+               for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
                                   BTREE_ITER_PREFETCH, k, ret) {
                        u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
                        u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
@@ -646,11 +642,11 @@ int bch2_alloc_read(struct bch_fs *c)
                             b++)
                                *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK];
                }
-               bch2_trans_iter_exit(&trans, &iter);
+               bch2_trans_iter_exit(trans, &iter);
        } else {
                struct bch_alloc_v4 a;
 
-               for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+               for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
                                   BTREE_ITER_PREFETCH, k, ret) {
                        /*
                         * Not a fsck error because this is checked/repaired by
@@ -663,10 +659,10 @@ int bch2_alloc_read(struct bch_fs *c)
 
                        *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
                }
-               bch2_trans_iter_exit(&trans, &iter);
+               bch2_trans_iter_exit(trans, &iter);
        }
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        up_read(&c->gc_lock);
 
        if (ret)
@@ -1201,15 +1197,15 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
                }
 
                if (need_update) {
-                       struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(g));
+                       struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
 
-                       ret = PTR_ERR_OR_ZERO(k);
+                       ret = PTR_ERR_OR_ZERO(u);
                        if (ret)
                                goto err;
 
-                       memcpy(k, &g, sizeof(g));
+                       memcpy(u, &g, sizeof(g));
 
-                       ret = bch2_trans_update(trans, bucket_gens_iter, k, 0);
+                       ret = bch2_trans_update(trans, bucket_gens_iter, u, 0);
                        if (ret)
                                goto err;
                }
@@ -1286,7 +1282,7 @@ static int bch2_check_discard_freespace_key(struct btree_trans *trans,
        if (!btree_id_is_extents(iter->btree_id)) {
                return __bch2_check_discard_freespace_key(trans, iter);
        } else {
-               int ret;
+               int ret = 0;
 
                while (!bkey_eq(iter->pos, end) &&
                       !(ret = btree_trans_too_many_iters(trans) ?:
@@ -1355,15 +1351,14 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans,
                }
 
        if (need_update) {
-               struct bkey_i *k;
+               struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
 
-               k = bch2_trans_kmalloc(trans, sizeof(g));
-               ret = PTR_ERR_OR_ZERO(k);
+               ret = PTR_ERR_OR_ZERO(u);
                if (ret)
                        goto out;
 
-               memcpy(k, &g, sizeof(g));
-               ret = bch2_trans_update(trans, iter, k, 0);
+               memcpy(u, &g, sizeof(g));
+               ret = bch2_trans_update(trans, iter, u, 0);
        }
 out:
 fsck_err:
@@ -1373,27 +1368,25 @@ fsck_err:
 
 int bch2_check_alloc_info(struct bch_fs *c)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter;
        struct bkey hole;
        struct bkey_s_c k;
        int ret = 0;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN,
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN,
                             BTREE_ITER_PREFETCH);
-       bch2_trans_iter_init(&trans, &discard_iter, BTREE_ID_need_discard, POS_MIN,
+       bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN,
                             BTREE_ITER_PREFETCH);
-       bch2_trans_iter_init(&trans, &freespace_iter, BTREE_ID_freespace, POS_MIN,
+       bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN,
                             BTREE_ITER_PREFETCH);
-       bch2_trans_iter_init(&trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN,
+       bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN,
                             BTREE_ITER_PREFETCH);
 
        while (1) {
                struct bpos next;
 
-               bch2_trans_begin(&trans);
+               bch2_trans_begin(trans);
 
                k = bch2_get_key_or_real_bucket_hole(&iter, &hole);
                ret = bkey_err(k);
@@ -1406,7 +1399,7 @@ int bch2_check_alloc_info(struct bch_fs *c)
                if (k.k->type) {
                        next = bpos_nosnap_successor(k.k->p);
 
-                       ret = bch2_check_alloc_key(&trans,
+                       ret = bch2_check_alloc_key(trans,
                                                   k, &iter,
                                                   &discard_iter,
                                                   &freespace_iter,
@@ -1416,11 +1409,11 @@ int bch2_check_alloc_info(struct bch_fs *c)
                } else {
                        next = k.k->p;
 
-                       ret = bch2_check_alloc_hole_freespace(&trans,
+                       ret = bch2_check_alloc_hole_freespace(trans,
                                                    bkey_start_pos(k.k),
                                                    &next,
                                                    &freespace_iter) ?:
-                               bch2_check_alloc_hole_bucket_gens(&trans,
+                               bch2_check_alloc_hole_bucket_gens(trans,
                                                    bkey_start_pos(k.k),
                                                    &next,
                                                    &bucket_gens_iter);
@@ -1428,7 +1421,7 @@ int bch2_check_alloc_info(struct bch_fs *c)
                                goto bkey_err;
                }
 
-               ret = bch2_trans_commit(&trans, NULL, NULL,
+               ret = bch2_trans_commit(trans, NULL, NULL,
                                        BTREE_INSERT_NOFAIL|
                                        BTREE_INSERT_LAZY_RW);
                if (ret)
@@ -1441,29 +1434,29 @@ bkey_err:
                if (ret)
                        break;
        }
-       bch2_trans_iter_exit(&trans, &bucket_gens_iter);
-       bch2_trans_iter_exit(&trans, &freespace_iter);
-       bch2_trans_iter_exit(&trans, &discard_iter);
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &bucket_gens_iter);
+       bch2_trans_iter_exit(trans, &freespace_iter);
+       bch2_trans_iter_exit(trans, &discard_iter);
+       bch2_trans_iter_exit(trans, &iter);
 
        if (ret < 0)
                goto err;
 
-       ret = for_each_btree_key2(&trans, iter,
+       ret = for_each_btree_key2(trans, iter,
                        BTREE_ID_need_discard, POS_MIN,
                        BTREE_ITER_PREFETCH, k,
-               bch2_check_discard_freespace_key(&trans, &iter, k.k->p)) ?:
-             for_each_btree_key2(&trans, iter,
+               bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?:
+             for_each_btree_key2(trans, iter,
                        BTREE_ID_freespace, POS_MIN,
                        BTREE_ITER_PREFETCH, k,
-               bch2_check_discard_freespace_key(&trans, &iter, k.k->p)) ?:
-             for_each_btree_key_commit(&trans, iter,
+               bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?:
+             for_each_btree_key_commit(trans, iter,
                        BTREE_ID_bucket_gens, POS_MIN,
                        BTREE_ITER_PREFETCH, k,
                        NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
-               bch2_check_bucket_gens_key(&trans, &iter, k));
+               bch2_check_bucket_gens_key(trans, &iter, k));
 err:
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        if (ret)
                bch_err_fn(c, ret);
        return ret;
@@ -1549,10 +1542,10 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
        int ret = 0;
 
        ret = bch2_trans_run(c,
-               for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc,
+               for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
                                POS_MIN, BTREE_ITER_PREFETCH, k,
                                NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
-                       bch2_check_alloc_to_lru_ref(&trans, &iter)));
+                       bch2_check_alloc_to_lru_ref(trans, &iter)));
        if (ret)
                bch_err_fn(c, ret);
        return ret;
@@ -1677,29 +1670,25 @@ out:
 static void bch2_do_discards_work(struct work_struct *work)
 {
        struct bch_fs *c = container_of(work, struct bch_fs, discard_work);
-       struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0;
        struct bpos discard_pos_done = POS_MAX;
        int ret;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
        /*
         * We're doing the commit in bch2_discard_one_bucket instead of using
         * for_each_btree_key_commit() so that we can increment counters after
         * successful commit:
         */
-       ret = for_each_btree_key2(&trans, iter,
-                       BTREE_ID_need_discard, POS_MIN, 0, k,
-               bch2_discard_one_bucket(&trans, &iter, &discard_pos_done,
-                                       &seen,
-                                       &open,
-                                       &need_journal_commit,
-                                       &discarded));
-
-       bch2_trans_exit(&trans);
+       ret = bch2_trans_run(c,
+               for_each_btree_key2(trans, iter,
+                               BTREE_ID_need_discard, POS_MIN, 0, k,
+                       bch2_discard_one_bucket(trans, &iter, &discard_pos_done,
+                                               &seen,
+                                               &open,
+                                               &need_journal_commit,
+                                               &discarded)));
 
        if (need_journal_commit * 2 > seen)
                bch2_journal_flush_async(&c->journal, NULL);
@@ -1805,15 +1794,13 @@ static void bch2_do_invalidates_work(struct work_struct *work)
 {
        struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work);
        struct bch_dev *ca;
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        unsigned i;
        int ret = 0;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
-       ret = bch2_btree_write_buffer_flush(&trans);
+       ret = bch2_btree_write_buffer_flush(trans);
        if (ret)
                goto err;
 
@@ -1821,11 +1808,11 @@ static void bch2_do_invalidates_work(struct work_struct *work)
                s64 nr_to_invalidate =
                        should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
 
-               ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_lru,
+               ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru,
                                lru_pos(ca->dev_idx, 0, 0),
                                lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX),
                                BTREE_ITER_INTENT, k,
-                       invalidate_one_bucket(&trans, &iter, k, &nr_to_invalidate));
+                       invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate));
 
                if (ret < 0) {
                        percpu_ref_put(&ca->ref);
@@ -1833,7 +1820,7 @@ static void bch2_do_invalidates_work(struct work_struct *work)
                }
        }
 err:
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
 }
 
@@ -1847,7 +1834,7 @@ void bch2_do_invalidates(struct bch_fs *c)
 static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
                                   unsigned long *last_updated)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey hole;
@@ -1855,9 +1842,7 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
        struct bch_member *m;
        int ret;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc,
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
                             POS(ca->dev_idx, ca->mi.first_bucket),
                             BTREE_ITER_PREFETCH);
        /*
@@ -1871,7 +1856,7 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
                        *last_updated = jiffies;
                }
 
-               bch2_trans_begin(&trans);
+               bch2_trans_begin(trans);
 
                if (bkey_ge(iter.pos, end)) {
                        ret = 0;
@@ -1891,8 +1876,8 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
                        struct bch_alloc_v4 a_convert;
                        const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
 
-                       ret =   bch2_bucket_do_index(&trans, k, a, true) ?:
-                               bch2_trans_commit(&trans, NULL, NULL,
+                       ret =   bch2_bucket_do_index(trans, k, a, true) ?:
+                               bch2_trans_commit(trans, NULL, NULL,
                                                  BTREE_INSERT_LAZY_RW|
                                                  BTREE_INSERT_NOFAIL);
                        if (ret)
@@ -1902,7 +1887,7 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
                } else {
                        struct bkey_i *freespace;
 
-                       freespace = bch2_trans_kmalloc(&trans, sizeof(*freespace));
+                       freespace = bch2_trans_kmalloc(trans, sizeof(*freespace));
                        ret = PTR_ERR_OR_ZERO(freespace);
                        if (ret)
                                goto bkey_err;
@@ -1912,8 +1897,8 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
                        freespace->k.p          = k.k->p;
                        freespace->k.size       = k.k->size;
 
-                       ret = __bch2_btree_insert(&trans, BTREE_ID_freespace, freespace, 0) ?:
-                               bch2_trans_commit(&trans, NULL, NULL,
+                       ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?:
+                               bch2_trans_commit(trans, NULL, NULL,
                                                  BTREE_INSERT_LAZY_RW|
                                                  BTREE_INSERT_NOFAIL);
                        if (ret)
@@ -1928,11 +1913,11 @@ bkey_err:
                        break;
        }
 
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
+       bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
 
        if (ret < 0) {
-               bch_err(ca, "error initializing free space: %s", bch2_err_str(ret));
+               bch_err_msg(ca, ret, "initializing free space");
                return ret;
        }
 
index e02749ddc362ce106d99e5aa3466ea4c2e407d65..3bc4abd3d7d5725e821e37b43282a3d92de149db 100644 (file)
@@ -25,7 +25,7 @@
 #include "disk_groups.h"
 #include "ec.h"
 #include "error.h"
-#include "io.h"
+#include "io_write.h"
 #include "journal.h"
 #include "movinggc.h"
 #include "nocow_locking.h"
@@ -502,9 +502,14 @@ again:
 }
 
 /**
- * bch_bucket_alloc - allocate a single bucket from a specific device
+ * bch2_bucket_alloc_trans - allocate a single bucket from a specific device
+ * @trans:     transaction object
+ * @ca:                device to allocate from
+ * @watermark: how important is this allocation?
+ * @cl:                if not NULL, closure to be used to wait if buckets not available
+ * @usage:     for secondarily also returning the current device usage
  *
- * Returns index of bucket on success, 0 on failure
+ * Returns:    an open_bucket on success, or an ERR_PTR() on failure.
  */
 static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
                                      struct bch_dev *ca,
@@ -597,7 +602,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
        struct open_bucket *ob;
 
        bch2_trans_do(c, NULL, NULL, 0,
-                     PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, watermark,
+                     PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
                                                        cl, &usage)));
        return ob;
 }
@@ -775,7 +780,6 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans,
        struct dev_alloc_list devs_sorted;
        struct ec_stripe_head *h;
        struct open_bucket *ob;
-       struct bch_dev *ca;
        unsigned i, ec_idx;
        int ret = 0;
 
@@ -805,8 +809,6 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans,
                }
        goto out_put_head;
 got_bucket:
-       ca = bch_dev_bkey_exists(c, ob->dev);
-
        ob->ec_idx      = ec_idx;
        ob->ec          = h->s;
        ec_stripe_new_get(h->s, STRIPE_REF_io);
@@ -1032,10 +1034,13 @@ static int open_bucket_add_buckets(struct btree_trans *trans,
 
 /**
  * should_drop_bucket - check if this is open_bucket should go away
+ * @ob:                open_bucket to predicate on
+ * @c:         filesystem handle
  * @ca:                if set, we're killing buckets for a particular device
  * @ec:                if true, we're shutting down erasure coding and killing all ec
  *             open_buckets
  *             otherwise, return true
+ * Returns: true if we should kill this open_bucket
  *
  * We're killing open_buckets because we're shutting down a device, erasure
  * coding, or the entire filesystem - check if this open_bucket matches:
index 8747c5e19f9997f1a11b0c32c093c1b3ba602f17..cc856150a948ea7859feedcd0bb99493f328fd04 100644 (file)
@@ -351,20 +351,17 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_
 {
        struct bch_fs *c = trans->c;
        struct btree_iter alloc_iter = { NULL };
-       struct bch_dev *ca;
        struct bkey_s_c alloc_k;
        struct printbuf buf = PRINTBUF;
        int ret = 0;
 
        if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c,
-                       "backpointer for mising device:\n%s",
+                       "backpointer for missing device:\n%s",
                        (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
                ret = bch2_btree_delete_at(trans, bp_iter, 0);
                goto out;
        }
 
-       ca = bch_dev_bkey_exists(c, k.k->p.inode);
-
        alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc,
                                     bp_pos_to_bucket(c, k.k->p), 0);
        ret = bkey_err(alloc_k);
@@ -393,10 +390,10 @@ int bch2_check_btree_backpointers(struct bch_fs *c)
        int ret;
 
        ret = bch2_trans_run(c,
-               for_each_btree_key_commit(&trans, iter,
+               for_each_btree_key_commit(trans, iter,
                        BTREE_ID_backpointers, POS_MIN, 0, k,
                        NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
-                 bch2_check_btree_backpointer(&trans, &iter, k)));
+                 bch2_check_btree_backpointer(trans, &iter, k)));
        if (ret)
                bch_err_fn(c, ret);
        return ret;
@@ -629,7 +626,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
        struct bch_fs *c = trans->c;
        struct btree_iter iter;
        enum btree_id btree_id;
-       struct bpos_level last_flushed = { UINT_MAX };
+       struct bpos_level last_flushed = { UINT_MAX, POS_MIN };
        int ret = 0;
 
        for (btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) {
@@ -706,7 +703,7 @@ static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans,
 
                --btree_nodes;
                if (!btree_nodes) {
-                       *end = alloc_k.k->p;
+                       *end = alloc_k.k ? alloc_k.k->p : SPOS_MAX;
                        break;
                }
 
@@ -726,13 +723,12 @@ static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans,
 
 int bch2_check_extents_to_backpointers(struct bch_fs *c)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct bpos start = POS_MIN, end;
        int ret;
 
-       bch2_trans_init(&trans, c, 0, 0);
        while (1) {
-               ret = bch2_get_alloc_in_memory_pos(&trans, start, &end);
+               ret = bch2_get_alloc_in_memory_pos(trans, start, &end);
                if (ret)
                        break;
 
@@ -752,13 +748,13 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
                        printbuf_exit(&buf);
                }
 
-               ret = bch2_check_extents_to_backpointers_pass(&trans, start, end);
+               ret = bch2_check_extents_to_backpointers_pass(trans, start, end);
                if (ret || bpos_eq(end, SPOS_MAX))
                        break;
 
                start = bpos_successor(end);
        }
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        if (ret)
                bch_err_fn(c, ret);
@@ -827,13 +823,12 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
 
 int bch2_check_backpointers_to_extents(struct bch_fs *c)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end;
        int ret;
 
-       bch2_trans_init(&trans, c, 0, 0);
        while (1) {
-               ret = bch2_get_btree_in_memory_pos(&trans,
+               ret = bch2_get_btree_in_memory_pos(trans,
                                                   (1U << BTREE_ID_extents)|
                                                   (1U << BTREE_ID_reflink),
                                                   ~0,
@@ -859,13 +854,13 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c)
                        printbuf_exit(&buf);
                }
 
-               ret = bch2_check_backpointers_to_extents_pass(&trans, start, end);
+               ret = bch2_check_backpointers_to_extents_pass(trans, start, end);
                if (ret || !bbpos_cmp(end, BBPOS_MAX))
                        break;
 
                start = bbpos_successor(end);
        }
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        if (ret)
                bch_err_fn(c, ret);
index 30b3d7b9f9dc14de467685e653a2624201a6cf5a..9ae82254452fa799b0aa10ddb544f2bd81d3dc23 100644 (file)
@@ -454,6 +454,7 @@ enum gc_phase {
        GC_PHASE_BTREE_bucket_gens,
        GC_PHASE_BTREE_snapshot_trees,
        GC_PHASE_BTREE_deleted_inodes,
+       GC_PHASE_BTREE_logged_ops,
 
        GC_PHASE_PENDING_DELETE,
 };
@@ -626,8 +627,8 @@ struct journal_keys {
        size_t                  size;
 };
 
-struct btree_path_buf {
-       struct btree_path       *path;
+struct btree_trans_buf {
+       struct btree_trans      *trans;
 };
 
 #define REPLICAS_DELTA_LIST_MAX        (1U << 16)
@@ -786,9 +787,9 @@ struct bch_fs {
        /* btree_iter.c: */
        struct seqmutex         btree_trans_lock;
        struct list_head        btree_trans_list;
-       mempool_t               btree_paths_pool;
+       mempool_t               btree_trans_pool;
        mempool_t               btree_trans_mem_pool;
-       struct btree_path_buf  __percpu *btree_paths_bufs;
+       struct btree_trans_buf  __percpu        *btree_trans_bufs;
 
        struct srcu_struct      btree_trans_barrier;
        bool                    btree_trans_barrier_initialized;
index f17238be494cc682f3d4b455a59e097259e12aef..f0d130440baaf2f86e144a71c5183ed39432ea43 100644 (file)
@@ -83,8 +83,8 @@ typedef uuid_t __uuid_t;
 #endif
 
 #define BITMASK(name, type, field, offset, end)                                \
-static const unsigned  name##_OFFSET = offset;                         \
-static const unsigned  name##_BITS = (end - offset);                   \
+static const __maybe_unused unsigned   name##_OFFSET = offset;         \
+static const __maybe_unused unsigned   name##_BITS = (end - offset);   \
                                                                        \
 static inline __u64 name(const type *k)                                        \
 {                                                                      \
@@ -98,9 +98,9 @@ static inline void SET_##name(type *k, __u64 v)                               \
 }
 
 #define LE_BITMASK(_bits, name, type, field, offset, end)              \
-static const unsigned  name##_OFFSET = offset;                         \
-static const unsigned  name##_BITS = (end - offset);                   \
-static const __u##_bits        name##_MAX = (1ULL << (end - offset)) - 1;      \
+static const __maybe_unused unsigned   name##_OFFSET = offset;         \
+static const __maybe_unused unsigned   name##_BITS = (end - offset);   \
+static const __maybe_unused __u##_bits name##_MAX = (1ULL << (end - offset)) - 1;\
                                                                        \
 static inline __u64 name(const type *k)                                        \
 {                                                                      \
@@ -370,7 +370,9 @@ static inline void bkey_init(struct bkey *k)
        x(backpointer,          28)                     \
        x(inode_v3,             29)                     \
        x(bucket_gens,          30)                     \
-       x(snapshot_tree,        31)
+       x(snapshot_tree,        31)                     \
+       x(logged_op_truncate,   32)                     \
+       x(logged_op_finsert,    33)
 
 enum bch_bkey_type {
 #define x(name, nr) KEY_TYPE_##name    = nr,
@@ -723,7 +725,7 @@ struct bch_inode {
        __le64                  bi_hash_seed;
        __le32                  bi_flags;
        __le16                  bi_mode;
-       __u8                    fields[0];
+       __u8                    fields[];
 } __packed __aligned(8);
 
 struct bch_inode_v2 {
@@ -733,7 +735,7 @@ struct bch_inode_v2 {
        __le64                  bi_hash_seed;
        __le64                  bi_flags;
        __le16                  bi_mode;
-       __u8                    fields[0];
+       __u8                    fields[];
 } __packed __aligned(8);
 
 struct bch_inode_v3 {
@@ -745,7 +747,7 @@ struct bch_inode_v3 {
        __le64                  bi_sectors;
        __le64                  bi_size;
        __le64                  bi_version;
-       __u8                    fields[0];
+       __u8                    fields[];
 } __packed __aligned(8);
 
 #define INODEv3_FIELDS_START_INITIAL   6
@@ -847,8 +849,8 @@ enum {
        __BCH_INODE_NODUMP              = 3,
        __BCH_INODE_NOATIME             = 4,
 
-       __BCH_INODE_I_SIZE_DIRTY        = 5,
-       __BCH_INODE_I_SECTORS_DIRTY     = 6,
+       __BCH_INODE_I_SIZE_DIRTY        = 5, /* obsolete */
+       __BCH_INODE_I_SECTORS_DIRTY     = 6, /* obsolete */
        __BCH_INODE_UNLINKED            = 7,
        __BCH_INODE_BACKPTR_UNTRUSTED   = 8,
 
@@ -1097,20 +1099,20 @@ struct bch_reflink_v {
        struct bch_val          v;
        __le64                  refcount;
        union bch_extent_entry  start[0];
-       __u64                   _data[0];
+       __u64                   _data[];
 } __packed __aligned(8);
 
 struct bch_indirect_inline_data {
        struct bch_val          v;
        __le64                  refcount;
-       u8                      data[0];
+       u8                      data[];
 };
 
 /* Inline data */
 
 struct bch_inline_data {
        struct bch_val          v;
-       u8                      data[0];
+       u8                      data[];
 };
 
 /* Subvolumes: */
@@ -1183,6 +1185,33 @@ struct bch_lru {
 
 #define LRU_ID_STRIPES         (1U << 16)
 
+/* Logged operations btree: */
+
+struct bch_logged_op_truncate {
+       struct bch_val          v;
+       __le32                  subvol;
+       __le32                  pad;
+       __le64                  inum;
+       __le64                  new_i_size;
+};
+
+enum logged_op_finsert_state {
+       LOGGED_OP_FINSERT_start,
+       LOGGED_OP_FINSERT_shift_extents,
+       LOGGED_OP_FINSERT_finish,
+};
+
+struct bch_logged_op_finsert {
+       struct bch_val          v;
+       __u8                    state;
+       __u8                    pad[3];
+       __le32                  subvol;
+       __le64                  inum;
+       __le64                  dst_offset;
+       __le64                  src_offset;
+       __le64                  pos;
+};
+
 /* Optional/variable size superblock sections: */
 
 struct bch_sb_field {
@@ -1223,7 +1252,7 @@ enum bch_sb_field_type {
 
 struct bch_sb_field_journal {
        struct bch_sb_field     field;
-       __le64                  buckets[0];
+       __le64                  buckets[];
 };
 
 struct bch_sb_field_journal_v2 {
@@ -1232,7 +1261,7 @@ struct bch_sb_field_journal_v2 {
        struct bch_sb_field_journal_v2_entry {
                __le64          start;
                __le64          nr;
-       }                       d[0];
+       }                       d[];
 };
 
 /* BCH_SB_FIELD_members: */
@@ -1279,7 +1308,7 @@ enum bch_member_state {
 
 struct bch_sb_field_members {
        struct bch_sb_field     field;
-       struct bch_member       members[0];
+       struct bch_member       members[];
 };
 
 /* BCH_SB_FIELD_crypt: */
@@ -1377,19 +1406,19 @@ static inline bool data_type_is_hidden(enum bch_data_type type)
 struct bch_replicas_entry_v0 {
        __u8                    data_type;
        __u8                    nr_devs;
-       __u8                    devs[0];
+       __u8                    devs[];
 } __packed;
 
 struct bch_sb_field_replicas_v0 {
        struct bch_sb_field     field;
-       struct bch_replicas_entry_v0 entries[0];
+       struct bch_replicas_entry_v0 entries[];
 } __packed __aligned(8);
 
 struct bch_replicas_entry {
        __u8                    data_type;
        __u8                    nr_devs;
        __u8                    nr_required;
-       __u8                    devs[0];
+       __u8                    devs[];
 } __packed;
 
 #define replicas_entry_bytes(_i)                                       \
@@ -1397,7 +1426,7 @@ struct bch_replicas_entry {
 
 struct bch_sb_field_replicas {
        struct bch_sb_field     field;
-       struct bch_replicas_entry entries[0];
+       struct bch_replicas_entry entries[];
 } __packed __aligned(8);
 
 /* BCH_SB_FIELD_quota: */
@@ -1432,7 +1461,7 @@ LE64_BITMASK(BCH_GROUP_PARENT,            struct bch_disk_group, flags[0], 6, 24)
 
 struct bch_sb_field_disk_groups {
        struct bch_sb_field     field;
-       struct bch_disk_group   entries[0];
+       struct bch_disk_group   entries[];
 } __packed __aligned(8);
 
 /* BCH_SB_FIELD_counters */
@@ -1525,7 +1554,7 @@ enum bch_persistent_counters {
 
 struct bch_sb_field_counters {
        struct bch_sb_field     field;
-       __le64                  d[0];
+       __le64                  d[];
 };
 
 /*
@@ -1539,10 +1568,8 @@ struct jset_entry {
        __u8                    type; /* designates what this jset holds */
        __u8                    pad[3];
 
-       union {
-               struct bkey_i   start[0];
-               __u64           _data[0];
-       };
+       struct bkey_i           start[0];
+       __u64                   _data[];
 };
 
 struct bch_sb_field_clean {
@@ -1553,10 +1580,8 @@ struct bch_sb_field_clean {
        __le16                  _write_clock;
        __le64                  journal_seq;
 
-       union {
-               struct jset_entry start[0];
-               __u64           _data[0];
-       };
+       struct jset_entry       start[0];
+       __u64                   _data[];
 };
 
 struct journal_seq_blacklist_entry {
@@ -1567,10 +1592,8 @@ struct journal_seq_blacklist_entry {
 struct bch_sb_field_journal_seq_blacklist {
        struct bch_sb_field     field;
 
-       union {
-               struct journal_seq_blacklist_entry start[0];
-               __u64           _data[0];
-       };
+       struct journal_seq_blacklist_entry start[0];
+       __u64                   _data[];
 };
 
 /* Superblock: */
@@ -1645,7 +1668,8 @@ enum bcachefs_metadata_version {
        bcachefs_metadata_version_max
 };
 
-static const unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_major_minor;
+static const __maybe_unused
+unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_major_minor;
 
 #define bcachefs_metadata_version_current      (bcachefs_metadata_version_max - 1)
 
@@ -1706,10 +1730,8 @@ struct bch_sb {
 
        struct bch_sb_layout    layout;
 
-       union {
-               struct bch_sb_field start[0];
-               __le64          _data[0];
-       };
+       struct bch_sb_field     start[0];
+       __le64                  _data[];
 } __packed __aligned(8);
 
 /*
@@ -1954,7 +1976,7 @@ enum bch_csum_type {
        BCH_CSUM_NR
 };
 
-static const unsigned bch_crc_bytes[] = {
+static const __maybe_unused unsigned bch_crc_bytes[] = {
        [BCH_CSUM_none]                         = 0,
        [BCH_CSUM_crc32c_nonzero]               = 4,
        [BCH_CSUM_crc32c]                       = 4,
@@ -2186,10 +2208,8 @@ struct jset {
        __le64                  last_seq;
 
 
-       union {
-               struct jset_entry start[0];
-               __u64           _data[0];
-       };
+       struct jset_entry       start[0];
+       __u64                   _data[];
 } __packed __aligned(8);
 
 LE32_BITMASK(JSET_CSUM_TYPE,   struct jset, flags, 0, 4);
@@ -2259,7 +2279,10 @@ enum btree_id_flags {
        x(snapshot_trees,       15,     0,                                      \
          BIT_ULL(KEY_TYPE_snapshot_tree))                                      \
        x(deleted_inodes,       16,     BTREE_ID_SNAPSHOTS,                     \
-         BIT_ULL(KEY_TYPE_set))
+         BIT_ULL(KEY_TYPE_set))                                                \
+       x(logged_ops,           17,     0,                                      \
+         BIT_ULL(KEY_TYPE_logged_op_truncate)|                                 \
+         BIT_ULL(KEY_TYPE_logged_op_finsert))
 
 enum btree_id {
 #define x(name, nr, ...) BTREE_ID_##name = nr,
@@ -2294,10 +2317,8 @@ struct bset {
        __le16                  version;
        __le16                  u64s; /* count of d[] in u64s */
 
-       union {
-               struct bkey_packed start[0];
-               __u64           _data[0];
-       };
+       struct bkey_packed      start[0];
+       __u64                   _data[];
 } __packed __aligned(8);
 
 LE32_BITMASK(BSET_CSUM_TYPE,   struct bset, flags, 0, 4);
index 0a5bfe6e9a2da57ce9bc628941dc7833aca6d6e7..abdb05507d162c7c06bb89ce96bf67f6484207a7 100644 (file)
@@ -127,7 +127,7 @@ static void pack_state_finish(struct pack_state *state,
                              struct bkey_packed *k)
 {
        EBUG_ON(state->p <  k->_data);
-       EBUG_ON(state->p >= k->_data + state->format->key_u64s);
+       EBUG_ON(state->p >= (u64 *) k->_data + state->format->key_u64s);
 
        *state->p = state->w;
 }
@@ -308,9 +308,14 @@ struct bpos __bkey_unpack_pos(const struct bkey_format *format,
 
 /**
  * bch2_bkey_pack_key -- pack just the key, not the value
+ * @out:       packed result
+ * @in:                key to pack
+ * @format:    format of packed result
+ *
+ * Returns: true on success, false on failure
  */
 bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
-                  const struct bkey_format *format)
+                       const struct bkey_format *format)
 {
        struct pack_state state = pack_state_init(format, out);
        u64 *w = out->_data;
@@ -336,9 +341,12 @@ bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
 
 /**
  * bch2_bkey_unpack -- unpack the key and the value
+ * @b:         btree node of @src key (for packed format)
+ * @dst:       unpacked result
+ * @src:       packed input
  */
 void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst,
-                const struct bkey_packed *src)
+                     const struct bkey_packed *src)
 {
        __bkey_unpack_key(b, &dst->k, src);
 
@@ -349,19 +357,24 @@ void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst,
 
 /**
  * bch2_bkey_pack -- pack the key and the value
+ * @dst:       packed result
+ * @src:       unpacked input
+ * @format:    format of packed result
+ *
+ * Returns: true on success, false on failure
  */
-bool bch2_bkey_pack(struct bkey_packed *out, const struct bkey_i *in,
-              const struct bkey_format *format)
+bool bch2_bkey_pack(struct bkey_packed *dst, const struct bkey_i *src,
+                   const struct bkey_format *format)
 {
        struct bkey_packed tmp;
 
-       if (!bch2_bkey_pack_key(&tmp, &in->k, format))
+       if (!bch2_bkey_pack_key(&tmp, &src->k, format))
                return false;
 
-       memmove_u64s((u64 *) out + format->key_u64s,
-                    &in->v,
-                    bkey_val_u64s(&in->k));
-       memcpy_u64s_small(out, &tmp, format->key_u64s);
+       memmove_u64s((u64 *) dst + format->key_u64s,
+                    &src->v,
+                    bkey_val_u64s(&src->k));
+       memcpy_u64s_small(dst, &tmp, format->key_u64s);
 
        return true;
 }
index 51969a46265e124233028dfbf6b3e39137cb383a..5184502092369d963f076383ee9bf3ca5a358f6c 100644 (file)
@@ -52,7 +52,7 @@ struct bkey_s {
 
 static inline struct bkey_i *bkey_next(struct bkey_i *k)
 {
-       return (struct bkey_i *) (k->_data + k->k.u64s);
+       return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s);
 }
 
 #define bkey_val_u64s(_k)      ((_k)->u64s - BKEY_U64s)
@@ -397,7 +397,7 @@ static inline void set_bkeyp_val_u64s(const struct bkey_format *format,
 }
 
 #define bkeyp_val(_format, _k)                                         \
-        ((struct bch_val *) ((_k)->_data + bkeyp_key_u64s(_format, _k)))
+        ((struct bch_val *) ((u64 *) (_k)->_data + bkeyp_key_u64s(_format, _k)))
 
 extern const struct bkey_format bch2_bkey_format_current;
 
@@ -732,7 +732,7 @@ static inline unsigned high_word_offset(const struct bkey_format *f)
 #error edit for your odd byteorder.
 #endif
 
-#define high_word(f, k)                ((k)->_data + high_word_offset(f))
+#define high_word(f, k)                ((u64 *) (k)->_data + high_word_offset(f))
 #define next_word(p)           nth_word(p, 1)
 #define prev_word(p)           nth_word(p, -1)
 
index 6547142db42806cfebfa92e5bca81bba600287df..be9f012fc7be5fc5898de53b4466eb55c6de1cd2 100644 (file)
@@ -10,6 +10,7 @@
 #include "error.h"
 #include "extents.h"
 #include "inode.h"
+#include "io_misc.h"
 #include "lru.h"
 #include "quota.h"
 #include "reflink.h"
@@ -25,7 +26,7 @@ const char * const bch2_bkey_types[] = {
 };
 
 static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                              unsigned flags, struct printbuf *err)
+                              enum bkey_invalid_flags flags, struct printbuf *err)
 {
        return 0;
 }
@@ -39,7 +40,7 @@ static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k,
 })
 
 static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                                unsigned flags, struct printbuf *err)
+                                enum bkey_invalid_flags flags, struct printbuf *err)
 {
        if (bkey_val_bytes(k.k)) {
                prt_printf(err, "incorrect value size (%zu != 0)",
@@ -55,7 +56,7 @@ static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k,
 })
 
 static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                                  unsigned flags, struct printbuf *err)
+                                  enum bkey_invalid_flags flags, struct printbuf *err)
 {
        return 0;
 }
@@ -70,7 +71,7 @@ static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k,
 })
 
 static int key_type_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                                       unsigned flags, struct printbuf *err)
+                                       enum bkey_invalid_flags flags, struct printbuf *err)
 {
        return 0;
 }
@@ -91,7 +92,7 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c,
 })
 
 static int key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                               unsigned flags, struct printbuf *err)
+                               enum bkey_invalid_flags flags, struct printbuf *err)
 {
        if (bkey_val_bytes(k.k)) {
                prt_printf(err, "incorrect value size (%zu != %zu)",
@@ -368,7 +369,6 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
 {
        const struct bkey_ops *ops;
        struct bkey uk;
-       struct bkey_s u;
        unsigned nr_compat = 5;
        int i;
 
@@ -433,7 +433,9 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
                }
 
                break;
-       case 4:
+       case 4: {
+               struct bkey_s u;
+
                if (!bkey_packed(k)) {
                        u = bkey_i_to_s(packed_to_bkey(k));
                } else {
@@ -450,6 +452,7 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
                if (ops->compat)
                        ops->compat(btree_id, version, big_endian, write, u);
                break;
+       }
        default:
                BUG();
        }
index 79cf11d1b4e7e69d5e13512b3d5d7a8b14f39c42..7c0f0b160f18533302ebc0e5568c7599f2f6cffd 100644 (file)
@@ -9,14 +9,24 @@ struct sort_iter {
 
        struct sort_iter_set {
                struct bkey_packed *k, *end;
-       } data[MAX_BSETS + 1];
+       } data[];
 };
 
-static inline void sort_iter_init(struct sort_iter *iter, struct btree *b)
+static inline void sort_iter_init(struct sort_iter *iter, struct btree *b, unsigned size)
 {
        iter->b = b;
        iter->used = 0;
-       iter->size = ARRAY_SIZE(iter->data);
+       iter->size = size;
+}
+
+struct sort_iter_stack {
+       struct sort_iter        iter;
+       struct sort_iter_set    sets[MAX_BSETS + 1];
+};
+
+static inline void sort_iter_stack_init(struct sort_iter_stack *iter, struct btree *b)
+{
+       sort_iter_init(&iter->iter, b, ARRAY_SIZE(iter->sets));
 }
 
 static inline void sort_iter_add(struct sort_iter *iter,
index bcdf28f39b9c3db2b1b3f14b093348f3a0412da6..bb73ba9017b006e7fe181e19b7cccfe8494c1339 100644 (file)
@@ -172,10 +172,10 @@ static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter,
                printk(KERN_ERR "iter was:");
 
                btree_node_iter_for_each(_iter, set) {
-                       struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
-                       struct bset_tree *t = bch2_bkey_to_bset(b, k);
+                       struct bkey_packed *k2 = __btree_node_offset_to_key(b, set->k);
+                       struct bset_tree *t = bch2_bkey_to_bset(b, k2);
                        printk(" [%zi %zi]", t - b->set,
-                              k->_data - bset(b, t)->_data);
+                              k2->_data - bset(b, t)->_data);
                }
                panic("\n");
        }
@@ -232,7 +232,7 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
 {
        struct bset_tree *t = bch2_bkey_to_bset(b, where);
        struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where);
-       struct bkey_packed *next = (void *) (where->_data + clobber_u64s);
+       struct bkey_packed *next = (void *) ((u64 *) where->_data + clobber_u64s);
        struct printbuf buf1 = PRINTBUF;
        struct printbuf buf2 = PRINTBUF;
 #if 0
@@ -300,7 +300,8 @@ static unsigned bkey_float_byte_offset(unsigned idx)
 }
 
 struct ro_aux_tree {
-       struct bkey_float       f[0];
+       u8                      nothing[0];
+       struct bkey_float       f[];
 };
 
 struct rw_aux_tree {
@@ -476,7 +477,7 @@ static struct bkey_packed *tree_to_prev_bkey(const struct btree *b,
 {
        unsigned prev_u64s = ro_aux_tree_prev(b, t)[j];
 
-       return (void *) (tree_to_bkey(b, t, j)->_data - prev_u64s);
+       return (void *) ((u64 *) tree_to_bkey(b, t, j)->_data - prev_u64s);
 }
 
 static struct rw_aux_tree *rw_aux_tree(const struct btree *b,
@@ -1010,8 +1011,8 @@ void bch2_bset_insert(struct btree *b,
                btree_keys_account_key_add(&b->nr, t - b->set, src);
 
        if (src->u64s != clobber_u64s) {
-               u64 *src_p = where->_data + clobber_u64s;
-               u64 *dst_p = where->_data + src->u64s;
+               u64 *src_p = (u64 *) where->_data + clobber_u64s;
+               u64 *dst_p = (u64 *) where->_data + src->u64s;
 
                EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) <
                        (int) clobber_u64s - src->u64s);
@@ -1037,7 +1038,7 @@ void bch2_bset_delete(struct btree *b,
                      unsigned clobber_u64s)
 {
        struct bset_tree *t = bset_tree_last(b);
-       u64 *src_p = where->_data + clobber_u64s;
+       u64 *src_p = (u64 *) where->_data + clobber_u64s;
        u64 *dst_p = where->_data;
 
        bch2_bset_verify_rw_aux_tree(b, t);
@@ -1188,7 +1189,7 @@ struct bkey_packed *__bch2_bset_search(struct btree *b,
        case BSET_RO_AUX_TREE:
                return bset_search_tree(b, t, search, lossy_packed_search);
        default:
-               unreachable();
+               BUG();
        }
 }
 
@@ -1268,9 +1269,13 @@ static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
 }
 
 /**
- * bch_btree_node_iter_init - initialize a btree node iterator, starting from a
+ * bch2_btree_node_iter_init - initialize a btree node iterator, starting from a
  * given position
  *
+ * @iter:      iterator to initialize
+ * @b:         btree node to search
+ * @search:    search key
+ *
  * Main entry point to the lookup code for individual btree nodes:
  *
  * NOTE:
index a8283fdc7e63929d0a088e3b9bb6624e240f4de1..7c6769cd17b33338a2ba03b416aa8c7950a0fe77 100644 (file)
@@ -795,7 +795,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
        six_unlock_intent(&b->c.lock);
 
        /* Unlock before doing IO: */
-       if (trans && sync)
+       if (path && sync)
                bch2_trans_unlock_noassert(trans);
 
        bch2_btree_node_read(c, b, sync);
@@ -934,7 +934,7 @@ retry:
        }
 
        if (unlikely(need_relock)) {
-               int ret = bch2_trans_relock(trans) ?:
+               ret = bch2_trans_relock(trans) ?:
                        bch2_btree_path_relock_intent(trans, path);
                if (ret) {
                        six_unlock_type(&b->c.lock, lock_type);
@@ -965,11 +965,20 @@ retry:
 }
 
 /**
- * bch_btree_node_get - find a btree node in the cache and lock it, reading it
+ * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
  * in from disk if necessary.
  *
+ * @trans:     btree transaction object
+ * @path:      btree_path being traversed
+ * @k:         pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
+ * @level:     level of btree node being looked up (0 == leaf node)
+ * @lock_type: SIX_LOCK_read or SIX_LOCK_intent
+ * @trace_ip:  ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
+ *
  * The btree node will have either a read or a write lock held, depending on
  * the @write parameter.
+ *
+ * Returns: btree node or ERR_PTR()
  */
 struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
                                  const struct bkey_i *k, unsigned level,
@@ -1016,28 +1025,8 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *
        }
 
        if (unlikely(btree_node_read_in_flight(b))) {
-               u32 seq = six_lock_seq(&b->c.lock);
-
                six_unlock_type(&b->c.lock, lock_type);
-               bch2_trans_unlock(trans);
-
-               bch2_btree_node_wait_on_read(b);
-
-               /*
-                * should_be_locked is not set on this path yet, so we need to
-                * relock it specifically:
-                */
-               if (trans) {
-                       int ret = bch2_trans_relock(trans) ?:
-                               bch2_btree_path_relock_intent(trans, path);
-                       if (ret) {
-                               BUG_ON(!trans->restarted);
-                               return ERR_PTR(ret);
-                       }
-               }
-
-               if (!six_relock_type(&b->c.lock, lock_type, seq))
-                       return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
+               return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
        }
 
        prefetch(b->aux_data);
index 83dcd9eb2c5c4ddfda8768c1fe4f1306fe9de3f2..97fbd833115800e4ed8a8b1b0577fa1a6f359af0 100644 (file)
@@ -529,13 +529,11 @@ fsck_err:
 
 int bch2_check_topology(struct bch_fs *c)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree *b;
        unsigned i;
        int ret = 0;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
        for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
                struct btree_root *r = bch2_btree_id_root(c, i);
 
@@ -546,8 +544,8 @@ int bch2_check_topology(struct bch_fs *c)
                if (btree_node_fake(b))
                        continue;
 
-               btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
-               ret = bch2_btree_repair_topology_recurse(&trans, b);
+               btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
+               ret = bch2_btree_repair_topology_recurse(trans, b);
                six_unlock_read(&b->c.lock);
 
                if (ret == DROP_THIS_NODE) {
@@ -556,7 +554,7 @@ int bch2_check_topology(struct bch_fs *c)
                }
        }
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        return ret;
 }
@@ -566,8 +564,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
                               struct bkey_s_c *k)
 {
        struct bch_fs *c = trans->c;
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k);
-       const union bch_extent_entry *entry;
+       struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(*k);
+       const union bch_extent_entry *entry_c;
        struct extent_ptr_decoded p = { 0 };
        bool do_update = false;
        struct printbuf buf = PRINTBUF;
@@ -577,10 +575,10 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
         * XXX
         * use check_bucket_ref here
         */
-       bkey_for_each_ptr_decode(k->k, ptrs, p, entry) {
+       bkey_for_each_ptr_decode(k->k, ptrs_c, p, entry_c) {
                struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
                struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
-               enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr);
+               enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry_c->ptr);
 
                if (!g->gen_valid &&
                    (c->opts.reconstruct_alloc ||
@@ -1068,15 +1066,13 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
 
 static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        enum btree_id ids[BTREE_ID_NR];
        unsigned i;
        int ret = 0;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
        if (initial)
-               trans.is_initial_gc = true;
+               trans->is_initial_gc = true;
 
        for (i = 0; i < BTREE_ID_NR; i++)
                ids[i] = i;
@@ -1084,22 +1080,22 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
 
        for (i = 0; i < BTREE_ID_NR && !ret; i++)
                ret = initial
-                       ? bch2_gc_btree_init(&trans, ids[i], metadata_only)
-                       : bch2_gc_btree(&trans, ids[i], initial, metadata_only);
+                       ? bch2_gc_btree_init(trans, ids[i], metadata_only)
+                       : bch2_gc_btree(trans, ids[i], initial, metadata_only);
 
        for (i = BTREE_ID_NR; i < btree_id_nr_alive(c) && !ret; i++) {
                if (!bch2_btree_id_root(c, i)->alive)
                        continue;
 
                ret = initial
-                       ? bch2_gc_btree_init(&trans, i, metadata_only)
-                       : bch2_gc_btree(&trans, i, initial, metadata_only);
+                       ? bch2_gc_btree_init(trans, i, metadata_only)
+                       : bch2_gc_btree(trans, i, initial, metadata_only);
        }
 
        if (ret < 0)
                bch_err_fn(c, ret);
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        return ret;
 }
 
@@ -1220,14 +1216,6 @@ static int bch2_gc_done(struct bch_fs *c,
             fsck_err(c, _msg ": got %llu, should be %llu"              \
                      , ##__VA_ARGS__, dst->_f, src->_f)))              \
                dst->_f = src->_f
-#define copy_stripe_field(_f, _msg, ...)                               \
-       if (dst->_f != src->_f &&                                       \
-           (!verify ||                                                 \
-            fsck_err(c, "stripe %zu has wrong "_msg                    \
-                     ": got %u, should be %u",                         \
-                     iter.pos, ##__VA_ARGS__,                          \
-                     dst->_f, src->_f)))                               \
-               dst->_f = src->_f
 #define copy_dev_field(_f, _msg, ...)                                  \
        copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__)
 #define copy_fs_field(_f, _msg, ...)                                   \
@@ -1249,7 +1237,7 @@ static int bch2_gc_done(struct bch_fs *c,
                        copy_dev_field(d[i].sectors,    "%s sectors", bch2_data_types[i]);
                        copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]);
                }
-       };
+       }
 
        {
                unsigned nr = fs_usage_u64s(c);
@@ -1469,37 +1457,35 @@ fsck_err:
 
 static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bch_dev *ca;
        unsigned i;
        int ret = 0;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
        for_each_member_device(ca, c, i) {
-               ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc,
+               ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
                                POS(ca->dev_idx, ca->mi.first_bucket),
                                BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k,
                                NULL, NULL, BTREE_INSERT_LAZY_RW,
-                       bch2_alloc_write_key(&trans, &iter, k, metadata_only));
+                       bch2_alloc_write_key(trans, &iter, k, metadata_only));
 
                if (ret < 0) {
-                       bch_err(c, "error writing alloc info: %s", bch2_err_str(ret));
+                       bch_err_fn(c, ret);
                        percpu_ref_put(&ca->ref);
                        break;
                }
        }
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        return ret < 0 ? ret : 0;
 }
 
 static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
 {
        struct bch_dev *ca;
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bucket *g;
@@ -1515,17 +1501,16 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
                if (!buckets) {
                        percpu_ref_put(&ca->ref);
                        bch_err(c, "error allocating ca->buckets[gc]");
-                       return -BCH_ERR_ENOMEM_gc_alloc_start;
+                       ret = -BCH_ERR_ENOMEM_gc_alloc_start;
+                       goto err;
                }
 
                buckets->first_bucket   = ca->mi.first_bucket;
                buckets->nbuckets       = ca->mi.nbuckets;
                rcu_assign_pointer(ca->buckets_gc, buckets);
-       };
-
-       bch2_trans_init(&trans, c, 0, 0);
+       }
 
-       for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+       for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
                           BTREE_ITER_PREFETCH, k, ret) {
                ca = bch_dev_bkey_exists(c, k.k->p.inode);
                g = gc_bucket(ca, k.k->p.offset);
@@ -1546,13 +1531,11 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
                        g->stripe_redundancy    = a->stripe_redundancy;
                }
        }
-       bch2_trans_iter_exit(&trans, &iter);
-
-       bch2_trans_exit(&trans);
-
+       bch2_trans_iter_exit(trans, &iter);
+err:
+       bch2_trans_put(trans);
        if (ret)
-               bch_err(c, "error reading alloc info at gc start: %s", bch2_err_str(ret));
-
+               bch_err_fn(c, ret);
        return ret;
 }
 
@@ -1575,7 +1558,7 @@ static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only)
                        g->dirty_sectors = 0;
                        g->cached_sectors = 0;
                }
-       };
+       }
 }
 
 static int bch2_gc_write_reflink_key(struct btree_trans *trans,
@@ -1627,7 +1610,7 @@ fsck_err:
 
 static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        size_t idx = 0;
@@ -1636,23 +1619,23 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only)
        if (metadata_only)
                return 0;
 
-       bch2_trans_init(&trans, c, 0, 0);
+       trans = bch2_trans_get(c);
 
-       ret = for_each_btree_key_commit(&trans, iter,
+       ret = for_each_btree_key_commit(trans, iter,
                        BTREE_ID_reflink, POS_MIN,
                        BTREE_ITER_PREFETCH, k,
                        NULL, NULL, BTREE_INSERT_NOFAIL,
-               bch2_gc_write_reflink_key(&trans, &iter, k, &idx));
+               bch2_gc_write_reflink_key(trans, &iter, k, &idx));
 
        c->reflink_gc_nr = 0;
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        return ret;
 }
 
 static int bch2_gc_reflink_start(struct bch_fs *c,
                                 bool metadata_only)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct reflink_gc *r;
@@ -1661,10 +1644,10 @@ static int bch2_gc_reflink_start(struct bch_fs *c,
        if (metadata_only)
                return 0;
 
-       bch2_trans_init(&trans, c, 0, 0);
+       trans = bch2_trans_get(c);
        c->reflink_gc_nr = 0;
 
-       for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
+       for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN,
                           BTREE_ITER_PREFETCH, k, ret) {
                const __le64 *refcount = bkey_refcount_c(k);
 
@@ -1682,9 +1665,9 @@ static int bch2_gc_reflink_start(struct bch_fs *c,
                r->size         = k.k->size;
                r->refcount     = 0;
        }
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        return ret;
 }
 
@@ -1751,7 +1734,7 @@ fsck_err:
 
 static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret = 0;
@@ -1759,15 +1742,15 @@ static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only)
        if (metadata_only)
                return 0;
 
-       bch2_trans_init(&trans, c, 0, 0);
+       trans = bch2_trans_get(c);
 
-       ret = for_each_btree_key_commit(&trans, iter,
+       ret = for_each_btree_key_commit(trans, iter,
                        BTREE_ID_stripes, POS_MIN,
                        BTREE_ITER_PREFETCH, k,
                        NULL, NULL, BTREE_INSERT_NOFAIL,
-               bch2_gc_write_stripes_key(&trans, &iter, k));
+               bch2_gc_write_stripes_key(trans, &iter, k));
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        return ret;
 }
 
@@ -1779,6 +1762,12 @@ static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only)
 /**
  * bch2_gc - walk _all_ references to buckets, and recompute them:
  *
+ * @c:                 filesystem object
+ * @initial:           are we in recovery?
+ * @metadata_only:     are we just checking metadata references, or everything?
+ *
+ * Returns: 0 on success, or standard errcode on failure
+ *
  * Order matters here:
  *  - Concurrent GC relies on the fact that we have a total ordering for
  *    everything that GC walks - see  gc_will_visit_node(),
@@ -1947,7 +1936,7 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_i
 
 int bch2_gc_gens(struct bch_fs *c)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bch_dev *ca;
@@ -1965,7 +1954,7 @@ int bch2_gc_gens(struct bch_fs *c)
 
        trace_and_count(c, gc_gens_start, c);
        down_read(&c->gc_lock);
-       bch2_trans_init(&trans, c, 0, 0);
+       trans = bch2_trans_get(c);
 
        for_each_member_device(ca, c, i) {
                struct bucket_gens *gens;
@@ -1988,33 +1977,31 @@ int bch2_gc_gens(struct bch_fs *c)
 
        for (i = 0; i < BTREE_ID_NR; i++)
                if (btree_type_has_ptrs(i)) {
-                       struct btree_iter iter;
-                       struct bkey_s_c k;
-
                        c->gc_gens_btree = i;
                        c->gc_gens_pos = POS_MIN;
-                       ret = for_each_btree_key_commit(&trans, iter, i,
+
+                       ret = for_each_btree_key_commit(trans, iter, i,
                                        POS_MIN,
                                        BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
                                        k,
                                        NULL, NULL,
                                        BTREE_INSERT_NOFAIL,
-                               gc_btree_gens_key(&trans, &iter, k));
+                               gc_btree_gens_key(trans, &iter, k));
                        if (ret && !bch2_err_matches(ret, EROFS))
-                               bch_err(c, "error recalculating oldest_gen: %s", bch2_err_str(ret));
+                               bch_err_fn(c, ret);
                        if (ret)
                                goto err;
                }
 
-       ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc,
+       ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
                        POS_MIN,
                        BTREE_ITER_PREFETCH,
                        k,
                        NULL, NULL,
                        BTREE_INSERT_NOFAIL,
-               bch2_alloc_write_oldest_gen(&trans, &iter, k));
+               bch2_alloc_write_oldest_gen(trans, &iter, k));
        if (ret && !bch2_err_matches(ret, EROFS))
-               bch_err(c, "error writing oldest_gen: %s", bch2_err_str(ret));
+               bch_err_fn(c, ret);
        if (ret)
                goto err;
 
@@ -2031,7 +2018,7 @@ err:
                ca->oldest_gen = NULL;
        }
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        up_read(&c->gc_lock);
        mutex_unlock(&c->gc_gens_lock);
        return ret;
@@ -2086,7 +2073,7 @@ static int bch2_gc_thread(void *arg)
                ret = bch2_gc_gens(c);
 #endif
                if (ret < 0)
-                       bch_err(c, "btree gc failed: %s", bch2_err_str(ret));
+                       bch_err_fn(c, ret);
 
                debug_check_no_locks_held();
        }
@@ -2116,7 +2103,7 @@ int bch2_gc_thread_start(struct bch_fs *c)
 
        p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name);
        if (IS_ERR(p)) {
-               bch_err(c, "error creating gc thread: %s", bch2_err_str(PTR_ERR(p)));
+               bch_err_fn(c, PTR_ERR(p));
                return PTR_ERR(p);
        }
 
index 3b654841ab00675b27220334813cc508227c366a..a869cf6ac7c6b94c84cba88bb0a3eeaac0d98d68 100644 (file)
@@ -14,7 +14,7 @@
 #include "debug.h"
 #include "error.h"
 #include "extents.h"
-#include "io.h"
+#include "io_write.h"
 #include "journal_reclaim.h"
 #include "journal_seq_blacklist.h"
 #include "recovery.h"
@@ -106,8 +106,8 @@ static void btree_bounce_free(struct bch_fs *c, size_t size,
                vpfree(p, size);
 }
 
-static void *btree_bounce_alloc_noprof(struct bch_fs *c, size_t size,
-                                      bool *used_mempool)
+static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
+                               bool *used_mempool)
 {
        unsigned flags = memalloc_nofs_save();
        void *p;
@@ -115,7 +115,7 @@ static void *btree_bounce_alloc_noprof(struct bch_fs *c, size_t size,
        BUG_ON(size > btree_bytes(c));
 
        *used_mempool = false;
-       p = vpmalloc_noprof(size, __GFP_NOWARN|GFP_NOWAIT);
+       p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
        if (!p) {
                *used_mempool = true;
                p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
@@ -123,8 +123,6 @@ static void *btree_bounce_alloc_noprof(struct bch_fs *c, size_t size,
        memalloc_nofs_restore(flags);
        return p;
 }
-#define btree_bounce_alloc(_c, _size, _used_mempool)           \
-       alloc_hooks(btree_bounce_alloc_noprof(_c, _size, _used_mempool))
 
 static void sort_bkey_ptrs(const struct btree *bt,
                           struct bkey_packed **ptrs, unsigned nr)
@@ -294,7 +292,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
                            bool filter_whiteouts)
 {
        struct btree_node *out;
-       struct sort_iter sort_iter;
+       struct sort_iter_stack sort_iter;
        struct bset_tree *t;
        struct bset *start_bset = bset(b, &b->set[start_idx]);
        bool used_mempool = false;
@@ -303,13 +301,13 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
        bool sorting_entire_node = start_idx == 0 &&
                end_idx == b->nsets;
 
-       sort_iter_init(&sort_iter, b);
+       sort_iter_stack_init(&sort_iter, b);
 
        for (t = b->set + start_idx;
             t < b->set + end_idx;
             t++) {
                u64s += le16_to_cpu(bset(b, t)->u64s);
-               sort_iter_add(&sort_iter,
+               sort_iter_add(&sort_iter.iter,
                              btree_bkey_first(b, t),
                              btree_bkey_last(b, t));
        }
@@ -322,7 +320,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
 
        start_time = local_clock();
 
-       u64s = bch2_sort_keys(out->keys.start, &sort_iter, filter_whiteouts);
+       u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter, filter_whiteouts);
 
        out->keys.u64s = cpu_to_le16(u64s);
 
@@ -338,7 +336,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
        start_bset->journal_seq = cpu_to_le64(seq);
 
        if (sorting_entire_node) {
-               unsigned u64s = le16_to_cpu(out->keys.u64s);
+               u64s = le16_to_cpu(out->keys.u64s);
 
                BUG_ON(bytes != btree_bytes(c));
 
@@ -412,8 +410,6 @@ void bch2_btree_sort_into(struct bch_fs *c,
        bch2_verify_btree_nr_keys(dst);
 }
 
-#define SORT_CRIT      (4096 / sizeof(u64))
-
 /*
  * We're about to add another bset to the btree node, so if there's currently
  * too many bsets - sort some of them together:
@@ -544,6 +540,7 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
        prt_str(out, ": ");
 }
 
+__printf(8, 9)
 static int __btree_err(int ret,
                       struct bch_fs *c,
                       struct bch_dev *ca,
@@ -624,9 +621,6 @@ __cold
 void bch2_btree_node_drop_keys_outside_node(struct btree *b)
 {
        struct bset_tree *t;
-       struct bkey_s_c k;
-       struct bkey unpacked;
-       struct btree_node_iter iter;
 
        for_each_bset(b, t) {
                struct bset *i = bset(b, t);
@@ -662,6 +656,9 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b)
        bch2_bset_set_no_aux_tree(b, b->set);
        bch2_btree_build_aux_trees(b);
 
+       struct bkey_s_c k;
+       struct bkey unpacked;
+       struct btree_node_iter iter;
        for_each_btree_node_key_unpack(b, k, &iter, &unpacked) {
                BUG_ON(bpos_lt(k.k->p, b->data->min_key));
                BUG_ON(bpos_gt(k.k->p, b->data->max_key));
@@ -910,7 +907,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
        bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
                BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
        unsigned u64s;
-       unsigned blacklisted_written, nonblacklisted_written = 0;
        unsigned ptr_written = btree_ptr_sectors_written(&b->key);
        struct printbuf buf = PRINTBUF;
        int ret = 0, retry_read = 0, write = READ;
@@ -920,8 +916,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
        b->written = 0;
 
        iter = mempool_alloc(&c->fill_iter, GFP_NOFS);
-       sort_iter_init(iter, b);
-       iter->size = (btree_blocks(c) + 1) * 2;
+       sort_iter_init(iter, b, (btree_blocks(c) + 1) * 2);
 
        if (bch2_meta_read_fault("btree"))
                btree_err(-BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL,
@@ -1045,8 +1040,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
                sort_iter_add(iter,
                              vstruct_idx(i, 0),
                              vstruct_last(i));
-
-               nonblacklisted_written = b->written;
        }
 
        if (ptr_written) {
@@ -1064,18 +1057,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
                                                                      true),
                                     -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, NULL,
                                     "found bset signature after last bset");
-
-               /*
-                * Blacklisted bsets are those that were written after the most recent
-                * (flush) journal write. Since there wasn't a flush, they may not have
-                * made it to all devices - which means we shouldn't write new bsets
-                * after them, as that could leave a gap and then reads from that device
-                * wouldn't find all the bsets in that btree node - which means it's
-                * important that we start writing new bsets after the most recent _non_
-                * blacklisted bset:
-                */
-               blacklisted_written = b->written;
-               b->written = nonblacklisted_written;
        }
 
        sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool);
@@ -1143,9 +1124,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
        btree_node_reset_sib_u64s(b);
 
        bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
-               struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+               struct bch_dev *ca2 = bch_dev_bkey_exists(c, ptr->dev);
 
-               if (ca->mi.state != BCH_MEMBER_STATE_rw)
+               if (ca2->mi.state != BCH_MEMBER_STATE_rw)
                        set_btree_node_need_rewrite(b);
        }
 
@@ -1227,19 +1208,17 @@ start:
        bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
                               rb->start_time);
        bio_put(&rb->bio);
-       printbuf_exit(&buf);
 
        if (saw_error && !btree_node_read_error(b)) {
-               struct printbuf buf = PRINTBUF;
-
+               printbuf_reset(&buf);
                bch2_bpos_to_text(&buf, b->key.k.p);
                bch_info(c, "%s: rewriting btree node at btree=%s level=%u %s due to error",
                         __func__, bch2_btree_ids[b->c.btree_id], b->c.level, buf.buf);
-               printbuf_exit(&buf);
 
                bch2_btree_node_rewrite_async(c, b);
        }
 
+       printbuf_exit(&buf);
        clear_btree_node_read_in_flight(b);
        wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
 }
@@ -1649,8 +1628,7 @@ err:
 int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
                        const struct bkey_i *k, unsigned level)
 {
-       return bch2_trans_run(c, __bch2_btree_root_read(&trans, id, k, level));
-
+       return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level));
 }
 
 void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
@@ -1712,15 +1690,13 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
 
 static void btree_node_write_done(struct bch_fs *c, struct btree *b)
 {
-       struct btree_trans trans;
-
-       bch2_trans_init(&trans, c, 0, 0);
+       struct btree_trans *trans = bch2_trans_get(c);
 
-       btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
+       btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
        __btree_node_write_done(c, b);
        six_unlock_read(&b->c.lock);
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 }
 
 static void btree_node_write_work(struct work_struct *work)
@@ -1749,7 +1725,7 @@ static void btree_node_write_work(struct work_struct *work)
                }
        } else {
                ret = bch2_trans_do(c, NULL, NULL, 0,
-                       bch2_btree_node_update_key_get_iter(&trans, b, &wbio->key,
+                       bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
                                        BCH_WATERMARK_reclaim|
                                        BTREE_INSERT_JOURNAL_RECLAIM|
                                        BTREE_INSERT_NOFAIL|
@@ -1854,7 +1830,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
        struct bset *i;
        struct btree_node *bn = NULL;
        struct btree_node_entry *bne = NULL;
-       struct sort_iter sort_iter;
+       struct sort_iter_stack sort_iter;
        struct nonce nonce;
        unsigned bytes_to_write, sectors_to_write, bytes, u64s;
        u64 seq = 0;
@@ -1927,7 +1903,7 @@ do_write:
 
        bch2_sort_whiteouts(c, b);
 
-       sort_iter_init(&sort_iter, b);
+       sort_iter_stack_init(&sort_iter, b);
 
        bytes = !b->written
                ? sizeof(struct btree_node)
@@ -1942,7 +1918,7 @@ do_write:
                        continue;
 
                bytes += le16_to_cpu(i->u64s) * sizeof(u64);
-               sort_iter_add(&sort_iter,
+               sort_iter_add(&sort_iter.iter,
                              btree_bkey_first(b, t),
                              btree_bkey_last(b, t));
                seq = max(seq, le64_to_cpu(i->journal_seq));
@@ -1971,14 +1947,14 @@ do_write:
        i->journal_seq  = cpu_to_le64(seq);
        i->u64s         = 0;
 
-       sort_iter_add(&sort_iter,
+       sort_iter_add(&sort_iter.iter,
                      unwritten_whiteouts_start(c, b),
                      unwritten_whiteouts_end(c, b));
        SET_BSET_SEPARATE_WHITEOUTS(i, false);
 
        b->whiteout_u64s = 0;
 
-       u64s = bch2_sort_keys(i->start, &sort_iter, false);
+       u64s = bch2_sort_keys(i->start, &sort_iter.iter, false);
        le16_add_cpu(&i->u64s, u64s);
 
        BUG_ON(!b->written && i->u64s != b->data->keys.u64s);
index cd99bbb00a5a944337ae8c1bf0827a213e048f19..7e03dd76fb380498a42bcdef91857727403a4d8a 100644 (file)
@@ -7,7 +7,7 @@
 #include "btree_locking.h"
 #include "checksum.h"
 #include "extents.h"
-#include "io_types.h"
+#include "io_write_types.h"
 
 struct bch_fs;
 struct btree_write;
index 5216d3391079d5b6f286b38ba27298569808f3ee..4cee5e6cd7f4aa5c7270d2f766ada56a6cef6ba3 100644 (file)
@@ -488,7 +488,6 @@ fixup_done:
        if (!bch2_btree_node_iter_end(node_iter) &&
            iter_current_key_modified &&
            b->c.level) {
-               struct bset_tree *t;
                struct bkey_packed *k, *k2, *p;
 
                k = bch2_btree_node_iter_peek_all(node_iter, b);
@@ -689,7 +688,7 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
                        if (t != BTREE_NODE_UNLOCKED) {
                                btree_node_unlock(trans, path, b->c.level);
                                six_lock_increment(&b->c.lock, (enum six_lock_type) t);
-                               mark_btree_node_locked(trans, path, b->c.level, (enum six_lock_type) t);
+                               mark_btree_node_locked(trans, path, b->c.level, t);
                        }
 
                        bch2_btree_path_level_init(trans, path, b);
@@ -764,7 +763,8 @@ static inline int btree_path_lock_root(struct btree_trans *trans,
                        for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
                                path->l[i].b = NULL;
 
-                       mark_btree_node_locked(trans, path, path->level, lock_type);
+                       mark_btree_node_locked(trans, path, path->level,
+                                              (enum btree_node_locked_type) lock_type);
                        bch2_btree_path_level_init(trans, path, b);
                        return 0;
                }
@@ -936,7 +936,8 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
        if (btree_node_read_locked(path, level + 1))
                btree_node_unlock(trans, path, level + 1);
 
-       mark_btree_node_locked(trans, path, level, lock_type);
+       mark_btree_node_locked(trans, path, level,
+                              (enum btree_node_locked_type) lock_type);
        path->level = level;
        bch2_btree_path_level_init(trans, path, b);
 
@@ -1341,14 +1342,14 @@ static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *p
        __bch2_path_free(trans, path);
 }
 
-void bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
+void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
 {
        panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
              trans->restart_count, restart_count,
              (void *) trans->last_begin_ip);
 }
 
-void bch2_trans_in_restart_error(struct btree_trans *trans)
+void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
 {
        panic("in transaction restart: %s, last restarted by %pS\n",
              bch2_err_str(trans->restarted),
@@ -1493,7 +1494,7 @@ static void bch2_trans_update_max_paths(struct btree_trans *trans)
 static noinline void btree_path_overflow(struct btree_trans *trans)
 {
        bch2_dump_trans_paths_updates(trans);
-       panic("trans path oveflow\n");
+       panic("trans path overflow\n");
 }
 
 static inline struct btree_path *btree_path_alloc(struct btree_trans *trans,
@@ -2046,8 +2047,12 @@ out:
 }
 
 /**
- * bch2_btree_iter_peek: returns first key greater than or equal to iterator's
- * current position
+ * bch2_btree_iter_peek_upto() - returns first key greater than or equal to
+ * iterator's current position
+ * @iter:      iterator to peek from
+ * @end:       search limit: returns keys less than or equal to @end
+ *
+ * Returns:    key if found, or an error extractable with bkey_err().
  */
 struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end)
 {
@@ -2184,10 +2189,13 @@ end:
 }
 
 /**
- * bch2_btree_iter_peek_all_levels: returns the first key greater than or equal
- * to iterator's current position, returning keys from every level of the btree.
- * For keys at different levels of the btree that compare equal, the key from
- * the lower level (leaf) is returned first.
+ * bch2_btree_iter_peek_all_levels() - returns the first key greater than or
+ * equal to iterator's current position, returning keys from every level of the
+ * btree. For keys at different levels of the btree that compare equal, the key
+ * from the lower level (leaf) is returned first.
+ * @iter:      iterator to peek from
+ *
+ * Returns:    key if found, or an error extractable with bkey_err().
  */
 struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter)
 {
@@ -2278,8 +2286,11 @@ out_no_locked:
 }
 
 /**
- * bch2_btree_iter_next: returns first key greater than iterator's current
+ * bch2_btree_iter_next() - returns first key greater than iterator's current
  * position
+ * @iter:      iterator to peek from
+ *
+ * Returns:    key if found, or an error extractable with bkey_err().
  */
 struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
 {
@@ -2290,8 +2301,11 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
 }
 
 /**
- * bch2_btree_iter_peek_prev: returns first key less than or equal to
+ * bch2_btree_iter_peek_prev() - returns first key less than or equal to
  * iterator's current position
+ * @iter:      iterator to peek from
+ *
+ * Returns:    key if found, or an error extractable with bkey_err().
  */
 struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
 {
@@ -2414,8 +2428,11 @@ out_no_locked:
 }
 
 /**
- * bch2_btree_iter_prev: returns first key less than iterator's current
+ * bch2_btree_iter_prev() - returns first key less than iterator's current
  * position
+ * @iter:      iterator to peek from
+ *
+ * Returns:    key if found, or an error extractable with bkey_err().
  */
 struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
 {
@@ -2722,7 +2739,7 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
 
 void bch2_trans_iter_init_outlined(struct btree_trans *trans,
                          struct btree_iter *iter,
-                         unsigned btree_id, struct bpos pos,
+                         enum btree_id btree_id, struct bpos pos,
                          unsigned flags)
 {
        bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
@@ -2830,6 +2847,8 @@ static noinline void bch2_trans_reset_srcu_lock(struct btree_trans *trans)
  * bch2_trans_begin() - reset a transaction after a interrupted attempt
  * @trans: transaction to reset
  *
+ * Returns:    current restart counter, to be used with trans_was_restarted()
+ *
  * While iterating over nodes or updating nodes a attempt to lock a btree node
  * may return BCH_ERR_transaction_restart when the trylock fails. When this
  * occurs bch2_trans_begin() should be called and the transaction retried.
@@ -2887,28 +2906,23 @@ u32 bch2_trans_begin(struct btree_trans *trans)
        return trans->restart_count;
 }
 
-static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c)
+static struct btree_trans *bch2_trans_alloc(struct bch_fs *c)
 {
-       size_t paths_bytes      = sizeof(struct btree_path) * BTREE_ITER_MAX;
-       size_t updates_bytes    = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX;
-       void *p = NULL;
-
-       BUG_ON(trans->used_mempool);
+       struct btree_trans *trans;
 
-#ifdef __KERNEL__
-       p = this_cpu_xchg(c->btree_paths_bufs->path, NULL);
-#endif
-       if (!p) {
-               p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS);
-               /*
-                * paths need to be zeroed, bch2_check_for_deadlock looks at
-                * paths in other threads
-                */
-               memset(p, 0, paths_bytes);
+       if (IS_ENABLED(__KERNEL__)) {
+               trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL);
+               if (trans)
+                       return trans;
        }
 
-       trans->paths            = p; p += paths_bytes;
-       trans->updates          = p; p += updates_bytes;
+       trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
+       /*
+        * paths need to be zeroed, bch2_check_for_deadlock looks at
+        * paths in other threads
+        */
+       memset(&trans->paths, 0, sizeof(trans->paths));
+       return trans;
 }
 
 const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
@@ -2928,13 +2942,16 @@ unsigned bch2_trans_get_fn_idx(const char *fn)
        return i;
 }
 
-void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_idx)
+struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
        __acquires(&c->btree_trans_barrier)
 {
+       struct btree_trans *trans;
        struct btree_transaction_stats *s;
 
        bch2_assert_btree_nodes_not_locked();
 
+       trans = bch2_trans_alloc(c);
+
        memset(trans, 0, sizeof(*trans));
        trans->c                = c;
        trans->fn               = fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns)
@@ -2946,8 +2963,6 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_
                !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
        closure_init_stack(&trans->ref);
 
-       bch2_trans_alloc_paths(trans, c);
-
        s = btree_trans_stats(trans);
        if (s && s->max_mem) {
                unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
@@ -2993,6 +3008,8 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_
 list_add_done:
                seqmutex_unlock(&c->btree_trans_lock);
        }
+
+       return trans;
 }
 
 static void check_btree_paths_leaked(struct btree_trans *trans)
@@ -3017,7 +3034,7 @@ leaked:
 #endif
 }
 
-void bch2_trans_exit(struct btree_trans *trans)
+void bch2_trans_put(struct btree_trans *trans)
        __releases(&c->btree_trans_barrier)
 {
        struct btree_insert_entry *i;
@@ -3063,18 +3080,11 @@ void bch2_trans_exit(struct btree_trans *trans)
        else
                kfree(trans->mem);
 
-#ifdef __KERNEL__
-       /*
-        * Userspace doesn't have a real percpu implementation:
-        */
-       trans->paths = this_cpu_xchg(c->btree_paths_bufs->path, trans->paths);
-#endif
-
-       if (trans->paths)
-               mempool_free(trans->paths, &c->btree_paths_pool);
-
-       trans->mem      = (void *) 0x1;
-       trans->paths    = (void *) 0x1;
+       /* Userspace doesn't have a real percpu implementation: */
+       if (IS_ENABLED(__KERNEL__))
+               trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans);
+       if (trans)
+               mempool_free(trans, &c->btree_trans_pool);
 }
 
 static void __maybe_unused
@@ -3152,6 +3162,17 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
 void bch2_fs_btree_iter_exit(struct bch_fs *c)
 {
        struct btree_transaction_stats *s;
+       struct btree_trans *trans;
+       int cpu;
+
+       trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list);
+       if (trans)
+               panic("%s leaked btree_trans\n", trans->fn);
+
+       if (c->btree_trans_bufs)
+               for_each_possible_cpu(cpu)
+                       kfree(per_cpu_ptr(c->btree_trans_bufs, cpu)->trans);
+       free_percpu(c->btree_trans_bufs);
 
        for (s = c->btree_transaction_stats;
             s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
@@ -3163,13 +3184,12 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
        if (c->btree_trans_barrier_initialized)
                cleanup_srcu_struct(&c->btree_trans_barrier);
        mempool_exit(&c->btree_trans_mem_pool);
-       mempool_exit(&c->btree_paths_pool);
+       mempool_exit(&c->btree_trans_pool);
 }
 
 int bch2_fs_btree_iter_init(struct bch_fs *c)
 {
        struct btree_transaction_stats *s;
-       unsigned nr = BTREE_ITER_MAX;
        int ret;
 
        for (s = c->btree_transaction_stats;
@@ -3182,9 +3202,12 @@ int bch2_fs_btree_iter_init(struct bch_fs *c)
        INIT_LIST_HEAD(&c->btree_trans_list);
        seqmutex_init(&c->btree_trans_lock);
 
-       ret   = mempool_init_kmalloc_pool(&c->btree_paths_pool, 1,
-                       sizeof(struct btree_path) * nr +
-                       sizeof(struct btree_insert_entry) * nr) ?:
+       c->btree_trans_bufs = alloc_percpu(struct btree_trans_buf);
+       if (!c->btree_trans_bufs)
+               return -ENOMEM;
+
+       ret   = mempool_init_kmalloc_pool(&c->btree_trans_pool, 1,
+                                         sizeof(struct btree_trans)) ?:
                mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
                                          BTREE_TRANS_MEM_MAX) ?:
                init_srcu_struct(&c->btree_trans_barrier);
index 8876f2b829fadc8c55830005bc6e06e0ab2c28ab..fbe273453db36d2fa926e6c61c977926818eabbb 100644 (file)
@@ -276,12 +276,14 @@ int bch2_trans_relock_notrace(struct btree_trans *);
 void bch2_trans_unlock(struct btree_trans *);
 bool bch2_trans_locked(struct btree_trans *);
 
-static inline bool trans_was_restarted(struct btree_trans *trans, u32 restart_count)
+static inline int trans_was_restarted(struct btree_trans *trans, u32 restart_count)
 {
-       return restart_count != trans->restart_count;
+       return restart_count != trans->restart_count
+               ? -BCH_ERR_transaction_restart_nested
+               : 0;
 }
 
-void bch2_trans_restart_error(struct btree_trans *, u32);
+void __noreturn bch2_trans_restart_error(struct btree_trans *, u32);
 
 static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans,
                                                   u32 restart_count)
@@ -290,7 +292,7 @@ static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans,
                bch2_trans_restart_error(trans, restart_count);
 }
 
-void bch2_trans_in_restart_error(struct btree_trans *);
+void __noreturn bch2_trans_in_restart_error(struct btree_trans *);
 
 static inline void bch2_trans_verify_not_in_restart(struct btree_trans *trans)
 {
@@ -463,7 +465,7 @@ static inline void bch2_trans_iter_init_common(struct btree_trans *trans,
 }
 
 void bch2_trans_iter_init_outlined(struct btree_trans *, struct btree_iter *,
-                         unsigned, struct bpos, unsigned);
+                         enum btree_id, struct bpos, unsigned);
 
 static inline void bch2_trans_iter_init(struct btree_trans *trans,
                          struct btree_iter *iter,
@@ -672,17 +674,17 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
 #define lockrestart_do(_trans, _do)                                    \
 ({                                                                     \
        u32 _restart_count;                                             \
-       int _ret;                                                       \
+       int _ret2;                                                      \
                                                                        \
        do {                                                            \
                _restart_count = bch2_trans_begin(_trans);              \
-               _ret = (_do);                                           \
-       } while (bch2_err_matches(_ret, BCH_ERR_transaction_restart));  \
+               _ret2 = (_do);                                          \
+       } while (bch2_err_matches(_ret2, BCH_ERR_transaction_restart)); \
                                                                        \
-       if (!_ret)                                                      \
+       if (!_ret2)                                                     \
                bch2_trans_verify_not_restarted(_trans, _restart_count);\
                                                                        \
-       _ret;                                                           \
+       _ret2;                                                          \
 })
 
 /*
@@ -697,26 +699,23 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
 #define nested_lockrestart_do(_trans, _do)                             \
 ({                                                                     \
        u32 _restart_count, _orig_restart_count;                        \
-       int _ret;                                                       \
+       int _ret2;                                                      \
                                                                        \
        _restart_count = _orig_restart_count = (_trans)->restart_count; \
                                                                        \
-       while (bch2_err_matches(_ret = (_do), BCH_ERR_transaction_restart))\
+       while (bch2_err_matches(_ret2 = (_do), BCH_ERR_transaction_restart))\
                _restart_count = bch2_trans_begin(_trans);              \
                                                                        \
-       if (!_ret)                                                      \
+       if (!_ret2)                                                     \
                bch2_trans_verify_not_restarted(_trans, _restart_count);\
                                                                        \
-       if (!_ret && trans_was_restarted(_trans, _orig_restart_count))  \
-               _ret = -BCH_ERR_transaction_restart_nested;             \
-                                                                       \
-       _ret;                                                           \
+       _ret2 ?: trans_was_restarted(_trans, _restart_count);           \
 })
 
 #define for_each_btree_key2(_trans, _iter, _btree_id,                  \
                            _start, _flags, _k, _do)                    \
 ({                                                                     \
-       int _ret = 0;                                                   \
+       int _ret3 = 0;                                                  \
                                                                        \
        bch2_trans_iter_init((_trans), &(_iter), (_btree_id),           \
                             (_start), (_flags));                       \
@@ -724,15 +723,15 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
        while (1) {                                                     \
                u32 _restart_count = bch2_trans_begin(_trans);          \
                                                                        \
-               _ret = 0;                                               \
+               _ret3 = 0;                                              \
                (_k) = bch2_btree_iter_peek_type(&(_iter), (_flags));   \
                if (!(_k).k)                                            \
                        break;                                          \
                                                                        \
-               _ret = bkey_err(_k) ?: (_do);                           \
-               if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+               _ret3 = bkey_err(_k) ?: (_do);                          \
+               if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\
                        continue;                                       \
-               if (_ret)                                               \
+               if (_ret3)                                              \
                        break;                                          \
                bch2_trans_verify_not_restarted(_trans, _restart_count);\
                if (!bch2_btree_iter_advance(&(_iter)))                 \
@@ -740,13 +739,13 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
        }                                                               \
                                                                        \
        bch2_trans_iter_exit((_trans), &(_iter));                       \
-       _ret;                                                           \
+       _ret3;                                                          \
 })
 
 #define for_each_btree_key2_upto(_trans, _iter, _btree_id,             \
                            _start, _end, _flags, _k, _do)              \
 ({                                                                     \
-       int _ret = 0;                                                   \
+       int _ret3 = 0;                                                  \
                                                                        \
        bch2_trans_iter_init((_trans), &(_iter), (_btree_id),           \
                             (_start), (_flags));                       \
@@ -754,15 +753,15 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
        while (1) {                                                     \
                u32 _restart_count = bch2_trans_begin(_trans);          \
                                                                        \
-               _ret = 0;                                               \
+               _ret3 = 0;                                              \
                (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, (_flags));\
                if (!(_k).k)                                            \
                        break;                                          \
                                                                        \
-               _ret = bkey_err(_k) ?: (_do);                           \
-               if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+               _ret3 = bkey_err(_k) ?: (_do);                          \
+               if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\
                        continue;                                       \
-               if (_ret)                                               \
+               if (_ret3)                                              \
                        break;                                          \
                bch2_trans_verify_not_restarted(_trans, _restart_count);\
                if (!bch2_btree_iter_advance(&(_iter)))                 \
@@ -770,13 +769,13 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
        }                                                               \
                                                                        \
        bch2_trans_iter_exit((_trans), &(_iter));                       \
-       _ret;                                                           \
+       _ret3;                                                          \
 })
 
 #define for_each_btree_key_reverse(_trans, _iter, _btree_id,           \
                                   _start, _flags, _k, _do)             \
 ({                                                                     \
-       int _ret = 0;                                                   \
+       int _ret3 = 0;                                                  \
                                                                        \
        bch2_trans_iter_init((_trans), &(_iter), (_btree_id),           \
                             (_start), (_flags));                       \
@@ -785,14 +784,14 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
                u32 _restart_count = bch2_trans_begin(_trans);          \
                (_k) = bch2_btree_iter_peek_prev_type(&(_iter), (_flags));\
                if (!(_k).k) {                                          \
-                       _ret = 0;                                       \
+                       _ret3 = 0;                                      \
                        break;                                          \
                }                                                       \
                                                                        \
-               _ret = bkey_err(_k) ?: (_do);                           \
-               if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+               _ret3 = bkey_err(_k) ?: (_do);                          \
+               if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\
                        continue;                                       \
-               if (_ret)                                               \
+               if (_ret3)                                              \
                        break;                                          \
                bch2_trans_verify_not_restarted(_trans, _restart_count);\
                if (!bch2_btree_iter_rewind(&(_iter)))                  \
@@ -800,7 +799,7 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
        }                                                               \
                                                                        \
        bch2_trans_iter_exit((_trans), &(_iter));                       \
-       _ret;                                                           \
+       _ret3;                                                          \
 })
 
 #define for_each_btree_key_commit(_trans, _iter, _btree_id,            \
@@ -916,21 +915,21 @@ void bch2_btree_path_to_text(struct printbuf *, struct btree_path *);
 void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
 void bch2_dump_trans_updates(struct btree_trans *);
 void bch2_dump_trans_paths_updates(struct btree_trans *);
-void __bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned);
-void bch2_trans_exit(struct btree_trans *);
+
+struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned);
+void bch2_trans_put(struct btree_trans *);
 
 extern const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
 unsigned bch2_trans_get_fn_idx(const char *);
 
-#define bch2_trans_init(_trans, _c, _nr_iters, _mem)                   \
-do {                                                                   \
+#define bch2_trans_get(_c)                                             \
+({                                                                     \
        static unsigned trans_fn_idx;                                   \
                                                                        \
        if (unlikely(!trans_fn_idx))                                    \
                trans_fn_idx = bch2_trans_get_fn_idx(__func__);         \
-                                                                       \
-       __bch2_trans_init(_trans, _c, trans_fn_idx);                    \
-} while (0)
+       __bch2_trans_get(_c, trans_fn_idx);                             \
+})
 
 void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *);
 
index f7c001d42391faa5151f1c807514db35392f3abe..1407f69140c2060c4f85ccda0f4f59d18f5e2304 100644 (file)
@@ -243,8 +243,6 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
        }
 
        if (ck) {
-               int ret;
-
                ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_);
                if (unlikely(ret)) {
                        bkey_cached_move_to_freelist(bc, ck);
@@ -253,7 +251,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
 
                path->l[0].b = (void *) ck;
                path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
-               mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
+               mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
 
                ret = bch2_btree_node_lock_write(trans, path, &ck->c);
                if (unlikely(ret)) {
@@ -331,7 +329,7 @@ btree_key_cache_create(struct btree_trans *trans, struct btree_path *path)
                        return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create);
                }
 
-               mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
+               mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
        }
 
        ck->c.level             = 0;
@@ -479,7 +477,7 @@ retry:
                if (!ck)
                        goto retry;
 
-               mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
+               mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
                path->locks_want = 1;
        } else {
                enum six_lock_type lock_want = __btree_lock_want(path, 0);
@@ -497,7 +495,8 @@ retry:
                        goto retry;
                }
 
-               mark_btree_node_locked(trans, path, 0, lock_want);
+               mark_btree_node_locked(trans, path, 0,
+                                      (enum btree_node_locked_type) lock_want);
        }
 
        path->l[0].lock_seq     = six_lock_seq(&ck->c.lock);
@@ -579,7 +578,8 @@ retry:
                        goto retry;
                }
 
-               mark_btree_node_locked(trans, path, 0, lock_want);
+               mark_btree_node_locked(trans, path, 0,
+                                      (enum btree_node_locked_type) lock_want);
        }
 
        path->l[0].lock_seq     = six_lock_seq(&ck->c.lock);
@@ -705,13 +705,11 @@ int bch2_btree_key_cache_journal_flush(struct journal *j,
        struct bkey_cached *ck =
                container_of(pin, struct bkey_cached, journal);
        struct bkey_cached_key key;
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
        int ret = 0;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
-       btree_node_lock_nopath_nofail(&trans, &ck->c, SIX_LOCK_read);
+       btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read);
        key = ck->key;
 
        if (ck->journal.seq != seq ||
@@ -728,13 +726,13 @@ int bch2_btree_key_cache_journal_flush(struct journal *j,
        }
        six_unlock_read(&ck->c.lock);
 
-       ret = commit_do(&trans, NULL, NULL, 0,
-               btree_key_cache_flush_pos(&trans, key, seq,
+       ret = commit_do(trans, NULL, NULL, 0,
+               btree_key_cache_flush_pos(trans, key, seq,
                                BTREE_INSERT_JOURNAL_RECLAIM, false));
 unlock:
        srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        return ret;
 }
 
@@ -1065,7 +1063,7 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
 
 void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
 {
-       prt_printf(out, "nr_freed:\t%zu",       atomic_long_read(&c->nr_freed));
+       prt_printf(out, "nr_freed:\t%lu",       atomic_long_read(&c->nr_freed));
        prt_newline(out);
        prt_printf(out, "nr_keys:\t%lu",        atomic_long_read(&c->nr_keys));
        prt_newline(out);
index 22e2cd3914a524c574bbf9e07451a5d13df7d159..6231e9ffc5d7497b693febe166e64560a6f024c9 100644 (file)
@@ -91,7 +91,7 @@ static inline void mark_btree_node_unlocked(struct btree_path *path,
 static inline void mark_btree_node_locked(struct btree_trans *trans,
                                          struct btree_path *path,
                                          unsigned level,
-                                         enum six_lock_type type)
+                                         enum btree_node_locked_type type)
 {
        mark_btree_node_locked_noreset(path, level, (enum btree_node_locked_type) type);
 #ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
index eafb0388ef820a9628bb73254fad0f065ebbef35..04c1f4610972599686c18ada9591a1f83dbd1557 100644 (file)
@@ -163,13 +163,11 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct btree_write *w = container_of(pin, struct btree_write, journal);
        struct btree *b = container_of(w, struct btree, writes[i]);
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        unsigned long old, new, v;
        unsigned idx = w - b->writes;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
-       btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
+       btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
        v = READ_ONCE(b->flags);
 
        do {
@@ -188,7 +186,7 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
        btree_node_write_if_need(c, b, SIX_LOCK_read);
        six_unlock_read(&b->c.lock);
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        return 0;
 }
 
@@ -214,7 +212,11 @@ inline void bch2_btree_add_journal_pin(struct bch_fs *c,
 }
 
 /**
- * btree_insert_key - insert a key one key into a leaf node
+ * bch2_btree_insert_key_leaf() - insert a key one key into a leaf node
+ * @trans:             btree transaction object
+ * @path:              path pointing to @insert's pos
+ * @insert:            key to insert
+ * @journal_seq:       sequence number of journal reservation
  */
 inline void bch2_btree_insert_key_leaf(struct btree_trans *trans,
                                       struct btree_path *path,
@@ -555,7 +557,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
        struct btree_write_buffered_key *wb;
        struct btree_trans_commit_hook *h;
        unsigned u64s = 0;
-       bool marking = false;
        int ret;
 
        if (race_fault()) {
@@ -584,9 +585,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
                        *stopped_at = i;
                        return ret;
                }
-
-               if (btree_node_type_needs_gc(i->bkey_type))
-                       marking = true;
        }
 
        if (trans->nr_wb_updates &&
@@ -778,7 +776,6 @@ static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans
                bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p);
 }
 
-#ifdef CONFIG_BCACHEFS_DEBUG
 static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, unsigned flags,
                                                   struct btree_insert_entry *i,
                                                   struct printbuf *err)
@@ -804,7 +801,6 @@ static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, un
 
        return -EINVAL;
 }
-#endif
 
 /*
  * Get journal reservation, take write locks, and attempt to do btree update(s):
@@ -1029,7 +1025,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
        if (ret)
                goto out_reset;
 
-#ifdef CONFIG_BCACHEFS_DEBUG
        trans_for_each_update(trans, i) {
                struct printbuf buf = PRINTBUF;
                enum bkey_invalid_flags invalid_flags = 0;
@@ -1046,7 +1041,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
                if (ret)
                        return ret;
        }
-#endif
 
        if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) {
                ret = do_bch2_trans_commit_to_journal_replay(trans);
index 71ad3893e83d0546f4ccac93e4b12585cb8f0018..67ecb5e4e57e6bad2415d498d4a08d0bbf71589c 100644 (file)
@@ -194,34 +194,34 @@ struct btree_node_iter {
 /*
  * Iterate over all possible positions, synthesizing deleted keys for holes:
  */
-static const u16 BTREE_ITER_SLOTS              = 1 << 0;
-static const u16 BTREE_ITER_ALL_LEVELS         = 1 << 1;
+static const __maybe_unused u16 BTREE_ITER_SLOTS               = 1 << 0;
+static const __maybe_unused u16 BTREE_ITER_ALL_LEVELS          = 1 << 1;
 /*
  * Indicates that intent locks should be taken on leaf nodes, because we expect
  * to be doing updates:
  */
-static const u16 BTREE_ITER_INTENT             = 1 << 2;
+static const __maybe_unused u16 BTREE_ITER_INTENT              = 1 << 2;
 /*
  * Causes the btree iterator code to prefetch additional btree nodes from disk:
  */
-static const u16 BTREE_ITER_PREFETCH           = 1 << 3;
+static const __maybe_unused u16 BTREE_ITER_PREFETCH            = 1 << 3;
 /*
  * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
  * @pos or the first key strictly greater than @pos
  */
-static const u16 BTREE_ITER_IS_EXTENTS         = 1 << 4;
-static const u16 BTREE_ITER_NOT_EXTENTS                = 1 << 5;
-static const u16 BTREE_ITER_CACHED             = 1 << 6;
-static const u16 BTREE_ITER_WITH_KEY_CACHE     = 1 << 7;
-static const u16 BTREE_ITER_WITH_UPDATES       = 1 << 8;
-static const u16 BTREE_ITER_WITH_JOURNAL       = 1 << 9;
-static const u16 __BTREE_ITER_ALL_SNAPSHOTS    = 1 << 10;
-static const u16 BTREE_ITER_ALL_SNAPSHOTS      = 1 << 11;
-static const u16 BTREE_ITER_FILTER_SNAPSHOTS   = 1 << 12;
-static const u16 BTREE_ITER_NOPRESERVE         = 1 << 13;
-static const u16 BTREE_ITER_CACHED_NOFILL      = 1 << 14;
-static const u16 BTREE_ITER_KEY_CACHE_FILL     = 1 << 15;
-#define __BTREE_ITER_FLAGS_END                        16
+static const __maybe_unused u16 BTREE_ITER_IS_EXTENTS          = 1 << 4;
+static const __maybe_unused u16 BTREE_ITER_NOT_EXTENTS         = 1 << 5;
+static const __maybe_unused u16 BTREE_ITER_CACHED              = 1 << 6;
+static const __maybe_unused u16 BTREE_ITER_WITH_KEY_CACHE      = 1 << 7;
+static const __maybe_unused u16 BTREE_ITER_WITH_UPDATES                = 1 << 8;
+static const __maybe_unused u16 BTREE_ITER_WITH_JOURNAL                = 1 << 9;
+static const __maybe_unused u16 __BTREE_ITER_ALL_SNAPSHOTS     = 1 << 10;
+static const __maybe_unused u16 BTREE_ITER_ALL_SNAPSHOTS       = 1 << 11;
+static const __maybe_unused u16 BTREE_ITER_FILTER_SNAPSHOTS    = 1 << 12;
+static const __maybe_unused u16 BTREE_ITER_NOPRESERVE          = 1 << 13;
+static const __maybe_unused u16 BTREE_ITER_CACHED_NOFILL       = 1 << 14;
+static const __maybe_unused u16 BTREE_ITER_KEY_CACHE_FILL      = 1 << 15;
+#define __BTREE_ITER_FLAGS_END                                        16
 
 enum btree_path_uptodate {
        BTREE_ITER_UPTODATE             = 0,
@@ -459,8 +459,8 @@ struct btree_trans {
        void                    *mem;
 
        u8                      sorted[BTREE_ITER_MAX + 8];
-       struct btree_path       *paths;
-       struct btree_insert_entry *updates;
+       struct btree_path       paths[BTREE_ITER_MAX];
+       struct btree_insert_entry updates[BTREE_ITER_MAX];
        struct btree_write_buffered_key *wb_updates;
 
        /* update path: */
index 880ce74318945aee7252ce0be8d669f65d0a2210..324767c0ddccd7457004a34e8ed6e49da8c54b85 100644 (file)
@@ -124,7 +124,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
        struct bkey_s_c old_k, new_k;
        snapshot_id_list s;
        struct bkey_i *update;
-       int ret;
+       int ret = 0;
 
        if (!bch2_snapshot_has_children(c, old_pos.snapshot))
                return 0;
@@ -466,11 +466,49 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
        return 0;
 }
 
+static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
+                                                   struct btree_iter *iter,
+                                                   struct btree_path *path)
+{
+       if (!iter->key_cache_path ||
+           !iter->key_cache_path->should_be_locked ||
+           !bpos_eq(iter->key_cache_path->pos, iter->pos)) {
+               struct bkey_cached *ck;
+               int ret;
+
+               if (!iter->key_cache_path)
+                       iter->key_cache_path =
+                               bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
+                                             BTREE_ITER_INTENT|
+                                             BTREE_ITER_CACHED, _THIS_IP_);
+
+               iter->key_cache_path =
+                       bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos,
+                                               iter->flags & BTREE_ITER_INTENT,
+                                               _THIS_IP_);
+
+               ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
+                                              BTREE_ITER_CACHED);
+               if (unlikely(ret))
+                       return ret;
+
+               ck = (void *) iter->key_cache_path->l[0].b;
+
+               if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+                       trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_);
+                       return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
+               }
+
+               btree_path_set_should_be_locked(iter->key_cache_path);
+       }
+
+       return 0;
+}
+
 int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
                                   struct bkey_i *k, enum btree_update_flags flags)
 {
        struct btree_path *path = iter->update_path ?: iter->path;
-       struct bkey_cached *ck;
        int ret;
 
        if (iter->flags & BTREE_ITER_IS_EXTENTS)
@@ -494,34 +532,9 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
            !path->cached &&
            !path->level &&
            btree_id_cached(trans->c, path->btree_id)) {
-               if (!iter->key_cache_path ||
-                   !iter->key_cache_path->should_be_locked ||
-                   !bpos_eq(iter->key_cache_path->pos, k->k.p)) {
-                       if (!iter->key_cache_path)
-                               iter->key_cache_path =
-                                       bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
-                                                     BTREE_ITER_INTENT|
-                                                     BTREE_ITER_CACHED, _THIS_IP_);
-
-                       iter->key_cache_path =
-                               bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos,
-                                                       iter->flags & BTREE_ITER_INTENT,
-                                                       _THIS_IP_);
-
-                       ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
-                                                      BTREE_ITER_CACHED);
-                       if (unlikely(ret))
-                               return ret;
-
-                       ck = (void *) iter->key_cache_path->l[0].b;
-
-                       if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
-                               trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_);
-                               return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
-                       }
-
-                       btree_path_set_should_be_locked(iter->key_cache_path);
-               }
+               ret = bch2_trans_update_get_key_cache(trans, iter, path);
+               if (ret)
+                       return ret;
 
                path = iter->key_cache_path;
        }
@@ -640,6 +653,7 @@ int bch2_btree_insert_nonextent(struct btree_trans *trans,
        int ret;
 
        bch2_trans_iter_init(trans, &iter, btree, k->k.p,
+                            BTREE_ITER_CACHED|
                             BTREE_ITER_NOT_EXTENTS|
                             BTREE_ITER_INTENT);
        ret   = bch2_btree_iter_traverse(&iter) ?:
@@ -648,8 +662,8 @@ int bch2_btree_insert_nonextent(struct btree_trans *trans,
        return ret;
 }
 
-int __bch2_btree_insert(struct btree_trans *trans, enum btree_id id,
-                       struct bkey_i *k, enum btree_update_flags flags)
+int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id,
+                           struct bkey_i *k, enum btree_update_flags flags)
 {
        struct btree_iter iter;
        int ret;
@@ -667,16 +681,18 @@ int __bch2_btree_insert(struct btree_trans *trans, enum btree_id id,
  * bch2_btree_insert - insert keys into the extent btree
  * @c:                 pointer to struct bch_fs
  * @id:                        btree to insert into
- * @insert_keys:       list of keys to insert
- * @hook:              insert callback
+ * @k:                 key to insert
+ * @disk_res:          must be non-NULL whenever inserting or potentially
+ *                     splitting data extents
+ * @flags:             transaction commit flags
+ *
+ * Returns:            0 on success, error code on failure
  */
-int bch2_btree_insert(struct bch_fs *c, enum btree_id id,
-                     struct bkey_i *k,
-                     struct disk_reservation *disk_res,
-                     u64 *journal_seq, int flags)
+int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k,
+                     struct disk_reservation *disk_res, int flags)
 {
-       return bch2_trans_do(c, disk_res, journal_seq, flags,
-                            __bch2_btree_insert(&trans, id, k, 0));
+       return bch2_trans_do(c, disk_res, NULL, flags,
+                            bch2_btree_insert_trans(trans, id, k, 0));
 }
 
 int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter,
@@ -714,6 +730,23 @@ int bch2_btree_delete_at_buffered(struct btree_trans *trans,
        return bch2_trans_update_buffered(trans, btree, k);
 }
 
+int bch2_btree_delete(struct btree_trans *trans,
+                     enum btree_id btree, struct bpos pos,
+                     unsigned update_flags)
+{
+       struct btree_iter iter;
+       int ret;
+
+       bch2_trans_iter_init(trans, &iter, btree, pos,
+                            BTREE_ITER_CACHED|
+                            BTREE_ITER_INTENT);
+       ret   = bch2_btree_iter_traverse(&iter) ?:
+               bch2_btree_delete_at(trans, &iter, update_flags);
+       bch2_trans_iter_exit(trans, &iter);
+
+       return ret;
+}
+
 int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
                                  struct bpos start, struct bpos end,
                                  unsigned update_flags,
@@ -777,9 +810,7 @@ err:
        }
        bch2_trans_iter_exit(trans, &iter);
 
-       if (!ret && trans_was_restarted(trans, restart_count))
-               ret = -BCH_ERR_transaction_restart_nested;
-       return ret;
+       return ret ?: trans_was_restarted(trans, restart_count);
 }
 
 /*
@@ -793,7 +824,7 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
                            u64 *journal_seq)
 {
        int ret = bch2_trans_run(c,
-                       bch2_btree_delete_range_trans(&trans, id, start, end,
+                       bch2_btree_delete_range_trans(trans, id, start, end,
                                                      update_flags, journal_seq));
        if (ret == -BCH_ERR_transaction_restart_nested)
                ret = 0;
@@ -818,6 +849,7 @@ int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
        return bch2_trans_update_buffered(trans, btree, k);
 }
 
+__printf(2, 0)
 static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args)
 {
        struct printbuf buf = PRINTBUF;
@@ -854,6 +886,7 @@ err:
        return ret;
 }
 
+__printf(3, 0)
 static int
 __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
                  va_list args)
@@ -865,12 +898,13 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
        } else {
                ret = bch2_trans_do(c, NULL, NULL,
                        BTREE_INSERT_LAZY_RW|commit_flags,
-                       __bch2_trans_log_msg(&trans.extra_journal_entries, fmt, args));
+                       __bch2_trans_log_msg(&trans->extra_journal_entries, fmt, args));
        }
 
        return ret;
 }
 
+__printf(2, 3)
 int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...)
 {
        va_list args;
@@ -886,6 +920,7 @@ int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...)
  * Use for logging messages during recovery to enable reserved space and avoid
  * blocking.
  */
+__printf(2, 3)
 int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...)
 {
        va_list args;
index 901c42b57c3590f4386e2687b32359e6d676d51e..9816d22865403043c6caa819b3f249a2e10ea6fa 100644 (file)
@@ -4,7 +4,6 @@
 
 #include "btree_iter.h"
 #include "journal.h"
-#include "journal.h"
 
 struct bch_fs;
 struct btree;
@@ -58,14 +57,15 @@ int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *,
                                unsigned, unsigned);
 int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned);
 int bch2_btree_delete_at_buffered(struct btree_trans *, enum btree_id, struct bpos);
+int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned);
 
 int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id,
                                struct bkey_i *, enum btree_update_flags);
 
-int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *,
+int bch2_btree_insert_trans(struct btree_trans *, enum btree_id, struct bkey_i *,
                        enum btree_update_flags);
 int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
-                    struct disk_reservation *, u64 *, int flags);
+                    struct disk_reservation *, int flags);
 
 int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id,
                                  struct bpos, struct bpos, unsigned, u64 *);
@@ -114,8 +114,8 @@ void bch2_trans_commit_hook(struct btree_trans *,
                            struct btree_trans_commit_hook *);
 int __bch2_trans_commit(struct btree_trans *, unsigned);
 
-int bch2_fs_log_msg(struct bch_fs *, const char *, ...);
-int bch2_journal_log_msg(struct bch_fs *, const char *, ...);
+__printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...);
+__printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...);
 
 /**
  * bch2_trans_commit - insert keys at given iterator positions
@@ -145,30 +145,17 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
        nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
                                        (_journal_seq), (_flags)))
 
-#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do)                \
-({                                                                     \
-       struct btree_trans trans;                                       \
-       int _ret;                                                       \
-                                                                       \
-       bch2_trans_init(&trans, (_c), 0, 0);                            \
-       _ret = commit_do(&trans, _disk_res, _journal_seq, _flags, _do); \
-       bch2_trans_exit(&trans);                                        \
-                                                                       \
-       _ret;                                                           \
-})
-
 #define bch2_trans_run(_c, _do)                                                \
 ({                                                                     \
-       struct btree_trans trans;                                       \
-       int _ret;                                                       \
-                                                                       \
-       bch2_trans_init(&trans, (_c), 0, 0);                            \
-       _ret = (_do);                                                   \
-       bch2_trans_exit(&trans);                                        \
-                                                                       \
+       struct btree_trans *trans = bch2_trans_get(_c);                 \
+       int _ret = (_do);                                               \
+       bch2_trans_put(trans);                                          \
        _ret;                                                           \
 })
 
+#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do)                \
+       bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do))
+
 #define trans_for_each_update(_trans, _i)                              \
        for ((_i) = (_trans)->updates;                                  \
             (_i) < (_trans)->updates + (_trans)->nr_updates;           \
index c741150e68af971d144e752df543078bf1396bf8..7dbf6b6c7f3481e0d0ab0e15b9b5ec6c501131f0 100644 (file)
@@ -143,10 +143,15 @@ static size_t btree_node_u64s_with_format(struct btree *b,
 }
 
 /**
- * btree_node_format_fits - check if we could rewrite node with a new format
+ * bch2_btree_node_format_fits - check if we could rewrite node with a new format
  *
- * This assumes all keys can pack with the new format -- it just checks if
- * the re-packed keys would fit inside the node itself.
+ * @c:         filesystem handle
+ * @b:         btree node to rewrite
+ * @new_f:     bkey format to translate keys to
+ *
+ * Returns: true if all re-packed keys will be able to fit in a new node.
+ *
+ * Assumes all keys will successfully pack with the new format.
  */
 bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
                                 struct bkey_format *new_f)
@@ -244,7 +249,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
        struct write_point *wp;
        struct btree *b;
        BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
-       struct open_buckets ob = { .nr = 0 };
+       struct open_buckets obs = { .nr = 0 };
        struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
        enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
        unsigned nr_reserve = watermark > BCH_WATERMARK_reclaim
@@ -257,7 +262,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
                struct btree_alloc *a =
                        &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
 
-               ob = a->ob;
+               obs = a->ob;
                bkey_copy(&tmp.k, &a->k);
                mutex_unlock(&c->btree_reserve_cache_lock);
                goto mem_alloc;
@@ -292,7 +297,7 @@ retry:
        bkey_btree_ptr_v2_init(&tmp.k);
        bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false);
 
-       bch2_open_bucket_get(c, wp, &ob);
+       bch2_open_bucket_get(c, wp, &obs);
        bch2_alloc_sectors_done(c, wp);
 mem_alloc:
        b = bch2_btree_node_mem_alloc(trans, interior_node);
@@ -304,7 +309,7 @@ mem_alloc:
        BUG_ON(b->ob.nr);
 
        bkey_copy(&b->key, &tmp.k);
-       b->ob = ob;
+       b->ob = obs;
 
        return b;
 }
@@ -592,12 +597,11 @@ static void btree_update_nodes_written(struct btree_update *as)
 {
        struct bch_fs *c = as->c;
        struct btree *b;
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        u64 journal_seq = 0;
        unsigned i;
        int ret;
 
-       bch2_trans_init(&trans, c, 0, 512);
        /*
         * If we're already in an error state, it might be because a btree node
         * was never written, and we might be trying to free that same btree
@@ -618,7 +622,7 @@ static void btree_update_nodes_written(struct btree_update *as)
 
                b = as->old_nodes[i];
 
-               btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
+               btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
                seq = b->data ? b->data->keys.seq : 0;
                six_unlock_read(&b->c.lock);
 
@@ -640,13 +644,13 @@ static void btree_update_nodes_written(struct btree_update *as)
         * journal reclaim does btree updates when flushing bkey_cached entries,
         * which may require allocations as well.
         */
-       ret = commit_do(&trans, &as->disk_res, &journal_seq,
+       ret = commit_do(trans, &as->disk_res, &journal_seq,
                        BCH_WATERMARK_reclaim|
                        BTREE_INSERT_NOFAIL|
                        BTREE_INSERT_NOCHECK_RW|
                        BTREE_INSERT_JOURNAL_RECLAIM,
-                       btree_update_nodes_written_trans(&trans, as));
-       bch2_trans_unlock(&trans);
+                       btree_update_nodes_written_trans(trans, as));
+       bch2_trans_unlock(trans);
 
        bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
                             "%s(): error %s", __func__, bch2_err_str(ret));
@@ -655,7 +659,7 @@ err:
                struct btree_path *path;
 
                b = as->b;
-               path = get_unlocked_mut_path(&trans, as->btree_id, b->c.level, b->key.k.p);
+               path = get_unlocked_mut_path(trans, as->btree_id, b->c.level, b->key.k.p);
                /*
                 * @b is the node we did the final insert into:
                 *
@@ -678,13 +682,13 @@ err:
                 * we may rarely end up with a locked path besides the one we
                 * have here:
                 */
-               bch2_trans_unlock(&trans);
-               btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_intent);
-               mark_btree_node_locked(&trans, path, b->c.level, SIX_LOCK_intent);
+               bch2_trans_unlock(trans);
+               btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
+               mark_btree_node_locked(trans, path, b->c.level, BTREE_NODE_INTENT_LOCKED);
                path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
                path->l[b->c.level].b = b;
 
-               bch2_btree_node_lock_write_nofail(&trans, path, &b->c);
+               bch2_btree_node_lock_write_nofail(trans, path, &b->c);
 
                mutex_lock(&c->btree_interior_update_lock);
 
@@ -697,15 +701,15 @@ err:
                 * btree_interior_update_lock:
                 */
                if (as->b == b) {
-                       struct bset *i = btree_bset_last(b);
-
                        BUG_ON(!b->c.level);
                        BUG_ON(!btree_node_dirty(b));
 
                        if (!ret) {
-                               i->journal_seq = cpu_to_le64(
+                               struct bset *last = btree_bset_last(b);
+
+                               last->journal_seq = cpu_to_le64(
                                                             max(journal_seq,
-                                                                le64_to_cpu(i->journal_seq)));
+                                                                le64_to_cpu(last->journal_seq)));
 
                                bch2_btree_add_journal_pin(c, b, journal_seq);
                        } else {
@@ -724,8 +728,8 @@ err:
                six_unlock_write(&b->c.lock);
 
                btree_node_write_if_need(c, b, SIX_LOCK_intent);
-               btree_node_unlock(&trans, path, b->c.level);
-               bch2_path_put(&trans, path, true);
+               btree_node_unlock(trans, path, b->c.level);
+               bch2_path_put(trans, path, true);
        }
 
        bch2_journal_pin_drop(&c->journal, &as->journal);
@@ -745,7 +749,7 @@ err:
        for (i = 0; i < as->nr_new_nodes; i++) {
                b = as->new_nodes[i];
 
-               btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
+               btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
                btree_node_write_if_need(c, b, SIX_LOCK_read);
                six_unlock_read(&b->c.lock);
        }
@@ -753,8 +757,8 @@ err:
        for (i = 0; i < as->nr_open_buckets; i++)
                bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]);
 
-       bch2_btree_update_free(as, &trans);
-       bch2_trans_exit(&trans);
+       bch2_btree_update_free(as, trans);
+       bch2_trans_put(trans);
 }
 
 static void btree_interior_update_work(struct work_struct *work)
@@ -1216,18 +1220,6 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
        bch2_recalc_btree_reserve(c);
 }
 
-/**
- * bch_btree_set_root - update the root in memory and on disk
- *
- * To ensure forward progress, the current task must not be holding any
- * btree node write locks. However, you must hold an intent lock on the
- * old root.
- *
- * Note: This allocates a journal entry but doesn't add any keys to
- * it.  All the btree roots are part of every journal write, so there
- * is nothing new to be done.  This just guarantees that there is a
- * journal write.
- */
 static void bch2_btree_set_root(struct btree_update *as,
                                struct btree_trans *trans,
                                struct btree_path *path,
@@ -1341,12 +1333,12 @@ __bch2_btree_insert_keys_interior(struct btree_update *as,
                ;
 
        while (!bch2_keylist_empty(keys)) {
-               struct bkey_i *k = bch2_keylist_front(keys);
+               insert = bch2_keylist_front(keys);
 
-               if (bpos_gt(k->k.p, b->key.k.p))
+               if (bpos_gt(insert->k.p, b->key.k.p))
                        break;
 
-               bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, k);
+               bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert);
                bch2_keylist_pop_front(keys);
        }
 }
@@ -1513,12 +1505,12 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
 
                path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p);
                six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
-               mark_btree_node_locked(trans, path1, n1->c.level, SIX_LOCK_intent);
+               mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
                bch2_btree_path_level_init(trans, path1, n1);
 
                path2 = get_unlocked_mut_path(trans, path->btree_id, n2->c.level, n2->key.k.p);
                six_lock_increment(&n2->c.lock, SIX_LOCK_intent);
-               mark_btree_node_locked(trans, path2, n2->c.level, SIX_LOCK_intent);
+               mark_btree_node_locked(trans, path2, n2->c.level, BTREE_NODE_INTENT_LOCKED);
                bch2_btree_path_level_init(trans, path2, n2);
 
                /*
@@ -1539,7 +1531,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
                        path2->locks_want++;
                        BUG_ON(btree_node_locked(path2, n3->c.level));
                        six_lock_increment(&n3->c.lock, SIX_LOCK_intent);
-                       mark_btree_node_locked(trans, path2, n3->c.level, SIX_LOCK_intent);
+                       mark_btree_node_locked(trans, path2, n3->c.level, BTREE_NODE_INTENT_LOCKED);
                        bch2_btree_path_level_init(trans, path2, n3);
 
                        n3->sib_u64s[0] = U16_MAX;
@@ -1563,7 +1555,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
 
                path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p);
                six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
-               mark_btree_node_locked(trans, path1, n1->c.level, SIX_LOCK_intent);
+               mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
                bch2_btree_path_level_init(trans, path1, n1);
 
                if (parent)
@@ -1661,12 +1653,16 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
 }
 
 /**
- * bch_btree_insert_node - insert bkeys into a given btree node
+ * bch2_btree_insert_node - insert bkeys into a given btree node
  *
- * @iter:              btree iterator
+ * @as:                        btree_update object
+ * @trans:             btree_trans object
+ * @path:              path that points to current node
+ * @b:                 node to insert keys into
  * @keys:              list of keys to insert
- * @hook:              insert callback
- * @persistent:                if not null, @persistent will wait on journal write
+ * @flags:             transaction commit flags
+ *
+ * Returns: 0 on success, typically transaction restart error on failure
  *
  * Inserts as many keys as it can into a given btree node, splitting it if full.
  * If a split occurred, this function will return early. This can only happen
@@ -1890,7 +1886,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
 
        new_path = get_unlocked_mut_path(trans, path->btree_id, n->c.level, n->key.k.p);
        six_lock_increment(&n->c.lock, SIX_LOCK_intent);
-       mark_btree_node_locked(trans, new_path, n->c.level, SIX_LOCK_intent);
+       mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
        bch2_btree_path_level_init(trans, new_path, n);
 
        bkey_init(&delete.k);
@@ -1934,9 +1930,6 @@ err_free_update:
        goto out;
 }
 
-/**
- * bch_btree_node_rewrite - Rewrite/move a btree node
- */
 int bch2_btree_node_rewrite(struct btree_trans *trans,
                            struct btree_iter *iter,
                            struct btree *b,
@@ -1967,7 +1960,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
 
        new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p);
        six_lock_increment(&n->c.lock, SIX_LOCK_intent);
-       mark_btree_node_locked(trans, new_path, n->c.level, SIX_LOCK_intent);
+       mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
        bch2_btree_path_level_init(trans, new_path, n);
 
        trace_and_count(c, btree_node_rewrite, c, b);
@@ -2055,9 +2048,9 @@ static void async_btree_node_rewrite_work(struct work_struct *work)
        int ret;
 
        ret = bch2_trans_do(c, NULL, NULL, 0,
-                     async_btree_node_rewrite_trans(&trans, a));
+                     async_btree_node_rewrite_trans(trans, a));
        if (ret)
-               bch_err(c, "%s: error %s", __func__, bch2_err_str(ret));
+               bch_err_fn(c, ret);
        bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
        kfree(a);
 }
@@ -2096,8 +2089,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
 
                ret = bch2_fs_read_write_early(c);
                if (ret) {
-                       bch_err(c, "%s: error going read-write: %s",
-                               __func__, bch2_err_str(ret));
+                       bch_err_msg(c, ret, "going read-write");
                        kfree(a);
                        return;
                }
@@ -2372,7 +2364,7 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id)
 
 void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
 {
-       bch2_trans_run(c, __bch2_btree_root_alloc(&trans, id));
+       bch2_trans_run(c, __bch2_btree_root_alloc(trans, id));
 }
 
 void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
index 6d2d43b6ff6aa0560d587a3d37f54135906f8d0d..4e6241db518b59d62c551e3d6d9c2541fd87737a 100644 (file)
@@ -296,7 +296,7 @@ static int bch2_btree_write_buffer_journal_flush(struct journal *j,
        mutex_lock(&wb->flush_lock);
 
        return bch2_trans_run(c,
-                       __bch2_btree_write_buffer_flush(&trans, BTREE_INSERT_NOCHECK_RW, true));
+                       __bch2_btree_write_buffer_flush(trans, BTREE_INSERT_NOCHECK_RW, true));
 }
 
 static inline u64 btree_write_buffer_ref(int idx)
index c02c8c917a2926bfd4255d4d8338817e6dd22562..e7f4506f69ca4c371ec6ca1ce2c26f5d7d6540b2 100644 (file)
@@ -680,7 +680,7 @@ static int check_bucket_ref(struct btree_trans *trans,
        struct bch_fs *c = trans->c;
        struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
        size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
-       u16 bucket_sectors = !ptr->cached
+       u32 bucket_sectors = !ptr->cached
                ? dirty_sectors
                : cached_sectors;
        struct printbuf buf = PRINTBUF;
@@ -752,9 +752,9 @@ static int check_bucket_ref(struct btree_trans *trans,
                goto err;
        }
 
-       if ((unsigned) (bucket_sectors + sectors) > U32_MAX) {
+       if ((u64) bucket_sectors + sectors > U32_MAX) {
                bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
-                       "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n"
+                       "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n"
                        "while marking %s",
                        ptr->dev, bucket_nr, b_gen,
                        bch2_data_types[bucket_data_type ?: ptr_data_type],
@@ -1201,7 +1201,7 @@ not_found:
                new->k.p                = bkey_start_pos(p.k);
                new->k.p.offset += *idx - start;
                bch2_key_resize(&new->k, next_idx - *idx);
-               ret = __bch2_btree_insert(trans, BTREE_ID_extents, &new->k_i,
+               ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i,
                                          BTREE_TRIGGER_NORUN);
        }
 
@@ -1300,7 +1300,7 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans,
        static int warned_disk_usage = 0;
        bool warn = false;
        unsigned disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
-       struct replicas_delta *d = deltas->d, *d2;
+       struct replicas_delta *d, *d2;
        struct replicas_delta *top = (void *) deltas->d + deltas->used;
        struct bch_fs_usage *dst;
        s64 added = 0, should_not_have_added;
@@ -1923,7 +1923,7 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
 
 int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca)
 {
-       int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(&trans, ca));
+       int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(trans, ca));
 
        if (ret)
                bch_err_fn(c, ret);
index f192809f50cf040fe0129f00c7c8dc011356ef7c..ecbeb7280f87f3cd2f9d3146412ebe8ebdf68da6 100644 (file)
@@ -40,15 +40,42 @@ static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, secto
        for (_b = (_buckets)->b + (_buckets)->first_bucket;     \
             _b < (_buckets)->b + (_buckets)->nbuckets; _b++)
 
+/*
+ * Ugly hack alert:
+ *
+ * We need to cram a spinlock in a single byte, because that's what we have left
+ * in struct bucket, and we care about the size of these - during fsck, we need
+ * in memory state for every single bucket on every device.
+ *
+ * We used to do
+ *   while (xchg(&b->lock, 1) cpu_relax();
+ * but, it turns out not all architectures support xchg on a single byte.
+ *
+ * So now we use bit_spin_lock(), with fun games since we can't burn a whole
+ * ulong for this - we just need to make sure the lock bit always ends up in the
+ * first byte.
+ */
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define BUCKET_LOCK_BITNR      0
+#else
+#define BUCKET_LOCK_BITNR      (BITS_PER_LONG - 1)
+#endif
+
+union ulong_byte_assert {
+       ulong   ulong;
+       u8      byte;
+};
+
 static inline void bucket_unlock(struct bucket *b)
 {
-       smp_store_release(&b->lock, 0);
+       BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte);
+       bit_spin_unlock(BUCKET_LOCK_BITNR, (void *) &b->lock);
 }
 
 static inline void bucket_lock(struct bucket *b)
 {
-       while (xchg(&b->lock, 1))
-               cpu_relax();
+       bit_spin_lock(BUCKET_LOCK_BITNR, (void *) &b->lock);
 }
 
 static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca)
@@ -180,7 +207,7 @@ static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_waterma
 
        switch (watermark) {
        case BCH_WATERMARK_NR:
-               unreachable();
+               BUG();
        case BCH_WATERMARK_stripe:
                reserved += ca->mi.nbuckets >> 6;
                fallthrough;
index 81ab685cdef9f35dbed4a55fa2044aad8dcbaf59..ec1b636ef78d075d1c2b6a9dd2b610a5ba8f274c 100644 (file)
@@ -133,7 +133,7 @@ retry_rehash:
        b->t = n;
        kvfree(t);
 
-       pr_debug("took %zu rehashes, table at %zu/%zu elements",
+       pr_debug("took %zu rehashes, table at %zu/%lu elements",
                 nr_rehashes, nr_elements, 1UL << b->t->bits);
 out:
        mutex_unlock(&b->lock);
index fb603df099a5b43d00b6a2bfb783a28403442a97..f69e15dc699c9b6b22c07c8a1ce709bf478e57d8 100644 (file)
@@ -86,10 +86,9 @@ static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
                devs[i] = strndup_user((const char __user *)(unsigned long)
                                       user_devs[i],
                                       PATH_MAX);
-               if (!devs[i]) {
-                       ret = -ENOMEM;
+               ret= PTR_ERR_OR_ZERO(devs[i]);
+               if (ret)
                        goto err;
-               }
        }
 
        c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty());
@@ -117,8 +116,9 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg
                return -EINVAL;
 
        path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
-       if (!path)
-               return -ENOMEM;
+       ret = PTR_ERR_OR_ZERO(path);
+       if (ret)
+               return ret;
 
        err = bch2_fs_open_incremental(path);
        kfree(path);
@@ -149,9 +149,10 @@ static long bch2_global_ioctl(unsigned cmd, void __user *arg)
 static long bch2_ioctl_query_uuid(struct bch_fs *c,
                        struct bch_ioctl_query_uuid __user *user_arg)
 {
-       return copy_to_user(&user_arg->uuid,
-                           &c->sb.user_uuid,
-                           sizeof(c->sb.user_uuid));
+       if (copy_to_user(&user_arg->uuid, &c->sb.user_uuid,
+                        sizeof(c->sb.user_uuid)))
+               return -EFAULT;
+       return 0;
 }
 
 #if 0
@@ -188,8 +189,9 @@ static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg)
                return -EINVAL;
 
        path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
-       if (!path)
-               return -ENOMEM;
+       ret = PTR_ERR_OR_ZERO(path);
+       if (ret)
+               return ret;
 
        ret = bch2_dev_add(c, path);
        kfree(path);
@@ -230,8 +232,9 @@ static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg)
                return -EINVAL;
 
        path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
-       if (!path)
-               return -ENOMEM;
+       ret = PTR_ERR_OR_ZERO(path);
+       if (ret)
+               return ret;
 
        ret = bch2_dev_online(c, path);
        kfree(path);
@@ -338,7 +341,10 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
        if (len < sizeof(e))
                return -EINVAL;
 
-       return copy_to_user(buf, &e, sizeof(e)) ?: sizeof(e);
+       if (copy_to_user(buf, &e, sizeof(e)))
+               return -EFAULT;
+
+       return sizeof(e);
 }
 
 static const struct file_operations bcachefs_data_ops = {
@@ -417,7 +423,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c,
        if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes))
                return -EFAULT;
 
-       arg = kzalloc(sizeof(*arg) + replica_entries_bytes, GFP_KERNEL);
+       arg = kzalloc(size_add(sizeof(*arg), replica_entries_bytes), GFP_KERNEL);
        if (!arg)
                return -ENOMEM;
 
@@ -466,9 +472,11 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c,
        percpu_up_read(&c->mark_lock);
        kfree(src);
 
-       if (!ret)
-               ret = copy_to_user(user_arg, arg,
-                       sizeof(*arg) + arg->replica_entries_bytes);
+       if (ret)
+               goto err;
+       if (copy_to_user(user_arg, arg,
+                        sizeof(*arg) + arg->replica_entries_bytes))
+               ret = -EFAULT;
 err:
        kfree(arg);
        return ret;
@@ -513,7 +521,10 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c,
 
        percpu_ref_put(&ca->ref);
 
-       return copy_to_user(user_arg, &arg, sizeof(arg));
+       if (copy_to_user(user_arg, &arg, sizeof(arg)))
+               return -EFAULT;
+
+       return 0;
 }
 
 static long bch2_ioctl_read_super(struct bch_fs *c,
@@ -550,8 +561,9 @@ static long bch2_ioctl_read_super(struct bch_fs *c,
                goto err;
        }
 
-       ret = copy_to_user((void __user *)(unsigned long)arg.sb,
-                          sb, vstruct_bytes(sb));
+       if (copy_to_user((void __user *)(unsigned long)arg.sb, sb,
+                        vstruct_bytes(sb)))
+               ret = -EFAULT;
 err:
        if (!IS_ERR_OR_NULL(ca))
                percpu_ref_put(&ca->ref);
@@ -617,6 +629,9 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
            arg.pad)
                return -EINVAL;
 
+       if (arg.nbuckets > U32_MAX)
+               return -EINVAL;
+
        ca = bch2_device_lookup(c, arg.dev, arg.flags);
        if (IS_ERR(ca))
                return PTR_ERR(ca);
index 4c87c596718175a9d16c3f9de428f3fa19afb709..1948119edbf429e315c86d0c811294d491c4b1c0 100644 (file)
@@ -139,7 +139,7 @@ static inline int do_encrypt(struct crypto_sync_skcipher *tfm,
 
                for (i = 0; i < pages; i++) {
                        unsigned offset = offset_in_page(buf);
-                       unsigned pg_len = min(len, PAGE_SIZE - offset);
+                       unsigned pg_len = min_t(size_t, len, PAGE_SIZE - offset);
 
                        sg_set_page(sg + i, vmalloc_to_page(buf), pg_len, offset);
                        buf += pg_len;
@@ -159,15 +159,16 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
                crypto_alloc_sync_skcipher("chacha20", 0, 0);
        int ret;
 
-       if (!chacha20) {
-               pr_err("error requesting chacha20 module: %li", PTR_ERR(chacha20));
-               return PTR_ERR(chacha20);
+       ret = PTR_ERR_OR_ZERO(chacha20);
+       if (ret) {
+               pr_err("error requesting chacha20 cipher: %s", bch2_err_str(ret));
+               return ret;
        }
 
        ret = crypto_skcipher_setkey(&chacha20->base,
                                     (void *) key, sizeof(*key));
        if (ret) {
-               pr_err("crypto_skcipher_setkey() error: %i", ret);
+               pr_err("error from crypto_skcipher_setkey(): %s", bch2_err_str(ret));
                goto err;
        }
 
@@ -366,11 +367,11 @@ struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a,
        BUG_ON(!bch2_checksum_mergeable(type));
 
        while (b_len) {
-               unsigned b = min_t(unsigned, b_len, PAGE_SIZE);
+               unsigned page_len = min_t(unsigned, b_len, PAGE_SIZE);
 
                bch2_checksum_update(&state,
-                               page_address(ZERO_PAGE(0)), b);
-               b_len -= b;
+                               page_address(ZERO_PAGE(0)), page_len);
+               b_len -= page_len;
        }
        a.lo = (__le64 __force) bch2_checksum_final(&state);
        a.lo ^= b.lo;
@@ -395,9 +396,9 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
                unsigned                        csum_type;
                struct bch_csum                 csum;
        } splits[3] = {
-               { crc_a, len_a, new_csum_type },
-               { crc_b, len_b, new_csum_type },
-               { NULL,  bio_sectors(bio) - len_a - len_b, new_csum_type },
+               { crc_a, len_a, new_csum_type, { 0 }},
+               { crc_b, len_b, new_csum_type, { 0 } },
+               { NULL,  bio_sectors(bio) - len_a - len_b, new_csum_type, { 0 } },
        }, *i;
        bool mergeable = crc_old.csum_type == new_csum_type &&
                bch2_checksum_mergeable(new_csum_type);
@@ -558,6 +559,7 @@ int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
        return ret;
 }
 
+#ifndef __KERNEL__
 int bch2_revoke_key(struct bch_sb *sb)
 {
        key_serial_t key_id;
@@ -575,6 +577,7 @@ int bch2_revoke_key(struct bch_sb *sb)
 
        return 0;
 }
+#endif
 
 int bch2_decrypt_sb_key(struct bch_fs *c,
                        struct bch_sb_field_crypt *crypt,
@@ -596,7 +599,7 @@ int bch2_decrypt_sb_key(struct bch_fs *c,
 
        /* decrypt real key: */
        ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c),
-                            &sb_key, sizeof(sb_key));
+                                     &sb_key, sizeof(sb_key));
        if (ret)
                goto err;
 
index 9a4898db31b1475cd7d5497262b35093a23a4b3c..13998388c545c476545b1e6cd418306f67dcf90e 100644 (file)
@@ -40,15 +40,16 @@ struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce,
  */
 #define csum_vstruct(_c, _type, _nonce, _i)                            \
 ({                                                                     \
-       const void *start = ((const void *) (_i)) + sizeof((_i)->csum); \
-       const void *end = vstruct_end(_i);                              \
+       const void *_start = ((const void *) (_i)) + sizeof((_i)->csum);\
                                                                        \
-       bch2_checksum(_c, _type, _nonce, start, end - start);           \
+       bch2_checksum(_c, _type, _nonce, _start, vstruct_end(_i) - _start);\
 })
 
 int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
 int bch2_request_key(struct bch_sb *, struct bch_key *);
+#ifndef __KERNEL__
 int bch2_revoke_key(struct bch_sb *);
+#endif
 
 int bch2_encrypt(struct bch_fs *, unsigned, struct nonce,
                 void *data, size_t);
index 6b17f7cc5860aacdbcd7a389444bfcee38dd1840..1480b64547b0c961d7a62f06071e8bffcf7e35a8 100644 (file)
@@ -3,7 +3,6 @@
 #include "checksum.h"
 #include "compress.h"
 #include "extents.h"
-#include "io.h"
 #include "super-io.h"
 
 #include <linux/lz4.h>
@@ -571,7 +570,6 @@ void bch2_fs_compress_exit(struct bch_fs *c)
 static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
 {
        size_t decompress_workspace_size = 0;
-       bool decompress_workspace_needed;
        ZSTD_parameters params = zstd_get_params(zstd_max_clevel(),
                                                 c->opts.encoded_extent_max);
        struct {
@@ -581,7 +579,8 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
                size_t                          decompress_workspace;
        } compression_types[] = {
                { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4,
-                       max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS) },
+                       max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS),
+                       0 },
                { BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip,
                        zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
                        zlib_inflate_workspacesize(), },
@@ -620,9 +619,6 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
                if (!(features & (1 << i->feature)))
                        continue;
 
-               if (i->decompress_workspace)
-                       decompress_workspace_needed = true;
-
                if (mempool_initialized(&c->compress_workspace[i->type]))
                        continue;
 
index 442a9b806a3c164d5ae4209ea4494d45ee815230..26eb3d82b1cb98977224ca67d24b4f213360796b 100644 (file)
@@ -43,7 +43,7 @@ static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb,
                prt_tab(out);
                prt_printf(out, "%llu", le64_to_cpu(ctrs->d[i]));
                prt_newline(out);
-       };
+       }
 };
 
 int bch2_sb_counters_to_cpu(struct bch_fs *c)
index 81518f20d37df41cad9ef15301844b310f2968ca..899ff46de8e062aa4213d4815b8b20789d54bae3 100644 (file)
@@ -9,7 +9,7 @@
 #include "ec.h"
 #include "error.h"
 #include "extents.h"
-#include "io.h"
+#include "io_write.h"
 #include "keylist.h"
 #include "move.h"
 #include "nocow_locking.h"
@@ -49,10 +49,6 @@ static void trace_move_extent_fail2(struct data_update *m,
        if (insert) {
                i = 0;
                bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) {
-                       struct bkey_s new_s;
-                       new_s.k = (void *) new.k;
-                       new_s.v = (void *) new.v;
-
                        if (((1U << i) & m->data_opts.rewrite_ptrs) &&
                            (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
                            !ptr->cached)
@@ -307,7 +303,7 @@ out:
 
 int bch2_data_update_index_update(struct bch_write_op *op)
 {
-       return bch2_trans_run(op->c, __bch2_data_update_index_update(&trans, op));
+       return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op));
 }
 
 void bch2_data_update_read_done(struct data_update *m,
index 49e9055cbb5262a642532ea75856b1df10e5101f..7ca1f98d7e9462d3563f3149f96d288300d54388 100644 (file)
@@ -4,7 +4,7 @@
 #define _BCACHEFS_DATA_UPDATE_H
 
 #include "bkey_buf.h"
-#include "io_types.h"
+#include "io_write_types.h"
 
 struct moving_context;
 
index ae47e1854b80a217c46c49236275afcf90f20ca7..75a3dc7cbd470da758750e612b187f16e34e8eb1 100644 (file)
@@ -19,7 +19,6 @@
 #include "extents.h"
 #include "fsck.h"
 #include "inode.h"
-#include "io.h"
 #include "super.h"
 
 #include <linux/console.h>
@@ -154,10 +153,8 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
        BUG_ON(b->nsets != 1);
 
        for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_p_next(k))
-               if (k->type == KEY_TYPE_btree_ptr_v2) {
-                       struct bch_btree_ptr_v2 *v = (void *) bkeyp_val(&b->format, k);
-                       v->mem_ptr = 0;
-               }
+               if (k->type == KEY_TYPE_btree_ptr_v2)
+                       ((struct bch_btree_ptr_v2 *) bkeyp_val(&b->format, k))->mem_ptr = 0;
 
        v = c->verify_data;
        bkey_copy(&v->key, &b->key);
@@ -322,16 +319,16 @@ static ssize_t flush_buf(struct dump_iter *i)
 {
        if (i->buf.pos) {
                size_t bytes = min_t(size_t, i->buf.pos, i->size);
-               int err = copy_to_user(i->ubuf, i->buf.buf, bytes);
+               int copied = bytes - copy_to_user(i->ubuf, i->buf.buf, bytes);
 
-               if (err)
-                       return err;
+               i->ret   += copied;
+               i->ubuf  += copied;
+               i->size  -= copied;
+               i->buf.pos -= copied;
+               memmove(i->buf.buf, i->buf.buf + copied, i->buf.pos);
 
-               i->ret   += bytes;
-               i->ubuf  += bytes;
-               i->size  -= bytes;
-               i->buf.pos -= bytes;
-               memmove(i->buf.buf, i->buf.buf + bytes, i->buf.pos);
+               if (copied != bytes)
+                       return -EFAULT;
        }
 
        return i->size ? 0 : i->ret;
@@ -369,7 +366,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
                               size_t size, loff_t *ppos)
 {
        struct dump_iter *i = file->private_data;
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        ssize_t ret;
@@ -382,17 +379,17 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
        if (ret)
                return ret;
 
-       bch2_trans_init(&trans, i->c, 0, 0);
-       ret = for_each_btree_key2(&trans, iter, i->id, i->from,
+       trans = bch2_trans_get(i->c);
+       ret = for_each_btree_key2(trans, iter, i->id, i->from,
                                  BTREE_ITER_PREFETCH|
                                  BTREE_ITER_ALL_SNAPSHOTS, k, ({
                bch2_bkey_val_to_text(&i->buf, i->c, k);
                prt_newline(&i->buf);
-               drop_locks_do(&trans, flush_buf(i));
+               drop_locks_do(trans, flush_buf(i));
        }));
        i->from = iter.pos;
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        if (!ret)
                ret = flush_buf(i);
@@ -411,7 +408,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
                                       size_t size, loff_t *ppos)
 {
        struct dump_iter *i = file->private_data;
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter iter;
        struct btree *b;
        ssize_t ret;
@@ -427,26 +424,26 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
        if (bpos_eq(SPOS_MAX, i->from))
                return i->ret;
 
-       bch2_trans_init(&trans, i->c, 0, 0);
+       trans = bch2_trans_get(i->c);
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
 
-       for_each_btree_node(&trans, iter, i->id, i->from, 0, b, ret) {
+       for_each_btree_node(trans, iter, i->id, i->from, 0, b, ret) {
                bch2_btree_node_to_text(&i->buf, i->c, b);
                i->from = !bpos_eq(SPOS_MAX, b->key.k.p)
                        ? bpos_successor(b->key.k.p)
                        : b->key.k.p;
 
-               ret = drop_locks_do(&trans, flush_buf(i));
+               ret = drop_locks_do(trans, flush_buf(i));
                if (ret)
                        break;
        }
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
 
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        if (!ret)
                ret = flush_buf(i);
@@ -465,7 +462,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
                                       size_t size, loff_t *ppos)
 {
        struct dump_iter *i = file->private_data;
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        ssize_t ret;
@@ -478,9 +475,9 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
        if (ret)
                return ret;
 
-       bch2_trans_init(&trans, i->c, 0, 0);
+       trans = bch2_trans_get(i->c);
 
-       ret = for_each_btree_key2(&trans, iter, i->id, i->from,
+       ret = for_each_btree_key2(trans, iter, i->id, i->from,
                                  BTREE_ITER_PREFETCH|
                                  BTREE_ITER_ALL_SNAPSHOTS, k, ({
                struct btree_path_level *l = &iter.path->l[0];
@@ -493,11 +490,11 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
                }
 
                bch2_bfloat_to_text(&i->buf, l->b, _k);
-               drop_locks_do(&trans, flush_buf(i));
+               drop_locks_do(trans, flush_buf(i));
        }));
        i->from = iter.pos;
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        if (!ret)
                ret = flush_buf(i);
index a7559ab03802cb3a39a6dc17d563cb9f3dced5a4..6c6c8d57d72b43a0acdfe41f04c9e7cd3ee53383 100644 (file)
@@ -479,21 +479,19 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
                       const struct bch_hash_info *hash_info,
                       const struct qstr *name, subvol_inum *inum)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        int ret;
-
-       bch2_trans_init(&trans, c, 0, 0);
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
 
-       ret = __bch2_dirent_lookup_trans(&trans, &iter, dir, hash_info,
+       ret = __bch2_dirent_lookup_trans(trans, &iter, dir, hash_info,
                                          name, inum, 0);
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
        if (!ret)
-               bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
+               bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
        return ret;
 }
 
@@ -522,7 +520,7 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir)
 
 int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_s_c_dirent dirent;
@@ -533,15 +531,14 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
        int ret;
 
        bch2_bkey_buf_init(&sk);
-       bch2_trans_init(&trans, c, 0, 0);
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
 
-       ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+       ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
        if (ret)
                goto err;
 
-       for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_dirents,
+       for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents,
                           SPOS(inum.inum, ctx->pos, snapshot),
                           POS(inum.inum, U64_MAX), 0, k, ret) {
                if (k.k->type != KEY_TYPE_dirent)
@@ -549,7 +546,7 @@ retry:
 
                dirent = bkey_s_c_to_dirent(k);
 
-               ret = bch2_dirent_read_target(&trans, inum, dirent, &target);
+               ret = bch2_dirent_read_target(trans, inum, dirent, &target);
                if (ret < 0)
                        break;
                if (ret)
@@ -558,7 +555,7 @@ retry:
                /* dir_emit() can fault and block: */
                bch2_bkey_buf_reassemble(&sk, c, k);
                dirent = bkey_i_to_s_c_dirent(sk.k);
-               bch2_trans_unlock(&trans);
+               bch2_trans_unlock(trans);
 
                name = bch2_dirent_get_name(dirent);
 
@@ -574,16 +571,16 @@ retry:
                 * read_target looks up subvolumes, we can overflow paths if the
                 * directory has many subvolumes in it
                 */
-               ret = btree_trans_too_many_iters(&trans);
+               ret = btree_trans_too_many_iters(trans);
                if (ret)
                        break;
        }
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
 err:
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        bch2_bkey_buf_exit(&sk, c);
 
        return ret;
index f36472c4a78187ae6afe61a1f30114d2b32c2b2a..b292dbef799211ca71bf132152c9d9d5ff464d70 100644 (file)
@@ -32,21 +32,21 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb,
 
        for (i = 0; i < sb->nr_devices; i++) {
                struct bch_member *m = mi->members + i;
-               unsigned g;
+               unsigned group_id;
 
                if (!BCH_MEMBER_GROUP(m))
                        continue;
 
-               g = BCH_MEMBER_GROUP(m) - 1;
+               group_id = BCH_MEMBER_GROUP(m) - 1;
 
-               if (g >= nr_groups) {
+               if (group_id >= nr_groups) {
                        prt_printf(err, "disk %u has invalid label %u (have %u)",
-                              i, g, nr_groups);
+                                  i, group_id, nr_groups);
                        return -BCH_ERR_invalid_sb_disk_groups;
                }
 
-               if (BCH_GROUP_DELETED(&groups->entries[g])) {
-                       prt_printf(err, "disk %u has deleted label %u", i, g);
+               if (BCH_GROUP_DELETED(&groups->entries[group_id])) {
+                       prt_printf(err, "disk %u has deleted label %u", i, group_id);
                        return -BCH_ERR_invalid_sb_disk_groups;
                }
        }
@@ -183,8 +183,7 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
 
        for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
                struct bch_member *m = mi->members + i;
-               struct bch_disk_group_cpu *dst =
-                       &cpu_g->entries[BCH_MEMBER_GROUP(m)];
+               struct bch_disk_group_cpu *dst;
 
                if (!bch2_member_exists(m))
                        continue;
index f58e84a2bf88eb09772d5da414ebdb29906f93b7..8646856e4539eae46fed2634154ebb340f841a46 100644 (file)
 #include "btree_update.h"
 #include "btree_write_buffer.h"
 #include "buckets.h"
+#include "checksum.h"
 #include "disk_groups.h"
 #include "ec.h"
 #include "error.h"
-#include "io.h"
+#include "io_read.h"
 #include "keylist.h"
 #include "recovery.h"
 #include "replicas.h"
@@ -475,7 +476,7 @@ err:
 
 static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe)
 {
-       return bch2_trans_run(c, get_stripe_key_trans(&trans, idx, stripe));
+       return bch2_trans_run(c, get_stripe_key_trans(trans, idx, stripe));
 }
 
 /* recovery read path: */
@@ -787,12 +788,10 @@ static void ec_stripe_delete_work(struct work_struct *work)
 {
        struct bch_fs *c =
                container_of(work, struct bch_fs, ec_stripe_delete_work);
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        int ret;
        u64 idx;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
        while (1) {
                mutex_lock(&c->ec_stripes_heap_lock);
                idx = stripe_idx_to_delete(c);
@@ -801,15 +800,15 @@ static void ec_stripe_delete_work(struct work_struct *work)
                if (!idx)
                        break;
 
-               ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL,
-                               ec_stripe_delete(&trans, idx));
+               ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+                               ec_stripe_delete(trans, idx));
                if (ret) {
                        bch_err_fn(c, ret);
                        break;
                }
        }
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
 }
@@ -998,24 +997,22 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
 
 static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
        unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
        int ret = 0;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
-       ret = bch2_btree_write_buffer_flush(&trans);
+       ret = bch2_btree_write_buffer_flush(trans);
        if (ret)
                goto err;
 
        for (i = 0; i < nr_data; i++) {
-               ret = ec_stripe_update_bucket(&trans, s, i);
+               ret = ec_stripe_update_bucket(trans, s, i);
                if (ret)
                        break;
        }
 err:
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        return ret;
 }
@@ -1123,7 +1120,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
        ret = bch2_trans_do(c, &s->res, NULL,
                            BTREE_INSERT_NOCHECK_RW|
                            BTREE_INSERT_NOFAIL,
-                           ec_stripe_key_update(&trans,
+                           ec_stripe_key_update(trans,
                                        bkey_i_to_stripe(&s->new_stripe.key),
                                        !s->have_existing_stripe));
        if (ret) {
@@ -1133,8 +1130,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
 
        ret = ec_stripe_update_extents(c, &s->new_stripe);
        if (ret) {
-               bch_err(c, "error creating stripe: error updating pointers: %s",
-                       bch2_err_str(ret));
+               bch_err_msg(c, ret, "creating stripe: error updating pointers");
                goto err;
        }
 err:
@@ -1822,7 +1818,7 @@ void bch2_fs_ec_flush(struct bch_fs *c)
 
 int bch2_stripes_read(struct bch_fs *c)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        const struct bch_stripe *s;
@@ -1830,9 +1826,7 @@ int bch2_stripes_read(struct bch_fs *c)
        unsigned i;
        int ret;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
-       for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN,
+       for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
                           BTREE_ITER_PREFETCH, k, ret) {
                if (k.k->type != KEY_TYPE_stripe)
                        continue;
@@ -1855,9 +1849,9 @@ int bch2_stripes_read(struct bch_fs *c)
 
                bch2_stripes_heap_insert(c, m, k.k->p.offset);
        }
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        if (ret)
                bch_err_fn(c, ret);
index 885ae5d5165587e4f72312401edcf1b67dcff964..966d165a3b6602c200f0be8c4090be6d8027e7ea 100644 (file)
@@ -240,7 +240,7 @@ static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s,
                        bch2_ec_do_stripe_creates(c);
                        break;
                default:
-                       unreachable();
+                       BUG();
                }
 }
 
index dc906fc9176fecf9f8a7240ff78780a742367735..d260ff9bbfeb7b9121f222a4362f37f95c927977 100644 (file)
@@ -12,8 +12,6 @@ static const char * const bch2_errcode_strs[] = {
        NULL
 };
 
-#define BCH_ERR_0      0
-
 static unsigned bch2_errcode_parents[] = {
 #define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class,
        BCH_ERRCODES()
@@ -61,3 +59,10 @@ int __bch2_err_class(int err)
 
        return -err;
 }
+
+const char *bch2_blk_status_to_str(blk_status_t status)
+{
+       if (status == BLK_STS_REMOVED)
+               return "device removed";
+       return blk_status_to_str(status);
+}
index f7fa87442e98b849d12d140ea159a36c32ab2eb6..64f7176c2a4e69fdd6bdf30ecc6b4a376f838680 100644 (file)
@@ -99,6 +99,7 @@
        x(ENOENT,                       ENOENT_str_hash_set_must_replace)       \
        x(ENOENT,                       ENOENT_inode)                           \
        x(ENOENT,                       ENOENT_not_subvol)                      \
+       x(ENOENT,                       ENOENT_not_directory)                   \
        x(ENOENT,                       ENOENT_directory_dead)                  \
        x(ENOENT,                       ENOENT_subvolume)                       \
        x(ENOENT,                       ENOENT_snapshot_tree)                   \
        x(BCH_ERR_btree_node_read_err,  btree_node_read_err_want_retry)         \
        x(BCH_ERR_btree_node_read_err,  btree_node_read_err_must_retry)         \
        x(BCH_ERR_btree_node_read_err,  btree_node_read_err_bad_node)           \
-       x(BCH_ERR_btree_node_read_err,  btree_node_read_err_incompatible)
+       x(BCH_ERR_btree_node_read_err,  btree_node_read_err_incompatible)       \
+       x(0,                            nopromote)                              \
+       x(BCH_ERR_nopromote,            nopromote_may_not)                      \
+       x(BCH_ERR_nopromote,            nopromote_already_promoted)             \
+       x(BCH_ERR_nopromote,            nopromote_unwritten)                    \
+       x(BCH_ERR_nopromote,            nopromote_congested)                    \
+       x(BCH_ERR_nopromote,            nopromote_in_flight)                    \
+       x(BCH_ERR_nopromote,            nopromote_enomem)
 
 enum bch_errcode {
        BCH_ERR_START           = 2048,
@@ -249,4 +257,8 @@ static inline long bch2_err_class(long err)
        return err < 0 ? __bch2_err_class(err) : err;
 }
 
+#define BLK_STS_REMOVED                ((__force blk_status_t)128)
+
+const char *bch2_blk_status_to_str(blk_status_t);
+
 #endif /* _BCACHFES_ERRCODE_H */
index 39009cf0c44866280198933936d61d7f22cca49b..2a5af88726132bc936e4ddc404086b862f0666ef 100644 (file)
@@ -1,7 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "bcachefs.h"
 #include "error.h"
-#include "io.h"
 #include "super.h"
 
 #define FSCK_ERR_RATELIMIT_NR  10
index cbfb5b21ddcba6d09233ed39799b5f8d9bf692ea..58ccc7b91ac79c7343828205e37edfdca56c0fab 100644 (file)
@@ -8,7 +8,8 @@
 #include "fs-io-buffered.h"
 #include "fs-io-direct.h"
 #include "fs-io-pagecache.h"
-#include "io.h"
+#include "io_read.h"
+#include "io_write.h"
 
 #include <linux/backing-dev.h>
 #include <linux/pagemap.h>
@@ -269,7 +270,7 @@ void bch2_readahead(struct readahead_control *ractl)
        struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch_io_opts opts;
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct folio *folio;
        struct readpages_iter readpages_iter;
        int ret;
@@ -279,8 +280,6 @@ void bch2_readahead(struct readahead_control *ractl)
        ret = readpages_iter_init(&readpages_iter, ractl);
        BUG_ON(ret);
 
-       bch2_trans_init(&trans, c, 0, 0);
-
        bch2_pagecache_add_get(inode);
 
        while ((folio = readpage_iter_peek(&readpages_iter))) {
@@ -299,31 +298,27 @@ void bch2_readahead(struct readahead_control *ractl)
                rbio->bio.bi_end_io = bch2_readpages_end_io;
                BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
 
-               bchfs_read(&trans, rbio, inode_inum(inode),
+               bchfs_read(trans, rbio, inode_inum(inode),
                           &readpages_iter);
-               bch2_trans_unlock(&trans);
+               bch2_trans_unlock(trans);
        }
 
        bch2_pagecache_add_put(inode);
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        darray_exit(&readpages_iter.folios);
 }
 
 static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio,
                             subvol_inum inum, struct folio *folio)
 {
-       struct btree_trans trans;
-
        bch2_folio_create(folio, __GFP_NOFAIL);
 
        rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
        rbio->bio.bi_iter.bi_sector = folio_sector(folio);
        BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
 
-       bch2_trans_init(&trans, c, 0, 0);
-       bchfs_read(&trans, rbio, inum, NULL);
-       bch2_trans_exit(&trans);
+       bch2_trans_run(c, (bchfs_read(trans, rbio, inum, NULL), 0));
 }
 
 static void bch2_read_single_folio_end_io(struct bio *bio)
@@ -694,12 +689,12 @@ int bch2_write_begin(struct file *file, struct address_space *mapping,
        if (IS_ERR_OR_NULL(folio))
                goto err_unlock;
 
-       if (folio_test_uptodate(folio))
-               goto out;
-
        offset = pos - folio_pos(folio);
        len = min_t(size_t, len, folio_end_pos(folio) - pos);
 
+       if (folio_test_uptodate(folio))
+               goto out;
+
        /* If we're writing entire folio, don't need to read it in first: */
        if (!offset && len == folio_size(folio))
                goto out;
@@ -800,10 +795,10 @@ int bch2_write_end(struct file *file, struct address_space *mapping,
        return copied;
 }
 
-static noinline void folios_trunc(folios *folios, struct folio **fi)
+static noinline void folios_trunc(folios *fs, struct folio **fi)
 {
-       while (folios->data + folios->nr > fi) {
-               struct folio *f = darray_pop(folios);
+       while (fs->data + fs->nr > fi) {
+               struct folio *f = darray_pop(fs);
 
                folio_unlock(f);
                folio_put(f);
@@ -817,35 +812,35 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
 {
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch2_folio_reservation res;
-       folios folios;
+       folios fs;
        struct folio **fi, *f;
-       unsigned copied = 0, f_offset;
-       u64 end = pos + len, f_pos;
+       unsigned copied = 0, f_offset, f_copied;
+       u64 end = pos + len, f_pos, f_len;
        loff_t last_folio_pos = inode->v.i_size;
        int ret = 0;
 
        BUG_ON(!len);
 
        bch2_folio_reservation_init(c, inode, &res);
-       darray_init(&folios);
+       darray_init(&fs);
 
        ret = bch2_filemap_get_contig_folios_d(mapping, pos, end,
                                   FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT,
                                   mapping_gfp_mask(mapping),
-                                  &folios);
+                                  &fs);
        if (ret)
                goto out;
 
-       BUG_ON(!folios.nr);
+       BUG_ON(!fs.nr);
 
-       f = darray_first(folios);
+       f = darray_first(fs);
        if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
                ret = bch2_read_single_folio(f, mapping);
                if (ret)
                        goto out;
        }
 
-       f = darray_last(folios);
+       f = darray_last(fs);
        end = min(end, folio_end_pos(f));
        last_folio_pos = folio_pos(f);
        if (end != folio_end_pos(f) && !folio_test_uptodate(f)) {
@@ -858,15 +853,15 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
                }
        }
 
-       ret = bch2_folio_set(c, inode_inum(inode), folios.data, folios.nr);
+       ret = bch2_folio_set(c, inode_inum(inode), fs.data, fs.nr);
        if (ret)
                goto out;
 
        f_pos = pos;
-       f_offset = pos - folio_pos(darray_first(folios));
-       darray_for_each(folios, fi) {
-               struct folio *f = *fi;
-               u64 f_len = min(end, folio_end_pos(f)) - f_pos;
+       f_offset = pos - folio_pos(darray_first(fs));
+       darray_for_each(fs, fi) {
+               f = *fi;
+               f_len = min(end, folio_end_pos(f)) - f_pos;
 
                /*
                 * XXX: per POSIX and fstests generic/275, on -ENOSPC we're
@@ -878,11 +873,11 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
                 */
                ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len);
                if (unlikely(ret)) {
-                       folios_trunc(&folios, fi);
-                       if (!folios.nr)
+                       folios_trunc(&fs, fi);
+                       if (!fs.nr)
                                goto out;
 
-                       end = min(end, folio_end_pos(darray_last(folios)));
+                       end = min(end, folio_end_pos(darray_last(fs)));
                        break;
                }
 
@@ -891,18 +886,17 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
        }
 
        if (mapping_writably_mapped(mapping))
-               darray_for_each(folios, fi)
+               darray_for_each(fs, fi)
                        flush_dcache_folio(*fi);
 
        f_pos = pos;
-       f_offset = pos - folio_pos(darray_first(folios));
-       darray_for_each(folios, fi) {
-               struct folio *f = *fi;
-               u64 f_len = min(end, folio_end_pos(f)) - f_pos;
-               unsigned f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter);
-
+       f_offset = pos - folio_pos(darray_first(fs));
+       darray_for_each(fs, fi) {
+               f = *fi;
+               f_len = min(end, folio_end_pos(f)) - f_pos;
+               f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter);
                if (!f_copied) {
-                       folios_trunc(&folios, fi);
+                       folios_trunc(&fs, fi);
                        break;
                }
 
@@ -911,7 +905,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
                    pos + copied + f_copied < inode->v.i_size) {
                        iov_iter_revert(iter, f_copied);
                        folio_zero_range(f, 0, folio_size(f));
-                       folios_trunc(&folios, fi);
+                       folios_trunc(&fs, fi);
                        break;
                }
 
@@ -919,7 +913,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
                copied += f_copied;
 
                if (f_copied != f_len) {
-                       folios_trunc(&folios, fi + 1);
+                       folios_trunc(&fs, fi + 1);
                        break;
                }
 
@@ -938,10 +932,10 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
        spin_unlock(&inode->v.i_lock);
 
        f_pos = pos;
-       f_offset = pos - folio_pos(darray_first(folios));
-       darray_for_each(folios, fi) {
-               struct folio *f = *fi;
-               u64 f_len = min(end, folio_end_pos(f)) - f_pos;
+       f_offset = pos - folio_pos(darray_first(fs));
+       darray_for_each(fs, fi) {
+               f = *fi;
+               f_len = min(end, folio_end_pos(f)) - f_pos;
 
                if (!folio_test_uptodate(f))
                        folio_mark_uptodate(f);
@@ -954,7 +948,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
 
        inode->ei_last_dirtied = (unsigned long) current;
 out:
-       darray_for_each(folios, fi) {
+       darray_for_each(fs, fi) {
                folio_unlock(*fi);
                folio_put(*fi);
        }
@@ -967,7 +961,7 @@ out:
        if (last_folio_pos >= inode->v.i_size)
                truncate_pagecache(&inode->v, inode->v.i_size);
 
-       darray_exit(&folios);
+       darray_exit(&fs);
        bch2_folio_reservation_put(c, inode, &res);
 
        return copied ?: ret;
@@ -1055,8 +1049,6 @@ ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
                goto out;
        }
 
-       /* We can write back this queue in page reclaim */
-       current->backing_dev_info = inode_to_bdi(&inode->v);
        inode_lock(&inode->v);
 
        ret = generic_write_checks(iocb, from);
@@ -1076,7 +1068,6 @@ ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
                iocb->ki_pos += ret;
 unlock:
        inode_unlock(&inode->v);
-       current->backing_dev_info = NULL;
 
        if (ret > 0)
                ret = generic_write_sync(iocb, ret);
index 2b29abd24d5693f6bf535ac982f3bba594dc4c86..6a9557e7ecabb47d1a30b7c665367decb3fecbe4 100644 (file)
@@ -7,10 +7,12 @@
 #include "fs-io.h"
 #include "fs-io-direct.h"
 #include "fs-io-pagecache.h"
-#include "io.h"
+#include "io_read.h"
+#include "io_write.h"
 
 #include <linux/kthread.h>
 #include <linux/pagemap.h>
+#include <linux/prefetch.h>
 #include <linux/task_io_accounting_ops.h>
 
 /* O_DIRECT reads */
@@ -232,23 +234,21 @@ static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum,
                                       u64 offset, u64 size,
                                       unsigned nr_replicas, bool compressed)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        u64 end = offset + size;
        u32 snapshot;
        bool ret = true;
        int err;
-
-       bch2_trans_init(&trans, c, 0, 0);
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
 
-       err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+       err = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
        if (err)
                goto err;
 
-       for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
+       for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
                           SPOS(inum.inum, offset, snapshot),
                           BTREE_ITER_SLOTS, k, err) {
                if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end)))
@@ -263,11 +263,11 @@ retry:
        }
 
        offset = iter.pos.offset;
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
 err:
        if (bch2_err_matches(err, BCH_ERR_transaction_restart))
                goto retry;
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        return err ? false : ret;
 }
index 1e60eead29815ccb0dad99b07c4df1ee64055801..8bd9bcdd27f738a7a2f0d2ac831f0c77fdf20aa3 100644 (file)
@@ -14,7 +14,7 @@
 int bch2_filemap_get_contig_folios_d(struct address_space *mapping,
                                     loff_t start, u64 end,
                                     int fgp_flags, gfp_t gfp,
-                                    folios *folios)
+                                    folios *fs)
 {
        struct folio *f;
        u64 pos = start;
@@ -24,7 +24,7 @@ int bch2_filemap_get_contig_folios_d(struct address_space *mapping,
                if ((u64) pos >= (u64) start + (1ULL << 20))
                        fgp_flags &= ~FGP_CREAT;
 
-               ret = darray_make_room_gfp(folios, 1, gfp & GFP_KERNEL);
+               ret = darray_make_room_gfp(fs, 1, gfp & GFP_KERNEL);
                if (ret)
                        break;
 
@@ -32,16 +32,16 @@ int bch2_filemap_get_contig_folios_d(struct address_space *mapping,
                if (IS_ERR_OR_NULL(f))
                        break;
 
-               BUG_ON(folios->nr && folio_pos(f) != pos);
+               BUG_ON(fs->nr && folio_pos(f) != pos);
 
                pos = folio_end_pos(f);
-               darray_push(folios, f);
+               darray_push(fs, f);
        }
 
-       if (!folios->nr && !ret && (fgp_flags & FGP_CREAT))
+       if (!fs->nr && !ret && (fgp_flags & FGP_CREAT))
                ret = -ENOMEM;
 
-       return folios->nr ? 0 : ret;
+       return fs->nr ? 0 : ret;
 }
 
 /* pagecache_block must be held */
@@ -73,12 +73,15 @@ int bch2_write_invalidate_inode_pages_range(struct address_space *mapping,
        return ret;
 }
 
+#if 0
+/* Useful for debug tracing: */
 static const char * const bch2_folio_sector_states[] = {
 #define x(n)   #n,
        BCH_FOLIO_SECTOR_STATE()
 #undef x
        NULL
 };
+#endif
 
 static inline enum bch_folio_sector_state
 folio_sector_dirty(enum bch_folio_sector_state state)
@@ -177,20 +180,20 @@ static void __bch2_folio_set(struct folio *folio,
  * extents btree:
  */
 int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
-                  struct folio **folios, unsigned nr_folios)
+                  struct folio **fs, unsigned nr_folios)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bch_folio *s;
-       u64 offset = folio_sector(folios[0]);
+       u64 offset = folio_sector(fs[0]);
        unsigned folio_idx;
        u32 snapshot;
        bool need_set = false;
        int ret;
 
        for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) {
-               s = bch2_folio_create(folios[folio_idx], GFP_KERNEL);
+               s = bch2_folio_create(fs[folio_idx], GFP_KERNEL);
                if (!s)
                        return -ENOMEM;
 
@@ -201,22 +204,22 @@ int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
                return 0;
 
        folio_idx = 0;
-       bch2_trans_init(&trans, c, 0, 0);
+       trans = bch2_trans_get(c);
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
 
-       ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+       ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
        if (ret)
                goto err;
 
-       for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
+       for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
                           SPOS(inum.inum, offset, snapshot),
                           BTREE_ITER_SLOTS, k, ret) {
                unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
                unsigned state = bkey_to_sector_state(k);
 
                while (folio_idx < nr_folios) {
-                       struct folio *folio = folios[folio_idx];
+                       struct folio *folio = fs[folio_idx];
                        u64 folio_start = folio_sector(folio);
                        u64 folio_end   = folio_end_sector(folio);
                        unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) -
@@ -240,11 +243,11 @@ retry:
        }
 
        offset = iter.pos.offset;
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
 err:
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        return ret;
 }
index 4804e5a47ac9faacdc8b4f5f03baf7b2cf1eafd4..b0e8144ec5500cd37a2d35f71f399c1ebe424d53 100644 (file)
@@ -3,6 +3,7 @@
 
 #include "bcachefs.h"
 #include "alloc_foreground.h"
+#include "bkey_buf.h"
 #include "btree_update.h"
 #include "buckets.h"
 #include "clock.h"
@@ -16,7 +17,7 @@
 #include "fsck.h"
 #include "inode.h"
 #include "journal.h"
-#include "io.h"
+#include "io_misc.h"
 #include "keylist.h"
 #include "quota.h"
 #include "reflink.h"
@@ -164,7 +165,6 @@ void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
 #endif
 }
 
-
 /* fsync: */
 
 /*
@@ -207,31 +207,29 @@ static inline int range_has_data(struct bch_fs *c, u32 subvol,
                                 struct bpos start,
                                 struct bpos end)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret = 0;
-
-       bch2_trans_init(&trans, c, 0, 0);
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
 
-       ret = bch2_subvolume_get_snapshot(&trans, subvol, &start.snapshot);
+       ret = bch2_subvolume_get_snapshot(trans, subvol, &start.snapshot);
        if (ret)
                goto err;
 
-       for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_extents, start, end, 0, k, ret)
+       for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, start, end, 0, k, ret)
                if (bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k)) {
                        ret = 1;
                        break;
                }
        start = iter.pos;
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
 err:
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        return ret;
 }
 
@@ -241,8 +239,8 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode,
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct address_space *mapping = inode->v.i_mapping;
        struct bch_folio *s;
-       unsigned start_offset = start & (PAGE_SIZE - 1);
-       unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1;
+       unsigned start_offset;
+       unsigned end_offset;
        unsigned i;
        struct folio *folio;
        s64 i_sectors_delta = 0;
@@ -391,33 +389,12 @@ static int bch2_extend(struct mnt_idmap *idmap,
        return bch2_setattr_nonsize(idmap, inode, iattr);
 }
 
-static int bch2_truncate_finish_fn(struct btree_trans *trans,
-                                  struct bch_inode_info *inode,
-                                  struct bch_inode_unpacked *bi,
-                                  void *p)
-{
-       bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
-       return 0;
-}
-
-static int bch2_truncate_start_fn(struct btree_trans *trans,
-                                 struct bch_inode_info *inode,
-                                 struct bch_inode_unpacked *bi, void *p)
-{
-       u64 *new_i_size = p;
-
-       bi->bi_flags |= BCH_INODE_I_SIZE_DIRTY;
-       bi->bi_size = *new_i_size;
-       return 0;
-}
-
-int bch2_truncate(struct mnt_idmap *idmap,
+int bchfs_truncate(struct mnt_idmap *idmap,
                  struct bch_inode_info *inode, struct iattr *iattr)
 {
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct address_space *mapping = inode->v.i_mapping;
        struct bch_inode_unpacked inode_u;
-       u64 new_i_size = iattr->ia_size;
        s64 i_sectors_delta = 0;
        int ret = 0;
 
@@ -466,6 +443,8 @@ int bch2_truncate(struct mnt_idmap *idmap,
        if (unlikely(ret < 0))
                goto err;
 
+       truncate_setsize(&inode->v, iattr->ia_size);
+
        /*
         * When extending, we're going to write the new i_size to disk
         * immediately so we need to flush anything above the current on disk
@@ -487,32 +466,22 @@ int bch2_truncate(struct mnt_idmap *idmap,
        if (ret)
                goto err;
 
-       mutex_lock(&inode->ei_update_lock);
-       ret = bch2_write_inode(c, inode, bch2_truncate_start_fn,
-                              &new_i_size, 0);
-       mutex_unlock(&inode->ei_update_lock);
+       ret = bch2_truncate(c, inode_inum(inode), iattr->ia_size, &i_sectors_delta);
+       bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
 
-       if (unlikely(ret))
+       if (unlikely(ret)) {
+               /*
+                * If we error here, VFS caches are now inconsistent with btree
+                */
+               set_bit(EI_INODE_ERROR, &inode->ei_flags);
                goto err;
-
-       truncate_setsize(&inode->v, iattr->ia_size);
-
-       ret = bch2_fpunch(c, inode_inum(inode),
-                       round_up(iattr->ia_size, block_bytes(c)) >> 9,
-                       U64_MAX, &i_sectors_delta);
-       bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
+       }
 
        bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks &&
                                !bch2_journal_error(&c->journal), c,
                                "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)",
                                inode->v.i_ino, (u64) inode->v.i_blocks,
                                inode->ei_inode.bi_sectors);
-       if (unlikely(ret))
-               goto err;
-
-       mutex_lock(&inode->ei_update_lock);
-       ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, 0);
-       mutex_unlock(&inode->ei_update_lock);
 
        ret = bch2_setattr_nonsize(idmap, inode, iattr);
 err:
@@ -577,175 +546,33 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
 {
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct address_space *mapping = inode->v.i_mapping;
-       struct bkey_buf copy;
-       struct btree_trans trans;
-       struct btree_iter src, dst, del;
-       loff_t shift, new_size;
-       u64 src_start;
+       s64 i_sectors_delta = 0;
        int ret = 0;
 
        if ((offset | len) & (block_bytes(c) - 1))
                return -EINVAL;
 
        if (insert) {
-               if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len)
-                       return -EFBIG;
-
                if (offset >= inode->v.i_size)
                        return -EINVAL;
-
-               src_start       = U64_MAX;
-               shift           = len;
        } else {
                if (offset + len >= inode->v.i_size)
                        return -EINVAL;
-
-               src_start       = offset + len;
-               shift           = -len;
        }
 
-       new_size = inode->v.i_size + shift;
-
        ret = bch2_write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX);
        if (ret)
                return ret;
 
-       if (insert) {
-               i_size_write(&inode->v, new_size);
-               mutex_lock(&inode->ei_update_lock);
-               ret = bch2_write_inode_size(c, inode, new_size,
-                                           ATTR_MTIME|ATTR_CTIME);
-               mutex_unlock(&inode->ei_update_lock);
-       } else {
-               s64 i_sectors_delta = 0;
-
-               ret = bch2_fpunch(c, inode_inum(inode),
-                                 offset >> 9, (offset + len) >> 9,
-                                 &i_sectors_delta);
-               bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
-
-               if (ret)
-                       return ret;
-       }
-
-       bch2_bkey_buf_init(&copy);
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
-       bch2_trans_iter_init(&trans, &src, BTREE_ID_extents,
-                       POS(inode->v.i_ino, src_start >> 9),
-                       BTREE_ITER_INTENT);
-       bch2_trans_copy_iter(&dst, &src);
-       bch2_trans_copy_iter(&del, &src);
-
-       while (ret == 0 ||
-              bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
-               struct disk_reservation disk_res =
-                       bch2_disk_reservation_init(c, 0);
-               struct bkey_i delete;
-               struct bkey_s_c k;
-               struct bpos next_pos;
-               struct bpos move_pos = POS(inode->v.i_ino, offset >> 9);
-               struct bpos atomic_end;
-               unsigned trigger_flags = 0;
-               u32 snapshot;
-
-               bch2_trans_begin(&trans);
-
-               ret = bch2_subvolume_get_snapshot(&trans,
-                                       inode->ei_subvol, &snapshot);
-               if (ret)
-                       continue;
-
-               bch2_btree_iter_set_snapshot(&src, snapshot);
-               bch2_btree_iter_set_snapshot(&dst, snapshot);
-               bch2_btree_iter_set_snapshot(&del, snapshot);
-
-               bch2_trans_begin(&trans);
-
-               k = insert
-                       ? bch2_btree_iter_peek_prev(&src)
-                       : bch2_btree_iter_peek_upto(&src, POS(inode->v.i_ino, U64_MAX));
-               if ((ret = bkey_err(k)))
-                       continue;
-
-               if (!k.k || k.k->p.inode != inode->v.i_ino)
-                       break;
-
-               if (insert &&
-                   bkey_le(k.k->p, POS(inode->v.i_ino, offset >> 9)))
-                       break;
-reassemble:
-               bch2_bkey_buf_reassemble(&copy, c, k);
-
-               if (insert &&
-                   bkey_lt(bkey_start_pos(k.k), move_pos))
-                       bch2_cut_front(move_pos, copy.k);
-
-               copy.k->k.p.offset += shift >> 9;
-               bch2_btree_iter_set_pos(&dst, bkey_start_pos(&copy.k->k));
-
-               ret = bch2_extent_atomic_end(&trans, &dst, copy.k, &atomic_end);
-               if (ret)
-                       continue;
-
-               if (!bkey_eq(atomic_end, copy.k->k.p)) {
-                       if (insert) {
-                               move_pos = atomic_end;
-                               move_pos.offset -= shift >> 9;
-                               goto reassemble;
-                       } else {
-                               bch2_cut_back(atomic_end, copy.k);
-                       }
-               }
-
-               bkey_init(&delete.k);
-               delete.k.p = copy.k->k.p;
-               delete.k.size = copy.k->k.size;
-               delete.k.p.offset -= shift >> 9;
-               bch2_btree_iter_set_pos(&del, bkey_start_pos(&delete.k));
-
-               next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p;
-
-               if (copy.k->k.size != k.k->size) {
-                       /* We might end up splitting compressed extents: */
-                       unsigned nr_ptrs =
-                               bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k));
-
-                       ret = bch2_disk_reservation_get(c, &disk_res,
-                                       copy.k->k.size, nr_ptrs,
-                                       BCH_DISK_RESERVATION_NOFAIL);
-                       BUG_ON(ret);
-               }
+       if (insert)
+               i_size_write(&inode->v, inode->v.i_size + len);
 
-               ret =   bch2_btree_iter_traverse(&del) ?:
-                       bch2_trans_update(&trans, &del, &delete, trigger_flags) ?:
-                       bch2_trans_update(&trans, &dst, copy.k, trigger_flags) ?:
-                       bch2_trans_commit(&trans, &disk_res, NULL,
-                                         BTREE_INSERT_NOFAIL);
-               bch2_disk_reservation_put(c, &disk_res);
-
-               if (!ret)
-                       bch2_btree_iter_set_pos(&src, next_pos);
-       }
-       bch2_trans_iter_exit(&trans, &del);
-       bch2_trans_iter_exit(&trans, &dst);
-       bch2_trans_iter_exit(&trans, &src);
-       bch2_trans_exit(&trans);
-       bch2_bkey_buf_exit(&copy, c);
-
-       if (ret)
-               return ret;
+       ret = bch2_fcollapse_finsert(c, inode_inum(inode), offset >> 9, len >> 9,
+                                    insert, &i_sectors_delta);
+       if (!ret && !insert)
+               i_size_write(&inode->v, inode->v.i_size - len);
+       bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
 
-       mutex_lock(&inode->ei_update_lock);
-       if (!insert) {
-               i_size_write(&inode->v, new_size);
-               ret = bch2_write_inode_size(c, inode, new_size,
-                                           ATTR_MTIME|ATTR_CTIME);
-       } else {
-               /* We need an inode update to update bi_journal_seq for fsync: */
-               ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
-                                      ATTR_MTIME|ATTR_CTIME);
-       }
-       mutex_unlock(&inode->ei_update_lock);
        return ret;
 }
 
@@ -753,16 +580,15 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
                             u64 start_sector, u64 end_sector)
 {
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bpos end_pos = POS(inode->v.i_ino, end_sector);
        struct bch_io_opts opts;
        int ret = 0;
 
        bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512);
 
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
                        POS(inode->v.i_ino, start_sector),
                        BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 
@@ -775,9 +601,9 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
                u64 hole_start, hole_end;
                u32 snapshot;
 
-               bch2_trans_begin(&trans);
+               bch2_trans_begin(trans);
 
-               ret = bch2_subvolume_get_snapshot(&trans,
+               ret = bch2_subvolume_get_snapshot(trans,
                                        inode->ei_subvol, &snapshot);
                if (ret)
                        goto bkey_err;
@@ -814,7 +640,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
                                                 &hole_start,
                                                 &hole_end,
                                                 opts.data_replicas, true))
-                               ret = drop_locks_do(&trans,
+                               ret = drop_locks_do(trans,
                                        (bch2_clamp_data_hole(&inode->v,
                                                              &hole_start,
                                                              &hole_end,
@@ -837,7 +663,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
                                goto bkey_err;
                }
 
-               ret = bch2_extent_fallocate(&trans, inode_inum(inode), &iter,
+               ret = bch2_extent_fallocate(trans, inode_inum(inode), &iter,
                                            sectors, opts, &i_sectors_delta,
                                            writepoint_hashed((unsigned long) current));
                if (ret)
@@ -845,7 +671,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
 
                bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
 
-               drop_locks_do(&trans,
+               drop_locks_do(trans,
                        (bch2_mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0));
 bkey_err:
                bch2_quota_reservation_put(c, inode, &quota_res);
@@ -857,14 +683,14 @@ bkey_err:
                struct quota_res quota_res = { 0 };
                s64 i_sectors_delta = 0;
 
-               bch2_fpunch_at(&trans, &iter, inode_inum(inode),
+               bch2_fpunch_at(trans, &iter, inode_inum(inode),
                               end_sector, &i_sectors_delta);
                bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
                bch2_quota_reservation_put(c, inode, &quota_res);
        }
 
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
+       bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
        return ret;
 }
 
@@ -970,26 +796,24 @@ static int quota_reserve_range(struct bch_inode_info *inode,
                               u64 start, u64 end)
 {
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        u32 snapshot;
        u64 sectors = end - start;
        u64 pos = start;
        int ret;
-
-       bch2_trans_init(&trans, c, 0, 0);
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
 
-       ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot);
+       ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot);
        if (ret)
                goto err;
 
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
                             SPOS(inode->v.i_ino, pos, snapshot), 0);
 
-       while (!(ret = btree_trans_too_many_iters(&trans)) &&
+       while (!(ret = btree_trans_too_many_iters(trans)) &&
               (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k &&
               !(ret = bkey_err(k))) {
                if (bkey_extent_is_allocation(k.k)) {
@@ -1001,17 +825,14 @@ retry:
                bch2_btree_iter_advance(&iter);
        }
        pos = iter.pos.offset;
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
 err:
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
 
-       bch2_trans_exit(&trans);
-
-       if (ret)
-               return ret;
+       bch2_trans_put(trans);
 
-       return bch2_quota_reservation_add(c, inode, res, sectors, true);
+       return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true);
 }
 
 loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
@@ -1104,7 +925,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
 {
        struct bch_inode_info *inode = file_bch_inode(file);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        subvol_inum inum = inode_inum(inode);
@@ -1116,15 +937,15 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
        if (offset >= isize)
                return -ENXIO;
 
-       bch2_trans_init(&trans, c, 0, 0);
+       trans = bch2_trans_get(c);
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
 
-       ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+       ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
        if (ret)
                goto err;
 
-       for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_extents,
+       for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents,
                           SPOS(inode->v.i_ino, offset >> 9, snapshot),
                           POS(inode->v.i_ino, U64_MAX),
                           0, k, ret) {
@@ -1134,12 +955,12 @@ retry:
                } else if (k.k->p.offset >> 9 > isize)
                        break;
        }
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
 err:
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        if (ret)
                return ret;
 
@@ -1157,7 +978,7 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
 {
        struct bch_inode_info *inode = file_bch_inode(file);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        subvol_inum inum = inode_inum(inode);
@@ -1169,15 +990,15 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
        if (offset >= isize)
                return -ENXIO;
 
-       bch2_trans_init(&trans, c, 0, 0);
+       trans = bch2_trans_get(c);
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
 
-       ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+       ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
        if (ret)
                goto err;
 
-       for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
+       for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
                           SPOS(inode->v.i_ino, offset >> 9, snapshot),
                           BTREE_ITER_SLOTS, k, ret) {
                if (k.k->p.inode != inode->v.i_ino) {
@@ -1195,12 +1016,12 @@ retry:
                        offset = max(offset, bkey_start_offset(k.k) << 9);
                }
        }
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
 err:
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        if (ret)
                return ret;
 
index bb5b709fa8cf69c05f26b81fbbcf5515f42c88c8..ca70346e68dc3d9196c85ce7768b1c9f53e6e792 100644 (file)
@@ -6,7 +6,7 @@
 
 #include "buckets.h"
 #include "fs.h"
-#include "io_types.h"
+#include "io_write_types.h"
 #include "quota.h"
 
 #include <linux/uio.h>
@@ -165,7 +165,7 @@ int __must_check bch2_write_inode_size(struct bch_fs *,
 
 int bch2_fsync(struct file *, loff_t, loff_t, int);
 
-int bch2_truncate(struct mnt_idmap *,
+int bchfs_truncate(struct mnt_idmap *,
                  struct bch_inode_info *, struct iattr *);
 long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);
 
index 141bcced031e25d9043122c78542cc09cb8586ed..0679b2f79fd6fdca71b820bac339f4bb4ffe2b3a 100644 (file)
@@ -122,7 +122,10 @@ static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode,
 
        fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ];
 
-       return copy_to_user(arg, &fa, sizeof(fa));
+       if (copy_to_user(arg, &fa, sizeof(fa)))
+               return -EFAULT;
+
+       return 0;
 }
 
 static int fssetxattr_inode_update_fn(struct btree_trans *trans,
index f201980ef2c38e2dbbe5faf7138f0d2a2c479f16..54a9c21a3b832ba9ad5c0281abe7363ec6f9cd9a 100644 (file)
@@ -5,7 +5,7 @@
 /* Inode flags: */
 
 /* bcachefs inode flags -> vfs inode flags: */
-static const unsigned bch_flags_to_vfs[] = {
+static const __maybe_unused unsigned bch_flags_to_vfs[] = {
        [__BCH_INODE_SYNC]      = S_SYNC,
        [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE,
        [__BCH_INODE_APPEND]    = S_APPEND,
@@ -13,7 +13,7 @@ static const unsigned bch_flags_to_vfs[] = {
 };
 
 /* bcachefs inode flags -> FS_IOC_GETFLAGS: */
-static const unsigned bch_flags_to_uflags[] = {
+static const __maybe_unused unsigned bch_flags_to_uflags[] = {
        [__BCH_INODE_SYNC]      = FS_SYNC_FL,
        [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL,
        [__BCH_INODE_APPEND]    = FS_APPEND_FL,
@@ -22,7 +22,7 @@ static const unsigned bch_flags_to_uflags[] = {
 };
 
 /* bcachefs inode flags -> FS_IOC_FSGETXATTR: */
-static const unsigned bch_flags_to_xflags[] = {
+static const __maybe_unused unsigned bch_flags_to_xflags[] = {
        [__BCH_INODE_SYNC]      = FS_XFLAG_SYNC,
        [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE,
        [__BCH_INODE_APPEND]    = FS_XFLAG_APPEND,
index 80dcda43e26b9dac9b7c3142cc946f03a764e799..1354af2cb85c47d61392a3b04d72dbb3d359e7a1 100644 (file)
@@ -19,7 +19,7 @@
 #include "fs-io-pagecache.h"
 #include "fsck.h"
 #include "inode.h"
-#include "io.h"
+#include "io_read.h"
 #include "journal.h"
 #include "keylist.h"
 #include "quota.h"
@@ -82,29 +82,27 @@ int __must_check bch2_write_inode(struct bch_fs *c,
                                  inode_set_fn set,
                                  void *p, unsigned fields)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter = { NULL };
        struct bch_inode_unpacked inode_u;
        int ret;
-
-       bch2_trans_init(&trans, c, 0, 512);
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
 
-       ret   = bch2_inode_peek(&trans, &iter, &inode_u, inode_inum(inode),
+       ret   = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode),
                                BTREE_ITER_INTENT) ?:
-               (set ? set(&trans, inode, &inode_u, p) : 0) ?:
-               bch2_inode_write(&trans, &iter, &inode_u) ?:
-               bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+               (set ? set(trans, inode, &inode_u, p) : 0) ?:
+               bch2_inode_write(trans, &iter, &inode_u) ?:
+               bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
 
        /*
         * the btree node lock protects inode->ei_inode, not ei_update_lock;
         * this is important for inode updates via bchfs_write_index_update
         */
        if (!ret)
-               bch2_inode_update_after_write(&trans, inode, &inode_u, fields);
+               bch2_inode_update_after_write(trans, inode, &inode_u, fields);
 
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
 
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
@@ -114,7 +112,7 @@ retry:
                             inode_inum(inode).subvol,
                             inode_inum(inode).inum);
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        return ret < 0 ? ret : 0;
 }
 
@@ -182,7 +180,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
 {
        struct bch_inode_unpacked inode_u;
        struct bch_inode_info *inode;
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct bch_subvolume subvol;
        int ret;
 
@@ -196,14 +194,14 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
        if (!(inode->v.i_state & I_NEW))
                return &inode->v;
 
-       bch2_trans_init(&trans, c, 8, 0);
-       ret = lockrestart_do(&trans,
-               bch2_subvolume_get(&trans, inum.subvol, true, 0, &subvol) ?:
-               bch2_inode_find_by_inum_trans(&trans, inum, &inode_u));
+       trans = bch2_trans_get(c);
+       ret = lockrestart_do(trans,
+               bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
+               bch2_inode_find_by_inum_trans(trans, inum, &inode_u));
 
        if (!ret)
-               bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol);
-       bch2_trans_exit(&trans);
+               bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
+       bch2_trans_put(trans);
 
        if (ret) {
                iget_failed(&inode->v);
@@ -226,7 +224,7 @@ __bch2_create(struct mnt_idmap *idmap,
              unsigned flags)
 {
        struct bch_fs *c = dir->v.i_sb->s_fs_info;
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct bch_inode_unpacked dir_u;
        struct bch_inode_info *inode, *old;
        struct bch_inode_unpacked inode_u;
@@ -256,13 +254,11 @@ __bch2_create(struct mnt_idmap *idmap,
        if (!(flags & BCH_CREATE_TMPFILE))
                mutex_lock(&dir->ei_update_lock);
 
-       bch2_trans_init(&trans, c, 8,
-                       2048 + (!(flags & BCH_CREATE_TMPFILE)
-                               ? dentry->d_name.len : 0));
+       trans = bch2_trans_get(c);
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
 
-       ret   = bch2_create_trans(&trans,
+       ret   = bch2_create_trans(trans,
                                  inode_inum(dir), &dir_u, &inode_u,
                                  !(flags & BCH_CREATE_TMPFILE)
                                  ? &dentry->d_name : NULL,
@@ -278,9 +274,9 @@ retry:
        inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
        inum.inum = inode_u.bi_inum;
 
-       ret   = bch2_subvolume_get(&trans, inum.subvol, true,
+       ret   = bch2_subvolume_get(trans, inum.subvol, true,
                                   BTREE_ITER_WITH_UPDATES, &subvol) ?:
-               bch2_trans_commit(&trans, NULL, &journal_seq, 0);
+               bch2_trans_commit(trans, NULL, &journal_seq, 0);
        if (unlikely(ret)) {
                bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
                                KEY_TYPE_QUOTA_WARN);
@@ -291,13 +287,13 @@ err_before_quota:
        }
 
        if (!(flags & BCH_CREATE_TMPFILE)) {
-               bch2_inode_update_after_write(&trans, dir, &dir_u,
+               bch2_inode_update_after_write(trans, dir, &dir_u,
                                              ATTR_MTIME|ATTR_CTIME);
                mutex_unlock(&dir->ei_update_lock);
        }
 
        bch2_iget5_set(&inode->v, &inum);
-       bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol);
+       bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
 
        set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
        set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
@@ -337,7 +333,7 @@ err_before_quota:
                unlock_new_inode(&inode->v);
        }
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 err:
        posix_acl_release(default_acl);
        posix_acl_release(acl);
@@ -346,7 +342,7 @@ err_trans:
        if (!(flags & BCH_CREATE_TMPFILE))
                mutex_unlock(&dir->ei_update_lock);
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        make_bad_inode(&inode->v);
        iput(&inode->v);
        inode = ERR_PTR(ret);
@@ -401,26 +397,25 @@ static int __bch2_link(struct bch_fs *c,
                       struct bch_inode_info *dir,
                       struct dentry *dentry)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct bch_inode_unpacked dir_u, inode_u;
        int ret;
 
        mutex_lock(&inode->ei_update_lock);
-       bch2_trans_init(&trans, c, 4, 1024);
 
-       ret = commit_do(&trans, NULL, NULL, 0,
-                       bch2_link_trans(&trans,
+       ret = commit_do(trans, NULL, NULL, 0,
+                       bch2_link_trans(trans,
                                        inode_inum(dir),   &dir_u,
                                        inode_inum(inode), &inode_u,
                                        &dentry->d_name));
 
        if (likely(!ret)) {
-               bch2_inode_update_after_write(&trans, dir, &dir_u,
+               bch2_inode_update_after_write(trans, dir, &dir_u,
                                              ATTR_MTIME|ATTR_CTIME);
-               bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME);
+               bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME);
        }
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        mutex_unlock(&inode->ei_update_lock);
        return ret;
 }
@@ -451,24 +446,23 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
        struct bch_inode_info *dir = to_bch_ei(vdir);
        struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
        struct bch_inode_unpacked dir_u, inode_u;
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        int ret;
 
        bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
-       bch2_trans_init(&trans, c, 4, 1024);
 
-       ret = commit_do(&trans, NULL, NULL,
+       ret = commit_do(trans, NULL, NULL,
                        BTREE_INSERT_NOFAIL,
-               bch2_unlink_trans(&trans,
+               bch2_unlink_trans(trans,
                                  inode_inum(dir), &dir_u,
                                  &inode_u, &dentry->d_name,
                                  deleting_snapshot));
        if (unlikely(ret))
                goto err;
 
-       bch2_inode_update_after_write(&trans, dir, &dir_u,
+       bch2_inode_update_after_write(trans, dir, &dir_u,
                                      ATTR_MTIME|ATTR_CTIME);
-       bch2_inode_update_after_write(&trans, inode, &inode_u,
+       bch2_inode_update_after_write(trans, inode, &inode_u,
                                      ATTR_MTIME);
 
        if (inode_u.bi_subvol) {
@@ -479,8 +473,8 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
                set_nlink(&inode->v, 0);
        }
 err:
-       bch2_trans_exit(&trans);
        bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
+       bch2_trans_put(trans);
 
        return ret;
 }
@@ -543,7 +537,7 @@ static int bch2_rename2(struct mnt_idmap *idmap,
        struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
        struct bch_inode_unpacked dst_dir_u, src_dir_u;
        struct bch_inode_unpacked src_inode_u, dst_inode_u;
-       struct btree_trans trans;
+       struct btree_trans *trans;
        enum bch_rename_mode mode = flags & RENAME_EXCHANGE
                ? BCH_RENAME_EXCHANGE
                : dst_dentry->d_inode
@@ -560,7 +554,7 @@ static int bch2_rename2(struct mnt_idmap *idmap,
                        return ret;
        }
 
-       bch2_trans_init(&trans, c, 8, 2048);
+       trans = bch2_trans_get(c);
 
        bch2_lock_inodes(INODE_UPDATE_LOCK,
                         src_dir,
@@ -587,8 +581,8 @@ static int bch2_rename2(struct mnt_idmap *idmap,
                        goto err;
        }
 
-       ret = commit_do(&trans, NULL, NULL, 0,
-                       bch2_rename_trans(&trans,
+       ret = commit_do(trans, NULL, NULL, 0,
+                       bch2_rename_trans(trans,
                                          inode_inum(src_dir), &src_dir_u,
                                          inode_inum(dst_dir), &dst_dir_u,
                                          &src_inode_u,
@@ -603,21 +597,21 @@ static int bch2_rename2(struct mnt_idmap *idmap,
        BUG_ON(dst_inode &&
               dst_inode->v.i_ino != dst_inode_u.bi_inum);
 
-       bch2_inode_update_after_write(&trans, src_dir, &src_dir_u,
+       bch2_inode_update_after_write(trans, src_dir, &src_dir_u,
                                      ATTR_MTIME|ATTR_CTIME);
 
        if (src_dir != dst_dir)
-               bch2_inode_update_after_write(&trans, dst_dir, &dst_dir_u,
+               bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u,
                                              ATTR_MTIME|ATTR_CTIME);
 
-       bch2_inode_update_after_write(&trans, src_inode, &src_inode_u,
+       bch2_inode_update_after_write(trans, src_inode, &src_inode_u,
                                      ATTR_CTIME);
 
        if (dst_inode)
-               bch2_inode_update_after_write(&trans, dst_inode, &dst_inode_u,
+               bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u,
                                              ATTR_CTIME);
 err:
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        bch2_fs_quota_transfer(c, src_inode,
                               bch_qid(&src_inode->ei_inode),
@@ -680,7 +674,7 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap,
 {
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch_qid qid;
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter inode_iter = { NULL };
        struct bch_inode_unpacked inode_u;
        struct posix_acl *acl = NULL;
@@ -701,13 +695,13 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap,
        if (ret)
                goto err;
 
-       bch2_trans_init(&trans, c, 0, 0);
+       trans = bch2_trans_get(c);
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
        kfree(acl);
        acl = NULL;
 
-       ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode),
+       ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
                              BTREE_ITER_INTENT);
        if (ret)
                goto btree_err;
@@ -715,29 +709,29 @@ retry:
        bch2_setattr_copy(idmap, inode, &inode_u, attr);
 
        if (attr->ia_valid & ATTR_MODE) {
-               ret = bch2_acl_chmod(&trans, inode_inum(inode), &inode_u,
+               ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u,
                                     inode_u.bi_mode, &acl);
                if (ret)
                        goto btree_err;
        }
 
-       ret =   bch2_inode_write(&trans, &inode_iter, &inode_u) ?:
-               bch2_trans_commit(&trans, NULL, NULL,
+       ret =   bch2_inode_write(trans, &inode_iter, &inode_u) ?:
+               bch2_trans_commit(trans, NULL, NULL,
                                  BTREE_INSERT_NOFAIL);
 btree_err:
-       bch2_trans_iter_exit(&trans, &inode_iter);
+       bch2_trans_iter_exit(trans, &inode_iter);
 
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
        if (unlikely(ret))
                goto err_trans;
 
-       bch2_inode_update_after_write(&trans, inode, &inode_u, attr->ia_valid);
+       bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid);
 
        if (acl)
                set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
 err_trans:
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 err:
        mutex_unlock(&inode->ei_update_lock);
 
@@ -798,7 +792,7 @@ static int bch2_setattr(struct mnt_idmap *idmap,
                return ret;
 
        return iattr->ia_valid & ATTR_SIZE
-               ? bch2_truncate(idmap, inode, iattr)
+               ? bchfs_truncate(idmap, inode, iattr)
                : bch2_setattr_nonsize(idmap, inode, iattr);
 }
 
@@ -879,7 +873,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
 {
        struct bch_fs *c = vinode->i_sb->s_fs_info;
        struct bch_inode_info *ei = to_bch_ei(vinode);
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_buf cur, prev;
@@ -900,18 +894,18 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
 
        bch2_bkey_buf_init(&cur);
        bch2_bkey_buf_init(&prev);
-       bch2_trans_init(&trans, c, 0, 0);
+       trans = bch2_trans_get(c);
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
 
-       ret = bch2_subvolume_get_snapshot(&trans, ei->ei_subvol, &snapshot);
+       ret = bch2_subvolume_get_snapshot(trans, ei->ei_subvol, &snapshot);
        if (ret)
                goto err;
 
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
                             SPOS(ei->v.i_ino, start, snapshot), 0);
 
-       while (!(ret = btree_trans_too_many_iters(&trans)) &&
+       while (!(ret = btree_trans_too_many_iters(trans)) &&
               (k = bch2_btree_iter_peek_upto(&iter, end)).k &&
               !(ret = bkey_err(k))) {
                enum btree_id data_btree = BTREE_ID_extents;
@@ -928,7 +922,7 @@ retry:
 
                bch2_bkey_buf_reassemble(&cur, c, k);
 
-               ret = bch2_read_indirect_extent(&trans, &data_btree,
+               ret = bch2_read_indirect_extent(trans, &data_btree,
                                        &offset_into_extent, &cur);
                if (ret)
                        break;
@@ -947,7 +941,7 @@ retry:
                cur.k->k.p.offset += cur.k->k.size;
 
                if (have_extent) {
-                       bch2_trans_unlock(&trans);
+                       bch2_trans_unlock(trans);
                        ret = bch2_fill_extent(c, info,
                                        bkey_i_to_s_c(prev.k), 0);
                        if (ret)
@@ -961,18 +955,18 @@ retry:
                        POS(iter.pos.inode, iter.pos.offset + sectors));
        }
        start = iter.pos.offset;
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
 err:
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
 
        if (!ret && have_extent) {
-               bch2_trans_unlock(&trans);
+               bch2_trans_unlock(trans);
                ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
                                       FIEMAP_EXTENT_LAST);
        }
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        bch2_bkey_buf_exit(&cur, c);
        bch2_bkey_buf_exit(&prev, c);
        return ret < 0 ? ret : 0;
@@ -1230,7 +1224,7 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child
        struct bch_inode_info *inode    = to_bch_ei(child->d_inode);
        struct bch_inode_info *dir      = to_bch_ei(parent->d_inode);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter iter1;
        struct btree_iter iter2;
        struct bkey_s_c k;
@@ -1245,23 +1239,23 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child
        if (!S_ISDIR(dir->v.i_mode))
                return -EINVAL;
 
-       bch2_trans_init(&trans, c, 0, 0);
+       trans = bch2_trans_get(c);
 
-       bch2_trans_iter_init(&trans, &iter1, BTREE_ID_dirents,
+       bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents,
                             POS(dir->ei_inode.bi_inum, 0), 0);
-       bch2_trans_iter_init(&trans, &iter2, BTREE_ID_dirents,
+       bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents,
                             POS(dir->ei_inode.bi_inum, 0), 0);
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
 
-       ret = bch2_subvolume_get_snapshot(&trans, dir->ei_subvol, &snapshot);
+       ret = bch2_subvolume_get_snapshot(trans, dir->ei_subvol, &snapshot);
        if (ret)
                goto err;
 
        bch2_btree_iter_set_snapshot(&iter1, snapshot);
        bch2_btree_iter_set_snapshot(&iter2, snapshot);
 
-       ret = bch2_inode_find_by_inum_trans(&trans, inode_inum(inode), &inode_u);
+       ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u);
        if (ret)
                goto err;
 
@@ -1279,7 +1273,7 @@ retry:
                }
 
                d = bkey_s_c_to_dirent(k);
-               ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target);
+               ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
                if (ret > 0)
                        ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
                if (ret)
@@ -1301,7 +1295,7 @@ retry:
                                continue;
 
                        d = bkey_s_c_to_dirent(k);
-                       ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target);
+                       ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
                        if (ret < 0)
                                break;
                        if (ret)
@@ -1325,9 +1319,9 @@ err:
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
 
-       bch2_trans_iter_exit(&trans, &iter1);
-       bch2_trans_iter_exit(&trans, &iter2);
-       bch2_trans_exit(&trans);
+       bch2_trans_iter_exit(trans, &iter1);
+       bch2_trans_iter_exit(trans, &iter2);
+       bch2_trans_put(trans);
 
        return ret;
 }
@@ -1661,7 +1655,7 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data)
                up_write(&c->state_lock);
        }
 
-       if (opts.errors >= 0)
+       if (opt_defined(opts, errors))
                c->opts.errors = opts.errors;
 err:
        return bch2_err_class(ret);
@@ -1722,6 +1716,35 @@ static void bch2_put_super(struct super_block *sb)
        __bch2_fs_stop(c);
 }
 
+/*
+ * bcachefs doesn't currently integrate intwrite freeze protection but the
+ * internal write references serve the same purpose. Therefore reuse the
+ * read-only transition code to perform the quiesce. The caveat is that we don't
+ * currently have the ability to block tasks that want a write reference while
+ * the superblock is frozen. This is fine for now, but we should either add
+ * blocking support or find a way to integrate sb_start_intwrite() and friends.
+ */
+static int bch2_freeze(struct super_block *sb) /* installed as .freeze_fs in bch_super_operations */
+{
+       struct bch_fs *c = sb->s_fs_info; /* filesystem instance hung off the superblock */
+
+       down_write(&c->state_lock); /* hold state_lock for writing across the transition */
+       bch2_fs_read_only(c); /* quiesce by reusing the read-only transition */
+       up_write(&c->state_lock);
+       return 0; /* unconditionally reports success to the caller */
+}
+
+static int bch2_unfreeze(struct super_block *sb) /* installed as .unfreeze_fs; counterpart of bch2_freeze() */
+{
+       struct bch_fs *c = sb->s_fs_info; /* filesystem instance hung off the superblock */
+       int ret;
+
+       down_write(&c->state_lock); /* hold state_lock for writing across the transition */
+       ret = bch2_fs_read_write(c); /* return to read-write; result is captured because it may be nonzero */
+       up_write(&c->state_lock);
+       return ret; /* hand the read-write transition's result back to the caller */
+}
+
 static const struct super_operations bch_super_operations = {
        .alloc_inode    = bch2_alloc_inode,
        .destroy_inode  = bch2_destroy_inode,
@@ -1733,10 +1756,8 @@ static const struct super_operations bch_super_operations = {
        .show_options   = bch2_show_options,
        .remount_fs     = bch2_remount,
        .put_super      = bch2_put_super,
-#if 0
        .freeze_fs      = bch2_freeze,
        .unfreeze_fs    = bch2_unfreeze,
-#endif
 };
 
 static int bch2_set_super(struct super_block *s, void *data)
@@ -1890,7 +1911,7 @@ got_sb:
        vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
        ret = PTR_ERR_OR_ZERO(vinode);
        if (ret) {
-               bch_err(c, "error mounting: error getting root inode: %s", bch2_err_str(ret));
+               bch_err_msg(c, ret, "mounting: error getting root inode");
                goto err_put_super;
        }
 
index 10e11119ded222bf6c4d7a474db805fc6f465b3e..5edf1d4b9e6bdfa9a992bf895727228c79de4267 100644 (file)
@@ -197,7 +197,7 @@ int bch2_vfs_init(void);
 
 #else
 
-#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields)       do {} while (0)
+#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields)       ({ do {} while (0); })
 
 static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
                                               snapshot_id_list *s) {}
index 57b3dfabea5411164e37c009cdd988c48d994eaa..206302b0f5ed8a6393b7a9f97fc6b1d0d89bb0d4 100644 (file)
@@ -80,7 +80,7 @@ static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot,
        if (!ret)
                *subvol = le32_to_cpu(s.subvol);
        else if (bch2_err_matches(ret, ENOENT))
-               bch_err(trans->c, "snapshot %u not fonud", snapshot);
+               bch_err(trans->c, "snapshot %u not found", snapshot);
        return ret;
 
 }
@@ -127,8 +127,7 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
        ret = bch2_inode_unpack(k, inode);
 err:
        if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               bch_err(trans->c, "error fetching inode %llu: %s",
-                       inode_nr, bch2_err_str(ret));
+               bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
        bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
@@ -154,8 +153,7 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
                *snapshot = iter.pos.snapshot;
 err:
        if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               bch_err(trans->c, "error fetching inode %llu:%u: %s",
-                       inode_nr, *snapshot, bch2_err_str(ret));
+               bch_err_msg(trans->c, ret, "fetching inode %llu:%u", inode_nr, *snapshot);
        bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
@@ -206,17 +204,16 @@ static int __write_inode(struct btree_trans *trans,
                                BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
 }
 
-static int write_inode(struct btree_trans *trans,
-                      struct bch_inode_unpacked *inode,
-                      u32 snapshot)
+static int fsck_write_inode(struct btree_trans *trans,
+                           struct bch_inode_unpacked *inode,
+                           u32 snapshot)
 {
        int ret = commit_do(trans, NULL, NULL,
                                  BTREE_INSERT_NOFAIL|
                                  BTREE_INSERT_LAZY_RW,
                                  __write_inode(trans, inode, snapshot));
        if (ret)
-               bch_err(trans->c, "error in fsck: error updating inode: %s",
-                       bch2_err_str(ret));
+               bch_err_fn(trans->c, ret);
        return ret;
 }
 
@@ -278,13 +275,13 @@ static int lookup_lostfound(struct btree_trans *trans, u32 subvol,
        }
 
        if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               bch_err(c, "error looking up lost+found: %s", bch2_err_str(ret));
+               bch_err_fn(c, ret);
        if (ret)
                return ret;
 
        if (d_type != DT_DIR) {
                bch_err(c, "error looking up lost+found: not a directory");
-               return ret;
+               return -BCH_ERR_ENOENT_not_directory;
        }
 
        /*
@@ -301,7 +298,7 @@ create_lostfound:
                                0, 0, S_IFDIR|0700, 0, NULL, NULL,
                                (subvol_inum) { }, 0);
        if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               bch_err(c, "error creating lost+found: %s", bch2_err_str(ret));
+               bch_err_msg(c, ret, "creating lost+found");
        return ret;
 }
 
@@ -365,8 +362,7 @@ static int reattach_inode(struct btree_trans *trans,
                                  BTREE_INSERT_NOFAIL,
                        __reattach_inode(trans, inode, inode_snapshot));
        if (ret) {
-               bch_err(trans->c, "error reattaching inode %llu: %s",
-                       inode->bi_inum, bch2_err_str(ret));
+               bch_err_msg(trans->c, ret, "reattaching inode %llu", inode->bi_inum);
                return ret;
        }
 
@@ -475,7 +471,12 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
  * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
  * and @ancestor hasn't been overwritten in @seen
  *
- * That is, returns whether key in @ancestor snapshot is visible in @id snapshot
+ * @c:         filesystem handle
+ * @seen:      list of snapshot ids already seen at current position
+ * @id:                descendent snapshot id
+ * @ancestor:  ancestor snapshot id
+ *
+ * Returns:    whether key in @ancestor snapshot is visible in @id snapshot
  */
 static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
                                    u32 id, u32 ancestor)
@@ -520,14 +521,16 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see
  * snapshot id @dst, test whether there is some snapshot in which @dst is
  * visible.
  *
- * This assumes we're visiting @src keys in natural key order.
+ * @c:         filesystem handle
+ * @s:         list of snapshot IDs already seen at @src
+ * @src:       snapshot ID of src key
+ * @dst:       snapshot ID of dst key
+ * Returns:    true if there is some snapshot in which @dst is visible
  *
- * @s  - list of snapshot IDs already seen at @src
- * @src        - snapshot ID of src key
- * @dst        - snapshot ID of dst key
+ * Assumes we're visiting @src keys in natural key order
  */
-static int ref_visible(struct bch_fs *c, struct snapshots_seen *s,
-                      u32 src, u32 dst)
+static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
+                       u32 src, u32 dst)
 {
        return dst <= src
                ? key_visible_in_snapshot(c, s, dst, src)
@@ -618,10 +621,7 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
 
        w->first_this_inode = true;
 
-       if (trans_was_restarted(trans, restart_count))
-               return -BCH_ERR_transaction_restart_nested;
-
-       return 0;
+       return trans_was_restarted(trans, restart_count);
 }
 
 static struct inode_walker_entry *
@@ -822,7 +822,7 @@ bad_hash:
                      bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
                ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
                if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       bch_err(c, "hash_redo_key err %s", bch2_err_str(ret));
+                       bch_err_fn(c, ret);
                if (ret)
                        return ret;
                ret = -BCH_ERR_transaction_restart_nested;
@@ -886,7 +886,8 @@ static int check_inode(struct btree_trans *trans,
 
                ret = __write_inode(trans, &u, iter->pos.snapshot);
                if (ret) {
-                       bch_err_msg(c, ret, "in fsck: error updating inode");
+                       if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
+                               bch_err_msg(c, ret, "in fsck updating inode");
                        return ret;
                }
 
@@ -904,8 +905,7 @@ static int check_inode(struct btree_trans *trans,
 
                ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
                if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       bch_err(c, "error in fsck: error while deleting inode: %s",
-                               bch2_err_str(ret));
+                       bch_err_msg(c, ret, "in fsck deleting inode");
                return ret;
        }
 
@@ -928,8 +928,7 @@ static int check_inode(struct btree_trans *trans,
                                POS(u.bi_inum, U64_MAX),
                                0, NULL);
                if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       bch_err(c, "error in fsck: error truncating inode: %s",
-                               bch2_err_str(ret));
+                       bch_err_msg(c, ret, "in fsck truncating inode");
                if (ret)
                        return ret;
 
@@ -954,8 +953,7 @@ static int check_inode(struct btree_trans *trans,
 
                sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
                if (sectors < 0) {
-                       bch_err(c, "error in fsck: error recounting inode sectors: %s",
-                               bch2_err_str(sectors));
+                       bch_err_msg(c, sectors, "fsck recounting inode sectors");
                        return sectors;
                }
 
@@ -974,13 +972,13 @@ static int check_inode(struct btree_trans *trans,
        if (do_update) {
                ret = __write_inode(trans, &u, iter->pos.snapshot);
                if (ret) {
-                       bch_err_msg(c, ret, "in fsck: error updating inode");
+                       bch_err_msg(c, ret, "in fsck updating inode");
                        return ret;
                }
        }
 err:
 fsck_err:
-       if (ret)
+       if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
                bch_err_fn(c, ret);
        return ret;
 }
@@ -989,7 +987,7 @@ noinline_for_stack
 int bch2_check_inodes(struct bch_fs *c)
 {
        bool full = c->opts.fsck;
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bch_inode_unpacked prev = { 0 };
        struct snapshots_seen s;
@@ -997,16 +995,15 @@ int bch2_check_inodes(struct bch_fs *c)
        int ret;
 
        snapshots_seen_init(&s);
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
-       ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes,
+       ret = for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
                        POS_MIN,
                        BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
                        NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
-               check_inode(&trans, &iter, k, &prev, &s, full));
+               check_inode(trans, &iter, k, &prev, &s, full));
 
-       bch2_trans_exit(&trans);
        snapshots_seen_exit(&s);
+       bch2_trans_put(trans);
        if (ret)
                bch_err_fn(c, ret);
        return ret;
@@ -1081,7 +1078,7 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
                            w->last_pos.inode, i->snapshot,
                            i->inode.bi_sectors, i->count)) {
                        i->inode.bi_sectors = i->count;
-                       ret = write_inode(trans, &i->inode, i->snapshot);
+                       ret = fsck_write_inode(trans, &i->inode, i->snapshot);
                        if (ret)
                                break;
                }
@@ -1089,9 +1086,7 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
 fsck_err:
        if (ret)
                bch_err_fn(c, ret);
-       if (!ret && trans_was_restarted(trans, restart_count))
-               ret = -BCH_ERR_transaction_restart_nested;
-       return ret;
+       return ret ?: trans_was_restarted(trans, restart_count);
 }
 
 struct extent_end {
@@ -1441,7 +1436,7 @@ int bch2_check_extents(struct bch_fs *c)
 {
        struct inode_walker w = inode_walker_init();
        struct snapshots_seen s;
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        struct extent_ends extent_ends;
@@ -1450,23 +1445,22 @@ int bch2_check_extents(struct bch_fs *c)
 
        snapshots_seen_init(&s);
        extent_ends_init(&extent_ends);
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096);
 
-       ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_extents,
+       ret = for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
                        POS(BCACHEFS_ROOT_INO, 0),
                        BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
                        &res, NULL,
                        BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({
                bch2_disk_reservation_put(c, &res);
-               check_extent(&trans, &iter, k, &w, &s, &extent_ends);
+               check_extent(trans, &iter, k, &w, &s, &extent_ends);
        })) ?:
-       check_i_sectors(&trans, &w);
+       check_i_sectors(trans, &w);
 
        bch2_disk_reservation_put(c, &res);
        extent_ends_exit(&extent_ends);
        inode_walker_exit(&w);
-       bch2_trans_exit(&trans);
        snapshots_seen_exit(&s);
+       bch2_trans_put(trans);
 
        if (ret)
                bch_err_fn(c, ret);
@@ -1501,7 +1495,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
                                "directory %llu:%u with wrong i_nlink: got %u, should be %llu",
                                w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
                        i->inode.bi_nlink = i->count;
-                       ret = write_inode(trans, &i->inode, i->snapshot);
+                       ret = fsck_write_inode(trans, &i->inode, i->snapshot);
                        if (ret)
                                break;
                }
@@ -1509,9 +1503,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
 fsck_err:
        if (ret)
                bch_err_fn(c, ret);
-       if (!ret && trans_was_restarted(trans, restart_count))
-               ret = -BCH_ERR_transaction_restart_nested;
-       return ret;
+       return ret ?: trans_was_restarted(trans, restart_count);
 }
 
 static int check_dirent_target(struct btree_trans *trans,
@@ -1809,23 +1801,22 @@ int bch2_check_dirents(struct bch_fs *c)
        struct inode_walker target = inode_walker_init();
        struct snapshots_seen s;
        struct bch_hash_info hash_info;
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret = 0;
 
        snapshots_seen_init(&s);
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
-       ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_dirents,
+       ret = for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
                        POS(BCACHEFS_ROOT_INO, 0),
                        BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
                        k,
                        NULL, NULL,
                        BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
-               check_dirent(&trans, &iter, k, &hash_info, &dir, &target, &s));
+               check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s));
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        snapshots_seen_exit(&s);
        inode_walker_exit(&dir);
        inode_walker_exit(&target);
@@ -1879,23 +1870,18 @@ int bch2_check_xattrs(struct bch_fs *c)
 {
        struct inode_walker inode = inode_walker_init();
        struct bch_hash_info hash_info;
-       struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret = 0;
 
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-
-       ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs,
+       ret = bch2_trans_run(c,
+               for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
                        POS(BCACHEFS_ROOT_INO, 0),
                        BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
                        k,
                        NULL, NULL,
                        BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
-               check_xattr(&trans, &iter, k, &hash_info, &inode));
-
-       bch2_trans_exit(&trans);
-
+               check_xattr(trans, &iter, k, &hash_info, &inode)));
        if (ret)
                bch_err_fn(c, ret);
        return ret;
@@ -1927,10 +1913,10 @@ static int check_root_trans(struct btree_trans *trans)
                ret = commit_do(trans, NULL, NULL,
                                      BTREE_INSERT_NOFAIL|
                                      BTREE_INSERT_LAZY_RW,
-                       __bch2_btree_insert(trans, BTREE_ID_subvolumes,
+                       bch2_btree_insert_trans(trans, BTREE_ID_subvolumes,
                                            &root_subvol.k_i, 0));
                if (ret) {
-                       bch_err(c, "error writing root subvol: %s", bch2_err_str(ret));
+                       bch_err_msg(c, ret, "writing root subvol");
                        goto err;
                }
 
@@ -1949,7 +1935,7 @@ static int check_root_trans(struct btree_trans *trans)
 
                ret = __write_inode(trans, &root_inode, snapshot);
                if (ret)
-                       bch_err(c, "error writing root inode: %s", bch2_err_str(ret));
+                       bch_err_msg(c, ret, "writing root inode");
        }
 err:
 fsck_err:
@@ -1964,7 +1950,7 @@ int bch2_check_root(struct bch_fs *c)
        ret = bch2_trans_do(c, NULL, NULL,
                             BTREE_INSERT_NOFAIL|
                             BTREE_INSERT_LAZY_RW,
-               check_root_trans(&trans));
+               check_root_trans(trans));
 
        if (ret)
                bch_err_fn(c, ret);
@@ -2116,16 +2102,14 @@ fsck_err:
  */
 int bch2_check_directory_structure(struct bch_fs *c)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bch_inode_unpacked u;
        pathbuf path = { 0, };
        int ret;
 
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-
-       for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
+       for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN,
                           BTREE_ITER_INTENT|
                           BTREE_ITER_PREFETCH|
                           BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
@@ -2142,12 +2126,12 @@ int bch2_check_directory_structure(struct bch_fs *c)
                if (u.bi_flags & BCH_INODE_UNLINKED)
                        continue;
 
-               ret = check_path(&trans, &path, &u, iter.pos.snapshot);
+               ret = check_path(trans, &path, &u, iter.pos.snapshot);
                if (ret)
                        break;
        }
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
+       bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
        darray_exit(&path);
 
        if (ret)
@@ -2155,8 +2139,6 @@ int bch2_check_directory_structure(struct bch_fs *c)
        return ret;
 }
 
-/* check_nlink pass: */
-
 struct nlink_table {
        size_t          nr;
        size_t          size;
@@ -2238,15 +2220,13 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c,
                                       struct nlink_table *t,
                                       u64 start, u64 *end)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bch_inode_unpacked u;
        int ret = 0;
 
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-
-       for_each_btree_key(&trans, iter, BTREE_ID_inodes,
+       for_each_btree_key(trans, iter, BTREE_ID_inodes,
                           POS(0, start),
                           BTREE_ITER_INTENT|
                           BTREE_ITER_PREFETCH|
@@ -2275,8 +2255,8 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c,
                }
 
        }
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
+       bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
 
        if (ret)
                bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
@@ -2288,7 +2268,7 @@ noinline_for_stack
 static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
                                     u64 range_start, u64 range_end)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct snapshots_seen s;
        struct btree_iter iter;
        struct bkey_s_c k;
@@ -2297,9 +2277,7 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links
 
        snapshots_seen_init(&s);
 
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-
-       for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN,
+       for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
                           BTREE_ITER_INTENT|
                           BTREE_ITER_PREFETCH|
                           BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
@@ -2319,12 +2297,12 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links
                        break;
                }
        }
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
 
        if (ret)
                bch_err(c, "error in fsck: btree error %i while walking dirents", ret);
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        snapshots_seen_exit(&s);
        return ret;
 }
@@ -2375,22 +2353,17 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c,
                               struct nlink_table *links,
                               u64 range_start, u64 range_end)
 {
-       struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        size_t idx = 0;
        int ret = 0;
 
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-
-       ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes,
-                       POS(0, range_start),
-                       BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
-                       NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
-               check_nlinks_update_inode(&trans, &iter, k, links, &idx, range_end));
-
-       bch2_trans_exit(&trans);
-
+       ret = bch2_trans_run(c,
+               for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
+                               POS(0, range_start),
+                               BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+                               NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+                       check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
        if (ret < 0) {
                bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
                return ret;
@@ -2472,13 +2445,12 @@ int bch2_fix_reflink_p(struct bch_fs *c)
                return 0;
 
        ret = bch2_trans_run(c,
-               for_each_btree_key_commit(&trans, iter,
+               for_each_btree_key_commit(trans, iter,
                                BTREE_ID_extents, POS_MIN,
                                BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|
                                BTREE_ITER_ALL_SNAPSHOTS, k,
                                NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
-                       fix_reflink_p_key(&trans, &iter, k)));
-
+                       fix_reflink_p_key(trans, &iter, k)));
        if (ret)
                bch_err_fn(c, ret);
        return ret;
index 8114b6e4f202b24bbda745aec5c805509fa1cea0..8bfd99cb7ad149deca66f2998bcecbc4bcc076a1 100644 (file)
@@ -120,8 +120,7 @@ static inline void bch2_inode_pack_inlined(struct bkey_inode_buf *packed,
        if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
                struct bch_inode_unpacked unpacked;
 
-               int ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i),
-                                          &unpacked);
+               ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), &unpacked);
                BUG_ON(ret);
                BUG_ON(unpacked.bi_inum         != inode->bi_inum);
                BUG_ON(unpacked.bi_hash_seed    != inode->bi_hash_seed);
@@ -318,7 +317,7 @@ int bch2_inode_unpack(struct bkey_s_c k,
        return bch2_inode_unpack_slowpath(k, unpacked);
 }
 
-int bch2_inode_peek(struct btree_trans *trans,
+static int bch2_inode_peek_nowarn(struct btree_trans *trans,
                    struct btree_iter *iter,
                    struct bch_inode_unpacked *inode,
                    subvol_inum inum, unsigned flags)
@@ -349,7 +348,17 @@ int bch2_inode_peek(struct btree_trans *trans,
        return 0;
 err:
        bch2_trans_iter_exit(trans, iter);
-       if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
+       return ret;
+}
+
+int bch2_inode_peek(struct btree_trans *trans,
+                   struct btree_iter *iter,
+                   struct bch_inode_unpacked *inode,
+                   subvol_inum inum, unsigned flags)
+{
+       int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags);
+
+       if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
                bch_err_msg(trans->c, ret, "looking up inum %u:%llu:", inum.subvol, inum.inum);
        return ret;
 }
@@ -817,7 +826,7 @@ err:
 
 int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter = { NULL };
        struct bkey_i_inode_generation delete;
        struct bch_inode_unpacked inode_u;
@@ -825,8 +834,6 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
        u32 snapshot;
        int ret;
 
-       bch2_trans_init(&trans, c, 0, 1024);
-
        /*
         * If this was a directory, there shouldn't be any real dirents left -
         * but there could be whiteouts (from hash collisions) that we should
@@ -835,19 +842,19 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
         * XXX: the dirent could ideally would delete whiteouts when they're no
         * longer needed
         */
-       ret   = bch2_inode_delete_keys(&trans, inum, BTREE_ID_extents) ?:
-               bch2_inode_delete_keys(&trans, inum, BTREE_ID_xattrs) ?:
-               bch2_inode_delete_keys(&trans, inum, BTREE_ID_dirents);
+       ret   = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?:
+               bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?:
+               bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents);
        if (ret)
                goto err;
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
 
-       ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+       ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
        if (ret)
                goto err;
 
-       k = bch2_bkey_get_iter(&trans, &iter, BTREE_ID_inodes,
+       k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
                               SPOS(0, inum.inum, snapshot),
                               BTREE_ITER_INTENT|BTREE_ITER_CACHED);
        ret = bkey_err(k);
@@ -855,7 +862,7 @@ retry:
                goto err;
 
        if (!bkey_is_inode(k.k)) {
-               bch2_fs_inconsistent(trans.c,
+               bch2_fs_inconsistent(c,
                                     "inode %llu:%u not found when deleting",
                                     inum.inum, snapshot);
                ret = -EIO;
@@ -868,15 +875,28 @@ retry:
        delete.k.p = iter.pos;
        delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
 
-       ret   = bch2_trans_update(&trans, &iter, &delete.k_i, 0) ?:
-               bch2_trans_commit(&trans, NULL, NULL,
+       ret   = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
+               bch2_trans_commit(trans, NULL, NULL,
                                BTREE_INSERT_NOFAIL);
 err:
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
+       return ret;
+}
+
+int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans,
+                                 subvol_inum inum,
+                                 struct bch_inode_unpacked *inode)
+{
+       struct btree_iter iter;
+       int ret;
+
+       ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0);
+       if (!ret)
+               bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -897,7 +917,7 @@ int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
                            struct bch_inode_unpacked *inode)
 {
        return bch2_trans_do(c, NULL, NULL, 0,
-               bch2_inode_find_by_inum_trans(&trans, inum, inode));
+               bch2_inode_find_by_inum_trans(trans, inum, inode));
 }
 
 int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
@@ -1069,14 +1089,12 @@ delete:
 
 int bch2_delete_dead_inodes(struct bch_fs *c)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
-       ret = bch2_btree_write_buffer_flush_sync(&trans);
+       ret = bch2_btree_write_buffer_flush_sync(trans);
        if (ret)
                goto err;
 
@@ -1086,26 +1104,26 @@ int bch2_delete_dead_inodes(struct bch_fs *c)
         * but we can't retry because the btree write buffer won't have been
         * flushed and we'd spin:
         */
-       for_each_btree_key(&trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
+       for_each_btree_key(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
                           BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
-               ret = lockrestart_do(&trans, may_delete_deleted_inode(&trans, k.k->p));
+               ret = lockrestart_do(trans, may_delete_deleted_inode(trans, k.k->p));
                if (ret < 0)
                        break;
 
                if (ret) {
                        if (!test_bit(BCH_FS_RW, &c->flags)) {
-                               bch2_trans_unlock(&trans);
+                               bch2_trans_unlock(trans);
                                bch2_fs_lazy_rw(c);
                        }
 
-                       ret = bch2_inode_rm_snapshot(&trans, k.k->p.offset, k.k->p.snapshot);
+                       ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot);
                        if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                break;
                }
        }
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
 err:
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        return ret;
 }
index 22b24405649f0200cc3785bb0e4b431dc02ea72a..a7464e1b696046a074f10f2a72cc38163718f041 100644 (file)
@@ -118,6 +118,9 @@ int bch2_inode_create(struct btree_trans *, struct btree_iter *,
 
 int bch2_inode_rm(struct bch_fs *, subvol_inum);
 
+int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *,
+                                 subvol_inum,
+                                 struct bch_inode_unpacked *);
 int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum,
                                  struct bch_inode_unpacked *);
 int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum,
diff --git a/libbcachefs/io.h b/libbcachefs/io.h
deleted file mode 100644 (file)
index 831e3f1..0000000
+++ /dev/null
@@ -1,202 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_IO_H
-#define _BCACHEFS_IO_H
-
-#include "checksum.h"
-#include "bkey_buf.h"
-#include "io_types.h"
-
-#define to_wbio(_bio)                  \
-       container_of((_bio), struct bch_write_bio, bio)
-
-#define to_rbio(_bio)                  \
-       container_of((_bio), struct bch_read_bio, bio)
-
-void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
-void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
-
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-void bch2_latency_acct(struct bch_dev *, u64, int);
-#else
-static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
-#endif
-
-void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
-                              enum bch_data_type, const struct bkey_i *, bool);
-
-#define BLK_STS_REMOVED                ((__force blk_status_t)128)
-
-const char *bch2_blk_status_to_str(blk_status_t);
-
-#define BCH_WRITE_FLAGS()              \
-       x(ALLOC_NOWAIT)                 \
-       x(CACHED)                       \
-       x(DATA_ENCODED)                 \
-       x(PAGES_STABLE)                 \
-       x(PAGES_OWNED)                  \
-       x(ONLY_SPECIFIED_DEVS)          \
-       x(WROTE_DATA_INLINE)            \
-       x(FROM_INTERNAL)                \
-       x(CHECK_ENOSPC)                 \
-       x(SYNC)                         \
-       x(MOVE)                         \
-       x(IN_WORKER)                    \
-       x(DONE)                         \
-       x(IO_ERROR)                     \
-       x(CONVERT_UNWRITTEN)
-
-enum __bch_write_flags {
-#define x(f)   __BCH_WRITE_##f,
-       BCH_WRITE_FLAGS()
-#undef x
-};
-
-enum bch_write_flags {
-#define x(f)   BCH_WRITE_##f = BIT(__BCH_WRITE_##f),
-       BCH_WRITE_FLAGS()
-#undef x
-};
-
-static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
-{
-       return op->watermark == BCH_WATERMARK_copygc
-               ? op->c->copygc_wq
-               : op->c->btree_update_wq;
-}
-
-int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *,
-                              struct bkey_i *, bool *, s64 *, s64 *);
-int bch2_extent_update(struct btree_trans *, subvol_inum,
-                      struct btree_iter *, struct bkey_i *,
-                      struct disk_reservation *, u64, s64 *, bool);
-int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *,
-                         unsigned, struct bch_io_opts, s64 *,
-                         struct write_point_specifier);
-
-int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
-                  subvol_inum, u64, s64 *);
-int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *);
-
-static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
-                                     struct bch_io_opts opts)
-{
-       op->c                   = c;
-       op->end_io              = NULL;
-       op->flags               = 0;
-       op->written             = 0;
-       op->error               = 0;
-       op->csum_type           = bch2_data_checksum_type(c, opts);
-       op->compression_opt     = opts.compression;
-       op->nr_replicas         = 0;
-       op->nr_replicas_required = c->opts.data_replicas_required;
-       op->watermark           = BCH_WATERMARK_normal;
-       op->incompressible      = 0;
-       op->open_buckets.nr     = 0;
-       op->devs_have.nr        = 0;
-       op->target              = 0;
-       op->opts                = opts;
-       op->subvol              = 0;
-       op->pos                 = POS_MAX;
-       op->version             = ZERO_VERSION;
-       op->write_point         = (struct write_point_specifier) { 0 };
-       op->res                 = (struct disk_reservation) { 0 };
-       op->new_i_size          = U64_MAX;
-       op->i_sectors_delta     = 0;
-       op->devs_need_flush     = NULL;
-}
-
-void bch2_write(struct closure *);
-
-void bch2_write_point_do_index_updates(struct work_struct *);
-
-static inline struct bch_write_bio *wbio_init(struct bio *bio)
-{
-       struct bch_write_bio *wbio = to_wbio(bio);
-
-       memset(&wbio->wbio, 0, sizeof(wbio->wbio));
-       return wbio;
-}
-
-void bch2_write_op_to_text(struct printbuf *, struct bch_write_op *);
-
-struct bch_devs_mask;
-struct cache_promote_op;
-struct extent_ptr_decoded;
-
-int __bch2_read_indirect_extent(struct btree_trans *, unsigned *,
-                               struct bkey_buf *);
-
-static inline int bch2_read_indirect_extent(struct btree_trans *trans,
-                                           enum btree_id *data_btree,
-                                           unsigned *offset_into_extent,
-                                           struct bkey_buf *k)
-{
-       if (k->k->k.type != KEY_TYPE_reflink_p)
-               return 0;
-
-       *data_btree = BTREE_ID_reflink;
-       return __bch2_read_indirect_extent(trans, offset_into_extent, k);
-}
-
-enum bch_read_flags {
-       BCH_READ_RETRY_IF_STALE         = 1 << 0,
-       BCH_READ_MAY_PROMOTE            = 1 << 1,
-       BCH_READ_USER_MAPPED            = 1 << 2,
-       BCH_READ_NODECODE               = 1 << 3,
-       BCH_READ_LAST_FRAGMENT          = 1 << 4,
-
-       /* internal: */
-       BCH_READ_MUST_BOUNCE            = 1 << 5,
-       BCH_READ_MUST_CLONE             = 1 << 6,
-       BCH_READ_IN_RETRY               = 1 << 7,
-};
-
-int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *,
-                      struct bvec_iter, struct bpos, enum btree_id,
-                      struct bkey_s_c, unsigned,
-                      struct bch_io_failures *, unsigned);
-
-static inline void bch2_read_extent(struct btree_trans *trans,
-                       struct bch_read_bio *rbio, struct bpos read_pos,
-                       enum btree_id data_btree, struct bkey_s_c k,
-                       unsigned offset_into_extent, unsigned flags)
-{
-       __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos,
-                          data_btree, k, offset_into_extent, NULL, flags);
-}
-
-void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
-                subvol_inum, struct bch_io_failures *, unsigned flags);
-
-static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
-                            subvol_inum inum)
-{
-       struct bch_io_failures failed = { .nr = 0 };
-
-       BUG_ON(rbio->_state);
-
-       rbio->c = c;
-       rbio->start_time = local_clock();
-       rbio->subvol = inum.subvol;
-
-       __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed,
-                   BCH_READ_RETRY_IF_STALE|
-                   BCH_READ_MAY_PROMOTE|
-                   BCH_READ_USER_MAPPED);
-}
-
-static inline struct bch_read_bio *rbio_init(struct bio *bio,
-                                            struct bch_io_opts opts)
-{
-       struct bch_read_bio *rbio = to_rbio(bio);
-
-       rbio->_state    = 0;
-       rbio->promote   = NULL;
-       rbio->opts      = opts;
-       return rbio;
-}
-
-void bch2_fs_io_exit(struct bch_fs *);
-int bch2_fs_io_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_IO_H */
diff --git a/libbcachefs/io_misc.c b/libbcachefs/io_misc.c
new file mode 100644 (file)
index 0000000..32432bd
--- /dev/null
@@ -0,0 +1,497 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * io_misc.c - fallocate, fpunch, truncate:
+ */
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "clock.h"
+#include "error.h"
+#include "extents.h"
+#include "extent_update.h"
+#include "inode.h"
+#include "io_misc.h"
+#include "io_write.h"
+#include "logged_ops.h"
+#include "subvolume.h"
+
+/* Overwrites whatever was present with zeroes: */
+int bch2_extent_fallocate(struct btree_trans *trans,
+                         subvol_inum inum,
+                         struct btree_iter *iter,
+                         unsigned sectors,
+                         struct bch_io_opts opts,
+                         s64 *i_sectors_delta,
+                         struct write_point_specifier write_point)
+{
+       struct bch_fs *c = trans->c;
+       struct disk_reservation disk_res = { 0 };
+       struct closure cl;
+       struct open_buckets open_buckets = { 0 };
+       struct bkey_s_c k;
+       struct bkey_buf old, new;
+       unsigned sectors_allocated = 0;
+       bool have_reservation = false;
+       bool unwritten = opts.nocow &&
+           c->sb.version >= bcachefs_metadata_version_unwritten_extents;
+       int ret;
+
+       bch2_bkey_buf_init(&old);
+       bch2_bkey_buf_init(&new);
+       closure_init_stack(&cl);
+
+       k = bch2_btree_iter_peek_slot(iter);
+       ret = bkey_err(k);
+       if (ret)
+               return ret;
+
+       sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset);
+
+       if (!have_reservation) {
+               unsigned new_replicas =
+                       max(0, (int) opts.data_replicas -
+                           (int) bch2_bkey_nr_ptrs_fully_allocated(k));
+               /*
+                * Get a disk reservation before (in the nocow case) calling
+                * into the allocator:
+                */
+               ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0);
+               if (unlikely(ret))
+                       goto err;
+
+               bch2_bkey_buf_reassemble(&old, c, k);
+       }
+
+       if (have_reservation) {
+               if (!bch2_extents_match(k, bkey_i_to_s_c(old.k)))
+                       goto err;
+
+               bch2_key_resize(&new.k->k, sectors);
+       } else if (!unwritten) {
+               struct bkey_i_reservation *reservation;
+
+               bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64));
+               reservation = bkey_reservation_init(new.k);
+               reservation->k.p = iter->pos;
+               bch2_key_resize(&reservation->k, sectors);
+               reservation->v.nr_replicas = opts.data_replicas;
+       } else {
+               struct bkey_i_extent *e;
+               struct bch_devs_list devs_have;
+               struct write_point *wp;
+               struct bch_extent_ptr *ptr;
+
+               devs_have.nr = 0;
+
+               bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX);
+
+               e = bkey_extent_init(new.k);
+               e->k.p = iter->pos;
+
+               ret = bch2_alloc_sectors_start_trans(trans,
+                               opts.foreground_target,
+                               false,
+                               write_point,
+                               &devs_have,
+                               opts.data_replicas,
+                               opts.data_replicas,
+                               BCH_WATERMARK_normal, 0, &cl, &wp);
+               if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
+                       ret = -BCH_ERR_transaction_restart_nested;
+               if (ret)
+                       goto err;
+
+               sectors = min(sectors, wp->sectors_free);
+               sectors_allocated = sectors;
+
+               bch2_key_resize(&e->k, sectors);
+
+               bch2_open_bucket_get(c, wp, &open_buckets);
+               bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
+               bch2_alloc_sectors_done(c, wp);
+
+               extent_for_each_ptr(extent_i_to_s(e), ptr)
+                       ptr->unwritten = true;
+       }
+
+       have_reservation = true;
+
+       ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res,
+                                0, i_sectors_delta, true);
+err:
+       if (!ret && sectors_allocated)
+               bch2_increment_clock(c, sectors_allocated, WRITE);
+
+       bch2_open_buckets_put(c, &open_buckets);
+       bch2_disk_reservation_put(c, &disk_res);
+       bch2_bkey_buf_exit(&new, c);
+       bch2_bkey_buf_exit(&old, c);
+
+       if (closure_nr_remaining(&cl) != 1) {
+               bch2_trans_unlock(trans);
+               closure_sync(&cl);
+       }
+
+       return ret;
+}
+
+/*
+ * Returns -BCH_ERR_transaction_restart if we had to drop locks:
+ */
+int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
+                  subvol_inum inum, u64 end,
+                  s64 *i_sectors_delta)
+{
+       struct bch_fs *c        = trans->c;
+       unsigned max_sectors    = KEY_SIZE_MAX & (~0 << c->block_bits);
+       struct bpos end_pos = POS(inum.inum, end);
+       struct bkey_s_c k;
+       int ret = 0, ret2 = 0;
+       u32 snapshot;
+
+       while (!ret ||
+              bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+               struct disk_reservation disk_res =
+                       bch2_disk_reservation_init(c, 0);
+               struct bkey_i delete;
+
+               if (ret)
+                       ret2 = ret;
+
+               bch2_trans_begin(trans);
+
+               ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+               if (ret)
+                       continue;
+
+               bch2_btree_iter_set_snapshot(iter, snapshot);
+
+               /*
+                * peek_upto() doesn't have ideal semantics for extents:
+                */
+               k = bch2_btree_iter_peek_upto(iter, end_pos);
+               if (!k.k)
+                       break;
+
+               ret = bkey_err(k);
+               if (ret)
+                       continue;
+
+               bkey_init(&delete.k);
+               delete.k.p = iter->pos;
+
+               /* create the biggest key we can */
+               bch2_key_resize(&delete.k, max_sectors);
+               bch2_cut_back(end_pos, &delete);
+
+               ret = bch2_extent_update(trans, inum, iter, &delete,
+                               &disk_res, 0, i_sectors_delta, false);
+               bch2_disk_reservation_put(c, &disk_res);
+       }
+
+       return ret ?: ret2;
+}
+
+int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,
+               s64 *i_sectors_delta)
+{
+       struct btree_trans *trans = bch2_trans_get(c);
+       struct btree_iter iter;
+       int ret;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+                            POS(inum.inum, start),
+                            BTREE_ITER_INTENT);
+
+       ret = bch2_fpunch_at(trans, &iter, inum, end, i_sectors_delta);
+
+       bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
+
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+               ret = 0;
+
+       return ret;
+}
+
+/* truncate: */
+
+void bch2_logged_op_truncate_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+       struct bkey_s_c_logged_op_truncate op = bkey_s_c_to_logged_op_truncate(k);
+
+       prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol));
+       prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum));
+       prt_printf(out, " new_i_size=%llu", le64_to_cpu(op.v->new_i_size));
+}
+
+static int truncate_set_isize(struct btree_trans *trans,
+                             subvol_inum inum,
+                             u64 new_i_size)
+{
+       struct btree_iter iter = { NULL };
+       struct bch_inode_unpacked inode_u;
+       int ret;
+
+       ret   = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT) ?:
+               (inode_u.bi_size = new_i_size, 0) ?:
+               bch2_inode_write(trans, &iter, &inode_u);
+
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+static int __bch2_resume_logged_op_truncate(struct btree_trans *trans,
+                                           struct bkey_i *op_k,
+                                           u64 *i_sectors_delta)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter fpunch_iter;
+       struct bkey_i_logged_op_truncate *op = bkey_i_to_logged_op_truncate(op_k);
+       subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
+       u64 new_i_size = le64_to_cpu(op->v.new_i_size);
+       int ret;
+
+       ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+                       truncate_set_isize(trans, inum, new_i_size));
+       if (ret)
+               goto err;
+
+       bch2_trans_iter_init(trans, &fpunch_iter, BTREE_ID_extents,
+                            POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9),
+                            BTREE_ITER_INTENT);
+       ret = bch2_fpunch_at(trans, &fpunch_iter, inum, U64_MAX, i_sectors_delta);
+       bch2_trans_iter_exit(trans, &fpunch_iter);
+
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+               ret = 0;
+err:
+       bch2_logged_op_finish(trans, op_k);
+       return ret;
+}
+
+int bch2_resume_logged_op_truncate(struct btree_trans *trans, struct bkey_i *op_k)
+{
+       return __bch2_resume_logged_op_truncate(trans, op_k, NULL);
+}
+
+int bch2_truncate(struct bch_fs *c, subvol_inum inum, u64 new_i_size, u64 *i_sectors_delta)
+{
+       struct bkey_i_logged_op_truncate op;
+
+       bkey_logged_op_truncate_init(&op.k_i);
+       op.v.subvol     = cpu_to_le32(inum.subvol);
+       op.v.inum       = cpu_to_le64(inum.inum);
+       op.v.new_i_size = cpu_to_le64(new_i_size);
+
+       return bch2_trans_run(c,
+               bch2_logged_op_start(trans, &op.k_i) ?:
+               __bch2_resume_logged_op_truncate(trans, &op.k_i, i_sectors_delta));
+}
+
+/* finsert/fcollapse: */
+
+void bch2_logged_op_finsert_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+       struct bkey_s_c_logged_op_finsert op = bkey_s_c_to_logged_op_finsert(k);
+
+       prt_printf(out, "subvol=%u",            le32_to_cpu(op.v->subvol));
+       prt_printf(out, " inum=%llu",           le64_to_cpu(op.v->inum));
+       prt_printf(out, " dst_offset=%lli",     le64_to_cpu(op.v->dst_offset));
+       prt_printf(out, " src_offset=%llu",     le64_to_cpu(op.v->src_offset));
+}
+
+static int adjust_i_size(struct btree_trans *trans, subvol_inum inum, u64 offset, s64 len)
+{
+       struct btree_iter iter;
+       struct bch_inode_unpacked inode_u;
+       int ret;
+
+       offset  <<= 9;
+       len     <<= 9;
+
+       ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT);
+       if (ret)
+               return ret;
+
+       if (len > 0) {
+               if (MAX_LFS_FILESIZE - inode_u.bi_size < len) {
+                       ret = -EFBIG;
+                       goto err;
+               }
+
+               if (offset >= inode_u.bi_size) {
+                       ret = -EINVAL;
+                       goto err;
+               }
+       }
+
+       inode_u.bi_size += len;
+       inode_u.bi_mtime = inode_u.bi_ctime = bch2_current_time(trans->c);
+
+       ret = bch2_inode_write(trans, &iter, &inode_u);
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+/*
+ * Run, or resume, a logged finsert/fcollapse operation.
+ *
+ * @op_k carries the persistent state of the operation: src_offset and
+ * dst_offset give the shift (positive shift == insert, otherwise collapse),
+ * state selects the phase to (re)start from, and pos records how far the
+ * extent shifting phase has got.
+ *
+ * @i_sectors_delta may be NULL; it is only handed on to bch2_fpunch_at(),
+ * which takes an s64 *, so it is signed here as well.
+ *
+ * On every exit path the logged op is finished and the iterator released.
+ * Returns 0 or the first non-restart error encountered.
+ */
+static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
+                                          struct bkey_i *op_k,
+                                          s64 *i_sectors_delta)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter;
+       struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k);
+       subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
+       u64 dst_offset = le64_to_cpu(op->v.dst_offset);
+       u64 src_offset = le64_to_cpu(op->v.src_offset);
+       s64 shift = dst_offset - src_offset;
+       u64 len = abs(shift);
+       u64 pos = le64_to_cpu(op->v.pos);
+       bool insert = shift > 0;
+       int ret = 0;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+                            POS(inum.inum, 0),
+                            BTREE_ITER_INTENT);
+
+       switch (op->v.state) {
+case LOGGED_OP_FINSERT_start:
+       op->v.state = LOGGED_OP_FINSERT_shift_extents;
+
+       if (insert) {
+               /* Inserting: grow i_size first, together with the state change */
+               ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+                               adjust_i_size(trans, inum, src_offset, len) ?:
+                               bch2_logged_op_update(trans, &op->k_i));
+               if (ret)
+                       goto err;
+       } else {
+               /* Collapsing: punch out the range that is going away */
+               bch2_btree_iter_set_pos(&iter, POS(inum.inum, src_offset));
+
+               ret = bch2_fpunch_at(trans, &iter, inum, src_offset + len, i_sectors_delta);
+               if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+                       goto err;
+
+               ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+                               bch2_logged_op_update(trans, &op->k_i));
+               /*
+                * Don't start shifting extents if recording the new state
+                * failed - ret would otherwise be silently overwritten below:
+                */
+               if (ret)
+                       goto err;
+       }
+
+       fallthrough;
+case LOGGED_OP_FINSERT_shift_extents:
+       while (1) {
+               struct disk_reservation disk_res =
+                       bch2_disk_reservation_init(c, 0);
+               struct bkey_i delete, *copy;
+               struct bkey_s_c k;
+               struct bpos src_pos = POS(inum.inum, src_offset);
+               u32 snapshot;
+
+               bch2_trans_begin(trans);
+
+               ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+               if (ret)
+                       goto btree_err;
+
+               bch2_btree_iter_set_snapshot(&iter, snapshot);
+               bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot));
+
+               /* Inserts walk backwards from the end, collapses walk forwards: */
+               k = insert
+                       ? bch2_btree_iter_peek_prev(&iter)
+                       : bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX));
+               if ((ret = bkey_err(k)))
+                       goto btree_err;
+
+               /* No more extents of this inode past src_offset: done shifting */
+               if (!k.k ||
+                   k.k->p.inode != inum.inum ||
+                   bkey_le(k.k->p, POS(inum.inum, src_offset)))
+                       break;
+
+               copy = bch2_bkey_make_mut_noupdate(trans, k);
+               if ((ret = PTR_ERR_OR_ZERO(copy)))
+                       goto btree_err;
+
+               if (insert &&
+                   bkey_lt(bkey_start_pos(k.k), src_pos)) {
+                       bch2_cut_front(src_pos, copy);
+
+                       /* Splitting compressed extent? */
+                       bch2_disk_reservation_add(c, &disk_res,
+                                       copy->k.size *
+                                       bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy)),
+                                       BCH_DISK_RESERVATION_NOFAIL);
+               }
+
+               /* Delete the extent at its old position... */
+               bkey_init(&delete.k);
+               delete.k.p = copy->k.p;
+               delete.k.p.snapshot = snapshot;
+               delete.k.size = copy->k.size;
+
+               /* ...and reinsert it, moved by shift: */
+               copy->k.p.offset += shift;
+               copy->k.p.snapshot = snapshot;
+
+               op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset);
+
+               ret =   bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
+                       bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
+                       bch2_logged_op_update(trans, &op->k_i) ?:
+                       bch2_trans_commit(trans, &disk_res, NULL, BTREE_INSERT_NOFAIL);
+btree_err:
+               bch2_disk_reservation_put(c, &disk_res);
+
+               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+                       continue;
+               if (ret)
+                       goto err;
+
+               /* Only advance once the new position has been committed: */
+               pos = le64_to_cpu(op->v.pos);
+       }
+
+       op->v.state = LOGGED_OP_FINSERT_finish;
+
+       if (!insert) {
+               ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+                               adjust_i_size(trans, inum, src_offset, shift) ?:
+                               bch2_logged_op_update(trans, &op->k_i));
+       } else {
+               /* We need an inode update to update bi_journal_seq for fsync: */
+               ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+                               adjust_i_size(trans, inum, 0, 0) ?:
+                               bch2_logged_op_update(trans, &op->k_i));
+       }
+
+       break;
+case LOGGED_OP_FINSERT_finish:
+       break;
+       }
+err:
+       bch2_logged_op_finish(trans, op_k);
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+/*
+ * Resume a logged finsert/fcollapse operation from the state recorded in
+ * @op_k. No i_sectors_delta is collected on this path (NULL is passed).
+ */
+int bch2_resume_logged_op_finsert(struct btree_trans *trans, struct bkey_i *op_k)
+{
+       return __bch2_resume_logged_op_finsert(trans, op_k, NULL);
+}
+
+/*
+ * Start a new logged finsert (@insert true) or fcollapse (@insert false) of
+ * @len at @offset in @inum and run it.
+ *
+ * The operation is recorded with src_offset = @offset and
+ * dst_offset = @offset +/- @len; @i_sectors_delta is passed through to the
+ * worker. Returns 0 or an error from starting/running the logged op.
+ */
+int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum,
+                          u64 offset, u64 len, bool insert,
+                          s64 *i_sectors_delta)
+{
+       struct bkey_i_logged_op_finsert op;
+       u64 start_pos;
+       s64 shift;
+
+       if (insert) {
+               shift           = len;
+               /* the insert path starts from the highest possible offset: */
+               start_pos       = U64_MAX;
+       } else {
+               shift           = -len;
+               start_pos       = offset;
+       }
+
+       bkey_logged_op_finsert_init(&op.k_i);
+       op.v.subvol     = cpu_to_le32(inum.subvol);
+       op.v.inum       = cpu_to_le64(inum.inum);
+       op.v.dst_offset = cpu_to_le64(offset + shift);
+       op.v.src_offset = cpu_to_le64(offset);
+       op.v.pos        = cpu_to_le64(start_pos);
+
+       return bch2_trans_run(c,
+               bch2_logged_op_start(trans, &op.k_i) ?:
+               __bch2_resume_logged_op_finsert(trans, &op.k_i, i_sectors_delta));
+}
diff --git a/libbcachefs/io_misc.h b/libbcachefs/io_misc.h
new file mode 100644 (file)
index 0000000..c9e6ed4
--- /dev/null
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IO_MISC_H
+#define _BCACHEFS_IO_MISC_H
+
+/* fallocate / hole punching helpers: */
+int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *,
+                         unsigned, struct bch_io_opts, s64 *,
+                         struct write_point_specifier);
+int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
+                  subvol_inum, u64, s64 *);
+int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *);
+
+/* Logged truncate: key methods, resume entry point, and the operation itself */
+void bch2_logged_op_truncate_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_logged_op_truncate ((struct bkey_ops) {  \
+       .val_to_text    = bch2_logged_op_truncate_to_text,      \
+       .min_val_size   = 24,                                   \
+})
+
+int bch2_resume_logged_op_truncate(struct btree_trans *, struct bkey_i *);
+
+int bch2_truncate(struct bch_fs *, subvol_inum, u64, u64 *);
+
+/* Logged finsert/fcollapse: key methods, resume entry point, and the operation itself */
+void bch2_logged_op_finsert_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_logged_op_finsert ((struct bkey_ops) {   \
+       .val_to_text    = bch2_logged_op_finsert_to_text,       \
+       .min_val_size   = 24,                                   \
+})
+
+int bch2_resume_logged_op_finsert(struct btree_trans *, struct bkey_i *);
+
+/* Arguments: fs, inode, offset, len, insert (false == collapse), i_sectors_delta */
+int bch2_fcollapse_finsert(struct bch_fs *, subvol_inum, u64, u64, bool, s64 *);
+
+#endif /* _BCACHEFS_IO_MISC_H */
diff --git a/libbcachefs/io_read.c b/libbcachefs/io_read.c
new file mode 100644 (file)
index 0000000..443c3ea
--- /dev/null
@@ -0,0 +1,1210 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Some low level IO code, and hacks for various block layer limitations
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "clock.h"
+#include "compress.h"
+#include "data_update.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "error.h"
+#include "io_read.h"
+#include "io_misc.h"
+#include "io_write.h"
+#include "subvolume.h"
+#include "trace.h"
+
+#include <linux/sched/mm.h>
+
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+
+/*
+ * Decide whether reads/writes to @target should be treated as congested.
+ *
+ * For every device in the target's mask (falling back to the rw user data
+ * devices) the congested counter is read, reduced according to the time
+ * elapsed since congested_last, and the positive values are summed. The sum
+ * is then compared against bch2_rand_range(nr * CONGESTED_MAX).
+ *
+ * A @target of 0 is never congested.
+ */
+static bool bch2_target_congested(struct bch_fs *c, u16 target)
+{
+       const struct bch_devs_mask *devs;
+       unsigned d, nr = 0, total = 0;
+       u64 now = local_clock();
+
+       if (!target)
+               return false;
+
+       rcu_read_lock();
+       devs = bch2_target_to_mask(c, target) ?:
+               &c->rw_devs[BCH_DATA_user];
+
+       for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
+               struct bch_dev *ca = rcu_dereference(c->devs[d]);
+               s64 congested;
+               u64 last;
+
+               if (!ca)
+                       continue;
+
+               congested = atomic_read(&ca->congested);
+               last = READ_ONCE(ca->congested_last);
+               if (time_after64(now, last))
+                       congested -= (now - last) >> 12;
+
+               nr++;
+               /* a device that has decayed below zero contributes nothing: */
+               if (congested > 0)
+                       total += congested;
+       }
+       rcu_read_unlock();
+
+       return total > bch2_rand_range(nr * CONGESTED_MAX);
+}
+
+#else
+
+/* CONFIG_BCACHEFS_NO_LATENCY_ACCT is set: targets are never reported congested */
+static bool bch2_target_congested(struct bch_fs *c, u16 target)
+{
+       return false;
+}
+
+#endif
+
+/* Cache promotion on read */
+
+/*
+ * State for one in-flight promotion. Allocated in __promote_alloc() together
+ * with the trailing bio_vec array, tracked in c->promote_table keyed by pos,
+ * and released in promote_free().
+ */
+struct promote_op {
+       struct rcu_head         rcu;            /* for kfree_rcu() in promote_free() */
+       u64                     start_time;     /* local_clock() at allocation, for time stats */
+
+       struct rhash_head       hash;           /* linkage in c->promote_table */
+       struct bpos             pos;            /* hash table key */
+
+       struct data_update      write;
+       struct bio_vec          bi_inline_vecs[0]; /* must be last */
+};
+
+/* c->promote_table: promote_ops hashed on their struct bpos */
+static const struct rhashtable_params bch_promote_params = {
+       .head_offset    = offsetof(struct promote_op, hash),
+       .key_offset     = offsetof(struct promote_op, pos),
+       .key_len        = sizeof(struct bpos),
+};
+
+/*
+ * Check whether the extent @k, read at @pos, should be promoted to
+ * opts.promote_target.
+ *
+ * Returns 0 if so, otherwise a negative BCH_ERR_nopromote_* code naming the
+ * first reason that applied. Checks run in order: promotion not allowed by
+ * @flags, extent already on the target, extent unwritten, target congested,
+ * promotion of @pos already in flight.
+ */
+static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
+                                 struct bpos pos,
+                                 struct bch_io_opts opts,
+                                 unsigned flags)
+{
+       int ret = 0;
+
+       BUG_ON(!opts.promote_target);
+
+       if (!(flags & BCH_READ_MAY_PROMOTE))
+               ret = -BCH_ERR_nopromote_may_not;
+       else if (bch2_bkey_has_target(c, k, opts.promote_target))
+               ret = -BCH_ERR_nopromote_already_promoted;
+       else if (bkey_extent_is_unwritten(k))
+               ret = -BCH_ERR_nopromote_unwritten;
+       else if (bch2_target_congested(c, opts.promote_target))
+               ret = -BCH_ERR_nopromote_congested;
+       else if (rhashtable_lookup_fast(&c->promote_table, &pos,
+                                       bch_promote_params))
+               ret = -BCH_ERR_nopromote_in_flight;
+
+       return ret;
+}
+
+/*
+ * Tear down a promote_op: exit the data update, remove the op from
+ * c->promote_table (it must be present), drop the promote write ref and free
+ * the op via RCU.
+ */
+static void promote_free(struct bch_fs *c, struct promote_op *op)
+{
+       int removed;
+
+       bch2_data_update_exit(&op->write);
+
+       removed = rhashtable_remove_fast(&c->promote_table, &op->hash,
+                                        bch_promote_params);
+       BUG_ON(removed);
+
+       bch2_write_ref_put(c, BCH_WRITE_REF_promote);
+       kfree_rcu(op, rcu);
+}
+
+/*
+ * end_io callback of the promote write: account the time the promote took
+ * and free the promote_op that embeds @wop.
+ */
+static void promote_done(struct bch_write_op *wop)
+{
+       struct promote_op *promote =
+               container_of(wop, struct promote_op, write.op);
+       /* wop is promote->write.op, so this is the same fs pointer: */
+       struct bch_fs *c = wop->c;
+
+       bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
+                              promote->start_time);
+       promote_free(c, promote);
+}
+
+/*
+ * Kick off the promote write once the read has completed: the bounce pages
+ * of @rbio are handed over to the write bio (the bio_vecs are copied and the
+ * vec counts swapped), then the data update is started with the crc of the
+ * pointer that was read.
+ */
+static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
+{
+       struct bio *src = &rbio->bio;
+       struct bio *dst = &op->write.op.wbio.bio;
+
+       trace_and_count(op->write.op.c, read_promote, src);
+
+       /* we now own pages: */
+       BUG_ON(!rbio->bounce);
+       BUG_ON(src->bi_vcnt > dst->bi_max_vecs);
+
+       memcpy(dst->bi_io_vec, src->bi_io_vec,
+              sizeof(struct bio_vec) * src->bi_vcnt);
+       swap(dst->bi_vcnt, src->bi_vcnt);
+
+       bch2_data_update_read_done(&op->write, rbio->pick.crc);
+}
+
+/*
+ * Allocate and set up everything needed to promote @sectors of extent @k:
+ *
+ *  - takes a BCH_WRITE_REF_promote reference
+ *  - allocates the promote_op and a kmalloc'd bounce read bio (returned via
+ *    @rbio) with pages for @sectors
+ *  - inserts the op into c->promote_table, keyed by @pos
+ *  - initializes the data update that will write to opts.promote_target
+ *
+ * Returns the new op, or NULL on any failure; in that case everything
+ * acquired so far is released again and *@rbio is set to NULL.
+ *
+ * NOTE(review): the error path reads *@rbio even when the failure happened
+ * before this function assigned it, so callers presumably must pass in a
+ * pointer that is initialized to NULL - confirm against callers.
+ */
+static struct promote_op *__promote_alloc(struct btree_trans *trans,
+                                         enum btree_id btree_id,
+                                         struct bkey_s_c k,
+                                         struct bpos pos,
+                                         struct extent_ptr_decoded *pick,
+                                         struct bch_io_opts opts,
+                                         unsigned sectors,
+                                         struct bch_read_bio **rbio)
+{
+       struct bch_fs *c = trans->c;
+       struct promote_op *op = NULL;
+       struct bio *bio;
+       unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
+       int ret;
+
+       if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
+               return NULL;
+
+       /* op is followed by the bio_vecs for the write bio (bi_inline_vecs): */
+       op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOFS);
+       if (!op)
+               goto err;
+
+       op->start_time = local_clock();
+       op->pos = pos;
+
+       /*
+        * We don't use the mempool here because extents that aren't
+        * checksummed or compressed can be too big for the mempool:
+        */
+       *rbio = kzalloc(sizeof(struct bch_read_bio) +
+                       sizeof(struct bio_vec) * pages,
+                       GFP_NOFS);
+       if (!*rbio)
+               goto err;
+
+       rbio_init(&(*rbio)->bio, opts);
+       bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);
+
+       if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9,
+                                GFP_NOFS))
+               goto err;
+
+       (*rbio)->bounce         = true;
+       (*rbio)->split          = true;
+       (*rbio)->kmalloc        = true;
+
+       /* Fails if another promote of the same pos is already in the table: */
+       if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
+                                         bch_promote_params))
+               goto err;
+
+       bio = &op->write.op.wbio.bio;
+       bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);
+
+       ret = bch2_data_update_init(trans, NULL, &op->write,
+                       writepoint_hashed((unsigned long) current),
+                       opts,
+                       (struct data_update_opts) {
+                               .target         = opts.promote_target,
+                               .extra_replicas = 1,
+                               .write_flags    = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED,
+                       },
+                       btree_id, k);
+       /*
+        * possible errors: -BCH_ERR_nocow_lock_blocked,
+        * -BCH_ERR_ENOSPC_disk_reservation:
+        */
+       if (ret) {
+               /* undo the hash table insert before the common cleanup: */
+               ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
+                                       bch_promote_params);
+               BUG_ON(ret);
+               goto err;
+       }
+
+       op->write.op.end_io = promote_done;
+
+       return op;
+err:
+       if (*rbio)
+               bio_free_pages(&(*rbio)->bio);
+       kfree(*rbio);
+       *rbio = NULL;
+       kfree(op);
+       bch2_write_ref_put(c, BCH_WRITE_REF_promote);
+       return NULL;
+}
+
+/*
+ * Decide whether the read of @k should also promote the data, and if so set
+ * up the promote.
+ *
+ * Either the whole extent is promoted (when *@read_full is already set or
+ * c->promote_whole_extents is enabled) or just the range covered by @iter.
+ *
+ * On success returns the promote_op, hands the bounce read bio back in
+ * @rbio, sets *@bounce and updates *@read_full. Otherwise the reason is
+ * traced and NULL is returned.
+ */
+noinline
+static struct promote_op *promote_alloc(struct btree_trans *trans,
+                                       struct bvec_iter iter,
+                                       struct bkey_s_c k,
+                                       struct extent_ptr_decoded *pick,
+                                       struct bch_io_opts opts,
+                                       unsigned flags,
+                                       struct bch_read_bio **rbio,
+                                       bool *bounce,
+                                       bool *read_full)
+{
+       struct bch_fs *c = trans->c;
+       bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
+       struct promote_op *promote;
+       enum btree_id btree_id;
+       struct bpos pos;
+       unsigned sectors;
+       int ret;
+
+       if (promote_full) {
+               /* data might have to be decompressed in the write path: */
+               sectors = max(pick->crc.compressed_size, pick->crc.live_size);
+               pos     = bkey_start_pos(k.k);
+       } else {
+               sectors = bvec_iter_sectors(iter);
+               pos     = POS(k.k->p.inode, iter.bi_sector);
+       }
+
+       ret = should_promote(c, k, pos, opts, flags);
+       if (ret)
+               goto nopromote;
+
+       btree_id = k.k->type == KEY_TYPE_reflink_v
+               ? BTREE_ID_reflink
+               : BTREE_ID_extents;
+
+       promote = __promote_alloc(trans, btree_id, k, pos, pick, opts, sectors, rbio);
+       if (!promote) {
+               ret = -BCH_ERR_nopromote_enomem;
+               goto nopromote;
+       }
+
+       *bounce         = true;
+       *read_full      = promote_full;
+       return promote;
+nopromote:
+       trace_read_nopromote(c, ret);
+       return NULL;
+}
+
+/* Read */
+
+/*
+ * Values for rbio->retry / bch2_rbio_error():
+ *  READ_RETRY_AVOID - retry, marking the pointer we read from as failed
+ *  READ_RETRY       - retry
+ *  READ_ERR         - don't retry, complete the read with an error
+ */
+#define READ_RETRY_AVOID       1
+#define READ_RETRY             2
+#define READ_ERR               3
+
+/*
+ * Ordered contexts used by bch2_rbio_punt(): work is run directly if the
+ * rbio's current context is at least the requested one, otherwise it is
+ * queued on a workqueue.
+ */
+enum rbio_context {
+       RBIO_CONTEXT_NULL,
+       RBIO_CONTEXT_HIGHPRI,
+       RBIO_CONTEXT_UNBOUND,
+};
+
+/* Returns the parent of a split rbio, or @rbio itself if it isn't a split */
+static inline struct bch_read_bio *
+bch2_rbio_parent(struct bch_read_bio *rbio)
+{
+       if (rbio->split)
+               return rbio->parent;
+
+       return rbio;
+}
+
+/*
+ * Run @fn on @rbio->work in at least the given @context: if the rbio is
+ * already in that context (or a higher one) @fn is called directly, otherwise
+ * the rbio's context is raised and the work is queued on @wq.
+ */
+__always_inline
+static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
+                          enum rbio_context context,
+                          struct workqueue_struct *wq)
+{
+       if (context > rbio->context) {
+               rbio->work.func         = fn;
+               rbio->context           = context;
+               queue_work(wq, &rbio->work);
+               return;
+       }
+
+       fn(&rbio->work);
+}
+
+/*
+ * Release the resources attached to @rbio: a pending promote, bounce pages,
+ * and - for a split - the rbio itself (kfree() or bio_put(), depending on how
+ * it was allocated).
+ *
+ * Returns the rbio the caller should continue with: the parent if @rbio was a
+ * split, otherwise @rbio.
+ */
+static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
+{
+       struct bch_read_bio *parent;
+
+       BUG_ON(rbio->bounce && !rbio->split);
+
+       if (rbio->promote)
+               promote_free(rbio->c, rbio->promote);
+       rbio->promote = NULL;
+
+       if (rbio->bounce)
+               bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
+
+       if (!rbio->split)
+               return rbio;
+
+       /* grab the parent before the split goes away: */
+       parent = rbio->parent;
+
+       if (rbio->kmalloc)
+               kfree(rbio);
+       else
+               bio_put(&rbio->bio);
+
+       return parent;
+}
+
+/*
+ * Complete an entire read request. Only to be called on a top level
+ * bch_read_bio, never on a split. Read latency is accounted when a start time
+ * was recorded.
+ */
+static void bch2_rbio_done(struct bch_read_bio *rbio)
+{
+       struct bch_fs *c = rbio->c;
+
+       if (rbio->start_time)
+               bch2_time_stats_update(&c->times[BCH_TIME_data_read],
+                                      rbio->start_time);
+
+       bio_endio(&rbio->bio);
+}
+
+/*
+ * Retry a BCH_READ_NODECODE read.
+ *
+ * Looks up the key at rbio->read_pos again; if it no longer matches the
+ * pointer we originally read from, the read completes as a hole. Otherwise
+ * the extent is re-read via __bch2_read_extent(), looping for as long as that
+ * returns READ_RETRY. Any other failure completes the read with
+ * BLK_STS_IOERR.
+ *
+ * The rbio is always completed before returning.
+ */
+static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
+                                    struct bvec_iter bvec_iter,
+                                    struct bch_io_failures *failed,
+                                    unsigned flags)
+{
+       struct btree_trans *trans = bch2_trans_get(c);
+       struct btree_iter iter;
+       struct bkey_buf sk;
+       struct bkey_s_c k;
+       int ret;
+
+       flags &= ~BCH_READ_LAST_FRAGMENT;
+       flags |= BCH_READ_MUST_CLONE;
+
+       bch2_bkey_buf_init(&sk);
+
+       bch2_trans_iter_init(trans, &iter, rbio->data_btree,
+                            rbio->read_pos, BTREE_ITER_SLOTS);
+retry:
+       rbio->bio.bi_status = 0;
+
+       k = bch2_btree_iter_peek_slot(&iter);
+       if (bkey_err(k))
+               goto err;
+
+       /* Copy the key so it can still be used after dropping btree locks: */
+       bch2_bkey_buf_reassemble(&sk, c, k);
+       k = bkey_i_to_s_c(sk.k);
+       bch2_trans_unlock(trans);
+
+       if (!bch2_bkey_matches_ptr(c, k,
+                                  rbio->pick.ptr,
+                                  rbio->data_pos.offset -
+                                  rbio->pick.crc.offset)) {
+               /* extent we wanted to read no longer exists: */
+               rbio->hole = true;
+               goto out;
+       }
+
+       ret = __bch2_read_extent(trans, rbio, bvec_iter,
+                                rbio->read_pos,
+                                rbio->data_btree,
+                                k, 0, failed, flags);
+       if (ret == READ_RETRY)
+               goto retry;
+       if (ret)
+               goto err;
+out:
+       bch2_rbio_done(rbio);
+       bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
+       bch2_bkey_buf_exit(&sk, c);
+       return;
+err:
+       rbio->bio.bi_status = BLK_STS_IOERR;
+       goto out;
+}
+
+/*
+ * Work function that retries a failed read.
+ *
+ * Everything needed from the failed rbio (iterator, flags, inode, the pointer
+ * to avoid) is captured first, because bch2_rbio_free() may free it and hand
+ * back the parent. The retry never promotes and is flagged
+ * BCH_READ_IN_RETRY.
+ */
+static void bch2_rbio_retry(struct work_struct *work)
+{
+       struct bch_read_bio *rbio =
+               container_of(work, struct bch_read_bio, work);
+       struct bch_fs *c        = rbio->c;
+       struct bvec_iter iter   = rbio->bvec_iter;
+       unsigned flags          = rbio->flags;
+       subvol_inum inum = {
+               .subvol = rbio->subvol,
+               .inum   = rbio->read_pos.inode,
+       };
+       struct bch_io_failures failed = { .nr = 0 };
+
+       trace_and_count(c, read_retry, &rbio->bio);
+
+       /* Record the pointer that failed so the retry can avoid it: */
+       if (rbio->retry == READ_RETRY_AVOID)
+               bch2_mark_io_failure(&failed, &rbio->pick);
+
+       rbio->bio.bi_status = 0;
+
+       /* From here on rbio is the parent, if the failed rbio was a split: */
+       rbio = bch2_rbio_free(rbio);
+
+       flags |= BCH_READ_IN_RETRY;
+       flags &= ~BCH_READ_MAY_PROMOTE;
+
+       if (flags & BCH_READ_NODECODE) {
+               bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
+       } else {
+               flags &= ~BCH_READ_LAST_FRAGMENT;
+               flags |= BCH_READ_MUST_CLONE;
+
+               __bch2_read(c, rbio, iter, inum, &failed, flags);
+       }
+}
+
+/*
+ * Record how a failed read should be handled (@retry is one of the READ_*
+ * values) and act on it.
+ *
+ * If the rbio is already being retried (BCH_READ_IN_RETRY) only rbio->retry
+ * is set. Otherwise READ_ERR completes the read with @error, and anything
+ * else schedules bch2_rbio_retry() on the unbound workqueue.
+ */
+static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
+                           blk_status_t error)
+{
+       rbio->retry = retry;
+
+       if (rbio->flags & BCH_READ_IN_RETRY)
+               return;
+
+       if (retry != READ_ERR) {
+               bch2_rbio_punt(rbio, bch2_rbio_retry,
+                              RBIO_CONTEXT_UNBOUND, system_unbound_wq);
+               return;
+       }
+
+       /* Hard error: free the split (if any) and fail the top level read */
+       rbio = bch2_rbio_free(rbio);
+
+       rbio->bio.bi_status = error;
+       bch2_rbio_done(rbio);
+}
+
+/*
+ * Transaction body for narrowing an extent's checksum after a successful
+ * read: computes a checksum covering only the part of the data the extent
+ * still references and updates the key with it.
+ *
+ * Does nothing (returns 0) if the extent is compressed, has changed since it
+ * was read, no longer lies within the range that was read, or if
+ * rechecksumming or narrowing does not succeed. Otherwise returns the result
+ * of the btree lookup, allocation or update.
+ */
+static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
+                                  struct bch_read_bio *rbio)
+{
+       struct bch_fs *c = rbio->c;
+       u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
+       struct bch_extent_crc_unpacked new_crc;
+       struct btree_iter iter;
+       struct bkey_i *new;
+       struct bkey_s_c k;
+       int ret = 0;
+
+       if (crc_is_compressed(rbio->pick.crc))
+               return 0;
+
+       k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
+                              BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+       if ((ret = bkey_err(k)))
+               goto out;
+
+       /* Is this still the extent, and the pointer, that we read from? */
+       if (bversion_cmp(k.k->version, rbio->version) ||
+           !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
+               goto out;
+
+       /* Extent was merged? */
+       if (bkey_start_offset(k.k) < data_offset ||
+           k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
+               goto out;
+
+       if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
+                       rbio->pick.crc, NULL, &new_crc,
+                       bkey_start_offset(k.k) - data_offset, k.k->size,
+                       rbio->pick.crc.csum_type)) {
+               bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
+               /* not treated as an error of the narrowing transaction: */
+               ret = 0;
+               goto out;
+       }
+
+       /*
+        * going to be temporarily appending another checksum entry:
+        */
+       new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
+                                sizeof(struct bch_extent_crc128));
+       if ((ret = PTR_ERR_OR_ZERO(new)))
+               goto out;
+
+       bkey_reassemble(new, k);
+
+       if (!bch2_bkey_narrow_crcs(new, new_crc))
+               goto out;
+
+       ret = bch2_trans_update(trans, &iter, new,
+                               BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+out:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+/*
+ * Run __bch2_rbio_narrow_crcs() in its own transaction. The result is
+ * ignored: failing to narrow the checksum does not fail the read.
+ */
+static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
+{
+       bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL,
+                     __bch2_rbio_narrow_crcs(trans, rbio));
+}
+
+/*
+ * Inner part of read completion, that may run in process context.
+ *
+ * Verifies the checksum of the data that was read, optionally narrows the
+ * extent's checksum, then - unless BCH_READ_NODECODE - decrypts and
+ * decompresses or copies the data into the destination bio and starts a
+ * pending promote. Finally the rbio is freed and the read completed, unless
+ * we're running inside a retry.
+ *
+ * Checksum errors are retried (avoiding the device that returned bad data);
+ * decompression and decryption errors fail the read.
+ */
+static void __bch2_read_endio(struct work_struct *work)
+{
+       struct bch_read_bio *rbio =
+               container_of(work, struct bch_read_bio, work);
+       struct bch_fs *c        = rbio->c;
+       struct bch_dev *ca      = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
+       struct bio *src         = &rbio->bio;
+       struct bio *dst         = &bch2_rbio_parent(rbio)->bio;
+       struct bvec_iter dst_iter = rbio->bvec_iter;
+       struct bch_extent_crc_unpacked crc = rbio->pick.crc;
+       struct nonce nonce = extent_nonce(rbio->version, crc);
+       unsigned nofs_flags;
+       struct bch_csum csum;
+       int ret;
+
+       nofs_flags = memalloc_nofs_save();
+
+       /* Reset iterator for checksumming and copying bounced data: */
+       if (rbio->bounce) {
+               src->bi_iter.bi_size            = crc.compressed_size << 9;
+               src->bi_iter.bi_idx             = 0;
+               src->bi_iter.bi_bvec_done       = 0;
+       } else {
+               src->bi_iter                    = rbio->bvec_iter;
+       }
+
+       csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
+       if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io)
+               goto csum_err;
+
+       /*
+        * XXX
+        * We need to rework the narrow_crcs path to deliver the read completion
+        * first, and then punt to a different workqueue, otherwise we're
+        * holding up reads while doing btree updates which is bad for memory
+        * reclaim.
+        */
+       if (unlikely(rbio->narrow_crcs))
+               bch2_rbio_narrow_crcs(rbio);
+
+       if (rbio->flags & BCH_READ_NODECODE)
+               goto nodecode;
+
+       /* Adjust crc to point to subset of data we want: */
+       crc.offset     += rbio->offset_into_extent;
+       crc.live_size   = bvec_iter_sectors(rbio->bvec_iter);
+
+       if (crc_is_compressed(crc)) {
+               /* compressed: decrypt everything, then decompress into dst */
+               ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+               if (ret)
+                       goto decrypt_err;
+
+               if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
+                   !c->opts.no_data_io)
+                       goto decompression_err;
+       } else {
+               /* don't need to decrypt the entire bio: */
+               nonce = nonce_add(nonce, crc.offset << 9);
+               bio_advance(src, crc.offset << 9);
+
+               BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
+               src->bi_iter.bi_size = dst_iter.bi_size;
+
+               ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+               if (ret)
+                       goto decrypt_err;
+
+               if (rbio->bounce) {
+                       struct bvec_iter src_iter = src->bi_iter;
+
+                       bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
+               }
+       }
+
+       if (rbio->promote) {
+               /*
+                * Re encrypt data we decrypted, so it's consistent with
+                * rbio->pick.crc:
+                */
+               ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+               if (ret)
+                       goto decrypt_err;
+
+               promote_start(rbio->promote, rbio);
+               rbio->promote = NULL;
+       }
+nodecode:
+       /* In a retry the rbio is not freed or completed here: */
+       if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
+               rbio = bch2_rbio_free(rbio);
+               bch2_rbio_done(rbio);
+       }
+out:
+       memalloc_nofs_restore(nofs_flags);
+       return;
+csum_err:
+       /*
+        * Checksum error: if the bio wasn't bounced, we may have been
+        * reading into buffers owned by userspace (that userspace can
+        * scribble over) - retry the read, bouncing it this time:
+        */
+       if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
+               rbio->flags |= BCH_READ_MUST_BOUNCE;
+               bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
+               goto out;
+       }
+
+       bch_err_inum_offset_ratelimited(ca,
+               rbio->read_pos.inode,
+               rbio->read_pos.offset << 9,
+               "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)",
+               rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
+               csum.hi, csum.lo, bch2_csum_types[crc.csum_type]);
+       bch2_io_error(ca);
+       bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+       goto out;
+decompression_err:
+       bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
+                                       rbio->read_pos.offset << 9,
+                                       "decompression error");
+       bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+       goto out;
+decrypt_err:
+       bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
+                                       rbio->read_pos.offset << 9,
+                                       "decrypt error");
+       bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+       goto out;
+}
+
+/*
+ * bio completion callback for reads.
+ *
+ * Accounts latency and drops the device io ref if one was taken, handles IO
+ * errors and stale pointers by retrying or failing the read, and otherwise
+ * hands the rest of the completion to __bch2_read_endio() - directly, or via
+ * a workqueue chosen by how much work the data needs (promote, narrowing,
+ * decompression and decryption go to the unbound workqueue, plain
+ * checksumming to the highpri one).
+ */
+static void bch2_read_endio(struct bio *bio)
+{
+       struct bch_read_bio *rbio =
+               container_of(bio, struct bch_read_bio, bio);
+       struct bch_fs *c        = rbio->c;
+       struct bch_dev *ca      = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
+       struct workqueue_struct *wq = NULL;
+       enum rbio_context context = RBIO_CONTEXT_NULL;
+
+       if (rbio->have_ioref) {
+               bch2_latency_acct(ca, rbio->submit_time, READ);
+               percpu_ref_put(&ca->io_ref);
+       }
+
+       /* A non-split rbio is the caller's bio: put back its completion */
+       if (!rbio->split)
+               rbio->bio.bi_end_io = rbio->end_io;
+
+       if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
+                                   rbio->read_pos.inode,
+                                   rbio->read_pos.offset,
+                                   "data read error: %s",
+                              bch2_blk_status_to_str(bio->bi_status))) {
+               bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
+               return;
+       }
+
+       /* The pointer we read from went stale while the read was in flight: */
+       if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
+           ptr_stale(ca, &rbio->pick.ptr)) {
+               trace_and_count(c, read_reuse_race, &rbio->bio);
+
+               if (rbio->flags & BCH_READ_RETRY_IF_STALE)
+                       bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
+               else
+                       bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
+               return;
+       }
+
+       if (rbio->narrow_crcs ||
+           rbio->promote ||
+           crc_is_compressed(rbio->pick.crc) ||
+           bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
+               context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
+       else if (rbio->pick.crc.csum_type)
+               context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;
+
+       bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
+}
+
+/*
+ * Resolve a reflink pointer to the indirect extent it points to.
+ *
+ * @orig_k holds a reflink_p key on entry; the key found in the reflink btree
+ * at its idx + *@offset_into_extent is copied into @orig_k, and
+ * *@offset_into_extent is updated to be relative to the start of that key.
+ *
+ * Returns 0 on success, a btree lookup error, or -EIO (after flagging the
+ * filesystem inconsistent) if no indirect extent exists at that position.
+ */
+int __bch2_read_indirect_extent(struct btree_trans *trans,
+                               unsigned *offset_into_extent,
+                               struct bkey_buf *orig_k)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       u64 reflink_offset =
+               le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
+               *offset_into_extent;
+       int ret;
+
+       k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink,
+                              POS(0, reflink_offset), 0);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       switch (k.k->type) {
+       case KEY_TYPE_reflink_v:
+       case KEY_TYPE_indirect_inline_data:
+               break;
+       default:
+               bch_err_inum_offset_ratelimited(trans->c,
+                       orig_k->k->k.p.inode,
+                       orig_k->k->k.p.offset << 9,
+                       "%llu len %u points to nonexistent indirect extent %llu",
+                       orig_k->k->k.p.offset,
+                       orig_k->k->k.size,
+                       reflink_offset);
+               bch2_inconsistent_error(trans->c);
+               ret = -EIO;
+               goto err;
+       }
+
+       *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
+       bch2_bkey_buf_reassemble(orig_k, trans->c, k);
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+/*
+ * Report an attempt to read from a stale dirty pointer.
+ *
+ * Builds one message containing the extent key @k, the in-memory generation
+ * of the bucket @ptr points into and, when the lookup succeeds, the alloc
+ * btree key for that bucket, then passes it to bch2_fs_inconsistent().
+ */
+static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
+                                                  struct bkey_s_c k,
+                                                  struct bch_extent_ptr ptr)
+{
+       struct bch_fs *c = trans->c;
+       struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev);
+       struct btree_iter iter;
+       struct printbuf buf = PRINTBUF;
+       int ret;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+                            PTR_BUCKET_POS(c, &ptr),
+                            BTREE_ITER_CACHED);
+
+       prt_printf(&buf, "Attempting to read from stale dirty pointer:");
+       printbuf_indent_add(&buf, 2);
+       prt_newline(&buf);
+
+       bch2_bkey_val_to_text(&buf, c, k); /* the extent key we were asked to read */
+       prt_newline(&buf);
+
+       prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset));
+
+       ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+       if (!ret) { /* @k has been reused: it now holds the alloc key */
+               prt_newline(&buf);
+               bch2_bkey_val_to_text(&buf, c, k);
+       }
+
+       bch2_fs_inconsistent(c, "%s", buf.buf);
+
+       bch2_trans_iter_exit(trans, &iter);
+       printbuf_exit(&buf);
+}
+/*
+ * Read the part of extent @k selected by @iter and @offset_into_extent into
+ * @orig's bio.
+ *
+ * @read_pos:   position the read was issued at; used in error messages and
+ *              saved in the rbio
+ * @data_btree: btree @k came from; saved in the rbio
+ * @failed:     handed to bch2_bkey_pick_read_device() and updated through
+ *              bch2_mark_io_failure() on the retry path (bch2_read_extent()
+ *              passes NULL)
+ * @flags:      BCH_READ_* flags
+ *
+ * Inline data is copied straight into the bio and holes/reservations are zero
+ * filled. Otherwise a device is picked and the data is read directly into
+ * @orig, into a clone of it, or into a bounce rbio (possibly one that
+ * promote_alloc() already allocated).
+ *
+ * Without BCH_READ_IN_RETRY the IO is submitted asynchronously and 0 is
+ * returned. With BCH_READ_IN_RETRY the IO is waited for, completion is run
+ * inline, and the result is 0, the rbio's retry state (READ_RETRY_AVOID is
+ * reported as READ_RETRY) or READ_ERR.
+ */
+int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
+                      struct bvec_iter iter, struct bpos read_pos,
+                      enum btree_id data_btree, struct bkey_s_c k,
+                      unsigned offset_into_extent,
+                      struct bch_io_failures *failed, unsigned flags)
+{
+       struct bch_fs *c = trans->c;
+       struct extent_ptr_decoded pick;
+       struct bch_read_bio *rbio = NULL;
+       struct bch_dev *ca = NULL;
+       struct promote_op *promote = NULL;
+       bool bounce = false, read_full = false, narrow_crcs = false;
+       struct bpos data_pos = bkey_start_pos(k.k);
+       int pick_ret;
+
+       if (bkey_extent_is_inline_data(k.k)) {
+               unsigned bytes = min_t(unsigned, iter.bi_size,
+                                      bkey_inline_data_bytes(k.k));
+
+               swap(iter.bi_size, bytes); /* clamp iter to the inline data for the copy */
+               memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
+               swap(iter.bi_size, bytes); /* restore full size; bytes = amount copied */
+               bio_advance_iter(&orig->bio, &iter, bytes);
+               zero_fill_bio_iter(&orig->bio, iter); /* zero what lies past the inline data */
+               goto out_read_done;
+       }
+retry_pick:
+       pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
+
+       /* hole or reservation - just zero fill: */
+       if (!pick_ret)
+               goto hole;
+
+       if (pick_ret < 0) {
+               bch_err_inum_offset_ratelimited(c,
+                               read_pos.inode, read_pos.offset << 9,
+                               "no device to read from");
+               goto err;
+       }
+
+       ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+
+       /*
+        * Stale dirty pointers are treated as IO errors, but @failed isn't
+        * allocated unless we're in the retry path - so if we're not in the
+        * retry path, don't check here, it'll be caught in bch2_read_endio()
+        * and we'll end up in the retry path:
+        */
+       if ((flags & BCH_READ_IN_RETRY) &&
+           !pick.ptr.cached &&
+           unlikely(ptr_stale(ca, &pick.ptr))) {
+               read_from_stale_dirty_pointer(trans, k, pick.ptr);
+               bch2_mark_io_failure(failed, &pick);
+               goto retry_pick;
+       }
+
+       /*
+        * Unlock the iterator while the btree node's lock is still in
+        * cache, before doing the IO:
+        */
+       bch2_trans_unlock(trans);
+
+       if (flags & BCH_READ_NODECODE) {
+               /*
+                * can happen if we retry, and the extent we were going to read
+                * has been merged in the meantime:
+                */
+               if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
+                       goto hole;
+
+               iter.bi_size    = pick.crc.compressed_size << 9;
+               goto get_bio;
+       }
+
+       if (!(flags & BCH_READ_LAST_FRAGMENT) ||
+           bio_flagged(&orig->bio, BIO_CHAIN))
+               flags |= BCH_READ_MUST_CLONE;
+
+       narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
+               bch2_can_narrow_extent_crcs(k, pick.crc);
+
+       if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
+               flags |= BCH_READ_MUST_BOUNCE;
+
+       EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
+
+       if (crc_is_compressed(pick.crc) ||
+           (pick.crc.csum_type != BCH_CSUM_none &&
+            (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
+             (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
+              (flags & BCH_READ_USER_MAPPED)) ||
+             (flags & BCH_READ_MUST_BOUNCE)))) {
+               read_full = true;
+               bounce = true;
+       }
+
+       if (orig->opts.promote_target)
+               promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags,
+                                       &rbio, &bounce, &read_full);
+
+       if (!read_full) {
+               EBUG_ON(crc_is_compressed(pick.crc));
+               EBUG_ON(pick.crc.csum_type &&
+                       (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
+                        bvec_iter_sectors(iter) != pick.crc.live_size ||
+                        pick.crc.offset ||
+                        offset_into_extent));
+
+               data_pos.offset += offset_into_extent;
+               pick.ptr.offset += pick.crc.offset +
+                       offset_into_extent;
+               offset_into_extent              = 0;
+               pick.crc.compressed_size        = bvec_iter_sectors(iter);
+               pick.crc.uncompressed_size      = bvec_iter_sectors(iter);
+               pick.crc.offset                 = 0;
+               pick.crc.live_size              = bvec_iter_sectors(iter);
+       }
+get_bio:
+       if (rbio) {
+               /*
+                * promote already allocated bounce rbio:
+                * promote needs to allocate a bio big enough for uncompressing
+                * data in the write path, but we're not going to use it all
+                * here:
+                */
+               EBUG_ON(rbio->bio.bi_iter.bi_size <
+                      pick.crc.compressed_size << 9);
+               rbio->bio.bi_iter.bi_size =
+                       pick.crc.compressed_size << 9;
+       } else if (bounce) {
+               unsigned sectors = pick.crc.compressed_size;
+
+               rbio = rbio_init(bio_alloc_bioset(NULL,
+                                                 DIV_ROUND_UP(sectors, PAGE_SECTORS),
+                                                 0,
+                                                 GFP_NOFS,
+                                                 &c->bio_read_split),
+                                orig->opts);
+
+               bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
+               rbio->bounce    = true;
+               rbio->split     = true;
+       } else if (flags & BCH_READ_MUST_CLONE) {
+               /*
+                * Have to clone if there were any splits, due to error
+                * reporting issues (if a split errored, and retrying didn't
+                * work, when it reports the error to its parent (us) we don't
+                * know if the error was from our bio, and we should retry, or
+                * from the whole bio, in which case we don't want to retry and
+                * lose the error)
+                */
+               rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
+                                                &c->bio_read_split),
+                                orig->opts);
+               rbio->bio.bi_iter = iter;
+               rbio->split     = true;
+       } else {
+               rbio = orig;
+               rbio->bio.bi_iter = iter;
+               EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
+       }
+
+       EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
+
+       rbio->c                 = c;
+       rbio->submit_time       = local_clock();
+       if (rbio->split)
+               rbio->parent    = orig;
+       else
+               rbio->end_io    = orig->bio.bi_end_io;
+       rbio->bvec_iter         = iter;
+       rbio->offset_into_extent= offset_into_extent;
+       rbio->flags             = flags;
+       rbio->have_ioref        = pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
+       rbio->narrow_crcs       = narrow_crcs;
+       rbio->hole              = 0;
+       rbio->retry             = 0;
+       rbio->context           = 0;
+       /* XXX: only initialize this if needed */
+       rbio->devs_have         = bch2_bkey_devs(k);
+       rbio->pick              = pick;
+       rbio->subvol            = orig->subvol;
+       rbio->read_pos          = read_pos;
+       rbio->data_btree        = data_btree;
+       rbio->data_pos          = data_pos;
+       rbio->version           = k.k->version;
+       rbio->promote           = promote;
+       INIT_WORK(&rbio->work, NULL);
+
+       rbio->bio.bi_opf        = orig->bio.bi_opf;
+       rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
+       rbio->bio.bi_end_io     = bch2_read_endio;
+
+       if (rbio->bounce)
+               trace_and_count(c, read_bounce, &rbio->bio);
+
+       this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
+       bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
+
+       /*
+        * If it's being moved internally, we don't want to flag it as a cache
+        * hit:
+        */
+       if (pick.ptr.cached && !(flags & BCH_READ_NODECODE))
+               bch2_bucket_io_time_reset(trans, pick.ptr.dev,
+                       PTR_BUCKET_NR(ca, &pick.ptr), READ);
+
+       if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
+               bio_inc_remaining(&orig->bio); /* orig must now also wait for this fragment */
+               trace_and_count(c, read_split, &orig->bio);
+       }
+
+       if (!rbio->pick.idx) { /* regular read from the picked device */
+               if (!rbio->have_ioref) {
+                       bch_err_inum_offset_ratelimited(c,
+                                       read_pos.inode,
+                                       read_pos.offset << 9,
+                                       "no device to read from");
+                       bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+                       goto out;
+               }
+
+               this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
+                            bio_sectors(&rbio->bio));
+               bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
+
+               if (unlikely(c->opts.no_data_io)) {
+                       if (likely(!(flags & BCH_READ_IN_RETRY)))
+                               bio_endio(&rbio->bio); /* no_data_io: complete without submitting */
+               } else {
+                       if (likely(!(flags & BCH_READ_IN_RETRY)))
+                               submit_bio(&rbio->bio);
+                       else
+                               submit_bio_wait(&rbio->bio); /* the retry path is synchronous */
+               }
+
+               /*
+                * We just submitted IO which may block, we expect relock fail
+                * events and shouldn't count them:
+                */
+               trans->notrace_relock_fail = true;
+       } else {
+               /* Attempting reconstruct read: */
+               if (bch2_ec_read_extent(c, rbio)) {
+                       bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+                       goto out;
+               }
+
+               if (likely(!(flags & BCH_READ_IN_RETRY)))
+                       bio_endio(&rbio->bio);
+       }
+out:
+       if (likely(!(flags & BCH_READ_IN_RETRY))) {
+               return 0;
+       } else {
+               int ret;
+
+               rbio->context = RBIO_CONTEXT_UNBOUND;
+               bch2_read_endio(&rbio->bio); /* run completion inline */
+
+               ret = rbio->retry;
+               rbio = bch2_rbio_free(rbio);
+
+               if (ret == READ_RETRY_AVOID) { /* record the failed pick, report a plain retry */
+                       bch2_mark_io_failure(failed, &pick);
+                       ret = READ_RETRY;
+               }
+
+               if (!ret)
+                       goto out_read_done;
+
+               return ret;
+       }
+
+err:
+       if (flags & BCH_READ_IN_RETRY)
+               return READ_ERR;
+
+       orig->bio.bi_status = BLK_STS_IOERR;
+       goto out_read_done;
+
+hole:
+       /*
+        * won't normally happen in the BCH_READ_NODECODE
+        * (bch2_move_extent()) path, but if we retry and the extent we wanted
+        * to read no longer exists we have to signal that:
+        */
+       if (flags & BCH_READ_NODECODE)
+               orig->hole = true;
+
+       zero_fill_bio_iter(&orig->bio, iter);
+out_read_done:
+       if (flags & BCH_READ_LAST_FRAGMENT)
+               bch2_rbio_done(orig);
+       return 0;
+}
+/*
+ * Read the range described by @bvec_iter from inode @inum into @rbio, one
+ * extent at a time.
+ *
+ * Each iteration looks up the extent at the current sector, resolves it
+ * through the reflink btree if it is a reflink pointer, and hands the part
+ * that overlaps the request to __bch2_read_extent(). The whole walk is
+ * restarted on a transaction restart, READ_RETRY or READ_RETRY_AVOID; any
+ * other error is logged and completes @rbio with BLK_STS_IOERR.
+ *
+ * Must not be called with BCH_READ_NODECODE set.
+ *
+ * NOTE(review): when __bch2_read_extent() returns nonzero the loop is left
+ * before bvec_iter.bi_size is swapped back, so a restarted walk begins with
+ * the size still clamped to the failed fragment - confirm this is intended.
+ */
+void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
+                struct bvec_iter bvec_iter, subvol_inum inum,
+                struct bch_io_failures *failed, unsigned flags)
+{
+       struct btree_trans *trans = bch2_trans_get(c);
+       struct btree_iter iter;
+       struct bkey_buf sk;
+       struct bkey_s_c k;
+       u32 snapshot;
+       int ret;
+
+       BUG_ON(flags & BCH_READ_NODECODE);
+
+       bch2_bkey_buf_init(&sk);
+retry:
+       bch2_trans_begin(trans);
+       iter = (struct btree_iter) { NULL }; /* zeroed: err: calls iter_exit even before iter_init */
+
+       ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+       if (ret)
+               goto err;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+                            SPOS(inum.inum, bvec_iter.bi_sector, snapshot),
+                            BTREE_ITER_SLOTS);
+       while (1) {
+               unsigned bytes, sectors, offset_into_extent;
+               enum btree_id data_btree = BTREE_ID_extents;
+
+               /*
+                * read_extent -> io_time_reset may cause a transaction restart
+                * without returning an error, we need to check for that here:
+                */
+               ret = bch2_trans_relock(trans);
+               if (ret)
+                       break;
+
+               bch2_btree_iter_set_pos(&iter,
+                               POS(inum.inum, bvec_iter.bi_sector));
+
+               k = bch2_btree_iter_peek_slot(&iter);
+               ret = bkey_err(k);
+               if (ret)
+                       break;
+
+               offset_into_extent = iter.pos.offset -
+                       bkey_start_offset(k.k);
+               sectors = k.k->size - offset_into_extent;
+
+               bch2_bkey_buf_reassemble(&sk, c, k);
+
+               ret = bch2_read_indirect_extent(trans, &data_btree,
+                                       &offset_into_extent, &sk);
+               if (ret)
+                       break;
+
+               k = bkey_i_to_s_c(sk.k);
+
+               /*
+                * With indirect extents, the amount of data to read is the min
+                * of the original extent and the indirect extent:
+                */
+               sectors = min(sectors, k.k->size - offset_into_extent);
+
+               bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
+               swap(bvec_iter.bi_size, bytes); /* clamp to this fragment; bytes = full remaining size */
+
+               if (bvec_iter.bi_size == bytes) /* this fragment covers all that is left */
+                       flags |= BCH_READ_LAST_FRAGMENT;
+
+               ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
+                                        data_btree, k,
+                                        offset_into_extent, failed, flags);
+               if (ret)
+                       break;
+
+               if (flags & BCH_READ_LAST_FRAGMENT)
+                       break;
+
+               swap(bvec_iter.bi_size, bytes); /* restore; bytes = size of the fragment just issued */
+               bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
+
+               ret = btree_trans_too_many_iters(trans);
+               if (ret)
+                       break;
+       }
+err:
+       bch2_trans_iter_exit(trans, &iter);
+
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+           ret == READ_RETRY ||
+           ret == READ_RETRY_AVOID)
+               goto retry;
+
+       bch2_trans_put(trans);
+       bch2_bkey_buf_exit(&sk, c);
+
+       if (ret) {
+               bch_err_inum_offset_ratelimited(c, inum.inum,
+                                               bvec_iter.bi_sector << 9,
+                                               "read error %i from btree lookup", ret);
+               rbio->bio.bi_status = BLK_STS_IOERR;
+               bch2_rbio_done(rbio);
+       }
+}
+/*
+ * Tear down what bch2_fs_io_read_init() set up: the promote hash table (only
+ * if its table pointer is non-NULL) and the two read biosets.
+ */
+void bch2_fs_io_read_exit(struct bch_fs *c)
+{
+       if (c->promote_table.tbl)
+               rhashtable_destroy(&c->promote_table);
+       bioset_exit(&c->bio_read_split);
+       bioset_exit(&c->bio_read);
+}
+/*
+ * Set up the read path's biosets and the promote hash table.
+ *
+ * Both biosets use offsetof(struct bch_read_bio, bio) as front padding, so
+ * every bio allocated from them is embedded in a struct bch_read_bio.
+ *
+ * Returns 0 on success or a step-specific -BCH_ERR_ENOMEM_* code. Earlier
+ * steps are not undone here on failure; presumably the caller runs
+ * bch2_fs_io_read_exit() in that case - TODO confirm.
+ */
+int bch2_fs_io_read_init(struct bch_fs *c)
+{
+       if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
+                       BIOSET_NEED_BVECS))
+               return -BCH_ERR_ENOMEM_bio_read_init;
+
+       if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
+                       BIOSET_NEED_BVECS))
+               return -BCH_ERR_ENOMEM_bio_read_split_init;
+
+       if (rhashtable_init(&c->promote_table, &bch_promote_params))
+               return -BCH_ERR_ENOMEM_promote_table_init;
+
+       return 0;
+}
diff --git a/libbcachefs/io_read.h b/libbcachefs/io_read.h
new file mode 100644 (file)
index 0000000..d9c18bb
--- /dev/null
@@ -0,0 +1,158 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IO_READ_H
+#define _BCACHEFS_IO_READ_H
+
+#include "bkey_buf.h"
+/*
+ * Per-read state wrapped around the struct bio that gets submitted.
+ *
+ * @bio is the final member: to_rbio() recovers the wrapper with
+ * container_of(), and the read biosets are created with
+ * offsetof(struct bch_read_bio, bio) as their front padding.
+ */
+struct bch_read_bio {
+       struct bch_fs           *c;
+       u64                     start_time; /* local_clock() taken in bch2_read() */
+       u64                     submit_time; /* local_clock() taken in __bch2_read_extent() */
+
+       /*
+        * Reads will often have to be split, and if the extent being read from
+        * was checksummed or compressed we'll also have to allocate bounce
+        * buffers and copy the data back into the original bio.
+        *
+        * If we didn't have to split, we have to save and restore the original
+        * bi_end_io - @split below indicates which:
+        */
+       union {
+       struct bch_read_bio     *parent;
+       bio_end_io_t            *end_io;
+       };
+
+       /*
+        * Saved copy of bio->bi_iter, from submission time - allows us to
+        * resubmit on IO error, and also to copy data back to the original bio
+        * when we're bouncing:
+        */
+       struct bvec_iter        bvec_iter;
+
+       unsigned                offset_into_extent;
+
+       u16                     flags; /* BCH_READ_* flags, stored by __bch2_read_extent() */
+       union {
+       struct {
+       u16                     bounce:1,
+                               split:1,
+                               kmalloc:1,
+                               have_ioref:1,
+                               narrow_crcs:1,
+                               hole:1,
+                               retry:2,
+                               context:2;
+       };
+       u16                     _state; /* all bits above at once: zeroed by rbio_init(), checked by bch2_read() */
+       };
+
+       struct bch_devs_list    devs_have;
+
+       struct extent_ptr_decoded pick;
+
+       /*
+        * pos we read from - different from data_pos for indirect extents:
+        */
+       u32                     subvol;
+       struct bpos             read_pos;
+
+       /*
+        * start pos of data we read (may not be pos of data we want) - for
+        * promote, narrow extents paths:
+        */
+       enum btree_id           data_btree;
+       struct bpos             data_pos;
+       struct bversion         version;
+
+       struct promote_op       *promote;
+
+       struct bch_io_opts      opts;
+
+       struct work_struct      work;
+
+       struct bio              bio;
+};
+
+#define to_rbio(_bio)          container_of((_bio), struct bch_read_bio, bio)
+
+struct bch_devs_mask;
+struct cache_promote_op;
+struct extent_ptr_decoded;
+
+int __bch2_read_indirect_extent(struct btree_trans *, unsigned *,
+                               struct bkey_buf *);
+/*
+ * Fast-path wrapper around __bch2_read_indirect_extent(): keys that are not
+ * reflink pointers are left untouched and 0 is returned. For a reflink
+ * pointer, *@data_btree is switched to BTREE_ID_reflink and the pointer is
+ * resolved; the result of that lookup is returned.
+ */
+static inline int bch2_read_indirect_extent(struct btree_trans *trans,
+                                           enum btree_id *data_btree,
+                                           unsigned *offset_into_extent,
+                                           struct bkey_buf *k)
+{
+       if (k->k->k.type != KEY_TYPE_reflink_p)
+               return 0;
+
+       *data_btree = BTREE_ID_reflink;
+       return __bch2_read_indirect_extent(trans, offset_into_extent, k);
+}
+/*
+ * Flags for the read path, as used by the code in io_read.c:
+ *
+ * RETRY_IF_STALE - a pointer found stale at completion is retried instead of
+ *                  failing the read
+ * MAY_PROMOTE    - passed by bch2_read(); presumably consulted by
+ *                  promote_alloc() - TODO confirm
+ * USER_MAPPED    - forces a bounce buffer when crcs are being narrowed or the
+ *                  extent is encrypted
+ * NODECODE       - reads the extent's full compressed_size, skipping the
+ *                  bounce/clone/promote decisions; a missing extent is
+ *                  reported through rbio->hole
+ * LAST_FRAGMENT  - this is the final piece of the original bio
+ * MUST_BOUNCE    - read through a bounce buffer (checksummed extents)
+ * MUST_CLONE     - do not reuse the original bio for the read
+ * IN_RETRY       - synchronous retry: IO is waited for and the retry status is
+ *                  returned to the caller
+ */
+enum bch_read_flags {
+       BCH_READ_RETRY_IF_STALE         = 1 << 0,
+       BCH_READ_MAY_PROMOTE            = 1 << 1,
+       BCH_READ_USER_MAPPED            = 1 << 2,
+       BCH_READ_NODECODE               = 1 << 3,
+       BCH_READ_LAST_FRAGMENT          = 1 << 4,
+
+       /* internal: */
+       BCH_READ_MUST_BOUNCE            = 1 << 5,
+       BCH_READ_MUST_CLONE             = 1 << 6,
+       BCH_READ_IN_RETRY               = 1 << 7,
+};
+
+int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *,
+                      struct bvec_iter, struct bpos, enum btree_id,
+                      struct bkey_s_c, unsigned,
+                      struct bch_io_failures *, unsigned);
+/*
+ * Convenience wrapper around __bch2_read_extent(): reads into @rbio using its
+ * current bi_iter, passes no bch_io_failures list (NULL) and discards the
+ * return value.
+ */
+static inline void bch2_read_extent(struct btree_trans *trans,
+                       struct bch_read_bio *rbio, struct bpos read_pos,
+                       enum btree_id data_btree, struct bkey_s_c k,
+                       unsigned offset_into_extent, unsigned flags)
+{
+       __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos,
+                          data_btree, k, offset_into_extent, NULL, flags);
+}
+
+void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
+                subvol_inum, struct bch_io_failures *, unsigned flags);
+/*
+ * Entry point for a read of inode @inum into @rbio.
+ *
+ * @rbio must have clean state bits (normally via rbio_init()); anything else
+ * trips the BUG_ON. Records the filesystem, start time and subvolume in the
+ * rbio, then runs __bch2_read() over the bio's current iterator with an empty
+ * on-stack failure list and the RETRY_IF_STALE, MAY_PROMOTE and USER_MAPPED
+ * flags.
+ */
+static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
+                            subvol_inum inum)
+{
+       struct bch_io_failures failed = { .nr = 0 };
+
+       BUG_ON(rbio->_state);
+
+       rbio->c = c;
+       rbio->start_time = local_clock();
+       rbio->subvol = inum.subvol;
+
+       __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed,
+                   BCH_READ_RETRY_IF_STALE|
+                   BCH_READ_MAY_PROMOTE|
+                   BCH_READ_USER_MAPPED);
+}
+/*
+ * Initialise the bch_read_bio that embeds @bio: clears all state bits and the
+ * promote pointer and stores @opts. Other fields are left untouched.
+ * Returns the containing bch_read_bio.
+ */
+static inline struct bch_read_bio *rbio_init(struct bio *bio,
+                                            struct bch_io_opts opts)
+{
+       struct bch_read_bio *rbio = to_rbio(bio);
+
+       rbio->_state    = 0;
+       rbio->promote   = NULL;
+       rbio->opts      = opts;
+       return rbio;
+}
+
+void bch2_fs_io_read_exit(struct bch_fs *);
+int bch2_fs_io_read_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_IO_READ_H */
similarity index 53%
rename from libbcachefs/io.c
rename to libbcachefs/io_write.c
index 3c614c864b6eef698c9dac8ebc1884d7f4a61bed..d2a0de886c7a48b47577975d4e3fd5d0647acf8f 100644 (file)
@@ -1,29 +1,24 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * Some low level IO code, and hacks for various block layer limitations
- *
  * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
  * Copyright 2012 Google, Inc.
  */
 
 #include "bcachefs.h"
-#include "alloc_background.h"
 #include "alloc_foreground.h"
 #include "bkey_buf.h"
 #include "bset.h"
 #include "btree_update.h"
 #include "buckets.h"
 #include "checksum.h"
-#include "compress.h"
 #include "clock.h"
-#include "data_update.h"
+#include "compress.h"
 #include "debug.h"
-#include "disk_groups.h"
 #include "ec.h"
 #include "error.h"
 #include "extent_update.h"
 #include "inode.h"
-#include "io.h"
+#include "io_write.h"
 #include "journal.h"
 #include "keylist.h"
 #include "move.h"
 #include <linux/random.h>
 #include <linux/sched/mm.h>
 
-const char *bch2_blk_status_to_str(blk_status_t status)
-{
-       if (status == BLK_STS_REMOVED)
-               return "device removed";
-       return blk_status_to_str(status);
-}
-
 #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
 
-static bool bch2_target_congested(struct bch_fs *c, u16 target)
-{
-       const struct bch_devs_mask *devs;
-       unsigned d, nr = 0, total = 0;
-       u64 now = local_clock(), last;
-       s64 congested;
-       struct bch_dev *ca;
-
-       if (!target)
-               return false;
-
-       rcu_read_lock();
-       devs = bch2_target_to_mask(c, target) ?:
-               &c->rw_devs[BCH_DATA_user];
-
-       for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
-               ca = rcu_dereference(c->devs[d]);
-               if (!ca)
-                       continue;
-
-               congested = atomic_read(&ca->congested);
-               last = READ_ONCE(ca->congested_last);
-               if (time_after64(now, last))
-                       congested -= (now - last) >> 12;
-
-               total += max(congested, 0LL);
-               nr++;
-       }
-       rcu_read_unlock();
-
-       return bch2_rand_range(nr * CONGESTED_MAX) < total;
-}
-
 static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
                                       u64 now, int rw)
 {
@@ -136,13 +91,6 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
        __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
 }
 
-#else
-
-static bool bch2_target_congested(struct bch_fs *c, u16 target)
-{
-       return false;
-}
-
 #endif
 
 /* Allocate, free from mempool: */
@@ -368,213 +316,13 @@ int bch2_extent_update(struct btree_trans *trans,
        return 0;
 }
 
-/* Overwrites whatever was present with zeroes: */
-int bch2_extent_fallocate(struct btree_trans *trans,
-                         subvol_inum inum,
-                         struct btree_iter *iter,
-                         unsigned sectors,
-                         struct bch_io_opts opts,
-                         s64 *i_sectors_delta,
-                         struct write_point_specifier write_point)
-{
-       struct bch_fs *c = trans->c;
-       struct disk_reservation disk_res = { 0 };
-       struct closure cl;
-       struct open_buckets open_buckets = { 0 };
-       struct bkey_s_c k;
-       struct bkey_buf old, new;
-       unsigned sectors_allocated = 0;
-       bool have_reservation = false;
-       bool unwritten = opts.nocow &&
-           c->sb.version >= bcachefs_metadata_version_unwritten_extents;
-       int ret;
-
-       bch2_bkey_buf_init(&old);
-       bch2_bkey_buf_init(&new);
-       closure_init_stack(&cl);
-
-       k = bch2_btree_iter_peek_slot(iter);
-       ret = bkey_err(k);
-       if (ret)
-               return ret;
-
-       sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset);
-
-       if (!have_reservation) {
-               unsigned new_replicas =
-                       max(0, (int) opts.data_replicas -
-                           (int) bch2_bkey_nr_ptrs_fully_allocated(k));
-               /*
-                * Get a disk reservation before (in the nocow case) calling
-                * into the allocator:
-                */
-               ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0);
-               if (unlikely(ret))
-                       goto err;
-
-               bch2_bkey_buf_reassemble(&old, c, k);
-       }
-
-       if (have_reservation) {
-               if (!bch2_extents_match(k, bkey_i_to_s_c(old.k)))
-                       goto err;
-
-               bch2_key_resize(&new.k->k, sectors);
-       } else if (!unwritten) {
-               struct bkey_i_reservation *reservation;
-
-               bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64));
-               reservation = bkey_reservation_init(new.k);
-               reservation->k.p = iter->pos;
-               bch2_key_resize(&reservation->k, sectors);
-               reservation->v.nr_replicas = opts.data_replicas;
-       } else {
-               struct bkey_i_extent *e;
-               struct bch_devs_list devs_have;
-               struct write_point *wp;
-               struct bch_extent_ptr *ptr;
-
-               devs_have.nr = 0;
-
-               bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX);
-
-               e = bkey_extent_init(new.k);
-               e->k.p = iter->pos;
-
-               ret = bch2_alloc_sectors_start_trans(trans,
-                               opts.foreground_target,
-                               false,
-                               write_point,
-                               &devs_have,
-                               opts.data_replicas,
-                               opts.data_replicas,
-                               BCH_WATERMARK_normal, 0, &cl, &wp);
-               if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
-                       ret = -BCH_ERR_transaction_restart_nested;
-               if (ret)
-                       goto err;
-
-               sectors = min(sectors, wp->sectors_free);
-               sectors_allocated = sectors;
-
-               bch2_key_resize(&e->k, sectors);
-
-               bch2_open_bucket_get(c, wp, &open_buckets);
-               bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
-               bch2_alloc_sectors_done(c, wp);
-
-               extent_for_each_ptr(extent_i_to_s(e), ptr)
-                       ptr->unwritten = true;
-       }
-
-       have_reservation = true;
-
-       ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res,
-                                0, i_sectors_delta, true);
-err:
-       if (!ret && sectors_allocated)
-               bch2_increment_clock(c, sectors_allocated, WRITE);
-
-       bch2_open_buckets_put(c, &open_buckets);
-       bch2_disk_reservation_put(c, &disk_res);
-       bch2_bkey_buf_exit(&new, c);
-       bch2_bkey_buf_exit(&old, c);
-
-       if (closure_nr_remaining(&cl) != 1) {
-               bch2_trans_unlock(trans);
-               closure_sync(&cl);
-       }
-
-       return ret;
-}
-
-/*
- * Returns -BCH_ERR_transacton_restart if we had to drop locks:
- */
-int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
-                  subvol_inum inum, u64 end,
-                  s64 *i_sectors_delta)
-{
-       struct bch_fs *c        = trans->c;
-       unsigned max_sectors    = KEY_SIZE_MAX & (~0 << c->block_bits);
-       struct bpos end_pos = POS(inum.inum, end);
-       struct bkey_s_c k;
-       int ret = 0, ret2 = 0;
-       u32 snapshot;
-
-       while (!ret ||
-              bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
-               struct disk_reservation disk_res =
-                       bch2_disk_reservation_init(c, 0);
-               struct bkey_i delete;
-
-               if (ret)
-                       ret2 = ret;
-
-               bch2_trans_begin(trans);
-
-               ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
-               if (ret)
-                       continue;
-
-               bch2_btree_iter_set_snapshot(iter, snapshot);
-
-               /*
-                * peek_upto() doesn't have ideal semantics for extents:
-                */
-               k = bch2_btree_iter_peek_upto(iter, end_pos);
-               if (!k.k)
-                       break;
-
-               ret = bkey_err(k);
-               if (ret)
-                       continue;
-
-               bkey_init(&delete.k);
-               delete.k.p = iter->pos;
-
-               /* create the biggest key we can */
-               bch2_key_resize(&delete.k, max_sectors);
-               bch2_cut_back(end_pos, &delete);
-
-               ret = bch2_extent_update(trans, inum, iter, &delete,
-                               &disk_res, 0, i_sectors_delta, false);
-               bch2_disk_reservation_put(c, &disk_res);
-       }
-
-       return ret ?: ret2;
-}
-
-int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,
-               s64 *i_sectors_delta)
-{
-       struct btree_trans trans;
-       struct btree_iter iter;
-       int ret;
-
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
-                            POS(inum.inum, start),
-                            BTREE_ITER_INTENT);
-
-       ret = bch2_fpunch_at(&trans, &iter, inum, end, i_sectors_delta);
-
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
-
-       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               ret = 0;
-
-       return ret;
-}
-
 static int bch2_write_index_default(struct bch_write_op *op)
 {
        struct bch_fs *c = op->c;
        struct bkey_buf sk;
        struct keylist *keys = &op->insert_keys;
        struct bkey_i *k = bch2_keylist_front(keys);
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        subvol_inum inum = {
                .subvol = op->subvol,
@@ -585,30 +333,29 @@ static int bch2_write_index_default(struct bch_write_op *op)
        BUG_ON(!inum.subvol);
 
        bch2_bkey_buf_init(&sk);
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
 
        do {
-               bch2_trans_begin(&trans);
+               bch2_trans_begin(trans);
 
                k = bch2_keylist_front(keys);
                bch2_bkey_buf_copy(&sk, c, k);
 
-               ret = bch2_subvolume_get_snapshot(&trans, inum.subvol,
+               ret = bch2_subvolume_get_snapshot(trans, inum.subvol,
                                                  &sk.k->k.p.snapshot);
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        continue;
                if (ret)
                        break;
 
-               bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+               bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
                                     bkey_start_pos(&sk.k->k),
                                     BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 
-               ret = bch2_extent_update(&trans, inum, &iter, sk.k,
+               ret = bch2_extent_update(trans, inum, &iter, sk.k,
                                         &op->res,
                                         op->new_i_size, &op->i_sectors_delta,
                                         op->flags & BCH_WRITE_CHECK_ENOSPC);
-               bch2_trans_iter_exit(&trans, &iter);
+               bch2_trans_iter_exit(trans, &iter);
 
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        continue;
@@ -621,7 +368,7 @@ static int bch2_write_index_default(struct bch_write_op *op)
                        bch2_cut_front(iter.pos, k);
        } while (!bch2_keylist_empty(keys));
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        bch2_bkey_buf_exit(&sk, c);
 
        return ret;
@@ -741,7 +488,8 @@ static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
 }
 
 /**
- * bch_write_index - after a write, update index to point to new data
+ * __bch2_write_index - after a write, update index to point to new data
+ * @op:                bch_write_op to process
  */
 static void __bch2_write_index(struct bch_write_op *op)
 {
@@ -778,10 +526,10 @@ static void __bch2_write_index(struct bch_write_op *op)
                op->written += sectors_start - keylist_sectors(keys);
 
                if (ret && !bch2_err_matches(ret, EROFS)) {
-                       struct bkey_i *k = bch2_keylist_front(&op->insert_keys);
+                       struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
 
                        bch_err_inum_offset_ratelimited(c,
-                               k->k.p.inode, k->k.p.offset << 9,
+                               insert->k.p.inode, insert->k.p.offset << 9,
                                "write error while doing btree update: %s",
                                bch2_err_str(ret));
                }
@@ -1182,7 +930,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
        do {
                struct bch_extent_crc_unpacked crc = { 0 };
                struct bversion version = op->version;
-               size_t dst_len, src_len;
+               size_t dst_len = 0, src_len = 0;
 
                if (page_alloc_failed &&
                    dst->bi_iter.bi_size  < (wp->sectors_free << 9) &&
@@ -1414,27 +1162,25 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
 static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
 {
        struct bch_fs *c = op->c;
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_i *orig;
        struct bkey_s_c k;
        int ret;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
        for_each_keylist_key(&op->insert_keys, orig) {
-               ret = for_each_btree_key_upto_commit(&trans, iter, BTREE_ID_extents,
+               ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents,
                                     bkey_start_pos(&orig->k), orig->k.p,
                                     BTREE_ITER_INTENT, k,
                                     NULL, NULL, BTREE_INSERT_NOFAIL, ({
-                       bch2_nocow_write_convert_one_unwritten(&trans, &iter, orig, k, op->new_i_size);
+                       bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size);
                }));
 
                if (ret && !bch2_err_matches(ret, EROFS)) {
-                       struct bkey_i *k = bch2_keylist_front(&op->insert_keys);
+                       struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
 
                        bch_err_inum_offset_ratelimited(c,
-                               k->k.p.inode, k->k.p.offset << 9,
+                               insert->k.p.inode, insert->k.p.offset << 9,
                                "write error while doing btree update: %s",
                                bch2_err_str(ret));
                }
@@ -1445,7 +1191,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
                }
        }
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 }
 
 static void __bch2_nocow_write_done(struct bch_write_op *op)
@@ -1469,7 +1215,7 @@ static void bch2_nocow_write_done(struct closure *cl)
 static void bch2_nocow_write(struct bch_write_op *op)
 {
        struct bch_fs *c = op->c;
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_ptrs_c ptrs;
@@ -1486,15 +1232,15 @@ static void bch2_nocow_write(struct bch_write_op *op)
        if (op->flags & BCH_WRITE_MOVE)
                return;
 
-       bch2_trans_init(&trans, c, 0, 0);
+       trans = bch2_trans_get(c);
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
 
-       ret = bch2_subvolume_get_snapshot(&trans, op->subvol, &snapshot);
+       ret = bch2_subvolume_get_snapshot(trans, op->subvol, &snapshot);
        if (unlikely(ret))
                goto err;
 
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
                             SPOS(op->pos.inode, op->pos.offset, snapshot),
                             BTREE_ITER_SLOTS);
        while (1) {
@@ -1540,7 +1286,7 @@ retry:
 
                /* Unlock before taking nocow locks, doing IO: */
                bkey_reassemble(op->insert_keys.top, k);
-               bch2_trans_unlock(&trans);
+               bch2_trans_unlock(trans);
 
                bch2_cut_front(op->pos, op->insert_keys.top);
                if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN)
@@ -1589,7 +1335,7 @@ retry:
                bch2_btree_iter_advance(&iter);
        }
 out:
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
 err:
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
@@ -1604,7 +1350,7 @@ err:
                op->flags |= BCH_WRITE_DONE;
        }
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        /* fallback to cow write path? */
        if (!(op->flags & BCH_WRITE_DONE)) {
@@ -1682,7 +1428,7 @@ again:
                 * allocations for specific disks may hang arbitrarily long:
                 */
                ret = bch2_trans_do(c, NULL, NULL, 0,
-                       bch2_alloc_sectors_start_trans(&trans,
+                       bch2_alloc_sectors_start_trans(trans,
                                op->target,
                                op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
                                op->write_point,
@@ -1798,7 +1544,8 @@ err:
 }
 
 /**
- * bch_write - handle a write to a cache device or flash only volume
+ * bch2_write() - handle a write to a cache device or flash only volume
+ * @cl:                &bch_write_op->cl
  *
  * This is the starting point for any data to end up in a cache device; it could
  * be from a normal write, or a writeback write, or a write to a flash only
@@ -1899,1143 +1646,17 @@ void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
        printbuf_indent_sub(out, 2);
 }
 
-/* Cache promotion on read */
-
-struct promote_op {
-       struct rcu_head         rcu;
-       u64                     start_time;
-
-       struct rhash_head       hash;
-       struct bpos             pos;
-
-       struct data_update      write;
-       struct bio_vec          bi_inline_vecs[0]; /* must be last */
-};
-
-static const struct rhashtable_params bch_promote_params = {
-       .head_offset    = offsetof(struct promote_op, hash),
-       .key_offset     = offsetof(struct promote_op, pos),
-       .key_len        = sizeof(struct bpos),
-};
-
-static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k,
-                                 struct bpos pos,
-                                 struct bch_io_opts opts,
-                                 unsigned flags)
-{
-       if (!(flags & BCH_READ_MAY_PROMOTE))
-               return false;
-
-       if (!opts.promote_target)
-               return false;
-
-       if (bch2_bkey_has_target(c, k, opts.promote_target))
-               return false;
-
-       if (bkey_extent_is_unwritten(k))
-               return false;
-
-       if (bch2_target_congested(c, opts.promote_target)) {
-               /* XXX trace this */
-               return false;
-       }
-
-       if (rhashtable_lookup_fast(&c->promote_table, &pos,
-                                  bch_promote_params))
-               return false;
-
-       return true;
-}
-
-static void promote_free(struct bch_fs *c, struct promote_op *op)
-{
-       int ret;
-
-       bch2_data_update_exit(&op->write);
-
-       ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
-                                    bch_promote_params);
-       BUG_ON(ret);
-       bch2_write_ref_put(c, BCH_WRITE_REF_promote);
-       kfree_rcu(op, rcu);
-}
-
-static void promote_done(struct bch_write_op *wop)
-{
-       struct promote_op *op =
-               container_of(wop, struct promote_op, write.op);
-       struct bch_fs *c = op->write.op.c;
-
-       bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
-                              op->start_time);
-       promote_free(c, op);
-}
-
-static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
-{
-       struct bio *bio = &op->write.op.wbio.bio;
-
-       trace_and_count(op->write.op.c, read_promote, &rbio->bio);
-
-       /* we now own pages: */
-       BUG_ON(!rbio->bounce);
-       BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
-
-       memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
-              sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
-       swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
-
-       bch2_data_update_read_done(&op->write, rbio->pick.crc);
-}
-
-static struct promote_op *__promote_alloc(struct btree_trans *trans,
-                                         enum btree_id btree_id,
-                                         struct bkey_s_c k,
-                                         struct bpos pos,
-                                         struct extent_ptr_decoded *pick,
-                                         struct bch_io_opts opts,
-                                         unsigned sectors,
-                                         struct bch_read_bio **rbio)
-{
-       struct bch_fs *c = trans->c;
-       struct promote_op *op = NULL;
-       struct bio *bio;
-       unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
-       int ret;
-
-       if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
-               return NULL;
-
-       op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOFS);
-       if (!op)
-               goto err;
-
-       op->start_time = local_clock();
-       op->pos = pos;
-
-       /*
-        * We don't use the mempool here because extents that aren't
-        * checksummed or compressed can be too big for the mempool:
-        */
-       *rbio = kzalloc(sizeof(struct bch_read_bio) +
-                       sizeof(struct bio_vec) * pages,
-                       GFP_NOFS);
-       if (!*rbio)
-               goto err;
-
-       rbio_init(&(*rbio)->bio, opts);
-       bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);
-
-       if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9,
-                                GFP_NOFS))
-               goto err;
-
-       (*rbio)->bounce         = true;
-       (*rbio)->split          = true;
-       (*rbio)->kmalloc        = true;
-
-       if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
-                                         bch_promote_params))
-               goto err;
-
-       bio = &op->write.op.wbio.bio;
-       bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);
-
-       ret = bch2_data_update_init(trans, NULL, &op->write,
-                       writepoint_hashed((unsigned long) current),
-                       opts,
-                       (struct data_update_opts) {
-                               .target         = opts.promote_target,
-                               .extra_replicas = 1,
-                               .write_flags    = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED,
-                       },
-                       btree_id, k);
-       /*
-        * possible errors: -BCH_ERR_nocow_lock_blocked,
-        * -BCH_ERR_ENOSPC_disk_reservation:
-        */
-       if (ret) {
-               ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
-                                       bch_promote_params);
-               BUG_ON(ret);
-               goto err;
-       }
-
-       op->write.op.end_io = promote_done;
-
-       return op;
-err:
-       if (*rbio)
-               bio_free_pages(&(*rbio)->bio);
-       kfree(*rbio);
-       *rbio = NULL;
-       kfree(op);
-       bch2_write_ref_put(c, BCH_WRITE_REF_promote);
-       return NULL;
-}
-
-noinline
-static struct promote_op *promote_alloc(struct btree_trans *trans,
-                                       struct bvec_iter iter,
-                                       struct bkey_s_c k,
-                                       struct extent_ptr_decoded *pick,
-                                       struct bch_io_opts opts,
-                                       unsigned flags,
-                                       struct bch_read_bio **rbio,
-                                       bool *bounce,
-                                       bool *read_full)
-{
-       struct bch_fs *c = trans->c;
-       bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
-       /* data might have to be decompressed in the write path: */
-       unsigned sectors = promote_full
-               ? max(pick->crc.compressed_size, pick->crc.live_size)
-               : bvec_iter_sectors(iter);
-       struct bpos pos = promote_full
-               ? bkey_start_pos(k.k)
-               : POS(k.k->p.inode, iter.bi_sector);
-       struct promote_op *promote;
-
-       if (!should_promote(c, k, pos, opts, flags))
-               return NULL;
-
-       promote = __promote_alloc(trans,
-                                 k.k->type == KEY_TYPE_reflink_v
-                                 ? BTREE_ID_reflink
-                                 : BTREE_ID_extents,
-                                 k, pos, pick, opts, sectors, rbio);
-       if (!promote)
-               return NULL;
-
-       *bounce         = true;
-       *read_full      = promote_full;
-       return promote;
-}
-
-/* Read */
-
-#define READ_RETRY_AVOID       1
-#define READ_RETRY             2
-#define READ_ERR               3
-
-enum rbio_context {
-       RBIO_CONTEXT_NULL,
-       RBIO_CONTEXT_HIGHPRI,
-       RBIO_CONTEXT_UNBOUND,
-};
-
-static inline struct bch_read_bio *
-bch2_rbio_parent(struct bch_read_bio *rbio)
-{
-       return rbio->split ? rbio->parent : rbio;
-}
-
-__always_inline
-static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
-                          enum rbio_context context,
-                          struct workqueue_struct *wq)
-{
-       if (context <= rbio->context) {
-               fn(&rbio->work);
-       } else {
-               rbio->work.func         = fn;
-               rbio->context           = context;
-               queue_work(wq, &rbio->work);
-       }
-}
-
-static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
-{
-       BUG_ON(rbio->bounce && !rbio->split);
-
-       if (rbio->promote)
-               promote_free(rbio->c, rbio->promote);
-       rbio->promote = NULL;
-
-       if (rbio->bounce)
-               bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
-
-       if (rbio->split) {
-               struct bch_read_bio *parent = rbio->parent;
-
-               if (rbio->kmalloc)
-                       kfree(rbio);
-               else
-                       bio_put(&rbio->bio);
-
-               rbio = parent;
-       }
-
-       return rbio;
-}
-
-/*
- * Only called on a top level bch_read_bio to complete an entire read request,
- * not a split:
- */
-static void bch2_rbio_done(struct bch_read_bio *rbio)
-{
-       if (rbio->start_time)
-               bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
-                                      rbio->start_time);
-       bio_endio(&rbio->bio);
-}
-
-static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
-                                    struct bvec_iter bvec_iter,
-                                    struct bch_io_failures *failed,
-                                    unsigned flags)
-{
-       struct btree_trans trans;
-       struct btree_iter iter;
-       struct bkey_buf sk;
-       struct bkey_s_c k;
-       int ret;
-
-       flags &= ~BCH_READ_LAST_FRAGMENT;
-       flags |= BCH_READ_MUST_CLONE;
-
-       bch2_bkey_buf_init(&sk);
-       bch2_trans_init(&trans, c, 0, 0);
-
-       bch2_trans_iter_init(&trans, &iter, rbio->data_btree,
-                            rbio->read_pos, BTREE_ITER_SLOTS);
-retry:
-       rbio->bio.bi_status = 0;
-
-       k = bch2_btree_iter_peek_slot(&iter);
-       if (bkey_err(k))
-               goto err;
-
-       bch2_bkey_buf_reassemble(&sk, c, k);
-       k = bkey_i_to_s_c(sk.k);
-       bch2_trans_unlock(&trans);
-
-       if (!bch2_bkey_matches_ptr(c, k,
-                                  rbio->pick.ptr,
-                                  rbio->data_pos.offset -
-                                  rbio->pick.crc.offset)) {
-               /* extent we wanted to read no longer exists: */
-               rbio->hole = true;
-               goto out;
-       }
-
-       ret = __bch2_read_extent(&trans, rbio, bvec_iter,
-                                rbio->read_pos,
-                                rbio->data_btree,
-                                k, 0, failed, flags);
-       if (ret == READ_RETRY)
-               goto retry;
-       if (ret)
-               goto err;
-out:
-       bch2_rbio_done(rbio);
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
-       bch2_bkey_buf_exit(&sk, c);
-       return;
-err:
-       rbio->bio.bi_status = BLK_STS_IOERR;
-       goto out;
-}
-
-static void bch2_rbio_retry(struct work_struct *work)
+void bch2_fs_io_write_exit(struct bch_fs *c)
 {
-       struct bch_read_bio *rbio =
-               container_of(work, struct bch_read_bio, work);
-       struct bch_fs *c        = rbio->c;
-       struct bvec_iter iter   = rbio->bvec_iter;
-       unsigned flags          = rbio->flags;
-       subvol_inum inum = {
-               .subvol = rbio->subvol,
-               .inum   = rbio->read_pos.inode,
-       };
-       struct bch_io_failures failed = { .nr = 0 };
-
-       trace_and_count(c, read_retry, &rbio->bio);
-
-       if (rbio->retry == READ_RETRY_AVOID)
-               bch2_mark_io_failure(&failed, &rbio->pick);
-
-       rbio->bio.bi_status = 0;
-
-       rbio = bch2_rbio_free(rbio);
-
-       flags |= BCH_READ_IN_RETRY;
-       flags &= ~BCH_READ_MAY_PROMOTE;
-
-       if (flags & BCH_READ_NODECODE) {
-               bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
-       } else {
-               flags &= ~BCH_READ_LAST_FRAGMENT;
-               flags |= BCH_READ_MUST_CLONE;
-
-               __bch2_read(c, rbio, iter, inum, &failed, flags);
-       }
+       mempool_exit(&c->bio_bounce_pages);
+       bioset_exit(&c->bio_write);
 }
 
-static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
-                           blk_status_t error)
+int bch2_fs_io_write_init(struct bch_fs *c)
 {
-       rbio->retry = retry;
-
-       if (rbio->flags & BCH_READ_IN_RETRY)
-               return;
-
-       if (retry == READ_ERR) {
-               rbio = bch2_rbio_free(rbio);
-
-               rbio->bio.bi_status = error;
-               bch2_rbio_done(rbio);
-       } else {
-               bch2_rbio_punt(rbio, bch2_rbio_retry,
-                              RBIO_CONTEXT_UNBOUND, system_unbound_wq);
-       }
-}
-
-static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
-                                  struct bch_read_bio *rbio)
-{
-       struct bch_fs *c = rbio->c;
-       u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
-       struct bch_extent_crc_unpacked new_crc;
-       struct btree_iter iter;
-       struct bkey_i *new;
-       struct bkey_s_c k;
-       int ret = 0;
-
-       if (crc_is_compressed(rbio->pick.crc))
-               return 0;
-
-       k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
-                              BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-       if ((ret = bkey_err(k)))
-               goto out;
-
-       if (bversion_cmp(k.k->version, rbio->version) ||
-           !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
-               goto out;
-
-       /* Extent was merged? */
-       if (bkey_start_offset(k.k) < data_offset ||
-           k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
-               goto out;
-
-       if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
-                       rbio->pick.crc, NULL, &new_crc,
-                       bkey_start_offset(k.k) - data_offset, k.k->size,
-                       rbio->pick.crc.csum_type)) {
-               bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
-               ret = 0;
-               goto out;
-       }
-
-       /*
-        * going to be temporarily appending another checksum entry:
-        */
-       new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
-                                sizeof(struct bch_extent_crc128));
-       if ((ret = PTR_ERR_OR_ZERO(new)))
-               goto out;
-
-       bkey_reassemble(new, k);
-
-       if (!bch2_bkey_narrow_crcs(new, new_crc))
-               goto out;
-
-       ret = bch2_trans_update(trans, &iter, new,
-                               BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
-out:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
-{
-       bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL,
-                     __bch2_rbio_narrow_crcs(&trans, rbio));
-}
-
-/* Inner part that may run in process context */
-static void __bch2_read_endio(struct work_struct *work)
-{
-       struct bch_read_bio *rbio =
-               container_of(work, struct bch_read_bio, work);
-       struct bch_fs *c        = rbio->c;
-       struct bch_dev *ca      = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
-       struct bio *src         = &rbio->bio;
-       struct bio *dst         = &bch2_rbio_parent(rbio)->bio;
-       struct bvec_iter dst_iter = rbio->bvec_iter;
-       struct bch_extent_crc_unpacked crc = rbio->pick.crc;
-       struct nonce nonce = extent_nonce(rbio->version, crc);
-       unsigned nofs_flags;
-       struct bch_csum csum;
-       int ret;
-
-       nofs_flags = memalloc_nofs_save();
-
-       /* Reset iterator for checksumming and copying bounced data: */
-       if (rbio->bounce) {
-               src->bi_iter.bi_size            = crc.compressed_size << 9;
-               src->bi_iter.bi_idx             = 0;
-               src->bi_iter.bi_bvec_done       = 0;
-       } else {
-               src->bi_iter                    = rbio->bvec_iter;
-       }
-
-       csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
-       if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io)
-               goto csum_err;
-
-       /*
-        * XXX
-        * We need to rework the narrow_crcs path to deliver the read completion
-        * first, and then punt to a different workqueue, otherwise we're
-        * holding up reads while doing btree updates which is bad for memory
-        * reclaim.
-        */
-       if (unlikely(rbio->narrow_crcs))
-               bch2_rbio_narrow_crcs(rbio);
-
-       if (rbio->flags & BCH_READ_NODECODE)
-               goto nodecode;
-
-       /* Adjust crc to point to subset of data we want: */
-       crc.offset     += rbio->offset_into_extent;
-       crc.live_size   = bvec_iter_sectors(rbio->bvec_iter);
-
-       if (crc_is_compressed(crc)) {
-               ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
-               if (ret)
-                       goto decrypt_err;
-
-               if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
-                   !c->opts.no_data_io)
-                       goto decompression_err;
-       } else {
-               /* don't need to decrypt the entire bio: */
-               nonce = nonce_add(nonce, crc.offset << 9);
-               bio_advance(src, crc.offset << 9);
-
-               BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
-               src->bi_iter.bi_size = dst_iter.bi_size;
-
-               ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
-               if (ret)
-                       goto decrypt_err;
-
-               if (rbio->bounce) {
-                       struct bvec_iter src_iter = src->bi_iter;
-
-                       bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
-               }
-       }
-
-       if (rbio->promote) {
-               /*
-                * Re encrypt data we decrypted, so it's consistent with
-                * rbio->crc:
-                */
-               ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
-               if (ret)
-                       goto decrypt_err;
-
-               promote_start(rbio->promote, rbio);
-               rbio->promote = NULL;
-       }
-nodecode:
-       if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
-               rbio = bch2_rbio_free(rbio);
-               bch2_rbio_done(rbio);
-       }
-out:
-       memalloc_nofs_restore(nofs_flags);
-       return;
-csum_err:
-       /*
-        * Checksum error: if the bio wasn't bounced, we may have been
-        * reading into buffers owned by userspace (that userspace can
-        * scribble over) - retry the read, bouncing it this time:
-        */
-       if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
-               rbio->flags |= BCH_READ_MUST_BOUNCE;
-               bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
-               goto out;
-       }
-
-       bch_err_inum_offset_ratelimited(ca,
-               rbio->read_pos.inode,
-               rbio->read_pos.offset << 9,
-               "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)",
-               rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
-               csum.hi, csum.lo, bch2_csum_types[crc.csum_type]);
-       bch2_io_error(ca);
-       bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
-       goto out;
-decompression_err:
-       bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
-                                       rbio->read_pos.offset << 9,
-                                       "decompression error");
-       bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
-       goto out;
-decrypt_err:
-       bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
-                                       rbio->read_pos.offset << 9,
-                                       "decrypt error");
-       bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
-       goto out;
-}
-
-static void bch2_read_endio(struct bio *bio)
-{
-       struct bch_read_bio *rbio =
-               container_of(bio, struct bch_read_bio, bio);
-       struct bch_fs *c        = rbio->c;
-       struct bch_dev *ca      = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
-       struct workqueue_struct *wq = NULL;
-       enum rbio_context context = RBIO_CONTEXT_NULL;
-
-       if (rbio->have_ioref) {
-               bch2_latency_acct(ca, rbio->submit_time, READ);
-               percpu_ref_put(&ca->io_ref);
-       }
-
-       if (!rbio->split)
-               rbio->bio.bi_end_io = rbio->end_io;
-
-       if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
-                                   rbio->read_pos.inode,
-                                   rbio->read_pos.offset,
-                                   "data read error: %s",
-                              bch2_blk_status_to_str(bio->bi_status))) {
-               bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
-               return;
-       }
-
-       if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
-           ptr_stale(ca, &rbio->pick.ptr)) {
-               trace_and_count(c, read_reuse_race, &rbio->bio);
-
-               if (rbio->flags & BCH_READ_RETRY_IF_STALE)
-                       bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
-               else
-                       bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
-               return;
-       }
-
-       if (rbio->narrow_crcs ||
-           rbio->promote ||
-           crc_is_compressed(rbio->pick.crc) ||
-           bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
-               context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
-       else if (rbio->pick.crc.csum_type)
-               context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;
-
-       bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
-}
-
-int __bch2_read_indirect_extent(struct btree_trans *trans,
-                               unsigned *offset_into_extent,
-                               struct bkey_buf *orig_k)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       u64 reflink_offset;
-       int ret;
-
-       reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
-               *offset_into_extent;
-
-       k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink,
-                              POS(0, reflink_offset), 0);
-       ret = bkey_err(k);
-       if (ret)
-               goto err;
-
-       if (k.k->type != KEY_TYPE_reflink_v &&
-           k.k->type != KEY_TYPE_indirect_inline_data) {
-               bch_err_inum_offset_ratelimited(trans->c,
-                       orig_k->k->k.p.inode,
-                       orig_k->k->k.p.offset << 9,
-                       "%llu len %u points to nonexistent indirect extent %llu",
-                       orig_k->k->k.p.offset,
-                       orig_k->k->k.size,
-                       reflink_offset);
-               bch2_inconsistent_error(trans->c);
-               ret = -EIO;
-               goto err;
-       }
-
-       *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
-       bch2_bkey_buf_reassemble(orig_k, trans->c, k);
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
-                                                  struct bkey_s_c k,
-                                                  struct bch_extent_ptr ptr)
-{
-       struct bch_fs *c = trans->c;
-       struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev);
-       struct btree_iter iter;
-       struct printbuf buf = PRINTBUF;
-       int ret;
-
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
-                            PTR_BUCKET_POS(c, &ptr),
-                            BTREE_ITER_CACHED);
-
-       prt_printf(&buf, "Attempting to read from stale dirty pointer:");
-       printbuf_indent_add(&buf, 2);
-       prt_newline(&buf);
-
-       bch2_bkey_val_to_text(&buf, c, k);
-       prt_newline(&buf);
-
-       prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset));
-
-       ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
-       if (!ret) {
-               prt_newline(&buf);
-               bch2_bkey_val_to_text(&buf, c, k);
-       }
-
-       bch2_fs_inconsistent(c, "%s", buf.buf);
-
-       bch2_trans_iter_exit(trans, &iter);
-       printbuf_exit(&buf);
-}
-
-int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
-                      struct bvec_iter iter, struct bpos read_pos,
-                      enum btree_id data_btree, struct bkey_s_c k,
-                      unsigned offset_into_extent,
-                      struct bch_io_failures *failed, unsigned flags)
-{
-       struct bch_fs *c = trans->c;
-       struct extent_ptr_decoded pick;
-       struct bch_read_bio *rbio = NULL;
-       struct bch_dev *ca = NULL;
-       struct promote_op *promote = NULL;
-       bool bounce = false, read_full = false, narrow_crcs = false;
-       struct bpos data_pos = bkey_start_pos(k.k);
-       int pick_ret;
-
-       if (bkey_extent_is_inline_data(k.k)) {
-               unsigned bytes = min_t(unsigned, iter.bi_size,
-                                      bkey_inline_data_bytes(k.k));
-
-               swap(iter.bi_size, bytes);
-               memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
-               swap(iter.bi_size, bytes);
-               bio_advance_iter(&orig->bio, &iter, bytes);
-               zero_fill_bio_iter(&orig->bio, iter);
-               goto out_read_done;
-       }
-retry_pick:
-       pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
-
-       /* hole or reservation - just zero fill: */
-       if (!pick_ret)
-               goto hole;
-
-       if (pick_ret < 0) {
-               bch_err_inum_offset_ratelimited(c,
-                               read_pos.inode, read_pos.offset << 9,
-                               "no device to read from");
-               goto err;
-       }
-
-       ca = bch_dev_bkey_exists(c, pick.ptr.dev);
-
-       /*
-        * Stale dirty pointers are treated as IO errors, but @failed isn't
-        * allocated unless we're in the retry path - so if we're not in the
-        * retry path, don't check here, it'll be caught in bch2_read_endio()
-        * and we'll end up in the retry path:
-        */
-       if ((flags & BCH_READ_IN_RETRY) &&
-           !pick.ptr.cached &&
-           unlikely(ptr_stale(ca, &pick.ptr))) {
-               read_from_stale_dirty_pointer(trans, k, pick.ptr);
-               bch2_mark_io_failure(failed, &pick);
-               goto retry_pick;
-       }
-
-       /*
-        * Unlock the iterator while the btree node's lock is still in
-        * cache, before doing the IO:
-        */
-       bch2_trans_unlock(trans);
-
-       if (flags & BCH_READ_NODECODE) {
-               /*
-                * can happen if we retry, and the extent we were going to read
-                * has been merged in the meantime:
-                */
-               if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
-                       goto hole;
-
-               iter.bi_size    = pick.crc.compressed_size << 9;
-               goto get_bio;
-       }
-
-       if (!(flags & BCH_READ_LAST_FRAGMENT) ||
-           bio_flagged(&orig->bio, BIO_CHAIN))
-               flags |= BCH_READ_MUST_CLONE;
-
-       narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
-               bch2_can_narrow_extent_crcs(k, pick.crc);
-
-       if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
-               flags |= BCH_READ_MUST_BOUNCE;
-
-       EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
-
-       if (crc_is_compressed(pick.crc) ||
-           (pick.crc.csum_type != BCH_CSUM_none &&
-            (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
-             (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
-              (flags & BCH_READ_USER_MAPPED)) ||
-             (flags & BCH_READ_MUST_BOUNCE)))) {
-               read_full = true;
-               bounce = true;
-       }
-
-       if (orig->opts.promote_target)
-               promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags,
-                                       &rbio, &bounce, &read_full);
-
-       if (!read_full) {
-               EBUG_ON(crc_is_compressed(pick.crc));
-               EBUG_ON(pick.crc.csum_type &&
-                       (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
-                        bvec_iter_sectors(iter) != pick.crc.live_size ||
-                        pick.crc.offset ||
-                        offset_into_extent));
-
-               data_pos.offset += offset_into_extent;
-               pick.ptr.offset += pick.crc.offset +
-                       offset_into_extent;
-               offset_into_extent              = 0;
-               pick.crc.compressed_size        = bvec_iter_sectors(iter);
-               pick.crc.uncompressed_size      = bvec_iter_sectors(iter);
-               pick.crc.offset                 = 0;
-               pick.crc.live_size              = bvec_iter_sectors(iter);
-               offset_into_extent              = 0;
-       }
-get_bio:
-       if (rbio) {
-               /*
-                * promote already allocated bounce rbio:
-                * promote needs to allocate a bio big enough for uncompressing
-                * data in the write path, but we're not going to use it all
-                * here:
-                */
-               EBUG_ON(rbio->bio.bi_iter.bi_size <
-                      pick.crc.compressed_size << 9);
-               rbio->bio.bi_iter.bi_size =
-                       pick.crc.compressed_size << 9;
-       } else if (bounce) {
-               unsigned sectors = pick.crc.compressed_size;
-
-               rbio = rbio_init(bio_alloc_bioset(NULL,
-                                                 DIV_ROUND_UP(sectors, PAGE_SECTORS),
-                                                 0,
-                                                 GFP_NOFS,
-                                                 &c->bio_read_split),
-                                orig->opts);
-
-               bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
-               rbio->bounce    = true;
-               rbio->split     = true;
-       } else if (flags & BCH_READ_MUST_CLONE) {
-               /*
-                * Have to clone if there were any splits, due to error
-                * reporting issues (if a split errored, and retrying didn't
-                * work, when it reports the error to its parent (us) we don't
-                * know if the error was from our bio, and we should retry, or
-                * from the whole bio, in which case we don't want to retry and
-                * lose the error)
-                */
-               rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
-                                                &c->bio_read_split),
-                                orig->opts);
-               rbio->bio.bi_iter = iter;
-               rbio->split     = true;
-       } else {
-               rbio = orig;
-               rbio->bio.bi_iter = iter;
-               EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
-       }
-
-       EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
-
-       rbio->c                 = c;
-       rbio->submit_time       = local_clock();
-       if (rbio->split)
-               rbio->parent    = orig;
-       else
-               rbio->end_io    = orig->bio.bi_end_io;
-       rbio->bvec_iter         = iter;
-       rbio->offset_into_extent= offset_into_extent;
-       rbio->flags             = flags;
-       rbio->have_ioref        = pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
-       rbio->narrow_crcs       = narrow_crcs;
-       rbio->hole              = 0;
-       rbio->retry             = 0;
-       rbio->context           = 0;
-       /* XXX: only initialize this if needed */
-       rbio->devs_have         = bch2_bkey_devs(k);
-       rbio->pick              = pick;
-       rbio->subvol            = orig->subvol;
-       rbio->read_pos          = read_pos;
-       rbio->data_btree        = data_btree;
-       rbio->data_pos          = data_pos;
-       rbio->version           = k.k->version;
-       rbio->promote           = promote;
-       INIT_WORK(&rbio->work, NULL);
-
-       rbio->bio.bi_opf        = orig->bio.bi_opf;
-       rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
-       rbio->bio.bi_end_io     = bch2_read_endio;
-
-       if (rbio->bounce)
-               trace_and_count(c, read_bounce, &rbio->bio);
-
-       this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
-       bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
-
-       /*
-        * If it's being moved internally, we don't want to flag it as a cache
-        * hit:
-        */
-       if (pick.ptr.cached && !(flags & BCH_READ_NODECODE))
-               bch2_bucket_io_time_reset(trans, pick.ptr.dev,
-                       PTR_BUCKET_NR(ca, &pick.ptr), READ);
-
-       if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
-               bio_inc_remaining(&orig->bio);
-               trace_and_count(c, read_split, &orig->bio);
-       }
-
-       if (!rbio->pick.idx) {
-               if (!rbio->have_ioref) {
-                       bch_err_inum_offset_ratelimited(c,
-                                       read_pos.inode,
-                                       read_pos.offset << 9,
-                                       "no device to read from");
-                       bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
-                       goto out;
-               }
-
-               this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
-                            bio_sectors(&rbio->bio));
-               bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
-
-               if (unlikely(c->opts.no_data_io)) {
-                       if (likely(!(flags & BCH_READ_IN_RETRY)))
-                               bio_endio(&rbio->bio);
-               } else {
-                       if (likely(!(flags & BCH_READ_IN_RETRY)))
-                               submit_bio(&rbio->bio);
-                       else
-                               submit_bio_wait(&rbio->bio);
-               }
-
-               /*
-                * We just submitted IO which may block, we expect relock fail
-                * events and shouldn't count them:
-                */
-               trans->notrace_relock_fail = true;
-       } else {
-               /* Attempting reconstruct read: */
-               if (bch2_ec_read_extent(c, rbio)) {
-                       bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
-                       goto out;
-               }
-
-               if (likely(!(flags & BCH_READ_IN_RETRY)))
-                       bio_endio(&rbio->bio);
-       }
-out:
-       if (likely(!(flags & BCH_READ_IN_RETRY))) {
-               return 0;
-       } else {
-               int ret;
-
-               rbio->context = RBIO_CONTEXT_UNBOUND;
-               bch2_read_endio(&rbio->bio);
-
-               ret = rbio->retry;
-               rbio = bch2_rbio_free(rbio);
-
-               if (ret == READ_RETRY_AVOID) {
-                       bch2_mark_io_failure(failed, &pick);
-                       ret = READ_RETRY;
-               }
-
-               if (!ret)
-                       goto out_read_done;
-
-               return ret;
-       }
-
-err:
-       if (flags & BCH_READ_IN_RETRY)
-               return READ_ERR;
-
-       orig->bio.bi_status = BLK_STS_IOERR;
-       goto out_read_done;
-
-hole:
-       /*
-        * won't normally happen in the BCH_READ_NODECODE
-        * (bch2_move_extent()) path, but if we retry and the extent we wanted
-        * to read no longer exists we have to signal that:
-        */
-       if (flags & BCH_READ_NODECODE)
-               orig->hole = true;
-
-       zero_fill_bio_iter(&orig->bio, iter);
-out_read_done:
-       if (flags & BCH_READ_LAST_FRAGMENT)
-               bch2_rbio_done(orig);
-       return 0;
-}
-
-void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
-                struct bvec_iter bvec_iter, subvol_inum inum,
-                struct bch_io_failures *failed, unsigned flags)
-{
-       struct btree_trans trans;
-       struct btree_iter iter;
-       struct bkey_buf sk;
-       struct bkey_s_c k;
-       u32 snapshot;
-       int ret;
-
-       BUG_ON(flags & BCH_READ_NODECODE);
-
-       bch2_bkey_buf_init(&sk);
-       bch2_trans_init(&trans, c, 0, 0);
-retry:
-       bch2_trans_begin(&trans);
-       iter = (struct btree_iter) { NULL };
-
-       ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
-       if (ret)
-               goto err;
-
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
-                            SPOS(inum.inum, bvec_iter.bi_sector, snapshot),
-                            BTREE_ITER_SLOTS);
-       while (1) {
-               unsigned bytes, sectors, offset_into_extent;
-               enum btree_id data_btree = BTREE_ID_extents;
-
-               /*
-                * read_extent -> io_time_reset may cause a transaction restart
-                * without returning an error, we need to check for that here:
-                */
-               ret = bch2_trans_relock(&trans);
-               if (ret)
-                       break;
-
-               bch2_btree_iter_set_pos(&iter,
-                               POS(inum.inum, bvec_iter.bi_sector));
-
-               k = bch2_btree_iter_peek_slot(&iter);
-               ret = bkey_err(k);
-               if (ret)
-                       break;
-
-               offset_into_extent = iter.pos.offset -
-                       bkey_start_offset(k.k);
-               sectors = k.k->size - offset_into_extent;
-
-               bch2_bkey_buf_reassemble(&sk, c, k);
-
-               ret = bch2_read_indirect_extent(&trans, &data_btree,
-                                       &offset_into_extent, &sk);
-               if (ret)
-                       break;
-
-               k = bkey_i_to_s_c(sk.k);
-
-               /*
-                * With indirect extents, the amount of data to read is the min
-                * of the original extent and the indirect extent:
-                */
-               sectors = min(sectors, k.k->size - offset_into_extent);
-
-               bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
-               swap(bvec_iter.bi_size, bytes);
-
-               if (bvec_iter.bi_size == bytes)
-                       flags |= BCH_READ_LAST_FRAGMENT;
-
-               ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter.pos,
-                                        data_btree, k,
-                                        offset_into_extent, failed, flags);
-               if (ret)
-                       break;
-
-               if (flags & BCH_READ_LAST_FRAGMENT)
-                       break;
-
-               swap(bvec_iter.bi_size, bytes);
-               bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
-
-               ret = btree_trans_too_many_iters(&trans);
-               if (ret)
-                       break;
-       }
-err:
-       bch2_trans_iter_exit(&trans, &iter);
-
-       if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
-           ret == READ_RETRY ||
-           ret == READ_RETRY_AVOID)
-               goto retry;
-
-       bch2_trans_exit(&trans);
-       bch2_bkey_buf_exit(&sk, c);
-
-       if (ret) {
-               bch_err_inum_offset_ratelimited(c, inum.inum,
-                                               bvec_iter.bi_sector << 9,
-                                               "read error %i from btree lookup", ret);
-               rbio->bio.bi_status = BLK_STS_IOERR;
-               bch2_rbio_done(rbio);
-       }
-}
-
-void bch2_fs_io_exit(struct bch_fs *c)
-{
-       if (c->promote_table.tbl)
-               rhashtable_destroy(&c->promote_table);
-       mempool_exit(&c->bio_bounce_pages);
-       bioset_exit(&c->bio_write);
-       bioset_exit(&c->bio_read_split);
-       bioset_exit(&c->bio_read);
-}
-
-int bch2_fs_io_init(struct bch_fs *c)
-{
-       if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
-                       BIOSET_NEED_BVECS))
-               return -BCH_ERR_ENOMEM_bio_read_init;
-
-       if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
-                       BIOSET_NEED_BVECS))
-               return -BCH_ERR_ENOMEM_bio_read_split_init;
-
-       if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
-                       BIOSET_NEED_BVECS))
-               return -BCH_ERR_ENOMEM_bio_write_init;
+       if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
+                       BIOSET_NEED_BVECS))
+               return -BCH_ERR_ENOMEM_bio_write_init;
 
        if (mempool_init_page_pool(&c->bio_bounce_pages,
                                   max_t(unsigned,
@@ -3044,8 +1665,5 @@ int bch2_fs_io_init(struct bch_fs *c)
                                   PAGE_SIZE, 0))
                return -BCH_ERR_ENOMEM_bio_bounce_pages_init;
 
-       if (rhashtable_init(&c->promote_table, &bch_promote_params))
-               return -BCH_ERR_ENOMEM_promote_table_init;
-
        return 0;
 }
diff --git a/libbcachefs/io_write.h b/libbcachefs/io_write.h
new file mode 100644 (file)
index 0000000..9323167
--- /dev/null
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IO_WRITE_H
+#define _BCACHEFS_IO_WRITE_H
+
+#include "checksum.h"
+#include "io_write_types.h"
+
+#define to_wbio(_bio)                  \
+       container_of((_bio), struct bch_write_bio, bio)
+
+void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
+void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
+
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+void bch2_latency_acct(struct bch_dev *, u64, int);
+#else
+static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
+#endif
+
+void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
+                              enum bch_data_type, const struct bkey_i *, bool);
+
+#define BCH_WRITE_FLAGS()              \
+       x(ALLOC_NOWAIT)                 \
+       x(CACHED)                       \
+       x(DATA_ENCODED)                 \
+       x(PAGES_STABLE)                 \
+       x(PAGES_OWNED)                  \
+       x(ONLY_SPECIFIED_DEVS)          \
+       x(WROTE_DATA_INLINE)            \
+       x(FROM_INTERNAL)                \
+       x(CHECK_ENOSPC)                 \
+       x(SYNC)                         \
+       x(MOVE)                         \
+       x(IN_WORKER)                    \
+       x(DONE)                         \
+       x(IO_ERROR)                     \
+       x(CONVERT_UNWRITTEN)
+
+enum __bch_write_flags {
+#define x(f)   __BCH_WRITE_##f,
+       BCH_WRITE_FLAGS()
+#undef x
+};
+
+enum bch_write_flags {
+#define x(f)   BCH_WRITE_##f = BIT(__BCH_WRITE_##f),
+       BCH_WRITE_FLAGS()
+#undef x
+};
+
+static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
+{
+       return op->watermark == BCH_WATERMARK_copygc
+               ? op->c->copygc_wq
+               : op->c->btree_update_wq;
+}
+
+int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *,
+                              struct bkey_i *, bool *, s64 *, s64 *);
+int bch2_extent_update(struct btree_trans *, subvol_inum,
+                      struct btree_iter *, struct bkey_i *,
+                      struct disk_reservation *, u64, s64 *, bool);
+
+static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
+                                     struct bch_io_opts opts)
+{
+       op->c                   = c;
+       op->end_io              = NULL;
+       op->flags               = 0;
+       op->written             = 0;
+       op->error               = 0;
+       op->csum_type           = bch2_data_checksum_type(c, opts);
+       op->compression_opt     = opts.compression;
+       op->nr_replicas         = 0;
+       op->nr_replicas_required = c->opts.data_replicas_required;
+       op->watermark           = BCH_WATERMARK_normal;
+       op->incompressible      = 0;
+       op->open_buckets.nr     = 0;
+       op->devs_have.nr        = 0;
+       op->target              = 0;
+       op->opts                = opts;
+       op->subvol              = 0;
+       op->pos                 = POS_MAX;
+       op->version             = ZERO_VERSION;
+       op->write_point         = (struct write_point_specifier) { 0 };
+       op->res                 = (struct disk_reservation) { 0 };
+       op->new_i_size          = U64_MAX;
+       op->i_sectors_delta     = 0;
+       op->devs_need_flush     = NULL;
+}
+
+void bch2_write(struct closure *);
+
+void bch2_write_point_do_index_updates(struct work_struct *);
+
+static inline struct bch_write_bio *wbio_init(struct bio *bio)
+{
+       struct bch_write_bio *wbio = to_wbio(bio);
+
+       memset(&wbio->wbio, 0, sizeof(wbio->wbio));
+       return wbio;
+}
+
+void bch2_write_op_to_text(struct printbuf *, struct bch_write_op *);
+
+void bch2_fs_io_write_exit(struct bch_fs *);
+int bch2_fs_io_write_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_IO_WRITE_H */
similarity index 54%
rename from libbcachefs/io_types.h
rename to libbcachefs/io_write_types.h
index 737f16d78c48e50c3a325124c9f4f2ea4418388e..c7f97c2c4805f20656476db7da04e840c165734b 100644 (file)
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_IO_TYPES_H
-#define _BCACHEFS_IO_TYPES_H
+#ifndef _BCACHEFS_IO_WRITE_TYPES_H
+#define _BCACHEFS_IO_WRITE_TYPES_H
 
 #include "alloc_types.h"
 #include "btree_types.h"
 #include <linux/llist.h>
 #include <linux/workqueue.h>
 
-struct bch_read_bio {
-       struct bch_fs           *c;
-       u64                     start_time;
-       u64                     submit_time;
-
-       /*
-        * Reads will often have to be split, and if the extent being read from
-        * was checksummed or compressed we'll also have to allocate bounce
-        * buffers and copy the data back into the original bio.
-        *
-        * If we didn't have to split, we have to save and restore the original
-        * bi_end_io - @split below indicates which:
-        */
-       union {
-       struct bch_read_bio     *parent;
-       bio_end_io_t            *end_io;
-       };
-
-       /*
-        * Saved copy of bio->bi_iter, from submission time - allows us to
-        * resubmit on IO error, and also to copy data back to the original bio
-        * when we're bouncing:
-        */
-       struct bvec_iter        bvec_iter;
-
-       unsigned                offset_into_extent;
-
-       u16                     flags;
-       union {
-       struct {
-       u16                     bounce:1,
-                               split:1,
-                               kmalloc:1,
-                               have_ioref:1,
-                               narrow_crcs:1,
-                               hole:1,
-                               retry:2,
-                               context:2;
-       };
-       u16                     _state;
-       };
-
-       struct bch_devs_list    devs_have;
-
-       struct extent_ptr_decoded pick;
-
-       /*
-        * pos we read from - different from data_pos for indirect extents:
-        */
-       u32                     subvol;
-       struct bpos             read_pos;
-
-       /*
-        * start pos of data we read (may not be pos of data we want) - for
-        * promote, narrow extents paths:
-        */
-       enum btree_id           data_btree;
-       struct bpos             data_pos;
-       struct bversion         version;
-
-       struct promote_op       *promote;
-
-       struct bch_io_opts      opts;
-
-       struct work_struct      work;
-
-       struct bio              bio;
-};
-
 struct bch_write_bio {
        struct_group(wbio,
        struct bch_fs           *c;
@@ -162,4 +93,4 @@ struct bch_write_op {
        struct bch_write_bio    wbio;
 };
 
-#endif /* _BCACHEFS_IO_TYPES_H */
+#endif /* _BCACHEFS_IO_WRITE_TYPES_H */
index 055920c26da613e1529ecef1f43c949794b640a9..fc3dd5bef386abbd59b1c240240a43d6fd4a871b 100644 (file)
@@ -132,13 +132,21 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags)
        return stuck;
 }
 
-/* journal entry close/open: */
-
-void __bch2_journal_buf_put(struct journal *j)
+/*
+ * Final processing when the last reference of a journal buffer has been
+ * dropped. Drop the pin list reference acquired at journal entry open and write
+ * the buffer, if requested.
+ */
+void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
 
-       closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
+       lockdep_assert_held(&j->lock);
+
+       if (__bch2_journal_pin_put(j, seq))
+               bch2_journal_reclaim_fast(j);
+       if (write)
+               closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
 }
 
 /*
@@ -204,13 +212,11 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val)
        buf->data->last_seq     = cpu_to_le64(buf->last_seq);
        BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq));
 
-       __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq));
-
        cancel_delayed_work(&j->write_work);
 
        bch2_journal_space_available(j);
 
-       bch2_journal_buf_put(j, old.idx);
+       __bch2_journal_buf_put(j, old.idx, le64_to_cpu(buf->data->seq));
 }
 
 void bch2_journal_halt(struct journal *j)
@@ -588,8 +594,13 @@ out:
 
 /**
  * bch2_journal_flush_seq_async - wait for a journal entry to be written
+ * @j:         journal object
+ * @seq:       seq to flush
+ * @parent:    closure object to wait with
+ * Returns:    1 if @seq has already been flushed, 0 if @seq is being flushed,
+ *             -EIO if @seq will never be flushed
  *
- * like bch2_journal_wait_on_seq, except that it triggers a write immediately if
+ * Like bch2_journal_wait_on_seq, except that it triggers a write immediately if
  * necessary
  */
 int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
@@ -829,12 +840,12 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
                                break;
 
                        ret = bch2_trans_run(c,
-                               bch2_trans_mark_metadata_bucket(&trans, ca,
+                               bch2_trans_mark_metadata_bucket(trans, ca,
                                                ob[nr_got]->bucket, BCH_DATA_journal,
                                                ca->mi.bucket_size));
                        if (ret) {
                                bch2_open_bucket_put(c, ob[nr_got]);
-                               bch_err(c, "error marking new journal buckets: %s", bch2_err_str(ret));
+                               bch_err_msg(c, ret, "marking new journal buckets");
                                break;
                        }
 
@@ -910,7 +921,7 @@ err_unblock:
        if (ret && !new_fs)
                for (i = 0; i < nr_got; i++)
                        bch2_trans_run(c,
-                               bch2_trans_mark_metadata_bucket(&trans, ca,
+                               bch2_trans_mark_metadata_bucket(trans, ca,
                                                bu[i], BCH_DATA_free, 0));
 err_free:
        if (!new_fs)
@@ -944,7 +955,7 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
                goto unlock;
 
        while (ja->nr < nr) {
-               struct disk_reservation disk_res = { 0, 0 };
+               struct disk_reservation disk_res = { 0, 0, 0 };
 
                /*
                 * note: journal buckets aren't really counted as _sectors_ used yet, so
index 008a2e25a4fac93df681233b679c9fc6eaca95d0..491133cc52f3bf38f9c80cf94daf7d0d5b0cc6c1 100644 (file)
@@ -252,9 +252,10 @@ static inline bool journal_entry_empty(struct jset *j)
        return true;
 }
 
-void __bch2_journal_buf_put(struct journal *);
-
-static inline void bch2_journal_buf_put(struct journal *j, unsigned idx)
+/*
+ * Drop reference on a buffer index and return the resulting journal_res_state.
+ */
+static inline union journal_res_state journal_state_buf_put(struct journal *j, unsigned idx)
 {
        union journal_res_state s;
 
@@ -264,9 +265,30 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx)
                                    .buf2_count = idx == 2,
                                    .buf3_count = idx == 3,
                                    }).v, &j->reservations.counter);
+       return s;
+}
+
+void bch2_journal_buf_put_final(struct journal *, u64, bool);
+
+static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
+{
+       union journal_res_state s;
+
+       s = journal_state_buf_put(j, idx);
+       if (!journal_state_count(s, idx))
+               bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx);
+}
 
-       if (!journal_state_count(s, idx) && idx == s.unwritten_idx)
-               __bch2_journal_buf_put(j);
+static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
+{
+       union journal_res_state s;
+
+       s = journal_state_buf_put(j, idx);
+       if (!journal_state_count(s, idx)) {
+               spin_lock(&j->lock);
+               bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx);
+               spin_unlock(&j->lock);
+       }
 }
 
 /*
@@ -286,7 +308,7 @@ static inline void bch2_journal_res_put(struct journal *j,
                                       BCH_JSET_ENTRY_btree_keys,
                                       0, 0, 0);
 
-       bch2_journal_buf_put(j, res->idx);
+       bch2_journal_buf_put(j, res->idx, res->seq);
 
        res->ref = 0;
 }
index 34740dca4b15523c44f8b49540810c36a39ef7e7..6a3d6a374e9cc4385547eaf8a32894cfda15b49e 100644 (file)
@@ -8,7 +8,6 @@
 #include "checksum.h"
 #include "disk_groups.h"
 #include "error.h"
-#include "io.h"
 #include "journal.h"
 #include "journal_io.h"
 #include "journal_reclaim.h"
@@ -238,17 +237,17 @@ static void journal_entry_err_msg(struct printbuf *out,
 
 #define journal_entry_err(c, version, jset, entry, msg, ...)           \
 ({                                                                     \
-       struct printbuf buf = PRINTBUF;                                 \
+       struct printbuf _buf = PRINTBUF;                                \
                                                                        \
-       journal_entry_err_msg(&buf, version, jset, entry);              \
-       prt_printf(&buf, msg, ##__VA_ARGS__);                           \
+       journal_entry_err_msg(&_buf, version, jset, entry);             \
+       prt_printf(&_buf, msg, ##__VA_ARGS__);                          \
                                                                        \
        switch (flags & BKEY_INVALID_WRITE) {                           \
        case READ:                                                      \
-               mustfix_fsck_err(c, "%s", buf.buf);                     \
+               mustfix_fsck_err(c, "%s", _buf.buf);                    \
                break;                                                  \
        case WRITE:                                                     \
-               bch_err(c, "corrupt metadata before write: %s\n", buf.buf);\
+               bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\
                if (bch2_fs_inconsistent(c)) {                          \
                        ret = -BCH_ERR_fsck_errors_not_fixed;           \
                        goto fsck_err;                                  \
@@ -256,7 +255,7 @@ static void journal_entry_err_msg(struct printbuf *out,
                break;                                                  \
        }                                                               \
                                                                        \
-       printbuf_exit(&buf);                                            \
+       printbuf_exit(&_buf);                                           \
        true;                                                           \
 })
 
@@ -1282,7 +1281,7 @@ int bch2_journal_read(struct bch_fs *c,
                        continue;
 
                for (ptr = 0; ptr < i->nr_ptrs; ptr++) {
-                       struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev);
+                       ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev);
 
                        if (!i->ptrs[ptr].csum_good)
                                bch_err_dev_offset(ca, i->ptrs[ptr].sector,
@@ -1380,16 +1379,21 @@ static void __journal_write_alloc(struct journal *j,
 }
 
 /**
- * journal_next_bucket - move on to the next journal bucket if possible
+ * journal_write_alloc - decide where to write next journal entry
+ *
+ * @j:         journal object
+ * @w:         journal buf (entry to be written)
+ *
+ * Returns: 0 on success, or -EROFS on failure
  */
-static int journal_write_alloc(struct journal *j, struct journal_buf *w,
-                              unsigned sectors)
+static int journal_write_alloc(struct journal *j, struct journal_buf *w)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_devs_mask devs;
        struct journal_device *ja;
        struct bch_dev *ca;
        struct dev_alloc_list devs_sorted;
+       unsigned sectors = vstruct_sectors(w->data, c->block_bits);
        unsigned target = c->opts.metadata_target ?:
                c->opts.foreground_target;
        unsigned i, replicas = 0, replicas_want =
@@ -1550,6 +1554,7 @@ static void journal_write_done(struct closure *cl)
 
        if (!journal_state_count(new, new.unwritten_idx) &&
            journal_last_unwritten_seq(j) <= journal_cur_seq(j)) {
+               spin_unlock(&j->lock);
                closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
        } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
                   new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
@@ -1562,10 +1567,11 @@ static void journal_write_done(struct closure *cl)
                 * might want to be written now:
                 */
 
+               spin_unlock(&j->lock);
                mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta));
+       } else {
+               spin_unlock(&j->lock);
        }
-
-       spin_unlock(&j->lock);
 }
 
 static void journal_write_endio(struct bio *bio)
@@ -1813,7 +1819,7 @@ void bch2_journal_write(struct closure *cl)
 
 retry_alloc:
        spin_lock(&j->lock);
-       ret = journal_write_alloc(j, w, sectors);
+       ret = journal_write_alloc(j, w);
 
        if (ret && j->can_discard) {
                spin_unlock(&j->lock);
index 10e1860dad79acba08a5ff904dd92f7e3aa3f1ce..9a584aaaa2eba9abadc7f2016a20c70834e0610c 100644 (file)
@@ -290,9 +290,8 @@ void bch2_journal_do_discards(struct journal *j)
  * entry, holding it open to ensure it gets replayed during recovery:
  */
 
-static void bch2_journal_reclaim_fast(struct journal *j)
+void bch2_journal_reclaim_fast(struct journal *j)
 {
-       struct journal_entry_pin_list temp;
        bool popped = false;
 
        lockdep_assert_held(&j->lock);
@@ -303,7 +302,7 @@ static void bch2_journal_reclaim_fast(struct journal *j)
         */
        while (!fifo_empty(&j->pin) &&
               !atomic_read(&fifo_peek_front(&j->pin).count)) {
-               fifo_pop(&j->pin, temp);
+               j->pin.front++;
                popped = true;
        }
 
@@ -311,19 +310,16 @@ static void bch2_journal_reclaim_fast(struct journal *j)
                bch2_journal_space_available(j);
 }
 
-void __bch2_journal_pin_put(struct journal *j, u64 seq)
+bool __bch2_journal_pin_put(struct journal *j, u64 seq)
 {
        struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
 
-       if (atomic_dec_and_test(&pin_list->count))
-               bch2_journal_reclaim_fast(j);
+       return atomic_dec_and_test(&pin_list->count);
 }
 
 void bch2_journal_pin_put(struct journal *j, u64 seq)
 {
-       struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
-
-       if (atomic_dec_and_test(&pin_list->count)) {
+       if (__bch2_journal_pin_put(j, seq)) {
                spin_lock(&j->lock);
                bch2_journal_reclaim_fast(j);
                spin_unlock(&j->lock);
@@ -419,6 +415,8 @@ void bch2_journal_pin_set(struct journal *j, u64 seq,
 
 /**
  * bch2_journal_pin_flush: ensure journal pin callback is no longer running
+ * @j:         journal object
+ * @pin:       pin to flush
  */
 void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
 {
@@ -579,7 +577,11 @@ static u64 journal_seq_to_flush(struct journal *j)
 }
 
 /**
- * bch2_journal_reclaim - free up journal buckets
+ * __bch2_journal_reclaim - free up journal buckets
+ * @j:         journal object
+ * @direct:    direct or background reclaim?
+ * @kicked:    requested to run since we last ran?
+ * Returns:    0 on success, or -EIO if the journal has been shutdown
  *
  * Background journal reclaim writes out btree nodes. It should be run
  * early enough so that we never completely run out of journal buckets.
@@ -758,7 +760,7 @@ int bch2_journal_reclaim_start(struct journal *j)
                           "bch-reclaim/%s", c->name);
        ret = PTR_ERR_OR_ZERO(p);
        if (ret) {
-               bch_err(c, "error creating journal reclaim thread: %s", bch2_err_str(ret));
+               bch_err_msg(c, ret, "creating journal reclaim thread");
                return ret;
        }
 
index 0fd1af120db551746fc5cac54000c8616914a4f3..494d1a6eddb011fd5c0aa0b41676522949b12577 100644 (file)
@@ -31,7 +31,8 @@ journal_seq_pin(struct journal *j, u64 seq)
        return &j->pin.data[seq & j->pin.mask];
 }
 
-void __bch2_journal_pin_put(struct journal *, u64);
+void bch2_journal_reclaim_fast(struct journal *);
+bool __bch2_journal_pin_put(struct journal *, u64);
 void bch2_journal_pin_put(struct journal *, u64);
 void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
 
index d6b9f2cdf8e7df2664abd4f30df9d67559e0d926..1e1a79405693610a8781a6ef1ed6da49bf831b4a 100644 (file)
@@ -250,20 +250,18 @@ void bch2_blacklist_entries_gc(struct work_struct *work)
        struct journal_seq_blacklist_table *t;
        struct bch_sb_field_journal_seq_blacklist *bl;
        struct journal_seq_blacklist_entry *src, *dst;
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        unsigned i, nr, new_nr;
        int ret;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
        for (i = 0; i < BTREE_ID_NR; i++) {
                struct btree_iter iter;
                struct btree *b;
 
-               bch2_trans_node_iter_init(&trans, &iter, i, POS_MIN,
+               bch2_trans_node_iter_init(trans, &iter, i, POS_MIN,
                                          0, 0, BTREE_ITER_PREFETCH);
 retry:
-               bch2_trans_begin(&trans);
+               bch2_trans_begin(trans);
 
                b = bch2_btree_iter_peek_node(&iter);
 
@@ -275,10 +273,10 @@ retry:
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        goto retry;
 
-               bch2_trans_iter_exit(&trans, &iter);
+               bch2_trans_iter_exit(trans, &iter);
        }
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        if (ret)
                return;
 
diff --git a/libbcachefs/logged_ops.c b/libbcachefs/logged_ops.c
new file mode 100644 (file)
index 0000000..1bf19aa
--- /dev/null
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "error.h"
+#include "io_misc.h"
+#include "logged_ops.h"
+
+struct bch_logged_op_fn {
+       u8              type;
+       int             (*resume)(struct btree_trans *, struct bkey_i *);
+};
+
+static const struct bch_logged_op_fn logged_op_fns[] = {
+#define x(n)           {                                       \
+       .type           = KEY_TYPE_logged_op_##n,               \
+       .resume         = bch2_resume_logged_op_##n,            \
+},
+       BCH_LOGGED_OPS()
+#undef x
+};
+
+static const struct bch_logged_op_fn *logged_op_fn(enum bch_bkey_type type)
+{
+       for (unsigned i = 0; i < ARRAY_SIZE(logged_op_fns); i++)
+               if (logged_op_fns[i].type == type)
+                       return logged_op_fns + i;
+       return NULL;
+}
+
+static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
+                           struct bkey_s_c k)
+{
+       struct bch_fs *c = trans->c;
+       const struct bch_logged_op_fn *fn = logged_op_fn(k.k->type);
+       struct bkey_buf sk;
+       u32 restart_count = trans->restart_count;
+       int ret;
+
+       if (!fn)
+               return 0;
+
+       bch2_bkey_buf_init(&sk);
+       bch2_bkey_buf_reassemble(&sk, c, k);
+
+       ret = fn->resume(trans, sk.k) ?: trans_was_restarted(trans, restart_count);
+
+       bch2_bkey_buf_exit(&sk, c);
+       return ret;
+}
+
+int bch2_resume_logged_ops(struct bch_fs *c)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
+
+       ret = bch2_trans_run(c,
+               for_each_btree_key2(trans, iter,
+                               BTREE_ID_logged_ops, POS_MIN, BTREE_ITER_PREFETCH, k,
+                       resume_logged_op(trans, &iter, k)));
+       if (ret)
+               bch_err_fn(c, ret);
+       return ret;
+}
+
+static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
+{
+       struct btree_iter iter;
+       int ret;
+
+       ret = bch2_bkey_get_empty_slot(trans, &iter, BTREE_ID_logged_ops, POS_MAX);
+       if (ret)
+               return ret;
+
+       k->k.p = iter.pos;
+
+       ret = bch2_trans_update(trans, &iter, k, 0);
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+int bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
+{
+       return commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+                        __bch2_logged_op_start(trans, k));
+}
+
+void bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k)
+{
+       int ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+                           bch2_btree_delete(trans, BTREE_ID_logged_ops, k->k.p, 0));
+       /*
+        * This needs to be a fatal error because we've left an unfinished
+        * operation in the logged ops btree.
+        *
+        * We should only ever see an error here if the filesystem has already
+        * been shut down, but make sure of that here:
+        */
+       if (ret) {
+               struct bch_fs *c = trans->c;
+               struct printbuf buf = PRINTBUF;
+
+               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
+               bch2_fs_fatal_error(c, "%s: error deleting logged operation %s: %s",
+                                    __func__, buf.buf, bch2_err_str(ret));
+               printbuf_exit(&buf);
+       }
+}
diff --git a/libbcachefs/logged_ops.h b/libbcachefs/logged_ops.h
new file mode 100644 (file)
index 0000000..4d1e786
--- /dev/null
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_LOGGED_OPS_H
+#define _BCACHEFS_LOGGED_OPS_H
+
+#include "bkey.h"
+
+#define BCH_LOGGED_OPS()                       \
+       x(truncate)                             \
+       x(finsert)
+
+static inline int bch2_logged_op_update(struct btree_trans *trans, struct bkey_i *op)
+{
+       return bch2_btree_insert_nonextent(trans, BTREE_ID_logged_ops, op, 0);
+}
+
+int bch2_resume_logged_ops(struct bch_fs *);
+int bch2_logged_op_start(struct btree_trans *, struct bkey_i *);
+void bch2_logged_op_finish(struct btree_trans *, struct bkey_i *);
+
+#endif /* _BCACHEFS_LOGGED_OPS_H */
index 3e8b8f2f38a31fbc17f4f341442754792fdd9ecc..215a653322f3b49c58fa0a4a8c775c1785ff1aeb 100644 (file)
@@ -151,10 +151,10 @@ int bch2_check_lrus(struct bch_fs *c)
        int ret = 0;
 
        ret = bch2_trans_run(c,
-               for_each_btree_key_commit(&trans, iter,
+               for_each_btree_key_commit(trans, iter,
                                BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k,
                                NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
-                       bch2_check_lru_key(&trans, &iter, k, &last_flushed_pos)));
+                       bch2_check_lru_key(trans, &iter, k, &last_flushed_pos)));
        if (ret)
                bch_err_fn(c, ret);
        return ret;
index 81c8cdbac28597c9b88e140f47f357fd07b4ac83..e3a51f6d6c9b25dcae89934eace9e68b038531de 100644 (file)
@@ -10,7 +10,7 @@
 #include "buckets.h"
 #include "errcode.h"
 #include "extents.h"
-#include "io.h"
+#include "io_write.h"
 #include "journal.h"
 #include "keylist.h"
 #include "migrate.h"
@@ -78,34 +78,32 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
 
 static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        enum btree_id id;
        int ret = 0;
 
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-
        for (id = 0; id < BTREE_ID_NR; id++) {
                if (!btree_type_has_ptrs(id))
                        continue;
 
-               ret = for_each_btree_key_commit(&trans, iter, id, POS_MIN,
+               ret = for_each_btree_key_commit(trans, iter, id, POS_MIN,
                                BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
                                NULL, NULL, BTREE_INSERT_NOFAIL,
-                       bch2_dev_usrdata_drop_key(&trans, &iter, k, dev_idx, flags));
+                       bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags));
                if (ret)
                        break;
        }
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        return ret;
 }
 
 static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter iter;
        struct closure cl;
        struct btree *b;
@@ -117,16 +115,16 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
        if (flags & BCH_FORCE_IF_METADATA_LOST)
                return -EINVAL;
 
+       trans = bch2_trans_get(c);
        bch2_bkey_buf_init(&k);
-       bch2_trans_init(&trans, c, 0, 0);
        closure_init_stack(&cl);
 
        for (id = 0; id < BTREE_ID_NR; id++) {
-               bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0,
+               bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
                                          BTREE_ITER_PREFETCH);
 retry:
                ret = 0;
-               while (bch2_trans_begin(&trans),
+               while (bch2_trans_begin(trans),
                       (b = bch2_btree_iter_peek_node(&iter)) &&
                       !(ret = PTR_ERR_OR_ZERO(b))) {
                        if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx))
@@ -141,15 +139,14 @@ retry:
                                break;
                        }
 
-                       ret = bch2_btree_node_update_key(&trans, &iter, b, k.k, 0, false);
+                       ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false);
                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
                                ret = 0;
                                continue;
                        }
 
                        if (ret) {
-                               bch_err(c, "Error updating btree node key: %s",
-                                       bch2_err_str(ret));
+                               bch_err_msg(c, ret, "updating btree node key");
                                break;
                        }
 next:
@@ -158,7 +155,7 @@ next:
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        goto retry;
 
-               bch2_trans_iter_exit(&trans, &iter);
+               bch2_trans_iter_exit(trans, &iter);
 
                if (ret)
                        goto err;
@@ -167,8 +164,8 @@ next:
        bch2_btree_interior_updates_flush(c);
        ret = 0;
 err:
-       bch2_trans_exit(&trans);
        bch2_bkey_buf_exit(&k, c);
+       bch2_trans_put(trans);
 
        BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
 
index fb76a1dac74ec5af615f6a5ea0557dac1919cc46..39a14e3216807d222fc11856fa285448f4a24ca3 100644 (file)
@@ -14,7 +14,8 @@
 #include "errcode.h"
 #include "error.h"
 #include "inode.h"
-#include "io.h"
+#include "io_read.h"
+#include "io_write.h"
 #include "journal_reclaim.h"
 #include "keylist.h"
 #include "move.h"
@@ -524,7 +525,7 @@ static int __bch2_move_data(struct moving_context *ctxt,
        struct bch_fs *c = ctxt->c;
        struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
        struct bkey_buf sk;
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        struct data_update_opts data_opts;
@@ -532,7 +533,6 @@ static int __bch2_move_data(struct moving_context *ctxt,
        int ret = 0, ret2;
 
        bch2_bkey_buf_init(&sk);
-       bch2_trans_init(&trans, c, 0, 0);
 
        if (ctxt->stats) {
                ctxt->stats->data_type  = BCH_DATA_user;
@@ -540,15 +540,15 @@ static int __bch2_move_data(struct moving_context *ctxt,
                ctxt->stats->pos        = start;
        }
 
-       bch2_trans_iter_init(&trans, &iter, btree_id, start,
+       bch2_trans_iter_init(trans, &iter, btree_id, start,
                             BTREE_ITER_PREFETCH|
                             BTREE_ITER_ALL_SNAPSHOTS);
 
        if (ctxt->rate)
                bch2_ratelimit_reset(ctxt->rate);
 
-       while (!move_ratelimit(&trans, ctxt)) {
-               bch2_trans_begin(&trans);
+       while (!move_ratelimit(trans, ctxt)) {
+               bch2_trans_begin(trans);
 
                k = bch2_btree_iter_peek(&iter);
                if (!k.k)
@@ -569,7 +569,7 @@ static int __bch2_move_data(struct moving_context *ctxt,
                if (!bkey_extent_is_direct_data(k.k))
                        goto next_nondata;
 
-               ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum);
+               ret = move_get_io_opts(trans, &io_opts, k, &cur_inum);
                if (ret)
                        continue;
 
@@ -584,7 +584,7 @@ static int __bch2_move_data(struct moving_context *ctxt,
                bch2_bkey_buf_reassemble(&sk, c, k);
                k = bkey_i_to_s_c(sk.k);
 
-               ret2 = bch2_move_extent(&trans, &iter, ctxt, NULL,
+               ret2 = bch2_move_extent(trans, &iter, ctxt, NULL,
                                        io_opts, btree_id, k, data_opts);
                if (ret2) {
                        if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
@@ -592,7 +592,7 @@ static int __bch2_move_data(struct moving_context *ctxt,
 
                        if (ret2 == -ENOMEM) {
                                /* memory allocation failure, wait for some IO to finish */
-                               bch2_move_ctxt_wait_for_io(ctxt, &trans);
+                               bch2_move_ctxt_wait_for_io(ctxt, trans);
                                continue;
                        }
 
@@ -609,8 +609,8 @@ next_nondata:
                bch2_btree_iter_advance(&iter);
        }
 
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
+       bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
        bch2_bkey_buf_exit(&sk, c);
 
        return ret;
@@ -627,7 +627,7 @@ int bch2_move_data(struct bch_fs *c,
 {
        struct moving_context ctxt;
        enum btree_id id;
-       int ret;
+       int ret = 0;
 
        bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
 
@@ -723,7 +723,6 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
 
                if (!bp.level) {
                        const struct bch_extent_ptr *ptr;
-                       struct bkey_s_c k;
                        unsigned i = 0;
 
                        k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0);
@@ -826,15 +825,14 @@ int bch2_evacuate_bucket(struct bch_fs *c,
                         struct write_point_specifier wp,
                         bool wait_on_copygc)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct moving_context ctxt;
        int ret;
 
-       bch2_trans_init(&trans, c, 0, 0);
        bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
-       ret = __bch2_evacuate_bucket(&trans, &ctxt, NULL, bucket, gen, data_opts);
+       ret = __bch2_evacuate_bucket(trans, &ctxt, NULL, bucket, gen, data_opts);
        bch2_moving_ctxt_exit(&ctxt);
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        return ret;
 }
@@ -851,14 +849,13 @@ static int bch2_move_btree(struct bch_fs *c,
 {
        bool kthread = (current->flags & PF_KTHREAD) != 0;
        struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct btree *b;
        enum btree_id id;
        struct data_update_opts data_opts;
        int ret = 0;
 
-       bch2_trans_init(&trans, c, 0, 0);
        progress_list_add(c, stats);
 
        stats->data_type = BCH_DATA_btree;
@@ -871,11 +868,11 @@ static int bch2_move_btree(struct bch_fs *c,
                if (!bch2_btree_id_root(c, id)->b)
                        continue;
 
-               bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0,
+               bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
                                          BTREE_ITER_PREFETCH);
 retry:
                ret = 0;
-               while (bch2_trans_begin(&trans),
+               while (bch2_trans_begin(trans),
                       (b = bch2_btree_iter_peek_node(&iter)) &&
                       !(ret = PTR_ERR_OR_ZERO(b))) {
                        if (kthread && kthread_should_stop())
@@ -890,7 +887,7 @@ retry:
                        if (!pred(c, arg, b, &io_opts, &data_opts))
                                goto next;
 
-                       ret = bch2_btree_node_rewrite(&trans, &iter, b, 0) ?: ret;
+                       ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret;
                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                continue;
                        if (ret)
@@ -901,13 +898,13 @@ next:
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        goto retry;
 
-               bch2_trans_iter_exit(&trans, &iter);
+               bch2_trans_iter_exit(trans, &iter);
 
                if (kthread && kthread_should_stop())
                        break;
        }
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        if (ret)
                bch_err_fn(c, ret);
index c3136abe8587f5cced50bdfbeb8b015dd396f4ab..cbdd58db8782b24e06f03fbecd23efeaf8aaace7 100644 (file)
@@ -2,6 +2,7 @@
 #ifndef _BCACHEFS_MOVE_H
 #define _BCACHEFS_MOVE_H
 
+#include "bcachefs_ioctl.h"
 #include "btree_iter.h"
 #include "buckets.h"
 #include "data_update.h"
index 256431a6dc0caf502b11808877b625848048ad75..4017120baeeebddecee6180522fe99f84f291db1 100644 (file)
 #include "btree_write_buffer.h"
 #include "buckets.h"
 #include "clock.h"
-#include "disk_groups.h"
 #include "errcode.h"
 #include "error.h"
-#include "extents.h"
-#include "eytzinger.h"
-#include "io.h"
-#include "keylist.h"
 #include "lru.h"
 #include "move.h"
 #include "movinggc.h"
-#include "super-io.h"
 #include "trace.h"
 
-#include <linux/bsearch.h>
 #include <linux/freezer.h>
 #include <linux/kthread.h>
 #include <linux/math64.h>
 #include <linux/sched/task.h>
-#include <linux/sort.h>
 #include <linux/wait.h>
 
 struct buckets_in_flight {
@@ -156,7 +148,7 @@ static int bch2_copygc_get_buckets(struct btree_trans *trans,
        struct bch_fs *c = trans->c;
        struct btree_iter iter;
        struct bkey_s_c k;
-       size_t nr_to_get = max(16UL, buckets_in_flight->nr / 4);
+       size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4);
        size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0;
        int ret;
 
@@ -172,7 +164,7 @@ static int bch2_copygc_get_buckets(struct btree_trans *trans,
                                  lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX),
                                  0, k, ({
                struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) };
-               int ret = 0;
+               int ret2 = 0;
 
                saw++;
 
@@ -181,11 +173,11 @@ static int bch2_copygc_get_buckets(struct btree_trans *trans,
                else if (bucket_in_flight(buckets_in_flight, b.k))
                        in_flight++;
                else {
-                       ret = darray_push(buckets, b) ?: buckets->nr >= nr_to_get;
-                       if (ret >= 0)
+                       ret2 = darray_push(buckets, b) ?: buckets->nr >= nr_to_get;
+                       if (ret2 >= 0)
                                sectors += b.sectors;
                }
-               ret;
+               ret2;
        }));
 
        pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i",
@@ -242,7 +234,7 @@ err:
                ret = 0;
 
        if (ret < 0 && !bch2_err_matches(ret, EROFS))
-               bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret));
+               bch_err_msg(c, ret, "from bch2_move_data()");
 
        moved = atomic64_read(&ctxt->stats->sectors_moved) - moved;
        trace_and_count(c, copygc, c, moved, 0, 0, 0);
@@ -308,25 +300,24 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c)
 static int bch2_copygc_thread(void *arg)
 {
        struct bch_fs *c = arg;
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct moving_context ctxt;
        struct bch_move_stats move_stats;
        struct io_clock *clock = &c->io_clock[WRITE];
-       struct buckets_in_flight move_buckets;
+       struct buckets_in_flight buckets;
        u64 last, wait;
        int ret = 0;
 
-       memset(&move_buckets, 0, sizeof(move_buckets));
+       memset(&buckets, 0, sizeof(buckets));
 
-       ret = rhashtable_init(&move_buckets.table, &bch_move_bucket_params);
+       ret = rhashtable_init(&buckets.table, &bch_move_bucket_params);
        if (ret) {
-               bch_err(c, "error allocating copygc buckets in flight: %s",
-                       bch2_err_str(ret));
+               bch_err_msg(c, ret, "allocating copygc buckets in flight");
                return ret;
        }
 
        set_freezable();
-       bch2_trans_init(&trans, c, 0, 0);
+       trans = bch2_trans_get(c);
 
        bch2_move_stats_init(&move_stats, "copygc");
        bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats,
@@ -334,16 +325,16 @@ static int bch2_copygc_thread(void *arg)
                              false);
 
        while (!ret && !kthread_should_stop()) {
-               bch2_trans_unlock(&trans);
+               bch2_trans_unlock(trans);
                cond_resched();
 
                if (!c->copy_gc_enabled) {
-                       move_buckets_wait(&trans, &ctxt, &move_buckets, true);
+                       move_buckets_wait(trans, &ctxt, &buckets, true);
                        kthread_wait_freezable(c->copy_gc_enabled);
                }
 
                if (unlikely(freezing(current))) {
-                       move_buckets_wait(&trans, &ctxt, &move_buckets, true);
+                       move_buckets_wait(trans, &ctxt, &buckets, true);
                        __refrigerator(false);
                        continue;
                }
@@ -354,7 +345,7 @@ static int bch2_copygc_thread(void *arg)
                if (wait > clock->max_slop) {
                        c->copygc_wait_at = last;
                        c->copygc_wait = last + wait;
-                       move_buckets_wait(&trans, &ctxt, &move_buckets, true);
+                       move_buckets_wait(trans, &ctxt, &buckets, true);
                        trace_and_count(c, copygc_wait, c, wait, last + wait);
                        bch2_kthread_io_clock_wait(clock, last + wait,
                                        MAX_SCHEDULE_TIMEOUT);
@@ -364,15 +355,15 @@ static int bch2_copygc_thread(void *arg)
                c->copygc_wait = 0;
 
                c->copygc_running = true;
-               ret = bch2_copygc(&trans, &ctxt, &move_buckets);
+               ret = bch2_copygc(trans, &ctxt, &buckets);
                c->copygc_running = false;
 
                wake_up(&c->copygc_running_wq);
        }
 
-       move_buckets_wait(&trans, &ctxt, &move_buckets, true);
-       rhashtable_destroy(&move_buckets.table);
-       bch2_trans_exit(&trans);
+       move_buckets_wait(trans, &ctxt, &buckets, true);
+       rhashtable_destroy(&buckets.table);
+       bch2_trans_put(trans);
        bch2_moving_ctxt_exit(&ctxt);
 
        return 0;
@@ -404,7 +395,7 @@ int bch2_copygc_start(struct bch_fs *c)
        t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name);
        ret = PTR_ERR_OR_ZERO(t);
        if (ret) {
-               bch_err(c, "error creating copygc thread: %s", bch2_err_str(ret));
+               bch_err_msg(c, ret, "creating copygc thread");
                return ret;
        }
 
index 960bb247f3a0ab8b84c64e8f439738ffdf76b3b6..739a2ef80945b095a19aae195084888db817cd46 100644 (file)
@@ -471,8 +471,9 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
                        val = "0";
                }
 
+               /* Unknown options are ignored: */
                if (id < 0)
-                       goto bad_opt;
+                       continue;
 
                if (!(bch2_opt_table[id].flags & OPT_MOUNT))
                        goto bad_opt;
index 8a9db110d64fccee8a23e94b461efd09881a6a09..c21c258e40184b63e6fcee7ec708c6c2509e532e 100644 (file)
@@ -469,7 +469,7 @@ struct bch_opts {
 #undef x
 };
 
-static const struct bch_opts bch2_opts_default = {
+static const __maybe_unused struct bch_opts bch2_opts_default = {
 #define x(_name, _bits, _mode, _type, _sb_opt, _default, ...)          \
        ._name##_defined = true,                                        \
        ._name = _default,                                              \
index c41daa1806821198ad4f640f6dc508841718b51b..de41f9a144920b83fb816182a4c97fecc1df87bf 100644 (file)
@@ -81,8 +81,10 @@ void bch2_prt_printf(struct printbuf *out, const char *fmt, ...)
 }
 
 /**
- * printbuf_str - returns printbuf's buf as a C string, guaranteed to be null
- * terminated
+ * bch2_printbuf_str() - returns printbuf's buf as a C string, guaranteed to be
+ * null terminated
+ * @buf:       printbuf to terminate
+ * Returns:    Printbuf contents, as a nul terminated C string
  */
 const char *bch2_printbuf_str(const struct printbuf *buf)
 {
@@ -97,8 +99,9 @@ const char *bch2_printbuf_str(const struct printbuf *buf)
 }
 
 /**
- * printbuf_exit - exit a printbuf, freeing memory it owns and poisoning it
+ * bch2_printbuf_exit() - exit a printbuf, freeing memory it owns and poisoning it
  * against accidental use.
+ * @buf:       printbuf to exit
  */
 void bch2_printbuf_exit(struct printbuf *buf)
 {
@@ -120,7 +123,7 @@ void bch2_printbuf_tabstop_pop(struct printbuf *buf)
 }
 
 /*
- * printbuf_tabstop_set - add a tabstop, n spaces from the previous tabstop
+ * bch2_printbuf_tabstop_set() - add a tabstop, n spaces from the previous tabstop
  *
  * @buf: printbuf to control
  * @spaces: number of spaces from previous tabstop
@@ -144,7 +147,7 @@ int bch2_printbuf_tabstop_push(struct printbuf *buf, unsigned spaces)
 }
 
 /**
- * printbuf_indent_add - add to the current indent level
+ * bch2_printbuf_indent_add() - add to the current indent level
  *
  * @buf: printbuf to control
  * @spaces: number of spaces to add to the current indent level
@@ -164,7 +167,7 @@ void bch2_printbuf_indent_add(struct printbuf *buf, unsigned spaces)
 }
 
 /**
- * printbuf_indent_sub - subtract from the current indent level
+ * bch2_printbuf_indent_sub() - subtract from the current indent level
  *
  * @buf: printbuf to control
  * @spaces: number of spaces to subtract from the current indent level
@@ -227,9 +230,8 @@ static void __prt_tab(struct printbuf *out)
 }
 
 /**
- * prt_tab - Advance printbuf to the next tabstop
- *
- * @buf: printbuf to control
+ * bch2_prt_tab() - Advance printbuf to the next tabstop
+ * @out:       printbuf to control
  *
  * Advance output to the next tabstop by printing spaces.
  */
@@ -267,7 +269,7 @@ static void __prt_tab_rjust(struct printbuf *buf)
 }
 
 /**
- * prt_tab_rjust - Advance printbuf to the next tabstop, right justifying
+ * bch2_prt_tab_rjust - Advance printbuf to the next tabstop, right justifying
  * previous output
  *
  * @buf: printbuf to control
@@ -284,11 +286,11 @@ void bch2_prt_tab_rjust(struct printbuf *buf)
 }
 
 /**
- * prt_bytes_indented - Print an array of chars, handling embedded control characters
+ * bch2_prt_bytes_indented() - Print an array of chars, handling embedded control characters
  *
- * @out: printbuf to output to
- * @str: string to print
- * @count: number of bytes to print
+ * @out:       output printbuf
+ * @str:       string to print
+ * @count:     number of bytes to print
  *
  * The following control characters are handled as so:
  *   \n: prt_newline   newline that obeys current indent level
@@ -335,32 +337,38 @@ void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned cou
 }
 
 /**
- * prt_human_readable_u64 - Print out a u64 in human readable units
+ * bch2_prt_human_readable_u64() - Print out a u64 in human readable units
+ * @out:       output printbuf
+ * @v:         integer to print
  *
- * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units
+ * Units of 2^10 (default) or 10^3 are controlled via @out->si_units
  */
-void bch2_prt_human_readable_u64(struct printbuf *buf, u64 v)
+void bch2_prt_human_readable_u64(struct printbuf *out, u64 v)
 {
-       bch2_printbuf_make_room(buf, 10);
-       buf->pos += string_get_size(v, 1, !buf->si_units,
-                                   buf->buf + buf->pos,
-                                   printbuf_remaining_size(buf));
+       bch2_printbuf_make_room(out, 10);
+       out->pos += string_get_size(v, 1, !out->si_units,
+                                   out->buf + out->pos,
+                                   printbuf_remaining_size(out));
 }
 
 /**
- * prt_human_readable_s64 - Print out a s64 in human readable units
+ * bch2_prt_human_readable_s64() - Print out a s64 in human readable units
+ * @out:       output printbuf
+ * @v:         integer to print
  *
- * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units
+ * Units of 2^10 (default) or 10^3 are controlled via @out->si_units
  */
-void bch2_prt_human_readable_s64(struct printbuf *buf, s64 v)
+void bch2_prt_human_readable_s64(struct printbuf *out, s64 v)
 {
        if (v < 0)
-               prt_char(buf, '-');
-       bch2_prt_human_readable_u64(buf, abs(v));
+               prt_char(out, '-');
+       bch2_prt_human_readable_u64(out, abs(v));
 }
 
 /**
- * prt_units_u64 - Print out a u64 according to printbuf unit options
+ * bch2_prt_units_u64() - Print out a u64 according to printbuf unit options
+ * @out:       output printbuf
+ * @v:         integer to print
  *
  * Units are either raw (default), or human readable units (controlled via
  * @out->human_readable_units)
@@ -374,7 +382,9 @@ void bch2_prt_units_u64(struct printbuf *out, u64 v)
 }
 
 /**
- * prt_units_s64 - Print out a s64 according to printbuf unit options
+ * bch2_prt_units_s64() - Print out a s64 according to printbuf unit options
+ * @out:       output printbuf
+ * @v:         integer to print
  *
  * Units are either raw (default), or human readable units (controlled via
  * @out->human_readable_units)
index ca99772aedc61851ea9cc84faa5b68671a6c9360..36de2f071d8000e8218745f8d7c926cfc0c778a5 100644 (file)
@@ -572,7 +572,7 @@ static int bch2_fs_quota_read_inode(struct btree_trans *trans,
        if (!s_t.master_subvol)
                goto advance;
 
-       ret = bch2_inode_find_by_inum_trans(trans,
+       ret = bch2_inode_find_by_inum_nowarn_trans(trans,
                                (subvol_inum) {
                                        le32_to_cpu(s_t.master_subvol),
                                        k.k->p.offset,
@@ -599,7 +599,7 @@ advance:
 int bch2_fs_quota_read(struct bch_fs *c)
 {
        struct bch_sb_field_quota *sb_quota;
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret;
@@ -614,16 +614,16 @@ int bch2_fs_quota_read(struct bch_fs *c)
        bch2_sb_quota_read(c);
        mutex_unlock(&c->sb_lock);
 
-       bch2_trans_init(&trans, c, 0, 0);
+       trans = bch2_trans_get(c);
 
-       ret = for_each_btree_key2(&trans, iter, BTREE_ID_quotas,
+       ret = for_each_btree_key2(trans, iter, BTREE_ID_quotas,
                        POS_MIN, BTREE_ITER_PREFETCH, k,
                __bch2_quota_set(c, k, NULL)) ?:
-             for_each_btree_key2(&trans, iter, BTREE_ID_inodes,
+             for_each_btree_key2(trans, iter, BTREE_ID_inodes,
                        POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
-               bch2_fs_quota_read_inode(&trans, &iter, k));
+               bch2_fs_quota_read_inode(trans, &iter, k));
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        if (ret)
                bch_err_fn(c, ret);
@@ -786,7 +786,6 @@ static int bch2_quota_set_info(struct super_block *sb, int type,
 {
        struct bch_fs *c = sb->s_fs_info;
        struct bch_sb_field_quota *sb_quota;
-       struct bch_memquota_type *q;
        int ret = 0;
 
        if (0) {
@@ -810,8 +809,6 @@ static int bch2_quota_set_info(struct super_block *sb, int type,
            ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS))
                return -EINVAL;
 
-       q = &c->quotas[type];
-
        mutex_lock(&c->sb_lock);
        sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
        if (!sb_quota) {
@@ -959,7 +956,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid,
        new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid));
 
        ret = bch2_trans_do(c, NULL, NULL, 0,
-                           bch2_set_quota_trans(&trans, &new_quota, qdq)) ?:
+                           bch2_set_quota_trans(trans, &new_quota, qdq)) ?:
                __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq);
 
        return bch2_err_class(ret);
index 15ce3ecba0baf2e08412bff7f5601a183dbf636f..568f1e8e7507e73913ff70c2e1769b9750f3289e 100644 (file)
@@ -8,8 +8,6 @@
 #include "compress.h"
 #include "disk_groups.h"
 #include "errcode.h"
-#include "extents.h"
-#include "io.h"
 #include "move.h"
 #include "rebalance.h"
 #include "super-io.h"
@@ -350,7 +348,7 @@ int bch2_rebalance_start(struct bch_fs *c)
        p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name);
        ret = PTR_ERR_OR_ZERO(p);
        if (ret) {
-               bch_err(c, "error creating rebalance thread: %s", bch2_err_str(ret));
+               bch_err_msg(c, ret, "creating rebalance thread");
                return ret;
        }
 
index 30efb3c9056009f930bbe0da6b9fde6cd0255c07..1dceb7eeb20562663e9297d16be728cb59e6a9f8 100644 (file)
@@ -20,6 +20,7 @@
 #include "journal_reclaim.h"
 #include "journal_seq_blacklist.h"
 #include "lru.h"
+#include "logged_ops.h"
 #include "move.h"
 #include "quota.h"
 #include "recovery.h"
@@ -164,7 +165,7 @@ static int bch2_journal_replay(struct bch_fs *c)
                                    (!k->allocated
                                     ? BTREE_INSERT_JOURNAL_REPLAY|BCH_WATERMARK_reclaim
                                     : 0),
-                            bch2_journal_replay_key(&trans, k));
+                            bch2_journal_replay_key(trans, k));
                if (ret) {
                        bch_err(c, "journal replay: error while replaying key at btree %s level %u: %s",
                                bch2_btree_ids[k->btree_id], k->level, bch2_err_str(ret));
@@ -422,15 +423,9 @@ static int bch2_initialize_subvolumes(struct bch_fs *c)
        root_volume.v.snapshot  = cpu_to_le32(U32_MAX);
        root_volume.v.inode     = cpu_to_le64(BCACHEFS_ROOT_INO);
 
-       ret =   bch2_btree_insert(c, BTREE_ID_snapshot_trees,
-                                 &root_tree.k_i,
-                                 NULL, NULL, 0) ?:
-               bch2_btree_insert(c, BTREE_ID_snapshots,
-                                 &root_snapshot.k_i,
-                                 NULL, NULL, 0) ?:
-               bch2_btree_insert(c, BTREE_ID_subvolumes,
-                                 &root_volume.k_i,
-                                 NULL, NULL, 0);
+       ret =   bch2_btree_insert(c, BTREE_ID_snapshot_trees,   &root_tree.k_i, NULL, 0) ?:
+               bch2_btree_insert(c, BTREE_ID_snapshots,        &root_snapshot.k_i, NULL, 0) ?:
+               bch2_btree_insert(c, BTREE_ID_subvolumes,       &root_volume.k_i, NULL, 0);
        if (ret)
                bch_err_fn(c, ret);
        return ret;
@@ -471,7 +466,7 @@ noinline_for_stack
 static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
 {
        int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW,
-                               __bch2_fs_upgrade_for_subvolumes(&trans));
+                               __bch2_fs_upgrade_for_subvolumes(trans));
        if (ret)
                bch_err_fn(c, ret);
        return ret;
@@ -561,7 +556,7 @@ static void check_version_upgrade(struct bch_fs *c)
                        if ((recovery_passes & RECOVERY_PASS_ALL_FSCK) == RECOVERY_PASS_ALL_FSCK)
                                prt_str(&buf, "fsck required");
                        else {
-                               prt_str(&buf, "running recovery passses: ");
+                               prt_str(&buf, "running recovery passes: ");
                                prt_bitflags(&buf, bch2_recovery_passes, recovery_passes);
                        }
 
@@ -1009,9 +1004,7 @@ int bch2_fs_initialize(struct bch_fs *c)
        bch2_inode_pack(&packed_inode, &root_inode);
        packed_inode.inode.k.p.snapshot = U32_MAX;
 
-       ret = bch2_btree_insert(c, BTREE_ID_inodes,
-                               &packed_inode.inode.k_i,
-                               NULL, NULL, 0);
+       ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0);
        if (ret) {
                bch_err_msg(c, ret, "creating root directory");
                goto err;
@@ -1020,7 +1013,7 @@ int bch2_fs_initialize(struct bch_fs *c)
        bch2_inode_init_early(c, &lostfound_inode);
 
        ret = bch2_trans_do(c, NULL, NULL, 0,
-               bch2_create_trans(&trans,
+               bch2_create_trans(trans,
                                  BCACHEFS_ROOT_SUBVOL_INUM,
                                  &root_inode, &lostfound_inode,
                                  &lostfound,
index abf1f834ec7a86434b42b3d46ba43fe5a7102242..f3c9ea7720ca53cee860fcb7a11d9f9f85e26866 100644 (file)
@@ -24,6 +24,7 @@
        x(check_alloc_to_lru_refs,      PASS_FSCK)                                              \
        x(fs_freespace_init,            PASS_ALWAYS|PASS_SILENT)                                \
        x(bucket_gens_init,             0)                                                      \
+       x(resume_logged_ops,            PASS_ALWAYS)                                            \
        x(check_snapshot_trees,         PASS_FSCK)                                              \
        x(check_snapshots,              PASS_FSCK)                                              \
        x(check_subvols,                PASS_FSCK)                                              \
index 39f711d5069e9f0f483fa9cfb0ad1d2ffced92be..d77d0ea9affffe14b71a7a5a377a38c1a4143672 100644 (file)
@@ -5,9 +5,11 @@
 #include "buckets.h"
 #include "extents.h"
 #include "inode.h"
-#include "io.h"
+#include "io_misc.h"
+#include "io_write.h"
 #include "reflink.h"
 #include "subvolume.h"
+#include "super-io.h"
 
 #include <linux/sched/signal.h>
 
@@ -89,6 +91,9 @@ void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c,
        bch2_bkey_ptrs_to_text(out, c, k);
 }
 
+#if 0
+Currently disabled, needs to be debugged:
+
 bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
 {
        struct bkey_s_reflink_v   l = bkey_s_to_reflink_v(_l);
@@ -96,6 +101,7 @@ bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r
 
        return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r);
 }
+#endif
 
 int bch2_trans_mark_reflink_v(struct btree_trans *trans,
                              enum btree_id btree_id, unsigned level,
@@ -247,7 +253,7 @@ s64 bch2_remap_range(struct bch_fs *c,
                     u64 remap_sectors,
                     u64 new_i_size, s64 *i_sectors_delta)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter dst_iter, src_iter;
        struct bkey_s_c src_k;
        struct bkey_buf new_dst, new_src;
@@ -269,11 +275,11 @@ s64 bch2_remap_range(struct bch_fs *c,
 
        bch2_bkey_buf_init(&new_dst);
        bch2_bkey_buf_init(&new_src);
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096);
+       trans = bch2_trans_get(c);
 
-       bch2_trans_iter_init(&trans, &src_iter, BTREE_ID_extents, src_start,
+       bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start,
                             BTREE_ITER_INTENT);
-       bch2_trans_iter_init(&trans, &dst_iter, BTREE_ID_extents, dst_start,
+       bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start,
                             BTREE_ITER_INTENT);
 
        while ((ret == 0 ||
@@ -281,21 +287,21 @@ s64 bch2_remap_range(struct bch_fs *c,
               bkey_lt(dst_iter.pos, dst_end)) {
                struct disk_reservation disk_res = { 0 };
 
-               bch2_trans_begin(&trans);
+               bch2_trans_begin(trans);
 
                if (fatal_signal_pending(current)) {
                        ret = -EINTR;
                        break;
                }
 
-               ret = bch2_subvolume_get_snapshot(&trans, src_inum.subvol,
+               ret = bch2_subvolume_get_snapshot(trans, src_inum.subvol,
                                                  &src_snapshot);
                if (ret)
                        continue;
 
                bch2_btree_iter_set_snapshot(&src_iter, src_snapshot);
 
-               ret = bch2_subvolume_get_snapshot(&trans, dst_inum.subvol,
+               ret = bch2_subvolume_get_snapshot(trans, dst_inum.subvol,
                                                  &dst_snapshot);
                if (ret)
                        continue;
@@ -312,7 +318,7 @@ s64 bch2_remap_range(struct bch_fs *c,
                        continue;
 
                if (bkey_lt(src_want, src_iter.pos)) {
-                       ret = bch2_fpunch_at(&trans, &dst_iter, dst_inum,
+                       ret = bch2_fpunch_at(trans, &dst_iter, dst_inum,
                                        min(dst_end.offset,
                                            dst_iter.pos.offset +
                                            src_iter.pos.offset - src_want.offset),
@@ -326,7 +332,7 @@ s64 bch2_remap_range(struct bch_fs *c,
                        bch2_bkey_buf_reassemble(&new_src, c, src_k);
                        src_k = bkey_i_to_s_c(new_src.k);
 
-                       ret = bch2_make_extent_indirect(&trans, &src_iter,
+                       ret = bch2_make_extent_indirect(trans, &src_iter,
                                                new_src.k);
                        if (ret)
                                continue;
@@ -354,14 +360,14 @@ s64 bch2_remap_range(struct bch_fs *c,
                                min(src_k.k->p.offset - src_want.offset,
                                    dst_end.offset - dst_iter.pos.offset));
 
-               ret = bch2_extent_update(&trans, dst_inum, &dst_iter,
+               ret = bch2_extent_update(trans, dst_inum, &dst_iter,
                                         new_dst.k, &disk_res,
                                         new_i_size, i_sectors_delta,
                                         true);
                bch2_disk_reservation_put(c, &disk_res);
        }
-       bch2_trans_iter_exit(&trans, &dst_iter);
-       bch2_trans_iter_exit(&trans, &src_iter);
+       bch2_trans_iter_exit(trans, &dst_iter);
+       bch2_trans_iter_exit(trans, &src_iter);
 
        BUG_ON(!ret && !bkey_eq(dst_iter.pos, dst_end));
        BUG_ON(bkey_gt(dst_iter.pos, dst_end));
@@ -373,23 +379,23 @@ s64 bch2_remap_range(struct bch_fs *c,
                struct bch_inode_unpacked inode_u;
                struct btree_iter inode_iter = { NULL };
 
-               bch2_trans_begin(&trans);
+               bch2_trans_begin(trans);
 
-               ret2 = bch2_inode_peek(&trans, &inode_iter, &inode_u,
+               ret2 = bch2_inode_peek(trans, &inode_iter, &inode_u,
                                       dst_inum, BTREE_ITER_INTENT);
 
                if (!ret2 &&
                    inode_u.bi_size < new_i_size) {
                        inode_u.bi_size = new_i_size;
-                       ret2  = bch2_inode_write(&trans, &inode_iter, &inode_u) ?:
-                               bch2_trans_commit(&trans, NULL, NULL,
+                       ret2  = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
+                               bch2_trans_commit(trans, NULL, NULL,
                                                  BTREE_INSERT_NOFAIL);
                }
 
-               bch2_trans_iter_exit(&trans, &inode_iter);
+               bch2_trans_iter_exit(trans, &inode_iter);
        } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart));
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        bch2_bkey_buf_exit(&new_src, c);
        bch2_bkey_buf_exit(&new_dst, c);
 
index 5b591c59bc3eadaf86ffd83360609950c9eaad5a..dbef41cd8593709bdd25f94866c9f796c1cbd086 100644 (file)
@@ -429,7 +429,7 @@ out:
 
        return ret;
 err:
-       bch_err(c, "error adding replicas entry: %s", bch2_err_str(ret));
+       bch_err_msg(c, ret, "adding replicas entry");
        goto out;
 }
 
index 14cffa68d7222efa083266834a5e3223c6900370..458a1de0a6e39c89bb6840be729a4fc3f2a52ec5 100644 (file)
@@ -31,7 +31,6 @@ static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type);
 #define SIX_LOCK_HELD_intent           (1U << 26)
 #define SIX_LOCK_HELD_write            (1U << 27)
 #define SIX_LOCK_WAITING_read          (1U << (28 + SIX_LOCK_read))
-#define SIX_LOCK_WAITING_intent                (1U << (28 + SIX_LOCK_intent))
 #define SIX_LOCK_WAITING_write         (1U << (28 + SIX_LOCK_write))
 #define SIX_LOCK_NOSPIN                        (1U << 31)
 
index 03ae280aee3a668e67179a9d8a675291996020e7..cdf9eda2ee0284c764583a3ddb8a535d67459680 100644 (file)
@@ -163,8 +163,7 @@ static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
 
        rcu_assign_pointer(c->snapshots, new);
        c->snapshot_table_size = new_size;
-       if (old)
-               kvfree_rcu(old);
+       kvfree_rcu_mightsleep(old);
 
        return &rcu_dereference_protected(c->snapshots, true)->s[idx];
 }
@@ -344,7 +343,7 @@ int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
                                       BTREE_ITER_WITH_UPDATES, snapshot, s);
 }
 
-int bch2_snapshot_live(struct btree_trans *trans, u32 id)
+static int bch2_snapshot_live(struct btree_trans *trans, u32 id)
 {
        struct bch_snapshot v;
        int ret;
@@ -371,7 +370,7 @@ int bch2_snapshot_live(struct btree_trans *trans, u32 id)
  * it's part of such a linear chain: this correctly sets equivalence classes on
  * startup if we run leaf to root (i.e. in natural key order).
  */
-int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k)
+static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k)
 {
        struct bch_fs *c = trans->c;
        unsigned i, nr_live = 0, live_idx = 0;
@@ -488,18 +487,18 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
        bch2_trans_iter_exit(trans, &iter);
 
        if (!ret && !found) {
-               struct bkey_i_subvolume *s;
+               struct bkey_i_subvolume *u;
 
                *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root);
 
-               s = bch2_bkey_get_mut_typed(trans, &iter,
+               u = bch2_bkey_get_mut_typed(trans, &iter,
                                            BTREE_ID_subvolumes, POS(0, *subvol_id),
                                            0, subvolume);
-               ret = PTR_ERR_OR_ZERO(s);
+               ret = PTR_ERR_OR_ZERO(u);
                if (ret)
                        return ret;
 
-               SET_BCH_SUBVOLUME_SNAP(&s->v, false);
+               SET_BCH_SUBVOLUME_SNAP(&u->v, false);
        }
 
        return ret;
@@ -591,11 +590,11 @@ int bch2_check_snapshot_trees(struct bch_fs *c)
        int ret;
 
        ret = bch2_trans_run(c,
-               for_each_btree_key_commit(&trans, iter,
+               for_each_btree_key_commit(trans, iter,
                        BTREE_ID_snapshot_trees, POS_MIN,
                        BTREE_ITER_PREFETCH, k,
                        NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
-               check_snapshot_tree(&trans, &iter, k)));
+               check_snapshot_tree(trans, &iter, k)));
 
        if (ret)
                bch_err(c, "error %i checking snapshot trees", ret);
@@ -864,11 +863,11 @@ int bch2_check_snapshots(struct bch_fs *c)
         * the parent's depth already be correct:
         */
        ret = bch2_trans_run(c,
-               for_each_btree_key_reverse_commit(&trans, iter,
+               for_each_btree_key_reverse_commit(trans, iter,
                        BTREE_ID_snapshots, POS_MAX,
                        BTREE_ITER_PREFETCH, k,
                        NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
-               check_snapshot(&trans, &iter, k)));
+               check_snapshot(trans, &iter, k)));
        if (ret)
                bch_err_fn(c, ret);
        return ret;
@@ -911,7 +910,7 @@ static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
                swap(s->children[0], s->children[1]);
 }
 
-int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
+static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
 {
        struct bch_fs *c = trans->c;
        struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
@@ -1072,6 +1071,10 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
                        goto err;
 
                new_snapids[i]  = iter.pos.offset;
+
+               mutex_lock(&c->snapshot_table_lock);
+               snapshot_t_mut(c, new_snapids[i])->equiv = new_snapids[i];
+               mutex_unlock(&c->snapshot_table_lock);
        }
 err:
        bch2_trans_iter_exit(trans, &iter);
@@ -1354,7 +1357,7 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
 
 int bch2_delete_dead_snapshots(struct bch_fs *c)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_s_c_snapshot snap;
@@ -1366,35 +1369,35 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
        if (!test_bit(BCH_FS_STARTED, &c->flags)) {
                ret = bch2_fs_read_write_early(c);
                if (ret) {
-                       bch_err(c, "error deleleting dead snapshots: error going rw: %s", bch2_err_str(ret));
+                       bch_err_msg(c, ret, "deleting dead snapshots: error going rw");
                        return ret;
                }
        }
 
-       bch2_trans_init(&trans, c, 0, 0);
+       trans = bch2_trans_get(c);
 
        /*
         * For every snapshot node: If we have no live children and it's not
         * pointed to by a subvolume, delete it:
         */
-       ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots,
+       ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots,
                        POS_MIN, 0, k,
                        NULL, NULL, 0,
-               bch2_delete_redundant_snapshot(&trans, &iter, k));
+               bch2_delete_redundant_snapshot(trans, &iter, k));
        if (ret) {
-               bch_err(c, "error deleting redundant snapshots: %s", bch2_err_str(ret));
+               bch_err_msg(c, ret, "deleting redundant snapshots");
                goto err;
        }
 
-       for_each_btree_key2(&trans, iter, BTREE_ID_snapshots,
-                          POS_MIN, 0, k,
-               bch2_snapshot_set_equiv(&trans, k));
+       ret = for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
+                                 POS_MIN, 0, k,
+               bch2_snapshot_set_equiv(trans, k));
        if (ret) {
-               bch_err(c, "error in bch2_snapshots_set_equiv: %s", bch2_err_str(ret));
+               bch_err_msg(c, ret, "in bch2_snapshots_set_equiv");
                goto err;
        }
 
-       for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
+       for_each_btree_key(trans, iter, BTREE_ID_snapshots,
                           POS_MIN, 0, k, ret) {
                if (k.k->type != KEY_TYPE_snapshot)
                        continue;
@@ -1406,7 +1409,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
                                break;
                }
        }
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
 
        if (ret) {
                bch_err_msg(c, ret, "walking snapshots");
@@ -1421,16 +1424,16 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
                if (!btree_type_has_snapshots(id))
                        continue;
 
-               ret = for_each_btree_key_commit(&trans, iter,
+               ret = for_each_btree_key_commit(trans, iter,
                                id, POS_MIN,
                                BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
                                &res, NULL, BTREE_INSERT_NOFAIL,
-                       snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?:
-                     for_each_btree_key_commit(&trans, iter,
+                       snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?:
+                     for_each_btree_key_commit(trans, iter,
                                id, POS_MIN,
                                BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
                                &res, NULL, BTREE_INSERT_NOFAIL,
-                       move_key_to_correct_snapshot(&trans, &iter, k));
+                       move_key_to_correct_snapshot(trans, &iter, k));
 
                bch2_disk_reservation_put(c, &res);
                darray_exit(&equiv_seen);
@@ -1441,7 +1444,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
                }
        }
 
-       for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
+       for_each_btree_key(trans, iter, BTREE_ID_snapshots,
                           POS_MIN, 0, k, ret) {
                u32 snapshot = k.k->p.offset;
                u32 equiv = bch2_snapshot_equiv(c, snapshot);
@@ -1449,23 +1452,23 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
                if (equiv != snapshot)
                        snapshot_list_add(c, &deleted_interior, snapshot);
        }
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
 
        /*
         * Fixing children of deleted snapshots can't be done completely
         * atomically, if we crash between here and when we delete the interior
         * nodes some depth fields will be off:
         */
-       ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, POS_MIN,
+       ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
                                  BTREE_ITER_INTENT, k,
                                  NULL, NULL, BTREE_INSERT_NOFAIL,
-               bch2_fix_child_of_deleted_snapshot(&trans, &iter, k, &deleted_interior));
+               bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior));
        if (ret)
                goto err;
 
        darray_for_each(deleted, i) {
-               ret = commit_do(&trans, NULL, NULL, 0,
-                       bch2_snapshot_node_delete(&trans, *i));
+               ret = commit_do(trans, NULL, NULL, 0,
+                       bch2_snapshot_node_delete(trans, *i));
                if (ret) {
                        bch_err_msg(c, ret, "deleting snapshot %u", *i);
                        goto err;
@@ -1473,8 +1476,8 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
        }
 
        darray_for_each(deleted_interior, i) {
-               ret = commit_do(&trans, NULL, NULL, 0,
-                       bch2_snapshot_node_delete(&trans, *i));
+               ret = commit_do(trans, NULL, NULL, 0,
+                       bch2_snapshot_node_delete(trans, *i));
                if (ret) {
                        bch_err_msg(c, ret, "deleting snapshot %u", *i);
                        goto err;
@@ -1485,7 +1488,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
 err:
        darray_exit(&deleted_interior);
        darray_exit(&deleted);
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        if (ret)
                bch_err_fn(c, ret);
        return ret;
@@ -1618,7 +1621,8 @@ int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans,
 {
        struct bch_fs *c = trans->c;
        struct bkey_buf sk;
-       int ret;
+       u32 restart_count = trans->restart_count;
+       int ret = 0;
 
        bch2_bkey_buf_init(&sk);
        bch2_bkey_buf_reassemble(&sk, c, k);
@@ -1640,7 +1644,8 @@ int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans,
        }
 
        bch2_bkey_buf_exit(&sk, c);
-       return ret;
+
+       return ret ?: trans_was_restarted(trans, restart_count);
 }
 
 int bch2_snapshots_read(struct bch_fs *c)
@@ -1650,11 +1655,11 @@ int bch2_snapshots_read(struct bch_fs *c)
        int ret = 0;
 
        ret = bch2_trans_run(c,
-               for_each_btree_key2(&trans, iter, BTREE_ID_snapshots,
+               for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
                           POS_MIN, 0, k,
-                       bch2_mark_snapshot(&trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
-                       bch2_snapshot_set_equiv(&trans, k)) ?:
-               for_each_btree_key2(&trans, iter, BTREE_ID_snapshots,
+                       bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
+                       bch2_snapshot_set_equiv(trans, k)) ?:
+               for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
                           POS_MIN, 0, k,
                           (set_is_ancestor_bitmap(c, k.k->p.offset), 0)));
        if (ret)
index dabc9b9d921b4766be3f84d1b9bc4c494a518f8f..de215d9d1252549db99d7d8c6cea352f90264382 100644 (file)
@@ -235,8 +235,6 @@ int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
                         struct bch_snapshot *s);
 int bch2_snapshot_get_subvol(struct btree_trans *, u32,
                             struct bch_subvolume *);
-int bch2_snapshot_live(struct btree_trans *trans, u32 id);
-int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k);
 
 /* only exported for tests: */
 int bch2_snapshot_node_create(struct btree_trans *, u32,
index 0214a98deb4ff2a566eaea9915457182af04bd34..caf2dd7dafff65e636b32d8893ead65bfa6dd150 100644 (file)
@@ -41,8 +41,7 @@ static int check_subvol(struct btree_trans *trans,
 
                ret = bch2_subvolume_delete(trans, iter->pos.offset);
                if (ret)
-                       bch_err(c, "error deleting subvolume %llu: %s",
-                               iter->pos.offset, bch2_err_str(ret));
+                       bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
                return ret ?: -BCH_ERR_transaction_restart_nested;
        }
 
@@ -87,10 +86,10 @@ int bch2_check_subvols(struct bch_fs *c)
        int ret;
 
        ret = bch2_trans_run(c,
-               for_each_btree_key_commit(&trans, iter,
+               for_each_btree_key_commit(trans, iter,
                        BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
                        NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
-               check_subvol(&trans, &iter, k)));
+               check_subvol(trans, &iter, k)));
        if (ret)
                bch_err_fn(c, ret);
        return ret;
@@ -99,7 +98,7 @@ int bch2_check_subvols(struct bch_fs *c)
 /* Subvolumes: */
 
 int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                          unsigned flags, struct printbuf *err)
+                          enum bkey_invalid_flags flags, struct printbuf *err)
 {
        if (bkey_lt(k.k->p, SUBVOL_POS_MIN) ||
            bkey_gt(k.k->p, SUBVOL_POS_MAX)) {
@@ -294,9 +293,9 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor
                bch2_evict_subvolume_inodes(c, &s);
 
                for (id = s.data; id < s.data + s.nr; id++) {
-                       ret = bch2_trans_run(c, bch2_subvolume_delete(&trans, *id));
+                       ret = bch2_trans_run(c, bch2_subvolume_delete(trans, *id));
                        if (ret) {
-                               bch_err(c, "error deleting subvolume %u: %s", *id, bch2_err_str(ret));
+                               bch_err_msg(c, ret, "deleting subvolume %u", *id);
                                break;
                        }
                }
index 8d4c50f4cd059b05743fac7d5d319f16a60c93a8..bb14f92e8687185c4a702e643072695f28e2edf8 100644 (file)
@@ -10,7 +10,7 @@ enum bkey_invalid_flags;
 int bch2_check_subvols(struct bch_fs *);
 
 int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c,
-                          unsigned, struct printbuf *);
+                          enum bkey_invalid_flags, struct printbuf *);
 void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
 #define bch2_bkey_ops_subvolume ((struct bkey_ops) {           \
index b6021b734bf02c1cc3e770b8717315274778874a..c9bf342d14aa694e494f5a04e754a68c625ec58b 100644 (file)
@@ -6,7 +6,6 @@
 #include "disk_groups.h"
 #include "ec.h"
 #include "error.h"
-#include "io.h"
 #include "journal.h"
 #include "journal_sb.h"
 #include "journal_seq_blacklist.h"
@@ -23,6 +22,9 @@
 #include <linux/backing-dev.h>
 #include <linux/sort.h>
 
+static const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
+};
+
 struct bch2_metadata_version {
        u16             version;
        const char      *name;
@@ -161,7 +163,8 @@ void bch2_free_super(struct bch_sb_handle *sb)
 {
        kfree(sb->bio);
        if (!IS_ERR_OR_NULL(sb->bdev))
-               blkdev_put(sb->bdev, sb->mode);
+               blkdev_put(sb->bdev, sb->holder);
+       kfree(sb->holder);
 
        kfree(sb->sb);
        memset(sb, 0, sizeof(*sb));
@@ -182,7 +185,7 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
        if (sb->sb && sb->buffer_size >= new_buffer_size)
                return 0;
 
-       if (sb->have_layout) {
+       if (sb->sb && sb->have_layout) {
                u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
 
                if (new_bytes > max_bytes) {
@@ -243,9 +246,9 @@ struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb,
                /* XXX: we're not checking that offline device have enough space */
 
                for_each_online_member(ca, c, i) {
-                       struct bch_sb_handle *sb = &ca->disk_sb;
+                       struct bch_sb_handle *dev_sb = &ca->disk_sb;
 
-                       if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) {
+                       if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) {
                                percpu_ref_put(&ca->ref);
                                return NULL;
                        }
@@ -381,7 +384,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
        }
 
        if (bch2_is_zero(sb->uuid.b, sizeof(sb->uuid))) {
-               prt_printf(out, "Bad intenal UUID (got zeroes)");
+               prt_printf(out, "Bad internal UUID (got zeroes)");
                return -BCH_ERR_invalid_sb_uuid;
        }
 
@@ -664,27 +667,30 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
 retry:
 #endif
        memset(sb, 0, sizeof(*sb));
-       sb->mode        = FMODE_READ;
+       sb->mode        = BLK_OPEN_READ;
        sb->have_bio    = true;
+       sb->holder      = kmalloc(1, GFP_KERNEL);
+       if (!sb->holder)
+               return -ENOMEM;
 
 #ifndef __KERNEL__
        if (opt_get(*opts, direct_io) == false)
-               sb->mode |= FMODE_BUFFERED;
+               sb->mode |= BLK_OPEN_BUFFERED;
 #endif
 
        if (!opt_get(*opts, noexcl))
-               sb->mode |= FMODE_EXCL;
+               sb->mode |= BLK_OPEN_EXCL;
 
        if (!opt_get(*opts, nochanges))
-               sb->mode |= FMODE_WRITE;
+               sb->mode |= BLK_OPEN_WRITE;
 
-       sb->bdev = blkdev_get_by_path(path, sb->mode, sb);
+       sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
        if (IS_ERR(sb->bdev) &&
            PTR_ERR(sb->bdev) == -EACCES &&
            opt_get(*opts, read_only)) {
-               sb->mode &= ~FMODE_WRITE;
+               sb->mode &= ~BLK_OPEN_WRITE;
 
-               sb->bdev = blkdev_get_by_path(path, sb->mode, sb);
+               sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
                if (!IS_ERR(sb->bdev))
                        opt_set(*opts, nochanges, true);
        }
index e7dbc31be36d84f9b91f77cf348e072bf60932f1..e94a63a22704a3b2490348eec763b2e0cf507a7a 100644 (file)
@@ -35,7 +35,8 @@
 #include "fs-io-direct.h"
 #include "fsck.h"
 #include "inode.h"
-#include "io.h"
+#include "io_read.h"
+#include "io_write.h"
 #include "journal.h"
 #include "journal_reclaim.h"
 #include "journal_seq_blacklist.h"
@@ -68,6 +69,7 @@
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
+MODULE_DESCRIPTION("bcachefs filesystem");
 
 #define KTYPE(type)                                                    \
 static const struct attribute_group type ## _group = {                 \
@@ -421,6 +423,10 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
                return ret;
        }
 
+       ret = bch2_journal_reclaim_start(&c->journal);
+       if (ret)
+               goto err;
+
        if (!early) {
                ret = bch2_fs_read_write_late(c);
                if (ret)
@@ -430,7 +436,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
 #ifndef BCH_WRITE_REF_DEBUG
        percpu_ref_reinit(&c->writes);
 #else
-       for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) {
+       for (i = 0; i < BCH_WRITE_REF_NR; i++) {
                BUG_ON(atomic_long_read(&c->writes[i]));
                atomic_long_inc(&c->writes[i]);
        }
@@ -465,7 +471,6 @@ int bch2_fs_read_write_early(struct bch_fs *c)
 static void __bch2_fs_free(struct bch_fs *c)
 {
        unsigned i;
-       int cpu;
 
        for (i = 0; i < BCH_TIME_STAT_NR; i++)
                bch2_time_stats_exit(&c->times[i]);
@@ -479,7 +484,8 @@ static void __bch2_fs_free(struct bch_fs *c)
        bch2_fs_fsio_exit(c);
        bch2_fs_ec_exit(c);
        bch2_fs_encryption_exit(c);
-       bch2_fs_io_exit(c);
+       bch2_fs_io_write_exit(c);
+       bch2_fs_io_read_exit(c);
        bch2_fs_buckets_waiting_for_journal_exit(c);
        bch2_fs_btree_interior_update_exit(c);
        bch2_fs_btree_iter_exit(c);
@@ -496,12 +502,7 @@ static void __bch2_fs_free(struct bch_fs *c)
        percpu_free_rwsem(&c->mark_lock);
        free_percpu(c->online_reserved);
 
-       if (c->btree_paths_bufs)
-               for_each_possible_cpu(cpu)
-                       kfree(per_cpu_ptr(c->btree_paths_bufs, cpu)->path);
-
        darray_exit(&c->btree_roots_extra);
-       free_percpu(c->btree_paths_bufs);
        free_percpu(c->pcpu);
        mempool_exit(&c->large_bkey_pool);
        mempool_exit(&c->btree_bounce_pool);
@@ -581,8 +582,6 @@ void bch2_fs_free(struct bch_fs *c)
 {
        unsigned i;
 
-       BUG_ON(!test_bit(BCH_FS_STOPPING, &c->flags));
-
        mutex_lock(&bch_fs_list_lock);
        list_del(&c->list);
        mutex_unlock(&bch_fs_list_lock);
@@ -787,6 +786,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc;
        if (c->opts.inodes_use_key_cache)
                c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes;
+       c->btree_key_cache_btrees |= 1U << BTREE_ID_logged_ops;
 
        c->block_bits           = ilog2(block_sectors(c));
        c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c);
@@ -824,7 +824,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
                        BIOSET_NEED_BVECS) ||
            !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
            !(c->online_reserved = alloc_percpu(u64)) ||
-           !(c->btree_paths_bufs = alloc_percpu(struct btree_path_buf)) ||
            mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
                                        btree_bytes(c)) ||
            mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
@@ -846,13 +845,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
            bch2_fs_buckets_waiting_for_journal_init(c) ?:
            bch2_fs_btree_write_buffer_init(c) ?:
            bch2_fs_subvolumes_init(c) ?:
-           bch2_fs_io_init(c) ?:
+           bch2_fs_io_read_init(c) ?:
+           bch2_fs_io_write_init(c) ?:
            bch2_fs_nocow_locking_init(c) ?:
            bch2_fs_encryption_init(c) ?:
            bch2_fs_compress_init(c) ?:
            bch2_fs_ec_init(c) ?:
            bch2_fs_fsio_init(c) ?:
-           bch2_fs_fs_io_buffered_init(c);
+           bch2_fs_fs_io_buffered_init(c) ?:
            bch2_fs_fs_io_direct_init(c);
        if (ret)
                goto err;
@@ -990,7 +990,7 @@ out:
        up_write(&c->state_lock);
        return ret;
 err:
-       bch_err(c, "error starting filesystem: %s", bch2_err_str(ret));
+       bch_err_msg(c, ret, "starting filesystem");
        goto out;
 }
 
@@ -1237,8 +1237,6 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
 
        /* Commit: */
        ca->disk_sb = *sb;
-       if (sb->mode & FMODE_EXCL)
-               ca->disk_sb.bdev->bd_holder = ca;
        memset(sb, 0, sizeof(*sb));
 
        ca->dev = ca->disk_sb.bdev->bd_dev;
@@ -1457,7 +1455,7 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
                bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
                                        BTREE_TRIGGER_NORUN, NULL);
        if (ret)
-               bch_err(c, "error removing dev alloc info: %s", bch2_err_str(ret));
+               bch_err_msg(c, ret, "removing dev alloc info");
 
        return ret;
 }
@@ -1486,31 +1484,31 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
 
        ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
        if (ret) {
-               bch_err(ca, "Remove failed: error dropping data: %s", bch2_err_str(ret));
+               bch_err_msg(ca, ret, "dropping data");
                goto err;
        }
 
        ret = bch2_dev_remove_alloc(c, ca);
        if (ret) {
-               bch_err(ca, "Remove failed, error deleting alloc info");
+               bch_err_msg(ca, ret, "deleting alloc info");
                goto err;
        }
 
        ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
        if (ret) {
-               bch_err(ca, "Remove failed: error flushing journal: %s", bch2_err_str(ret));
+               bch_err_msg(ca, ret, "flushing journal");
                goto err;
        }
 
        ret = bch2_journal_flush(&c->journal);
        if (ret) {
-               bch_err(ca, "Remove failed, journal error");
+               bch_err(ca, "journal error");
                goto err;
        }
 
        ret = bch2_replicas_gc2(c);
        if (ret) {
-               bch_err(ca, "Remove failed: error from replicas gc: %s", bch2_err_str(ret));
+               bch_err_msg(ca, ret, "in replicas_gc2()");
                goto err;
        }
 
@@ -1585,7 +1583,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
 
        ret = bch2_read_super(path, &opts, &sb);
        if (ret) {
-               bch_err(c, "device add error: error reading super: %s", bch2_err_str(ret));
+               bch_err_msg(c, ret, "reading super");
                goto err;
        }
 
@@ -1601,13 +1599,12 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
 
        ret = bch2_dev_may_add(sb.sb, c);
        if (ret) {
-               bch_err(c, "device add error: %s", bch2_err_str(ret));
+               bch_err_fn(c, ret);
                goto err;
        }
 
        ca = __bch2_dev_alloc(c, &dev_mi);
        if (!ca) {
-               bch2_free_super(&sb);
                ret = -ENOMEM;
                goto err;
        }
@@ -1615,14 +1612,12 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
        bch2_dev_usage_init(ca);
 
        ret = __bch2_dev_attach_bdev(ca, &sb);
-       if (ret) {
-               bch2_dev_free(ca);
+       if (ret)
                goto err;
-       }
 
        ret = bch2_dev_journal_alloc(ca);
        if (ret) {
-               bch_err(c, "device add error: journal alloc failed");
+               bch_err_msg(c, ret, "allocating journal");
                goto err;
        }
 
@@ -1631,7 +1626,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
 
        ret = bch2_sb_from_fs(c, ca);
        if (ret) {
-               bch_err(c, "device add error: new device superblock too small");
+               bch_err_msg(c, ret, "setting up new superblock");
                goto err_unlock;
        }
 
@@ -1640,8 +1635,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
        if (!bch2_sb_resize_members(&ca->disk_sb,
                                le32_to_cpu(mi->field.u64s) +
                                sizeof(dev_mi) / sizeof(u64))) {
-               bch_err(c, "device add error: new device superblock too small");
                ret = -BCH_ERR_ENOSPC_sb_members;
+               bch_err_msg(c, ret, "setting up new superblock");
                goto err_unlock;
        }
 
@@ -1653,8 +1648,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
                if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx))
                        goto have_slot;
 no_slot:
-       bch_err(c, "device add error: already have maximum number of devices");
        ret = -BCH_ERR_ENOSPC_sb_members;
+       bch_err_msg(c, ret, "setting up new superblock");
        goto err_unlock;
 
 have_slot:
@@ -1664,8 +1659,8 @@ have_slot:
 
        mi = bch2_sb_resize_members(&c->disk_sb, u64s);
        if (!mi) {
-               bch_err(c, "device add error: no room in superblock for member info");
                ret = -BCH_ERR_ENOSPC_sb_members;
+               bch_err_msg(c, ret, "setting up new superblock");
                goto err_unlock;
        }
 
@@ -1681,7 +1676,7 @@ have_slot:
        if (BCH_MEMBER_GROUP(&dev_mi)) {
                ret = __bch2_dev_group_set(c, ca, label.buf);
                if (ret) {
-                       bch_err(c, "device add error: error setting label");
+                       bch_err_msg(c, ret, "creating new label");
                        goto err_unlock;
                }
        }
@@ -1693,13 +1688,13 @@ have_slot:
 
        ret = bch2_trans_mark_dev_sb(c, ca);
        if (ret) {
-               bch_err(c, "device add error: error marking new superblock: %s", bch2_err_str(ret));
+               bch_err_msg(c, ret, "marking new superblock");
                goto err_late;
        }
 
        ret = bch2_fs_freespace_init(c);
        if (ret) {
-               bch_err(c, "device add error: error initializing free space: %s", bch2_err_str(ret));
+               bch_err_msg(c, ret, "initializing free space");
                goto err_late;
        }
 
@@ -1749,7 +1744,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
 
        ret = bch2_dev_in_fs(c->disk_sb.sb, sb.sb);
        if (ret) {
-               bch_err(c, "error bringing %s online: %s", path, bch2_err_str(ret));
+               bch_err_msg(c, ret, "bringing %s online", path);
                goto err;
        }
 
@@ -1761,8 +1756,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
 
        ret = bch2_trans_mark_dev_sb(c, ca);
        if (ret) {
-               bch_err(c, "error bringing %s online: error from bch2_trans_mark_dev_sb: %s",
-                       path, bch2_err_str(ret));
+               bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path);
                goto err;
        }
 
@@ -1780,7 +1774,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
 
        ret = bch2_fs_freespace_init(c);
        if (ret)
-               bch_err(c, "device add error: error initializing free space: %s", bch2_err_str(ret));
+               bch_err_msg(c, ret, "initializing free space");
 
        up_write(&c->state_lock);
        return 0;
@@ -1835,7 +1829,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 
        ret = bch2_dev_buckets_resize(c, ca, nbuckets);
        if (ret) {
-               bch_err(ca, "Resize error: %s", bch2_err_str(ret));
+               bch_err_msg(ca, ret, "resizing buckets");
                goto err;
        }
 
index 89419fc7930d004f5b68cc80a53630ac625003d3..597a8db73585b8ad2793c59e1fa6668dfe3f87e5 100644 (file)
@@ -6,8 +6,9 @@ struct bch_sb_handle {
        struct bch_sb           *sb;
        struct block_device     *bdev;
        struct bio              *bio;
+       void                    *holder;
        size_t                  buffer_size;
-       fmode_t                 mode;
+       blk_mode_t              mode;
        unsigned                have_layout:1;
        unsigned                have_bio:1;
        unsigned                fs_sb:1;
index 941f4bcb997e7ea00fb851f4cb68663606e586dd..1abc61cb3f7efce557702527ceac6ffc3eeb7c62 100644 (file)
@@ -113,10 +113,6 @@ do {                                                                       \
                prt_human_readable_s64(out, val);                       \
 } while (0)
 
-#define var_printf(_var, fmt)  sysfs_printf(_var, fmt, var(_var))
-#define var_print(_var)                sysfs_print(_var, var(_var))
-#define var_hprint(_var)       sysfs_hprint(_var, var(_var))
-
 #define sysfs_strtoul(file, var)                                       \
 do {                                                                   \
        if (attr == &sysfs_ ## file)                                    \
@@ -139,30 +135,6 @@ do {                                                                       \
        _v;                                                             \
 })
 
-#define strtoul_restrict_or_return(cp, min, max)                       \
-({                                                                     \
-       unsigned long __v = 0;                                          \
-       int _r = strtoul_safe_restrict(cp, __v, min, max);              \
-       if (_r)                                                         \
-               return _r;                                              \
-       __v;                                                            \
-})
-
-#define strtoi_h_or_return(cp)                                         \
-({                                                                     \
-       u64 _v;                                                         \
-       int _r = strtoi_h(cp, &_v);                                     \
-       if (_r)                                                         \
-               return _r;                                              \
-       _v;                                                             \
-})
-
-#define sysfs_hatoi(file, var)                                         \
-do {                                                                   \
-       if (attr == &sysfs_ ## file)                                    \
-               return strtoi_h(buf, &var) ?: (ssize_t) size;           \
-} while (0)
-
 write_attribute(trigger_gc);
 write_attribute(trigger_discards);
 write_attribute(trigger_invalidates);
@@ -280,7 +252,7 @@ static size_t bch2_btree_cache_size(struct bch_fs *c)
 
 static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        enum btree_id id;
@@ -291,18 +263,18 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
            incompressible_sectors = 0,
            compressed_sectors_compressed = 0,
            compressed_sectors_uncompressed = 0;
-       int ret;
+       int ret = 0;
 
        if (!test_bit(BCH_FS_STARTED, &c->flags))
                return -EPERM;
 
-       bch2_trans_init(&trans, c, 0, 0);
+       trans = bch2_trans_get(c);
 
        for (id = 0; id < BTREE_ID_NR; id++) {
                if (!btree_type_has_ptrs(id))
                        continue;
 
-               for_each_btree_key(&trans, iter, id, POS_MIN,
+               for_each_btree_key(trans, iter, id, POS_MIN,
                                   BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
                        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
                        const union bch_extent_entry *entry;
@@ -336,10 +308,10 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
                        else if (compressed)
                                nr_compressed_extents++;
                }
-               bch2_trans_iter_exit(&trans, &iter);
+               bch2_trans_iter_exit(trans, &iter);
        }
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        if (ret)
                return ret;
@@ -1005,7 +977,7 @@ STORE(bch2_dev)
                mutex_lock(&c->sb_lock);
                mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
 
-               if (v != BCH_MEMBER_DURABILITY(mi)) {
+               if (v + 1 != BCH_MEMBER_DURABILITY(mi)) {
                        SET_BCH_MEMBER_DURABILITY(mi, v + 1);
                        bch2_write_super(c);
                }
index 72389c7376d609c16a658f70a1af60ca1e4ec028..c907b3e00176e5031af3ff2222f9a69c5eb13807 100644 (file)
@@ -31,7 +31,7 @@ static void delete_test_keys(struct bch_fs *c)
 
 static int test_delete(struct bch_fs *c, u64 nr)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_i_cookie k;
        int ret;
@@ -39,44 +39,43 @@ static int test_delete(struct bch_fs *c, u64 nr)
        bkey_cookie_init(&k.k_i);
        k.k.p.snapshot = U32_MAX;
 
-       bch2_trans_init(&trans, c, 0, 0);
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p,
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p,
                             BTREE_ITER_INTENT);
 
-       ret = commit_do(&trans, NULL, NULL, 0,
+       ret = commit_do(trans, NULL, NULL, 0,
                bch2_btree_iter_traverse(&iter) ?:
-               bch2_trans_update(&trans, &iter, &k.k_i, 0));
+               bch2_trans_update(trans, &iter, &k.k_i, 0));
        if (ret) {
                bch_err_msg(c, ret, "update error");
                goto err;
        }
 
        pr_info("deleting once");
-       ret = commit_do(&trans, NULL, NULL, 0,
+       ret = commit_do(trans, NULL, NULL, 0,
                bch2_btree_iter_traverse(&iter) ?:
-               bch2_btree_delete_at(&trans, &iter, 0));
+               bch2_btree_delete_at(trans, &iter, 0));
        if (ret) {
                bch_err_msg(c, ret, "delete error (first)");
                goto err;
        }
 
        pr_info("deleting twice");
-       ret = commit_do(&trans, NULL, NULL, 0,
+       ret = commit_do(trans, NULL, NULL, 0,
                bch2_btree_iter_traverse(&iter) ?:
-               bch2_btree_delete_at(&trans, &iter, 0));
+               bch2_btree_delete_at(trans, &iter, 0));
        if (ret) {
                bch_err_msg(c, ret, "delete error (second)");
                goto err;
        }
 err:
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
+       bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
        return ret;
 }
 
 static int test_delete_written(struct bch_fs *c, u64 nr)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_i_cookie k;
        int ret;
@@ -84,58 +83,53 @@ static int test_delete_written(struct bch_fs *c, u64 nr)
        bkey_cookie_init(&k.k_i);
        k.k.p.snapshot = U32_MAX;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p,
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p,
                             BTREE_ITER_INTENT);
 
-       ret = commit_do(&trans, NULL, NULL, 0,
+       ret = commit_do(trans, NULL, NULL, 0,
                bch2_btree_iter_traverse(&iter) ?:
-               bch2_trans_update(&trans, &iter, &k.k_i, 0));
+               bch2_trans_update(trans, &iter, &k.k_i, 0));
        if (ret) {
                bch_err_msg(c, ret, "update error");
                goto err;
        }
 
-       bch2_trans_unlock(&trans);
+       bch2_trans_unlock(trans);
        bch2_journal_flush_all_pins(&c->journal);
 
-       ret = commit_do(&trans, NULL, NULL, 0,
+       ret = commit_do(trans, NULL, NULL, 0,
                bch2_btree_iter_traverse(&iter) ?:
-               bch2_btree_delete_at(&trans, &iter, 0));
+               bch2_btree_delete_at(trans, &iter, 0));
        if (ret) {
                bch_err_msg(c, ret, "delete error");
                goto err;
        }
 err:
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
+       bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
        return ret;
 }
 
 static int test_iterate(struct bch_fs *c, u64 nr)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter = { NULL };
        struct bkey_s_c k;
        u64 i;
        int ret = 0;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
        delete_test_keys(c);
 
        pr_info("inserting test keys");
 
        for (i = 0; i < nr; i++) {
-               struct bkey_i_cookie k;
+               struct bkey_i_cookie ck;
 
-               bkey_cookie_init(&k.k_i);
-               k.k.p.offset = i;
-               k.k.p.snapshot = U32_MAX;
+               bkey_cookie_init(&ck.k_i);
+               ck.k.p.offset = i;
+               ck.k.p.snapshot = U32_MAX;
 
-               ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i,
-                                       NULL, NULL, 0);
+               ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0);
                if (ret) {
                        bch_err_msg(c, ret, "insert error");
                        goto err;
@@ -146,7 +140,7 @@ static int test_iterate(struct bch_fs *c, u64 nr)
 
        i = 0;
 
-       ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs,
+       ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
                                  SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
                                  0, k, ({
                BUG_ON(k.k->p.offset != i++);
@@ -161,7 +155,7 @@ static int test_iterate(struct bch_fs *c, u64 nr)
 
        pr_info("iterating backwards");
 
-       ret = for_each_btree_key_reverse(&trans, iter, BTREE_ID_xattrs,
+       ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_xattrs,
                                         SPOS(0, U64_MAX, U32_MAX), 0, k,
                ({
                        BUG_ON(k.k->p.offset != --i);
@@ -174,35 +168,32 @@ static int test_iterate(struct bch_fs *c, u64 nr)
 
        BUG_ON(i);
 err:
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
+       bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
        return ret;
 }
 
 static int test_iterate_extents(struct bch_fs *c, u64 nr)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter = { NULL };
        struct bkey_s_c k;
        u64 i;
        int ret = 0;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
        delete_test_keys(c);
 
        pr_info("inserting test extents");
 
        for (i = 0; i < nr; i += 8) {
-               struct bkey_i_cookie k;
+               struct bkey_i_cookie ck;
 
-               bkey_cookie_init(&k.k_i);
-               k.k.p.offset = i + 8;
-               k.k.p.snapshot = U32_MAX;
-               k.k.size = 8;
+               bkey_cookie_init(&ck.k_i);
+               ck.k.p.offset = i + 8;
+               ck.k.p.snapshot = U32_MAX;
+               ck.k.size = 8;
 
-               ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i,
-                                       NULL, NULL, 0);
+               ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0);
                if (ret) {
                        bch_err_msg(c, ret, "insert error");
                        goto err;
@@ -213,7 +204,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr)
 
        i = 0;
 
-       ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents,
+       ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents,
                                  SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
                                  0, k, ({
                BUG_ON(bkey_start_offset(k.k) != i);
@@ -229,7 +220,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr)
 
        pr_info("iterating backwards");
 
-       ret = for_each_btree_key_reverse(&trans, iter, BTREE_ID_extents,
+       ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_extents,
                                         SPOS(0, U64_MAX, U32_MAX), 0, k,
                ({
                        BUG_ON(k.k->p.offset != i);
@@ -243,34 +234,31 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr)
 
        BUG_ON(i);
 err:
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
+       bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
        return ret;
 }
 
 static int test_iterate_slots(struct bch_fs *c, u64 nr)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter = { NULL };
        struct bkey_s_c k;
        u64 i;
        int ret = 0;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
        delete_test_keys(c);
 
        pr_info("inserting test keys");
 
        for (i = 0; i < nr; i++) {
-               struct bkey_i_cookie k;
+               struct bkey_i_cookie ck;
 
-               bkey_cookie_init(&k.k_i);
-               k.k.p.offset = i * 2;
-               k.k.p.snapshot = U32_MAX;
+               bkey_cookie_init(&ck.k_i);
+               ck.k.p.offset = i * 2;
+               ck.k.p.snapshot = U32_MAX;
 
-               ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i,
-                                       NULL, NULL, 0);
+               ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0);
                if (ret) {
                        bch_err_msg(c, ret, "insert error");
                        goto err;
@@ -281,7 +269,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
 
        i = 0;
 
-       ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs,
+       ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
                                  SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
                                  0, k, ({
                BUG_ON(k.k->p.offset != i);
@@ -299,7 +287,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
 
        i = 0;
 
-       ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs,
+       ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
                                  SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
                                  BTREE_ITER_SLOTS, k, ({
                if (i >= nr * 2)
@@ -317,34 +305,31 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
        }
        ret = 0;
 err:
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        return ret;
 }
 
 static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter = { NULL };
        struct bkey_s_c k;
        u64 i;
        int ret = 0;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
        delete_test_keys(c);
 
        pr_info("inserting test keys");
 
        for (i = 0; i < nr; i += 16) {
-               struct bkey_i_cookie k;
+               struct bkey_i_cookie ck;
 
-               bkey_cookie_init(&k.k_i);
-               k.k.p.offset = i + 16;
-               k.k.p.snapshot = U32_MAX;
-               k.k.size = 8;
+               bkey_cookie_init(&ck.k_i);
+               ck.k.p.offset = i + 16;
+               ck.k.p.snapshot = U32_MAX;
+               ck.k.size = 8;
 
-               ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i,
-                                       NULL, NULL, 0);
+               ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0);
                if (ret) {
                        bch_err_msg(c, ret, "insert error");
                        goto err;
@@ -355,7 +340,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
 
        i = 0;
 
-       ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents,
+       ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents,
                                  SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
                                  0, k, ({
                BUG_ON(bkey_start_offset(k.k) != i + 8);
@@ -374,7 +359,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
 
        i = 0;
 
-       ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents,
+       ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents,
                                 SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
                                 BTREE_ITER_SLOTS, k, ({
                if (i == nr)
@@ -392,7 +377,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
        }
        ret = 0;
 err:
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        return 0;
 }
 
@@ -402,43 +387,41 @@ err:
  */
 static int test_peek_end(struct bch_fs *c, u64 nr)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
 
-       bch2_trans_init(&trans, c, 0, 0);
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
                             SPOS(0, 0, U32_MAX), 0);
 
-       lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+       lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
        BUG_ON(k.k);
 
-       lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+       lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
        BUG_ON(k.k);
 
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
+       bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
        return 0;
 }
 
 static int test_peek_end_extents(struct bch_fs *c, u64 nr)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
 
-       bch2_trans_init(&trans, c, 0, 0);
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
                             SPOS(0, 0, U32_MAX), 0);
 
-       lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+       lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
        BUG_ON(k.k);
 
-       lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+       lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
        BUG_ON(k.k);
 
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
+       bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
        return 0;
 }
 
@@ -458,8 +441,7 @@ static int insert_test_extent(struct bch_fs *c,
        k.k_i.k.size = end - start;
        k.k_i.k.version.lo = test_version++;
 
-       ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i,
-                               NULL, NULL, 0);
+       ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0);
        if (ret)
                bch_err_fn(c, ret);
        return ret;
@@ -515,7 +497,7 @@ static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start,
        k.k_i.k.size = len;
 
        ret = bch2_trans_do(c, NULL, NULL, 0,
-               bch2_btree_insert_nonextent(&trans, BTREE_ID_extents, &k.k_i,
+               bch2_btree_insert_nonextent(trans, BTREE_ID_extents, &k.k_i,
                                            BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
        if (ret)
                bch_err_fn(c, ret);
@@ -538,7 +520,7 @@ static int test_extent_create_overlapping(struct bch_fs *c, u64 inum)
 /* Test skipping over keys in unrelated snapshots: */
 static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_i_cookie cookie;
@@ -546,20 +528,19 @@ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi)
 
        bkey_cookie_init(&cookie.k_i);
        cookie.k.p.snapshot = snapid_hi;
-       ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i,
-                               NULL, NULL, 0);
+       ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0);
        if (ret)
                return ret;
 
-       bch2_trans_init(&trans, c, 0, 0);
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
+       trans = bch2_trans_get(c);
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
                             SPOS(0, 0, snapid_lo), 0);
-       lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+       lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
 
        BUG_ON(k.k->p.snapshot != U32_MAX);
 
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
+       bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
        return ret;
 }
 
@@ -572,13 +553,12 @@ static int test_snapshots(struct bch_fs *c, u64 nr)
 
        bkey_cookie_init(&cookie.k_i);
        cookie.k.p.snapshot = U32_MAX;
-       ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i,
-                               NULL, NULL, 0);
+       ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0);
        if (ret)
                return ret;
 
        ret = bch2_trans_do(c, NULL, NULL, 0,
-                     bch2_snapshot_node_create(&trans, U32_MAX,
+                     bch2_snapshot_node_create(trans, U32_MAX,
                                                snapids,
                                                snapid_subvols,
                                                2));
@@ -609,38 +589,34 @@ static u64 test_rand(void)
 
 static int rand_insert(struct bch_fs *c, u64 nr)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct bkey_i_cookie k;
        int ret = 0;
        u64 i;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
        for (i = 0; i < nr; i++) {
                bkey_cookie_init(&k.k_i);
                k.k.p.offset = test_rand();
                k.k.p.snapshot = U32_MAX;
 
-               ret = commit_do(&trans, NULL, NULL, 0,
-                       __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i, 0));
+               ret = commit_do(trans, NULL, NULL, 0,
+                       bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k.k_i, 0));
                if (ret)
                        break;
        }
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        return ret;
 }
 
 static int rand_insert_multi(struct bch_fs *c, u64 nr)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct bkey_i_cookie k[8];
        int ret = 0;
        unsigned j;
        u64 i;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
        for (i = 0; i < nr; i += ARRAY_SIZE(k)) {
                for (j = 0; j < ARRAY_SIZE(k); j++) {
                        bkey_cookie_init(&k[j].k_i);
@@ -648,46 +624,45 @@ static int rand_insert_multi(struct bch_fs *c, u64 nr)
                        k[j].k.p.snapshot = U32_MAX;
                }
 
-               ret = commit_do(&trans, NULL, NULL, 0,
-                       __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[0].k_i, 0) ?:
-                       __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[1].k_i, 0) ?:
-                       __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[2].k_i, 0) ?:
-                       __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[3].k_i, 0) ?:
-                       __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[4].k_i, 0) ?:
-                       __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[5].k_i, 0) ?:
-                       __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[6].k_i, 0) ?:
-                       __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[7].k_i, 0));
+               ret = commit_do(trans, NULL, NULL, 0,
+                       bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[0].k_i, 0) ?:
+                       bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[1].k_i, 0) ?:
+                       bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[2].k_i, 0) ?:
+                       bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[3].k_i, 0) ?:
+                       bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[4].k_i, 0) ?:
+                       bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[5].k_i, 0) ?:
+                       bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[6].k_i, 0) ?:
+                       bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[7].k_i, 0));
                if (ret)
                        break;
        }
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        return ret;
 }
 
 static int rand_lookup(struct bch_fs *c, u64 nr)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret = 0;
        u64 i;
 
-       bch2_trans_init(&trans, c, 0, 0);
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
                             SPOS(0, 0, U32_MAX), 0);
 
        for (i = 0; i < nr; i++) {
                bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX));
 
-               lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter)));
+               lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(&iter)));
                ret = bkey_err(k);
                if (ret)
                        break;
        }
 
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
+       bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
        return ret;
 }
 
@@ -719,26 +694,25 @@ static int rand_mixed_trans(struct btree_trans *trans,
 
 static int rand_mixed(struct bch_fs *c, u64 nr)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_i_cookie cookie;
        int ret = 0;
        u64 i, rand;
 
-       bch2_trans_init(&trans, c, 0, 0);
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
                             SPOS(0, 0, U32_MAX), 0);
 
        for (i = 0; i < nr; i++) {
                rand = test_rand();
-               ret = commit_do(&trans, NULL, NULL, 0,
-                       rand_mixed_trans(&trans, &iter, &cookie, i, rand));
+               ret = commit_do(trans, NULL, NULL, 0,
+                       rand_mixed_trans(trans, &iter, &cookie, i, rand));
                if (ret)
                        break;
        }
 
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
+       bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
        return ret;
 }
 
@@ -766,22 +740,20 @@ err:
 
 static int rand_delete(struct bch_fs *c, u64 nr)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        int ret = 0;
        u64 i;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
        for (i = 0; i < nr; i++) {
                struct bpos pos = SPOS(0, test_rand(), U32_MAX);
 
-               ret = commit_do(&trans, NULL, NULL, 0,
-                       __do_delete(&trans, pos));
+               ret = commit_do(trans, NULL, NULL, 0,
+                       __do_delete(trans, pos));
                if (ret)
                        break;
        }
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        return ret;
 }
 
@@ -794,14 +766,14 @@ static int seq_insert(struct bch_fs *c, u64 nr)
        bkey_cookie_init(&insert.k_i);
 
        return bch2_trans_run(c,
-               for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs,
+               for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
                                        SPOS(0, 0, U32_MAX),
                                        BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k,
                                        NULL, NULL, 0, ({
                        if (iter.pos.offset >= nr)
                                break;
                        insert.k.p = iter.pos;
-                       bch2_trans_update(&trans, &iter, &insert.k_i, 0);
+                       bch2_trans_update(trans, &iter, &insert.k_i, 0);
                })));
 }
 
@@ -811,7 +783,7 @@ static int seq_lookup(struct bch_fs *c, u64 nr)
        struct bkey_s_c k;
 
        return bch2_trans_run(c,
-               for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs,
+               for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
                                  SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
                                  0, k,
                0));
@@ -823,14 +795,14 @@ static int seq_overwrite(struct bch_fs *c, u64 nr)
        struct bkey_s_c k;
 
        return bch2_trans_run(c,
-               for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs,
+               for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
                                        SPOS(0, 0, U32_MAX),
                                        BTREE_ITER_INTENT, k,
                                        NULL, NULL, 0, ({
                        struct bkey_i_cookie u;
 
                        bkey_reassemble(&u.k_i, k);
-                       bch2_trans_update(&trans, &iter, &u.k_i, 0);
+                       bch2_trans_update(trans, &iter, &u.k_i, 0);
                })));
 }
 
index 97fe774237d0bdd8cd6988383e86205bc92f9905..19264492151b3a2a97edcceb57c66f2d6b31d68d 100644 (file)
@@ -137,6 +137,25 @@ DEFINE_EVENT(bio, read_promote,
        TP_ARGS(bio)
 );
 
+TRACE_EVENT(read_nopromote,
+       TP_PROTO(struct bch_fs *c, int ret),
+       TP_ARGS(c, ret),
+
+       TP_STRUCT__entry(
+               __field(dev_t,          dev             )
+               __array(char,           ret, 32         )
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = c->dev;
+               strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret));
+       ),
+
+       TP_printk("%d,%d ret %s",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->ret)
+);
+
 DEFINE_EVENT(bio, read_bounce,
        TP_PROTO(struct bio *bio),
        TP_ARGS(bio)
index 80a6c5667b5d9119b912d6dbb9cdd0add03959b0..adeec805dd0cdb3dce3caecc0bef9a59b7ded7b8 100644 (file)
@@ -112,10 +112,10 @@ got_unit:
 
 #define parse_or_ret(cp, _f)                   \
 do {                                           \
-       int ret = _f;                           \
-       if (ret < 0)                            \
-               return ret;                     \
-       cp += ret;                              \
+       int _ret = _f;                          \
+       if (_ret < 0)                           \
+               return _ret;                    \
+       cp += _ret;                             \
 } while (0)
 
 static int __bch2_strtou64_h(const char *cp, u64 *res)
@@ -605,11 +605,9 @@ void bch2_time_stats_init(struct bch2_time_stats *stats)
 
 /**
  * bch2_ratelimit_delay() - return how long to delay until the next time to do
- * some work
- *
- * @d - the struct bch_ratelimit to update
- *
- * Returns the amount of time to delay by, in jiffies
+ *             some work
+ * @d:         the struct bch_ratelimit to update
+ * Returns:    the amount of time to delay by, in jiffies
  */
 u64 bch2_ratelimit_delay(struct bch_ratelimit *d)
 {
@@ -622,9 +620,8 @@ u64 bch2_ratelimit_delay(struct bch_ratelimit *d)
 
 /**
  * bch2_ratelimit_increment() - increment @d by the amount of work done
- *
- * @d - the struct bch_ratelimit to update
- * @done - the amount of work done, in arbitrary units
+ * @d:         the struct bch_ratelimit to update
+ * @done:      the amount of work done, in arbitrary units
  */
 void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done)
 {
@@ -761,10 +758,10 @@ void bch2_bio_map(struct bio *bio, void *base, size_t size)
        }
 }
 
-int bch2_bio_alloc_pages_noprof(struct bio *bio, size_t size, gfp_t gfp_mask)
+int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask)
 {
        while (size) {
-               struct page *page = alloc_pages_noprof(gfp_mask, 0);
+               struct page *page = alloc_pages(gfp_mask, 0);
                unsigned len = min_t(size_t, PAGE_SIZE, size);
 
                if (!page)
index d06671a09852ada4329013206595ec00eaf1fbd0..67f1a1d2a02d31a22edb411c5f7057b6cc8326a5 100644 (file)
@@ -60,13 +60,12 @@ static inline void vpfree(void *p, size_t size)
                free_pages((unsigned long) p, get_order(size));
 }
 
-static inline void *vpmalloc_noprof(size_t size, gfp_t gfp_mask)
+static inline void *vpmalloc(size_t size, gfp_t gfp_mask)
 {
-       return (void *) get_free_pages_noprof(gfp_mask|__GFP_NOWARN,
-                                             get_order(size)) ?:
-               __vmalloc_noprof(size, gfp_mask);
+       return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
+                                        get_order(size)) ?:
+               __vmalloc(size, gfp_mask);
 }
-#define vpmalloc(_size, _gfp)  alloc_hooks(vpmalloc_noprof(_size, _gfp))
 
 static inline void kvpfree(void *p, size_t size)
 {
@@ -76,13 +75,12 @@ static inline void kvpfree(void *p, size_t size)
                vpfree(p, size);
 }
 
-static inline void *kvpmalloc_noprof(size_t size, gfp_t gfp_mask)
+static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
 {
        return size < PAGE_SIZE
-               ? kmalloc_noprof(size, gfp_mask)
-               : vpmalloc_noprof(size, gfp_mask);
+               ? kmalloc(size, gfp_mask)
+               : vpmalloc(size, gfp_mask);
 }
-#define kvpmalloc(_size, _gfp) alloc_hooks(kvpmalloc_noprof(_size, _gfp))
 
 int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t);
 
@@ -534,9 +532,7 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
 }
 
 void bch2_bio_map(struct bio *bio, void *base, size_t);
-int bch2_bio_alloc_pages_noprof(struct bio *, size_t, gfp_t);
-#define bch2_bio_alloc_pages(_bio, _size, _gfp)                                \
-       alloc_hooks(bch2_bio_alloc_pages_noprof(_bio, _size, _gfp))
+int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t);
 
 static inline sector_t bdev_sectors(struct block_device *bdev)
 {
@@ -779,12 +775,12 @@ static inline void __move_gap(void *array, size_t element_size,
 
 #define bubble_sort(_base, _nr, _cmp)                                  \
 do {                                                                   \
-       ssize_t _i, _end;                                               \
+       ssize_t _i, _last;                                              \
        bool _swapped = true;                                           \
                                                                        \
-       for (_end = (ssize_t) (_nr) - 1; _end > 0 && _swapped; --_end) {\
+       for (_last= (ssize_t) (_nr) - 1; _last > 0 && _swapped; --_last) {\
                _swapped = false;                                       \
-               for (_i = 0; _i < _end; _i++)                           \
+               for (_i = 0; _i < _last; _i++)                          \
                        if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) {   \
                                swap((_base)[_i], (_base)[_i + 1]);     \
                                _swapped = true;                        \
index 2a2ab86ed6e1c7f09b9abe1994e8091e917665fc..cb4f33ed9ab374fbd50bbf74419335564f55df9a 100644 (file)
 
 /**
  * bch2_varint_encode - encode a variable length integer
- * @out - destination to encode to
- * @v  - unsigned integer to encode
- *
- * Returns the size in bytes of the encoded integer - at most 9 bytes
+ * @out:       destination to encode to
+ * @v:         unsigned integer to encode
+ * Returns:    size in bytes of the encoded integer - at most 9 bytes
  */
 int bch2_varint_encode(u8 *out, u64 v)
 {
@@ -40,11 +39,10 @@ int bch2_varint_encode(u8 *out, u64 v)
 
 /**
  * bch2_varint_decode - decode a variable length integer
- * @in - varint to decode
- * @end        - end of buffer to decode from
- * @out        - on success, decoded integer
- *
- * Returns the size in bytes of the decoded integer - or -1 on failure (would
+ * @in:                varint to decode
+ * @end:       end of buffer to decode from
+ * @out:       on success, decoded integer
+ * Returns:    size in bytes of the decoded integer - or -1 on failure (would
  * have read past the end of the buffer)
  */
 int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out)
@@ -73,6 +71,9 @@ int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out)
 
 /**
  * bch2_varint_encode_fast - fast version of bch2_varint_encode
+ * @out:       destination to encode to
+ * @v:         unsigned integer to encode
+ * Returns:    size in bytes of the encoded integer - at most 9 bytes
  *
  * This version assumes it's always safe to write 8 bytes to @out, even if the
  * encoded integer would be smaller.
@@ -96,6 +97,11 @@ int bch2_varint_encode_fast(u8 *out, u64 v)
 
 /**
  * bch2_varint_decode_fast - fast version of bch2_varint_decode
+ * @in:                varint to decode
+ * @end:       end of buffer to decode from
+ * @out:       on success, decoded integer
+ * Returns:    size in bytes of the decoded integer - or -1 on failure (would
+ * have read past the end of the buffer)
  *
  * This version assumes that it is safe to read at most 8 bytes past the end of
  * @end (we still return an error if the varint extends past @end).
index 53a694d71967196ad2784f89da5ea5c3966644a1..a6561b4b36a6e15cf020a82ba2c6741659dbf757 100644 (file)
        (round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9)
 
 #define vstruct_next(_s)                                               \
-       ((typeof(_s))                   ((_s)->_data + __vstruct_u64s(_s)))
+       ((typeof(_s))                   ((u64 *) (_s)->_data + __vstruct_u64s(_s)))
 #define vstruct_last(_s)                                               \
-       ((typeof(&(_s)->start[0]))      ((_s)->_data + __vstruct_u64s(_s)))
+       ((typeof(&(_s)->start[0]))      ((u64 *) (_s)->_data + __vstruct_u64s(_s)))
 #define vstruct_end(_s)                                                        \
-       ((void *)                       ((_s)->_data + __vstruct_u64s(_s)))
+       ((void *)                       ((u64 *) (_s)->_data + __vstruct_u64s(_s)))
 
 #define vstruct_for_each(_s, _i)                                       \
        for (_i = (_s)->start;                                          \
index 6f6b3caf06078684426ce740cf35a580d26131f1..b069b1a62e25186be7fb068255080ca179593868 100644 (file)
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "bcachefs.h"
+#include "acl.h"
 #include "bkey_methods.h"
 #include "btree_update.h"
 #include "extents.h"
@@ -130,6 +131,13 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c,
               xattr.v->x_name,
               le16_to_cpu(xattr.v->x_val_len),
               (char *) xattr_val(xattr.v));
+
+       if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS ||
+           xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT) {
+               prt_char(out, ' ');
+               bch2_acl_to_text(out, xattr_val(xattr.v),
+                                le16_to_cpu(xattr.v->x_val_len));
+       }
 }
 
 static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode,
@@ -299,24 +307,22 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 {
        struct bch_fs *c = dentry->d_sb->s_fs_info;
        struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        struct xattr_buf buf = { .buf = buffer, .len = buffer_size };
        u64 offset = 0, inum = inode->ei_inode.bi_inum;
        u32 snapshot;
        int ret;
-
-       bch2_trans_init(&trans, c, 0, 0);
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
        iter = (struct btree_iter) { NULL };
 
-       ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot);
+       ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot);
        if (ret)
                goto err;
 
-       for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_xattrs,
+       for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_xattrs,
                           SPOS(inum, offset, snapshot),
                           POS(inum, U64_MAX), 0, k, ret) {
                if (k.k->type != KEY_TYPE_xattr)
@@ -328,12 +334,12 @@ retry:
        }
 
        offset = iter.pos.offset;
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
 err:
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        if (ret)
                goto out;
@@ -358,7 +364,7 @@ static int bch2_xattr_get_handler(const struct xattr_handler *handler,
        struct bch_inode_info *inode = to_bch_ei(vinode);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        int ret = bch2_trans_do(c, NULL, NULL, 0,
-               bch2_xattr_get_trans(&trans, inode, name, buffer, size, handler->flags));
+               bch2_xattr_get_trans(trans, inode, name, buffer, size, handler->flags));
 
        return bch2_err_class(ret);
 }
@@ -373,18 +379,14 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler,
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
        struct bch_inode_unpacked inode_u;
-       struct btree_trans trans;
        int ret;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
-       ret = commit_do(&trans, NULL, NULL, 0,
-                       bch2_xattr_set(&trans, inode_inum(inode), &inode_u,
+       ret = bch2_trans_run(c,
+               commit_do(trans, NULL, NULL, 0,
+                       bch2_xattr_set(trans, inode_inum(inode), &inode_u,
                                       &hash, name, value, size,
-                                      handler->flags, flags));
-       if (!ret)
-               bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME);
-       bch2_trans_exit(&trans);
+                                      handler->flags, flags)) ?:
+               (bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME), 0));
 
        return bch2_err_class(ret);
 }
index ea901a462494b97fb220b60897d140a0295da3ce..54af9f87d221174e8adff2b1d9a7549861e5eb8e 100644 (file)
@@ -162,7 +162,7 @@ sector_t get_capacity(struct gendisk *disk)
        return bytes >> 9;
 }
 
-void blkdev_put(struct block_device *bdev, fmode_t mode)
+void blkdev_put(struct block_device *bdev, void *holder)
 {
        fdatasync(bdev->bd_fd);
        close(bdev->bd_sync_fd);
@@ -170,25 +170,25 @@ void blkdev_put(struct block_device *bdev, fmode_t mode)
        free(bdev);
 }
 
-struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
-                                       void *holder)
+struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode,
+                                       void *holder, const struct blk_holder_ops *hop)
 {
        struct block_device *bdev;
        int fd, sync_fd, buffered_fd, flags = 0;
 
-       if ((mode & (FMODE_READ|FMODE_WRITE)) == (FMODE_READ|FMODE_WRITE))
+       if ((mode & (BLK_OPEN_READ|BLK_OPEN_WRITE)) == (BLK_OPEN_READ|BLK_OPEN_WRITE))
                flags = O_RDWR;
-       else if (mode & FMODE_READ)
+       else if (mode & BLK_OPEN_READ)
                flags = O_RDONLY;
-       else if (mode & FMODE_WRITE)
+       else if (mode & BLK_OPEN_WRITE)
                flags = O_WRONLY;
 
-       if (!(mode & FMODE_BUFFERED))
+       if (!(mode & BLK_OPEN_BUFFERED))
                flags |= O_DIRECT;
 
 #if 0
        /* using O_EXCL doesn't work with opening twice for an O_SYNC fd: */
-       if (mode & FMODE_EXCL)
+       if (mode & BLK_OPEN_EXCL)
                flags |= O_EXCL;
 #endif
        buffered_fd = open(path, flags & ~O_DIRECT);
index 64697ea65863c8f8d804b4c5dfbc6c2c1f49850a..d483083914b40f59fb5c5bcc251e237ba6f041d6 100644 (file)
@@ -47,6 +47,8 @@ pub enum BkeyValC<'a> {
     inode_v3(&'a c::bch_inode_v3),
     bucket_gens(&'a c::bch_bucket_gens),
     snapshot_tree(&'a c::bch_snapshot_tree),
+    logged_op_truncate(&'a c::bch_logged_op_truncate),
+    logged_op_finsert(&'a c::bch_logged_op_finsert),
 }
 
 impl<'a, 'b> BkeySC<'a> {
@@ -96,6 +98,8 @@ impl<'a, 'b> BkeySC<'a> {
             KEY_TYPE_inode_v3               => inode_v3(unsafe { transmute(self.v) }),
             KEY_TYPE_bucket_gens            => bucket_gens(unsafe { transmute(self.v) }),
             KEY_TYPE_snapshot_tree          => snapshot_tree(unsafe { transmute(self.v) }),
+            KEY_TYPE_logged_op_truncate     => logged_op_truncate(unsafe { transmute(self.v) }),
+            KEY_TYPE_logged_op_finsert      => logged_op_finsert(unsafe { transmute(self.v) }),
             KEY_TYPE_MAX                    => unreachable!(),
         }
     }
index 32b4e7439ef08f725afa7bfa8a1444e1c9c1c386..f738a46689766681f67a12de7ee5d23b3838da2e 100644 (file)
@@ -11,24 +11,21 @@ use std::ptr;
 use bitflags::bitflags;
 
 pub struct BtreeTrans<'f> {
-    raw:    c::btree_trans,
+    raw:    *mut c::btree_trans,
     fs:     PhantomData<&'f Fs>
 }
 
 impl<'f> BtreeTrans<'f> {
     pub fn new(fs: &'f Fs) -> BtreeTrans {
         unsafe {
-            let mut trans: MaybeUninit<c::btree_trans> = MaybeUninit::uninit();
-
-            c::__bch2_trans_init(&mut (*trans.as_mut_ptr()), fs.raw, 0);
-            BtreeTrans { raw: trans.assume_init(), fs: PhantomData }
+            BtreeTrans { raw: &mut *c::__bch2_trans_get(fs.raw, 0), fs: PhantomData }
         }
     }
 }
 
 impl<'f> Drop for BtreeTrans<'f> {
     fn drop(&mut self) {
-        unsafe { c::bch2_trans_exit(&mut self.raw) }
+        unsafe { c::bch2_trans_put(&mut *self.raw) }
     }             
 }
 
@@ -64,9 +61,9 @@ impl<'t> BtreeIter<'t> {
             let mut iter: MaybeUninit<c::btree_iter> = MaybeUninit::uninit();
 
             c::bch2_trans_iter_init_outlined(
-                ptr::addr_of!(trans.raw).cast_mut(),
+                trans.raw,
                 iter.as_mut_ptr(),
-                btree as u32,
+                btree,
                 pos,
                 flags.bits as u32);
 
@@ -123,7 +120,7 @@ impl<'t> BtreeNodeIter<'t> {
         unsafe {
             let mut iter: MaybeUninit<c::btree_iter> = MaybeUninit::uninit();
             c::bch2_trans_node_iter_init(
-                ptr::addr_of!(trans.raw).cast_mut(),
+                trans.raw,
                 iter.as_mut_ptr(),
                 btree,
                 pos,
index e7bcfcfb22574f6ea61a1c4980334ba31e624e31..e68de6640e2ad1f0614edc316f2e35c3d73408f3 100644 (file)
@@ -13,8 +13,8 @@
 #include "../include/linux/blkdev.h"
 
 
-#define MARK_FIX_753(req_name) const fmode_t Fix753_##req_name = req_name;
+#define MARK_FIX_753(req_name) const blk_mode_t Fix753_##req_name = req_name;
 
-MARK_FIX_753(FMODE_READ);
-MARK_FIX_753(FMODE_WRITE);
-MARK_FIX_753(FMODE_EXCL);
\ No newline at end of file
+MARK_FIX_753(BLK_OPEN_READ);
+MARK_FIX_753(BLK_OPEN_WRITE);
+MARK_FIX_753(BLK_OPEN_EXCL);