git.sesse.net Git - bcachefs-tools-debian/commitdiff
New upstream snapshot
author Jonathan Carter <jcc@debian.org>
Tue, 6 Apr 2021 13:19:46 +0000 (15:19 +0200)
committer Jonathan Carter <jcc@debian.org>
Tue, 6 Apr 2021 13:19:46 +0000 (15:19 +0200)
147 files changed:
Makefile
bcachefs.c
cmd_data.c
cmd_debug.c
cmd_device.c
cmd_format.c
cmd_fs.c
cmd_fsck.c
cmd_migrate.c
cmds.h
debian/changelog
debian/control
debian/files
include/linux/bitops.h
include/linux/cpumask.h
include/linux/generic-radix-tree.h
include/linux/kernel.h
include/linux/list.h
include/linux/list_nulls.h [new file with mode: 0644]
include/linux/overflow.h [new file with mode: 0644]
include/linux/page.h
include/linux/poison.h [new file with mode: 0644]
include/linux/random.h
include/linux/rcupdate.h
include/linux/rhashtable-types.h [new file with mode: 0644]
include/linux/rhashtable.h
include/linux/sched/mm.h
include/linux/six.h
include/linux/slab.h
include/linux/srcu.h [new file with mode: 0644]
include/linux/types.h
include/linux/vmalloc.h
include/linux/wait.h
include/trace/events/bcachefs.h
libbcachefs.c
libbcachefs.h
libbcachefs/acl.c
libbcachefs/acl.h
libbcachefs/alloc_background.c
libbcachefs/alloc_background.h
libbcachefs/alloc_foreground.c
libbcachefs/alloc_types.h
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/bcachefs_ioctl.h
libbcachefs/bkey.c
libbcachefs/bkey.h
libbcachefs/bkey_buf.h [new file with mode: 0644]
libbcachefs/bkey_methods.c
libbcachefs/bkey_methods.h
libbcachefs/bkey_on_stack.h [deleted file]
libbcachefs/bkey_sort.c
libbcachefs/bkey_sort.h
libbcachefs/bset.c
libbcachefs/bset.h
libbcachefs/btree_cache.c
libbcachefs/btree_cache.h
libbcachefs/btree_gc.c
libbcachefs/btree_gc.h
libbcachefs/btree_io.c
libbcachefs/btree_io.h
libbcachefs/btree_iter.c
libbcachefs/btree_iter.h
libbcachefs/btree_key_cache.c
libbcachefs/btree_key_cache.h
libbcachefs/btree_locking.h
libbcachefs/btree_types.h
libbcachefs/btree_update.h
libbcachefs/btree_update_interior.c
libbcachefs/btree_update_interior.h
libbcachefs/btree_update_leaf.c
libbcachefs/buckets.c
libbcachefs/buckets.h
libbcachefs/buckets_types.h
libbcachefs/chardev.c
libbcachefs/checksum.h
libbcachefs/clock.c
libbcachefs/clock_types.h
libbcachefs/compress.c
libbcachefs/debug.c
libbcachefs/debug.h
libbcachefs/dirent.c
libbcachefs/dirent.h
libbcachefs/ec.c
libbcachefs/ec.h
libbcachefs/ec_types.h
libbcachefs/error.c
libbcachefs/error.h
libbcachefs/extent_update.c
libbcachefs/extents.c
libbcachefs/extents.h
libbcachefs/fs-common.c
libbcachefs/fs-io.c
libbcachefs/fs-io.h
libbcachefs/fs-ioctl.c
libbcachefs/fs.c
libbcachefs/fs.h
libbcachefs/fsck.c
libbcachefs/inode.c
libbcachefs/inode.h
libbcachefs/io.c
libbcachefs/io.h
libbcachefs/io_types.h
libbcachefs/journal.c
libbcachefs/journal.h
libbcachefs/journal_io.c
libbcachefs/journal_io.h
libbcachefs/journal_reclaim.c
libbcachefs/journal_reclaim.h
libbcachefs/journal_seq_blacklist.c
libbcachefs/journal_types.h
libbcachefs/migrate.c
libbcachefs/move.c
libbcachefs/move.h
libbcachefs/movinggc.c
libbcachefs/opts.c
libbcachefs/opts.h
libbcachefs/quota.c
libbcachefs/rebalance.c
libbcachefs/rebalance_types.h
libbcachefs/recovery.c
libbcachefs/recovery.h
libbcachefs/reflink.c
libbcachefs/replicas.c
libbcachefs/replicas.h
libbcachefs/str_hash.h
libbcachefs/super-io.c
libbcachefs/super-io.h
libbcachefs/super.c
libbcachefs/super.h
libbcachefs/super_types.h
libbcachefs/sysfs.c
libbcachefs/tests.c
libbcachefs/tests.h
libbcachefs/util.c
libbcachefs/util.h
libbcachefs/varint.c [new file with mode: 0644]
libbcachefs/varint.h [new file with mode: 0644]
libbcachefs/xattr.c
linux/generic-radix-tree.c
linux/kthread.c
linux/rhashtable.c
linux/sched.c
linux/shrinker.c
linux/six.c
tools-util.c
tools-util.h

index cc00ac6eb931110fcebc6992dadf1d3553199815..3fe9604896513eed03b429965cca1bf4202f1c23 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -6,6 +6,7 @@ PYTEST=pytest-3
 CFLAGS+=-std=gnu89 -O2 -g -MMD -Wall                           \
        -Wno-pointer-sign                                       \
        -Wno-zero-length-bounds                                 \
+       -Wno-stringop-overflow                                  \
        -fno-strict-aliasing                                    \
        -fno-delete-null-pointer-checks                         \
        -I. -Iinclude -Iraid                                    \
@@ -155,6 +156,10 @@ update-bcachefs-sources:
        git add linux/six.c
        cp $(LINUX_DIR)/include/linux/six.h include/linux/
        git add include/linux/six.h
+       cp $(LINUX_DIR)/include/linux/list_nulls.h include/linux/
+       git add include/linux/list_nulls.h
+       cp $(LINUX_DIR)/include/linux/poison.h include/linux/
+       git add include/linux/poison.h
        $(RM) libbcachefs/*.mod.c
        git -C $(LINUX_DIR) rev-parse HEAD | tee .bcachefs_revision
        git add .bcachefs_revision
index b4958f13c8a2e20436c08974b08395a9a208b959..e9ff7d10b250cdec73d29602ae0dced67ccf443d 100644 (file)
@@ -59,6 +59,7 @@ static void usage(void)
             "\n"
             "Commands for managing filesystem data:\n"
             "  data rereplicate     Rereplicate degraded data\n"
+            "  data job             Kick off low level data jobs\n"
             "\n"
             "Encryption:\n"
             "  unlock               Unlock an encrypted filesystem prior to running/mounting\n"
@@ -128,6 +129,8 @@ static int device_cmds(int argc, char *argv[])
                return cmd_device_set_state(argc, argv);
        if (!strcmp(cmd, "resize"))
                return cmd_device_resize(argc, argv);
+       if (!strcmp(cmd, "resize-journal"))
+               return cmd_device_resize_journal(argc, argv);
 
        usage();
        return 0;
@@ -139,6 +142,8 @@ static int data_cmds(int argc, char *argv[])
 
        if (!strcmp(cmd, "rereplicate"))
                return cmd_data_rereplicate(argc, argv);
+       if (!strcmp(cmd, "job"))
+               return cmd_data_job(argc, argv);
 
        usage();
        return 0;
index f495b6c0ae7e5ca1bc141e4e818337909685435b..25a2dcb22cd0631b41010d1c7fa2b3776a39a6ed 100644 (file)
@@ -4,6 +4,7 @@
 #include <sys/ioctl.h>
 
 #include "libbcachefs/bcachefs_ioctl.h"
+#include "libbcachefs/btree_cache.h"
 
 #include "cmds.h"
 #include "libbcachefs.h"
@@ -41,8 +42,83 @@ int cmd_data_rereplicate(int argc, char *argv[])
                die("too many arguments");
 
        return bchu_data(bcache_fs_open(fs_path), (struct bch_ioctl_data) {
-               .op     = BCH_DATA_OP_REREPLICATE,
-               .start  = POS_MIN,
-               .end    = POS_MAX,
+               .op             = BCH_DATA_OP_REREPLICATE,
+               .start_btree    = 0,
+               .start_pos      = POS_MIN,
+               .end_btree      = BTREE_ID_NR,
+               .end_pos        = POS_MAX,
        });
 }
+
+static void data_job_usage(void)
+{
+       puts("bcachefs data job\n"
+            "Usage: bcachefs data job [job} filesystem\n"
+            "\n"
+            "Kick off a data job and report progress\n"
+            "\n"
+            "job: one of scrub, rereplicate, migrate, or rewrite_old_nodes\n"
+            "\n"
+            "Options:\n"
+            "  -b btree                    btree to operate on\n"
+            "  -s inode:offset       start position\n"
+            "  -e inode:offset       end position\n"
+            "  -h, --help                  display this help and exit\n"
+            "Report bugs to <linux-bcache@vger.kernel.org>");
+       exit(EXIT_SUCCESS);
+}
+
+const char * const data_jobs[] = {
+       "scrub",
+       "rereplicate",
+       "migrate",
+       "rewrite_old_nodes",
+       NULL
+};
+
+int cmd_data_job(int argc, char *argv[])
+{
+       struct bch_ioctl_data op = {
+               .start_btree    = 0,
+               .start_pos      = POS_MIN,
+               .end_btree      = BTREE_ID_NR,
+               .end_pos        = POS_MAX,
+       };
+       int opt;
+
+       while ((opt = getopt(argc, argv, "b:s:e:h")) != -1)
+               switch (opt) {
+               case 'b':
+                       op.start_btree = read_string_list_or_die(optarg,
+                                               bch2_btree_ids, "btree id");
+                       op.end_btree = op.start_btree;
+                       break;
+               case 's':
+                       op.start_pos    = bpos_parse(optarg);
+                       break;
+               case 'e':
+                       op.end_pos      = bpos_parse(optarg);
+                       break;
+               case 'h':
+                       data_job_usage();
+               }
+       args_shift(optind);
+
+       char *job = arg_pop();
+       if (!job)
+               die("please specify which type of job");
+
+       op.op = read_string_list_or_die(job, data_jobs, "bad job type");
+
+       if (op.op == BCH_DATA_OP_SCRUB)
+               die("scrub not implemented yet");
+
+       char *fs_path = arg_pop();
+       if (!fs_path)
+               fs_path = ".";
+
+       if (argc)
+               die("too many arguments");
+
+       return bchu_data(bcache_fs_open(fs_path), op);
+}
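
For reference, a minimal sketch (not part of this commit) of driving the new job interface programmatically, reusing only the helpers visible in this diff: bcache_fs_open(), bchu_data() and the widened struct bch_ioctl_data. BTREE_ID_extents is assumed to be the post-rename spelling of the extents btree id, as seen in the cmd_migrate.c hunk further down; the function name is made up for illustration. It mirrors what `bcachefs data job rereplicate -b extents <fs>` issues after this change:

#include <sys/ioctl.h>

#include "libbcachefs/bcachefs_ioctl.h"
#include "libbcachefs/btree_cache.h"

#include "cmds.h"
#include "libbcachefs.h"

/* Rereplicate only the extents btree of the filesystem mounted at fs_path. */
static int rereplicate_extents(const char *fs_path)
{
        struct bch_ioctl_data op = {
                .op             = BCH_DATA_OP_REREPLICATE,
                .start_btree    = BTREE_ID_extents,
                .start_pos      = POS_MIN,
                /* -b sets end_btree equal to start_btree; end_pos stays POS_MAX */
                .end_btree      = BTREE_ID_extents,
                .end_pos        = POS_MAX,
        };

        return bchu_data(bcache_fs_open(fs_path), op);
}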
index 461644762bf8039f2926ba60a0b5eef43a91feb2..4938ec07e3f49f409bd372af88834f91720ea7ab 100644 (file)
@@ -114,7 +114,7 @@ int cmd_dump(int argc, char *argv[])
        opt_set(opts, nochanges,        true);
        opt_set(opts, norecovery,       true);
        opt_set(opts, degraded,         true);
-       opt_set(opts, errors,           BCH_ON_ERROR_CONTINUE);
+       opt_set(opts, errors,           BCH_ON_ERROR_continue);
        opt_set(opts, fix_errors,       FSCK_OPT_YES);
 
        while ((opt = getopt(argc, argv, "o:fvh")) != -1)
@@ -317,13 +317,13 @@ static void print_node_ondisk(struct bch_fs *c, struct btree *b)
                        sectors = vstruct_sectors(bne, c->block_bits);
                }
 
-               fprintf(stdout, "  offset %u journal seq %llu\n",
-                       offset, le64_to_cpu(i->journal_seq));
+               fprintf(stdout, "  offset %u version %u, journal seq %llu\n",
+                       offset,
+                       le16_to_cpu(i->version),
+                       le64_to_cpu(i->journal_seq));
                offset += sectors;
 
-               for (k = i->start;
-                    k != vstruct_last(i);
-                    k = bkey_next_skip_noops(k, vstruct_last(i))) {
+               for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) {
                        struct bkey u;
                        char buf[4096];
 
@@ -387,25 +387,6 @@ static void list_nodes_keys(struct bch_fs *c, enum btree_id btree_id,
        bch2_trans_exit(&trans);
 }
 
-static struct bpos parse_pos(char *buf)
-{
-       char *s = buf, *field;
-       u64 inode_v = 0, offset_v = 0;
-
-       if (!(field = strsep(&s, ":")) ||
-           kstrtoull(field, 10, &inode_v))
-               die("invalid bpos %s", buf);
-
-       if ((field = strsep(&s, ":")) &&
-           kstrtoull(field, 10, &offset_v))
-               die("invalid bpos %s", buf);
-
-       if (s)
-               die("invalid bpos %s", buf);
-
-       return (struct bpos) { .inode = inode_v, .offset = offset_v };
-}
-
 static void list_keys_usage(void)
 {
        puts("bcachefs list - list filesystem metadata to stdout\n"
@@ -445,7 +426,7 @@ int cmd_list(int argc, char *argv[])
        opt_set(opts, nochanges,        true);
        opt_set(opts, norecovery,       true);
        opt_set(opts, degraded,         true);
-       opt_set(opts, errors,           BCH_ON_ERROR_CONTINUE);
+       opt_set(opts, errors,           BCH_ON_ERROR_continue);
 
        while ((opt = getopt(argc, argv, "b:s:e:i:m:fvh")) != -1)
                switch (opt) {
@@ -455,10 +436,10 @@ int cmd_list(int argc, char *argv[])
                        btree_id_end = btree_id_start + 1;
                        break;
                case 's':
-                       start   = parse_pos(optarg);
+                       start   = bpos_parse(optarg);
                        break;
                case 'e':
-                       end     = parse_pos(optarg);
+                       end     = bpos_parse(optarg);
                        break;
                case 'i':
                        if (kstrtoull(optarg, 10, &inum))
@@ -538,7 +519,7 @@ int cmd_list_journal(int argc, char *argv[])
        opt_set(opts, nochanges,        true);
        opt_set(opts, norecovery,       true);
        opt_set(opts, degraded,         true);
-       opt_set(opts, errors,           BCH_ON_ERROR_CONTINUE);
+       opt_set(opts, errors,           BCH_ON_ERROR_continue);
        opt_set(opts, fix_errors,       FSCK_OPT_YES);
        opt_set(opts, keep_journal,     true);
 
@@ -570,14 +551,10 @@ int cmd_list_journal(int argc, char *argv[])
                printf("journal entry   %8llu\n"
                       "    version     %8u\n"
                       "    last seq    %8llu\n"
-                      "    read clock  %8u\n"
-                      "    write clock %8u\n"
                       ,
                       le64_to_cpu(p->j.seq),
-                      le32_to_cpu(p->j.seq),
-                      le64_to_cpu(p->j.last_seq),
-                      le16_to_cpu(p->j.read_clock),
-                      le16_to_cpu(p->j.write_clock));
+                      le32_to_cpu(p->j.version),
+                      le64_to_cpu(p->j.last_seq));
 
                for_each_jset_key(k, _n, entry, &p->j) {
                        char buf[200];
index c311324a75dc17c2749bfadaf4c4e51a86cc45b9..f9e975abc2b93916e0ea820aac1d3845e241a416 100644 (file)
@@ -12,7 +12,9 @@
 #include <sys/types.h>
 #include <unistd.h>
 
+#include "libbcachefs/bcachefs.h"
 #include "libbcachefs/bcachefs_ioctl.h"
+#include "libbcachefs/journal.h"
 #include "libbcachefs/super-io.h"
 #include "cmds.h"
 #include "libbcachefs.h"
@@ -121,11 +123,9 @@ static void device_remove_usage(void)
 {
        puts("bcachefs device_remove - remove a device from a filesystem\n"
             "Usage:\n"
-            "  bcachefs device remove device\n"
-            "  bcachefs device remove --by-id path devid\n"
+            "  bcachefs device remove <device>|<devid> <path>\n"
             "\n"
             "Options:\n"
-            "  -i, --by-id                 Remove device by device id\n"
             "  -f, --force                 Force removal, even if some data\n"
             "                              couldn't be migrated\n"
             "  -F, --force-metadata        Force removal, even if some metadata\n"
@@ -146,14 +146,10 @@ int cmd_device_remove(int argc, char *argv[])
        };
        struct bchfs_handle fs;
        bool by_id = false;
-       int opt, flags = BCH_FORCE_IF_DEGRADED;
-       unsigned dev_idx;
+       int opt, flags = BCH_FORCE_IF_DEGRADED, dev_idx;
 
        while ((opt = getopt_long(argc, argv, "fh", longopts, NULL)) != -1)
                switch (opt) {
-               case 'i':
-                       by_id = true;
-                       break;
                case 'f':
                        flags |= BCH_FORCE_IF_DATA_LOST;
                        break;
@@ -165,27 +161,31 @@ int cmd_device_remove(int argc, char *argv[])
                }
        args_shift(optind);
 
-       if (by_id) {
-               char *path = arg_pop();
-               if (!path)
-                       die("Please supply filesystem to remove device from");
+       char *dev_str = arg_pop();
+       if (!dev_str)
+               die("Please supply a device");
 
-               dev_idx = (intptr_t) arg_pop();
-               if (!dev_idx)
-                       die("Please supply device id");
+       char *end;
+       dev_idx = strtoul(dev_str, &end, 10);
+       if (*dev_str && !*end)
+               by_id = true;
 
-               fs = bcache_fs_open(path);
+       char *fs_path = arg_pop();
+       if (fs_path) {
+               fs = bcache_fs_open(fs_path);
+
+               if (!by_id) {
+                       dev_idx = bchu_dev_path_to_idx(fs, dev_str);
+                       if (dev_idx < 0)
+                               die("%s does not seem to be a member of %s",
+                                   dev_str, fs_path);
+               }
+       } else if (!by_id) {
+               fs = bchu_fs_open_by_dev(dev_str, &dev_idx);
        } else {
-               char *dev = arg_pop();
-               if (!dev)
-                       die("Please supply a device to remove");
-
-               fs = bchu_fs_open_by_dev(dev, &dev_idx);
+               die("Filesystem path required when specifying device by id");
        }
 
-       if (argc)
-               die("too many arguments");
-
        bchu_disk_remove(fs, dev_idx, flags);
        return 0;
 }
@@ -220,7 +220,7 @@ int cmd_device_online(int argc, char *argv[])
        if (argc)
                die("too many arguments");
 
-       unsigned dev_idx;
+       int dev_idx;
        struct bchfs_handle fs = bchu_fs_open_by_dev(dev, &dev_idx);
        bchu_disk_online(fs, dev);
        return 0;
@@ -265,7 +265,7 @@ int cmd_device_offline(int argc, char *argv[])
        if (argc)
                die("too many arguments");
 
-       unsigned dev_idx;
+       int dev_idx;
        struct bchfs_handle fs = bchu_fs_open_by_dev(dev, &dev_idx);
        bchu_disk_offline(fs, dev_idx, flags);
        return 0;
@@ -301,20 +301,22 @@ int cmd_device_evacuate(int argc, char *argv[])
        if (argc)
                die("too many arguments");
 
-       unsigned dev_idx;
+       int dev_idx;
        struct bchfs_handle fs = bchu_fs_open_by_dev(dev_path, &dev_idx);
 
        struct bch_ioctl_dev_usage u = bchu_dev_usage(fs, dev_idx);
 
-       if (u.state == BCH_MEMBER_STATE_RW) {
+       if (u.state == BCH_MEMBER_STATE_rw) {
                printf("Setting %s readonly\n", dev_path);
-               bchu_disk_set_state(fs, dev_idx, BCH_MEMBER_STATE_RO, 0);
+               bchu_disk_set_state(fs, dev_idx, BCH_MEMBER_STATE_ro, 0);
        }
 
        return bchu_data(fs, (struct bch_ioctl_data) {
                .op             = BCH_DATA_OP_MIGRATE,
-               .start          = POS_MIN,
-               .end            = POS_MAX,
+               .start_btree    = 0,
+               .start_pos      = POS_MIN,
+               .end_btree      = BTREE_ID_NR,
+               .end_pos        = POS_MAX,
                .migrate.dev    = dev_idx,
        });
 }
@@ -322,7 +324,10 @@ int cmd_device_evacuate(int argc, char *argv[])
 static void device_set_state_usage(void)
 {
        puts("bcachefs device set-state\n"
-            "Usage: bcachefs device set-state device new-state\n"
+            "Usage: bcachefs device set-state <new-state> <device>|<devid> <path>\n"
+            "\n"
+            "<new-state>: one of rw, ro, failed or spare\n"
+            "<path>: path to mounted filesystem, optional unless specifying device by id\n"
             "\n"
             "Options:\n"
             "  -f, --force                 Force, if data redundancy will be degraded\n"
@@ -340,7 +345,9 @@ int cmd_device_set_state(int argc, char *argv[])
                { "help",                       0, NULL, 'h' },
                { NULL }
        };
-       int opt, flags = 0;
+       struct bchfs_handle fs;
+       bool by_id = false;
+       int opt, flags = 0, dev_idx;
        bool offline = false;
 
        while ((opt = getopt_long(argc, argv, "foh", longopts, NULL)) != -1)
@@ -356,31 +363,32 @@ int cmd_device_set_state(int argc, char *argv[])
                }
        args_shift(optind);
 
-       char *dev_path = arg_pop();
-       if (!dev_path)
-               die("Please supply a device");
-
        char *new_state_str = arg_pop();
        if (!new_state_str)
                die("Please supply a device state");
 
        unsigned new_state = read_string_list_or_die(new_state_str,
-                                       bch2_dev_state, "device state");
+                                       bch2_member_states, "device state");
 
-       if (!offline) {
-               unsigned dev_idx;
-               struct bchfs_handle fs = bchu_fs_open_by_dev(dev_path, &dev_idx);
+       char *dev_str = arg_pop();
+       if (!dev_str)
+               die("Please supply a device");
 
-               bchu_disk_set_state(fs, dev_idx, new_state, flags);
+       char *end;
+       dev_idx = strtoul(dev_str, &end, 10);
+       if (*dev_str && !*end)
+               by_id = true;
 
-               bcache_fs_close(fs);
-       } else {
+       if (offline) {
                struct bch_opts opts = bch2_opts_empty();
                struct bch_sb_handle sb = { NULL };
 
-               int ret = bch2_read_super(dev_path, &opts, &sb);
+               if (by_id)
+                       die("Cannot specify offline device by id");
+
+               int ret = bch2_read_super(dev_str, &opts, &sb);
                if (ret)
-                       die("error opening %s: %s", dev_path, strerror(-ret));
+                       die("error opening %s: %s", dev_str, strerror(-ret));
 
                struct bch_member *m = bch2_sb_get_members(sb.sb)->members + sb.sb->dev_idx;
 
@@ -390,8 +398,27 @@ int cmd_device_set_state(int argc, char *argv[])
 
                bch2_super_write(sb.bdev->bd_fd, sb.sb);
                bch2_free_super(&sb);
+               return 0;
+       }
+
+       char *fs_path = arg_pop();
+       if (fs_path) {
+               fs = bcache_fs_open(fs_path);
+
+               if (!by_id) {
+                       dev_idx = bchu_dev_path_to_idx(fs, dev_str);
+                       if (dev_idx < 0)
+                               die("%s does not seem to be a member of %s",
+                                   dev_str, fs_path);
+               }
+       } else if (!by_id) {
+               fs = bchu_fs_open_by_dev(dev_str, &dev_idx);
+       } else {
+               die("Filesystem path required when specifying device by id");
        }
 
+       bchu_disk_set_state(fs, dev_idx, new_state, flags);
+
        return 0;
 }
 
@@ -496,3 +523,103 @@ int cmd_device_resize(int argc, char *argv[])
        }
        return 0;
 }
+
+static void device_resize_journal_usage(void)
+{
+       puts("bcachefs device resize-journal \n"
+            "Usage: bcachefs device resize-journal device [ size ]\n"
+            "\n"
+            "Options:\n"
+            "  -h, --help                  display this help and exit\n"
+            "Report bugs to <linux-bcache@vger.kernel.org>");
+       exit(EXIT_SUCCESS);
+}
+
+int cmd_device_resize_journal(int argc, char *argv[])
+{
+       static const struct option longopts[] = {
+               { "help",                       0, NULL, 'h' },
+               { NULL }
+       };
+       u64 size;
+       int opt;
+
+       while ((opt = getopt_long(argc, argv, "h", longopts, NULL)) != -1)
+               switch (opt) {
+               case 'h':
+                       device_resize_journal_usage();
+               }
+       args_shift(optind);
+
+       char *dev = arg_pop();
+       if (!dev)
+               die("Please supply a device");
+
+       int dev_fd = xopen(dev, O_RDONLY);
+
+       char *size_arg = arg_pop();
+       if (!size_arg)
+               size = get_size(dev, dev_fd);
+       else if (bch2_strtoull_h(size_arg, &size))
+               die("invalid size");
+
+       size >>= 9;
+
+       if (argc)
+               die("Too many arguments");
+
+       struct stat dev_stat = xfstat(dev_fd);
+
+       struct mntent *mount = dev_to_mount(dev);
+       if (mount) {
+               if (!S_ISBLK(dev_stat.st_mode))
+                       die("%s is mounted but isn't a block device?!", dev);
+
+               struct bchfs_handle fs = bcache_fs_open(mount->mnt_dir);
+
+               unsigned idx = bchu_disk_get_idx(fs, dev_stat.st_rdev);
+
+               struct bch_sb *sb = bchu_read_super(fs, -1);
+               if (idx >= sb->nr_devices)
+                       die("error reading superblock: dev idx >= sb->nr_devices");
+
+               struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
+               if (!mi)
+                       die("error reading superblock: no member info");
+
+               /* could also just read this out of sysfs... meh */
+               struct bch_member *m = mi->members + idx;
+
+               u64 nbuckets = size / le16_to_cpu(m->bucket_size);
+
+               printf("resizing journal on %s to %llu buckets\n", dev, nbuckets);
+               bchu_disk_resize_journal(fs, idx, nbuckets);
+       } else {
+               printf("%s is offline - starting:\n", dev);
+
+               struct bch_fs *c = bch2_fs_open(&dev, 1, bch2_opts_empty());
+               if (IS_ERR(c))
+                       die("error opening %s: %s", dev, strerror(-PTR_ERR(c)));
+
+               struct bch_dev *ca, *resize = NULL;
+               unsigned i;
+
+               for_each_online_member(ca, c, i) {
+                       if (resize)
+                               die("confused: more than one online device?");
+                       resize = ca;
+                       percpu_ref_get(&resize->io_ref);
+               }
+
+               u64 nbuckets = size / le16_to_cpu(resize->mi.bucket_size);
+
+               printf("resizing journal on %s to %llu buckets\n", dev, nbuckets);
+               int ret = bch2_set_nr_journal_buckets(c, resize, nbuckets);
+               if (ret)
+                       fprintf(stderr, "resize error: %s\n", strerror(-ret));
+
+               percpu_ref_put(&resize->io_ref);
+               bch2_fs_stop(c);
+       }
+       return 0;
+}
index 673c63a70b452f1a3e00f6fac97e9ae55a4db8a0..b88ffe91f6c05c8751d285a2293725dfb5d93234 100644 (file)
@@ -36,11 +36,14 @@ x(0,        no_passphrase,          no_argument)            \
 x('L', label,                  required_argument)      \
 x('U', uuid,                   required_argument)      \
 x(0,   fs_size,                required_argument)      \
+x(0,   superblock_size,        required_argument)      \
 x(0,   bucket_size,            required_argument)      \
 x('g', group,                  required_argument)      \
 x(0,   discard,                no_argument)            \
 x(0,   data_allowed,           required_argument)      \
 x(0,   durability,             required_argument)      \
+x(0,   version,                required_argument)      \
+x(0,   no_initialize,          no_argument)            \
 x('f', force,                  no_argument)            \
 x('q', quiet,                  no_argument)            \
 x('h', help,                   no_argument)
@@ -60,6 +63,7 @@ static void usage(void)
             "      --no_passphrase         Don't encrypt master encryption key\n"
             "  -L, --label=label\n"
             "  -U, --uuid=uuid\n"
+            "      --superblock_size=size\n"
             "\n"
             "Device specific options:");
 
@@ -112,7 +116,7 @@ int cmd_format(int argc, char *argv[])
        darray(char *) device_paths;
        struct format_opts opts = format_opts_default();
        struct dev_opts dev_opts = dev_opts_default(), *dev;
-       bool force = false, no_passphrase = false, quiet = false;
+       bool force = false, no_passphrase = false, quiet = false, initialize = true;
        unsigned v;
        int opt;
 
@@ -162,6 +166,12 @@ int cmd_format(int argc, char *argv[])
 
                        dev_opts.size >>= 9;
                        break;
+               case O_superblock_size:
+                       if (bch2_strtouint_h(optarg, &opts.superblock_size))
+                               die("invalid filesystem size");
+
+                       opts.superblock_size >>= 9;
+                       break;
                case O_bucket_size:
                        dev_opts.bucket_size =
                                hatoi_validate(optarg, "bucket size");
@@ -183,6 +193,13 @@ int cmd_format(int argc, char *argv[])
                            dev_opts.durability > BCH_REPLICAS_MAX)
                                die("invalid durability");
                        break;
+               case O_version:
+                       if (kstrtouint(optarg, 10, &opts.version))
+                               die("invalid version");
+                       break;
+               case O_no_initialize:
+                       initialize = false;
+                       break;
                case O_no_opt:
                        darray_append(device_paths, optarg);
                        dev_opts.path = optarg;
@@ -206,8 +223,10 @@ int cmd_format(int argc, char *argv[])
        if (darray_empty(devices))
                die("Please supply a device");
 
-       if (opts.encrypted && !no_passphrase)
+       if (opts.encrypted && !no_passphrase) {
                opts.passphrase = read_passphrase_twice("Enter passphrase: ");
+               initialize = false;
+       }
 
        darray_foreach(dev, devices)
                dev->fd = open_for_format(dev->path, force);
@@ -229,7 +248,7 @@ int cmd_format(int argc, char *argv[])
 
        darray_free(devices);
 
-       if (!opts.passphrase) {
+       if (initialize) {
                /*
                 * Start the filesystem once, to allocate the journal and create
                 * the root directory:
index f0b67b668ec982c1d21b5e7e79c3757163050947..8b9d91b80d162a051ce7d7920f43e57dd34a4e98 100644 (file)
--- a/cmd_fs.c
+++ b/cmd_fs.c
@@ -22,7 +22,7 @@ static void print_dev_usage_type(const char *type,
        u64 frag = max((s64) buckets * bucket_size - (s64) sectors, 0LL);
 
        printf_pad(20, "  %s:", type);
-       printf("%12s%12llu%12s\n",
+       printf(" %15s %15llu %15s\n",
               pr_units(sectors, units),
               buckets,
               pr_units(frag, units));
@@ -37,18 +37,17 @@ static void print_dev_usage(struct bchfs_handle fs,
 
        printf("\n");
        printf_pad(20, "%s (device %u):", d->label ?: "(no label)", d->idx);
-       printf("%24s%12s\n", d->dev ?: "(device not found)", bch2_dev_state[u.state]);
+       printf("%30s%16s\n", d->dev ?: "(device not found)", bch2_member_states[u.state]);
 
-       printf("%-20s%12s%12s%12s\n",
+       printf("%-20s%16s%16s%16s\n",
               "", "data", "buckets", "fragmented");
 
-       for (i = BCH_DATA_sb; i < BCH_DATA_NR; i++) {
+       for (i = BCH_DATA_sb; i < BCH_DATA_NR; i++)
                print_dev_usage_type(bch2_data_types[i],
                                     u.bucket_size,
                                     u.buckets[i],
                                     u.sectors[i],
                                     units);
-       }
 
        print_dev_usage_type("erasure coded",
                             u.bucket_size,
@@ -57,12 +56,12 @@ static void print_dev_usage(struct bchfs_handle fs,
                             units);
 
        printf_pad(20, "  available:");
-       printf("%12s%12llu\n",
+       printf(" %15s %15llu\n",
               pr_units(u.available_buckets * u.bucket_size, units),
               u.available_buckets);
 
        printf_pad(20, "  capacity:");
-       printf("%12s%12llu\n",
+       printf(" %15s %15llu\n",
               pr_units(u.nr_buckets * u.bucket_size, units),
               u.nr_buckets);
 }
index 9ef69ad8002502cc62f67ec150924082bdf3ec4e..247e2072ab67c73551e9ca87e2ae77f779f0798a 100644 (file)
@@ -93,10 +93,14 @@ int cmd_fsck(int argc, char *argv[])
                exit(8);
        }
 
-       if (test_bit(BCH_FS_ERRORS_FIXED, &c->flags))
+       if (test_bit(BCH_FS_ERRORS_FIXED, &c->flags)) {
+               fprintf(stderr, "%s: errors fixed\n", c->name);
                ret |= 1;
-       if (test_bit(BCH_FS_ERROR, &c->flags))
+       }
+       if (test_bit(BCH_FS_ERROR, &c->flags)) {
+               fprintf(stderr, "%s: still has errors\n", c->name);
                ret |= 4;
+       }
 
        bch2_fs_stop(c);
        return ret;
index 797c51e0ef540988947e7ba82bde3eb022a06a65..a0d27427a13487adb0c3169472ade84f48aa4f47 100644 (file)
@@ -122,8 +122,8 @@ static void update_inode(struct bch_fs *c,
        struct bkey_inode_buf packed;
        int ret;
 
-       bch2_inode_pack(&packed, inode);
-       ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i,
+       bch2_inode_pack(c, &packed, inode);
+       ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i,
                                NULL, NULL, 0);
        if (ret)
                die("error updating inode: %s", strerror(-ret));
@@ -301,7 +301,7 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst,
 
        while (length) {
                struct bkey_i_extent *e;
-               BKEY_PADDED(k) k;
+               __BKEY_PADDED(k, BKEY_EXTENT_VAL_U64s_MAX) k;
                u64 b = sector_to_bucket(ca, physical);
                struct disk_reservation res;
                unsigned sectors;
@@ -329,7 +329,7 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst,
 
                bch2_mark_bkey_replicas(c, extent_i_to_s_c(e).s_c);
 
-               ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &e->k_i,
+               ret = bch2_btree_insert(c, BTREE_ID_extents, &e->k_i,
                                        &res, NULL, 0);
                if (ret)
                        die("btree insert error %s", strerror(-ret));
@@ -599,7 +599,9 @@ static void copy_fs(struct bch_fs *c, int src_fd, const char *src_path,
        bch2_alloc_write(c, false);
 }
 
-static void find_superblock_space(ranges extents, struct dev_opts *dev)
+static void find_superblock_space(ranges extents,
+                                 struct format_opts opts,
+                                 struct dev_opts *dev)
 {
        struct range *i;
 
@@ -609,9 +611,10 @@ static void find_superblock_space(ranges extents, struct dev_opts *dev)
                u64 end = round_down(i->end,
                                     dev->bucket_size << 9);
 
-               if (start + (128 << 10) <= end) {
+               /* Need space for two superblocks: */
+               if (start + (opts.superblock_size << 9) * 2 <= end) {
                        dev->sb_offset  = start >> 9;
-                       dev->sb_end     = dev->sb_offset + 256;
+                       dev->sb_end     = dev->sb_offset + opts.superblock_size * 2;
                        return;
                }
        }
@@ -673,7 +676,7 @@ static int migrate_fs(const char            *fs_path,
                                get_size(dev.path, dev.fd) / 5,
                                &bcachefs_inum, stat.st_dev, force);
 
-       find_superblock_space(extents, &dev);
+       find_superblock_space(extents, format_opts, &dev);
 
        struct bch_sb *sb = bch2_format(fs_opt_strs,
                                        fs_opts,format_opts, &dev, 1);
diff --git a/cmds.h b/cmds.h
index bcd27adc5350a5474f21d9a4d77cfa1f52e1fe54..cc490844dc8684533a9ff99ce6d039d71999d186 100644 (file)
--- a/cmds.h
+++ b/cmds.h
@@ -28,8 +28,10 @@ int cmd_device_offline(int argc, char *argv[]);
 int cmd_device_evacuate(int argc, char *argv[]);
 int cmd_device_set_state(int argc, char *argv[]);
 int cmd_device_resize(int argc, char *argv[]);
+int cmd_device_resize_journal(int argc, char *argv[]);
 
 int cmd_data_rereplicate(int argc, char *argv[]);
+int cmd_data_job(int argc, char *argv[]);
 
 int cmd_unlock(int argc, char *argv[]);
 int cmd_set_passphrase(int argc, char *argv[]);
index 684da48be1d44fa53da0cbafb09f4d3d2acbbb1b..3bd589849874f2594dfcc777eb46d30b7d20d8c6 100644 (file)
@@ -1,3 +1,12 @@
+bcachefs-tools (0.1+git20210404.ce906d66-1) UNRELEASED; urgency=medium
+
+  * New upstream snapshot
+  * Update standards version to 4.5.1
+
+  Currently unreleased due to test failures.
+
+ -- Jonathan Carter <jcc@debian.org>  Tue, 06 Apr 2021 15:11:27 +0200
+
 bcachefs-tools (0.1+git20201025.742dbbdb-1) unstable; urgency=medium
 
   * New upstream snapshot
index 81ffb467249bda2e9492eb3a1dfb4ac166c3c106..caf1b0d45b700174aab093471a62d1cab009b101 100644 (file)
@@ -2,7 +2,7 @@ Source: bcachefs-tools
 Maintainer: Jonathan Carter <jcc@debian.org>
 Section: utils
 Priority: optional
-Standards-Version: 4.5.0
+Standards-Version: 4.5.1
 Rules-Requires-Root: no
 Build-Depends: debhelper-compat (= 13),
                pkg-config,
index ba38766d3d47dce20264955b937eea51521bab59..d1acbd3cf326626a17b26587ec306fb923357eda 100644 (file)
@@ -1 +1 @@
-bcachefs-tools_0.1+git20201025.742dbbdb-1_source.buildinfo utils optional
+bcachefs-tools_0.1+git20210404.ce906d66-1_source.buildinfo utils optional
index f2183d5430ba49affa395b20b0a6acf3c444b2c4..2fe736e95b86cc333b50019e049c0490891ddfbb 100644 (file)
@@ -85,6 +85,17 @@ static inline bool test_and_set_bit(long nr, volatile unsigned long *addr)
        return (old & mask) != 0;
 }
 
+static inline bool test_and_clear_bit(long nr, volatile unsigned long *addr)
+{
+       unsigned long mask = BIT_MASK(nr);
+       unsigned long *p = ((unsigned long *) addr) + BIT_WORD(nr);
+       unsigned long old;
+
+       old = __atomic_fetch_and(p, ~mask, __ATOMIC_RELAXED);
+
+       return (old & mask) != 0;
+}
+
 static inline void clear_bit_unlock(long nr, volatile unsigned long *addr)
 {
        unsigned long mask = BIT_MASK(nr);
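
The new test_and_clear_bit() mirrors test_and_set_bit() above: it atomically clears the bit and reports whether it was previously set. A standalone user-space sketch of the same semantics, with BIT_MASK/BIT_WORD re-declared locally so it compiles outside this tree (assumes a GCC/clang toolchain for the __atomic builtin):

#include <stdbool.h>
#include <stdio.h>

#define BITS_PER_LONG   (8 * sizeof(unsigned long))
#define BIT_MASK(nr)    (1UL << ((nr) % BITS_PER_LONG))
#define BIT_WORD(nr)    ((nr) / BITS_PER_LONG)

static inline bool test_and_clear_bit(long nr, volatile unsigned long *addr)
{
        unsigned long mask = BIT_MASK(nr);
        unsigned long *p = ((unsigned long *) addr) + BIT_WORD(nr);
        unsigned long old = __atomic_fetch_and(p, ~mask, __ATOMIC_RELAXED);

        return (old & mask) != 0;
}

int main(void)
{
        unsigned long flags[2] = { BIT_MASK(5), 0 };

        /* First call sees bit 5 set and clears it; second call sees it clear. */
        printf("%d %d\n", test_and_clear_bit(5, flags),
                          test_and_clear_bit(5, flags));
        return 0;
}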
index 024d645c5579914b386b8dcdd10cb4a7eff5d5e8..bfab7ea70eb56cab400de1a57f46d08c1210207c 100644 (file)
@@ -10,6 +10,8 @@
 #define cpu_present(cpu)       ((cpu) == 0)
 #define cpu_active(cpu)                ((cpu) == 0)
 
+#define raw_smp_processor_id() 0U
+
 #define for_each_cpu(cpu, mask)                        \
        for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
 #define for_each_cpu_not(cpu, mask)            \
index 3a91130a4fbd54040301f7da2baf4a242b14ab67..f09689dafb008114f9199167f6d18918d0129a9d 100644 (file)
@@ -183,6 +183,14 @@ void *__genradix_iter_peek(struct genradix_iter *, struct __genradix *, size_t);
 static inline void __genradix_iter_advance(struct genradix_iter *iter,
                                           size_t obj_size)
 {
+       size_t new_offset = iter->offset + obj_size;
+
+       if (new_offset < iter->offset) {
+               iter->offset    = SIZE_MAX;
+               iter->pos       = SIZE_MAX;
+               return;
+       }
+
        iter->offset += obj_size;
 
        if (!is_power_of_2(obj_size) &&
index 10d94c5eca5f453303ae368f78f9f7622ffaa490..4b45306d0ba1b517bc492c91031e04633664687a 100644 (file)
@@ -219,4 +219,6 @@ struct qstr {
 
 #define POISON_FREE 0x6b
 
+static inline void dump_stack(void) {}
+
 #endif
index 4a317090621c4eda065d9e70b11493b4e5c1c479..3639dc997ed43de7ca1359ae3dc5dee299937083 100644 (file)
@@ -26,7 +26,6 @@
 #define list_for_each_entry(p, h, m)   cds_list_for_each_entry(p, h, m)
 #define list_for_each_entry_reverse(p, h, m) cds_list_for_each_entry_reverse(p, h, m)
 #define list_for_each_entry_safe(p, n, h, m) cds_list_for_each_entry_safe(p, n, h, m)
-#define list_for_each_entry_safe_reverse(p, n, h, m) cds_list_for_each_entry_safe_reverse(p, n, h, m)
 
 static inline int list_empty_careful(const struct list_head *head)
 {
@@ -54,6 +53,15 @@ static inline void list_splice_init(struct list_head *list,
 #define list_first_entry_or_null(ptr, type, member) \
        (!list_empty(ptr) ? list_first_entry(ptr, type, member) : NULL)
 
+#define list_prev_entry(pos, member) \
+       list_entry((pos)->member.prev, typeof(*(pos)), member)
+
+#define list_for_each_entry_safe_reverse(pos, n, head, member)         \
+       for (pos = list_last_entry(head, typeof(*pos), member),         \
+               n = list_prev_entry(pos, member);                       \
+            &pos->member != (head);                                    \
+            pos = n, n = list_prev_entry(n, member))
+
 /* hlists: */
 
 #include <urcu/hlist.h>
diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h
new file mode 100644 (file)
index 0000000..fa6e847
--- /dev/null
@@ -0,0 +1,145 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_LIST_NULLS_H
+#define _LINUX_LIST_NULLS_H
+
+#include <linux/poison.h>
+#include <linux/const.h>
+
+/*
+ * Special version of lists, where end of list is not a NULL pointer,
+ * but a 'nulls' marker, which can have many different values.
+ * (up to 2^31 different values guaranteed on all platforms)
+ *
+ * In the standard hlist, termination of a list is the NULL pointer.
+ * In this special 'nulls' variant, we use the fact that objects stored in
+ * a list are aligned on a word (4 or 8 bytes alignment).
+ * We therefore use the least significant bit of 'ptr':
+ * Set to 1 : This is a 'nulls' end-of-list marker (ptr >> 1)
+ * Set to 0 : This is a pointer to some object (ptr)
+ */
+
+struct hlist_nulls_head {
+       struct hlist_nulls_node *first;
+};
+
+struct hlist_nulls_node {
+       struct hlist_nulls_node *next, **pprev;
+};
+#define NULLS_MARKER(value) (1UL | (((long)value) << 1))
+#define INIT_HLIST_NULLS_HEAD(ptr, nulls) \
+       ((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls))
+
+#define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member)
+
+#define hlist_nulls_entry_safe(ptr, type, member) \
+       ({ typeof(ptr) ____ptr = (ptr); \
+          !is_a_nulls(____ptr) ? hlist_nulls_entry(____ptr, type, member) : NULL; \
+       })
+/**
+ * is_a_nulls - Test if a ptr is a nulls marker
+ * @ptr: ptr to be tested
+ *
+ */
+static inline int is_a_nulls(const struct hlist_nulls_node *ptr)
+{
+       return ((unsigned long)ptr & 1);
+}
+
+/**
+ * get_nulls_value - Get the 'nulls' value of the end of chain
+ * @ptr: end of chain
+ *
+ * Should be called only if is_a_nulls(ptr);
+ */
+static inline unsigned long get_nulls_value(const struct hlist_nulls_node *ptr)
+{
+       return ((unsigned long)ptr) >> 1;
+}
+
+/**
+ * hlist_nulls_unhashed - Has node been removed and reinitialized?
+ * @h: Node to be checked
+ *
+ * Note that not all removal functions will leave a node in unhashed state.
+ * For example, hlist_del_init_rcu() leaves the node in unhashed state,
+ * but hlist_nulls_del() does not.
+ */
+static inline int hlist_nulls_unhashed(const struct hlist_nulls_node *h)
+{
+       return !h->pprev;
+}
+
+/**
+ * hlist_nulls_unhashed_lockless - Has node been removed and reinitialized?
+ * @h: Node to be checked
+ *
+ * Note that not all removal functions will leave a node in unhashed state.
+ * For example, hlist_del_init_rcu() leaves the node in unhashed state,
+ * but hlist_nulls_del() does not.  Unlike hlist_nulls_unhashed(), this
+ * function may be used locklessly.
+ */
+static inline int hlist_nulls_unhashed_lockless(const struct hlist_nulls_node *h)
+{
+       return !READ_ONCE(h->pprev);
+}
+
+static inline int hlist_nulls_empty(const struct hlist_nulls_head *h)
+{
+       return is_a_nulls(READ_ONCE(h->first));
+}
+
+static inline void hlist_nulls_add_head(struct hlist_nulls_node *n,
+                                       struct hlist_nulls_head *h)
+{
+       struct hlist_nulls_node *first = h->first;
+
+       n->next = first;
+       WRITE_ONCE(n->pprev, &h->first);
+       h->first = n;
+       if (!is_a_nulls(first))
+               WRITE_ONCE(first->pprev, &n->next);
+}
+
+static inline void __hlist_nulls_del(struct hlist_nulls_node *n)
+{
+       struct hlist_nulls_node *next = n->next;
+       struct hlist_nulls_node **pprev = n->pprev;
+
+       WRITE_ONCE(*pprev, next);
+       if (!is_a_nulls(next))
+               WRITE_ONCE(next->pprev, pprev);
+}
+
+static inline void hlist_nulls_del(struct hlist_nulls_node *n)
+{
+       __hlist_nulls_del(n);
+       WRITE_ONCE(n->pprev, LIST_POISON2);
+}
+
+/**
+ * hlist_nulls_for_each_entry  - iterate over list of given type
+ * @tpos:      the type * to use as a loop cursor.
+ * @pos:       the &struct hlist_node to use as a loop cursor.
+ * @head:      the head for your list.
+ * @member:    the name of the hlist_node within the struct.
+ *
+ */
+#define hlist_nulls_for_each_entry(tpos, pos, head, member)                   \
+       for (pos = (head)->first;                                              \
+            (!is_a_nulls(pos)) &&                                             \
+               ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \
+            pos = pos->next)
+
+/**
+ * hlist_nulls_for_each_entry_from - iterate over a hlist continuing from current point
+ * @tpos:      the type * to use as a loop cursor.
+ * @pos:       the &struct hlist_node to use as a loop cursor.
+ * @member:    the name of the hlist_node within the struct.
+ *
+ */
+#define hlist_nulls_for_each_entry_from(tpos, pos, member)     \
+       for (; (!is_a_nulls(pos)) &&                            \
+               ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \
+            pos = pos->next)
+
+#endif
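
The point of the 'nulls' marker scheme above is that a lockless (RCU) lookup walking a hash chain can tell from the end-of-list value whether it finished in the bucket it started in, or was moved to another chain by a concurrent rehash. A standalone user-space sketch of the marker mechanics, with the relevant pieces re-declared locally (no RCU, no READ_ONCE) so it compiles on its own:

#include <stdio.h>

struct hlist_nulls_node {
        struct hlist_nulls_node *next, **pprev;
};

struct hlist_nulls_head {
        struct hlist_nulls_node *first;
};

#define NULLS_MARKER(value) (1UL | (((long)value) << 1))
#define INIT_HLIST_NULLS_HEAD(ptr, nulls) \
        ((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls))

static int is_a_nulls(const struct hlist_nulls_node *ptr)
{
        return (unsigned long) ptr & 1;
}

static unsigned long get_nulls_value(const struct hlist_nulls_node *ptr)
{
        return (unsigned long) ptr >> 1;
}

static void hlist_nulls_add_head(struct hlist_nulls_node *n,
                                 struct hlist_nulls_head *h)
{
        struct hlist_nulls_node *first = h->first;

        n->next = first;
        n->pprev = &h->first;
        h->first = n;
        if (!is_a_nulls(first))
                first->pprev = &n->next;
}

int main(void)
{
        struct hlist_nulls_head bucket;
        struct hlist_nulls_node a, b, *pos;

        /* Terminate the chain with marker value 7 (e.g. the bucket index). */
        INIT_HLIST_NULLS_HEAD(&bucket, 7);
        hlist_nulls_add_head(&a, &bucket);
        hlist_nulls_add_head(&b, &bucket);

        for (pos = bucket.first; !is_a_nulls(pos); pos = pos->next)
                printf("node at %p\n", (void *) pos);

        /* A lookup that raced with a rehash would see a different value here. */
        printf("end-of-chain marker: %lu\n", get_nulls_value(pos));
        return 0;
}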
diff --git a/include/linux/overflow.h b/include/linux/overflow.h
new file mode 100644 (file)
index 0000000..ef74051
--- /dev/null
@@ -0,0 +1,346 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+#ifndef __LINUX_OVERFLOW_H
+#define __LINUX_OVERFLOW_H
+
+#include <linux/compiler.h>
+#include <linux/limits.h>
+
+/*
+ * In the fallback code below, we need to compute the minimum and
+ * maximum values representable in a given type. These macros may also
+ * be useful elsewhere, so we provide them outside the
+ * COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW block.
+ *
+ * It would seem more obvious to do something like
+ *
+ * #define type_min(T) (T)(is_signed_type(T) ? (T)1 << (8*sizeof(T)-1) : 0)
+ * #define type_max(T) (T)(is_signed_type(T) ? ((T)1 << (8*sizeof(T)-1)) - 1 : ~(T)0)
+ *
+ * Unfortunately, the middle expressions, strictly speaking, have
+ * undefined behaviour, and at least some versions of gcc warn about
+ * the type_max expression (but not if -fsanitize=undefined is in
+ * effect; in that case, the warning is deferred to runtime...).
+ *
+ * The slightly excessive casting in type_min is to make sure the
+ * macros also produce sensible values for the exotic type _Bool. [The
+ * overflow checkers only almost work for _Bool, but that's
+ * a-feature-not-a-bug, since people shouldn't be doing arithmetic on
+ * _Bools. Besides, the gcc builtins don't allow _Bool* as third
+ * argument.]
+ *
+ * Idea stolen from
+ * https://mail-index.netbsd.org/tech-misc/2007/02/05/0000.html -
+ * credit to Christian Biere.
+ */
+#define is_signed_type(type)       (((type)(-1)) < (type)1)
+#define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type)))
+#define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T)))
+#define type_min(T) ((T)((T)-type_max(T)-(T)1))
+
+/*
+ * Avoids triggering -Wtype-limits compilation warning,
+ * while using unsigned data types to check a < 0.
+ */
+#define is_non_negative(a) ((a) > 0 || (a) == 0)
+#define is_negative(a) (!(is_non_negative(a)))
+
+/*
+ * Allows for effectively applying __must_check to a macro so we can have
+ * both the type-agnostic benefits of the macros while also being able to
+ * enforce that the return value is, in fact, checked.
+ */
+static inline bool __must_check __must_check_overflow(bool overflow)
+{
+       return unlikely(overflow);
+}
+
+#ifdef COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW
+/*
+ * For simplicity and code hygiene, the fallback code below insists on
+ * a, b and *d having the same type (similar to the min() and max()
+ * macros), whereas gcc's type-generic overflow checkers accept
+ * different types. Hence we don't just make check_add_overflow an
+ * alias for __builtin_add_overflow, but add type checks similar to
+ * below.
+ */
+#define check_add_overflow(a, b, d) __must_check_overflow(({   \
+       typeof(a) __a = (a);                    \
+       typeof(b) __b = (b);                    \
+       typeof(d) __d = (d);                    \
+       (void) (&__a == &__b);                  \
+       (void) (&__a == __d);                   \
+       __builtin_add_overflow(__a, __b, __d);  \
+}))
+
+#define check_sub_overflow(a, b, d) __must_check_overflow(({   \
+       typeof(a) __a = (a);                    \
+       typeof(b) __b = (b);                    \
+       typeof(d) __d = (d);                    \
+       (void) (&__a == &__b);                  \
+       (void) (&__a == __d);                   \
+       __builtin_sub_overflow(__a, __b, __d);  \
+}))
+
+#define check_mul_overflow(a, b, d) __must_check_overflow(({   \
+       typeof(a) __a = (a);                    \
+       typeof(b) __b = (b);                    \
+       typeof(d) __d = (d);                    \
+       (void) (&__a == &__b);                  \
+       (void) (&__a == __d);                   \
+       __builtin_mul_overflow(__a, __b, __d);  \
+}))
+
+#else
+
+
+/* Checking for unsigned overflow is relatively easy without causing UB. */
+#define __unsigned_add_overflow(a, b, d) ({    \
+       typeof(a) __a = (a);                    \
+       typeof(b) __b = (b);                    \
+       typeof(d) __d = (d);                    \
+       (void) (&__a == &__b);                  \
+       (void) (&__a == __d);                   \
+       *__d = __a + __b;                       \
+       *__d < __a;                             \
+})
+#define __unsigned_sub_overflow(a, b, d) ({    \
+       typeof(a) __a = (a);                    \
+       typeof(b) __b = (b);                    \
+       typeof(d) __d = (d);                    \
+       (void) (&__a == &__b);                  \
+       (void) (&__a == __d);                   \
+       *__d = __a - __b;                       \
+       __a < __b;                              \
+})
+/*
+ * If one of a or b is a compile-time constant, this avoids a division.
+ */
+#define __unsigned_mul_overflow(a, b, d) ({            \
+       typeof(a) __a = (a);                            \
+       typeof(b) __b = (b);                            \
+       typeof(d) __d = (d);                            \
+       (void) (&__a == &__b);                          \
+       (void) (&__a == __d);                           \
+       *__d = __a * __b;                               \
+       __builtin_constant_p(__b) ?                     \
+         __b > 0 && __a > type_max(typeof(__a)) / __b : \
+         __a > 0 && __b > type_max(typeof(__b)) / __a;  \
+})
+
+/*
+ * For signed types, detecting overflow is much harder, especially if
+ * we want to avoid UB. But the interface of these macros is such that
+ * we must provide a result in *d, and in fact we must produce the
+ * result promised by gcc's builtins, which is simply the possibly
+ * wrapped-around value. Fortunately, we can just formally do the
+ * operations in the widest relevant unsigned type (u64) and then
+ * truncate the result - gcc is smart enough to generate the same code
+ * with and without the (u64) casts.
+ */
+
+/*
+ * Adding two signed integers can overflow only if they have the same
+ * sign, and overflow has happened iff the result has the opposite
+ * sign.
+ */
+#define __signed_add_overflow(a, b, d) ({      \
+       typeof(a) __a = (a);                    \
+       typeof(b) __b = (b);                    \
+       typeof(d) __d = (d);                    \
+       (void) (&__a == &__b);                  \
+       (void) (&__a == __d);                   \
+       *__d = (u64)__a + (u64)__b;             \
+       (((~(__a ^ __b)) & (*__d ^ __a))        \
+               & type_min(typeof(__a))) != 0;  \
+})
+
+/*
+ * Subtraction is similar, except that overflow can now happen only
+ * when the signs are opposite. In this case, overflow has happened if
+ * the result has the opposite sign of a.
+ */
+#define __signed_sub_overflow(a, b, d) ({      \
+       typeof(a) __a = (a);                    \
+       typeof(b) __b = (b);                    \
+       typeof(d) __d = (d);                    \
+       (void) (&__a == &__b);                  \
+       (void) (&__a == __d);                   \
+       *__d = (u64)__a - (u64)__b;             \
+       ((((__a ^ __b)) & (*__d ^ __a))         \
+               & type_min(typeof(__a))) != 0;  \
+})
+
+/*
+ * Signed multiplication is rather hard. gcc always follows C99, so
+ * division is truncated towards 0. This means that we can write the
+ * overflow check like this:
+ *
+ * (a > 0 && (b > MAX/a || b < MIN/a)) ||
+ * (a < -1 && (b > MIN/a || b < MAX/a) ||
+ * (a == -1 && b == MIN)
+ *
+ * The redundant casts of -1 are to silence an annoying -Wtype-limits
+ * (included in -Wextra) warning: When the type is u8 or u16, the
+ * __b_c_e in check_mul_overflow obviously selects
+ * __unsigned_mul_overflow, but unfortunately gcc still parses this
+ * code and warns about the limited range of __b.
+ */
+
+#define __signed_mul_overflow(a, b, d) ({                              \
+       typeof(a) __a = (a);                                            \
+       typeof(b) __b = (b);                                            \
+       typeof(d) __d = (d);                                            \
+       typeof(a) __tmax = type_max(typeof(a));                         \
+       typeof(a) __tmin = type_min(typeof(a));                         \
+       (void) (&__a == &__b);                                          \
+       (void) (&__a == __d);                                           \
+       *__d = (u64)__a * (u64)__b;                                     \
+       (__b > 0   && (__a > __tmax/__b || __a < __tmin/__b)) ||        \
+       (__b < (typeof(__b))-1  && (__a > __tmin/__b || __a < __tmax/__b)) || \
+       (__b == (typeof(__b))-1 && __a == __tmin);                      \
+})
+
+
+#define check_add_overflow(a, b, d)    __must_check_overflow(          \
+       __builtin_choose_expr(is_signed_type(typeof(a)),                \
+                       __signed_add_overflow(a, b, d),                 \
+                       __unsigned_add_overflow(a, b, d)))
+
+#define check_sub_overflow(a, b, d)    __must_check_overflow(          \
+       __builtin_choose_expr(is_signed_type(typeof(a)),                \
+                       __signed_sub_overflow(a, b, d),                 \
+                       __unsigned_sub_overflow(a, b, d)))
+
+#define check_mul_overflow(a, b, d)    __must_check_overflow(          \
+       __builtin_choose_expr(is_signed_type(typeof(a)),                \
+                       __signed_mul_overflow(a, b, d),                 \
+                       __unsigned_mul_overflow(a, b, d)))
+
+#endif /* COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW */
+
+/** check_shl_overflow() - Calculate a left-shifted value and check overflow
+ *
+ * @a: Value to be shifted
+ * @s: How many bits left to shift
+ * @d: Pointer to where to store the result
+ *
+ * Computes *@d = (@a << @s)
+ *
+ * Returns true if '*d' cannot hold the result or when 'a << s' doesn't
+ * make sense. Example conditions:
+ * - 'a << s' causes bits to be lost when stored in *d.
+ * - 's' is garbage (e.g. negative) or so large that the result of
+ *   'a << s' is guaranteed to be 0.
+ * - 'a' is negative.
+ * - 'a << s' sets the sign bit, if any, in '*d'.
+ *
+ * '*d' will hold the results of the attempted shift, but is not
+ * considered "safe for use" if false is returned.
+ */
+#define check_shl_overflow(a, s, d) __must_check_overflow(({           \
+       typeof(a) _a = a;                                               \
+       typeof(s) _s = s;                                               \
+       typeof(d) _d = d;                                               \
+       u64 _a_full = _a;                                               \
+       unsigned int _to_shift =                                        \
+               is_non_negative(_s) && _s < 8 * sizeof(*d) ? _s : 0;    \
+       *_d = (_a_full << _to_shift);                                   \
+       (_to_shift != _s || is_negative(*_d) || is_negative(_a) ||      \
+       (*_d >> _to_shift) != _a);                                      \
+}))
+
+/**
+ * array_size() - Calculate size of 2-dimensional array.
+ *
+ * @a: dimension one
+ * @b: dimension two
+ *
+ * Calculates size of 2-dimensional array: @a * @b.
+ *
+ * Returns: number of bytes needed to represent the array or SIZE_MAX on
+ * overflow.
+ */
+static inline __must_check size_t array_size(size_t a, size_t b)
+{
+       size_t bytes;
+
+       if (check_mul_overflow(a, b, &bytes))
+               return SIZE_MAX;
+
+       return bytes;
+}
+
+/**
+ * array3_size() - Calculate size of 3-dimensional array.
+ *
+ * @a: dimension one
+ * @b: dimension two
+ * @c: dimension three
+ *
+ * Calculates size of 3-dimensional array: @a * @b * @c.
+ *
+ * Returns: number of bytes needed to represent the array or SIZE_MAX on
+ * overflow.
+ */
+static inline __must_check size_t array3_size(size_t a, size_t b, size_t c)
+{
+       size_t bytes;
+
+       if (check_mul_overflow(a, b, &bytes))
+               return SIZE_MAX;
+       if (check_mul_overflow(bytes, c, &bytes))
+               return SIZE_MAX;
+
+       return bytes;
+}
+
+/*
+ * Compute a*b+c, returning SIZE_MAX on overflow. Internal helper for
+ * struct_size() below.
+ */
+static inline __must_check size_t __ab_c_size(size_t a, size_t b, size_t c)
+{
+       size_t bytes;
+
+       if (check_mul_overflow(a, b, &bytes))
+               return SIZE_MAX;
+       if (check_add_overflow(bytes, c, &bytes))
+               return SIZE_MAX;
+
+       return bytes;
+}
+
+/**
+ * struct_size() - Calculate size of structure with trailing array.
+ * @p: Pointer to the structure.
+ * @member: Name of the array member.
+ * @count: Number of elements in the array.
+ *
+ * Calculates size of memory needed for structure @p followed by an
+ * array of @count number of @member elements.
+ *
+ * Return: number of bytes needed or SIZE_MAX on overflow.
+ */
+#define struct_size(p, member, count)                                  \
+       __ab_c_size(count,                                              \
+                   sizeof(*(p)->member) + __must_be_array((p)->member),\
+                   sizeof(*(p)))
+
+/**
+ * flex_array_size() - Calculate size of a flexible array member
+ *                     within an enclosing structure.
+ *
+ * @p: Pointer to the structure.
+ * @member: Name of the flexible array member.
+ * @count: Number of elements in the array.
+ *
+ * Calculates size of a flexible array of @count number of @member
+ * elements, at the end of structure @p.
+ *
+ * Return: number of bytes needed or SIZE_MAX on overflow.
+ */
+#define flex_array_size(p, member, count)                              \
+       array_size(count,                                               \
+                   sizeof(*(p)->member) + __must_be_array((p)->member))
+
+#endif /* __LINUX_OVERFLOW_H */
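As a usage sketch for struct_size()/flex_array_size() with a trailing flexible array (the struct and helper below are hypothetical; kmalloc()/GFP_KERNEL refer to this tree's slab shim):

struct example_vec {			/* hypothetical */
	u32	nr;
	u64	entries[];
};

static struct example_vec *example_vec_alloc(const u64 *src, u32 nr)
{
	struct example_vec *v;

	/* SIZE_MAX on overflow makes the allocation fail rather than wrap. */
	v = kmalloc(struct_size(v, entries, nr), GFP_KERNEL);
	if (!v)
		return NULL;

	v->nr = nr;
	memcpy(v->entries, src, flex_array_size(v, entries, nr));
	return v;
}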
index 87be064f7dbc3df218409ac580ff7ddc823f00a3..310b3eda6de0f63aeb7162bbeb85dc2ddb7ec684 100644 (file)
@@ -21,6 +21,8 @@ struct page;
 #define kmap_atomic(page)              page_address(page)
 #define kunmap_atomic(addr)            do {} while (0)
 
+#define PageHighMem(page)              false
+
 static const char zero_page[PAGE_SIZE];
 
 #define ZERO_PAGE(o)                   ((struct page *) &zero_page[0])
diff --git a/include/linux/poison.h b/include/linux/poison.h
new file mode 100644 (file)
index 0000000..dc8ae5d
--- /dev/null
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_POISON_H
+#define _LINUX_POISON_H
+
+/********** include/linux/list.h **********/
+
+/*
+ * Architectures might want to move the poison pointer offset
+ * into some well-recognized area such as 0xdead000000000000,
+ * that is also not mappable by user-space exploits:
+ */
+#ifdef CONFIG_ILLEGAL_POINTER_VALUE
+# define POISON_POINTER_DELTA _AC(CONFIG_ILLEGAL_POINTER_VALUE, UL)
+#else
+# define POISON_POINTER_DELTA 0
+#endif
+
+/*
+ * These are non-NULL pointers that will result in page faults
+ * under normal circumstances, used to verify that nobody uses
+ * non-initialized list entries.
+ */
+#define LIST_POISON1  ((void *) 0x100 + POISON_POINTER_DELTA)
+#define LIST_POISON2  ((void *) 0x122 + POISON_POINTER_DELTA)
+
+/********** include/linux/timer.h **********/
+#define TIMER_ENTRY_STATIC     ((void *) 0x300 + POISON_POINTER_DELTA)
+
+/********** mm/page_poison.c **********/
+#ifdef CONFIG_PAGE_POISONING_ZERO
+#define PAGE_POISON 0x00
+#else
+#define PAGE_POISON 0xaa
+#endif
+
+/********** mm/page_alloc.c ************/
+
+#define TAIL_MAPPING   ((void *) 0x400 + POISON_POINTER_DELTA)
+
+/********** mm/slab.c **********/
+/*
+ * Magic nums for obj red zoning.
+ * Placed in the first word before and the first word after an obj.
+ */
+#define        RED_INACTIVE    0x09F911029D74E35BULL   /* when obj is inactive */
+#define        RED_ACTIVE      0xD84156C5635688C0ULL   /* when obj is active */
+
+#define SLUB_RED_INACTIVE      0xbb
+#define SLUB_RED_ACTIVE                0xcc
+
+/* ...and for poisoning */
+#define        POISON_INUSE    0x5a    /* for use-uninitialised poisoning */
+#define POISON_FREE    0x6b    /* for use-after-free poisoning */
+#define        POISON_END      0xa5    /* end-byte of poisoning */
+
+/********** arch/$ARCH/mm/init.c **********/
+#define POISON_FREE_INITMEM    0xcc
+
+/********** arch/ia64/hp/common/sba_iommu.c **********/
+/*
+ * arch/ia64/hp/common/sba_iommu.c uses a 16-byte poison string with a
+ * value of "SBAIOMMU POISON\0" for spill-over poisoning.
+ */
+
+/********** fs/jbd/journal.c **********/
+#define JBD_POISON_FREE                0x5b
+#define JBD2_POISON_FREE       0x5c
+
+/********** drivers/base/dmapool.c **********/
+#define        POOL_POISON_FREED       0xa7    /* !inuse */
+#define        POOL_POISON_ALLOCATED   0xa9    /* !initted */
+
+/********** drivers/atm/ **********/
+#define ATM_POISON_FREE                0x12
+#define ATM_POISON             0xdeadbeef
+
+/********** kernel/mutexes **********/
+#define MUTEX_DEBUG_INIT       0x11
+#define MUTEX_DEBUG_FREE       0x22
+#define MUTEX_POISON_WW_CTX    ((void *) 0x500 + POISON_POINTER_DELTA)
+
+/********** security/ **********/
+#define KEY_DESTROY            0xbd
+
+#endif
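These constants are consumed by debugging paths elsewhere; as a rough sketch of the usual pattern (mirroring how list helpers poison a deleted entry, not code from this patch):

static inline void example_list_del(struct list_head *entry)
{
	entry->prev->next = entry->next;
	entry->next->prev = entry->prev;
	/* Any later walk through this stale entry faults on these values. */
	entry->next = LIST_POISON1;
	entry->prev = LIST_POISON2;
}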
index c38ae46d021f3a90b95460bb4b1b4b8954fa051a..28c595a0c0cfe3429ff69c19fa83edb424d7d38c 100644 (file)
@@ -45,6 +45,7 @@ static inline type get_random_##type(void)            \
 
 get_random_type(int);
 get_random_type(long);
+get_random_type(u32);
 get_random_type(u64);
 
 #endif /* _LINUX_RANDOM_H */
index c99d78a897ac8a03a29b831dbf2964484ee62177..ae292241c82c5f8ce361443d1f53a3e91e4737a0 100644 (file)
 
 #define RCU_INIT_POINTER(p, v)         WRITE_ONCE(p, v)
 
+/* Has the specified rcu_head structure been handed to call_rcu()? */
+
+/**
+ * rcu_head_init - Initialize rcu_head for rcu_head_after_call_rcu()
+ * @rhp: The rcu_head structure to initialize.
+ *
+ * If you intend to invoke rcu_head_after_call_rcu() to test whether a
+ * given rcu_head structure has already been passed to call_rcu(), then
+ * you must also invoke this rcu_head_init() function on it just after
+ * allocating that structure.  Calls to this function must not race with
+ * calls to call_rcu(), rcu_head_after_call_rcu(), or callback invocation.
+ */
+static inline void rcu_head_init(struct rcu_head *rhp)
+{
+       rhp->func = (void *)~0L;
+}
+
+static inline bool
+rcu_head_after_call_rcu(struct rcu_head *rhp,
+                       void (*f)(struct rcu_head *head))
+{
+       void (*func)(struct rcu_head *head) = READ_ONCE(rhp->func);
+
+       if (func == f)
+               return true;
+       return false;
+}
+
 #endif /* __TOOLS_LINUX_RCUPDATE_H */
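A sketch of how these two helpers pair up (the object, callback, and the call_rcu()/kfree() shims are assumptions for illustration): initialize the rcu_head when the object is allocated, then use rcu_head_after_call_rcu() to avoid queueing the same head twice.

struct example_obj {
	struct rcu_head	rcu;
	/* ... payload ... */
};

static void example_obj_free_rcu(struct rcu_head *rcu)
{
	kfree(container_of(rcu, struct example_obj, rcu));
}

static void example_obj_put(struct example_obj *obj)
{
	/* rcu_head_init(&obj->rcu) must have been done at allocation time. */
	if (!rcu_head_after_call_rcu(&obj->rcu, example_obj_free_rcu))
		call_rcu(&obj->rcu, example_obj_free_rcu);
}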
diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h
new file mode 100644 (file)
index 0000000..57467cb
--- /dev/null
@@ -0,0 +1,135 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Resizable, Scalable, Concurrent Hash Table
+ *
+ * Simple structures that might be needed in include
+ * files.
+ */
+
+#ifndef _LINUX_RHASHTABLE_TYPES_H
+#define _LINUX_RHASHTABLE_TYPES_H
+
+#include <linux/atomic.h>
+#include <linux/compiler.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+
+struct rhash_head {
+       struct rhash_head __rcu         *next;
+};
+
+struct rhlist_head {
+       struct rhash_head               rhead;
+       struct rhlist_head __rcu        *next;
+};
+
+struct bucket_table;
+
+/**
+ * struct rhashtable_compare_arg - Key for the function rhashtable_compare
+ * @ht: Hash table
+ * @key: Key to compare against
+ */
+struct rhashtable_compare_arg {
+       struct rhashtable *ht;
+       const void *key;
+};
+
+typedef u32 (*rht_hashfn_t)(const void *data, u32 len, u32 seed);
+typedef u32 (*rht_obj_hashfn_t)(const void *data, u32 len, u32 seed);
+typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg,
+                              const void *obj);
+
+/**
+ * struct rhashtable_params - Hash table construction parameters
+ * @nelem_hint: Hint on number of elements, should be 75% of desired size
+ * @key_len: Length of key
+ * @key_offset: Offset of key in struct to be hashed
+ * @head_offset: Offset of rhash_head in struct to be hashed
+ * @max_size: Maximum size while expanding
+ * @min_size: Minimum size while shrinking
+ * @automatic_shrinking: Enable automatic shrinking of tables
+ * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash)
+ * @obj_hashfn: Function to hash object
+ * @obj_cmpfn: Function to compare key with object
+ */
+struct rhashtable_params {
+       u16                     nelem_hint;
+       u16                     key_len;
+       u16                     key_offset;
+       u16                     head_offset;
+       unsigned int            max_size;
+       u16                     min_size;
+       bool                    automatic_shrinking;
+       rht_hashfn_t            hashfn;
+       rht_obj_hashfn_t        obj_hashfn;
+       rht_obj_cmpfn_t         obj_cmpfn;
+};
+
+/**
+ * struct rhashtable - Hash table handle
+ * @tbl: Bucket table
+ * @key_len: Key length for hashfn
+ * @max_elems: Maximum number of elements in table
+ * @p: Configuration parameters
+ * @rhlist: True if this is an rhltable
+ * @run_work: Deferred worker to expand/shrink asynchronously
+ * @mutex: Mutex to protect current/future table swapping
+ * @lock: Spin lock to protect walker list
+ * @nelems: Number of elements in table
+ */
+struct rhashtable {
+       struct bucket_table __rcu       *tbl;
+       unsigned int                    key_len;
+       unsigned int                    max_elems;
+       struct rhashtable_params        p;
+       bool                            rhlist;
+       struct work_struct              run_work;
+       struct mutex                    mutex;
+       spinlock_t                      lock;
+       atomic_t                        nelems;
+};
+
+/**
+ * struct rhltable - Hash table with duplicate objects in a list
+ * @ht: Underlying rhtable
+ */
+struct rhltable {
+       struct rhashtable ht;
+};
+
+/**
+ * struct rhashtable_walker - Hash table walker
+ * @list: List entry on list of walkers
+ * @tbl: The table that we were walking over
+ */
+struct rhashtable_walker {
+       struct list_head list;
+       struct bucket_table *tbl;
+};
+
+/**
+ * struct rhashtable_iter - Hash table iterator
+ * @ht: Table to iterate through
+ * @p: Current pointer
+ * @list: Current hash list pointer
+ * @walker: Associated rhashtable walker
+ * @slot: Current slot
+ * @skip: Number of entries to skip in slot
+ */
+struct rhashtable_iter {
+       struct rhashtable *ht;
+       struct rhash_head *p;
+       struct rhlist_head *list;
+       struct rhashtable_walker walker;
+       unsigned int slot;
+       unsigned int skip;
+       bool end_of_table;
+};
+
+int rhashtable_init(struct rhashtable *ht,
+                   const struct rhashtable_params *params);
+int rhltable_init(struct rhltable *hlt,
+                 const struct rhashtable_params *params);
+
+#endif /* _LINUX_RHASHTABLE_TYPES_H */
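These types are typically wired up as follows; the element layout and table below are hypothetical, used only to show where head_offset/key_offset/key_len point:

struct example_entry {
	u64			key;
	struct rhash_head	hash;	/* linked into the table */
	/* ... payload ... */
};

static const struct rhashtable_params example_params = {
	.head_offset		= offsetof(struct example_entry, hash),
	.key_offset		= offsetof(struct example_entry, key),
	.key_len		= sizeof(u64),
	.automatic_shrinking	= true,
};

/* struct rhashtable example_table;
 * int ret = rhashtable_init(&example_table, &example_params);
 */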
index 8dbe1533d54cb9cfa1523a93c09217c6d3df5de3..6cf8c2571160cc8019aead4af8b8dff5f2f11914 100644 (file)
@@ -1,7 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Resizable, Scalable, Concurrent Hash Table
  *
- * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au>
+ * Copyright (c) 2015-2016 Herbert Xu <herbert@gondor.apana.org.au>
  * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch>
  * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
  *
 #ifndef _LINUX_RHASHTABLE_H
 #define _LINUX_RHASHTABLE_H
 
-#include <linux/atomic.h>
-#include <linux/cache.h>
-#include <linux/compiler.h>
 #include <linux/err.h>
 #include <linux/errno.h>
 #include <linux/jhash.h>
-#include <linux/workqueue.h>
-#include <linux/mutex.h>
-#include <linux/spinlock.h>
+#include <linux/list_nulls.h>
 #include <linux/rcupdate.h>
+#include <linux/workqueue.h>
+#include <linux/rculist.h>
+#include <linux/bit_spinlock.h>
 
-#define RHT_BASE_BITS          4
-#define RHT_HASH_BITS          27
-#define RHT_BASE_SHIFT         RHT_HASH_BITS
-#define RHT_HASH_RESERVED_SPACE        (RHT_BASE_BITS + 1)
+#define BIT(nr)                        (1UL << (nr))
 
-struct rhash_head {
-       struct rhash_head __rcu         *next;
-};
+#include <linux/rhashtable-types.h>
+/*
+ * Objects in an rhashtable have an embedded struct rhash_head
+ * which is linked into as hash chain from the hash table - or one
+ * of two or more hash tables when the rhashtable is being resized.
+ * The end of the chain is marked with a special nulls marker which has
+ * the least significant bit set but otherwise stores the address of
+ * the hash bucket.  This allows us to be sure we've found the end
+ * of the right list.
+ * The value stored in the hash bucket has BIT(0) used as a lock bit.
+ * This bit must be atomically set before any changes are made to
+ * the chain.  To avoid dereferencing this pointer without clearing
+ * the bit first, we use an opaque 'struct rhash_lock_head *' for the
+ * pointer stored in the bucket.  This struct needs to be defined so
+ * that rcu_dereference() works on it, but it has no content so a
+ * cast is needed for it to be useful.  This ensures it isn't
+ * used by mistake without first clearing the lock bit.
+ */
+struct rhash_lock_head {};
 
+/* Maximum chain length before rehash
+ *
+ * The maximum (not average) chain length grows with the size of the hash
+ * table, at a rate of (log N)/(log log N).
+ *
+ * The value of 16 is selected so that even if the hash table grew to
+ * 2^32 you would not expect the maximum chain length to exceed it
+ * unless we are under attack (or extremely unlucky).
+ *
+ * As this limit is only to detect attacks, we don't need to set it to a
+ * lower value as you'd need the chain length to vastly exceed 16 to have
+ * any real effect on the system.
+ */
+#define RHT_ELASTICITY 16u
+
+/**
+ * struct bucket_table - Table of hash buckets
+ * @size: Number of hash buckets
+ * @nest: Number of bits of first-level nested table.
+ * @rehash: Current bucket being rehashed
+ * @hash_rnd: Random seed to fold into hash
+ * @walkers: List of active walkers
+ * @rcu: RCU structure for freeing the table
+ * @future_tbl: Table under construction during rehashing
+ * @ntbl: Nested table used when out of memory.
+ * @buckets: size * hash buckets
+ */
 struct bucket_table {
        unsigned int            size;
-       unsigned int            rehash;
+       unsigned int            nest;
        u32                     hash_rnd;
-       unsigned int            locks_mask;
-       spinlock_t              *locks;
        struct list_head        walkers;
        struct rcu_head         rcu;
 
        struct bucket_table __rcu *future_tbl;
 
-       struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp;
-};
-
-struct rhashtable_compare_arg {
-       struct rhashtable *ht;
-       const void *key;
+       struct rhash_lock_head __rcu *buckets[] ____cacheline_aligned_in_smp;
 };
 
-typedef u32 (*rht_hashfn_t)(const void *data, u32 len, u32 seed);
-typedef u32 (*rht_obj_hashfn_t)(const void *data, u32 len, u32 seed);
-typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg,
-                              const void *obj);
-
-struct rhashtable_params {
-       size_t                  nelem_hint;
-       size_t                  key_len;
-       size_t                  key_offset;
-       size_t                  head_offset;
-       unsigned int            insecure_max_entries;
-       unsigned int            max_size;
-       unsigned int            min_size;
-       u32                     nulls_base;
-       bool                    insecure_elasticity;
-       bool                    automatic_shrinking;
-       size_t                  locks_mul;
-       rht_hashfn_t            hashfn;
-       rht_obj_hashfn_t        obj_hashfn;
-       rht_obj_cmpfn_t         obj_cmpfn;
-};
-
-struct rhashtable {
-       struct bucket_table __rcu       *tbl;
-       atomic_t                        nelems;
-       unsigned int                    key_len;
-       unsigned int                    elasticity;
-       struct rhashtable_params        p;
-       struct work_struct              run_work;
-       struct mutex                    mutex;
-       spinlock_t                      lock;
-};
-
-struct rhashtable_walker {
-       struct list_head list;
-       struct bucket_table *tbl;
-};
-
-#define NULLS_MARKER(value) (1UL | (((long)value) << 1))
-
-static inline unsigned long rht_marker(const struct rhashtable *ht, u32 hash)
-{
-       return NULLS_MARKER(ht->p.nulls_base + hash);
-}
-
-#define INIT_RHT_NULLS_HEAD(ptr, ht, hash) \
-       ((ptr) = (typeof(ptr)) rht_marker(ht, hash))
+/*
+ * NULLS_MARKER() expects a hash value with the low
+ * bits most likely to be significant, and it discards
+ * the msb.
+ * We give it an address, in which the bottom bit is
+ * always 0, and the msb might be significant.
+ * So we shift the address down one bit to align with
+ * expectations and avoid losing a significant bit.
+ *
+ * We never store the NULLS_MARKER in the hash table
+ * itself as we need the lsb for locking.
+ * Instead we store a NULL
+ */
+#define        RHT_NULLS_MARKER(ptr)   \
+       ((void *)NULLS_MARKER(((unsigned long) (ptr)) >> 1))
+#define INIT_RHT_NULLS_HEAD(ptr)       \
+       ((ptr) = NULL)
 
 static inline bool rht_is_a_nulls(const struct rhash_head *ptr)
 {
@@ -118,37 +120,45 @@ static inline void *rht_obj(const struct rhashtable *ht,
 static inline unsigned int rht_bucket_index(const struct bucket_table *tbl,
                                            unsigned int hash)
 {
-       return (hash >> RHT_HASH_RESERVED_SPACE) & (tbl->size - 1);
+       return hash & (tbl->size - 1);
 }
 
-static inline unsigned int rht_key_hashfn(
-       struct rhashtable *ht, const struct bucket_table *tbl,
-       const void *key, const struct rhashtable_params params)
+static inline unsigned int rht_key_get_hash(struct rhashtable *ht,
+       const void *key, const struct rhashtable_params params,
+       unsigned int hash_rnd)
 {
        unsigned int hash;
 
        /* params must be equal to ht->p if it isn't constant. */
        if (!__builtin_constant_p(params.key_len))
-               hash = ht->p.hashfn(key, ht->key_len, tbl->hash_rnd);
+               hash = ht->p.hashfn(key, ht->key_len, hash_rnd);
        else if (params.key_len) {
                unsigned int key_len = params.key_len;
 
                if (params.hashfn)
-                       hash = params.hashfn(key, key_len, tbl->hash_rnd);
+                       hash = params.hashfn(key, key_len, hash_rnd);
                else if (key_len & (sizeof(u32) - 1))
-                       hash = jhash(key, key_len, tbl->hash_rnd);
+                       hash = jhash(key, key_len, hash_rnd);
                else
-                       hash = jhash2(key, key_len / sizeof(u32),
-                                     tbl->hash_rnd);
+                       hash = jhash2(key, key_len / sizeof(u32), hash_rnd);
        } else {
                unsigned int key_len = ht->p.key_len;
 
                if (params.hashfn)
-                       hash = params.hashfn(key, key_len, tbl->hash_rnd);
+                       hash = params.hashfn(key, key_len, hash_rnd);
                else
-                       hash = jhash(key, key_len, tbl->hash_rnd);
+                       hash = jhash(key, key_len, hash_rnd);
        }
 
+       return hash;
+}
+
+static inline unsigned int rht_key_hashfn(
+       struct rhashtable *ht, const struct bucket_table *tbl,
+       const void *key, const struct rhashtable_params params)
+{
+       unsigned int hash = rht_key_get_hash(ht, key, params, tbl->hash_rnd);
+
        return rht_bucket_index(tbl, hash);
 }
 
@@ -165,6 +175,11 @@ static inline unsigned int rht_head_hashfn(
               rht_key_hashfn(ht, tbl, ptr + params.key_offset, params);
 }
 
+/**
+ * rht_grow_above_75 - returns true if nelems > 0.75 * table-size
+ * @ht:                hash table
+ * @tbl:       current table
+ */
 static inline bool rht_grow_above_75(const struct rhashtable *ht,
                                     const struct bucket_table *tbl)
 {
@@ -173,6 +188,11 @@ static inline bool rht_grow_above_75(const struct rhashtable *ht,
               (!ht->p.max_size || tbl->size < ht->p.max_size);
 }
 
+/**
+ * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size
+ * @ht:                hash table
+ * @tbl:       current table
+ */
 static inline bool rht_shrink_below_30(const struct rhashtable *ht,
                                       const struct bucket_table *tbl)
 {
@@ -181,6 +201,11 @@ static inline bool rht_shrink_below_30(const struct rhashtable *ht,
               tbl->size > ht->p.min_size;
 }
 
+/**
+ * rht_grow_above_100 - returns true if nelems > table-size
+ * @ht:                hash table
+ * @tbl:       current table
+ */
 static inline bool rht_grow_above_100(const struct rhashtable *ht,
                                      const struct bucket_table *tbl)
 {
@@ -188,62 +213,353 @@ static inline bool rht_grow_above_100(const struct rhashtable *ht,
                (!ht->p.max_size || tbl->size < ht->p.max_size);
 }
 
+/**
+ * rht_grow_above_max - returns true if table is above maximum
+ * @ht:                hash table
+ * @tbl:       current table
+ */
 static inline bool rht_grow_above_max(const struct rhashtable *ht,
                                      const struct bucket_table *tbl)
 {
-       return ht->p.insecure_max_entries &&
-              atomic_read(&ht->nelems) >= ht->p.insecure_max_entries;
+       return atomic_read(&ht->nelems) >= ht->max_elems;
 }
 
-static inline spinlock_t *rht_bucket_lock(const struct bucket_table *tbl,
-                                         unsigned int hash)
+#ifdef CONFIG_PROVE_LOCKING
+int lockdep_rht_mutex_is_held(struct rhashtable *ht);
+int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash);
+#else
+static inline int lockdep_rht_mutex_is_held(struct rhashtable *ht)
 {
-       return &tbl->locks[hash & tbl->locks_mask];
+       return 1;
 }
 
-int rhashtable_insert_rehash(struct rhashtable *, struct bucket_table *);
-struct bucket_table *rhashtable_insert_slow(struct rhashtable *,
-                                           const void *,
-                                           struct rhash_head *,
-                                           struct bucket_table *);
+static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl,
+                                            u32 hash)
+{
+       return 1;
+}
+#endif /* CONFIG_PROVE_LOCKING */
+
+void *rhashtable_insert_slow(struct rhashtable *ht, const void *key,
+                            struct rhash_head *obj);
 
-int rhashtable_init(struct rhashtable *, const struct rhashtable_params *);
-void rhashtable_destroy(struct rhashtable *);
+void rhashtable_walk_enter(struct rhashtable *ht,
+                          struct rhashtable_iter *iter);
+void rhashtable_walk_exit(struct rhashtable_iter *iter);
+int rhashtable_walk_start_check(struct rhashtable_iter *iter) __acquires(RCU);
 
-#define rht_dereference(p, ht)                 rcu_dereference(p)
-#define rht_dereference_rcu(p, ht)             rcu_dereference(p)
-#define rht_dereference_bucket(p, tbl, hash)   rcu_dereference(p)
-#define rht_dereference_bucket_rcu(p, tbl, hash) rcu_dereference(p)
+static inline void rhashtable_walk_start(struct rhashtable_iter *iter)
+{
+       (void)rhashtable_walk_start_check(iter);
+}
+
+void *rhashtable_walk_next(struct rhashtable_iter *iter);
+void *rhashtable_walk_peek(struct rhashtable_iter *iter);
+void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU);
+
+void rhashtable_free_and_destroy(struct rhashtable *ht,
+                                void (*free_fn)(void *ptr, void *arg),
+                                void *arg);
+void rhashtable_destroy(struct rhashtable *ht);
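The walk API above is used in a fixed enter/start/next/stop/exit sequence; a sketch, reusing the hypothetical example_entry from the rhashtable-types sketch (rhashtable_walk_next() may return ERR_PTR(-EAGAIN) when a resize races with the walk):

static void example_walk(struct rhashtable *ht)
{
	struct rhashtable_iter iter;
	struct example_entry *e;

	rhashtable_walk_enter(ht, &iter);
	rhashtable_walk_start(&iter);

	while ((e = rhashtable_walk_next(&iter)) != NULL) {
		if (IS_ERR(e)) {
			if (PTR_ERR(e) == -EAGAIN)
				continue;	/* resize raced; some entries may repeat */
			break;
		}
		/* ... inspect e ... */
	}

	rhashtable_walk_stop(&iter);
	rhashtable_walk_exit(&iter);
}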
+
+struct rhash_lock_head __rcu **rht_bucket_nested(
+       const struct bucket_table *tbl, unsigned int hash);
+struct rhash_lock_head __rcu **__rht_bucket_nested(
+       const struct bucket_table *tbl, unsigned int hash);
+struct rhash_lock_head __rcu **rht_bucket_nested_insert(
+       struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash);
+
+#define rht_dereference(p, ht) \
+       rcu_dereference(p)
+
+#define rht_dereference_rcu(p, ht) \
+       rcu_dereference(p)
+
+#define rht_dereference_bucket(p, tbl, hash) \
+       rcu_dereference(p)
+
+#define rht_dereference_bucket_rcu(p, tbl, hash) \
+       rcu_dereference(p)
 
 #define rht_entry(tpos, pos, member) \
        ({ tpos = container_of(pos, typeof(*tpos), member); 1; })
 
-#define rht_for_each_continue(pos, head, tbl, hash) \
-       for (pos = rht_dereference_bucket(head, tbl, hash); \
-            !rht_is_a_nulls(pos); \
+static inline struct rhash_lock_head __rcu *const *rht_bucket(
+       const struct bucket_table *tbl, unsigned int hash)
+{
+       return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) :
+                                    &tbl->buckets[hash];
+}
+
+static inline struct rhash_lock_head __rcu **rht_bucket_var(
+       struct bucket_table *tbl, unsigned int hash)
+{
+       return unlikely(tbl->nest) ? __rht_bucket_nested(tbl, hash) :
+                                    &tbl->buckets[hash];
+}
+
+static inline struct rhash_lock_head __rcu **rht_bucket_insert(
+       struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash)
+{
+       return unlikely(tbl->nest) ? rht_bucket_nested_insert(ht, tbl, hash) :
+                                    &tbl->buckets[hash];
+}
+
+/*
+ * We lock a bucket by setting BIT(0) in the pointer - this is always
+ * zero in real pointers.  The NULLS mark is never stored in the bucket,
+ * rather we store NULL if the bucket is empty.
+ * bit_spin_locks do not handle contention well, but the whole point
+ * of the hashtable design is to achieve minimum per-bucket contention.
+ * A nested hash table might not have a bucket pointer.  In that case
+ * we cannot get a lock.  For remove and replace the bucket cannot be
+ * interesting and doesn't need locking.
+ * For insert we allocate the bucket if this is the last bucket_table,
+ * and then take the lock.
+ * Sometimes we unlock a bucket by writing a new pointer there.  In that
+ * case we don't need to unlock, but we do need to reset state such as
+ * local_bh. For that we have rht_assign_unlock().  As rcu_assign_pointer()
+ * provides the same release semantics that bit_spin_unlock() provides,
+ * this is safe.
+ * When we write to a bucket without unlocking, we use rht_assign_locked().
+ */
+
+static inline void rht_lock(struct bucket_table *tbl,
+                           struct rhash_lock_head __rcu **bkt)
+{
+       bit_spin_lock(0, (unsigned long *)bkt);
+}
+
+static inline void rht_lock_nested(struct bucket_table *tbl,
+                                  struct rhash_lock_head __rcu **bucket,
+                                  unsigned int subclass)
+{
+       bit_spin_lock(0, (unsigned long *)bucket);
+}
+
+static inline void rht_unlock(struct bucket_table *tbl,
+                             struct rhash_lock_head __rcu **bkt)
+{
+       bit_spin_unlock(0, (unsigned long *)bkt);
+}
+
+static inline struct rhash_head *__rht_ptr(
+       struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt)
+{
+       return (struct rhash_head *)
+               ((unsigned long)p & ~BIT(0) ?:
+                (unsigned long)RHT_NULLS_MARKER(bkt));
+}
+
+/*
+ * Where 'bkt' is a bucket and might be locked:
+ *   rht_ptr_rcu() dereferences that pointer and clears the lock bit.
+ *   rht_ptr() dereferences in a context where the bucket is locked.
+ *   rht_ptr_exclusive() dereferences in a context where exclusive
+ *            access is guaranteed, such as when destroying the table.
+ */
+static inline struct rhash_head *rht_ptr_rcu(
+       struct rhash_lock_head __rcu *const *bkt)
+{
+       return __rht_ptr(rcu_dereference(*bkt), bkt);
+}
+
+static inline struct rhash_head *rht_ptr(
+       struct rhash_lock_head __rcu *const *bkt,
+       struct bucket_table *tbl,
+       unsigned int hash)
+{
+       return __rht_ptr(rht_dereference_bucket(*bkt, tbl, hash), bkt);
+}
+
+static inline struct rhash_head *rht_ptr_exclusive(
+       struct rhash_lock_head __rcu *const *bkt)
+{
+       return __rht_ptr(rcu_dereference(*bkt), bkt);
+}
+
+static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt,
+                                    struct rhash_head *obj)
+{
+       if (rht_is_a_nulls(obj))
+               obj = NULL;
+       rcu_assign_pointer(*bkt, (void *)((unsigned long)obj | BIT(0)));
+}
+
+static inline void rht_assign_unlock(struct bucket_table *tbl,
+                                    struct rhash_lock_head __rcu **bkt,
+                                    struct rhash_head *obj)
+{
+       if (rht_is_a_nulls(obj))
+               obj = NULL;
+       rcu_assign_pointer(*bkt, (void *)obj);
+       preempt_enable();
+       __release(bitlock);
+}
+
+/**
+ * rht_for_each_from - iterate over hash chain from given head
+ * @pos:       the &struct rhash_head to use as a loop cursor.
+ * @head:      the &struct rhash_head to start from
+ * @tbl:       the &struct bucket_table
+ * @hash:      the hash value / bucket index
+ */
+#define rht_for_each_from(pos, head, tbl, hash) \
+       for (pos = head;                        \
+            !rht_is_a_nulls(pos);              \
             pos = rht_dereference_bucket((pos)->next, tbl, hash))
 
+/**
+ * rht_for_each - iterate over hash chain
+ * @pos:       the &struct rhash_head to use as a loop cursor.
+ * @tbl:       the &struct bucket_table
+ * @hash:      the hash value / bucket index
+ */
 #define rht_for_each(pos, tbl, hash) \
-       rht_for_each_continue(pos, (tbl)->buckets[hash], tbl, hash)
+       rht_for_each_from(pos, rht_ptr(rht_bucket(tbl, hash), tbl, hash),  \
+                         tbl, hash)
+
+/**
+ * rht_for_each_entry_from - iterate over hash chain from given head
+ * @tpos:      the type * to use as a loop cursor.
+ * @pos:       the &struct rhash_head to use as a loop cursor.
+ * @head:      the &struct rhash_head to start from
+ * @tbl:       the &struct bucket_table
+ * @hash:      the hash value / bucket index
+ * @member:    name of the &struct rhash_head within the hashable struct.
+ */
+#define rht_for_each_entry_from(tpos, pos, head, tbl, hash, member)    \
+       for (pos = head;                                                \
+            (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member);    \
+            pos = rht_dereference_bucket((pos)->next, tbl, hash))
 
-#define rht_for_each_rcu_continue(pos, head, tbl, hash)                        \
+/**
+ * rht_for_each_entry - iterate over hash chain of given type
+ * @tpos:      the type * to use as a loop cursor.
+ * @pos:       the &struct rhash_head to use as a loop cursor.
+ * @tbl:       the &struct bucket_table
+ * @hash:      the hash value / bucket index
+ * @member:    name of the &struct rhash_head within the hashable struct.
+ */
+#define rht_for_each_entry(tpos, pos, tbl, hash, member)               \
+       rht_for_each_entry_from(tpos, pos,                              \
+                               rht_ptr(rht_bucket(tbl, hash), tbl, hash), \
+                               tbl, hash, member)
+
+/**
+ * rht_for_each_entry_safe - safely iterate over hash chain of given type
+ * @tpos:      the type * to use as a loop cursor.
+ * @pos:       the &struct rhash_head to use as a loop cursor.
+ * @next:      the &struct rhash_head to use as next in loop cursor.
+ * @tbl:       the &struct bucket_table
+ * @hash:      the hash value / bucket index
+ * @member:    name of the &struct rhash_head within the hashable struct.
+ *
+ * This hash chain list-traversal primitive allows for the looped code to
+ * remove the loop cursor from the list.
+ */
+#define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member)          \
+       for (pos = rht_ptr(rht_bucket(tbl, hash), tbl, hash),                 \
+            next = !rht_is_a_nulls(pos) ?                                    \
+                      rht_dereference_bucket(pos->next, tbl, hash) : NULL;   \
+            (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member);          \
+            pos = next,                                                      \
+            next = !rht_is_a_nulls(pos) ?                                    \
+                      rht_dereference_bucket(pos->next, tbl, hash) : NULL)
+
+/**
+ * rht_for_each_rcu_from - iterate over rcu hash chain from given head
+ * @pos:       the &struct rhash_head to use as a loop cursor.
+ * @head:      the &struct rhash_head to start from
+ * @tbl:       the &struct bucket_table
+ * @hash:      the hash value / bucket index
+ *
+ * This hash chain list-traversal primitive may safely run concurrently with
+ * the _rcu mutation primitives such as rhashtable_insert() as long as the
+ * traversal is guarded by rcu_read_lock().
+ */
+#define rht_for_each_rcu_from(pos, head, tbl, hash)                    \
        for (({barrier(); }),                                           \
-            pos = rht_dereference_bucket_rcu(head, tbl, hash);         \
+            pos = head;                                                \
             !rht_is_a_nulls(pos);                                      \
             pos = rcu_dereference_raw(pos->next))
 
-#define rht_for_each_rcu(pos, tbl, hash)                               \
-       rht_for_each_rcu_continue(pos, (tbl)->buckets[hash], tbl, hash)
+/**
+ * rht_for_each_rcu - iterate over rcu hash chain
+ * @pos:       the &struct rhash_head to use as a loop cursor.
+ * @tbl:       the &struct bucket_table
+ * @hash:      the hash value / bucket index
+ *
+ * This hash chain list-traversal primitive may safely run concurrently with
+ * the _rcu mutation primitives such as rhashtable_insert() as long as the
+ * traversal is guarded by rcu_read_lock().
+ */
+#define rht_for_each_rcu(pos, tbl, hash)                       \
+       for (({barrier(); }),                                   \
+            pos = rht_ptr_rcu(rht_bucket(tbl, hash));          \
+            !rht_is_a_nulls(pos);                              \
+            pos = rcu_dereference_raw(pos->next))
 
-#define rht_for_each_entry_rcu_continue(tpos, pos, head, tbl, hash, member) \
+/**
+ * rht_for_each_entry_rcu_from - iterate over rcu hash chain from given head
+ * @tpos:      the type * to use as a loop cursor.
+ * @pos:       the &struct rhash_head to use as a loop cursor.
+ * @head:      the &struct rhash_head to start from
+ * @tbl:       the &struct bucket_table
+ * @hash:      the hash value / bucket index
+ * @member:    name of the &struct rhash_head within the hashable struct.
+ *
+ * This hash chain list-traversal primitive may safely run concurrently with
+ * the _rcu mutation primitives such as rhashtable_insert() as long as the
+ * traversal is guarded by rcu_read_lock().
+ */
+#define rht_for_each_entry_rcu_from(tpos, pos, head, tbl, hash, member) \
        for (({barrier(); }),                                               \
-            pos = rht_dereference_bucket_rcu(head, tbl, hash);             \
+            pos = head;                                                    \
             (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member);        \
             pos = rht_dereference_bucket_rcu(pos->next, tbl, hash))
 
-#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member)           \
-       rht_for_each_entry_rcu_continue(tpos, pos, (tbl)->buckets[hash],\
-                                       tbl, hash, member)
+/**
+ * rht_for_each_entry_rcu - iterate over rcu hash chain of given type
+ * @tpos:      the type * to use as a loop cursor.
+ * @pos:       the &struct rhash_head to use as a loop cursor.
+ * @tbl:       the &struct bucket_table
+ * @hash:      the hash value / bucket index
+ * @member:    name of the &struct rhash_head within the hashable struct.
+ *
+ * This hash chain list-traversal primitive may safely run concurrently with
+ * the _rcu mutation primitives such as rhashtable_insert() as long as the
+ * traversal is guarded by rcu_read_lock().
+ */
+#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member)              \
+       rht_for_each_entry_rcu_from(tpos, pos,                             \
+                                   rht_ptr_rcu(rht_bucket(tbl, hash)),    \
+                                   tbl, hash, member)
+
+/**
+ * rhl_for_each_rcu - iterate over rcu hash table list
+ * @pos:       the &struct rhlist_head to use as a loop cursor.
+ * @list:      the head of the list
+ *
+ * This hash chain list-traversal primitive should be used on the
+ * list returned by rhltable_lookup.
+ */
+#define rhl_for_each_rcu(pos, list)                                    \
+       for (pos = list; pos; pos = rcu_dereference_raw(pos->next))
+
+/**
+ * rhl_for_each_entry_rcu - iterate over rcu hash table list of given type
+ * @tpos:      the type * to use as a loop cursor.
+ * @pos:       the &struct rhlist_head to use as a loop cursor.
+ * @list:      the head of the list
+ * @member:    name of the &struct rhlist_head within the hashable struct.
+ *
+ * This hash chain list-traversal primitive should be used on the
+ * list returned by rhltable_lookup.
+ */
+#define rhl_for_each_entry_rcu(tpos, pos, list, member)                        \
+       for (pos = list; pos && rht_entry(tpos, pos, member);           \
+            pos = rcu_dereference_raw(pos->next))
 
 static inline int rhashtable_compare(struct rhashtable_compare_arg *arg,
                                     const void *obj)
@@ -254,7 +570,8 @@ static inline int rhashtable_compare(struct rhashtable_compare_arg *arg,
        return memcmp(ptr + ht->p.key_offset, arg->key, ht->p.key_len);
 }
 
-static inline void *rhashtable_lookup_fast(
+/* Internal function, do not use. */
+static inline struct rhash_head *__rhashtable_lookup(
        struct rhashtable *ht, const void *key,
        const struct rhashtable_params params)
 {
@@ -262,23 +579,27 @@ static inline void *rhashtable_lookup_fast(
                .ht = ht,
                .key = key,
        };
-       const struct bucket_table *tbl;
+       struct rhash_lock_head __rcu *const *bkt;
+       struct bucket_table *tbl;
        struct rhash_head *he;
        unsigned int hash;
 
-       rcu_read_lock();
-
        tbl = rht_dereference_rcu(ht->tbl, ht);
 restart:
        hash = rht_key_hashfn(ht, tbl, key, params);
-       rht_for_each_rcu(he, tbl, hash) {
-               if (params.obj_cmpfn ?
-                   params.obj_cmpfn(&arg, rht_obj(ht, he)) :
-                   rhashtable_compare(&arg, rht_obj(ht, he)))
-                       continue;
-               rcu_read_unlock();
-               return rht_obj(ht, he);
-       }
+       bkt = rht_bucket(tbl, hash);
+       do {
+               rht_for_each_rcu_from(he, rht_ptr_rcu(bkt), tbl, hash) {
+                       if (params.obj_cmpfn ?
+                           params.obj_cmpfn(&arg, rht_obj(ht, he)) :
+                           rhashtable_compare(&arg, rht_obj(ht, he)))
+                               continue;
+                       return he;
+               }
+               /* An object might have been moved to a different hash chain,
+                * while we walk along it - better check and retry.
+                */
+       } while (he != RHT_NULLS_MARKER(bkt));
 
        /* Ensure we see any new tables. */
        smp_rmb();
@@ -286,149 +607,593 @@ restart:
        tbl = rht_dereference_rcu(tbl->future_tbl, ht);
        if (unlikely(tbl))
                goto restart;
-       rcu_read_unlock();
 
        return NULL;
 }
 
-static inline int __rhashtable_insert_fast(
-       struct rhashtable *ht, const void *key, struct rhash_head *obj,
+/**
+ * rhashtable_lookup - search hash table
+ * @ht:                hash table
+ * @key:       the pointer to the key
+ * @params:    hash table parameters
+ *
+ * Computes the hash value for the key and traverses the bucket chain looking
+ * for an entry with an identical key. The first matching entry is returned.
+ *
+ * This must only be called under the RCU read lock.
+ *
+ * Returns the first entry on which the compare function returned true.
+ */
+static inline void *rhashtable_lookup(
+       struct rhashtable *ht, const void *key,
+       const struct rhashtable_params params)
+{
+       struct rhash_head *he = __rhashtable_lookup(ht, key, params);
+
+       return he ? rht_obj(ht, he) : NULL;
+}
+
+/**
+ * rhashtable_lookup_fast - search hash table, without RCU read lock
+ * @ht:                hash table
+ * @key:       the pointer to the key
+ * @params:    hash table parameters
+ *
+ * Computes the hash value for the key and traverses the bucket chain looking
+ * for an entry with an identical key. The first matching entry is returned.
+ *
+ * Only use this function when you have other mechanisms guaranteeing
+ * that the object won't go away after the RCU read lock is released.
+ *
+ * Returns the first entry on which the compare function returned true.
+ */
+static inline void *rhashtable_lookup_fast(
+       struct rhashtable *ht, const void *key,
+       const struct rhashtable_params params)
+{
+       void *obj;
+
+       rcu_read_lock();
+       obj = rhashtable_lookup(ht, key, params);
+       rcu_read_unlock();
+
+       return obj;
+}
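Usage sketch for the two lookup flavours, with the hypothetical example_entry/example_params from earlier: rhashtable_lookup() requires the caller to hold the RCU read lock, while rhashtable_lookup_fast() takes it internally.

static struct example_entry *example_find(struct rhashtable *ht, u64 key)
{
	/* The key pointer must match key_len/key_offset from example_params. */
	return rhashtable_lookup_fast(ht, &key, example_params);
}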
+
+/**
+ * rhltable_lookup - search hash list table
+ * @hlt:       hash table
+ * @key:       the pointer to the key
+ * @params:    hash table parameters
+ *
+ * Computes the hash value for the key and traverses the bucket chain looking
+ * for an entry with an identical key.  All matching entries are returned
+ * in a list.
+ *
+ * This must only be called under the RCU read lock.
+ *
+ * Returns the list of entries that match the given key.
+ */
+static inline struct rhlist_head *rhltable_lookup(
+       struct rhltable *hlt, const void *key,
        const struct rhashtable_params params)
+{
+       struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params);
+
+       return he ? container_of(he, struct rhlist_head, rhead) : NULL;
+}
+
+/* Internal function, please use rhashtable_insert_fast() instead. This
+ * function returns the existing element already in the hash table if there is a clash,
+ * otherwise it returns an error via ERR_PTR().
+ */
+static inline void *__rhashtable_insert_fast(
+       struct rhashtable *ht, const void *key, struct rhash_head *obj,
+       const struct rhashtable_params params, bool rhlist)
 {
        struct rhashtable_compare_arg arg = {
                .ht = ht,
                .key = key,
        };
-       struct bucket_table *tbl, *new_tbl;
+       struct rhash_lock_head __rcu **bkt;
+       struct rhash_head __rcu **pprev;
+       struct bucket_table *tbl;
        struct rhash_head *head;
-       spinlock_t *lock;
-       unsigned int elasticity;
        unsigned int hash;
-       int err;
+       int elasticity;
+       void *data;
 
-restart:
        rcu_read_lock();
 
        tbl = rht_dereference_rcu(ht->tbl, ht);
+       hash = rht_head_hashfn(ht, tbl, obj, params);
+       elasticity = RHT_ELASTICITY;
+       bkt = rht_bucket_insert(ht, tbl, hash);
+       data = ERR_PTR(-ENOMEM);
+       if (!bkt)
+               goto out;
+       pprev = NULL;
+       rht_lock(tbl, bkt);
 
-       /* All insertions must grab the oldest table containing
-        * the hashed bucket that is yet to be rehashed.
-        */
-       for (;;) {
-               hash = rht_head_hashfn(ht, tbl, obj, params);
-               lock = rht_bucket_lock(tbl, hash);
-               spin_lock_bh(lock);
+       if (unlikely(rcu_access_pointer(tbl->future_tbl))) {
+slow_path:
+               rht_unlock(tbl, bkt);
+               rcu_read_unlock();
+               return rhashtable_insert_slow(ht, key, obj);
+       }
 
-               if (tbl->rehash <= hash)
-                       break;
+       rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) {
+               struct rhlist_head *plist;
+               struct rhlist_head *list;
 
-               spin_unlock_bh(lock);
-               tbl = rht_dereference_rcu(tbl->future_tbl, ht);
-       }
+               elasticity--;
+               if (!key ||
+                   (params.obj_cmpfn ?
+                    params.obj_cmpfn(&arg, rht_obj(ht, head)) :
+                    rhashtable_compare(&arg, rht_obj(ht, head)))) {
+                       pprev = &head->next;
+                       continue;
+               }
 
-       new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
-       if (unlikely(new_tbl)) {
-               tbl = rhashtable_insert_slow(ht, key, obj, new_tbl);
-               if (!IS_ERR_OR_NULL(tbl))
-                       goto slow_path;
+               data = rht_obj(ht, head);
 
-               err = PTR_ERR(tbl);
-               goto out;
-       }
+               if (!rhlist)
+                       goto out_unlock;
 
-       err = -E2BIG;
-       if (unlikely(rht_grow_above_max(ht, tbl)))
-               goto out;
 
-       if (unlikely(rht_grow_above_100(ht, tbl))) {
-slow_path:
-               spin_unlock_bh(lock);
-               err = rhashtable_insert_rehash(ht, tbl);
-               rcu_read_unlock();
-               if (err)
-                       return err;
+               list = container_of(obj, struct rhlist_head, rhead);
+               plist = container_of(head, struct rhlist_head, rhead);
 
-               goto restart;
+               RCU_INIT_POINTER(list->next, plist);
+               head = rht_dereference_bucket(head->next, tbl, hash);
+               RCU_INIT_POINTER(list->rhead.next, head);
+               if (pprev) {
+                       rcu_assign_pointer(*pprev, obj);
+                       rht_unlock(tbl, bkt);
+               } else
+                       rht_assign_unlock(tbl, bkt, obj);
+               data = NULL;
+               goto out;
        }
 
-       err = -EEXIST;
-       elasticity = ht->elasticity;
-       rht_for_each(head, tbl, hash) {
-               if (key &&
-                   unlikely(!(params.obj_cmpfn ?
-                              params.obj_cmpfn(&arg, rht_obj(ht, head)) :
-                              rhashtable_compare(&arg, rht_obj(ht, head)))))
-                       goto out;
-               if (!--elasticity)
-                       goto slow_path;
-       }
+       if (elasticity <= 0)
+               goto slow_path;
+
+       data = ERR_PTR(-E2BIG);
+       if (unlikely(rht_grow_above_max(ht, tbl)))
+               goto out_unlock;
 
-       err = 0;
+       if (unlikely(rht_grow_above_100(ht, tbl)))
+               goto slow_path;
 
-       head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash);
+       /* Inserting at head of list makes unlocking free. */
+       head = rht_ptr(bkt, tbl, hash);
 
        RCU_INIT_POINTER(obj->next, head);
+       if (rhlist) {
+               struct rhlist_head *list;
 
-       rcu_assign_pointer(tbl->buckets[hash], obj);
+               list = container_of(obj, struct rhlist_head, rhead);
+               RCU_INIT_POINTER(list->next, NULL);
+       }
 
        atomic_inc(&ht->nelems);
+       rht_assign_unlock(tbl, bkt, obj);
+
        if (rht_grow_above_75(ht, tbl))
                schedule_work(&ht->run_work);
 
+       data = NULL;
 out:
-       spin_unlock_bh(lock);
        rcu_read_unlock();
 
-       return err;
+       return data;
+
+out_unlock:
+       rht_unlock(tbl, bkt);
+       goto out;
 }
 
+/**
+ * rhashtable_insert_fast - insert object into hash table
+ * @ht:                hash table
+ * @obj:       pointer to hash head inside object
+ * @params:    hash table parameters
+ *
+ * Will take the per bucket bitlock to protect against mutual mutations
+ * on the same bucket. Multiple insertions may occur in parallel unless
+ * they map to the same bucket.
+ *
+ * It is safe to call this function from atomic context.
+ *
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
+ */
+static inline int rhashtable_insert_fast(
+       struct rhashtable *ht, struct rhash_head *obj,
+       const struct rhashtable_params params)
+{
+       void *ret;
+
+       ret = __rhashtable_insert_fast(ht, NULL, obj, params, false);
+       if (IS_ERR(ret))
+               return PTR_ERR(ret);
+
+       return ret == NULL ? 0 : -EEXIST;
+}
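Insertion sketch, again with the hypothetical example types: the embedded rhash_head is what gets passed, and -EEXIST reports a duplicate key.

static int example_add(struct rhashtable *ht, struct example_entry *e)
{
	return rhashtable_insert_fast(ht, &e->hash, example_params);
}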
+
+/**
+ * rhltable_insert_key - insert object into hash list table
+ * @hlt:       hash list table
+ * @key:       the pointer to the key
+ * @list:      pointer to hash list head inside object
+ * @params:    hash table parameters
+ *
+ * Will take the per bucket bitlock to protect against mutual mutations
+ * on the same bucket. Multiple insertions may occur in parallel unless
+ * they map to the same bucket.
+ *
+ * It is safe to call this function from atomic context.
+ *
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
+ */
+static inline int rhltable_insert_key(
+       struct rhltable *hlt, const void *key, struct rhlist_head *list,
+       const struct rhashtable_params params)
+{
+       return PTR_ERR(__rhashtable_insert_fast(&hlt->ht, key, &list->rhead,
+                                               params, true));
+}
+
+/**
+ * rhltable_insert - insert object into hash list table
+ * @hlt:       hash list table
+ * @list:      pointer to hash list head inside object
+ * @params:    hash table parameters
+ *
+ * Will take the per bucket bitlock to protect against mutual mutations
+ * on the same bucket. Multiple insertions may occur in parallel unless
+ * they map to the same bucket.
+ *
+ * It is safe to call this function from atomic context.
+ *
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
+ */
+static inline int rhltable_insert(
+       struct rhltable *hlt, struct rhlist_head *list,
+       const struct rhashtable_params params)
+{
+       const char *key = rht_obj(&hlt->ht, &list->rhead);
+
+       key += params.key_offset;
+
+       return rhltable_insert_key(hlt, key, list, params);
+}
+
+/**
+ * rhashtable_lookup_insert_fast - lookup and insert object into hash table
+ * @ht:                hash table
+ * @obj:       pointer to hash head inside object
+ * @params:    hash table parameters
+ *
+ * This lookup function may only be used for a fixed key hash table (key_len
+ * parameter set). It will BUG() if used inappropriately.
+ *
+ * It is safe to call this function from atomic context.
+ *
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
+ */
 static inline int rhashtable_lookup_insert_fast(
        struct rhashtable *ht, struct rhash_head *obj,
        const struct rhashtable_params params)
 {
        const char *key = rht_obj(ht, obj);
+       void *ret;
 
        BUG_ON(ht->p.obj_hashfn);
 
-       return __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj,
-                                       params);
+       ret = __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params,
+                                      false);
+       if (IS_ERR(ret))
+               return PTR_ERR(ret);
+
+       return ret == NULL ? 0 : -EEXIST;
 }
 
-static inline int __rhashtable_remove_fast(
+/**
+ * rhashtable_lookup_get_insert_fast - lookup and insert object into hash table
+ * @ht:                hash table
+ * @obj:       pointer to hash head inside object
+ * @params:    hash table parameters
+ *
+ * Just like rhashtable_lookup_insert_fast(), but this function returns the
+ * object if it exists, NULL if it did not and the insertion was successful,
+ * and an ERR_PTR otherwise.
+ */
+static inline void *rhashtable_lookup_get_insert_fast(
+       struct rhashtable *ht, struct rhash_head *obj,
+       const struct rhashtable_params params)
+{
+       const char *key = rht_obj(ht, obj);
+
+       BUG_ON(ht->p.obj_hashfn);
+
+       return __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params,
+                                       false);
+}
+
+/**
+ * rhashtable_lookup_insert_key - search and insert object to hash table
+ *                               with explicit key
+ * @ht:                hash table
+ * @key:       key
+ * @obj:       pointer to hash head inside object
+ * @params:    hash table parameters
+ *
+ * Lookups may occur in parallel with hashtable mutations and resizing.
+ *
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
+ *
+ * Returns zero on success.
+ */
+static inline int rhashtable_lookup_insert_key(
+       struct rhashtable *ht, const void *key, struct rhash_head *obj,
+       const struct rhashtable_params params)
+{
+       void *ret;
+
+       BUG_ON(!ht->p.obj_hashfn || !key);
+
+       ret = __rhashtable_insert_fast(ht, key, obj, params, false);
+       if (IS_ERR(ret))
+               return PTR_ERR(ret);
+
+       return ret == NULL ? 0 : -EEXIST;
+}
+
+/**
+ * rhashtable_lookup_get_insert_key - lookup and insert object into hash table
+ * @ht:                hash table
+ * @key:       key
+ * @obj:       pointer to hash head inside object
+ * @params:    hash table parameters
+ *
+ * Just like rhashtable_lookup_insert_key(), but this function returns the
+ * object if it exists, NULL if it does not and the insertion was successful,
+ * and an ERR_PTR otherwise.
+ */
+static inline void *rhashtable_lookup_get_insert_key(
+       struct rhashtable *ht, const void *key, struct rhash_head *obj,
+       const struct rhashtable_params params)
+{
+       BUG_ON(!ht->p.obj_hashfn || !key);
+
+       return __rhashtable_insert_fast(ht, key, obj, params, false);
+}
+
+/* Internal function, please use rhashtable_remove_fast() instead */
+static inline int __rhashtable_remove_fast_one(
        struct rhashtable *ht, struct bucket_table *tbl,
-       struct rhash_head *obj, const struct rhashtable_params params)
+       struct rhash_head *obj, const struct rhashtable_params params,
+       bool rhlist)
 {
+       struct rhash_lock_head __rcu **bkt;
        struct rhash_head __rcu **pprev;
        struct rhash_head *he;
-       spinlock_t * lock;
        unsigned int hash;
        int err = -ENOENT;
 
        hash = rht_head_hashfn(ht, tbl, obj, params);
-       lock = rht_bucket_lock(tbl, hash);
+       bkt = rht_bucket_var(tbl, hash);
+       if (!bkt)
+               return -ENOENT;
+       pprev = NULL;
+       rht_lock(tbl, bkt);
 
-       spin_lock_bh(lock);
+       rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) {
+               struct rhlist_head *list;
+
+               list = container_of(he, struct rhlist_head, rhead);
 
-       pprev = &tbl->buckets[hash];
-       rht_for_each(he, tbl, hash) {
                if (he != obj) {
+                       struct rhlist_head __rcu **lpprev;
+
                        pprev = &he->next;
-                       continue;
+
+                       if (!rhlist)
+                               continue;
+
+                       do {
+                               lpprev = &list->next;
+                               list = rht_dereference_bucket(list->next,
+                                                             tbl, hash);
+                       } while (list && obj != &list->rhead);
+
+                       if (!list)
+                               continue;
+
+                       list = rht_dereference_bucket(list->next, tbl, hash);
+                       RCU_INIT_POINTER(*lpprev, list);
+                       err = 0;
+                       break;
                }
 
-               rcu_assign_pointer(*pprev, obj->next);
+               obj = rht_dereference_bucket(obj->next, tbl, hash);
+               err = 1;
+
+               if (rhlist) {
+                       list = rht_dereference_bucket(list->next, tbl, hash);
+                       if (list) {
+                               RCU_INIT_POINTER(list->rhead.next, obj);
+                               obj = &list->rhead;
+                               err = 0;
+                       }
+               }
+
+               if (pprev) {
+                       rcu_assign_pointer(*pprev, obj);
+                       rht_unlock(tbl, bkt);
+               } else {
+                       rht_assign_unlock(tbl, bkt, obj);
+               }
+               goto unlocked;
+       }
+
+       rht_unlock(tbl, bkt);
+unlocked:
+       if (err > 0) {
+               atomic_dec(&ht->nelems);
+               if (unlikely(ht->p.automatic_shrinking &&
+                            rht_shrink_below_30(ht, tbl)))
+                       schedule_work(&ht->run_work);
                err = 0;
-               break;
        }
 
-       spin_unlock_bh(lock);
+       return err;
+}
+
+/* Internal function, please use rhashtable_remove_fast() instead */
+static inline int __rhashtable_remove_fast(
+       struct rhashtable *ht, struct rhash_head *obj,
+       const struct rhashtable_params params, bool rhlist)
+{
+       struct bucket_table *tbl;
+       int err;
+
+       rcu_read_lock();
+
+       tbl = rht_dereference_rcu(ht->tbl, ht);
+
+       /* Because we have already taken (and released) the bucket
+        * lock in old_tbl, if we find that future_tbl is not yet
+        * visible then that guarantees that the entry is still in
+        * the old tbl if it exists.
+        */
+       while ((err = __rhashtable_remove_fast_one(ht, tbl, obj, params,
+                                                  rhlist)) &&
+              (tbl = rht_dereference_rcu(tbl->future_tbl, ht)))
+               ;
+
+       rcu_read_unlock();
 
        return err;
 }
 
+/**
+ * rhashtable_remove_fast - remove object from hash table
+ * @ht:                hash table
+ * @obj:       pointer to hash head inside object
+ * @params:    hash table parameters
+ *
+ * Since the hash chain is singly linked, the removal operation needs to
+ * walk the bucket chain upon removal. The removal operation is thus
+ * considerably slow if the hash table is not correctly sized.
+ *
+ * Will automatically shrink the table if permitted when residency drops
+ * below 30%.
+ *
+ * Returns zero on success, -ENOENT if the entry could not be found.
+ */
 static inline int rhashtable_remove_fast(
        struct rhashtable *ht, struct rhash_head *obj,
        const struct rhashtable_params params)
+{
+       return __rhashtable_remove_fast(ht, obj, params, false);
+}
+
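A minimal lifecycle sketch showing where rhashtable_remove_fast() fits (illustrative only, not from this patch; struct demo_obj and demo_params are hypothetical, and rhashtable_init(), rhashtable_insert_fast(), rhashtable_destroy() and GFP_KERNEL are assumed to be provided by this header and the surrounding shims, as in the kernel):

struct demo_obj {
        u32               key;
        struct rhash_head node;         /* linkage embedded in the object */
};

static const struct rhashtable_params demo_params = {
        .key_len             = sizeof(u32),
        .key_offset          = offsetof(struct demo_obj, key),
        .head_offset         = offsetof(struct demo_obj, node),
        .automatic_shrinking = true,    /* enables the 30% shrink path above */
};

static int demo_lifecycle(void)
{
        struct rhashtable ht;
        struct demo_obj *obj;
        int ret = rhashtable_init(&ht, &demo_params);

        if (ret)
                return ret;

        obj = kzalloc(sizeof(*obj), GFP_KERNEL);
        if (!obj) {
                ret = -ENOMEM;
                goto out;
        }
        obj->key = 42;

        ret = rhashtable_insert_fast(&ht, &obj->node, demo_params);
        if (ret) {
                kfree(obj);
                goto out;
        }

        /* later: unlink again; in this single-threaded sketch it is safe to
         * free immediately, real users must wait out RCU readers first */
        if (!rhashtable_remove_fast(&ht, &obj->node, demo_params))
                kfree(obj);
out:
        rhashtable_destroy(&ht);
        return ret;
}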
+/**
+ * rhltable_remove - remove object from hash list table
+ * @hlt:       hash list table
+ * @list:      pointer to hash list head inside object
+ * @params:    hash table parameters
+ *
+ * Since the hash chain is singly linked, the removal operation needs to
+ * walk the bucket chain upon removal. The removal operation is thus
+ * considerably slow if the hash table is not correctly sized.
+ *
+ * Will automatically shrink the table if permitted when residency drops
+ * below 30%.
+ *
+ * Returns zero on success, -ENOENT if the entry could not be found.
+ */
+static inline int rhltable_remove(
+       struct rhltable *hlt, struct rhlist_head *list,
+       const struct rhashtable_params params)
+{
+       return __rhashtable_remove_fast(&hlt->ht, &list->rhead, params, true);
+}
+
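rhltable is the duplicate-key variant: objects embed a struct rhlist_head, and entries with equal keys hang off a single bucket slot. A hedged sketch of the insert/lookup/remove flow (not from this patch; it assumes rhltable_insert(), rhltable_lookup() and rhl_for_each_entry_rcu() are present in this copy of the header as in the kernel, and reuses hypothetical params like the earlier sketch but with head_offset pointing at the rhlist_head):

struct demo_dup {
        u32                key;
        struct rhlist_head list;        /* note: rhlist_head, not rhash_head */
};

static int count_key(struct rhltable *hlt, struct demo_dup *obj,
                     const struct rhashtable_params params)
{
        struct rhlist_head *head, *pos;
        struct demo_dup *cur;
        int n = 0;

        rhltable_insert(hlt, &obj->list, params);

        rcu_read_lock();
        head = rhltable_lookup(hlt, &obj->key, params);
        rhl_for_each_entry_rcu(cur, pos, head, list)    /* all entries sharing the key */
                n++;
        rcu_read_unlock();

        rhltable_remove(hlt, &obj->list, params);       /* unlinks only this instance */
        return n;
}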
+/* Internal function, please use rhashtable_replace_fast() instead */
+static inline int __rhashtable_replace_fast(
+       struct rhashtable *ht, struct bucket_table *tbl,
+       struct rhash_head *obj_old, struct rhash_head *obj_new,
+       const struct rhashtable_params params)
+{
+       struct rhash_lock_head __rcu **bkt;
+       struct rhash_head __rcu **pprev;
+       struct rhash_head *he;
+       unsigned int hash;
+       int err = -ENOENT;
+
+       /* Minimally, the old and new objects must have the same hash
+        * (which should mean identifiers are the same).
+        */
+       hash = rht_head_hashfn(ht, tbl, obj_old, params);
+       if (hash != rht_head_hashfn(ht, tbl, obj_new, params))
+               return -EINVAL;
+
+       bkt = rht_bucket_var(tbl, hash);
+       if (!bkt)
+               return -ENOENT;
+
+       pprev = NULL;
+       rht_lock(tbl, bkt);
+
+       rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) {
+               if (he != obj_old) {
+                       pprev = &he->next;
+                       continue;
+               }
+
+               rcu_assign_pointer(obj_new->next, obj_old->next);
+               if (pprev) {
+                       rcu_assign_pointer(*pprev, obj_new);
+                       rht_unlock(tbl, bkt);
+               } else {
+                       rht_assign_unlock(tbl, bkt, obj_new);
+               }
+               err = 0;
+               goto unlocked;
+       }
+
+       rht_unlock(tbl, bkt);
+
+unlocked:
+       return err;
+}
+
+/**
+ * rhashtable_replace_fast - replace an object in hash table
+ * @ht:                hash table
+ * @obj_old:   pointer to hash head inside object being replaced
+ * @obj_new:   pointer to hash head inside object which is new
+ * @params:    hash table parameters
+ *
+ * Replacing an object doesn't affect the number of elements in the hash table
+ * or bucket, so we don't need to worry about shrinking or expanding the
+ * table here.
+ *
+ * Returns zero on success, -ENOENT if the entry could not be found,
+ * -EINVAL if hash is not the same for the old and new objects.
+ */
+static inline int rhashtable_replace_fast(
+       struct rhashtable *ht, struct rhash_head *obj_old,
+       struct rhash_head *obj_new,
+       const struct rhashtable_params params)
 {
        struct bucket_table *tbl;
        int err;
@@ -442,22 +1207,62 @@ static inline int rhashtable_remove_fast(
         * visible then that guarantees the entry to still be in
         * the old tbl if it exists.
         */
-       while ((err = __rhashtable_remove_fast(ht, tbl, obj, params)) &&
+       while ((err = __rhashtable_replace_fast(ht, tbl, obj_old,
+                                               obj_new, params)) &&
               (tbl = rht_dereference_rcu(tbl->future_tbl, ht)))
                ;
 
-       if (err)
-               goto out;
-
-       atomic_dec(&ht->nelems);
-       if (unlikely(ht->p.automatic_shrinking &&
-                    rht_shrink_below_30(ht, tbl)))
-               schedule_work(&ht->run_work);
-
-out:
        rcu_read_unlock();
 
        return err;
 }
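A sketch of how the replace path above is used to swap an element for an updated copy carrying the same key (illustrative only; demo_obj is the hypothetical structure from the earlier sketch, and freeing the old object is left to the caller since it must wait out an RCU grace period):

static int demo_replace(struct rhashtable *ht, struct demo_obj *old,
                        struct demo_obj *new,
                        const struct rhashtable_params params)
{
        int ret;

        new->key = old->key;    /* must hash identically, else -EINVAL */

        ret = rhashtable_replace_fast(ht, &old->node, &new->node, params);

        /* on success, @old is unlinked but may still be referenced by RCU
         * readers; free it only after a grace period */
        return ret;
}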
 
+/**
+ * rhltable_walk_enter - Initialise an iterator
+ * @hlt:       Table to walk over
+ * @iter:      Hash table Iterator
+ *
+ * This function prepares a hash table walk.
+ *
+ * Note that if you restart a walk after rhashtable_walk_stop you
+ * may see the same object twice.  Also, you may miss objects if
+ * there are removals in between rhashtable_walk_stop and the next
+ * call to rhashtable_walk_start.
+ *
+ * For a completely stable walk you should construct your own data
+ * structure outside the hash table.
+ *
+ * This function may be called from any process context, including
+ * non-preemptable context, but cannot be called from softirq or
+ * hardirq context.
+ *
+ * You must call rhashtable_walk_exit after this function returns.
+ */
+static inline void rhltable_walk_enter(struct rhltable *hlt,
+                                      struct rhashtable_iter *iter)
+{
+       return rhashtable_walk_enter(&hlt->ht, iter);
+}
+
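The caveats in the comment above translate into the usual walker loop; a sketch (not from this patch; rhashtable_walk_start()/next()/stop()/exit() are assumed to be provided by this header as in the kernel, and demo_dup is the hypothetical structure from the earlier sketch):

static void demo_walk(struct rhltable *hlt)
{
        struct rhashtable_iter iter;
        struct demo_dup *obj;

        rhltable_walk_enter(hlt, &iter);
        rhashtable_walk_start(&iter);

        while ((obj = rhashtable_walk_next(&iter)) != NULL) {
                if (IS_ERR(obj)) {
                        if (PTR_ERR(obj) == -EAGAIN)
                                continue;       /* table resized under us */
                        break;
                }
                /* use obj; it may be seen twice across stop/start cycles */
        }

        rhashtable_walk_stop(&iter);
        rhashtable_walk_exit(&iter);
}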
+/**
+ * rhltable_free_and_destroy - free elements and destroy hash list table
+ * @hlt:       the hash list table to destroy
+ * @free_fn:   callback to release resources of element
+ * @arg:       pointer passed to free_fn
+ *
+ * See documentation for rhashtable_free_and_destroy.
+ */
+static inline void rhltable_free_and_destroy(struct rhltable *hlt,
+                                            void (*free_fn)(void *ptr,
+                                                            void *arg),
+                                            void *arg)
+{
+       return rhashtable_free_and_destroy(&hlt->ht, free_fn, arg);
+}
+
+static inline void rhltable_destroy(struct rhltable *hlt)
+{
+       return rhltable_free_and_destroy(hlt, NULL, NULL);
+}
+
 #endif /* _LINUX_RHASHTABLE_H */
index 347105c6e0caa83b7637459bd73de71bfe5bfb47..03feda7ab1defe0d03c9dce745df7a52f311d3c7 100644 (file)
@@ -1,7 +1,8 @@
 #ifndef _LINUX_SCHED_MM_H
 #define _LINUX_SCHED_MM_H
 
-#define PF_MEMALLOC_NOFS 0
+#define PF_MEMALLOC            0x00000800      /* Allocating memory */
+#define PF_MEMALLOC_NOFS       0x00040000      /* All allocation requests will inherit GFP_NOFS */
 
 static inline unsigned int memalloc_nofs_save(void)
 {
@@ -15,4 +16,16 @@ static inline void memalloc_nofs_restore(unsigned int flags)
        current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags;
 }
 
+static inline unsigned int memalloc_noreclaim_save(void)
+{
+       unsigned int flags = current->flags & PF_MEMALLOC;
+       current->flags |= PF_MEMALLOC;
+       return flags;
+}
+
+static inline void memalloc_noreclaim_restore(unsigned int flags)
+{
+       current->flags = (current->flags & ~PF_MEMALLOC) | flags;
+}
+
 #endif /* _LINUX_SCHED_MM_H */
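The pair added above follows the kernel's scoped-flag pattern: save returns whatever PF_MEMALLOC state was already set, so nested sections restore the outer scope correctly. A minimal sketch (in this userspace shim the flag has no allocator effect, but the call pattern matches kernel users; GFP_KERNEL is assumed to be defined by the shims):

static void *alloc_in_reclaim_path(size_t size)
{
        unsigned int flags = memalloc_noreclaim_save();  /* enter PF_MEMALLOC scope */
        void *p = kmalloc(size, GFP_KERNEL);             /* must not recurse into reclaim */

        memalloc_noreclaim_restore(flags);               /* restore the outer scope's state */
        return p;
}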
index a16e94f482e972fe7fdafbd2f82b55cbc41e0b2d..477c33eb00d7dca36ad07cde09cd5681455cf6b2 100644 (file)
@@ -80,7 +80,8 @@ union six_lock_state {
        };
 
        struct {
-               unsigned        read_lock:28;
+               unsigned        read_lock:27;
+               unsigned        write_locking:1;
                unsigned        intent_lock:1;
                unsigned        waiters:3;
                /*
@@ -107,6 +108,7 @@ struct six_lock {
        unsigned                intent_lock_recurse;
        struct task_struct      *owner;
        struct optimistic_spin_queue osq;
+       unsigned __percpu       *readers;
 
        raw_spinlock_t          wait_lock;
        struct list_head        wait_list[2];
@@ -194,4 +196,8 @@ void six_lock_increment(struct six_lock *, enum six_lock_type);
 
 void six_lock_wakeup_all(struct six_lock *);
 
+void six_lock_pcpu_free_rcu(struct six_lock *);
+void six_lock_pcpu_free(struct six_lock *);
+void six_lock_pcpu_alloc(struct six_lock *);
+
 #endif /* _LINUX_SIX_H */
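A hedged sketch of the new per-cpu reader mode: after six_lock_pcpu_alloc(), read locks are tracked in the per-cpu "readers" counters rather than the shared read_lock bitfield. The init call and the read/intent/write entry points are defined elsewhere in six.h/six.c and are assumed here:

static void demo_six(void)
{
        struct six_lock lock;

        six_lock_init(&lock);
        six_lock_pcpu_alloc(&lock);     /* opt this lock into per-cpu readers */

        /* ... take/release the lock as usual (read/intent/write) ... */

        six_lock_pcpu_free(&lock);      /* drop the per-cpu counters before the lock dies */
}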
index 32ffa55b0833a7d928d5447544ed41553d78f27a..775b7e3aa7829a0608455626f97c82ee41e4781a 100644 (file)
@@ -58,7 +58,7 @@ static inline void *krealloc(void *old, size_t size, gfp_t flags)
 #define kzalloc(size, flags)           kmalloc(size, flags|__GFP_ZERO)
 #define kmalloc_array(n, size, flags)                                  \
        ((size) != 0 && (n) > SIZE_MAX / (size)                         \
-        ? NULL : kmalloc(n * size, flags))
+        ? NULL : kmalloc((n) * (size), flags))
 
 #define kcalloc(n, size, flags)                kmalloc_array(n, size, flags|__GFP_ZERO)
 
@@ -66,6 +66,7 @@ static inline void *krealloc(void *old, size_t size, gfp_t flags)
 #define kzfree(p)                      free(p)
 
 #define kvmalloc(size, flags)          kmalloc(size, flags)
+#define kvzalloc(size, flags)          kzalloc(size, flags)
 #define kvfree(p)                      kfree(p)
 
 static inline struct page *alloc_pages(gfp_t flags, unsigned int order)
@@ -132,4 +133,35 @@ static inline void *kmemdup(const void *src, size_t len, gfp_t gfp)
        return p;
 }
 
+struct kmem_cache {
+       size_t              obj_size;
+};
+
+static inline void *kmem_cache_alloc(struct kmem_cache *c, gfp_t gfp)
+{
+       return kmalloc(c->obj_size, gfp);
+}
+
+static inline void kmem_cache_free(struct kmem_cache *c, void *p)
+{
+       kfree(p);
+}
+
+static inline void kmem_cache_destroy(struct kmem_cache *p)
+{
+       kfree(p);
+}
+
+static inline struct kmem_cache *kmem_cache_create(size_t obj_size)
+{
+       struct kmem_cache *p = kmalloc(sizeof(*p), GFP_KERNEL);
+       if (!p)
+               return NULL;
+
+       p->obj_size = obj_size;
+       return p;
+}
+
+#define KMEM_CACHE(_struct, _flags)    kmem_cache_create(sizeof(struct _struct))
+
 #endif /* __TOOLS_LINUX_SLAB_H */
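In this userspace shim a kmem_cache is just a recorded object size, and kmem_cache_alloc() falls back to kmalloc(); a short sketch of the API the new code compiles against (demo_item is hypothetical, and the _flags argument to KMEM_CACHE() is ignored by the shim):

struct demo_item {
        int val;
};

static void demo_cache(void)
{
        struct kmem_cache *cache = KMEM_CACHE(demo_item, 0);
        struct demo_item *it;

        if (!cache)
                return;

        it = kmem_cache_alloc(cache, GFP_KERNEL);       /* not zeroed by this shim */
        if (it) {
                it->val = 1;
                kmem_cache_free(cache, it);
        }

        kmem_cache_destroy(cache);
}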
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
new file mode 100644 (file)
index 0000000..75823cf
--- /dev/null
@@ -0,0 +1,31 @@
+#ifndef __TOOLS_LINUX_SRCU_H
+#define __TOOLS_LINUX_SRCU_H
+
+struct srcu_struct {
+};
+
+static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx) {}
+
+static inline int srcu_read_lock(struct srcu_struct *ssp)
+{
+       return 0;
+}
+
+static inline bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
+{
+       return false;
+}
+
+static inline unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp)
+{
+       return 0;
+}
+
+static inline void cleanup_srcu_struct(struct srcu_struct *ssp) {}
+
+static inline int init_srcu_struct(struct srcu_struct *ssp)
+{
+       return 0;
+}
+
+#endif /* __TOOLS_LINUX_SRCU_H */
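These stubs let kernel-style SRCU callers compile unchanged in userspace; a sketch of the pattern they stand in for (note the stub poll_state_synchronize_srcu() always reports that no grace period has elapsed):

static struct srcu_struct demo_srcu;

static void demo_srcu_usage(void)
{
        unsigned long cookie;
        int idx;

        if (init_srcu_struct(&demo_srcu))
                return;

        idx = srcu_read_lock(&demo_srcu);       /* read-side critical section */
        /* ... access SRCU-protected data ... */
        srcu_read_unlock(&demo_srcu, idx);

        cookie = start_poll_synchronize_srcu(&demo_srcu);
        if (poll_state_synchronize_srcu(&demo_srcu, cookie)) {
                /* a grace period has elapsed since the cookie was taken;
                 * objects unpublished before it may now be freed */
        }

        cleanup_srcu_struct(&demo_srcu);
}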
index 387c38314f0535b41cbba148d85929427086dcfe..c9886cbaadcc891f6f1dcdd3369ca415b1cbeaf7 100644 (file)
@@ -11,6 +11,8 @@
 #define __SANE_USERSPACE_TYPES__       /* For PPC64, to get LL64 types */
 #include <asm/types.h>
 
+#include <linux/cache.h>
+
 #define BITS_PER_LONG  __BITS_PER_LONG
 
 struct page;
@@ -31,6 +33,7 @@ typedef unsigned gfp_t;
 #define __GFP_IO       0
 #define __GFP_NOWARN   0
 #define __GFP_NORETRY  0
+#define __GFP_NOFAIL   0
 #define __GFP_ZERO     1
 
 #define PAGE_ALLOC_COSTLY_ORDER        6
index efcc1912765f8c81115ab4a0ade75ca8fe3bc2ac..c674d9a2c05737da3e94d21234dd366f59ecbab9 100644 (file)
@@ -16,6 +16,8 @@ static inline void *__vmalloc(unsigned long size, gfp_t gfp_mask)
 {
        void *p;
 
+       size = round_up(size, PAGE_SIZE);
+
        run_shrinkers();
 
        p = aligned_alloc(PAGE_SIZE, size);
index 62d15e5d73c65f74629e6195b5c9c69939a72078..c3d982421883ac640a56c5005a6d467502b5e86e 100644 (file)
@@ -91,6 +91,7 @@ do {                                                                  \
 } while (0)
 
 #define wait_event_killable(wq, condition)     ({wait_event(wq, condition); 0; })
+#define wait_event_interruptible(wq, condition)        ({wait_event(wq, condition); 0; })
 
 #define __wait_event_timeout(wq, condition, timeout)                   \
        ___wait_event(wq, ___wait_cond_timeout(condition),              \
index 9b4e8295ed75a55b9e9150198e693a758f808f9d..d4cb7a298cc2481a73c3de4c1c5ddd08f117d42e 100644 (file)
@@ -121,6 +121,65 @@ DEFINE_EVENT(bio, journal_write,
        TP_ARGS(bio)
 );
 
+TRACE_EVENT(journal_reclaim_start,
+       TP_PROTO(struct bch_fs *c, u64 min_nr,
+                u64 prereserved, u64 prereserved_total,
+                u64 btree_cache_dirty, u64 btree_cache_total,
+                u64 btree_key_cache_dirty, u64 btree_key_cache_total),
+       TP_ARGS(c, min_nr, prereserved, prereserved_total,
+               btree_cache_dirty, btree_cache_total,
+               btree_key_cache_dirty, btree_key_cache_total),
+
+       TP_STRUCT__entry(
+               __array(char,           uuid,   16              )
+               __field(u64,            min_nr                  )
+               __field(u64,            prereserved             )
+               __field(u64,            prereserved_total       )
+               __field(u64,            btree_cache_dirty       )
+               __field(u64,            btree_cache_total       )
+               __field(u64,            btree_key_cache_dirty   )
+               __field(u64,            btree_key_cache_total   )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
+               __entry->min_nr                 = min_nr;
+               __entry->prereserved            = prereserved;
+               __entry->prereserved_total      = prereserved_total;
+               __entry->btree_cache_dirty      = btree_cache_dirty;
+               __entry->btree_cache_total      = btree_cache_total;
+               __entry->btree_key_cache_dirty  = btree_key_cache_dirty;
+               __entry->btree_key_cache_total  = btree_key_cache_total;
+       ),
+
+       TP_printk("%pU min %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu",
+                 __entry->uuid,
+                 __entry->min_nr,
+                 __entry->prereserved,
+                 __entry->prereserved_total,
+                 __entry->btree_cache_dirty,
+                 __entry->btree_cache_total,
+                 __entry->btree_key_cache_dirty,
+                 __entry->btree_key_cache_total)
+);
+
+TRACE_EVENT(journal_reclaim_finish,
+       TP_PROTO(struct bch_fs *c, u64 nr_flushed),
+       TP_ARGS(c, nr_flushed),
+
+       TP_STRUCT__entry(
+               __array(char,           uuid,   16 )
+               __field(u64,            nr_flushed )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
+               __entry->nr_flushed = nr_flushed;
+       ),
+
+       TP_printk("%pU flushed %llu", __entry->uuid, __entry->nr_flushed)
+);
+
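TRACE_EVENT() expands into a trace_<name>() call site with the TP_PROTO signature; the journal reclaim path is expected to bracket its work roughly as below (sketch only, the counter values are placeholders for whatever the real reclaim code computes):

static void demo_reclaim_tracing(struct bch_fs *c, u64 min_nr)
{
        /* placeholder counters; the real path derives these from journal
         * and btree cache state */
        u64 prereserved = 0, prereserved_total = 0;
        u64 btree_cache_dirty = 0, btree_cache_total = 0;
        u64 btree_key_cache_dirty = 0, btree_key_cache_total = 0;
        u64 nr_flushed = 0;

        trace_journal_reclaim_start(c, min_nr,
                                    prereserved, prereserved_total,
                                    btree_cache_dirty, btree_cache_total,
                                    btree_key_cache_dirty, btree_key_cache_total);

        /* ... flush dirty keys / btree nodes, accumulating nr_flushed ... */

        trace_journal_reclaim_finish(c, nr_flushed);
}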
 /* bset.c: */
 
 DEFINE_EVENT(bpos, bkey_pack_pos_fail,
@@ -513,7 +572,7 @@ TRACE_EVENT(transaction_restart_ip,
                __entry->ip     = ip;
        ),
 
-       TP_printk("%pF %pF", (void *) __entry->caller, (void *) __entry->ip)
+       TP_printk("%ps %pS", (void *) __entry->caller, (void *) __entry->ip)
 );
 
 DECLARE_EVENT_CLASS(transaction_restart,
@@ -528,7 +587,7 @@ DECLARE_EVENT_CLASS(transaction_restart,
                __entry->ip = ip;
        ),
 
-       TP_printk("%pf", (void *) __entry->ip)
+       TP_printk("%ps", (void *) __entry->ip)
 );
 
 DEFINE_EVENT(transaction_restart,      trans_restart_btree_node_reused,
@@ -536,9 +595,46 @@ DEFINE_EVENT(transaction_restart,  trans_restart_btree_node_reused,
        TP_ARGS(ip)
 );
 
-DEFINE_EVENT(transaction_restart,      trans_restart_would_deadlock,
-       TP_PROTO(unsigned long ip),
-       TP_ARGS(ip)
+TRACE_EVENT(trans_restart_would_deadlock,
+       TP_PROTO(unsigned long  trans_ip,
+                unsigned long  caller_ip,
+                unsigned       reason,
+                enum btree_id  have_btree_id,
+                unsigned       have_iter_type,
+                enum btree_id  want_btree_id,
+                unsigned       want_iter_type),
+       TP_ARGS(trans_ip, caller_ip, reason,
+               have_btree_id, have_iter_type,
+               want_btree_id, want_iter_type),
+
+       TP_STRUCT__entry(
+               __field(unsigned long,          trans_ip        )
+               __field(unsigned long,          caller_ip       )
+               __field(u8,                     reason          )
+               __field(u8,                     have_btree_id   )
+               __field(u8,                     have_iter_type  )
+               __field(u8,                     want_btree_id   )
+               __field(u8,                     want_iter_type  )
+       ),
+
+       TP_fast_assign(
+               __entry->trans_ip               = trans_ip;
+               __entry->caller_ip              = caller_ip;
+               __entry->reason                 = reason;
+               __entry->have_btree_id          = have_btree_id;
+               __entry->have_iter_type         = have_iter_type;
+               __entry->want_btree_id          = want_btree_id;
+               __entry->want_iter_type         = want_iter_type;
+       ),
+
+       TP_printk("%ps %pS because %u have %u:%u want %u:%u",
+                 (void *) __entry->trans_ip,
+                 (void *) __entry->caller_ip,
+                 __entry->reason,
+                 __entry->have_btree_id,
+                 __entry->have_iter_type,
+                 __entry->want_btree_id,
+                 __entry->want_iter_type)
 );
 
 TRACE_EVENT(trans_restart_iters_realloced,
@@ -555,7 +651,7 @@ TRACE_EVENT(trans_restart_iters_realloced,
                __entry->nr     = nr;
        ),
 
-       TP_printk("%pf nr %u", (void *) __entry->ip, __entry->nr)
+       TP_printk("%ps nr %u", (void *) __entry->ip, __entry->nr)
 );
 
 TRACE_EVENT(trans_restart_mem_realloced,
@@ -572,7 +668,7 @@ TRACE_EVENT(trans_restart_mem_realloced,
                __entry->bytes  = bytes;
        ),
 
-       TP_printk("%pf bytes %lu", (void *) __entry->ip, __entry->bytes)
+       TP_printk("%ps bytes %lu", (void *) __entry->ip, __entry->bytes)
 );
 
 DEFINE_EVENT(transaction_restart,      trans_restart_journal_res_get,
@@ -585,6 +681,11 @@ DEFINE_EVENT(transaction_restart,  trans_restart_journal_preres_get,
        TP_ARGS(ip)
 );
 
+DEFINE_EVENT(transaction_restart,      trans_restart_journal_reclaim,
+       TP_PROTO(unsigned long ip),
+       TP_ARGS(ip)
+);
+
 DEFINE_EVENT(transaction_restart,      trans_restart_mark_replicas,
        TP_PROTO(unsigned long ip),
        TP_ARGS(ip)
@@ -620,11 +721,6 @@ DEFINE_EVENT(transaction_restart,  trans_restart_traverse,
        TP_ARGS(ip)
 );
 
-DEFINE_EVENT(transaction_restart,      trans_restart_atomic,
-       TP_PROTO(unsigned long ip),
-       TP_ARGS(ip)
-);
-
 DECLARE_EVENT_CLASS(node_lock_fail,
        TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq),
        TP_ARGS(level, iter_seq, node, node_seq),
index 7ff02b881513cb79b785ebc59153560247f9dfbe..e5dcfd8f7bf9a07bd3cb9f4ed0965422262a5cf8 100644 (file)
@@ -35,52 +35,35 @@ static u64 min_size(unsigned bucket_size)
        return BCH_MIN_NR_NBUCKETS * bucket_size;
 }
 
-static void init_layout(struct bch_sb_layout *l, unsigned block_size,
+static void init_layout(struct bch_sb_layout *l,
+                       unsigned block_size,
+                       unsigned sb_size,
                        u64 start, u64 end)
 {
-       unsigned sb_size;
-       u64 backup; /* offset of 2nd sb */
+       unsigned i;
 
        memset(l, 0, sizeof(*l));
 
-       if (start != BCH_SB_SECTOR)
-               start = round_up(start, block_size);
-       end = round_down(end, block_size);
-
-       if (start >= end)
-               die("insufficient space for superblocks");
-
-       /*
-        * Create two superblocks in the allowed range: reserve a maximum of 64k
-        */
-       sb_size = min_t(u64, 128, end - start / 2);
-
-       backup = start + sb_size;
-       backup = round_up(backup, block_size);
-
-       backup = min(backup, end);
-
-       sb_size = min(end - backup, backup- start);
-       sb_size = rounddown_pow_of_two(sb_size);
-
-       if (sb_size < 8)
-               die("insufficient space for superblocks");
-
        l->magic                = BCACHE_MAGIC;
        l->layout_type          = 0;
        l->nr_superblocks       = 2;
        l->sb_max_size_bits     = ilog2(sb_size);
-       l->sb_offset[0]         = cpu_to_le64(start);
-       l->sb_offset[1]         = cpu_to_le64(backup);
+
+       /* Create two superblocks in the allowed range: */
+       for (i = 0; i < l->nr_superblocks; i++) {
+               if (start != BCH_SB_SECTOR)
+                       start = round_up(start, block_size);
+
+               l->sb_offset[i] = cpu_to_le64(start);
+               start += sb_size;
+       }
+
+       if (start >= end)
+               die("insufficient space for superblocks");
 }
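Worked example of the new loop, assuming the defaults introduced elsewhere in this snapshot (sb_size = SUPERBLOCK_SIZE_DEFAULT = 2048 sectors, i.e. 1 MB) plus an assumed block_size of 8 sectors and start = BCH_SB_SECTOR = 8:

    i = 0: start == BCH_SB_SECTOR, so no rounding; sb_offset[0] = 8;    start = 8 + 2048 = 2056
    i = 1: round_up(2056, 8) = 2056;               sb_offset[1] = 2056; start = 2056 + 2048 = 4104

    sb_max_size_bits = ilog2(2048) = 11; the final check dies only if 4104 >= end,
    i.e. if the device cannot hold both superblock copies.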
 
 void bch2_pick_bucket_size(struct bch_opts opts, struct dev_opts *dev)
 {
-       if (!dev->sb_offset) {
-               dev->sb_offset  = BCH_SB_SECTOR;
-               dev->sb_end     = BCH_SB_SECTOR + 256;
-       }
-
        if (!dev->size)
                dev->size = get_size(dev->path, dev->fd) >> 9;
 
@@ -202,13 +185,16 @@ struct bch_sb *bch2_format(struct bch_opt_strs    fs_opt_strs,
        if (bch2_sb_realloc(&sb, 0))
                die("insufficient memory");
 
-       sb.sb->version          = le16_to_cpu(bcachefs_metadata_version_current);
-       sb.sb->version_min      = le16_to_cpu(bcachefs_metadata_version_current);
+       sb.sb->version          = le16_to_cpu(opts.version);
+       sb.sb->version_min      = le16_to_cpu(opts.version);
        sb.sb->magic            = BCACHE_MAGIC;
        sb.sb->block_size       = cpu_to_le16(fs_opts.block_size);
        sb.sb->user_uuid        = opts.uuid;
        sb.sb->nr_devices       = nr_devs;
 
+       if (opts.version == bcachefs_metadata_version_current)
+               sb.sb->features[0] |= BCH_SB_FEATURES_ALL;
+
        uuid_generate(sb.sb->uuid.b);
 
        if (opts.label)
@@ -255,7 +241,7 @@ struct bch_sb *bch2_format(struct bch_opt_strs      fs_opt_strs,
                m->first_bucket = 0;
                m->bucket_size  = cpu_to_le16(i->bucket_size);
 
-               SET_BCH_MEMBER_REPLACEMENT(m,   CACHE_REPLACEMENT_LRU);
+               SET_BCH_MEMBER_REPLACEMENT(m,   BCH_CACHE_REPLACEMENT_lru);
                SET_BCH_MEMBER_DISCARD(m,       i->discard);
                SET_BCH_MEMBER_DATA_ALLOWED(m,  i->data_allowed);
                SET_BCH_MEMBER_DURABILITY(m,    i->durability + 1);
@@ -282,6 +268,8 @@ struct bch_sb *bch2_format(struct bch_opt_strs      fs_opt_strs,
                parse_target(&sb, devs, nr_devs, fs_opt_strs.background_target));
        SET_BCH_SB_PROMOTE_TARGET(sb.sb,
                parse_target(&sb, devs, nr_devs, fs_opt_strs.promote_target));
+       SET_BCH_SB_METADATA_TARGET(sb.sb,
+               parse_target(&sb, devs, nr_devs, fs_opt_strs.metadata_target));
 
        /* Crypt: */
        if (opts.encrypted) {
@@ -295,7 +283,13 @@ struct bch_sb *bch2_format(struct bch_opt_strs     fs_opt_strs,
        for (i = devs; i < devs + nr_devs; i++) {
                sb.sb->dev_idx = i - devs;
 
+               if (!i->sb_offset) {
+                       i->sb_offset    = BCH_SB_SECTOR;
+                       i->sb_end       = i->size;
+               }
+
                init_layout(&sb.sb->layout, fs_opts.block_size,
+                           opts.superblock_size,
                            i->sb_offset, i->sb_end);
 
                if (i->sb_offset == BCH_SB_SECTOR) {
@@ -533,14 +527,14 @@ static void bch2_sb_print_members(struct bch_sb *sb, struct bch_sb_field *f,
                       time_str,
 
                       BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR
-                      ? bch2_dev_state[BCH_MEMBER_STATE(m)]
+                      ? bch2_member_states[BCH_MEMBER_STATE(m)]
                       : "unknown",
 
                       group,
                       data_allowed_str,
                       data_has_str,
 
-                      BCH_MEMBER_REPLACEMENT(m) < CACHE_REPLACEMENT_NR
+                      BCH_MEMBER_REPLACEMENT(m) < BCH_CACHE_REPLACEMENT_NR
                       ? bch2_cache_replacement_policies[BCH_MEMBER_REPLACEMENT(m)]
                       : "unknown",
 
@@ -619,6 +613,11 @@ static void bch2_sb_print_disk_groups(struct bch_sb *sb, struct bch_sb_field *f,
 static void bch2_sb_print_clean(struct bch_sb *sb, struct bch_sb_field *f,
                                enum units units)
 {
+       struct bch_sb_field_clean *clean = field_to_type(f, clean);
+
+
+       printf("  flags:       %x", le32_to_cpu(clean->flags));
+       printf("  journal seq: %llx", le64_to_cpu(clean->journal_seq));
 }
 
 static void bch2_sb_print_journal_seq_blacklist(struct bch_sb *sb, struct bch_sb_field *f,
@@ -669,13 +668,15 @@ void bch2_sb_print(struct bch_sb *sb, bool print_layout,
 {
        struct bch_sb_field_members *mi;
        char user_uuid_str[40], internal_uuid_str[40];
-       char features_str[200];
+       char features_str[500];
+       char compat_features_str[500];
        char fields_have_str[200];
        char label[BCH_SB_LABEL_SIZE + 1];
        char time_str[64];
        char foreground_str[64];
        char background_str[64];
        char promote_str[64];
+       char metadata_str[64];
        struct bch_sb_field *f;
        u64 fields_have = 0;
        unsigned nr_devices = 0;
@@ -715,10 +716,17 @@ void bch2_sb_print(struct bch_sb *sb, bool print_layout,
        bch2_sb_get_target(sb, promote_str, sizeof(promote_str),
                BCH_SB_PROMOTE_TARGET(sb));
 
+       bch2_sb_get_target(sb, metadata_str, sizeof(metadata_str),
+               BCH_SB_METADATA_TARGET(sb));
+
        bch2_flags_to_text(&PBUF(features_str),
                           bch2_sb_features,
                           le64_to_cpu(sb->features[0]));
 
+       bch2_flags_to_text(&PBUF(compat_features_str),
+                          bch2_sb_compat,
+                          le64_to_cpu(sb->compat[0]));
+
        vstruct_for_each(sb, f)
                fields_have |= 1 << le32_to_cpu(f->type);
        bch2_flags_to_text(&PBUF(fields_have_str),
@@ -726,8 +734,10 @@ void bch2_sb_print(struct bch_sb *sb, bool print_layout,
 
        printf("External UUID:                  %s\n"
               "Internal UUID:                  %s\n"
+              "Device index:                   %u\n"
               "Label:                          %s\n"
-              "Version:                        %llu\n"
+              "Version:                        %u\n"
+              "Oldest version on disk:         %u\n"
               "Created:                        %s\n"
               "Squence number:                 %llu\n"
               "Block_size:                     %s\n"
@@ -735,6 +745,7 @@ void bch2_sb_print(struct bch_sb *sb, bool print_layout,
               "Error action:                   %s\n"
               "Clean:                          %llu\n"
               "Features:                       %s\n"
+              "Compat features:                %s\n"
 
               "Metadata replicas:              %llu\n"
               "Data replicas:                  %llu\n"
@@ -746,6 +757,7 @@ void bch2_sb_print(struct bch_sb *sb, bool print_layout,
               "Foreground write target:        %s\n"
               "Background write target:        %s\n"
               "Promote target:                 %s\n"
+               "Metadata target:                %s\n"
 
               "String hash type:               %s (%llu)\n"
               "32 bit inodes:                  %llu\n"
@@ -757,19 +769,22 @@ void bch2_sb_print(struct bch_sb *sb, bool print_layout,
               "Superblock size:                %llu\n",
               user_uuid_str,
               internal_uuid_str,
+              sb->dev_idx,
               label,
-              le64_to_cpu(sb->version),
+              le16_to_cpu(sb->version),
+              le16_to_cpu(sb->version_min),
               time_str,
               le64_to_cpu(sb->seq),
               pr_units(le16_to_cpu(sb->block_size), units),
               pr_units(BCH_SB_BTREE_NODE_SIZE(sb), units),
 
-              BCH_SB_ERROR_ACTION(sb) < BCH_NR_ERROR_ACTIONS
+              BCH_SB_ERROR_ACTION(sb) < BCH_ON_ERROR_NR
               ? bch2_error_actions[BCH_SB_ERROR_ACTION(sb)]
               : "unknown",
 
               BCH_SB_CLEAN(sb),
               features_str,
+              compat_features_str,
 
               BCH_SB_META_REPLICAS_WANT(sb),
               BCH_SB_DATA_REPLICAS_WANT(sb),
@@ -792,6 +807,7 @@ void bch2_sb_print(struct bch_sb *sb, bool print_layout,
               foreground_str,
               background_str,
               promote_str,
+               metadata_str,
 
               BCH_SB_STR_HASH_TYPE(sb) < BCH_STR_HASH_NR
               ? bch2_str_hash_types[BCH_SB_STR_HASH_TYPE(sb)]
@@ -891,7 +907,7 @@ struct bchfs_handle bcache_fs_open(const char *path)
  * Given a path to a block device, open the filesystem it belongs to; also
  * return the device's idx:
  */
-struct bchfs_handle bchu_fs_open_by_dev(const char *path, unsigned *idx)
+struct bchfs_handle bchu_fs_open_by_dev(const char *path, int *idx)
 {
        char buf[1024], *uuid_str;
 
@@ -935,6 +951,17 @@ struct bchfs_handle bchu_fs_open_by_dev(const char *path, unsigned *idx)
        return bcache_fs_open(uuid_str);
 }
 
+int bchu_dev_path_to_idx(struct bchfs_handle fs, const char *dev_path)
+{
+       int idx;
+       struct bchfs_handle fs2 = bchu_fs_open_by_dev(dev_path, &idx);
+
+       if (memcmp(&fs.uuid, &fs2.uuid, sizeof(fs.uuid)))
+               idx = -1;
+       bcache_fs_close(fs2);
+       return idx;
+}
+
 int bchu_data(struct bchfs_handle fs, struct bch_ioctl_data cmd)
 {
        int progress_fd = xioctl(fs.ioctl_fd, BCH_IOCTL_DATA, &cmd);
index 30add92c518c4ceae328048b5318b96220f0278a..45d2f8740a41479e852411103e38e24a20394b9f 100644 (file)
@@ -12,6 +12,8 @@
 
 /* option parsing */
 
+#define SUPERBLOCK_SIZE_DEFAULT                2048    /* 1 MB */
+
 struct bch_opt_strs {
 union {
        char                    *by_id[bch2_opts_nr];
@@ -30,9 +32,9 @@ void bch2_opts_usage(unsigned);
 struct format_opts {
        char            *label;
        uuid_le         uuid;
-
+       unsigned        version;
+       unsigned        superblock_size;
        unsigned        encoded_extent_max;
-
        bool            encrypted;
        char            *passphrase;
 };
@@ -40,6 +42,8 @@ struct format_opts {
 static inline struct format_opts format_opts_default()
 {
        return (struct format_opts) {
+               .version                = bcachefs_metadata_version_current,
+               .superblock_size        = SUPERBLOCK_SIZE_DEFAULT,
                .encoded_extent_max     = 128,
        };
 }
@@ -90,7 +94,8 @@ struct bchfs_handle {
 
 void bcache_fs_close(struct bchfs_handle);
 struct bchfs_handle bcache_fs_open(const char *);
-struct bchfs_handle bchu_fs_open_by_dev(const char *, unsigned *);
+struct bchfs_handle bchu_fs_open_by_dev(const char *, int *);
+int bchu_dev_path_to_idx(struct bchfs_handle, const char *);
 
 static inline void bchu_disk_add(struct bchfs_handle fs, char *dev)
 {
@@ -214,6 +219,19 @@ static inline void bchu_disk_resize(struct bchfs_handle fs,
        xioctl(fs.ioctl_fd, BCH_IOCTL_DISK_RESIZE, &i);
 }
 
+static inline void bchu_disk_resize_journal(struct bchfs_handle fs,
+                                           unsigned idx,
+                                           u64 nbuckets)
+{
+       struct bch_ioctl_disk_resize i = {
+               .flags  = BCH_BY_INDEX,
+               .dev    = idx,
+               .nbuckets = nbuckets,
+       };
+
+       xioctl(fs.ioctl_fd, BCH_IOCTL_DISK_RESIZE_JOURNAL, &i);
+}
+
 int bchu_data(struct bchfs_handle, struct bch_ioctl_data);
 
 struct dev_name {
index 76c98ddbf62871cce0f3cb6afb55274996c96602..0f2d7437c740344e3191ee83041e3bc62d6c2be1 100644 (file)
@@ -216,6 +216,7 @@ struct posix_acl *bch2_get_acl(struct inode *vinode, int type)
 {
        struct bch_inode_info *inode = to_bch_ei(vinode);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c_xattr xattr;
@@ -226,7 +227,7 @@ retry:
        bch2_trans_begin(&trans);
 
        iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc,
-                       &inode->ei_str_hash, inode->v.i_ino,
+                       &hash, inode->v.i_ino,
                        &X_SEARCH(acl_to_xattr_type(type), "", 0),
                        0);
        if (IS_ERR(iter)) {
@@ -239,12 +240,12 @@ retry:
        }
 
        xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));
-
        acl = bch2_acl_from_disk(xattr_val(xattr.v),
                        le16_to_cpu(xattr.v->x_val_len));
 
        if (!IS_ERR(acl))
                set_cached_acl(&inode->v, type, acl);
+       bch2_trans_iter_put(&trans, iter);
 out:
        bch2_trans_exit(&trans);
        return acl;
@@ -287,6 +288,7 @@ int bch2_set_acl(struct inode *vinode, struct posix_acl *_acl, int type)
        struct btree_trans trans;
        struct btree_iter *inode_iter;
        struct bch_inode_unpacked inode_u;
+       struct bch_hash_info hash_info;
        struct posix_acl *acl;
        umode_t mode;
        int ret;
@@ -308,12 +310,12 @@ retry:
        if (type == ACL_TYPE_ACCESS) {
                ret = posix_acl_update_mode(&inode->v, &mode, &acl);
                if (ret)
-                       goto err;
+                       goto btree_err;
        }
 
-       ret = bch2_set_acl_trans(&trans, &inode_u,
-                                &inode->ei_str_hash,
-                                acl, type);
+       hash_info = bch2_hash_info_init(c, &inode_u);
+
+       ret = bch2_set_acl_trans(&trans, &inode_u, &hash_info, acl, type);
        if (ret)
                goto btree_err;
 
@@ -325,6 +327,8 @@ retry:
                                  &inode->ei_journal_seq,
                                  BTREE_INSERT_NOUNLOCK);
 btree_err:
+       bch2_trans_iter_put(&trans, inode_iter);
+
        if (ret == -EINTR)
                goto retry;
        if (unlikely(ret))
@@ -342,29 +346,31 @@ err:
 }
 
 int bch2_acl_chmod(struct btree_trans *trans,
-                  struct bch_inode_info *inode,
+                  struct bch_inode_unpacked *inode,
                   umode_t mode,
                   struct posix_acl **new_acl)
 {
+       struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode);
        struct btree_iter *iter;
        struct bkey_s_c_xattr xattr;
        struct bkey_i_xattr *new;
        struct posix_acl *acl;
-       int ret = 0;
+       int ret;
 
        iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc,
-                       &inode->ei_str_hash, inode->v.i_ino,
+                       &hash_info, inode->bi_inum,
                        &X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0),
                        BTREE_ITER_INTENT);
-       if (IS_ERR(iter))
-               return PTR_ERR(iter) != -ENOENT ? PTR_ERR(iter) : 0;
+       ret = PTR_ERR_OR_ZERO(iter);
+       if (ret)
+               return ret == -ENOENT ? 0 : ret;
 
        xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));
-
        acl = bch2_acl_from_disk(xattr_val(xattr.v),
                        le16_to_cpu(xattr.v->x_val_len));
-       if (IS_ERR_OR_NULL(acl))
-               return PTR_ERR(acl);
+       ret = PTR_ERR_OR_ZERO(acl);
+       if (ret || !acl)
+               goto err;
 
        ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode);
        if (ret)
@@ -381,6 +387,7 @@ int bch2_acl_chmod(struct btree_trans *trans,
        *new_acl = acl;
        acl = NULL;
 err:
+       bch2_trans_iter_put(trans, iter);
        kfree(acl);
        return ret;
 }
index cb62d502a7ff3b3fa6034ab7e474718d7cd42e2d..ba210c26d5c13b3a2110103831c90396c6fa57c7 100644 (file)
@@ -33,7 +33,7 @@ int bch2_set_acl_trans(struct btree_trans *,
                       const struct bch_hash_info *,
                       struct posix_acl *, int);
 int bch2_set_acl(struct inode *, struct posix_acl *, int);
-int bch2_acl_chmod(struct btree_trans *, struct bch_inode_info *,
+int bch2_acl_chmod(struct btree_trans *, struct bch_inode_unpacked *,
                   umode_t, struct posix_acl **);
 
 #else
@@ -47,7 +47,7 @@ static inline int bch2_set_acl_trans(struct btree_trans *trans,
 }
 
 static inline int bch2_acl_chmod(struct btree_trans *trans,
-                                struct bch_inode_info *inode,
+                                struct bch_inode_unpacked *inode,
                                 umode_t mode,
                                 struct posix_acl **new_acl)
 {
index 97508de9f7214204f8f6297b0d0d6d6be901732e..48971fcf2d5bb393c818732e3b0f1be50c179ebc 100644 (file)
@@ -14,6 +14,7 @@
 #include "ec.h"
 #include "error.h"
 #include "recovery.h"
+#include "varint.h"
 
 #include <linux/kthread.h>
 #include <linux/math64.h>
 #include <linux/sort.h>
 #include <trace/events/bcachefs.h>
 
-static const char * const bch2_alloc_field_names[] = {
-#define x(name, bytes) #name,
-       BCH_ALLOC_FIELDS()
+static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
+#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
+       BCH_ALLOC_FIELDS_V1()
 #undef x
-       NULL
 };
 
-static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int);
-
 /* Ratelimiting/PD controllers */
 
 static void pd_controllers_update(struct work_struct *work)
@@ -54,10 +52,10 @@ static void pd_controllers_update(struct work_struct *work)
                 * reclaimed by copy GC
                 */
                fragmented += max_t(s64, 0, (bucket_to_sector(ca,
-                                       stats.buckets[BCH_DATA_user] +
-                                       stats.buckets[BCH_DATA_cached]) -
-                                 (stats.sectors[BCH_DATA_user] +
-                                  stats.sectors[BCH_DATA_cached])) << 9);
+                                       stats.d[BCH_DATA_user].buckets +
+                                       stats.d[BCH_DATA_cached].buckets) -
+                                 (stats.d[BCH_DATA_user].sectors +
+                                  stats.d[BCH_DATA_cached].sectors)) << 9);
        }
 
        bch2_pd_controller_update(&c->copygc_pd, free, fragmented, -1);
@@ -67,10 +65,10 @@ static void pd_controllers_update(struct work_struct *work)
 
 /* Persistent alloc info: */
 
-static inline u64 get_alloc_field(const struct bch_alloc *a,
-                                 const void **p, unsigned field)
+static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
+                                    const void **p, unsigned field)
 {
-       unsigned bytes = BCH_ALLOC_FIELD_BYTES[field];
+       unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
        u64 v;
 
        if (!(a->fields & (1 << field)))
@@ -97,10 +95,10 @@ static inline u64 get_alloc_field(const struct bch_alloc *a,
        return v;
 }
 
-static inline void put_alloc_field(struct bkey_i_alloc *a, void **p,
-                                  unsigned field, u64 v)
+static inline void alloc_field_v1_put(struct bkey_i_alloc *a, void **p,
+                                     unsigned field, u64 v)
 {
-       unsigned bytes = BCH_ALLOC_FIELD_BYTES[field];
+       unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
 
        if (!v)
                return;
@@ -127,55 +125,127 @@ static inline void put_alloc_field(struct bkey_i_alloc *a, void **p,
        *p += bytes;
 }
 
-struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
+static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out,
+                                struct bkey_s_c k)
 {
-       struct bkey_alloc_unpacked ret = { .gen = 0 };
+       const struct bch_alloc *in = bkey_s_c_to_alloc(k).v;
+       const void *d = in->data;
+       unsigned idx = 0;
 
-       if (k.k->type == KEY_TYPE_alloc) {
-               const struct bch_alloc *a = bkey_s_c_to_alloc(k).v;
-               const void *d = a->data;
-               unsigned idx = 0;
+       out->gen = in->gen;
 
-               ret.gen = a->gen;
+#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++);
+       BCH_ALLOC_FIELDS_V1()
+#undef  x
+}
 
-#define x(_name, _bits)        ret._name = get_alloc_field(a, &d, idx++);
-               BCH_ALLOC_FIELDS()
+static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
+                               struct bkey_s_c k)
+{
+       struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k);
+       const u8 *in = a.v->data;
+       const u8 *end = bkey_val_end(a);
+       unsigned fieldnr = 0;
+       int ret;
+       u64 v;
+
+       out->gen        = a.v->gen;
+       out->oldest_gen = a.v->oldest_gen;
+       out->data_type  = a.v->data_type;
+
+#define x(_name, _bits)                                                        \
+       if (fieldnr < a.v->nr_fields) {                                 \
+               ret = bch2_varint_decode(in, end, &v);                  \
+               if (ret < 0)                                            \
+                       return ret;                                     \
+               in += ret;                                              \
+       } else {                                                        \
+               v = 0;                                                  \
+       }                                                               \
+       out->_name = v;                                                 \
+       if (v != out->_name)                                            \
+               return -1;                                              \
+       fieldnr++;
+
+       BCH_ALLOC_FIELDS_V2()
 #undef  x
-       }
-       return ret;
+       return 0;
 }
 
-void bch2_alloc_pack(struct bkey_i_alloc *dst,
-                    const struct bkey_alloc_unpacked src)
+static void bch2_alloc_pack_v2(struct bkey_alloc_buf *dst,
+                              const struct bkey_alloc_unpacked src)
 {
-       unsigned idx = 0;
-       void *d = dst->v.data;
+       struct bkey_i_alloc_v2 *a = bkey_alloc_v2_init(&dst->k);
+       unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
+       u8 *out = a->v.data;
+       u8 *end = (void *) &dst[1];
+       u8 *last_nonzero_field = out;
        unsigned bytes;
 
-       dst->v.fields   = 0;
-       dst->v.gen      = src.gen;
+       a->k.p          = POS(src.dev, src.bucket);
+       a->v.gen        = src.gen;
+       a->v.oldest_gen = src.oldest_gen;
+       a->v.data_type  = src.data_type;
+
+#define x(_name, _bits)                                                        \
+       nr_fields++;                                                    \
+                                                                       \
+       if (src._name) {                                                \
+               out += bch2_varint_encode(out, src._name);              \
+                                                                       \
+               last_nonzero_field = out;                               \
+               last_nonzero_fieldnr = nr_fields;                       \
+       } else {                                                        \
+               *out++ = 0;                                             \
+       }
 
-#define x(_name, _bits)        put_alloc_field(dst, &d, idx++, src._name);
-       BCH_ALLOC_FIELDS()
+       BCH_ALLOC_FIELDS_V2()
 #undef  x
+       BUG_ON(out > end);
+
+       out = last_nonzero_field;
+       a->v.nr_fields = last_nonzero_fieldnr;
+
+       bytes = (u8 *) out - (u8 *) &a->v;
+       set_bkey_val_bytes(&a->k, bytes);
+       memset_u64s_tail(&a->v, 0, bytes);
+}
+
+struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
+{
+       struct bkey_alloc_unpacked ret = {
+               .dev    = k.k->p.inode,
+               .bucket = k.k->p.offset,
+               .gen    = 0,
+       };
+
+       if (k.k->type == KEY_TYPE_alloc_v2)
+               bch2_alloc_unpack_v2(&ret, k);
+       else if (k.k->type == KEY_TYPE_alloc)
+               bch2_alloc_unpack_v1(&ret, k);
+
+       return ret;
+}
 
-       bytes = (void *) d - (void *) &dst->v;
-       set_bkey_val_bytes(&dst->k, bytes);
-       memset_u64s_tail(&dst->v, 0, bytes);
+void bch2_alloc_pack(struct bch_fs *c,
+                    struct bkey_alloc_buf *dst,
+                    const struct bkey_alloc_unpacked src)
+{
+       bch2_alloc_pack_v2(dst, src);
 }
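A hedged round-trip sketch of the new pack/unpack pair (not from this patch; it assumes bkey_alloc_buf's embedded key is reachable as .k, as used below in bch2_alloc_write_key(), and that bkey_i_to_s_c() is available as elsewhere in bcachefs):

static void demo_alloc_roundtrip(struct bch_fs *c)
{
        struct bkey_alloc_buf buf;
        struct bkey_alloc_unpacked u = {
                .dev    = 0,
                .bucket = 1024,
                .gen    = 3,
        }, v;

        bch2_alloc_pack(c, &buf, u);                    /* varint-encodes an alloc_v2 key */
        v = bch2_alloc_unpack(bkey_i_to_s_c(&buf.k));   /* dev/bucket come back from the key's pos */

        BUG_ON(v.dev != u.dev || v.bucket != u.bucket || v.gen != u.gen);
}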
 
 static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
 {
        unsigned i, bytes = offsetof(struct bch_alloc, data);
 
-       for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_FIELD_BYTES); i++)
+       for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
                if (a->fields & (1 << i))
-                       bytes += BCH_ALLOC_FIELD_BYTES[i];
+                       bytes += BCH_ALLOC_V1_FIELD_BYTES[i];
 
        return DIV_ROUND_UP(bytes, sizeof(u64));
 }
 
-const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
+const char *bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
        struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
 
@@ -190,20 +260,30 @@ const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
        return NULL;
 }
 
-void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
-                       struct bkey_s_c k)
+const char *bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
-       struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
-       const void *d = a.v->data;
-       unsigned i;
+       struct bkey_alloc_unpacked u;
 
-       pr_buf(out, "gen %u", a.v->gen);
+       if (k.k->p.inode >= c->sb.nr_devices ||
+           !c->devs[k.k->p.inode])
+               return "invalid device";
+
+       if (bch2_alloc_unpack_v2(&u, k))
+               return "unpack error";
 
-       for (i = 0; i < BCH_ALLOC_FIELD_NR; i++)
-               if (a.v->fields & (1 << i))
-                       pr_buf(out, " %s %llu",
-                              bch2_alloc_field_names[i],
-                              get_alloc_field(a.v, &d, i));
+       return NULL;
+}
+
+void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
+                          struct bkey_s_c k)
+{
+       struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
+
+       pr_buf(out, "gen %u oldest_gen %u data_type %u",
+              u.gen, u.oldest_gen, u.data_type);
+#define x(_name, ...)  pr_buf(out, #_name " %llu ", (u64) u._name);
+       BCH_ALLOC_FIELDS_V2()
+#undef  x
 }
 
 static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id,
@@ -213,11 +293,13 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id,
        struct bucket *g;
        struct bkey_alloc_unpacked u;
 
-       if (level || k.k->type != KEY_TYPE_alloc)
+       if (level ||
+           (k.k->type != KEY_TYPE_alloc &&
+            k.k->type != KEY_TYPE_alloc_v2))
                return 0;
 
        ca = bch_dev_bkey_exists(c, k.k->p.inode);
-       g = __bucket(ca, k.k->p.offset, 0);
+       g = bucket(ca, k.k->p.offset);
        u = bch2_alloc_unpack(k);
 
        g->_mark.gen            = u.gen;
@@ -234,12 +316,10 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id,
 
 int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
 {
-       struct bch_dev *ca;
-       unsigned i;
-       int ret = 0;
+       int ret;
 
        down_read(&c->gc_lock);
-       ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC,
+       ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_alloc,
                                          NULL, bch2_alloc_read_fn);
        up_read(&c->gc_lock);
 
@@ -248,26 +328,6 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
                return ret;
        }
 
-       percpu_down_write(&c->mark_lock);
-       bch2_dev_usage_from_buckets(c);
-       percpu_up_write(&c->mark_lock);
-
-       mutex_lock(&c->bucket_clock[READ].lock);
-       for_each_member_device(ca, c, i) {
-               down_read(&ca->bucket_lock);
-               bch2_recalc_oldest_io(c, ca, READ);
-               up_read(&ca->bucket_lock);
-       }
-       mutex_unlock(&c->bucket_clock[READ].lock);
-
-       mutex_lock(&c->bucket_clock[WRITE].lock);
-       for_each_member_device(ca, c, i) {
-               down_read(&ca->bucket_lock);
-               bch2_recalc_oldest_io(c, ca, WRITE);
-               up_read(&ca->bucket_lock);
-       }
-       mutex_unlock(&c->bucket_clock[WRITE].lock);
-
        return 0;
 }
 
@@ -278,18 +338,16 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
        struct bch_fs *c = trans->c;
        struct bkey_s_c k;
        struct bch_dev *ca;
-       struct bucket_array *ba;
        struct bucket *g;
        struct bucket_mark m;
        struct bkey_alloc_unpacked old_u, new_u;
-       __BKEY_PADDED(k, 8) alloc_key; /* hack: */
-       struct bkey_i_alloc *a;
+       struct bkey_alloc_buf a;
        int ret;
 retry:
        bch2_trans_begin(trans);
 
        ret = bch2_btree_key_cache_flush(trans,
-                       BTREE_ID_ALLOC, iter->pos);
+                       BTREE_ID_alloc, iter->pos);
        if (ret)
                goto err;
 
@@ -302,193 +360,60 @@ retry:
 
        percpu_down_read(&c->mark_lock);
        ca      = bch_dev_bkey_exists(c, iter->pos.inode);
-       ba      = bucket_array(ca);
-
-       g       = &ba->b[iter->pos.offset];
+       g       = bucket(ca, iter->pos.offset);
        m       = READ_ONCE(g->mark);
-       new_u   = alloc_mem_to_key(g, m);
+       new_u   = alloc_mem_to_key(iter, g, m);
        percpu_up_read(&c->mark_lock);
 
        if (!bkey_alloc_unpacked_cmp(old_u, new_u))
                return 0;
 
-       a = bkey_alloc_init(&alloc_key.k);
-       a->k.p = iter->pos;
-       bch2_alloc_pack(a, new_u);
-
-       bch2_trans_update(trans, iter, &a->k_i,
+       bch2_alloc_pack(c, &a, new_u);
+       bch2_trans_update(trans, iter, &a.k,
                          BTREE_TRIGGER_NORUN);
        ret = bch2_trans_commit(trans, NULL, NULL,
-                               BTREE_INSERT_NOFAIL|
-                               BTREE_INSERT_USE_RESERVE|
-                               flags);
+                               BTREE_INSERT_NOFAIL|flags);
 err:
        if (ret == -EINTR)
                goto retry;
        return ret;
 }
 
-int bch2_dev_alloc_write(struct bch_fs *c, struct bch_dev *ca, unsigned flags)
+int bch2_alloc_write(struct bch_fs *c, unsigned flags)
 {
        struct btree_trans trans;
        struct btree_iter *iter;
-       u64 first_bucket, nbuckets;
+       struct bch_dev *ca;
+       unsigned i;
        int ret = 0;
 
-       percpu_down_read(&c->mark_lock);
-       first_bucket    = bucket_array(ca)->first_bucket;
-       nbuckets        = bucket_array(ca)->nbuckets;
-       percpu_up_read(&c->mark_lock);
-
-       BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
-
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC,
-                                  POS(ca->dev_idx, first_bucket),
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_alloc, POS_MIN,
                                   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 
-       while (iter->pos.offset < nbuckets) {
-               bch2_trans_cond_resched(&trans);
-
-               ret = bch2_alloc_write_key(&trans, iter, flags);
-               if (ret)
-                       break;
-               bch2_btree_iter_next_slot(iter);
-       }
-
-       bch2_trans_exit(&trans);
-
-       return ret;
-}
+       for_each_member_device(ca, c, i) {
+               bch2_btree_iter_set_pos(iter,
+                       POS(ca->dev_idx, ca->mi.first_bucket));
 
-int bch2_alloc_write(struct bch_fs *c, unsigned flags)
-{
-       struct bch_dev *ca;
-       unsigned i;
-       int ret = 0;
+               while (iter->pos.offset < ca->mi.nbuckets) {
+                       bch2_trans_cond_resched(&trans);
 
-       for_each_rw_member(ca, c, i) {
-               bch2_dev_alloc_write(c, ca, flags);
-               if (ret) {
-                       percpu_ref_put(&ca->io_ref);
-                       break;
+                       ret = bch2_alloc_write_key(&trans, iter, flags);
+                       if (ret) {
+                               percpu_ref_put(&ca->io_ref);
+                               goto err;
+                       }
+                       bch2_btree_iter_next_slot(iter);
                }
        }
-
+err:
+       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_exit(&trans);
        return ret;
 }
 
 /* Bucket IO clocks: */
 
-static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw)
-{
-       struct bucket_clock *clock = &c->bucket_clock[rw];
-       struct bucket_array *buckets = bucket_array(ca);
-       struct bucket *g;
-       u16 max_last_io = 0;
-       unsigned i;
-
-       lockdep_assert_held(&c->bucket_clock[rw].lock);
-
-       /* Recalculate max_last_io for this device: */
-       for_each_bucket(g, buckets)
-               max_last_io = max(max_last_io, bucket_last_io(c, g, rw));
-
-       ca->max_last_bucket_io[rw] = max_last_io;
-
-       /* Recalculate global max_last_io: */
-       max_last_io = 0;
-
-       for_each_member_device(ca, c, i)
-               max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]);
-
-       clock->max_last_io = max_last_io;
-}
-
-static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw)
-{
-       struct bucket_clock *clock = &c->bucket_clock[rw];
-       struct bucket_array *buckets;
-       struct bch_dev *ca;
-       struct bucket *g;
-       unsigned i;
-
-       trace_rescale_prios(c);
-
-       for_each_member_device(ca, c, i) {
-               down_read(&ca->bucket_lock);
-               buckets = bucket_array(ca);
-
-               for_each_bucket(g, buckets)
-                       g->io_time[rw] = clock->hand -
-                       bucket_last_io(c, g, rw) / 2;
-
-               bch2_recalc_oldest_io(c, ca, rw);
-
-               up_read(&ca->bucket_lock);
-       }
-}
-
-static inline u64 bucket_clock_freq(u64 capacity)
-{
-       return max(capacity >> 10, 2028ULL);
-}
-
-static void bch2_inc_clock_hand(struct io_timer *timer)
-{
-       struct bucket_clock *clock = container_of(timer,
-                                               struct bucket_clock, rescale);
-       struct bch_fs *c = container_of(clock,
-                                       struct bch_fs, bucket_clock[clock->rw]);
-       struct bch_dev *ca;
-       u64 capacity;
-       unsigned i;
-
-       mutex_lock(&clock->lock);
-
-       /* if clock cannot be advanced more, rescale prio */
-       if (clock->max_last_io >= U16_MAX - 2)
-               bch2_rescale_bucket_io_times(c, clock->rw);
-
-       BUG_ON(clock->max_last_io >= U16_MAX - 2);
-
-       for_each_member_device(ca, c, i)
-               ca->max_last_bucket_io[clock->rw]++;
-       clock->max_last_io++;
-       clock->hand++;
-
-       mutex_unlock(&clock->lock);
-
-       capacity = READ_ONCE(c->capacity);
-
-       if (!capacity)
-               return;
-
-       /*
-        * we only increment when 0.1% of the filesystem capacity has been read
-        * or written too, this determines if it's time
-        *
-        * XXX: we shouldn't really be going off of the capacity of devices in
-        * RW mode (that will be 0 when we're RO, yet we can still service
-        * reads)
-        */
-       timer->expire += bucket_clock_freq(capacity);
-
-       bch2_io_timer_add(&c->io_clock[clock->rw], timer);
-}
-
-static void bch2_bucket_clock_init(struct bch_fs *c, int rw)
-{
-       struct bucket_clock *clock = &c->bucket_clock[rw];
-
-       clock->hand             = 1;
-       clock->rw               = rw;
-       clock->rescale.fn       = bch2_inc_clock_hand;
-       clock->rescale.expire   = bucket_clock_freq(c->capacity);
-       mutex_init(&clock->lock);
-}
-
 int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
                              size_t bucket_nr, int rw)
 {
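The hunk above drops the old bucket_clock machinery (u16 hands plus periodic rescaling); bucket IO times are now plain 64-bit timestamps compared against c->io_clock[rw].now. A minimal sketch of the resulting age calculation, using only fields visible in this diff (bucket_read_age is a hypothetical helper, not part of the commit):

static u64 bucket_read_age(struct bch_fs *c, struct bucket *g)
{
	/* 64-bit IO clock: no u16 wraparound, so no rescale pass is needed */
	u64 now = atomic64_read(&c->io_clock[READ].now);

	return max_t(s64, 0, now - g->io_time[READ]);
}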
@@ -496,40 +421,38 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
        struct bch_dev *ca = bch_dev_bkey_exists(c, dev);
        struct btree_iter *iter;
        struct bucket *g;
-       struct bkey_i_alloc *a;
+       struct bkey_alloc_buf *a;
        struct bkey_alloc_unpacked u;
-       u16 *time;
+       u64 *time, now;
        int ret = 0;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, POS(dev, bucket_nr),
+       iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, POS(dev, bucket_nr),
                                   BTREE_ITER_CACHED|
                                   BTREE_ITER_CACHED_NOFILL|
                                   BTREE_ITER_INTENT);
-       if (IS_ERR(iter))
-               return PTR_ERR(iter);
+       ret = bch2_btree_iter_traverse(iter);
+       if (ret)
+               goto out;
 
-       a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
+       a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
        ret = PTR_ERR_OR_ZERO(a);
        if (ret)
                goto out;
 
        percpu_down_read(&c->mark_lock);
        g = bucket(ca, bucket_nr);
-       u = alloc_mem_to_key(g, READ_ONCE(g->mark));
+       u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark));
        percpu_up_read(&c->mark_lock);
 
-       bkey_alloc_init(&a->k_i);
-       a->k.p = iter->pos;
-
        time = rw == READ ? &u.read_time : &u.write_time;
-       if (*time == c->bucket_clock[rw].hand)
+       now = atomic64_read(&c->io_clock[rw].now);
+       if (*time == now)
                goto out;
 
-       *time = c->bucket_clock[rw].hand;
-
-       bch2_alloc_pack(a, u);
+       *time = now;
 
-       ret   = bch2_trans_update(trans, iter, &a->k_i, 0) ?:
+       bch2_alloc_pack(c, a, u);
+       ret   = bch2_trans_update(trans, iter, &a->k, 0) ?:
                bch2_trans_commit(trans, NULL, NULL, 0);
 out:
        bch2_trans_iter_put(trans, iter);
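The function above also shows the new update pattern for allocation keys: unpack the in-memory bucket state, modify it, repack into a caller-supplied bkey_alloc_buf, then update transactionally. A condensed sketch of that pattern (example_alloc_update is illustrative only; mark_lock handling is omitted for brevity):

static int example_alloc_update(struct btree_trans *trans, struct btree_iter *iter,
				struct bch_fs *c, struct bucket *g)
{
	struct bkey_alloc_buf *a = bch2_trans_kmalloc(trans, sizeof(*a));
	struct bkey_alloc_unpacked u;

	if (IS_ERR(a))
		return PTR_ERR(a);

	/* unpack in-memory state; the iterator position supplies dev/bucket */
	u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark));
	u.read_time = atomic64_read(&c->io_clock[READ].now);

	/* repack and queue the update, then commit as above */
	bch2_alloc_pack(c, a, u);
	return bch2_trans_update(trans, iter, &a->k, 0) ?:
	       bch2_trans_commit(trans, NULL, NULL, 0);
}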
@@ -553,7 +476,8 @@ out:
 static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
 {
        unsigned long gc_count = c->gc_count;
-       u64 available;
+       s64 available;
+       unsigned i;
        int ret = 0;
 
        ca->allocator_state = ALLOCATOR_BLOCKED;
@@ -569,13 +493,19 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
                if (gc_count != c->gc_count)
                        ca->inc_gen_really_needs_gc = 0;
 
-               available = max_t(s64, 0, dev_buckets_available(ca) -
-                                 ca->inc_gen_really_needs_gc);
+               available  = dev_buckets_available(ca);
+               available -= ca->inc_gen_really_needs_gc;
+
+               spin_lock(&c->freelist_lock);
+               for (i = 0; i < RESERVE_NR; i++)
+                       available -= fifo_used(&ca->free[i]);
+               spin_unlock(&c->freelist_lock);
+
+               available = max(available, 0LL);
 
                if (available > fifo_free(&ca->free_inc) ||
                    (available &&
-                    (!fifo_full(&ca->free[RESERVE_BTREE]) ||
-                     !fifo_full(&ca->free[RESERVE_MOVINGGC]))))
+                    !fifo_full(&ca->free[RESERVE_MOVINGGC])))
                        break;
 
                up_read(&c->gc_lock);
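The recalculated "available" above now also subtracts buckets already parked on the reserve freelists, so the allocator only keeps waiting while nothing genuinely new can be produced. A worked example with invented numbers:

	/*
	 * Illustrative only: if dev_buckets_available() reports 100 buckets,
	 * 40 are already sitting on the free[] reserves and 10 are counted in
	 * inc_gen_really_needs_gc, then only 100 - 40 - 10 = 50 buckets can
	 * still be turned into newly invalidated buckets.
	 */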
@@ -591,20 +521,22 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
        return ret;
 }
 
-static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
-                                      size_t bucket,
-                                      struct bucket_mark mark)
+static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
+                                      struct bucket_mark m)
 {
        u8 gc_gen;
 
-       if (!is_available_bucket(mark))
+       if (!is_available_bucket(m))
+               return false;
+
+       if (m.owned_by_allocator)
                return false;
 
        if (ca->buckets_nouse &&
-           test_bit(bucket, ca->buckets_nouse))
+           test_bit(b, ca->buckets_nouse))
                return false;
 
-       gc_gen = bucket_gc_gen(ca, bucket);
+       gc_gen = bucket_gc_gen(bucket(ca, b));
 
        if (gc_gen >= BUCKET_GC_GEN_MAX / 2)
                ca->inc_gen_needs_gc++;
@@ -618,43 +550,33 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
 /*
  * Determines what order we're going to reuse buckets, smallest bucket_key()
  * first.
- *
- *
- * - We take into account the read prio of the bucket, which gives us an
- *   indication of how hot the data is -- we scale the prio so that the prio
- *   farthest from the clock is worth 1/8th of the closest.
- *
- * - The number of sectors of cached data in the bucket, which gives us an
- *   indication of the cost in cache misses this eviction will cause.
- *
- * - If hotness * sectors used compares equal, we pick the bucket with the
- *   smallest bucket_gc_gen() - since incrementing the same bucket's generation
- *   number repeatedly forces us to run mark and sweep gc to avoid generation
- *   number wraparound.
  */
 
-static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca,
-                                    size_t b, struct bucket_mark m)
+static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
+                               u64 now, u64 last_seq_ondisk)
 {
-       unsigned last_io = bucket_last_io(c, bucket(ca, b), READ);
-       unsigned max_last_io = ca->max_last_bucket_io[READ];
-
-       /*
-        * Time since last read, scaled to [0, 8) where larger value indicates
-        * more recently read data:
-        */
-       unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io;
+       unsigned used = bucket_sectors_used(m);
 
-       /* How much we want to keep the data in this bucket: */
-       unsigned long data_wantness =
-               (hotness + 1) * bucket_sectors_used(m);
-
-       unsigned long needs_journal_commit =
-               bucket_needs_journal_commit(m, c->journal.last_seq_ondisk);
+       if (used) {
+               /*
+                * Prefer to keep buckets that have been read more recently, and
+                * buckets that have more data in them:
+                */
+               u64 last_read = max_t(s64, 0, now - g->io_time[READ]);
+               u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used));
 
-       return  (data_wantness << 9) |
-               (needs_journal_commit << 8) |
-               (bucket_gc_gen(ca, b) / 16);
+               return -last_read_scaled;
+       } else {
+               /*
+                * Prefer to use buckets with smaller gc_gen so that we don't
+                * have to walk the btree and recalculate oldest_gen - but shift
+                * off the low bits so that buckets will still have equal sort
+                * keys when there's only a small difference, so that we can
+                * keep sequential buckets together:
+                */
+               return  (bucket_needs_journal_commit(m, last_seq_ondisk) << 4)|
+                       (bucket_gc_gen(g) >> 4);
+       }
 }
 
 static inline int bucket_alloc_cmp(alloc_heap *h,
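A short gloss on the new sort key above (qualitative only; the exact scaling comes from the clamp in bucket_sort_key()):

	/*
	 * - Buckets still holding cached data sort by a size-scaled read age:
	 *   per the comment above, recently read and fuller buckets are kept,
	 *   so colder, emptier ones are reclaimed first.
	 * - Empty buckets needing a journal commit sort after those that
	 *   don't; ties then go by gc_gen with the low four bits dropped, so
	 *   neighbouring buckets compare equal and stay together.
	 */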
@@ -677,16 +599,15 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
 {
        struct bucket_array *buckets;
        struct alloc_heap_entry e = { 0 };
+       u64 now, last_seq_ondisk;
        size_t b, i, nr = 0;
 
-       ca->alloc_heap.used = 0;
-
-       mutex_lock(&c->bucket_clock[READ].lock);
        down_read(&ca->bucket_lock);
 
        buckets = bucket_array(ca);
-
-       bch2_recalc_oldest_io(c, ca, READ);
+       ca->alloc_heap.used = 0;
+       now = atomic64_read(&c->io_clock[READ].now);
+       last_seq_ondisk = c->journal.last_seq_ondisk;
 
        /*
         * Find buckets with lowest read priority, by building a maxheap sorted
@@ -694,8 +615,9 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
         * all buckets have been visited.
         */
        for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
-               struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
-               unsigned long key = bucket_sort_key(c, ca, b, m);
+               struct bucket *g = &buckets->b[b];
+               struct bucket_mark m = READ_ONCE(g->mark);
+               unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk);
 
                if (!bch2_can_invalidate_bucket(ca, b, m))
                        continue;
@@ -730,7 +652,6 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
        }
 
        up_read(&ca->bucket_lock);
-       mutex_unlock(&c->bucket_clock[READ].lock);
 }
 
 static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
@@ -810,13 +731,13 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
        ca->inc_gen_needs_gc                    = 0;
 
        switch (ca->mi.replacement) {
-       case CACHE_REPLACEMENT_LRU:
+       case BCH_CACHE_REPLACEMENT_lru:
                find_reclaimable_buckets_lru(c, ca);
                break;
-       case CACHE_REPLACEMENT_FIFO:
+       case BCH_CACHE_REPLACEMENT_fifo:
                find_reclaimable_buckets_fifo(c, ca);
                break;
-       case CACHE_REPLACEMENT_RANDOM:
+       case BCH_CACHE_REPLACEMENT_random:
                find_reclaimable_buckets_random(c, ca);
                break;
        }
@@ -875,14 +796,8 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
                                       struct btree_iter *iter,
                                       u64 *journal_seq, unsigned flags)
 {
-#if 0
-       __BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key;
-#else
-       /* hack: */
-       __BKEY_PADDED(k, 8) alloc_key;
-#endif
        struct bch_fs *c = trans->c;
-       struct bkey_i_alloc *a;
+       struct bkey_alloc_buf a;
        struct bkey_alloc_unpacked u;
        struct bucket *g;
        struct bucket_mark m;
@@ -896,34 +811,33 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
 
        /* first, put on free_inc and mark as owned by allocator: */
        percpu_down_read(&c->mark_lock);
-       spin_lock(&c->freelist_lock);
-
-       verify_not_on_freelist(c, ca, b);
-
-       BUG_ON(!fifo_push(&ca->free_inc, b));
-
        g = bucket(ca, b);
        m = READ_ONCE(g->mark);
 
-       invalidating_cached_data = m.cached_sectors != 0;
+       BUG_ON(m.dirty_sectors);
+
+       bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0);
+
+       spin_lock(&c->freelist_lock);
+       verify_not_on_freelist(c, ca, b);
+       BUG_ON(!fifo_push(&ca->free_inc, b));
+       spin_unlock(&c->freelist_lock);
 
        /*
         * If we're not invalidating cached data, we only increment the bucket
         * gen in memory here, the incremented gen will be updated in the btree
         * by bch2_trans_mark_pointer():
         */
+       if (!m.cached_sectors &&
+           !bucket_needs_journal_commit(m, c->journal.last_seq_ondisk)) {
+               BUG_ON(m.data_type);
+               bucket_cmpxchg(g, m, m.gen++);
+               percpu_up_read(&c->mark_lock);
+               goto out;
+       }
 
-       if (!invalidating_cached_data)
-               bch2_invalidate_bucket(c, ca, b, &m);
-       else
-               bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0);
-
-       spin_unlock(&c->freelist_lock);
        percpu_up_read(&c->mark_lock);
 
-       if (!invalidating_cached_data)
-               goto out;
-
        /*
         * If the read-only path is trying to shut down, we can't be generating
         * new btree updates:
@@ -933,8 +847,6 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
                goto out;
        }
 
-       BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
-
        bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b));
 retry:
        ret = bch2_btree_iter_traverse(iter);
@@ -944,7 +856,7 @@ retry:
        percpu_down_read(&c->mark_lock);
        g = bucket(ca, iter->pos.offset);
        m = READ_ONCE(g->mark);
-       u = alloc_mem_to_key(g, m);
+       u = alloc_mem_to_key(iter, g, m);
 
        percpu_up_read(&c->mark_lock);
 
@@ -954,14 +866,11 @@ retry:
        u.data_type     = 0;
        u.dirty_sectors = 0;
        u.cached_sectors = 0;
-       u.read_time     = c->bucket_clock[READ].hand;
-       u.write_time    = c->bucket_clock[WRITE].hand;
-
-       a = bkey_alloc_init(&alloc_key.k);
-       a->k.p = iter->pos;
-       bch2_alloc_pack(a, u);
+       u.read_time     = atomic64_read(&c->io_clock[READ].now);
+       u.write_time    = atomic64_read(&c->io_clock[WRITE].now);
 
-       bch2_trans_update(trans, iter, &a->k_i,
+       bch2_alloc_pack(c, &a, u);
+       bch2_trans_update(trans, iter, &a.k,
                          BTREE_TRIGGER_BUCKET_INVALIDATE);
 
        /*
@@ -976,8 +885,7 @@ retry:
                                BTREE_INSERT_NOUNLOCK|
                                BTREE_INSERT_NOCHECK_RW|
                                BTREE_INSERT_NOFAIL|
-                               BTREE_INSERT_USE_RESERVE|
-                               BTREE_INSERT_USE_ALLOC_RESERVE|
+                               BTREE_INSERT_JOURNAL_RESERVED|
                                flags);
        if (ret == -EINTR)
                goto retry;
@@ -1029,8 +937,7 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
        int ret = 0;
 
        bch2_trans_init(&trans, c, 0, 0);
-
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC,
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_alloc,
                                   POS(ca->dev_idx, 0),
                                   BTREE_ITER_CACHED|
                                   BTREE_ITER_CACHED_NOFILL|
@@ -1045,6 +952,7 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
                                (!fifo_empty(&ca->free_inc)
                                 ? BTREE_INSERT_NOWAIT : 0));
 
+       bch2_trans_iter_put(&trans, iter);
        bch2_trans_exit(&trans);
 
        /* If we used NOWAIT, don't return the error: */
@@ -1138,6 +1046,12 @@ static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca)
        return 0;
 }
 
+static inline bool allocator_thread_running(struct bch_dev *ca)
+{
+       return ca->mi.state == BCH_MEMBER_STATE_rw &&
+               test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags);
+}
+
 /**
  * bch_allocator_thread - move buckets from free_inc to reserves
  *
@@ -1154,9 +1068,16 @@ static int bch2_allocator_thread(void *arg)
        int ret;
 
        set_freezable();
-       ca->allocator_state = ALLOCATOR_RUNNING;
 
        while (1) {
+               if (!allocator_thread_running(ca)) {
+                       ca->allocator_state = ALLOCATOR_STOPPED;
+                       if (kthread_wait_freezable(allocator_thread_running(ca)))
+                               break;
+               }
+
+               ca->allocator_state = ALLOCATOR_RUNNING;
+
                cond_resched();
                if (kthread_should_stop())
                        break;
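Together with allocator_thread_running() added earlier, this gives the allocator thread an explicit paused state while its device is not read-write. A condensed, hedged sketch of the gating (not a verbatim copy of the commit):

	while (!kthread_should_stop()) {
		if (!allocator_thread_running(ca)) {
			ca->allocator_state = ALLOCATOR_STOPPED;
			/* sleep (freezably) until the device goes rw again */
			if (kthread_wait_freezable(allocator_thread_running(ca)))
				break;
		}
		ca->allocator_state = ALLOCATOR_RUNNING;

		/* ... discard_invalidated_buckets(), find_reclaimable_buckets() ... */
	}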
@@ -1456,9 +1377,12 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
                return 0;
 
        p = kthread_create(bch2_allocator_thread, ca,
-                          "bch_alloc[%s]", ca->name);
-       if (IS_ERR(p))
+                          "bch-alloc/%s", ca->name);
+       if (IS_ERR(p)) {
+               bch_err(ca->fs, "error creating allocator thread: %li",
+                       PTR_ERR(p));
                return PTR_ERR(p);
+       }
 
        get_task_struct(p);
        rcu_assign_pointer(ca->alloc_thread, p);
@@ -1469,8 +1393,6 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
 void bch2_fs_allocator_background_init(struct bch_fs *c)
 {
        spin_lock_init(&c->freelist_lock);
-       bch2_bucket_clock_init(c, READ);
-       bch2_bucket_clock_init(c, WRITE);
 
        c->pd_controllers_update_seconds = 5;
        INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
index cbaff56f7473f4f77edb5da718312ee7f29c598b..6fededcd9f8686276beadd696b713c734eeffb5b 100644
@@ -7,12 +7,33 @@
 #include "debug.h"
 
 struct bkey_alloc_unpacked {
+       u64             bucket;
+       u8              dev;
        u8              gen;
+       u8              oldest_gen;
+       u8              data_type;
 #define x(_name, _bits)        u##_bits _name;
-       BCH_ALLOC_FIELDS()
+       BCH_ALLOC_FIELDS_V2()
 #undef  x
 };
 
+struct bkey_alloc_buf {
+       struct bkey_i   k;
+
+       union {
+       struct {
+#define x(_name,  _bits)               + _bits / 8
+       u8              _pad[8 + BCH_ALLOC_FIELDS_V1()];
+#undef  x
+       } _v1;
+       struct {
+#define x(_name,  _bits)               + 8 + _bits / 8
+       u8              _pad[8 + BCH_ALLOC_FIELDS_V2()];
+#undef  x
+       } _v2;
+       };
+} __attribute__((packed, aligned(8)));
+
 /* How out of date a pointer gen is allowed to be: */
 #define BUCKET_GC_GEN_MAX      96U
 
@@ -20,23 +41,28 @@ struct bkey_alloc_unpacked {
 static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l,
                                           struct bkey_alloc_unpacked r)
 {
-       return l.gen != r.gen
-#define x(_name, _bits)        || l._name != r._name
-       BCH_ALLOC_FIELDS()
+       return  l.gen != r.gen                  ||
+               l.oldest_gen != r.oldest_gen    ||
+               l.data_type != r.data_type
+#define x(_name, ...)  || l._name != r._name
+       BCH_ALLOC_FIELDS_V2()
 #undef  x
        ;
 }
 
 struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c);
-void bch2_alloc_pack(struct bkey_i_alloc *,
+void bch2_alloc_pack(struct bch_fs *, struct bkey_alloc_buf *,
                     const struct bkey_alloc_unpacked);
 
 int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
 
 static inline struct bkey_alloc_unpacked
-alloc_mem_to_key(struct bucket *g, struct bucket_mark m)
+alloc_mem_to_key(struct btree_iter *iter,
+                struct bucket *g, struct bucket_mark m)
 {
        return (struct bkey_alloc_unpacked) {
+               .dev            = iter->pos.inode,
+               .bucket         = iter->pos.offset,
                .gen            = m.gen,
                .oldest_gen     = g->oldest_gen,
                .data_type      = m.data_type,
@@ -49,11 +75,17 @@ alloc_mem_to_key(struct bucket *g, struct bucket_mark m)
 
 #define ALLOC_SCAN_BATCH(ca)           max_t(size_t, 1, (ca)->mi.nbuckets >> 9)
 
-const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c);
+const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c);
+const char *bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c);
 void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
 #define bch2_bkey_ops_alloc (struct bkey_ops) {                \
-       .key_invalid    = bch2_alloc_invalid,           \
+       .key_invalid    = bch2_alloc_v1_invalid,        \
+       .val_to_text    = bch2_alloc_to_text,           \
+}
+
+#define bch2_bkey_ops_alloc_v2 (struct bkey_ops) {     \
+       .key_invalid    = bch2_alloc_v2_invalid,        \
        .val_to_text    = bch2_alloc_to_text,           \
 }
 
@@ -76,7 +108,7 @@ static inline void bch2_wake_allocator(struct bch_dev *ca)
 static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca,
                                          size_t bucket)
 {
-       if (expensive_debug_checks(c)) {
+       if (bch2_expensive_debug_checks) {
                size_t iter;
                long i;
                unsigned j;
@@ -98,7 +130,6 @@ void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *);
 void bch2_dev_allocator_stop(struct bch_dev *);
 int bch2_dev_allocator_start(struct bch_dev *);
 
-int bch2_dev_alloc_write(struct bch_fs *, struct bch_dev *, unsigned);
 int bch2_alloc_write(struct bch_fs *, unsigned);
 void bch2_fs_allocator_background_init(struct bch_fs *);
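As a reading aid for the x-macro above: with BCH_ALLOC_FIELDS_V2() from the on-disk format header, struct bkey_alloc_unpacked expands to roughly the following (hedged hand expansion, not compiler output):

struct bkey_alloc_unpacked {
	u64	bucket;
	u8	dev;
	u8	gen;
	u8	oldest_gen;
	u8	data_type;
	/* generated from BCH_ALLOC_FIELDS_V2(): */
	u64	read_time;
	u64	write_time;
	u16	dirty_sectors;
	u16	cached_sectors;
	u32	stripe;
	u8	stripe_redundancy;
};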
 
index 7a92e3d532548a2219985381e38f2c4081794b1b..8f0b94f591bedd7f8645f32faf696ace430c092f 100644
@@ -192,8 +192,9 @@ long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
        rcu_read_lock();
        buckets = bucket_array(ca);
 
-       for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++)
-               if (is_available_bucket(buckets->b[b].mark))
+       for (b = buckets->first_bucket; b < buckets->nbuckets; b++)
+               if (is_available_bucket(buckets->b[b].mark) &&
+                   !buckets->b[b].mark.owned_by_allocator)
                        goto success;
        b = -1;
 success:
@@ -204,9 +205,10 @@ success:
 static inline unsigned open_buckets_reserved(enum alloc_reserve reserve)
 {
        switch (reserve) {
-       case RESERVE_ALLOC:
-               return 0;
        case RESERVE_BTREE:
+       case RESERVE_BTREE_MOVINGGC:
+               return 0;
+       case RESERVE_MOVINGGC:
                return OPEN_BUCKETS_COUNT / 4;
        default:
                return OPEN_BUCKETS_COUNT / 2;
@@ -223,9 +225,8 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
                                      bool may_alloc_partial,
                                      struct closure *cl)
 {
-       struct bucket_array *buckets;
        struct open_bucket *ob;
-       long bucket = 0;
+       long b = 0;
 
        spin_lock(&c->freelist_lock);
 
@@ -259,22 +260,13 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
                return ERR_PTR(-OPEN_BUCKETS_EMPTY);
        }
 
-       if (likely(fifo_pop(&ca->free[RESERVE_NONE], bucket)))
+       if (likely(fifo_pop(&ca->free[RESERVE_NONE], b)))
                goto out;
 
        switch (reserve) {
-       case RESERVE_ALLOC:
-               if (fifo_pop(&ca->free[RESERVE_BTREE], bucket))
-                       goto out;
-               break;
-       case RESERVE_BTREE:
-               if (fifo_used(&ca->free[RESERVE_BTREE]) * 2 >=
-                   ca->free[RESERVE_BTREE].size &&
-                   fifo_pop(&ca->free[RESERVE_BTREE], bucket))
-                       goto out;
-               break;
+       case RESERVE_BTREE_MOVINGGC:
        case RESERVE_MOVINGGC:
-               if (fifo_pop(&ca->free[RESERVE_MOVINGGC], bucket))
+               if (fifo_pop(&ca->free[RESERVE_MOVINGGC], b))
                        goto out;
                break;
        default:
@@ -292,20 +284,19 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
        trace_bucket_alloc_fail(ca, reserve);
        return ERR_PTR(-FREELIST_EMPTY);
 out:
-       verify_not_on_freelist(c, ca, bucket);
+       verify_not_on_freelist(c, ca, b);
 
        ob = bch2_open_bucket_alloc(c);
 
        spin_lock(&ob->lock);
-       buckets = bucket_array(ca);
 
        ob->valid       = true;
        ob->sectors_free = ca->mi.bucket_size;
        ob->alloc_reserve = reserve;
        ob->ptr         = (struct bch_extent_ptr) {
                .type   = 1 << BCH_EXTENT_ENTRY_ptr,
-               .gen    = buckets->b[bucket].mark.gen,
-               .offset = bucket_to_sector(ca, bucket),
+               .gen    = bucket(ca, b)->mark.gen,
+               .offset = bucket_to_sector(ca, b),
                .dev    = ca->dev_idx,
        };
 
@@ -458,16 +449,18 @@ bch2_bucket_alloc_set(struct bch_fs *c,
  * it's to a device we don't want:
  */
 
-static void bucket_alloc_from_stripe(struct bch_fs *c,
-                                    struct open_buckets *ptrs,
-                                    struct write_point *wp,
-                                    struct bch_devs_mask *devs_may_alloc,
-                                    u16 target,
-                                    unsigned erasure_code,
-                                    unsigned nr_replicas,
-                                    unsigned *nr_effective,
-                                    bool *have_cache,
-                                    unsigned flags)
+static enum bucket_alloc_ret
+bucket_alloc_from_stripe(struct bch_fs *c,
+                        struct open_buckets *ptrs,
+                        struct write_point *wp,
+                        struct bch_devs_mask *devs_may_alloc,
+                        u16 target,
+                        unsigned erasure_code,
+                        unsigned nr_replicas,
+                        unsigned *nr_effective,
+                        bool *have_cache,
+                        unsigned flags,
+                        struct closure *cl)
 {
        struct dev_alloc_list devs_sorted;
        struct ec_stripe_head *h;
@@ -476,31 +469,39 @@ static void bucket_alloc_from_stripe(struct bch_fs *c,
        unsigned i, ec_idx;
 
        if (!erasure_code)
-               return;
+               return 0;
 
        if (nr_replicas < 2)
-               return;
+               return 0;
 
        if (ec_open_bucket(c, ptrs))
-               return;
+               return 0;
 
-       h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1);
+       h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1,
+                                   wp == &c->copygc_write_point,
+                                   cl);
+       if (IS_ERR(h))
+               return -PTR_ERR(h);
        if (!h)
-               return;
+               return 0;
 
        devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
 
        for (i = 0; i < devs_sorted.nr; i++)
-               open_bucket_for_each(c, &h->s->blocks, ob, ec_idx)
+               for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
+                       if (!h->s->blocks[ec_idx])
+                               continue;
+
+                       ob = c->open_buckets + h->s->blocks[ec_idx];
                        if (ob->ptr.dev == devs_sorted.devs[i] &&
-                           !test_and_set_bit(h->s->data_block_idx[ec_idx],
-                                             h->s->blocks_allocated))
+                           !test_and_set_bit(ec_idx, h->s->blocks_allocated))
                                goto got_bucket;
+               }
        goto out_put_head;
 got_bucket:
        ca = bch_dev_bkey_exists(c, ob->ptr.dev);
 
-       ob->ec_idx      = h->s->data_block_idx[ec_idx];
+       ob->ec_idx      = ec_idx;
        ob->ec          = h->s;
 
        add_new_bucket(c, ptrs, devs_may_alloc,
@@ -508,6 +509,7 @@ got_bucket:
        atomic_inc(&h->s->pin);
 out_put_head:
        bch2_ec_stripe_head_put(c, h);
+       return 0;
 }
 
 /* Sector allocator */
@@ -585,10 +587,13 @@ open_bucket_add_buckets(struct bch_fs *c,
                }
 
                if (!ec_open_bucket(c, ptrs)) {
-                       bucket_alloc_from_stripe(c, ptrs, wp, &devs,
+                       ret = bucket_alloc_from_stripe(c, ptrs, wp, &devs,
                                                 target, erasure_code,
                                                 nr_replicas, nr_effective,
-                                                have_cache, flags);
+                                                have_cache, flags, _cl);
+                       if (ret == FREELIST_EMPTY ||
+                           ret == OPEN_BUCKETS_EMPTY)
+                               return ret;
                        if (*nr_effective >= nr_replicas)
                                return 0;
                }
@@ -634,10 +639,13 @@ void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca,
 
                if (!drop && ob->ec) {
                        mutex_lock(&ob->ec->lock);
-                       open_bucket_for_each(c, &ob->ec->blocks, ob2, j)
-                               drop |= ob2->ptr.dev == ca->dev_idx;
-                       open_bucket_for_each(c, &ob->ec->parity, ob2, j)
+                       for (j = 0; j < ob->ec->new_stripe.key.v.nr_blocks; j++) {
+                               if (!ob->ec->blocks[j])
+                                       continue;
+
+                               ob2 = c->open_buckets + ob->ec->blocks[j];
                                drop |= ob2->ptr.dev == ca->dev_idx;
+                       }
                        mutex_unlock(&ob->ec->lock);
                }
 
index 20705460bb0aa10ef24dc1910716ea78493e7074..be164d6108bbcdbb3f2ca48bf8e6dc0618d4b6a8 100644
 
 struct ec_bucket_buf;
 
-/* There's two of these clocks, one for reads and one for writes: */
-struct bucket_clock {
-       /*
-        * "now" in (read/write) IO time - incremented whenever we do X amount
-        * of reads or writes.
-        *
-        * Goes with the bucket read/write prios: when we read or write to a
-        * bucket we reset the bucket's prio to the current hand; thus hand -
-        * prio = time since bucket was last read/written.
-        *
-        * The units are some amount (bytes/sectors) of data read/written, and
-        * the units can change on the fly if we need to rescale to fit
-        * everything in a u16 - your only guarantee is that the units are
-        * consistent.
-        */
-       u16                     hand;
-       u16                     max_last_io;
-
-       int                     rw;
-
-       struct io_timer         rescale;
-       struct mutex            lock;
-};
-
-/* There is one reserve for each type of btree, one for prios and gens
- * and one for moving GC */
 enum alloc_reserve {
-       RESERVE_ALLOC           = -1,
-       RESERVE_BTREE           = 0,
-       RESERVE_MOVINGGC        = 1,
-       RESERVE_NONE            = 2,
-       RESERVE_NR              = 3,
+       RESERVE_BTREE_MOVINGGC  = -2,
+       RESERVE_BTREE           = -1,
+       RESERVE_MOVINGGC        = 0,
+       RESERVE_NONE            = 1,
+       RESERVE_NR              = 2,
 };
 
 typedef FIFO(long)     alloc_fifo;
@@ -89,7 +63,6 @@ struct write_point {
        u64                     last_used;
        unsigned long           write_point;
        enum bch_data_type      type;
-       bool                    is_ec;
 
        /* calculated based on how many pointers we're actually going to use: */
        unsigned                sectors_free;
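A note on the renumbered reserves above: RESERVE_ALLOC is gone and the more privileged reserves now sit below zero, so "allowed to dip deeper" simply means "numerically smaller". A hedged illustration:

	/*
	 * RESERVE_BTREE_MOVINGGC (-2) < RESERVE_BTREE (-1) <
	 * RESERVE_MOVINGGC (0)        < RESERVE_NONE (1)
	 *
	 * so a check of the form  reserve <= RESERVE_BTREE  would cover both
	 * btree reserves at once.
	 */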
index 29f411635f29968e9fae422d9fbc5ad67318b265..549cded6276469bc79ea6debd66fa272843c9002 100644
 #include <linux/semaphore.h>
 #include <linux/seqlock.h>
 #include <linux/shrinker.h>
+#include <linux/srcu.h>
 #include <linux/types.h>
 #include <linux/workqueue.h>
 #include <linux/zstd.h>
         dynamic_fault("bcachefs:meta:write:" name)
 
 #ifdef __KERNEL__
-#define bch2_fmt(_c, fmt)      "bcachefs (%s): " fmt "\n", ((_c)->name)
+#define bch2_fmt(_c, fmt)              "bcachefs (%s): " fmt "\n", ((_c)->name)
+#define bch2_fmt_inum(_c, _inum, fmt)  "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum)
 #else
-#define bch2_fmt(_c, fmt)      fmt "\n"
+#define bch2_fmt(_c, fmt)              fmt "\n"
+#define bch2_fmt_inum(_c, _inum, fmt)  "inum %llu: " fmt "\n", (_inum)
 #endif
 
 #define bch_info(c, fmt, ...) \
        printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
 #define bch_err(c, fmt, ...) \
        printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+
 #define bch_err_ratelimited(c, fmt, ...) \
        printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_err_inum_ratelimited(c, _inum, fmt, ...) \
+       printk_ratelimited(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
 
 #define bch_verbose(c, fmt, ...)                                       \
 do {                                                                   \
@@ -265,6 +271,8 @@ do {                                                                        \
        BCH_DEBUG_PARAM(debug_check_bkeys,                              \
                "Run bkey_debugcheck (primarily checking GC/allocation "\
                "information) when iterating over keys")                \
+       BCH_DEBUG_PARAM(debug_check_btree_accounting,                   \
+               "Verify btree accounting for keys within a node")       \
        BCH_DEBUG_PARAM(verify_btree_ondisk,                            \
                "Reread btree nodes at various points to verify the "   \
                "mergesort in the read path against modifications "     \
@@ -295,6 +303,16 @@ do {                                                                       \
 #define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS()
 #endif
 
+#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name;
+BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+
+#ifndef CONFIG_BCACHEFS_DEBUG
+#define BCH_DEBUG_PARAM(name, description) static const bool bch2_##name;
+BCH_DEBUG_PARAMS_DEBUG()
+#undef BCH_DEBUG_PARAM
+#endif
+
 #define BCH_TIME_STATS()                       \
        x(btree_node_mem_alloc)                 \
        x(btree_node_split)                     \
@@ -351,14 +369,14 @@ enum gc_phase {
        GC_PHASE_START,
        GC_PHASE_SB,
 
-       GC_PHASE_BTREE_EC,
-       GC_PHASE_BTREE_EXTENTS,
-       GC_PHASE_BTREE_INODES,
-       GC_PHASE_BTREE_DIRENTS,
-       GC_PHASE_BTREE_XATTRS,
-       GC_PHASE_BTREE_ALLOC,
-       GC_PHASE_BTREE_QUOTAS,
-       GC_PHASE_BTREE_REFLINK,
+       GC_PHASE_BTREE_stripes,
+       GC_PHASE_BTREE_extents,
+       GC_PHASE_BTREE_inodes,
+       GC_PHASE_BTREE_dirents,
+       GC_PHASE_BTREE_xattrs,
+       GC_PHASE_BTREE_alloc,
+       GC_PHASE_BTREE_quotas,
+       GC_PHASE_BTREE_reflink,
 
        GC_PHASE_PENDING_DELETE,
        GC_PHASE_ALLOC,
@@ -411,7 +429,9 @@ struct bch_dev {
        unsigned long           *buckets_nouse;
        struct rw_semaphore     bucket_lock;
 
-       struct bch_dev_usage __percpu *usage[2];
+       struct bch_dev_usage            *usage_base;
+       struct bch_dev_usage __percpu   *usage[JOURNAL_BUF_NR];
+       struct bch_dev_usage __percpu   *usage_gc;
 
        /* Allocator: */
        struct task_struct __rcu *alloc_thread;
@@ -433,9 +453,6 @@ struct bch_dev {
 
        size_t                  fifo_last_bucket;
 
-       /* last calculated minimum prio */
-       u16                     max_last_bucket_io[2];
-
        size_t                  inc_gen_needs_gc;
        size_t                  inc_gen_really_needs_gc;
 
@@ -455,6 +472,7 @@ struct bch_dev {
        atomic64_t              rebalance_work;
 
        struct journal_device   journal;
+       u64                     prev_journal_sector;
 
        struct work_struct      io_error_work;
 
@@ -491,8 +509,9 @@ enum {
        BCH_FS_ERRORS_FIXED,
 
        /* misc: */
-       BCH_FS_FIXED_GENS,
-       BCH_FS_ALLOC_WRITTEN,
+       BCH_FS_NEED_ANOTHER_GC,
+       BCH_FS_DELETED_NODES,
+       BCH_FS_NEED_ALLOC_WRITE,
        BCH_FS_REBUILD_REPLICAS,
        BCH_FS_HOLD_BTREE_WRITES,
 };
@@ -521,14 +540,20 @@ struct journal_keys {
        struct journal_key {
                enum btree_id   btree_id:8;
                unsigned        level:8;
+               bool            allocated;
                struct bkey_i   *k;
                u32             journal_seq;
                u32             journal_offset;
        }                       *d;
        size_t                  nr;
+       size_t                  size;
        u64                     journal_seq_base;
 };
 
+struct btree_iter_buf {
+       struct btree_iter       *iter;
+};
+
 struct bch_fs {
        struct closure          cl;
 
@@ -557,7 +582,10 @@ struct bch_fs {
        struct bch_replicas_cpu replicas_gc;
        struct mutex            replicas_gc_lock;
 
+       struct journal_entry_res btree_root_journal_res;
        struct journal_entry_res replicas_journal_res;
+       struct journal_entry_res clock_journal_res;
+       struct journal_entry_res dev_usage_journal_res;
 
        struct bch_disk_groups_cpu __rcu *disk_groups;
 
@@ -569,6 +597,7 @@ struct bch_fs {
                uuid_le         user_uuid;
 
                u16             version;
+               u16             version_min;
                u16             encoded_extent_max;
 
                u8              nr_devices;
@@ -624,13 +653,15 @@ struct bch_fs {
        struct mutex            btree_trans_lock;
        struct list_head        btree_trans_list;
        mempool_t               btree_iters_pool;
+       struct btree_iter_buf  __percpu *btree_iters_bufs;
+
+       struct srcu_struct      btree_trans_barrier;
 
        struct btree_key_cache  btree_key_cache;
 
        struct workqueue_struct *wq;
        /* copygc needs its own workqueue for index updates.. */
        struct workqueue_struct *copygc_wq;
-       struct workqueue_struct *journal_reclaim_wq;
 
        /* ALLOCATION */
        struct delayed_work     pd_controllers_update;
@@ -649,6 +680,7 @@ struct bch_fs {
        unsigned                bucket_size_max;
 
        atomic64_t              sectors_available;
+       struct mutex            sectors_available_lock;
 
        struct bch_fs_pcpu __percpu     *pcpu;
 
@@ -656,20 +688,13 @@ struct bch_fs {
 
        seqcount_t                      usage_lock;
        struct bch_fs_usage             *usage_base;
-       struct bch_fs_usage __percpu    *usage[2];
+       struct bch_fs_usage __percpu    *usage[JOURNAL_BUF_NR];
        struct bch_fs_usage __percpu    *usage_gc;
+       u64 __percpu            *online_reserved;
 
        /* single element mempool: */
        struct mutex            usage_scratch_lock;
-       struct bch_fs_usage     *usage_scratch;
-
-       /*
-        * When we invalidate buckets, we use both the priority and the amount
-        * of good data to determine which buckets to reuse first - to weight
-        * those together consistently we keep track of the smallest nonzero
-        * priority of any bucket.
-        */
-       struct bucket_clock     bucket_clock[2];
+       struct bch_fs_usage_online *usage_scratch;
 
        struct io_clock         io_clock[2];
 
@@ -705,7 +730,7 @@ struct bch_fs {
         * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos]
         * has been marked by GC.
         *
-        * gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.)
+        * gc_cur_phase is a superset of btree_ids (BTREE_ID_extents etc.)
         *
         * Protected by gc_pos_lock. Only written to by GC thread, so GC thread
         * can read without a lock.
@@ -780,6 +805,9 @@ struct bch_fs {
        struct bio_set          dio_write_bioset;
        struct bio_set          dio_read_bioset;
 
+
+       atomic64_t              btree_writes_nr;
+       atomic64_t              btree_writes_sectors;
        struct bio_list         btree_write_error_list;
        struct work_struct      btree_write_error_work;
        spinlock_t              btree_write_error_lock;
@@ -801,7 +829,8 @@ struct bch_fs {
        struct mutex            verify_lock;
 #endif
 
-       u64                     unused_inode_hint;
+       u64                     *unused_inode_hints;
+       unsigned                inode_shard_bits;
 
        /*
         * A btree node on disk could have too many bsets for an iterator to fit
@@ -814,6 +843,7 @@ struct bch_fs {
        struct journal          journal;
        struct list_head        journal_entries;
        struct journal_keys     journal_keys;
+       struct list_head        journal_iters;
 
        u64                     last_bucket_seq_cleanup;
 
@@ -826,10 +856,6 @@ struct bch_fs {
        unsigned                copy_gc_enabled:1;
        bool                    promote_whole_extents;
 
-#define BCH_DEBUG_PARAM(name, description) bool name;
-       BCH_DEBUG_PARAMS_ALL()
-#undef BCH_DEBUG_PARAM
-
        struct time_stats       times[BCH_TIME_STAT_NR];
 };
 
index 2926c648a17f2761fe77df869dd7106994539866..ead7268bf8984052d0c2935100286980ee1cbcbb 100644
@@ -138,19 +138,18 @@ struct bpos {
 #define KEY_SNAPSHOT_MAX               ((__u32)~0U)
 #define KEY_SIZE_MAX                   ((__u32)~0U)
 
-static inline struct bpos POS(__u64 inode, __u64 offset)
+static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot)
 {
-       struct bpos ret;
-
-       ret.inode       = inode;
-       ret.offset      = offset;
-       ret.snapshot    = 0;
-
-       return ret;
+       return (struct bpos) {
+               .inode          = inode,
+               .offset         = offset,
+               .snapshot       = snapshot,
+       };
 }
 
-#define POS_MIN                                POS(0, 0)
-#define POS_MAX                                POS(KEY_INODE_MAX, KEY_OFFSET_MAX)
+#define POS_MIN                                SPOS(0, 0, 0)
+#define POS_MAX                                SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX)
+#define POS(_inode, _offset)           SPOS(_inode, _offset, 0)
 
 /* Empty placeholder struct, for container_of() */
 struct bch_val {
@@ -326,7 +325,7 @@ static inline void bkey_init(struct bkey *k)
        x(discard,              1)                      \
        x(error,                2)                      \
        x(cookie,               3)                      \
-       x(whiteout,             4)                      \
+       x(hash_whiteout,        4)                      \
        x(btree_ptr,            5)                      \
        x(extent,               6)                      \
        x(reservation,          7)                      \
@@ -341,7 +340,8 @@ static inline void bkey_init(struct bkey *k)
        x(reflink_v,            16)                     \
        x(inline_data,          17)                     \
        x(btree_ptr_v2,         18)                     \
-       x(indirect_inline_data, 19)
+       x(indirect_inline_data, 19)                     \
+       x(alloc_v2,             20)
 
 enum bch_bkey_type {
 #define x(name, nr) KEY_TYPE_##name    = nr,
@@ -350,11 +350,27 @@ enum bch_bkey_type {
        KEY_TYPE_MAX,
 };
 
+struct bch_deleted {
+       struct bch_val          v;
+};
+
+struct bch_discard {
+       struct bch_val          v;
+};
+
+struct bch_error {
+       struct bch_val          v;
+};
+
 struct bch_cookie {
        struct bch_val          v;
        __le64                  cookie;
 };
 
+struct bch_hash_whiteout {
+       struct bch_val          v;
+};
+
 /* Extents */
 
 /*
@@ -551,9 +567,11 @@ struct bch_extent_stripe_ptr {
 #if defined(__LITTLE_ENDIAN_BITFIELD)
        __u64                   type:5,
                                block:8,
-                               idx:51;
+                               redundancy:4,
+                               idx:47;
 #elif defined (__BIG_ENDIAN_BITFIELD)
-       __u64                   idx:51,
+       __u64                   idx:47,
+                               redundancy:4,
                                block:8,
                                type:5;
 #endif
@@ -603,13 +621,14 @@ struct bch_btree_ptr_v2 {
        __u64                   mem_ptr;
        __le64                  seq;
        __le16                  sectors_written;
-       /* In case we ever decide to do variable size btree nodes: */
-       __le16                  sectors;
+       __le16                  flags;
        struct bpos             min_key;
        struct bch_extent_ptr   start[0];
        __u64                   _data[0];
 } __attribute__((packed, aligned(8)));
 
+LE16_BITMASK(BTREE_PTR_RANGE_UPDATED,  struct bch_btree_ptr_v2, flags, 0, 1);
+
 struct bch_extent {
        struct bch_val          v;
 
@@ -634,8 +653,6 @@ struct bch_reservation {
 #define BKEY_EXTENT_VAL_U64s_MAX                               \
        (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
 
-#define BKEY_PADDED(key)       __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX)
-
 /* * Maximum possible size of an entire extent, key + value: */
 #define BKEY_EXTENT_U64s_MAX           (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
 
@@ -669,10 +686,10 @@ struct bch_inode_generation {
 } __attribute__((packed, aligned(8)));
 
 #define BCH_INODE_FIELDS()                     \
-       x(bi_atime,                     64)     \
-       x(bi_ctime,                     64)     \
-       x(bi_mtime,                     64)     \
-       x(bi_otime,                     64)     \
+       x(bi_atime,                     96)     \
+       x(bi_ctime,                     96)     \
+       x(bi_mtime,                     96)     \
+       x(bi_otime,                     96)     \
        x(bi_size,                      64)     \
        x(bi_sectors,                   64)     \
        x(bi_uid,                       32)     \
@@ -689,7 +706,9 @@ struct bch_inode_generation {
        x(bi_foreground_target,         16)     \
        x(bi_background_target,         16)     \
        x(bi_erasure_code,              16)     \
-       x(bi_fields_set,                16)
+       x(bi_fields_set,                16)     \
+       x(bi_dir,                       64)     \
+       x(bi_dir_offset,                64)
 
 /* subset of BCH_INODE_FIELDS */
 #define BCH_INODE_OPTS()                       \
@@ -725,6 +744,7 @@ enum {
        __BCH_INODE_I_SIZE_DIRTY= 5,
        __BCH_INODE_I_SECTORS_DIRTY= 6,
        __BCH_INODE_UNLINKED    = 7,
+       __BCH_INODE_BACKPTR_UNTRUSTED = 8,
 
        /* bits 20+ reserved for packed fields below: */
 };
@@ -737,9 +757,11 @@ enum {
 #define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY)
 #define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY)
 #define BCH_INODE_UNLINKED     (1 << __BCH_INODE_UNLINKED)
+#define BCH_INODE_BACKPTR_UNTRUSTED (1 << __BCH_INODE_BACKPTR_UNTRUSTED)
 
 LE32_BITMASK(INODE_STR_HASH,   struct bch_inode, bi_flags, 20, 24);
-LE32_BITMASK(INODE_NR_FIELDS,  struct bch_inode, bi_flags, 24, 32);
+LE32_BITMASK(INODE_NR_FIELDS,  struct bch_inode, bi_flags, 24, 31);
+LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32);
 
 /* Dirents */
 
@@ -799,35 +821,40 @@ struct bch_alloc {
        __u8                    data[];
 } __attribute__((packed, aligned(8)));
 
-#define BCH_ALLOC_FIELDS()                     \
+#define BCH_ALLOC_FIELDS_V1()                  \
        x(read_time,            16)             \
        x(write_time,           16)             \
        x(data_type,            8)              \
        x(dirty_sectors,        16)             \
        x(cached_sectors,       16)             \
-       x(oldest_gen,           8)
+       x(oldest_gen,           8)              \
+       x(stripe,               32)             \
+       x(stripe_redundancy,    8)
+
+struct bch_alloc_v2 {
+       struct bch_val          v;
+       __u8                    nr_fields;
+       __u8                    gen;
+       __u8                    oldest_gen;
+       __u8                    data_type;
+       __u8                    data[];
+} __attribute__((packed, aligned(8)));
+
+#define BCH_ALLOC_FIELDS_V2()                  \
+       x(read_time,            64)             \
+       x(write_time,           64)             \
+       x(dirty_sectors,        16)             \
+       x(cached_sectors,       16)             \
+       x(stripe,               32)             \
+       x(stripe_redundancy,    8)
 
 enum {
-#define x(name, bytes) BCH_ALLOC_FIELD_##name,
-       BCH_ALLOC_FIELDS()
+#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
+       BCH_ALLOC_FIELDS_V1()
 #undef x
        BCH_ALLOC_FIELD_NR
 };
 
-static const unsigned BCH_ALLOC_FIELD_BYTES[] = {
-#define x(name, bits) [BCH_ALLOC_FIELD_##name] = bits / 8,
-       BCH_ALLOC_FIELDS()
-#undef x
-};
-
-#define x(name, bits) + (bits / 8)
-static const unsigned BKEY_ALLOC_VAL_U64s_MAX =
-       DIV_ROUND_UP(offsetof(struct bch_alloc, data)
-                    BCH_ALLOC_FIELDS(), sizeof(u64));
-#undef x
-
-#define BKEY_ALLOC_U64s_MAX    (BKEY_U64s + BKEY_ALLOC_VAL_U64s_MAX)
-
 /* Quotas: */
 
 enum quota_types {
@@ -963,19 +990,29 @@ LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS,   struct bch_member, flags[1], 0,  20);
 LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40);
 #endif
 
+#define BCH_MEMBER_STATES()                    \
+       x(rw,           0)                      \
+       x(ro,           1)                      \
+       x(failed,       2)                      \
+       x(spare,        3)
+
 enum bch_member_state {
-       BCH_MEMBER_STATE_RW             = 0,
-       BCH_MEMBER_STATE_RO             = 1,
-       BCH_MEMBER_STATE_FAILED         = 2,
-       BCH_MEMBER_STATE_SPARE          = 3,
-       BCH_MEMBER_STATE_NR             = 4,
+#define x(t, n) BCH_MEMBER_STATE_##t = n,
+       BCH_MEMBER_STATES()
+#undef x
+       BCH_MEMBER_STATE_NR
 };
 
-enum cache_replacement {
-       CACHE_REPLACEMENT_LRU           = 0,
-       CACHE_REPLACEMENT_FIFO          = 1,
-       CACHE_REPLACEMENT_RANDOM        = 2,
-       CACHE_REPLACEMENT_NR            = 3,
+#define BCH_CACHE_REPLACEMENT_POLICIES()       \
+       x(lru,          0)                      \
+       x(fifo,         1)                      \
+       x(random,       2)
+
+enum bch_cache_replacement_policies {
+#define x(t, n) BCH_CACHE_REPLACEMENT_##t = n,
+       BCH_CACHE_REPLACEMENT_POLICIES()
+#undef x
+       BCH_CACHE_REPLACEMENT_NR
 };
 
 struct bch_sb_field_members {
@@ -1131,8 +1168,8 @@ struct bch_sb_field_clean {
        struct bch_sb_field     field;
 
        __le32                  flags;
-       __le16                  read_clock;
-       __le16                  write_clock;
+       __le16                  _read_clock; /* no longer used */
+       __le16                  _write_clock;
        __le64                  journal_seq;
 
        union {
@@ -1170,7 +1207,9 @@ enum bcachefs_metadata_version {
        bcachefs_metadata_version_new_versioning        = 10,
        bcachefs_metadata_version_bkey_renumber         = 10,
        bcachefs_metadata_version_inode_btree_change    = 11,
-       bcachefs_metadata_version_max                   = 12,
+       bcachefs_metadata_version_snapshot              = 12,
+       bcachefs_metadata_version_inode_backpointers    = 13,
+       bcachefs_metadata_version_max                   = 14,
 };
 
 #define bcachefs_metadata_version_current      (bcachefs_metadata_version_max - 1)
@@ -1275,7 +1314,8 @@ LE64_BITMASK(BCH_SB_PRJQUOTA,             struct bch_sb, flags[0], 59, 60);
 
 LE64_BITMASK(BCH_SB_HAS_ERRORS,                struct bch_sb, flags[0], 60, 61);
 
-LE64_BITMASK(BCH_SB_REFLINK,           struct bch_sb, flags[0], 61, 62);
+/* bit 61 was reflink option */
+LE64_BITMASK(BCH_SB_BIG_ENDIAN,                struct bch_sb, flags[0], 62, 63);
 
 /* 61-64 unused */
 
@@ -1305,6 +1345,7 @@ LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE,
 LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES,  struct bch_sb, flags[2],  4, 64);
 
 LE64_BITMASK(BCH_SB_ERASURE_CODE,      struct bch_sb, flags[3],  0, 16);
+LE64_BITMASK(BCH_SB_METADATA_TARGET,   struct bch_sb, flags[3], 16, 28);
 
 /*
  * Features:
@@ -1330,13 +1371,25 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE,       struct bch_sb, flags[3],  0, 16);
        x(btree_ptr_v2,                 11)     \
        x(extents_above_btree_updates,  12)     \
        x(btree_updates_journalled,     13)     \
-       x(reflink_inline_data,          14)
+       x(reflink_inline_data,          14)     \
+       x(new_varint,                   15)     \
+       x(journal_no_flush,             16)     \
+       x(alloc_v2,                     17)     \
+       x(extents_across_btree_nodes,   18)
+
+#define BCH_SB_FEATURES_ALWAYS                         \
+       ((1ULL << BCH_FEATURE_new_extent_overwrite)|    \
+        (1ULL << BCH_FEATURE_extents_above_btree_updates)|\
+        (1ULL << BCH_FEATURE_btree_updates_journalled)|\
+        (1ULL << BCH_FEATURE_alloc_v2)|\
+        (1ULL << BCH_FEATURE_extents_across_btree_nodes))
 
 #define BCH_SB_FEATURES_ALL                            \
-       ((1ULL << BCH_FEATURE_new_siphash)|             \
-        (1ULL << BCH_FEATURE_new_extent_overwrite)|    \
+       (BCH_SB_FEATURES_ALWAYS|                        \
+        (1ULL << BCH_FEATURE_new_siphash)|             \
         (1ULL << BCH_FEATURE_btree_ptr_v2)|            \
-        (1ULL << BCH_FEATURE_extents_above_btree_updates))
+        (1ULL << BCH_FEATURE_new_varint)|              \
+        (1ULL << BCH_FEATURE_journal_no_flush))
 
 enum bch_sb_feature {
 #define x(f, n) BCH_FEATURE_##f,
@@ -1345,20 +1398,35 @@ enum bch_sb_feature {
        BCH_FEATURE_NR,
 };
 
+#define BCH_SB_COMPAT()                                        \
+       x(alloc_info,                           0)      \
+       x(alloc_metadata,                       1)      \
+       x(extents_above_btree_updates_done,     2)      \
+       x(bformat_overflow_done,                3)
+
 enum bch_sb_compat {
-       BCH_COMPAT_FEAT_ALLOC_INFO      = 0,
-       BCH_COMPAT_FEAT_ALLOC_METADATA  = 1,
+#define x(f, n) BCH_COMPAT_##f,
+       BCH_SB_COMPAT()
+#undef x
+       BCH_COMPAT_NR,
 };
 
 /* options: */
 
 #define BCH_REPLICAS_MAX               4U
 
+#define BCH_BKEY_PTRS_MAX              16U
+
+#define BCH_ERROR_ACTIONS()            \
+       x(continue,             0)      \
+       x(ro,                   1)      \
+       x(panic,                2)
+
 enum bch_error_actions {
-       BCH_ON_ERROR_CONTINUE           = 0,
-       BCH_ON_ERROR_RO                 = 1,
-       BCH_ON_ERROR_PANIC              = 2,
-       BCH_NR_ERROR_ACTIONS            = 3,
+#define x(t, n) BCH_ON_ERROR_##t = n,
+       BCH_ERROR_ACTIONS()
+#undef x
+       BCH_ON_ERROR_NR
 };
 
 enum bch_str_hash_type {
@@ -1369,11 +1437,16 @@ enum bch_str_hash_type {
        BCH_STR_HASH_NR                 = 4,
 };
 
+#define BCH_STR_HASH_OPTS()            \
+       x(crc32c,               0)      \
+       x(crc64,                1)      \
+       x(siphash,              2)
+
 enum bch_str_hash_opts {
-       BCH_STR_HASH_OPT_CRC32C         = 0,
-       BCH_STR_HASH_OPT_CRC64          = 1,
-       BCH_STR_HASH_OPT_SIPHASH        = 2,
-       BCH_STR_HASH_OPT_NR             = 3,
+#define x(t, n) BCH_STR_HASH_OPT_##t = n,
+       BCH_STR_HASH_OPTS()
+#undef x
+       BCH_STR_HASH_OPT_NR
 };
 
 enum bch_csum_type {
@@ -1408,11 +1481,16 @@ static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
        }
 }
 
+#define BCH_CSUM_OPTS()                        \
+       x(none,                 0)      \
+       x(crc32c,               1)      \
+       x(crc64,                2)
+
 enum bch_csum_opts {
-       BCH_CSUM_OPT_NONE               = 0,
-       BCH_CSUM_OPT_CRC32C             = 1,
-       BCH_CSUM_OPT_CRC64              = 2,
-       BCH_CSUM_OPT_NR                 = 3,
+#define x(t, n) BCH_CSUM_OPT_##t = n,
+       BCH_CSUM_OPTS()
+#undef x
+       BCH_CSUM_OPT_NR
 };
 
 #define BCH_COMPRESSION_TYPES()                \
@@ -1424,7 +1502,7 @@ enum bch_csum_opts {
        x(incompressible,       5)
 
 enum bch_compression_type {
-#define x(t, n) BCH_COMPRESSION_TYPE_##t,
+#define x(t, n) BCH_COMPRESSION_TYPE_##t = n,
        BCH_COMPRESSION_TYPES()
 #undef x
        BCH_COMPRESSION_TYPE_NR
@@ -1437,7 +1515,7 @@ enum bch_compression_type {
        x(zstd,         3)
 
 enum bch_compression_opts {
-#define x(t, n) BCH_COMPRESSION_OPT_##t,
+#define x(t, n) BCH_COMPRESSION_OPT_##t = n,
        BCH_COMPRESSION_OPTS()
 #undef x
        BCH_COMPRESSION_OPT_NR
@@ -1487,7 +1565,9 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
        x(blacklist,            3)              \
        x(blacklist_v2,         4)              \
        x(usage,                5)              \
-       x(data_usage,           6)
+       x(data_usage,           6)              \
+       x(clock,                7)              \
+       x(dev_usage,            8)
 
 enum {
 #define x(f, nr)       BCH_JSET_ENTRY_##f      = nr,
@@ -1535,6 +1615,30 @@ struct jset_entry_data_usage {
        struct bch_replicas_entry r;
 } __attribute__((packed));
 
+struct jset_entry_clock {
+       struct jset_entry       entry;
+       __u8                    rw;
+       __u8                    pad[7];
+       __le64                  time;
+} __attribute__((packed));
+
+struct jset_entry_dev_usage_type {
+       __le64                  buckets;
+       __le64                  sectors;
+       __le64                  fragmented;
+} __attribute__((packed));
+
+struct jset_entry_dev_usage {
+       struct jset_entry       entry;
+       __le32                  dev;
+       __u32                   pad;
+
+       __le64                  buckets_ec;
+       __le64                  buckets_unavailable;
+
+       struct jset_entry_dev_usage_type d[];
+} __attribute__((packed));
+
 /*
  * On disk format for a journal entry:
  * seq is monotonically increasing; every journal entry has its own unique
@@ -1557,8 +1661,8 @@ struct jset {
 
        __u8                    encrypted_start[0];
 
-       __le16                  read_clock;
-       __le16                  write_clock;
+       __le16                  _read_clock; /* no longer used */
+       __le16                  _write_clock;
 
        /* Sequence number of oldest dirty journal entry */
        __le64                  last_seq;
@@ -1572,23 +1676,24 @@ struct jset {
 
 LE32_BITMASK(JSET_CSUM_TYPE,   struct jset, flags, 0, 4);
 LE32_BITMASK(JSET_BIG_ENDIAN,  struct jset, flags, 4, 5);
+LE32_BITMASK(JSET_NO_FLUSH,    struct jset, flags, 5, 6);
 
 #define BCH_JOURNAL_BUCKETS_MIN                8
 
 /* Btree: */
 
-#define BCH_BTREE_IDS()                                        \
-       x(EXTENTS,      0, "extents")                   \
-       x(INODES,       1, "inodes")                    \
-       x(DIRENTS,      2, "dirents")                   \
-       x(XATTRS,       3, "xattrs")                    \
-       x(ALLOC,        4, "alloc")                     \
-       x(QUOTAS,       5, "quotas")                    \
-       x(EC,           6, "stripes")                   \
-       x(REFLINK,      7, "reflink")
+#define BCH_BTREE_IDS()                                \
+       x(extents,      0)                      \
+       x(inodes,       1)                      \
+       x(dirents,      2)                      \
+       x(xattrs,       3)                      \
+       x(alloc,        4)                      \
+       x(quotas,       5)                      \
+       x(stripes,      6)                      \
+       x(reflink,      7)
 
 enum btree_id {
-#define x(kwd, val, name) BTREE_ID_##kwd = val,
+#define x(kwd, val) BTREE_ID_##kwd = val,
        BCH_BTREE_IDS()
 #undef x
        BTREE_ID_NR
@@ -1642,7 +1747,7 @@ struct btree_node {
        /* Closed interval: */
        struct bpos             min_key;
        struct bpos             max_key;
-       struct bch_extent_ptr   ptr;
+       struct bch_extent_ptr   _ptr; /* not used anymore */
        struct bkey_format      format;
 
        union {
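
The enum changes in bcachefs_format.h above all follow one pattern: each on-disk enum (BCH_SB_COMPAT(), BCH_ERROR_ACTIONS(), BCH_STR_HASH_OPTS(), BCH_CSUM_OPTS(), BCH_BTREE_IDS(), ...) is now generated from a single x() macro list, so the numeric value, the identifier, and any name table derived elsewhere stay in sync from one definition. A minimal stand-alone sketch of the idiom follows; the string table and its name are illustrative assumptions, not code from this patch.

/*
 * Sketch of the x-macro idiom used above; the name table below is an
 * illustrative assumption, not part of the patch.
 */
#include <stddef.h>

#define BCH_ERROR_ACTIONS()             \
        x(continue,             0)      \
        x(ro,                   1)      \
        x(panic,                2)

enum bch_error_actions {
#define x(t, n) BCH_ON_ERROR_##t = n,
        BCH_ERROR_ACTIONS()
#undef x
        BCH_ON_ERROR_NR
};

/* the same list expanded a second time, into strings indexed by value: */
static const char * const bch_error_action_names[] = {
#define x(t, n) [n] = #t,
        BCH_ERROR_ACTIONS()
#undef x
        NULL
};
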
index d71157a3e073277c7d80d81d4a814214fc95760b..f679fc2151bc4cfdd2e18a42674352f87e7fba7e 100644 (file)
@@ -14,6 +14,9 @@
 #define BCH_FORCE_IF_DATA_DEGRADED     (1 << 2)
 #define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3)
 
+#define BCH_FORCE_IF_LOST                      \
+       (BCH_FORCE_IF_DATA_LOST|                \
+        BCH_FORCE_IF_METADATA_LOST)
 #define BCH_FORCE_IF_DEGRADED                  \
        (BCH_FORCE_IF_DATA_DEGRADED|            \
         BCH_FORCE_IF_METADATA_DEGRADED)
@@ -73,6 +76,7 @@ struct bch_ioctl_incremental {
 #define BCH_IOCTL_READ_SUPER   _IOW(0xbc,      12, struct bch_ioctl_read_super)
 #define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc,      13,  struct bch_ioctl_disk_get_idx)
 #define BCH_IOCTL_DISK_RESIZE  _IOW(0xbc,      14,  struct bch_ioctl_disk_resize)
+#define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc,15,  struct bch_ioctl_disk_resize_journal)
 
 /* ioctl below act on a particular file, not the filesystem as a whole: */
 
@@ -167,10 +171,11 @@ struct bch_ioctl_disk_set_state {
 };
 
 enum bch_data_ops {
-       BCH_DATA_OP_SCRUB       = 0,
-       BCH_DATA_OP_REREPLICATE = 1,
-       BCH_DATA_OP_MIGRATE     = 2,
-       BCH_DATA_OP_NR          = 3,
+       BCH_DATA_OP_SCRUB               = 0,
+       BCH_DATA_OP_REREPLICATE         = 1,
+       BCH_DATA_OP_MIGRATE             = 2,
+       BCH_DATA_OP_REWRITE_OLD_NODES   = 3,
+       BCH_DATA_OP_NR                  = 4,
 };
 
 /*
@@ -183,11 +188,13 @@ enum bch_data_ops {
  * job. The file descriptor is O_CLOEXEC.
  */
 struct bch_ioctl_data {
-       __u32                   op;
+       __u16                   op;
+       __u8                    start_btree;
+       __u8                    end_btree;
        __u32                   flags;
 
-       struct bpos             start;
-       struct bpos             end;
+       struct bpos             start_pos;
+       struct bpos             end_pos;
 
        union {
        struct {
@@ -329,4 +336,17 @@ struct bch_ioctl_disk_resize {
        __u64                   nbuckets;
 };
 
+/*
+ * BCH_IOCTL_DISK_RESIZE_JOURNAL: resize journal on a device
+ *
+ * @dev                - member to resize
+ * @nbuckets   - new number of buckets
+ */
+struct bch_ioctl_disk_resize_journal {
+       __u32                   flags;
+       __u32                   pad;
+       __u64                   dev;
+       __u64                   nbuckets;
+};
+
 #endif /* _BCACHEFS_IOCTL_H */
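
The new BCH_IOCTL_DISK_RESIZE_JOURNAL reuses the argument layout of BCH_IOCTL_DISK_RESIZE but resizes a member device's journal rather than the device itself. A hedged userspace sketch of issuing it is below; the include path and the wrapper function are assumptions for illustration, and fs_fd is assumed to be an open file descriptor that accepts bcachefs filesystem ioctls.

/* Illustrative sketch only; the include path is an assumption. */
#include <stdio.h>
#include <sys/ioctl.h>
#include "bcachefs_ioctl.h"     /* struct bch_ioctl_disk_resize_journal */

static int resize_journal(int fs_fd, __u64 dev, __u64 nbuckets)
{
        struct bch_ioctl_disk_resize_journal r = {
                .flags          = 0,
                .pad            = 0,
                .dev            = dev,          /* member to resize */
                .nbuckets       = nbuckets,     /* new number of buckets */
        };

        if (ioctl(fs_fd, BCH_IOCTL_DISK_RESIZE_JOURNAL, &r)) {
                perror("BCH_IOCTL_DISK_RESIZE_JOURNAL");
                return -1;
        }
        return 0;
}
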
index 4d0c9129cd4abcb9cef2dc9543f124f990d872cb..3af56062601f62d17a3ba597f0737225ad715956 100644 (file)
@@ -411,7 +411,7 @@ static bool bkey_packed_successor(struct bkey_packed *out,
 
                if ((*p & mask) != mask) {
                        *p += 1ULL << offset;
-                       EBUG_ON(bkey_cmp_packed(b, out, &k) <= 0);
+                       EBUG_ON(bch2_bkey_cmp_packed(b, out, &k) <= 0);
                        return true;
                }
 
@@ -551,7 +551,12 @@ void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p)
 static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i,
                             unsigned bits, u64 offset)
 {
-       offset = bits == 64 ? 0 : min(offset, U64_MAX - ((1ULL << bits) - 1));
+       unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+       u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
+
+       bits = min(bits, unpacked_bits);
+
+       offset = bits == unpacked_bits ? 0 : min(offset, unpacked_max - ((1ULL << bits) - 1));
 
        f->bits_per_field[i]    = bits;
        f->field_offset[i]      = cpu_to_le64(offset);
@@ -609,15 +614,19 @@ const char *bch2_bkey_format_validate(struct bkey_format *f)
                return "incorrect number of fields";
 
        for (i = 0; i < f->nr_fields; i++) {
+               unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+               u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
                u64 field_offset = le64_to_cpu(f->field_offset[i]);
 
-               if (f->bits_per_field[i] > 64)
+               if (f->bits_per_field[i] > unpacked_bits)
                        return "field too large";
 
-               if (field_offset &&
-                   (f->bits_per_field[i] == 64 ||
-                   (field_offset + ((1ULL << f->bits_per_field[i]) - 1) <
-                    field_offset)))
+               if ((f->bits_per_field[i] == unpacked_bits) && field_offset)
+                       return "offset + bits overflow";
+
+               if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) &
+                    unpacked_mask) <
+                   field_offset)
                        return "offset + bits overflow";
 
                bits += f->bits_per_field[i];
@@ -1040,7 +1049,7 @@ int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l,
                              high_word(f, r),
                              b->nr_key_bits);
 
-       EBUG_ON(ret != bkey_cmp(bkey_unpack_pos(b, l),
+       EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l),
                                bkey_unpack_pos(b, r)));
        return ret;
 }
@@ -1050,13 +1059,13 @@ int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b,
                                               const struct bkey_packed *l,
                                               const struct bpos *r)
 {
-       return bkey_cmp(bkey_unpack_pos_format_checked(b, l), *r);
+       return bpos_cmp(bkey_unpack_pos_format_checked(b, l), *r);
 }
 
 __pure __flatten
-int __bch2_bkey_cmp_packed(const struct bkey_packed *l,
-                          const struct bkey_packed *r,
-                          const struct btree *b)
+int bch2_bkey_cmp_packed(const struct btree *b,
+                        const struct bkey_packed *l,
+                        const struct bkey_packed *r)
 {
        struct bkey unpacked;
 
@@ -1071,7 +1080,7 @@ int __bch2_bkey_cmp_packed(const struct bkey_packed *l,
                r = (void*) &unpacked;
        }
 
-       return bkey_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p);
+       return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p);
 }
 
 __pure __flatten
@@ -1082,7 +1091,7 @@ int __bch2_bkey_cmp_left_packed(const struct btree *b,
        const struct bkey *l_unpacked;
 
        return unlikely(l_unpacked = packed_to_bkey_c(l))
-               ? bkey_cmp(l_unpacked->p, *r)
+               ? bpos_cmp(l_unpacked->p, *r)
                : __bch2_bkey_cmp_left_packed_format_checked(b, l, r);
 }
 
@@ -1118,11 +1127,12 @@ void bch2_bkey_pack_test(void)
        struct bkey_packed p;
 
        struct bkey_format test_format = {
-               .key_u64s       = 2,
+               .key_u64s       = 3,
                .nr_fields      = BKEY_NR_FIELDS,
                .bits_per_field = {
                        13,
                        64,
+                       32,
                },
        };
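
Both set_format_field() and bch2_bkey_format_validate() above now derive a field's limit from bch2_bkey_format_current instead of hard-coding 64 bits, computing the largest value representable in `bits` bits as ~((~0ULL << 1) << (bits - 1)); the shift is split in two so that bits == 64 never shifts a 64-bit value by its full width, which C leaves undefined. A small stand-alone sketch, not taken from the patch:

#include <assert.h>
#include <stdint.h>

/* largest value representable in `bits` bits, valid for 1 <= bits <= 64 */
static uint64_t unpacked_max(unsigned bits)
{
        return ~((~0ULL << 1) << (bits - 1));
}

int main(void)
{
        assert(unpacked_max(1)  == 1);
        assert(unpacked_max(32) == UINT32_MAX);
        assert(unpacked_max(64) == UINT64_MAX); /* no 64-bit shift needed */
        return 0;
}
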
 
index 80ea488d57b0c8f806eb6ae1a32d4f1909a30ce6..2e45d88fab0382cdc9e99e9d5449702adc8f30d0 100644 (file)
@@ -33,16 +33,6 @@ struct bkey_s {
 
 #define bkey_next(_k)          vstruct_next(_k)
 
-static inline struct bkey_packed *bkey_next_skip_noops(struct bkey_packed *k,
-                                                      struct bkey_packed *end)
-{
-       k = bkey_next(k);
-
-       while (k != end && !k->u64s)
-               k = (void *) ((u64 *) k + 1);
-       return k;
-}
-
 #define bkey_val_u64s(_k)      ((_k)->u64s - BKEY_U64s)
 
 static inline size_t bkey_val_bytes(const struct bkey *k)
@@ -67,13 +57,6 @@ static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
 #define bkey_whiteout(_k)                              \
        ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_discard)
 
-#define bkey_packed_typecheck(_k)                                      \
-({                                                                     \
-       BUILD_BUG_ON(!type_is(_k, struct bkey *) &&                     \
-                    !type_is(_k, struct bkey_packed *));               \
-       type_is(_k, struct bkey_packed *);                              \
-})
-
 enum bkey_lr_packed {
        BKEY_PACKED_BOTH,
        BKEY_PACKED_RIGHT,
@@ -81,9 +64,6 @@ enum bkey_lr_packed {
        BKEY_PACKED_NONE,
 };
 
-#define bkey_lr_packed_typecheck(_l, _r)                               \
-       (!bkey_packed_typecheck(_l) + ((!bkey_packed_typecheck(_r)) << 1))
-
 #define bkey_lr_packed(_l, _r)                                         \
        ((_l)->format + ((_r)->format << 1))
 
@@ -132,9 +112,9 @@ int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *,
                                          const struct bpos *);
 
 __pure
-int __bch2_bkey_cmp_packed(const struct bkey_packed *,
-                          const struct bkey_packed *,
-                          const struct btree *);
+int bch2_bkey_cmp_packed(const struct btree *,
+                        const struct bkey_packed *,
+                        const struct bkey_packed *);
 
 __pure
 int __bch2_bkey_cmp_left_packed(const struct btree *,
@@ -160,55 +140,58 @@ static inline int bkey_cmp_left_packed_byval(const struct btree *b,
        return bkey_cmp_left_packed(b, l, &r);
 }
 
-/*
- * If @_l or @_r are struct bkey * (not bkey_packed *), uses type information to
- * skip dispatching on k->format:
- */
-#define bkey_cmp_packed(_b, _l, _r)                                    \
-({                                                                     \
-       int _cmp;                                                       \
-                                                                       \
-       switch (bkey_lr_packed_typecheck(_l, _r)) {                     \
-       case BKEY_PACKED_NONE:                                          \
-               _cmp = bkey_cmp(((struct bkey *) (_l))->p,              \
-                               ((struct bkey *) (_r))->p);             \
-               break;                                                  \
-       case BKEY_PACKED_LEFT:                                          \
-               _cmp = bkey_cmp_left_packed((_b),                       \
-                                 (struct bkey_packed *) (_l),          \
-                                 &((struct bkey *) (_r))->p);          \
-               break;                                                  \
-       case BKEY_PACKED_RIGHT:                                         \
-               _cmp = -bkey_cmp_left_packed((_b),                      \
-                                 (struct bkey_packed *) (_r),          \
-                                 &((struct bkey *) (_l))->p);          \
-               break;                                                  \
-       case BKEY_PACKED_BOTH:                                          \
-               _cmp = __bch2_bkey_cmp_packed((void *) (_l),            \
-                                        (void *) (_r), (_b));          \
-               break;                                                  \
-       }                                                               \
-       _cmp;                                                           \
-})
-
-#if 1
+static __always_inline int bpos_cmp(struct bpos l, struct bpos r)
+{
+       return  cmp_int(l.inode,    r.inode) ?:
+               cmp_int(l.offset,   r.offset) ?:
+               cmp_int(l.snapshot, r.snapshot);
+}
+
 static __always_inline int bkey_cmp(struct bpos l, struct bpos r)
 {
-       if (l.inode != r.inode)
-               return l.inode < r.inode ? -1 : 1;
-       if (l.offset != r.offset)
-               return l.offset < r.offset ? -1 : 1;
-       if (l.snapshot != r.snapshot)
-               return l.snapshot < r.snapshot ? -1 : 1;
-       return 0;
+       return  cmp_int(l.inode,    r.inode) ?:
+               cmp_int(l.offset,   r.offset);
 }
-#else
-int bkey_cmp(struct bpos l, struct bpos r);
-#endif
 
 static inline struct bpos bpos_min(struct bpos l, struct bpos r)
 {
-       return bkey_cmp(l, r) < 0 ? l : r;
+       return bpos_cmp(l, r) < 0 ? l : r;
+}
+
+static inline struct bpos bpos_max(struct bpos l, struct bpos r)
+{
+       return bpos_cmp(l, r) > 0 ? l : r;
+}
+
+#define sbb(a, b, borrow)                              \
+do {                                                   \
+       typeof(a) d1, d2;                               \
+                                                       \
+       d1 = a - borrow;                                \
+       borrow  = d1 > a;                               \
+                                                       \
+       d2 = d1 - b;                                    \
+       borrow += d2 > d1;                              \
+       a = d2;                                         \
+} while (0)
+
+/* returns a - b: */
+static inline struct bpos bpos_sub(struct bpos a, struct bpos b)
+{
+       int borrow = 0;
+
+       sbb(a.snapshot, b.snapshot,     borrow);
+       sbb(a.offset,   b.offset,       borrow);
+       sbb(a.inode,    b.inode,        borrow);
+       return a;
+}
+
+static inline struct bpos bpos_diff(struct bpos l, struct bpos r)
+{
+       if (bpos_cmp(l, r) > 0)
+               swap(l, r);
+
+       return bpos_sub(r, l);
 }
 
 void bch2_bpos_swab(struct bpos *);
@@ -267,24 +250,46 @@ static inline unsigned bkey_format_key_bits(const struct bkey_format *format)
                format->bits_per_field[BKEY_FIELD_SNAPSHOT];
 }
 
-static inline struct bpos bkey_successor(struct bpos p)
+static inline struct bpos bpos_successor(struct bpos p)
 {
-       struct bpos ret = p;
+       if (!++p.snapshot &&
+           !++p.offset &&
+           !++p.inode)
+               BUG();
 
-       if (!++ret.offset)
-               BUG_ON(!++ret.inode);
+       return p;
+}
 
-       return ret;
+static inline struct bpos bpos_predecessor(struct bpos p)
+{
+       if (!p.snapshot-- &&
+           !p.offset-- &&
+           !p.inode--)
+               BUG();
+
+       return p;
 }
 
-static inline struct bpos bkey_predecessor(struct bpos p)
+static inline struct bpos bpos_nosnap_successor(struct bpos p)
 {
-       struct bpos ret = p;
+       p.snapshot = 0;
 
-       if (!ret.offset--)
-               BUG_ON(!ret.inode--);
+       if (!++p.offset &&
+           !++p.inode)
+               BUG();
 
-       return ret;
+       return p;
+}
+
+static inline struct bpos bpos_nosnap_predecessor(struct bpos p)
+{
+       p.snapshot = 0;
+
+       if (!p.offset-- &&
+           !p.inode--)
+               BUG();
+
+       return p;
 }
 
 static inline u64 bkey_start_offset(const struct bkey *k)
@@ -439,7 +444,7 @@ static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
  * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion
  * functions.
  */
-#define BKEY_VAL_ACCESSORS(name)                                       \
+#define x(name, ...)                                   \
 struct bkey_i_##name {                                                 \
        union {                                                         \
                struct bkey             k;                              \
@@ -550,22 +555,8 @@ static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
        return k;                                                       \
 }
 
-BKEY_VAL_ACCESSORS(cookie);
-BKEY_VAL_ACCESSORS(btree_ptr);
-BKEY_VAL_ACCESSORS(extent);
-BKEY_VAL_ACCESSORS(reservation);
-BKEY_VAL_ACCESSORS(inode);
-BKEY_VAL_ACCESSORS(inode_generation);
-BKEY_VAL_ACCESSORS(dirent);
-BKEY_VAL_ACCESSORS(xattr);
-BKEY_VAL_ACCESSORS(alloc);
-BKEY_VAL_ACCESSORS(quota);
-BKEY_VAL_ACCESSORS(stripe);
-BKEY_VAL_ACCESSORS(reflink_p);
-BKEY_VAL_ACCESSORS(reflink_v);
-BKEY_VAL_ACCESSORS(inline_data);
-BKEY_VAL_ACCESSORS(btree_ptr_v2);
-BKEY_VAL_ACCESSORS(indirect_inline_data);
+BCH_BKEY_TYPES();
+#undef x
 
 /* byte order helpers */
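
The comparison split above is the heart of this bkey.h change: bpos_cmp() orders positions by inode, offset and snapshot, while bkey_cmp() keeps the old two-field ordering and ignores the snapshot. The stand-alone sketch below uses simplified stand-in types and, like the kernel code, GCC's a ?: b extension; it is illustrative only.

#include <stdint.h>
#include <stdio.h>

/* stand-in for the on-disk struct bpos; field layout here is illustrative */
struct bpos { uint64_t inode, offset; uint32_t snapshot; };

#define cmp_int(l, r)   (((l) > (r)) - ((l) < (r)))

static int bpos_cmp(struct bpos l, struct bpos r)
{
        return  cmp_int(l.inode,    r.inode) ?:
                cmp_int(l.offset,   r.offset) ?:
                cmp_int(l.snapshot, r.snapshot);
}

static int bkey_cmp(struct bpos l, struct bpos r)
{
        return  cmp_int(l.inode,    r.inode) ?:
                cmp_int(l.offset,   r.offset);
}

int main(void)
{
        struct bpos a = { 1, 10, 0 }, b = { 1, 10, 5 };

        printf("bpos_cmp: %d, bkey_cmp: %d\n", bpos_cmp(a, b), bkey_cmp(a, b));
        /* prints "bpos_cmp: -1, bkey_cmp: 0": only bpos_cmp sees the snapshot */
        return 0;
}
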
 
diff --git a/libbcachefs/bkey_buf.h b/libbcachefs/bkey_buf.h
new file mode 100644 (file)
index 0000000..0d7c67a
--- /dev/null
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_BUF_H
+#define _BCACHEFS_BKEY_BUF_H
+
+#include "bcachefs.h"
+
+struct bkey_buf {
+       struct bkey_i   *k;
+       u64             onstack[12];
+};
+
+static inline void bch2_bkey_buf_realloc(struct bkey_buf *s,
+                                        struct bch_fs *c, unsigned u64s)
+{
+       if (s->k == (void *) s->onstack &&
+           u64s > ARRAY_SIZE(s->onstack)) {
+               s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS);
+               memcpy(s->k, s->onstack, sizeof(s->onstack));
+       }
+}
+
+static inline void bch2_bkey_buf_reassemble(struct bkey_buf *s,
+                                           struct bch_fs *c,
+                                           struct bkey_s_c k)
+{
+       bch2_bkey_buf_realloc(s, c, k.k->u64s);
+       bkey_reassemble(s->k, k);
+}
+
+static inline void bch2_bkey_buf_copy(struct bkey_buf *s,
+                                     struct bch_fs *c,
+                                     struct bkey_i *src)
+{
+       bch2_bkey_buf_realloc(s, c, src->k.u64s);
+       bkey_copy(s->k, src);
+}
+
+static inline void bch2_bkey_buf_unpack(struct bkey_buf *s,
+                                       struct bch_fs *c,
+                                       struct btree *b,
+                                       struct bkey_packed *src)
+{
+       bch2_bkey_buf_realloc(s, c, BKEY_U64s +
+                             bkeyp_val_u64s(&b->format, src));
+       bch2_bkey_unpack(b, s->k, src);
+}
+
+static inline void bch2_bkey_buf_init(struct bkey_buf *s)
+{
+       s->k = (void *) s->onstack;
+}
+
+static inline void bch2_bkey_buf_exit(struct bkey_buf *s, struct bch_fs *c)
+{
+       if (s->k != (void *) s->onstack)
+               mempool_free(s->k, &c->large_bkey_pool);
+       s->k = NULL;
+}
+
+#endif /* _BCACHEFS_BKEY_BUF_H */
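
bkey_buf is the replacement for bkey_on_stack (deleted further below), with the same init/realloc/exit contract plus the new copy and unpack helpers: the buffer points at its on-stack storage until a key is too large, then falls back to the filesystem's large_bkey_pool, so bch2_bkey_buf_exit() must always pair with bch2_bkey_buf_init(). A usage sketch, mirroring the conversions visible in bkey_sort.c below; the example function is made up for illustration.

#include "bkey_buf.h"

/* Hypothetical example, not part of the patch. */
static void example_copy_and_modify(struct bch_fs *c, struct bkey_s_c k)
{
        struct bkey_buf tmp;

        bch2_bkey_buf_init(&tmp);               /* tmp.k -> tmp.onstack */
        bch2_bkey_buf_reassemble(&tmp, c, k);   /* copy k, growing via mempool if needed */

        /* ... tmp.k is now a private, mutable struct bkey_i ... */

        bch2_bkey_buf_exit(&tmp, c);            /* returns pool memory, if any */
}
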
index 32849229801dbbf195f3baa4421c96e034971c21..6fe95b802e130060b48caa6ad54b2f2c38620c9b 100644 (file)
@@ -59,7 +59,7 @@ static const char *key_type_cookie_invalid(const struct bch_fs *c,
        .key_invalid = key_type_cookie_invalid,         \
 }
 
-#define bch2_bkey_ops_whiteout (struct bkey_ops) {     \
+#define bch2_bkey_ops_hash_whiteout (struct bkey_ops) {        \
        .key_invalid = empty_val_key_invalid,           \
 }
 
@@ -104,7 +104,7 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
        if (k.k->u64s < BKEY_U64s)
                return "u64s too small";
 
-       if (type == BKEY_TYPE_BTREE &&
+       if (type == BKEY_TYPE_btree &&
            bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
                return "value too big";
 
@@ -119,10 +119,17 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
                        return "nonzero size field";
        }
 
-       if (k.k->p.snapshot)
+       if (type != BKEY_TYPE_btree &&
+           !btree_type_has_snapshots(type) &&
+           k.k->p.snapshot)
                return "nonzero snapshot";
 
-       if (type != BKEY_TYPE_BTREE &&
+       if (type != BKEY_TYPE_btree &&
+           btree_type_has_snapshots(type) &&
+           k.k->p.snapshot != U32_MAX)
+               return "invalid snapshot field";
+
+       if (type != BKEY_TYPE_btree &&
            !bkey_cmp(k.k->p, POS_MAX))
                return "POS_MAX key";
 
@@ -138,10 +145,10 @@ const char *bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
 
 const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
 {
-       if (bkey_cmp(k.k->p, b->data->min_key) < 0)
+       if (bpos_cmp(k.k->p, b->data->min_key) < 0)
                return "key before start of btree node";
 
-       if (bkey_cmp(k.k->p, b->data->max_key) > 0)
+       if (bpos_cmp(k.k->p, b->data->max_key) > 0)
                return "key past end of btree node";
 
        return NULL;
@@ -149,7 +156,6 @@ const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
 
 void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
 {
-       const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type];
        const char *invalid;
 
        BUG_ON(!k.k->u64s);
@@ -161,33 +167,46 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
 
                bch2_bkey_val_to_text(&PBUF(buf), c, k);
                bch2_fs_inconsistent(c, "invalid bkey %s: %s", buf, invalid);
-               return;
        }
-
-       if (ops->key_debugcheck)
-               ops->key_debugcheck(c, k);
 }
 
 void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)
 {
-       if (!bkey_cmp(pos, POS_MIN))
+       if (!bpos_cmp(pos, POS_MIN))
                pr_buf(out, "POS_MIN");
-       else if (!bkey_cmp(pos, POS_MAX))
+       else if (!bpos_cmp(pos, POS_MAX))
                pr_buf(out, "POS_MAX");
-       else
-               pr_buf(out, "%llu:%llu", pos.inode, pos.offset);
+       else {
+               if (pos.inode == U64_MAX)
+                       pr_buf(out, "U64_MAX");
+               else
+                       pr_buf(out, "%llu", pos.inode);
+               pr_buf(out, ":");
+               if (pos.offset == U64_MAX)
+                       pr_buf(out, "U64_MAX");
+               else
+                       pr_buf(out, "%llu", pos.offset);
+               pr_buf(out, ":");
+               if (pos.snapshot == U32_MAX)
+                       pr_buf(out, "U32_MAX");
+               else
+                       pr_buf(out, "%u", pos.snapshot);
+       }
 }
 
 void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k)
 {
        if (k) {
-               pr_buf(out, "u64s %u type %s ", k->u64s,
-                      bch2_bkey_types[k->type]);
+               pr_buf(out, "u64s %u type ", k->u64s);
+
+               if (k->type < KEY_TYPE_MAX)
+                       pr_buf(out, "%s ", bch2_bkey_types[k->type]);
+               else
+                       pr_buf(out, "%u ", k->type);
 
                bch2_bpos_to_text(out, k->p);
 
-               pr_buf(out, " snap %u len %u ver %llu",
-                      k->p.snapshot, k->size, k->version.lo);
+               pr_buf(out, " len %u ver %llu", k->size, k->version.lo);
        } else {
                pr_buf(out, "(null)");
        }
@@ -196,10 +215,14 @@ void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k)
 void bch2_val_to_text(struct printbuf *out, struct bch_fs *c,
                      struct bkey_s_c k)
 {
-       const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type];
+       if (k.k->type < KEY_TYPE_MAX) {
+               const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type];
 
-       if (likely(ops->val_to_text))
-               ops->val_to_text(out, c, k);
+               if (likely(ops->val_to_text))
+                       ops->val_to_text(out, c, k);
+       } else {
+               pr_buf(out, "(invalid type %u)", k.k->type);
+       }
 }
 
 void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c,
@@ -236,11 +259,11 @@ enum merge_result bch2_bkey_merge(struct bch_fs *c,
        const struct bkey_ops *ops = &bch2_bkey_ops[l.k->type];
        enum merge_result ret;
 
-       if (key_merging_disabled(c) ||
+       if (bch2_key_merging_disabled ||
            !ops->key_merge ||
            l.k->type != r.k->type ||
            bversion_cmp(l.k->version, r.k->version) ||
-           bkey_cmp(l.k->p, bkey_start_pos(r.k)))
+           bpos_cmp(l.k->p, bkey_start_pos(r.k)))
                return BCH_MERGE_NOMERGE;
 
        ret = ops->key_merge(c, l, r);
@@ -255,18 +278,18 @@ static const struct old_bkey_type {
        u8              old;
        u8              new;
 } bkey_renumber_table[] = {
-       {BKEY_TYPE_BTREE,       128, KEY_TYPE_btree_ptr         },
-       {BKEY_TYPE_EXTENTS,     128, KEY_TYPE_extent            },
-       {BKEY_TYPE_EXTENTS,     129, KEY_TYPE_extent            },
-       {BKEY_TYPE_EXTENTS,     130, KEY_TYPE_reservation       },
-       {BKEY_TYPE_INODES,      128, KEY_TYPE_inode             },
-       {BKEY_TYPE_INODES,      130, KEY_TYPE_inode_generation  },
-       {BKEY_TYPE_DIRENTS,     128, KEY_TYPE_dirent            },
-       {BKEY_TYPE_DIRENTS,     129, KEY_TYPE_whiteout          },
-       {BKEY_TYPE_XATTRS,      128, KEY_TYPE_xattr             },
-       {BKEY_TYPE_XATTRS,      129, KEY_TYPE_whiteout          },
-       {BKEY_TYPE_ALLOC,       128, KEY_TYPE_alloc             },
-       {BKEY_TYPE_QUOTAS,      128, KEY_TYPE_quota             },
+       {BKEY_TYPE_btree,       128, KEY_TYPE_btree_ptr         },
+       {BKEY_TYPE_extents,     128, KEY_TYPE_extent            },
+       {BKEY_TYPE_extents,     129, KEY_TYPE_extent            },
+       {BKEY_TYPE_extents,     130, KEY_TYPE_reservation       },
+       {BKEY_TYPE_inodes,      128, KEY_TYPE_inode             },
+       {BKEY_TYPE_inodes,      130, KEY_TYPE_inode_generation  },
+       {BKEY_TYPE_dirents,     128, KEY_TYPE_dirent            },
+       {BKEY_TYPE_dirents,     129, KEY_TYPE_hash_whiteout     },
+       {BKEY_TYPE_xattrs,      128, KEY_TYPE_xattr             },
+       {BKEY_TYPE_xattrs,      129, KEY_TYPE_hash_whiteout     },
+       {BKEY_TYPE_alloc,       128, KEY_TYPE_alloc             },
+       {BKEY_TYPE_quotas,      128, KEY_TYPE_quota             },
 };
 
 void bch2_bkey_renumber(enum btree_node_type btree_node_type,
@@ -294,14 +317,15 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
        const struct bkey_ops *ops;
        struct bkey uk;
        struct bkey_s u;
+       unsigned nr_compat = 5;
        int i;
 
        /*
         * Do these operations in reverse order in the write path:
         */
 
-       for (i = 0; i < 4; i++)
-       switch (!write ? i : 3 - i) {
+       for (i = 0; i < nr_compat; i++)
+       switch (!write ? i : nr_compat - 1 - i) {
        case 0:
                if (big_endian != CPU_BIG_ENDIAN)
                        bch2_bkey_swab_key(f, k);
@@ -312,7 +336,7 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
                break;
        case 2:
                if (version < bcachefs_metadata_version_inode_btree_change &&
-                   btree_id == BTREE_ID_INODES) {
+                   btree_id == BTREE_ID_inodes) {
                        if (!bkey_packed(k)) {
                                struct bkey_i *u = packed_to_bkey(k);
                                swap(u->k.p.inode, u->k.p.offset);
@@ -335,6 +359,28 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
                }
                break;
        case 3:
+               if (version < bcachefs_metadata_version_snapshot &&
+                   (level || btree_type_has_snapshots(btree_id))) {
+                       struct bkey_i *u = packed_to_bkey(k);
+
+                       if (u) {
+                               u->k.p.snapshot = write
+                                       ? 0 : U32_MAX;
+                       } else {
+                               u64 min_packed = f->field_offset[BKEY_FIELD_SNAPSHOT];
+                               u64 max_packed = min_packed +
+                                       ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]);
+
+                               uk = __bch2_bkey_unpack_key(f, k);
+                               uk.p.snapshot = write
+                                       ? min_packed : min_t(u64, U32_MAX, max_packed);
+
+                               BUG_ON(!bch2_bkey_pack_key(k, &uk, f));
+                       }
+               }
+
+               break;
+       case 4:
                if (!bkey_packed(k)) {
                        u = bkey_i_to_s(packed_to_bkey(k));
                } else {
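
The new compat step (case 3 above) rewrites the snapshot field of keys written before bcachefs_metadata_version_snapshot; for packed keys it has to clamp to the range the bkey_format can actually encode, which is [field_offset, field_offset + 2^bits - 1] for a field packed in `bits` bits. A stand-alone sketch of that range computation, not taken from the patch:

#include <stdint.h>
#include <stdio.h>

static void packed_field_range(uint64_t field_offset, unsigned bits)
{
        uint64_t min_packed = field_offset;
        uint64_t max_packed = min_packed + ~(~0ULL << bits);    /* + 2^bits - 1 */

        printf("packable range: [%llu, %llu]\n",
               (unsigned long long) min_packed,
               (unsigned long long) max_packed);
}

int main(void)
{
        packed_field_range(0, 0);       /* degenerate: only the value 0 is packable */
        packed_field_range(0, 32);      /* [0, U32_MAX] */
        return 0;
}
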
index 0bca725ae3b8c5d3d719cc878c07bb1812cc6802..bfa6f112aeed17519677d96bc41ca9177f91abe4 100644 (file)
@@ -26,7 +26,6 @@ struct bkey_ops {
        /* Returns reason for being invalid if invalid, else NULL: */
        const char *    (*key_invalid)(const struct bch_fs *,
                                       struct bkey_s_c);
-       void            (*key_debugcheck)(struct bch_fs *, struct bkey_s_c);
        void            (*val_to_text)(struct printbuf *, struct bch_fs *,
                                       struct bkey_s_c);
        void            (*swab)(struct bkey_s);
diff --git a/libbcachefs/bkey_on_stack.h b/libbcachefs/bkey_on_stack.h
deleted file mode 100644 (file)
index f607a0c..0000000
+++ /dev/null
@@ -1,43 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BKEY_ON_STACK_H
-#define _BCACHEFS_BKEY_ON_STACK_H
-
-#include "bcachefs.h"
-
-struct bkey_on_stack {
-       struct bkey_i   *k;
-       u64             onstack[12];
-};
-
-static inline void bkey_on_stack_realloc(struct bkey_on_stack *s,
-                                        struct bch_fs *c, unsigned u64s)
-{
-       if (s->k == (void *) s->onstack &&
-           u64s > ARRAY_SIZE(s->onstack)) {
-               s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS);
-               memcpy(s->k, s->onstack, sizeof(s->onstack));
-       }
-}
-
-static inline void bkey_on_stack_reassemble(struct bkey_on_stack *s,
-                                           struct bch_fs *c,
-                                           struct bkey_s_c k)
-{
-       bkey_on_stack_realloc(s, c, k.k->u64s);
-       bkey_reassemble(s->k, k);
-}
-
-static inline void bkey_on_stack_init(struct bkey_on_stack *s)
-{
-       s->k = (void *) s->onstack;
-}
-
-static inline void bkey_on_stack_exit(struct bkey_on_stack *s,
-                                     struct bch_fs *c)
-{
-       if (s->k != (void *) s->onstack)
-               mempool_free(s->k, &c->large_bkey_pool);
-       s->k = NULL;
-}
-
-#endif /* _BCACHEFS_BKEY_ON_STACK_H */
index 839e78d1dc35fb3e71fdaff3407a9a50d58cd50d..537ab7919e886eec958e49e12e6b18962172e725 100644 (file)
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "bcachefs.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
 #include "bkey_sort.h"
 #include "bset.h"
 #include "extents.h"
@@ -14,9 +14,8 @@ static inline bool sort_iter_end(struct sort_iter *iter)
        return !iter->used;
 }
 
-static inline void __sort_iter_sift(struct sort_iter *iter,
-                                   unsigned from,
-                                   sort_cmp_fn cmp)
+static inline void sort_iter_sift(struct sort_iter *iter, unsigned from,
+                                 sort_cmp_fn cmp)
 {
        unsigned i;
 
@@ -27,18 +26,12 @@ static inline void __sort_iter_sift(struct sort_iter *iter,
                swap(iter->data[i], iter->data[i + 1]);
 }
 
-static inline void sort_iter_sift(struct sort_iter *iter, sort_cmp_fn cmp)
-{
-
-       __sort_iter_sift(iter, 0, cmp);
-}
-
 static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp)
 {
        unsigned i = iter->used;
 
        while (i--)
-               __sort_iter_sift(iter, i, cmp);
+               sort_iter_sift(iter, i, cmp);
 }
 
 static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter)
@@ -46,26 +39,20 @@ static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter)
        return !sort_iter_end(iter) ? iter->data->k : NULL;
 }
 
-static inline void __sort_iter_advance(struct sort_iter *iter,
-                                      unsigned idx, sort_cmp_fn cmp)
+static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
 {
-       struct sort_iter_set *i = iter->data + idx;
+       struct sort_iter_set *i = iter->data;
 
-       BUG_ON(idx >= iter->used);
+       BUG_ON(!iter->used);
 
-       i->k = bkey_next_skip_noops(i->k, i->end);
+       i->k = bkey_next(i->k);
 
        BUG_ON(i->k > i->end);
 
        if (i->k == i->end)
-               array_remove_item(iter->data, iter->used, idx);
+               array_remove_item(iter->data, iter->used, 0);
        else
-               __sort_iter_sift(iter, idx, cmp);
-}
-
-static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
-{
-       __sort_iter_advance(iter, 0, cmp);
+               sort_iter_sift(iter, 0, cmp);
 }
 
 static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter,
@@ -86,7 +73,7 @@ static inline int key_sort_fix_overlapping_cmp(struct btree *b,
                                               struct bkey_packed *l,
                                               struct bkey_packed *r)
 {
-       return bkey_cmp_packed(b, l, r) ?:
+       return bch2_bkey_cmp_packed(b, l, r) ?:
                cmp_int((unsigned long) l, (unsigned long) r);
 }
 
@@ -98,7 +85,7 @@ static inline bool should_drop_next_key(struct sort_iter *iter)
         * and should be dropped.
         */
        return iter->used >= 2 &&
-               !bkey_cmp_packed(iter->b,
+               !bch2_bkey_cmp_packed(iter->b,
                                 iter->data[0].k,
                                 iter->data[1].k);
 }
@@ -116,7 +103,7 @@ bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
        sort_iter_sort(iter, key_sort_fix_overlapping_cmp);
 
        while ((k = sort_iter_peek(iter))) {
-               if (!bkey_whiteout(k) &&
+               if (!bkey_deleted(k) &&
                    !should_drop_next_key(iter)) {
                        bkey_copy(out, k);
                        btree_keys_account_key_add(&nr, 0, out);
@@ -136,7 +123,7 @@ static void extent_sort_append(struct bch_fs *c,
                               struct bkey_packed **out,
                               struct bkey_s k)
 {
-       if (!bkey_whiteout(k.k)) {
+       if (!bkey_deleted(k.k)) {
                if (!bch2_bkey_pack_key(*out, k.k, f))
                        memcpy_u64s_small(*out, k.k, BKEY_U64s);
 
@@ -161,7 +148,7 @@ bch2_sort_repack(struct bset *dst, struct btree *src,
        memset(&nr, 0, sizeof(nr));
 
        while ((in = bch2_btree_node_iter_next_all(src_iter, src))) {
-               if (filter_whiteouts && bkey_whiteout(in))
+               if (filter_whiteouts && bkey_deleted(in))
                        continue;
 
                if (bch2_bkey_transform(out_f, out, bkey_packed(in)
@@ -187,14 +174,14 @@ bch2_sort_repack_merge(struct bch_fs *c,
                       bool filter_whiteouts)
 {
        struct bkey_packed *out = vstruct_last(dst), *k_packed;
-       struct bkey_on_stack k;
+       struct bkey_buf k;
        struct btree_nr_keys nr;
 
        memset(&nr, 0, sizeof(nr));
-       bkey_on_stack_init(&k);
+       bch2_bkey_buf_init(&k);
 
        while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) {
-               if (filter_whiteouts && bkey_whiteout(k_packed))
+               if (filter_whiteouts && bkey_deleted(k_packed))
                        continue;
 
                /*
@@ -204,7 +191,7 @@ bch2_sort_repack_merge(struct bch_fs *c,
                 * node; we have to make a copy of the entire key before calling
                 * normalize
                 */
-               bkey_on_stack_realloc(&k, c, k_packed->u64s + BKEY_U64s);
+               bch2_bkey_buf_realloc(&k, c, k_packed->u64s + BKEY_U64s);
                bch2_bkey_unpack(src, k.k, k_packed);
 
                if (filter_whiteouts &&
@@ -215,7 +202,7 @@ bch2_sort_repack_merge(struct bch_fs *c,
        }
 
        dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
-       bkey_on_stack_exit(&k, c);
+       bch2_bkey_buf_exit(&k, c);
        return nr;
 }
 
@@ -223,7 +210,7 @@ static inline int sort_keys_cmp(struct btree *b,
                                struct bkey_packed *l,
                                struct bkey_packed *r)
 {
-       return bkey_cmp_packed(b, l, r) ?:
+       return bch2_bkey_cmp_packed(b, l, r) ?:
                (int) bkey_deleted(r) - (int) bkey_deleted(l) ?:
                (int) l->needs_whiteout - (int) r->needs_whiteout;
 }
@@ -240,19 +227,19 @@ unsigned bch2_sort_keys(struct bkey_packed *dst,
        while ((in = sort_iter_next(iter, sort_keys_cmp))) {
                bool needs_whiteout = false;
 
-               if (bkey_whiteout(in) &&
+               if (bkey_deleted(in) &&
                    (filter_whiteouts || !in->needs_whiteout))
                        continue;
 
                while ((next = sort_iter_peek(iter)) &&
-                      !bkey_cmp_packed(iter->b, in, next)) {
+                      !bch2_bkey_cmp_packed(iter->b, in, next)) {
                        BUG_ON(in->needs_whiteout &&
                               next->needs_whiteout);
                        needs_whiteout |= in->needs_whiteout;
                        in = sort_iter_next(iter, sort_keys_cmp);
                }
 
-               if (bkey_whiteout(in)) {
+               if (bkey_deleted(in)) {
                        memcpy_u64s(out, in, bkeyp_key_u64s(f, in));
                        set_bkeyp_val_u64s(f, out, 0);
                } else {
@@ -264,252 +251,3 @@ unsigned bch2_sort_keys(struct bkey_packed *dst,
 
        return (u64 *) out - (u64 *) dst;
 }
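
The sort_iter helpers above keep one cursor per input bset, hold the cursor array ordered by each cursor's current key with a short bubble-style sift, and always consume data[0]; with the old extent-overwrite compat code removed below, iteration goes back to plain bkey_next() instead of bkey_next_skip_noops(). A toy sketch of the same merge scheme over plain integers, illustrative only:

#include <stdio.h>

struct cursor { const int *k, *end; };

static void sift(struct cursor *data, unsigned used, unsigned from)
{
        unsigned i;

        for (i = from;
             i + 1 < used && *data[i].k > *data[i + 1].k;
             i++) {
                struct cursor tmp = data[i];

                data[i] = data[i + 1];
                data[i + 1] = tmp;
        }
}

int main(void)
{
        const int a[] = { 1, 4, 9 }, b[] = { 2, 3, 8 };
        struct cursor data[2] = { { a, a + 3 }, { b, b + 3 } };
        unsigned used = 2, i;

        for (i = used; i--;)                    /* cf. sort_iter_sort() */
                sift(data, used, i);

        while (used) {                          /* cf. sort_iter_next() */
                printf("%d ", *data[0].k++);

                if (data[0].k == data[0].end) { /* cf. array_remove_item() */
                        for (i = 1; i < used; i++)
                                data[i - 1] = data[i];
                        used--;
                } else {                        /* cf. sort_iter_advance() */
                        sift(data, used, 0);
                }
        }
        printf("\n");                           /* prints: 1 2 3 4 8 9 */
        return 0;
}
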
-
-/* Compat code for btree_node_old_extent_overwrite: */
-
-/*
- * If keys compare equal, compare by pointer order:
- *
- * Necessary for sort_fix_overlapping() - if there are multiple keys that
- * compare equal in different sets, we have to process them newest to oldest.
- */
-static inline int extent_sort_fix_overlapping_cmp(struct btree *b,
-                                                 struct bkey_packed *l,
-                                                 struct bkey_packed *r)
-{
-       struct bkey ul = bkey_unpack_key(b, l);
-       struct bkey ur = bkey_unpack_key(b, r);
-
-       return bkey_cmp(bkey_start_pos(&ul),
-                       bkey_start_pos(&ur)) ?:
-               cmp_int((unsigned long) r, (unsigned long) l);
-}
-
-/*
- * The algorithm in extent_sort_fix_overlapping() relies on keys in the same
- * bset being ordered by start offset - but 0 size whiteouts (which are always
- * KEY_TYPE_deleted) break this ordering, so we need to skip over them:
- */
-static void extent_iter_advance(struct sort_iter *iter, unsigned idx)
-{
-       struct sort_iter_set *i = iter->data + idx;
-
-       do {
-               i->k = bkey_next_skip_noops(i->k, i->end);
-       } while (i->k != i->end && bkey_deleted(i->k));
-
-       if (i->k == i->end)
-               array_remove_item(iter->data, iter->used, idx);
-       else
-               __sort_iter_sift(iter, idx, extent_sort_fix_overlapping_cmp);
-}
-
-struct btree_nr_keys
-bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
-                                struct sort_iter *iter)
-{
-       struct btree *b = iter->b;
-       struct bkey_format *f = &b->format;
-       struct sort_iter_set *_l = iter->data, *_r = iter->data + 1;
-       struct bkey_packed *out = dst->start;
-       struct bkey l_unpacked, r_unpacked;
-       struct bkey_s l, r;
-       struct btree_nr_keys nr;
-       struct bkey_on_stack split;
-       unsigned i;
-
-       memset(&nr, 0, sizeof(nr));
-       bkey_on_stack_init(&split);
-
-       sort_iter_sort(iter, extent_sort_fix_overlapping_cmp);
-       for (i = 0; i < iter->used;) {
-               if (bkey_deleted(iter->data[i].k))
-                       __sort_iter_advance(iter, i,
-                                           extent_sort_fix_overlapping_cmp);
-               else
-                       i++;
-       }
-
-       while (!sort_iter_end(iter)) {
-               l = __bkey_disassemble(b, _l->k, &l_unpacked);
-
-               if (iter->used == 1) {
-                       extent_sort_append(c, f, &nr, &out, l);
-                       extent_iter_advance(iter, 0);
-                       continue;
-               }
-
-               r = __bkey_disassemble(b, _r->k, &r_unpacked);
-
-               /* If current key and next key don't overlap, just append */
-               if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) {
-                       extent_sort_append(c, f, &nr, &out, l);
-                       extent_iter_advance(iter, 0);
-                       continue;
-               }
-
-               /* Skip 0 size keys */
-               if (!r.k->size) {
-                       extent_iter_advance(iter, 1);
-                       continue;
-               }
-
-               /*
-                * overlap: keep the newer key and trim the older key so they
-                * don't overlap. comparing pointers tells us which one is
-                * newer, since the bsets are appended one after the other.
-                */
-
-               /* can't happen because of comparison func */
-               BUG_ON(_l->k < _r->k &&
-                      !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k)));
-
-               if (_l->k > _r->k) {
-                       /* l wins, trim r */
-                       if (bkey_cmp(l.k->p, r.k->p) >= 0) {
-                               extent_iter_advance(iter, 1);
-                       } else {
-                               bch2_cut_front_s(l.k->p, r);
-                               extent_save(b, _r->k, r.k);
-                               __sort_iter_sift(iter, 1,
-                                        extent_sort_fix_overlapping_cmp);
-                       }
-               } else if (bkey_cmp(l.k->p, r.k->p) > 0) {
-
-                       /*
-                        * r wins, but it overlaps in the middle of l - split l:
-                        */
-                       bkey_on_stack_reassemble(&split, c, l.s_c);
-                       bch2_cut_back(bkey_start_pos(r.k), split.k);
-
-                       bch2_cut_front_s(r.k->p, l);
-                       extent_save(b, _l->k, l.k);
-
-                       __sort_iter_sift(iter, 0,
-                                        extent_sort_fix_overlapping_cmp);
-
-                       extent_sort_append(c, f, &nr, &out,
-                                          bkey_i_to_s(split.k));
-               } else {
-                       bch2_cut_back_s(bkey_start_pos(r.k), l);
-                       extent_save(b, _l->k, l.k);
-               }
-       }
-
-       dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
-
-       bkey_on_stack_exit(&split, c);
-       return nr;
-}
-
-static inline int sort_extents_cmp(struct btree *b,
-                                  struct bkey_packed *l,
-                                  struct bkey_packed *r)
-{
-       return bkey_cmp_packed(b, l, r) ?:
-               (int) bkey_deleted(l) - (int) bkey_deleted(r);
-}
-
-unsigned bch2_sort_extents(struct bkey_packed *dst,
-                          struct sort_iter *iter,
-                          bool filter_whiteouts)
-{
-       struct bkey_packed *in, *out = dst;
-
-       sort_iter_sort(iter, sort_extents_cmp);
-
-       while ((in = sort_iter_next(iter, sort_extents_cmp))) {
-               if (bkey_deleted(in))
-                       continue;
-
-               if (bkey_whiteout(in) &&
-                   (filter_whiteouts || !in->needs_whiteout))
-                       continue;
-
-               bkey_copy(out, in);
-               out = bkey_next(out);
-       }
-
-       return (u64 *) out - (u64 *) dst;
-}
-
-static inline int sort_extent_whiteouts_cmp(struct btree *b,
-                                           struct bkey_packed *l,
-                                           struct bkey_packed *r)
-{
-       struct bkey ul = bkey_unpack_key(b, l);
-       struct bkey ur = bkey_unpack_key(b, r);
-
-       return bkey_cmp(bkey_start_pos(&ul), bkey_start_pos(&ur));
-}
-
-unsigned bch2_sort_extent_whiteouts(struct bkey_packed *dst,
-                                   struct sort_iter *iter)
-{
-       const struct bkey_format *f = &iter->b->format;
-       struct bkey_packed *in, *out = dst;
-       struct bkey_i l, r;
-       bool prev = false, l_packed = false;
-       u64 max_packed_size     = bkey_field_max(f, BKEY_FIELD_SIZE);
-       u64 max_packed_offset   = bkey_field_max(f, BKEY_FIELD_OFFSET);
-       u64 new_size;
-
-       max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX);
-
-       sort_iter_sort(iter, sort_extent_whiteouts_cmp);
-
-       while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) {
-               if (bkey_deleted(in))
-                       continue;
-
-               EBUG_ON(bkeyp_val_u64s(f, in));
-               EBUG_ON(in->type != KEY_TYPE_discard);
-
-               r.k = bkey_unpack_key(iter->b, in);
-
-               if (prev &&
-                   bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) {
-                       if (bkey_cmp(l.k.p, r.k.p) >= 0)
-                               continue;
-
-                       new_size = l_packed
-                               ? min(max_packed_size, max_packed_offset -
-                                     bkey_start_offset(&l.k))
-                               : KEY_SIZE_MAX;
-
-                       new_size = min(new_size, r.k.p.offset -
-                                      bkey_start_offset(&l.k));
-
-                       BUG_ON(new_size < l.k.size);
-
-                       bch2_key_resize(&l.k, new_size);
-
-                       if (bkey_cmp(l.k.p, r.k.p) >= 0)
-                               continue;
-
-                       bch2_cut_front(l.k.p, &r);
-               }
-
-               if (prev) {
-                       if (!bch2_bkey_pack(out, &l, f)) {
-                               BUG_ON(l_packed);
-                               bkey_copy(out, &l);
-                       }
-                       out = bkey_next(out);
-               }
-
-               l = r;
-               prev = true;
-               l_packed = bkey_packed(in);
-       }
-
-       if (prev) {
-               if (!bch2_bkey_pack(out, &l, f)) {
-                       BUG_ON(l_packed);
-                       bkey_copy(out, &l);
-               }
-               out = bkey_next(out);
-       }
-
-       return (u64 *) out - (u64 *) dst;
-}
index 458a051fdac523fd5833c89a3769de2b2ee93a43..1059996dac7807cc4b23573303d7c9717668d551 100644 (file)
@@ -32,9 +32,6 @@ static inline void sort_iter_add(struct sort_iter *iter,
 struct btree_nr_keys
 bch2_key_sort_fix_overlapping(struct bch_fs *, struct bset *,
                              struct sort_iter *);
-struct btree_nr_keys
-bch2_extent_sort_fix_overlapping(struct bch_fs *, struct bset *,
-                                struct sort_iter *);
 
 struct btree_nr_keys
 bch2_sort_repack(struct bset *, struct btree *,
@@ -48,10 +45,5 @@ bch2_sort_repack_merge(struct bch_fs *,
 
 unsigned bch2_sort_keys(struct bkey_packed *,
                        struct sort_iter *, bool);
-unsigned bch2_sort_extents(struct bkey_packed *,
-                          struct sort_iter *, bool);
-
-unsigned bch2_sort_extent_whiteouts(struct bkey_packed *,
-                                   struct sort_iter *);
 
 #endif /* _BCACHEFS_BKEY_SORT_H */
index f7c2841ed8a79908c602e7d6ab470a66aecc6247..f92a757f953dfbd9cffa5e26e110e55ee280c384 100644 (file)
@@ -78,7 +78,7 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b,
        for (_k = i->start;
             _k < vstruct_last(i);
             _k = _n) {
-               _n = bkey_next_skip_noops(_k, vstruct_last(i));
+               _n = bkey_next(_k);
 
                k = bkey_disassemble(b, _k, &uk);
                if (c)
@@ -93,13 +93,13 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b,
 
                n = bkey_unpack_key(b, _n);
 
-               if (bkey_cmp(bkey_start_pos(&n), k.k->p) < 0) {
+               if (bpos_cmp(n.p, k.k->p) < 0) {
                        printk(KERN_ERR "Key skipped backwards\n");
                        continue;
                }
 
                if (!bkey_deleted(k.k) &&
-                   !bkey_cmp(n.p, k.k->p))
+                   !bpos_cmp(n.p, k.k->p))
                        printk(KERN_ERR "Duplicate keys\n");
        }
 }
@@ -144,7 +144,7 @@ void __bch2_verify_btree_nr_keys(struct btree *b)
 
        for_each_bset(b, t)
                bset_tree_for_each_key(b, t, k)
-                       if (!bkey_whiteout(k))
+                       if (!bkey_deleted(k))
                                btree_keys_account_key_add(&nr, t - b->set, k);
 
        BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
@@ -369,10 +369,10 @@ static struct bkey_float *bkey_float(const struct btree *b,
        return ro_aux_tree_base(b, t)->f + idx;
 }
 
-static void bset_aux_tree_verify(struct btree *b)
+static void bset_aux_tree_verify(const struct btree *b)
 {
 #ifdef CONFIG_BCACHEFS_DEBUG
-       struct bset_tree *t;
+       const struct bset_tree *t;
 
        for_each_bset(b, t) {
                if (t->aux_data_offset == U16_MAX)
@@ -388,15 +388,13 @@ static void bset_aux_tree_verify(struct btree *b)
 #endif
 }
 
-void bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks)
+void bch2_btree_keys_init(struct btree *b)
 {
        unsigned i;
 
        b->nsets                = 0;
        memset(&b->nr, 0, sizeof(b->nr));
-#ifdef CONFIG_BCACHEFS_DEBUG
-       b->expensive_debug_checks = expensive_debug_checks;
-#endif
+
        for (i = 0; i < MAX_BSETS; i++)
                b->set[i].data_offset = U16_MAX;
 
@@ -522,7 +520,7 @@ static void bch2_bset_verify_rw_aux_tree(struct btree *b,
        struct bkey_packed *k = btree_bkey_first(b, t);
        unsigned j = 0;
 
-       if (!btree_keys_expensive_checks(b))
+       if (!bch2_expensive_debug_checks)
                return;
 
        BUG_ON(bset_has_ro_aux_tree(t));
@@ -536,7 +534,7 @@ static void bch2_bset_verify_rw_aux_tree(struct btree *b,
        goto start;
        while (1) {
                if (rw_aux_to_bkey(b, t, j) == k) {
-                       BUG_ON(bkey_cmp(rw_aux_tree(b, t)[j].k,
+                       BUG_ON(bpos_cmp(rw_aux_tree(b, t)[j].k,
                                        bkey_unpack_pos(b, k)));
 start:
                        if (++j == t->size)
@@ -546,7 +544,7 @@ start:
                               rw_aux_tree(b, t)[j - 1].offset);
                }
 
-               k = bkey_next_skip_noops(k, btree_bkey_last(b, t));
+               k = bkey_next(k);
                BUG_ON(k >= btree_bkey_last(b, t));
        }
 }
@@ -606,53 +604,23 @@ static inline unsigned bkey_mantissa(const struct bkey_packed *k,
        return (u16) v;
 }
 
-static void make_bfloat(struct btree *b, struct bset_tree *t,
-                       unsigned j,
-                       struct bkey_packed *min_key,
-                       struct bkey_packed *max_key)
+__always_inline
+static inline void __make_bfloat(struct btree *b, struct bset_tree *t,
+                                unsigned j,
+                                struct bkey_packed *min_key,
+                                struct bkey_packed *max_key)
 {
        struct bkey_float *f = bkey_float(b, t, j);
        struct bkey_packed *m = tree_to_bkey(b, t, j);
-       struct bkey_packed *l, *r;
+       struct bkey_packed *l = is_power_of_2(j)
+               ? min_key
+               : tree_to_prev_bkey(b, t, j >> ffs(j));
+       struct bkey_packed *r = is_power_of_2(j + 1)
+               ? max_key
+               : tree_to_bkey(b, t, j >> (ffz(j) + 1));
        unsigned mantissa;
        int shift, exponent, high_bit;
 
-       if (is_power_of_2(j)) {
-               l = min_key;
-
-               if (!l->u64s) {
-                       if (!bkey_pack_pos(l, b->data->min_key, b)) {
-                               struct bkey_i tmp;
-
-                               bkey_init(&tmp.k);
-                               tmp.k.p = b->data->min_key;
-                               bkey_copy(l, &tmp);
-                       }
-               }
-       } else {
-               l = tree_to_prev_bkey(b, t, j >> ffs(j));
-
-               EBUG_ON(m < l);
-       }
-
-       if (is_power_of_2(j + 1)) {
-               r = max_key;
-
-               if (!r->u64s) {
-                       if (!bkey_pack_pos(r, t->max_key, b)) {
-                               struct bkey_i tmp;
-
-                               bkey_init(&tmp.k);
-                               tmp.k.p = t->max_key;
-                               bkey_copy(r, &tmp);
-                       }
-               }
-       } else {
-               r = tree_to_bkey(b, t, j >> (ffz(j) + 1));
-
-               EBUG_ON(m > r);
-       }
-
        /*
         * for failed bfloats, the lookup code falls back to comparing against
         * the original key.
@@ -709,26 +677,54 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
        f->mantissa = mantissa;
 }
 
+static void make_bfloat(struct btree *b, struct bset_tree *t,
+                       unsigned j,
+                       struct bkey_packed *min_key,
+                       struct bkey_packed *max_key)
+{
+       struct bkey_i *k;
+
+       if (is_power_of_2(j) &&
+           !min_key->u64s) {
+               if (!bkey_pack_pos(min_key, b->data->min_key, b)) {
+                       k = (void *) min_key;
+                       bkey_init(&k->k);
+                       k->k.p = b->data->min_key;
+               }
+       }
+
+       if (is_power_of_2(j + 1) &&
+           !max_key->u64s) {
+               if (!bkey_pack_pos(max_key, b->data->max_key, b)) {
+                       k = (void *) max_key;
+                       bkey_init(&k->k);
+                       k->k.p = b->data->max_key;
+               }
+       }
+
+       __make_bfloat(b, t, j, min_key, max_key);
+}
+
 /* bytes remaining - only valid for last bset: */
-static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t)
+static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t)
 {
        bset_aux_tree_verify(b);
 
        return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64);
 }
 
-static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t)
+static unsigned bset_ro_tree_capacity(const struct btree *b, const struct bset_tree *t)
 {
        return __bset_tree_capacity(b, t) /
                (sizeof(struct bkey_float) + sizeof(u8));
 }
 
-static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t)
+static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_tree *t)
 {
        return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree);
 }
 
-static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
+static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
 {
        struct bkey_packed *k;
 
@@ -747,15 +743,12 @@ static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
        }
 }
 
-static void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
+static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
 {
        struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
-       struct bkey_packed min_key, max_key;
+       struct bkey_i min_key, max_key;
        unsigned j, cacheline = 1;
 
-       /* signal to make_bfloat() that they're uninitialized: */
-       min_key.u64s = max_key.u64s = 0;
-
        t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
                      bset_ro_tree_capacity(b, t));
 retry:
@@ -770,7 +763,7 @@ retry:
        /* First we figure out where the first key in each cacheline is */
        eytzinger1_for_each(j, t->size) {
                while (bkey_to_cacheline(b, t, k) < cacheline)
-                       prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t));
+                       prev = k, k = bkey_next(k);
 
                if (k >= btree_bkey_last(b, t)) {
                        /* XXX: this path sucks */
@@ -787,13 +780,23 @@ retry:
        }
 
        while (k != btree_bkey_last(b, t))
-               prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t));
+               prev = k, k = bkey_next(k);
+
+       if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) {
+               bkey_init(&min_key.k);
+               min_key.k.p = b->data->min_key;
+       }
 
-       t->max_key = bkey_unpack_pos(b, prev);
+       if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) {
+               bkey_init(&max_key.k);
+               max_key.k.p = b->data->max_key;
+       }
 
        /* Then we build the tree */
        eytzinger1_for_each(j, t->size)
-               make_bfloat(b, t, j, &min_key, &max_key);
+               __make_bfloat(b, t, j,
+                             bkey_to_packed(&min_key),
+                             bkey_to_packed(&max_key));
 }
 
 static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
@@ -915,21 +918,21 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
        struct bkey_packed *p, *i, *ret = NULL, *orig_k = k;
 
        while ((p = __bkey_prev(b, t, k)) && !ret) {
-               for (i = p; i != k; i = bkey_next_skip_noops(i, k))
+               for (i = p; i != k; i = bkey_next(i))
                        if (i->type >= min_key_type)
                                ret = i;
 
                k = p;
        }
 
-       if (btree_keys_expensive_checks(b)) {
+       if (bch2_expensive_debug_checks) {
                BUG_ON(ret >= orig_k);
 
                for (i = ret
-                       ? bkey_next_skip_noops(ret, orig_k)
+                       ? bkey_next(ret)
                        : btree_bkey_first(b, t);
                     i != orig_k;
-                    i = bkey_next_skip_noops(i, orig_k))
+                    i = bkey_next(i))
                        BUG_ON(i->type >= min_key_type);
        }
 
@@ -964,9 +967,7 @@ static void ro_aux_tree_fix_invalidated_key(struct btree *b,
        /* signal to make_bfloat() that they're uninitialized: */
        min_key.u64s = max_key.u64s = 0;
 
-       if (bkey_next_skip_noops(k, btree_bkey_last(b, t)) == btree_bkey_last(b, t)) {
-               t->max_key = bkey_unpack_pos(b, k);
-
+       if (bkey_next(k) == btree_bkey_last(b, t)) {
                for (j = 1; j < t->size; j = j * 2 + 1)
                        make_bfloat(b, t, j, &min_key, &max_key);
        }
@@ -1088,7 +1089,7 @@ static void bch2_bset_fix_lookup_table(struct btree *b,
                struct bkey_packed *k = start;
 
                while (1) {
-                       k = bkey_next_skip_noops(k, end);
+                       k = bkey_next(k);
                        if (k == end)
                                break;
 
@@ -1124,7 +1125,7 @@ void bch2_bset_insert(struct btree *b,
        if (bch2_bkey_pack_key(&packed, &insert->k, f))
                src = &packed;
 
-       if (!bkey_whiteout(&insert->k))
+       if (!bkey_deleted(&insert->k))
                btree_keys_account_key_add(&b->nr, t - b->set, src);
 
        if (src->u64s != clobber_u64s) {
@@ -1174,15 +1175,14 @@ void bch2_bset_delete(struct btree *b,
 __flatten
 static struct bkey_packed *bset_search_write_set(const struct btree *b,
                                struct bset_tree *t,
-                               struct bpos *search,
-                               const struct bkey_packed *packed_search)
+                               struct bpos *search)
 {
        unsigned l = 0, r = t->size;
 
        while (l + 1 != r) {
                unsigned m = (l + r) >> 1;
 
-               if (bkey_cmp(rw_aux_tree(b, t)[m].k, *search) < 0)
+               if (bpos_cmp(rw_aux_tree(b, t)[m].k, *search) < 0)
                        l = m;
                else
                        r = m;
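
(bset_search_write_set() above is a plain binary search over the rw aux tree: the loop keeps the invariant "entry l compares below the search key, entry r does not", and the function then starts its linear scan from slot l. Below is a minimal user-space sketch of the same loop over a sorted int array; it is not part of the patch and search_last_below() is an illustrative name.)

/*
 * Minimal sketch of the binary-search loop in bset_search_write_set():
 * returns 0 or the last index whose element compares below the key, which
 * is where the caller starts its linear scan.
 */
#include <stdio.h>

static unsigned search_last_below(const int *a, unsigned size, int key)
{
	unsigned l = 0, r = size;

	while (l + 1 != r) {
		unsigned m = (l + r) >> 1;

		if (a[m] < key)
			l = m;
		else
			r = m;
	}
	return l;
}

int main(void)
{
	int a[] = { 1, 3, 5, 7, 9, 11 };

	printf("%u\n", search_last_below(a, 6, 8));	/* 3: a[3] == 7 < 8 */
	printf("%u\n", search_last_below(a, 6, 1));	/* 0 */
	return 0;
}
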
@@ -1227,8 +1227,8 @@ static inline bool bkey_mantissa_bits_dropped(const struct btree *b,
 
 __flatten
 static struct bkey_packed *bset_search_tree(const struct btree *b,
-                               struct bset_tree *t,
-                               struct bpos *search,
+                               const struct bset_tree *t,
+                               const struct bpos *search,
                                const struct bkey_packed *packed_search)
 {
        struct ro_aux_tree *base = ro_aux_tree_base(b, t);
@@ -1242,9 +1242,6 @@ static struct bkey_packed *bset_search_tree(const struct btree *b,
                        prefetch(&base->f[n << 4]);
 
                f = &base->f[n];
-
-               if (!unlikely(packed_search))
-                       goto slowpath;
                if (unlikely(f->exponent >= BFLOAT_FAILED))
                        goto slowpath;
 
@@ -1308,18 +1305,8 @@ struct bkey_packed *__bch2_bset_search(struct btree *b,
        case BSET_NO_AUX_TREE:
                return btree_bkey_first(b, t);
        case BSET_RW_AUX_TREE:
-               return bset_search_write_set(b, t, search, lossy_packed_search);
+               return bset_search_write_set(b, t, search);
        case BSET_RO_AUX_TREE:
-               /*
-                * Each node in the auxiliary search tree covers a certain range
-                * of bits, and keys above and below the set it covers might
-                * differ outside those bits - so we have to special case the
-                * start and end - handle that here:
-                */
-
-               if (bkey_cmp(*search, t->max_key) > 0)
-                       return btree_bkey_last(b, t);
-
                return bset_search_tree(b, t, search, lossy_packed_search);
        default:
                unreachable();
@@ -1338,14 +1325,14 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b,
                while (m != btree_bkey_last(b, t) &&
                       bkey_iter_cmp_p_or_unp(b, m,
                                        lossy_packed_search, search) < 0)
-                       m = bkey_next_skip_noops(m, btree_bkey_last(b, t));
+                       m = bkey_next(m);
 
        if (!packed_search)
                while (m != btree_bkey_last(b, t) &&
                       bkey_iter_pos_cmp(b, m, search) < 0)
-                       m = bkey_next_skip_noops(m, btree_bkey_last(b, t));
+                       m = bkey_next(m);
 
-       if (btree_keys_expensive_checks(b)) {
+       if (bch2_expensive_debug_checks) {
                struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
 
                BUG_ON(prev &&
@@ -1356,23 +1343,6 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b,
        return m;
 }
 
-/*
- * Returns the first key greater than or equal to @search
- */
-static __always_inline __flatten
-struct bkey_packed *bch2_bset_search(struct btree *b,
-                               struct bset_tree *t,
-                               struct bpos *search,
-                               struct bkey_packed *packed_search,
-                               const struct bkey_packed *lossy_packed_search)
-{
-       struct bkey_packed *m = __bch2_bset_search(b, t, search,
-                                                  lossy_packed_search);
-
-       return bch2_bset_search_linear(b, t, search,
-                                packed_search, lossy_packed_search, m);
-}
-
 /* Btree node iterator */
 
 static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter,
@@ -1407,16 +1377,15 @@ noinline __flatten __attribute__((cold))
 static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
                              struct btree *b, struct bpos *search)
 {
-       struct bset_tree *t;
+       struct bkey_packed *k;
 
        trace_bkey_pack_pos_fail(search);
 
-       for_each_bset(b, t)
-               __bch2_btree_node_iter_push(iter, b,
-                       bch2_bset_search(b, t, search, NULL, NULL),
-                       btree_bkey_last(b, t));
+       bch2_btree_node_iter_init_from_start(iter, b);
 
-       bch2_btree_node_iter_sort(iter, b);
+       while ((k = bch2_btree_node_iter_peek(iter, b)) &&
+              bkey_iter_pos_cmp(b, k, search) < 0)
+               bch2_btree_node_iter_advance(iter, b);
 }
 
 /**
@@ -1450,7 +1419,7 @@ static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
  *    to the search key is going to have 0 sectors after the search key.
  *
  *    But this does mean that we can't just search for
- *    bkey_successor(start_of_range) to get the first extent that overlaps with
+ *    bpos_successor(start_of_range) to get the first extent that overlaps with
  *    the range we want - if we're unlucky and there's an extent that ends
  *    exactly where we searched, then there could be a deleted key at the same
  *    position and we'd get that when we search instead of the preceding extent
@@ -1468,7 +1437,8 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter,
        struct bkey_packed *k[MAX_BSETS];
        unsigned i;
 
-       EBUG_ON(bkey_cmp(*search, b->data->min_key) < 0);
+       EBUG_ON(bpos_cmp(*search, b->data->min_key) < 0);
+       EBUG_ON(bpos_cmp(*search, b->data->max_key) > 0);
        bset_aux_tree_verify(b);
 
        memset(iter, 0, sizeof(*iter));
@@ -1601,7 +1571,7 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter,
 void bch2_btree_node_iter_advance(struct btree_node_iter *iter,
                                  struct btree *b)
 {
-       if (btree_keys_expensive_checks(b)) {
+       if (bch2_expensive_debug_checks) {
                bch2_btree_node_iter_verify(iter, b);
                bch2_btree_node_iter_next_check(iter, b);
        }
@@ -1620,7 +1590,7 @@ struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter,
        struct bset_tree *t;
        unsigned end = 0;
 
-       if (btree_keys_expensive_checks(b))
+       if (bch2_expensive_debug_checks)
                bch2_btree_node_iter_verify(iter, b);
 
        for_each_bset(b, t) {
@@ -1656,20 +1626,19 @@ found:
        iter->data[0].k = __btree_node_key_to_offset(b, prev);
        iter->data[0].end = end;
 
-       if (btree_keys_expensive_checks(b))
+       if (bch2_expensive_debug_checks)
                bch2_btree_node_iter_verify(iter, b);
        return prev;
 }
 
-struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *iter,
-                                                    struct btree *b,
-                                                    unsigned min_key_type)
+struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *iter,
+                                             struct btree *b)
 {
        struct bkey_packed *prev;
 
        do {
                prev = bch2_btree_node_iter_prev_all(iter, b);
-       } while (prev && prev->type < min_key_type);
+       } while (prev && bkey_deleted(prev));
 
        return prev;
 }
@@ -1734,9 +1703,10 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b,
                uk = bkey_unpack_key(b, k);
                pr_buf(out,
                       "    failed unpacked at depth %u\n"
-                      "\t%llu:%llu\n",
-                      ilog2(j),
-                      uk.p.inode, uk.p.offset);
+                      "\t",
+                      ilog2(j));
+               bch2_bpos_to_text(out, uk.p);
+               pr_buf(out, "\n");
                break;
        }
 }
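
(Several hunks in this commit replace hard-coded "%llu:%llu" position printing with bch2_bpos_to_text() on a printbuf, as in bch2_bfloat_to_text() above. Below is a rough user-space analogue of that append pattern; it is not part of the patch, and the struct layout and the *_demo names are illustrative assumptions, not the bcachefs definitions.)

/*
 * Rough user-space analogue (not the bcachefs implementation) of the
 * printbuf append pattern: a fixed buffer plus a write position, each
 * helper appending formatted text at the current position.
 */
#include <stdio.h>
#include <stdarg.h>
#include <stddef.h>

struct printbuf_demo {
	char	*buf;
	size_t	size;
	size_t	pos;
};

static void pr_buf_demo(struct printbuf_demo *out, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	if (out->pos < out->size)
		out->pos += vsnprintf(out->buf + out->pos,
				      out->size - out->pos, fmt, args);
	va_end(args);
}

struct bpos_demo { unsigned long long inode, offset; };

static void bpos_to_text_demo(struct printbuf_demo *out, struct bpos_demo p)
{
	pr_buf_demo(out, "%llu:%llu", p.inode, p.offset);
}

int main(void)
{
	char buf[80];
	struct printbuf_demo out = { buf, sizeof(buf), 0 };
	struct bpos_demo pos = { 4096, 8 };

	pr_buf_demo(&out, "failed at depth %u\n\t", 2);
	bpos_to_text_demo(&out, pos);
	printf("%s\n", out.buf);
	return 0;
}
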
index 5921cf68910578c94d6db0b07c69dcdcbe3fef8a..506da4e0c91127295554f3432cfdd8782de24e2a 100644
--- a/libbcachefs/bset.h
+++ b/libbcachefs/bset.h
@@ -5,7 +5,7 @@
 #include <linux/kernel.h>
 #include <linux/types.h>
 
-#include "bcachefs_format.h"
+#include "bcachefs.h"
 #include "bkey.h"
 #include "bkey_methods.h"
 #include "btree_types.h"
  * first key in that range of bytes again.
  */
 
-extern bool bch2_expensive_debug_checks;
-
-static inline bool btree_keys_expensive_checks(const struct btree *b)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
-       return bch2_expensive_debug_checks || *b->expensive_debug_checks;
-#else
-       return false;
-#endif
-}
-
 enum bset_aux_tree_type {
        BSET_NO_AUX_TREE,
        BSET_RO_AUX_TREE,
@@ -201,17 +190,17 @@ static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree
 
 #define BSET_CACHELINE         128
 
-static inline size_t btree_keys_cachelines(struct btree *b)
+static inline size_t btree_keys_cachelines(const struct btree *b)
 {
        return (1U << b->byte_order) / BSET_CACHELINE;
 }
 
-static inline size_t btree_aux_data_bytes(struct btree *b)
+static inline size_t btree_aux_data_bytes(const struct btree *b)
 {
        return btree_keys_cachelines(b) * 8;
 }
 
-static inline size_t btree_aux_data_u64s(struct btree *b)
+static inline size_t btree_aux_data_u64s(const struct btree *b)
 {
        return btree_aux_data_bytes(b) / sizeof(u64);
 }
@@ -228,7 +217,7 @@ __bkey_unpack_key_format_checked(const struct btree *b,
                compiled_unpack_fn unpack_fn = b->aux_data;
                unpack_fn(dst, src);
 
-               if (btree_keys_expensive_checks(b)) {
+               if (bch2_expensive_debug_checks) {
                        struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
 
                        BUG_ON(memcmp(dst, &dst2, sizeof(*dst)));
@@ -316,7 +305,7 @@ static inline struct bkey_s __bkey_disassemble(struct btree *b,
 #define bset_tree_for_each_key(_b, _t, _k)                             \
        for (_k = btree_bkey_first(_b, _t);                             \
             _k != btree_bkey_last(_b, _t);                             \
-            _k = bkey_next_skip_noops(_k, btree_bkey_last(_b, _t)))
+            _k = bkey_next(_k))
 
 static inline bool bset_has_ro_aux_tree(struct bset_tree *t)
 {
@@ -366,7 +355,7 @@ static inline struct bset *bset_next_set(struct btree *b,
        return ((void *) i) + round_up(vstruct_bytes(i), block_bytes);
 }
 
-void bch2_btree_keys_init(struct btree *, bool *);
+void bch2_btree_keys_init(struct btree *);
 
 void bch2_bset_init_first(struct btree *, struct bset *);
 void bch2_bset_init_next(struct bch_fs *, struct btree *,
@@ -389,7 +378,7 @@ static inline int bkey_cmp_p_or_unp(const struct btree *b,
        EBUG_ON(r_packed && !bkey_packed(r_packed));
 
        if (unlikely(!bkey_packed(l)))
-               return bkey_cmp(packed_to_bkey_c(l)->p, *r);
+               return bpos_cmp(packed_to_bkey_c(l)->p, *r);
 
        if (likely(r_packed))
                return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b);
@@ -411,25 +400,7 @@ bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
 static inline struct bkey_packed *
 bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
 {
-       return bch2_bkey_prev_filter(b, t, k, KEY_TYPE_discard + 1);
-}
-
-enum bch_extent_overlap {
-       BCH_EXTENT_OVERLAP_ALL          = 0,
-       BCH_EXTENT_OVERLAP_BACK         = 1,
-       BCH_EXTENT_OVERLAP_FRONT        = 2,
-       BCH_EXTENT_OVERLAP_MIDDLE       = 3,
-};
-
-/* Returns how k overlaps with m */
-static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
-                                                         const struct bkey *m)
-{
-       int cmp1 = bkey_cmp(k->p, m->p) < 0;
-       int cmp2 = bkey_cmp(bkey_start_pos(k),
-                           bkey_start_pos(m)) > 0;
-
-       return (cmp1 << 1) + cmp2;
+       return bch2_bkey_prev_filter(b, t, k, 1);
 }
 
 /* Btree key iteration */
@@ -477,7 +448,7 @@ static inline int bkey_iter_cmp(const struct btree *b,
                                const struct bkey_packed *l,
                                const struct bkey_packed *r)
 {
-       return bkey_cmp_packed(b, l, r)
+       return bch2_bkey_cmp_packed(b, l, r)
                ?: (int) bkey_deleted(r) - (int) bkey_deleted(l)
                ?: cmp_int(l, r);
 }
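
(bkey_iter_cmp() above relies on the GNU "x ?: y" extension to chain comparisons, so each later term only breaks ties left by the earlier ones. A tiny stand-alone illustration of the idiom follows; it is not part of the patch, and struct item, its fields and the local cmp_int macro are made up for the demo.)

/*
 * Stand-alone illustration of the GNU "x ?: y" idiom used by bkey_iter_cmp():
 * "x ?: y" evaluates to x if x is non-zero, else to y, so each comparison
 * below is only a tie-breaker for the one before it.
 */
#include <stdio.h>

#define cmp_int(a, b)	((a) > (b) ? 1 : (a) < (b) ? -1 : 0)

struct item {
	int pos;	/* primary key */
	int deleted;	/* at equal pos, deleted items sort first here */
	int seq;	/* final tie-breaker */
};

static int item_cmp(const struct item *l, const struct item *r)
{
	return cmp_int(l->pos, r->pos)
		?: (r->deleted - l->deleted)
		?: cmp_int(l->seq, r->seq);
}

int main(void)
{
	struct item a = { 5, 1, 0 }, b = { 5, 0, 1 };

	/* equal pos, a deleted: a compares below b */
	printf("%d\n", item_cmp(&a, &b));	/* -1 */
	return 0;
}
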
@@ -517,33 +488,23 @@ __bch2_btree_node_iter_peek_all(struct btree_node_iter *iter,
 }
 
 static inline struct bkey_packed *
-bch2_btree_node_iter_peek_filter(struct btree_node_iter *iter,
-                                struct btree *b,
-                                unsigned min_key_type)
+bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, struct btree *b)
 {
-       while (!bch2_btree_node_iter_end(iter)) {
-               struct bkey_packed *k = __bch2_btree_node_iter_peek_all(iter, b);
-
-               if (k->type >= min_key_type)
-                       return k;
-
-               bch2_btree_node_iter_advance(iter, b);
-       }
-
-       return NULL;
-}
-
-static inline struct bkey_packed *
-bch2_btree_node_iter_peek_all(struct btree_node_iter *iter,
-                             struct btree *b)
-{
-       return bch2_btree_node_iter_peek_filter(iter, b, 0);
+       return !bch2_btree_node_iter_end(iter)
+               ? __btree_node_offset_to_key(b, iter->data->k)
+               : NULL;
 }
 
 static inline struct bkey_packed *
 bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b)
 {
-       return bch2_btree_node_iter_peek_filter(iter, b, KEY_TYPE_discard + 1);
+       struct bkey_packed *k;
+
+       while ((k = bch2_btree_node_iter_peek_all(iter, b)) &&
+              bkey_deleted(k))
+               bch2_btree_node_iter_advance(iter, b);
+
+       return k;
 }
 
 static inline struct bkey_packed *
@@ -559,14 +520,8 @@ bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b)
 
 struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *,
                                                  struct btree *);
-struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *,
-                                                    struct btree *, unsigned);
-
-static inline struct bkey_packed *
-bch2_btree_node_iter_prev(struct btree_node_iter *iter, struct btree *b)
-{
-       return bch2_btree_node_iter_prev_filter(iter, b, KEY_TYPE_discard + 1);
-}
+struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *,
+                                             struct btree *);
 
 struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *,
                                                struct btree *,
@@ -654,7 +609,7 @@ static inline void bch2_verify_insert_pos(struct btree *b,
 
 static inline void bch2_verify_btree_nr_keys(struct btree *b)
 {
-       if (btree_keys_expensive_checks(b))
+       if (bch2_debug_check_btree_accounting)
                __bch2_verify_btree_nr_keys(b);
 }
 
index bb94fa2341eea839eca31128f1245b190d8244f8..1abc50f134e6fe4e2797f1c7445b6cb7ed85c0f6 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -1,23 +1,18 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "bcachefs.h"
+#include "bkey_buf.h"
 #include "btree_cache.h"
 #include "btree_io.h"
 #include "btree_iter.h"
 #include "btree_locking.h"
 #include "debug.h"
+#include "error.h"
 
 #include <linux/prefetch.h>
 #include <linux/sched/mm.h>
 #include <trace/events/bcachefs.h>
 
-const char * const bch2_btree_ids[] = {
-#define x(kwd, val, name) name,
-       BCH_BTREE_IDS()
-#undef x
-       NULL
-};
-
 void bch2_recalc_btree_reserve(struct bch_fs *c)
 {
        unsigned i, reserve = 16;
@@ -151,6 +146,11 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
        b->c.level      = level;
        b->c.btree_id   = id;
 
+       if (level)
+               six_lock_pcpu_alloc(&b->c.lock);
+       else
+               six_lock_pcpu_free_rcu(&b->c.lock);
+
        mutex_lock(&bc->lock);
        ret = __bch2_btree_node_hash_insert(bc, b);
        if (!ret)
@@ -211,7 +211,7 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
                 * - unless btree verify mode is enabled, since it runs out of
                 * the post write cleanup:
                 */
-               if (verify_btree_ondisk(c))
+               if (bch2_verify_btree_ondisk)
                        bch2_btree_node_write(c, b, SIX_LOCK_intent);
                else
                        __bch2_btree_node_write(c, b, SIX_LOCK_read);
@@ -254,7 +254,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
        unsigned long freed = 0;
        unsigned i, flags;
 
-       if (btree_shrinker_disabled(c))
+       if (bch2_btree_shrinker_disabled)
                return SHRINK_STOP;
 
        /* Return -1 if we can't do anything right now */
@@ -328,9 +328,9 @@ restart:
                        clear_btree_node_accessed(b);
        }
 
-       memalloc_nofs_restore(flags);
        mutex_unlock(&bc->lock);
 out:
+       memalloc_nofs_restore(flags);
        return (unsigned long) freed * btree_pages(c);
 }
 
@@ -341,7 +341,7 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
                                        btree_cache.shrink);
        struct btree_cache *bc = &c->btree_cache;
 
-       if (btree_shrinker_disabled(c))
+       if (bch2_btree_shrinker_disabled)
                return 0;
 
        return btree_cache_can_free(bc) * btree_pages(c);
@@ -381,14 +381,17 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
 
                if (btree_node_dirty(b))
                        bch2_btree_complete_write(c, b, btree_current_write(b));
-               clear_btree_node_dirty(b);
+               clear_btree_node_dirty(c, b);
 
                btree_node_data_free(c, b);
        }
 
+       BUG_ON(atomic_read(&c->btree_cache.dirty));
+
        while (!list_empty(&bc->freed)) {
                b = list_first_entry(&bc->freed, struct btree, list);
                list_del(&b->list);
+               six_lock_pcpu_free(&b->c.lock);
                kfree(b);
        }
 
@@ -445,7 +448,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
        bc->shrink.scan_objects         = bch2_btree_cache_scan;
        bc->shrink.seeks                = 4;
        bc->shrink.batch                = btree_pages(c) * 2;
-       register_shrinker(&bc->shrink);
+       ret = register_shrinker(&bc->shrink);
 out:
        pr_verbose_init(c->opts, "ret %i", ret);
        return ret;
@@ -590,7 +593,7 @@ out:
        b->sib_u64s[0]          = 0;
        b->sib_u64s[1]          = 0;
        b->whiteout_u64s        = 0;
-       bch2_btree_keys_init(b, &c->expensive_debug_checks);
+       bch2_btree_keys_init(b);
 
        bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
                               start_time);
@@ -705,7 +708,8 @@ static int lock_node_check_fn(struct six_lock *lock, void *p)
  */
 struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter,
                                  const struct bkey_i *k, unsigned level,
-                                 enum six_lock_type lock_type)
+                                 enum six_lock_type lock_type,
+                                 unsigned long trace_ip)
 {
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
@@ -767,7 +771,7 @@ lock_node:
                        btree_node_unlock(iter, level + 1);
 
                if (!btree_node_lock(b, k->k.p, level, iter, lock_type,
-                                    lock_node_check_fn, (void *) k)) {
+                                    lock_node_check_fn, (void *) k, trace_ip)) {
                        if (b->hash_val != btree_ptr_hash_val(k))
                                goto retry;
                        return ERR_PTR(-EINTR);
@@ -808,9 +812,12 @@ lock_node:
                return ERR_PTR(-EIO);
        }
 
-       EBUG_ON(b->c.btree_id != iter->btree_id ||
-               BTREE_NODE_LEVEL(b->data) != level ||
-               bkey_cmp(b->data->max_key, k->k.p));
+       EBUG_ON(b->c.btree_id != iter->btree_id);
+       EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
+       EBUG_ON(bpos_cmp(b->data->max_key, k->k.p));
+       EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+               bpos_cmp(b->data->min_key,
+                        bkey_i_to_btree_ptr_v2(&b->key)->v.min_key));
 
        return b;
 }
@@ -818,7 +825,8 @@ lock_node:
 struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
                                         const struct bkey_i *k,
                                         enum btree_id btree_id,
-                                        unsigned level)
+                                        unsigned level,
+                                        bool nofill)
 {
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
@@ -833,6 +841,9 @@ struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
 retry:
        b = btree_cache_find(bc, k);
        if (unlikely(!b)) {
+               if (nofill)
+                       goto out;
+
                b = bch2_btree_node_fill(c, NULL, k, btree_id,
                                         level, SIX_LOCK_read, true);
 
@@ -840,8 +851,12 @@ retry:
                if (!b)
                        goto retry;
 
+               if (IS_ERR(b) &&
+                   !bch2_btree_cache_cannibalize_lock(c, NULL))
+                       goto retry;
+
                if (IS_ERR(b))
-                       return b;
+                       goto out;
        } else {
 lock_node:
                ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k);
@@ -876,143 +891,36 @@ lock_node:
 
        if (unlikely(btree_node_read_error(b))) {
                six_unlock_read(&b->c.lock);
-               return ERR_PTR(-EIO);
-       }
-
-       EBUG_ON(b->c.btree_id != btree_id ||
-               BTREE_NODE_LEVEL(b->data) != level ||
-               bkey_cmp(b->data->max_key, k->k.p));
-
-       return b;
-}
-
-struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
-                                         struct btree_iter *iter,
-                                         struct btree *b,
-                                         enum btree_node_sibling sib)
-{
-       struct btree_trans *trans = iter->trans;
-       struct btree *parent;
-       struct btree_node_iter node_iter;
-       struct bkey_packed *k;
-       BKEY_PADDED(k) tmp;
-       struct btree *ret = NULL;
-       unsigned level = b->c.level;
-
-       parent = btree_iter_node(iter, level + 1);
-       if (!parent)
-               return NULL;
-
-       /*
-        * There's a corner case where a btree_iter might have a node locked
-        * that is just outside its current pos - when
-        * bch2_btree_iter_set_pos_same_leaf() gets to the end of the node.
-        *
-        * But the lock ordering checks in __bch2_btree_node_lock() go off of
-        * iter->pos, not the node's key: so if the iterator is marked as
-        * needing to be traversed, we risk deadlock if we don't bail out here:
-        */
-       if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE)
-               return ERR_PTR(-EINTR);
-
-       if (!bch2_btree_node_relock(iter, level + 1)) {
-               ret = ERR_PTR(-EINTR);
+               b = ERR_PTR(-EIO);
                goto out;
        }
 
-       node_iter = iter->l[parent->c.level].iter;
-
-       k = bch2_btree_node_iter_peek_all(&node_iter, parent);
-       BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p));
-
-       k = sib == btree_prev_sib
-               ? bch2_btree_node_iter_prev(&node_iter, parent)
-               : (bch2_btree_node_iter_advance(&node_iter, parent),
-                  bch2_btree_node_iter_peek(&node_iter, parent));
-       if (!k)
-               goto out;
-
-       bch2_bkey_unpack(parent, &tmp.k, k);
-
-       ret = bch2_btree_node_get(c, iter, &tmp.k, level,
-                                 SIX_LOCK_intent);
-
-       if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) {
-               struct btree_iter *linked;
-
-               if (!bch2_btree_node_relock(iter, level + 1))
-                       goto out;
-
-               /*
-                * We might have got -EINTR because trylock failed, and we're
-                * holding other locks that would cause us to deadlock:
-                */
-               trans_for_each_iter(trans, linked)
-                       if (btree_iter_cmp(iter, linked) < 0)
-                               __bch2_btree_iter_unlock(linked);
-
-               if (sib == btree_prev_sib)
-                       btree_node_unlock(iter, level);
-
-               ret = bch2_btree_node_get(c, iter, &tmp.k, level,
-                                         SIX_LOCK_intent);
-
-               /*
-                * before btree_iter_relock() calls btree_iter_verify_locks():
-                */
-               if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
-                       btree_node_unlock(iter, level + 1);
-
-               if (!bch2_btree_node_relock(iter, level)) {
-                       btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
-
-                       if (!IS_ERR(ret)) {
-                               six_unlock_intent(&ret->c.lock);
-                               ret = ERR_PTR(-EINTR);
-                       }
-               }
-
-               bch2_trans_relock(trans);
-       }
+       EBUG_ON(b->c.btree_id != btree_id);
+       EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
+       EBUG_ON(bpos_cmp(b->data->max_key, k->k.p));
+       EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+               bpos_cmp(b->data->min_key,
+                        bkey_i_to_btree_ptr_v2(&b->key)->v.min_key));
 out:
-       if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
-               btree_node_unlock(iter, level + 1);
-
-       if (PTR_ERR_OR_ZERO(ret) == -EINTR)
-               bch2_btree_iter_upgrade(iter, level + 2);
-
-       BUG_ON(!IS_ERR(ret) && !btree_node_locked(iter, level));
-
-       if (!IS_ERR_OR_NULL(ret)) {
-               struct btree *n1 = ret, *n2 = b;
-
-               if (sib != btree_prev_sib)
-                       swap(n1, n2);
-
-               BUG_ON(bkey_cmp(bkey_successor(n1->key.k.p),
-                               n2->data->min_key));
-       }
-
-       bch2_btree_trans_verify_locks(trans);
-
-       return ret;
+       bch2_btree_cache_cannibalize_unlock(c);
+       return b;
 }
 
 void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter,
-                             const struct bkey_i *k, unsigned level)
+                             const struct bkey_i *k,
+                             enum btree_id btree_id, unsigned level)
 {
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
 
-       BUG_ON(!btree_node_locked(iter, level + 1));
+       BUG_ON(iter && !btree_node_locked(iter, level + 1));
        BUG_ON(level >= BTREE_MAX_DEPTH);
 
        b = btree_cache_find(bc, k);
        if (b)
                return;
 
-       bch2_btree_node_fill(c, iter, k, iter->btree_id,
-                            level, SIX_LOCK_read, false);
+       bch2_btree_node_fill(c, iter, k, btree_id, level, SIX_LOCK_read, false);
 }
 
 void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
@@ -1025,20 +933,19 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
 
        bch2_btree_keys_stats(b, &stats);
 
-       pr_buf(out,
-              "l %u %llu:%llu - %llu:%llu:\n"
-              "    ptrs: ",
-              b->c.level,
-              b->data->min_key.inode,
-              b->data->min_key.offset,
-              b->data->max_key.inode,
-              b->data->max_key.offset);
+       pr_buf(out, "l %u ", b->c.level);
+       bch2_bpos_to_text(out, b->data->min_key);
+       pr_buf(out, " - ");
+       bch2_bpos_to_text(out, b->data->max_key);
+       pr_buf(out, ":\n"
+              "    ptrs: ");
        bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+
        pr_buf(out, "\n"
               "    format: u64s %u fields %u %u %u %u %u\n"
               "    unpack fn len: %u\n"
               "    bytes used %zu/%zu (%zu%% full)\n"
-              "    sib u64s: %u, %u (merge threshold %zu)\n"
+              "    sib u64s: %u, %u (merge threshold %u)\n"
               "    nr packed keys %u\n"
               "    nr unpacked keys %u\n"
               "    floats %zu\n"
@@ -1055,9 +962,16 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
               b->nr.live_u64s * 100 / btree_max_u64s(c),
               b->sib_u64s[0],
               b->sib_u64s[1],
-              BTREE_FOREGROUND_MERGE_THRESHOLD(c),
+              c->btree_foreground_merge_threshold,
               b->nr.packed_keys,
               b->nr.unpacked_keys,
               stats.floats,
               stats.failed);
 }
+
+void bch2_btree_cache_to_text(struct printbuf *out, struct bch_fs *c)
+{
+       pr_buf(out, "nr nodes:\t\t%u\n", c->btree_cache.used);
+       pr_buf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty));
+       pr_buf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock);
+}
index d0d3a85bb8be6354c5cb6218e1e3a69116586aa4..4791c3b64452d915486a2a5754210fcc1098f0ab 100644
--- a/libbcachefs/btree_cache.h
+++ b/libbcachefs/btree_cache.h
@@ -7,8 +7,6 @@
 
 struct btree_iter;
 
-extern const char * const bch2_btree_ids[];
-
 void bch2_recalc_btree_reserve(struct bch_fs *);
 
 void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *);
@@ -23,16 +21,13 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *);
 
 struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
                                  const struct bkey_i *, unsigned,
-                                 enum six_lock_type);
+                                 enum six_lock_type, unsigned long);
 
 struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *,
-                                        enum btree_id, unsigned);
-
-struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *,
-                               struct btree *, enum btree_node_sibling);
+                                        enum btree_id, unsigned, bool);
 
 void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *,
-                             const struct bkey_i *, unsigned);
+                             const struct bkey_i *, enum btree_id, unsigned);
 
 void bch2_fs_btree_cache_exit(struct bch_fs *);
 int bch2_fs_btree_cache_init(struct bch_fs *);
@@ -94,11 +89,12 @@ static inline unsigned btree_blocks(struct bch_fs *c)
 #define BTREE_FOREGROUND_MERGE_THRESHOLD(c)    (btree_max_u64s(c) * 1 / 3)
 #define BTREE_FOREGROUND_MERGE_HYSTERESIS(c)                   \
        (BTREE_FOREGROUND_MERGE_THRESHOLD(c) +                  \
-        (BTREE_FOREGROUND_MERGE_THRESHOLD(c) << 2))
+        (BTREE_FOREGROUND_MERGE_THRESHOLD(c) >> 2))
 
 #define btree_node_root(_c, _b)        ((_c)->btree_roots[(_b)->c.btree_id].b)
 
 void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *,
                             struct btree *);
+void bch2_btree_cache_to_text(struct printbuf *, struct bch_fs *);
 
 #endif /* _BCACHEFS_BTREE_CACHE_H */
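
(The hysteresis change above swaps "<< 2" for ">> 2": the merge hysteresis becomes threshold + threshold/4, i.e. 1.25x the foreground merge threshold, where the old expression gave 5x. A quick check of the integer arithmetic, not part of the patch; the threshold value is made up.)

/* Quick check of the hysteresis arithmetic: old t + (t << 2) vs new t + (t >> 2). */
#include <stdio.h>

int main(void)
{
	unsigned t = 300;	/* hypothetical merge threshold, in u64s */

	printf("old: %u\n", t + (t << 2));	/* 1500 = 5.00 * t */
	printf("new: %u\n", t + (t >> 2));	/* 375  = 1.25 * t */
	return 0;
}
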
index e8c1e752a25d63ec7fc32c5982d3ad3e32bc1f9c..268e00729409030db9c3bc597656df534fbd9046 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -8,7 +8,7 @@
 #include "alloc_background.h"
 #include "alloc_foreground.h"
 #include "bkey_methods.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
 #include "btree_locking.h"
 #include "btree_update_interior.h"
 #include "btree_io.h"
@@ -50,39 +50,248 @@ static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
        __gc_pos_set(c, new_pos);
 }
 
+/*
+ * Missing: if an interior btree node is empty, we need to do something -
+ * perhaps just kill it
+ */
 static int bch2_gc_check_topology(struct bch_fs *c,
-                                 struct bkey_s_c k,
-                                 struct bpos *expected_start,
-                                 struct bpos expected_end,
+                                 struct btree *b,
+                                 struct bkey_buf *prev,
+                                 struct bkey_buf cur,
                                  bool is_last)
 {
+       struct bpos node_start  = b->data->min_key;
+       struct bpos node_end    = b->data->max_key;
+       struct bpos expected_start = bkey_deleted(&prev->k->k)
+               ? node_start
+               : bpos_successor(prev->k->k.p);
+       char buf1[200], buf2[200];
+       bool update_min = false;
+       bool update_max = false;
        int ret = 0;
 
-       if (k.k->type == KEY_TYPE_btree_ptr_v2) {
-               struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
+       if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) {
+               struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k);
 
-               if (fsck_err_on(bkey_cmp(*expected_start, bp.v->min_key), c,
-                               "btree node with incorrect min_key: got %llu:%llu, should be %llu:%llu",
-                               bp.v->min_key.inode,
-                               bp.v->min_key.offset,
-                               expected_start->inode,
-                               expected_start->offset)) {
-                       BUG();
+               if (bkey_deleted(&prev->k->k)) {
+                       struct printbuf out = PBUF(buf1);
+                       pr_buf(&out, "start of node: ");
+                       bch2_bpos_to_text(&out, node_start);
+               } else {
+                       bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k));
                }
-       }
 
-       *expected_start = bkey_cmp(k.k->p, POS_MAX)
-               ? bkey_successor(k.k->p)
-               : k.k->p;
+               if (fsck_err_on(bpos_cmp(expected_start, bp->v.min_key), c,
+                               "btree node with incorrect min_key at btree %s level %u:\n"
+                               "  prev %s\n"
+                               "  cur %s",
+                               bch2_btree_ids[b->c.btree_id], b->c.level,
+                               buf1,
+                               (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2)))
+                       update_min = true;
+       }
 
        if (fsck_err_on(is_last &&
-                       bkey_cmp(k.k->p, expected_end), c,
-                       "btree node with incorrect max_key: got %llu:%llu, should be %llu:%llu",
-                       k.k->p.inode,
-                       k.k->p.offset,
-                       expected_end.inode,
-                       expected_end.offset)) {
-               BUG();
+                       bpos_cmp(cur.k->k.p, node_end), c,
+                       "btree node with incorrect max_key at btree %s level %u:\n"
+                       "  %s\n"
+                       "  expected %s",
+                       bch2_btree_ids[b->c.btree_id], b->c.level,
+                       (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1),
+                       (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2)))
+               update_max = true;
+
+       bch2_bkey_buf_copy(prev, c, cur.k);
+
+       if (update_min || update_max) {
+               struct bkey_i *new;
+               struct bkey_i_btree_ptr_v2 *bp = NULL;
+               struct btree *n;
+
+               if (update_max) {
+                       ret = bch2_journal_key_delete(c, b->c.btree_id,
+                                                     b->c.level, cur.k->k.p);
+                       if (ret)
+                               return ret;
+               }
+
+               new = kmalloc(bkey_bytes(&cur.k->k), GFP_KERNEL);
+               if (!new) {
+                       bch_err(c, "%s: error allocating new key", __func__);
+                       return -ENOMEM;
+               }
+
+               bkey_copy(new, cur.k);
+
+               if (new->k.type == KEY_TYPE_btree_ptr_v2)
+                       bp = bkey_i_to_btree_ptr_v2(new);
+
+               if (update_min)
+                       bp->v.min_key = expected_start;
+               if (update_max)
+                       new->k.p = node_end;
+               if (bp)
+                       SET_BTREE_PTR_RANGE_UPDATED(&bp->v, true);
+
+               ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level, new);
+               if (ret) {
+                       kfree(new);
+                       return ret;
+               }
+
+               n = bch2_btree_node_get_noiter(c, cur.k, b->c.btree_id,
+                                              b->c.level - 1, true);
+               if (n) {
+                       mutex_lock(&c->btree_cache.lock);
+                       bch2_btree_node_hash_remove(&c->btree_cache, n);
+
+                       bkey_copy(&n->key, new);
+                       if (update_min)
+                               n->data->min_key = expected_start;
+                       if (update_max)
+                               n->data->max_key = node_end;
+
+                       ret = __bch2_btree_node_hash_insert(&c->btree_cache, n);
+                       BUG_ON(ret);
+                       mutex_unlock(&c->btree_cache.lock);
+                       six_unlock_read(&n->c.lock);
+               }
+       }
+fsck_err:
+       return ret;
+}
+
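
(bch2_gc_check_topology() above enforces that an interior node's children tile its key range: each child's min_key must equal the position successor of the previous child's max_key, and the last child's max_key must equal the node's max_key. A toy user-space sketch of that invariant over integer ranges follows; it is not part of the patch, and bcachefs compares struct bpos with bpos_cmp()/bpos_successor() rather than plain integers.)

/*
 * Toy illustration of the topology invariant checked by
 * bch2_gc_check_topology(): children must tile the parent's key range,
 * each child starting at the successor of the previous child's end.
 */
#include <stdio.h>

struct range { unsigned long start, end; };	/* inclusive bounds */

static int check_topology(struct range parent, const struct range *child, unsigned nr)
{
	unsigned long expected_start = parent.start;
	unsigned i;

	for (i = 0; i < nr; i++) {
		if (child[i].start != expected_start) {
			printf("child %u: bad min (got %lu, expected %lu)\n",
			       i, child[i].start, expected_start);
			return -1;
		}
		expected_start = child[i].end + 1;	/* bpos_successor() */
	}

	if (nr && child[nr - 1].end != parent.end) {
		printf("last child: bad max (got %lu, expected %lu)\n",
		       child[nr - 1].end, parent.end);
		return -1;
	}
	return 0;
}

int main(void)
{
	struct range parent = { 0, 99 };
	struct range ok[]   = { { 0, 29 }, { 30, 69 }, { 70, 99 } };
	struct range gap[]  = { { 0, 29 }, { 40, 99 } };

	printf("ok:  %d\n", check_topology(parent, ok, 3));	/*  0 */
	printf("gap: %d\n", check_topology(parent, gap, 2));	/* -1 */
	return 0;
}
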
+static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
+                              unsigned level, bool is_root,
+                              struct bkey_s_c *k)
+{
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k);
+       const union bch_extent_entry *entry;
+       struct extent_ptr_decoded p = { 0 };
+       bool do_update = false;
+       int ret = 0;
+
+       bkey_for_each_ptr_decode(k->k, ptrs, p, entry) {
+               struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
+               struct bucket *g = PTR_BUCKET(ca, &p.ptr, true);
+               struct bucket *g2 = PTR_BUCKET(ca, &p.ptr, false);
+
+               if (fsck_err_on(!g->gen_valid, c,
+                               "bucket %u:%zu data type %s ptr gen %u missing in alloc btree",
+                               p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+                               bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+                               p.ptr.gen)) {
+                       if (p.ptr.cached) {
+                               g2->_mark.gen   = g->_mark.gen          = p.ptr.gen;
+                               g2->gen_valid   = g->gen_valid          = true;
+                               set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
+                       } else {
+                               do_update = true;
+                       }
+               }
+
+               if (fsck_err_on(gen_cmp(p.ptr.gen, g->mark.gen) > 0, c,
+                               "bucket %u:%zu data type %s ptr gen in the future: %u > %u",
+                               p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+                               bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+                               p.ptr.gen, g->mark.gen)) {
+                       if (p.ptr.cached) {
+                               g2->_mark.gen   = g->_mark.gen  = p.ptr.gen;
+                               g2->gen_valid   = g->gen_valid  = true;
+                               g2->_mark.data_type             = 0;
+                               g2->_mark.dirty_sectors         = 0;
+                               g2->_mark.cached_sectors        = 0;
+                               set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+                               set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
+                       } else {
+                               do_update = true;
+                       }
+               }
+
+               if (fsck_err_on(!p.ptr.cached &&
+                               gen_cmp(p.ptr.gen, g->mark.gen) < 0, c,
+                               "bucket %u:%zu data type %s stale dirty ptr: %u < %u",
+                               p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+                               bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+                               p.ptr.gen, g->mark.gen))
+                       do_update = true;
+
+               if (p.has_ec) {
+                       struct stripe *m = genradix_ptr(&c->stripes[true], p.ec.idx);
+
+                       if (fsck_err_on(!m || !m->alive, c,
+                                       "pointer to nonexistent stripe %llu",
+                                       (u64) p.ec.idx))
+                               do_update = true;
+
+                       if (fsck_err_on(!bch2_ptr_matches_stripe_m(m, p), c,
+                                       "pointer does not match stripe %llu",
+                                       (u64) p.ec.idx))
+                               do_update = true;
+               }
+       }
+
+       if (do_update) {
+               struct bkey_ptrs ptrs;
+               union bch_extent_entry *entry;
+               struct bch_extent_ptr *ptr;
+               struct bkey_i *new;
+
+               if (is_root) {
+                       bch_err(c, "cannot update btree roots yet");
+                       return -EINVAL;
+               }
+
+               new = kmalloc(bkey_bytes(k->k), GFP_KERNEL);
+               if (!new) {
+                       bch_err(c, "%s: error allocating new key", __func__);
+                       return -ENOMEM;
+               }
+
+               bkey_reassemble(new, *k);
+
+               bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({
+                       struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+                       struct bucket *g = PTR_BUCKET(ca, ptr, true);
+
+                       (ptr->cached &&
+                        (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) ||
+                       (!ptr->cached &&
+                        gen_cmp(ptr->gen, g->mark.gen) < 0);
+               }));
+again:
+               ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+               bkey_extent_entry_for_each(ptrs, entry) {
+                       if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) {
+                               struct stripe *m = genradix_ptr(&c->stripes[true],
+                                                               entry->stripe_ptr.idx);
+                               union bch_extent_entry *next_ptr;
+
+                               bkey_extent_entry_for_each_from(ptrs, next_ptr, entry)
+                                       if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr)
+                                               goto found;
+                               next_ptr = NULL;
+found:
+                               if (!next_ptr) {
+                                       bch_err(c, "aieee, found stripe ptr with no data ptr");
+                                       continue;
+                               }
+
+                               if (!m || !m->alive ||
+                                   !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block],
+                                                              &next_ptr->ptr,
+                                                              m->sectors)) {
+                                       bch2_bkey_extent_entry_drop(new, entry);
+                                       goto again;
+                               }
+                       }
+               }
+
+               ret = bch2_journal_key_insert(c, btree_id, level, new);
+               if (ret)
+                       kfree(new);
+               else
+                       *k = bkey_i_to_s_c(new);
        }
 fsck_err:
        return ret;
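
(The pointer checks above compare 8-bit bucket generations with gen_cmp(); elsewhere in bcachefs that appears to be a wrapping signed difference, which is an assumption here. The sketch below shows why such a comparison stays meaningful after the gen counter wraps, as long as the two values are within 128 of each other; it is not part of the patch and gen_cmp_demo() is an illustrative name, not the bcachefs function.)

/*
 * Sketch of wrapping generation comparison in the style of gen_cmp()
 * (assumed to be a signed 8-bit difference): > 0 means a is newer than b,
 * < 0 older, valid while the two gens are within 128 of each other.
 */
#include <stdio.h>
#include <stdint.h>

static int gen_cmp_demo(uint8_t a, uint8_t b)
{
	return (int8_t) (uint8_t) (a - b);
}

int main(void)
{
	printf("%d\n", gen_cmp_demo(10, 5));	/*  5: 10 newer than 5       */
	printf("%d\n", gen_cmp_demo(2, 250));	/*  8: 2 newer (gen wrapped) */
	printf("%d\n", gen_cmp_demo(250, 2));	/* -8: 250 older than 2      */
	return 0;
}
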
@@ -90,7 +299,9 @@ fsck_err:
 
 /* marking of btree keys/nodes: */
 
-static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
+static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id,
+                           unsigned level, bool is_root,
+                           struct bkey_s_c k,
                            u8 *max_stale, bool initial)
 {
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
@@ -101,10 +312,9 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
        int ret = 0;
 
        if (initial) {
-               BUG_ON(journal_seq_verify(c) &&
+               BUG_ON(bch2_journal_seq_verify &&
                       k.k->version.lo > journal_cur_seq(&c->journal));
 
-               /* XXX change to fsck check */
                if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c,
                                "key version number higher than recorded: %llu > %llu",
                                k.k->version.lo,
@@ -116,37 +326,13 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
                                "superblock not marked as containing replicas (type %u)",
                                k.k->type)) {
                        ret = bch2_mark_bkey_replicas(c, k);
-                       if (ret)
-                               return ret;
-               }
-
-               bkey_for_each_ptr(ptrs, ptr) {
-                       struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-                       struct bucket *g = PTR_BUCKET(ca, ptr, true);
-                       struct bucket *g2 = PTR_BUCKET(ca, ptr, false);
-
-                       if (mustfix_fsck_err_on(!g->gen_valid, c,
-                                       "bucket %u:%zu data type %s ptr gen %u missing in alloc btree",
-                                       ptr->dev, PTR_BUCKET_NR(ca, ptr),
-                                       bch2_data_types[ptr_data_type(k.k, ptr)],
-                                       ptr->gen)) {
-                               g2->_mark.gen   = g->_mark.gen          = ptr->gen;
-                               g2->gen_valid   = g->gen_valid          = true;
-                       }
-
-                       if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c,
-                                       "bucket %u:%zu data type %s ptr gen in the future: %u > %u",
-                                       ptr->dev, PTR_BUCKET_NR(ca, ptr),
-                                       bch2_data_types[ptr_data_type(k.k, ptr)],
-                                       ptr->gen, g->mark.gen)) {
-                               g2->_mark.gen   = g->_mark.gen          = ptr->gen;
-                               g2->gen_valid   = g->gen_valid          = true;
-                               g2->_mark.data_type             = 0;
-                               g2->_mark.dirty_sectors         = 0;
-                               g2->_mark.cached_sectors        = 0;
-                               set_bit(BCH_FS_FIXED_GENS, &c->flags);
+                       if (ret) {
+                               bch_err(c, "error marking bkey replicas: %i", ret);
+                               goto err;
                        }
                }
+
+               ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, &k);
        }
 
        bkey_for_each_ptr(ptrs, ptr) {
@@ -161,16 +347,19 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
 
        bch2_mark_key(c, k, 0, k.k->size, NULL, 0, flags);
 fsck_err:
+err:
+       if (ret)
+               bch_err(c, "%s: ret %i", __func__, ret);
        return ret;
 }
 
 static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
                              bool initial)
 {
-       struct bpos next_node_start = b->data->min_key;
        struct btree_node_iter iter;
        struct bkey unpacked;
        struct bkey_s_c k;
+       struct bkey_buf prev, cur;
        int ret = 0;
 
        *max_stale = 0;
@@ -179,37 +368,40 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
                return 0;
 
        bch2_btree_node_iter_init_from_start(&iter, b);
+       bch2_bkey_buf_init(&prev);
+       bch2_bkey_buf_init(&cur);
+       bkey_init(&prev.k->k);
 
        while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) {
-               bch2_bkey_debugcheck(c, b, k);
-
-               ret = bch2_gc_mark_key(c, k, max_stale, initial);
+               ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false,
+                                      k, max_stale, initial);
                if (ret)
                        break;
 
                bch2_btree_node_iter_advance(&iter, b);
 
                if (b->c.level) {
-                       ret = bch2_gc_check_topology(c, k,
-                                       &next_node_start,
-                                       b->data->max_key,
+                       bch2_bkey_buf_reassemble(&cur, c, k);
+
+                       ret = bch2_gc_check_topology(c, b, &prev, cur,
                                        bch2_btree_node_iter_end(&iter));
                        if (ret)
                                break;
                }
        }
 
+       bch2_bkey_buf_exit(&cur, c);
+       bch2_bkey_buf_exit(&prev, c);
        return ret;
 }
 
 static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
-                        bool initial, bool metadata_only)
+                        bool initial)
 {
        struct btree_trans trans;
        struct btree_iter *iter;
        struct btree *b;
-       unsigned depth = metadata_only                  ? 1
-               : expensive_debug_checks(c)             ? 0
+       unsigned depth = bch2_expensive_debug_checks    ? 0
                : !btree_node_type_needs_gc(btree_id)   ? 1
                : 0;
        u8 max_stale = 0;
@@ -233,11 +425,10 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
                        if (max_stale > 64)
                                bch2_btree_node_rewrite(c, iter,
                                                b->data->keys.seq,
-                                               BTREE_INSERT_USE_RESERVE|
                                                BTREE_INSERT_NOWAIT|
                                                BTREE_INSERT_GC_LOCK_HELD);
-                       else if (!btree_gc_rewrite_disabled(c) &&
-                                (btree_gc_always_rewrite(c) || max_stale > 16))
+                       else if (!bch2_btree_gc_rewrite_disabled &&
+                                (bch2_btree_gc_always_rewrite || max_stale > 16))
                                bch2_btree_node_rewrite(c, iter,
                                                b->data->keys.seq,
                                                BTREE_INSERT_NOWAIT|
@@ -246,6 +437,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
 
                bch2_trans_cond_resched(&trans);
        }
+       bch2_trans_iter_put(&trans, iter);
+
        ret = bch2_trans_exit(&trans) ?: ret;
        if (ret)
                return ret;
@@ -253,7 +446,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
        mutex_lock(&c->btree_root_lock);
        b = c->btree_roots[btree_id].b;
        if (!btree_node_fake(b))
-               ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key),
+               ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true,
+                                      bkey_i_to_s_c(&b->key),
                                       &max_stale, initial);
        gc_pos_set(c, gc_pos_btree_root(b->c.btree_id));
        mutex_unlock(&c->btree_root_lock);
@@ -262,76 +456,102 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
 }
 
 static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
-                                     struct journal_keys *journal_keys,
                                      unsigned target_depth)
 {
        struct btree_and_journal_iter iter;
        struct bkey_s_c k;
-       struct bpos next_node_start = b->data->min_key;
+       struct bkey_buf cur, prev;
        u8 max_stale = 0;
        int ret = 0;
 
-       bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b);
+       bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+       bch2_bkey_buf_init(&prev);
+       bch2_bkey_buf_init(&cur);
+       bkey_init(&prev.k->k);
 
        while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
-               bch2_bkey_debugcheck(c, b, k);
-
-               BUG_ON(bkey_cmp(k.k->p, b->data->min_key) < 0);
-               BUG_ON(bkey_cmp(k.k->p, b->data->max_key) > 0);
+               BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0);
+               BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0);
 
-               ret = bch2_gc_mark_key(c, k, &max_stale, true);
-               if (ret)
+               ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false,
+                                      k, &max_stale, true);
+               if (ret) {
+                       bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret);
                        break;
+               }
 
                if (b->c.level) {
-                       struct btree *child;
-                       BKEY_PADDED(k) tmp;
-
-                       bkey_reassemble(&tmp.k, k);
-                       k = bkey_i_to_s_c(&tmp.k);
+                       bch2_bkey_buf_reassemble(&cur, c, k);
+                       k = bkey_i_to_s_c(cur.k);
 
                        bch2_btree_and_journal_iter_advance(&iter);
 
-                       ret = bch2_gc_check_topology(c, k,
-                                       &next_node_start,
-                                       b->data->max_key,
+                       ret = bch2_gc_check_topology(c, b,
+                                       &prev, cur,
                                        !bch2_btree_and_journal_iter_peek(&iter).k);
                        if (ret)
                                break;
+               } else {
+                       bch2_btree_and_journal_iter_advance(&iter);
+               }
+       }
 
-                       if (b->c.level > target_depth) {
-                               child = bch2_btree_node_get_noiter(c, &tmp.k,
-                                                       b->c.btree_id, b->c.level - 1);
-                               ret = PTR_ERR_OR_ZERO(child);
-                               if (ret)
-                                       break;
+       if (b->c.level > target_depth) {
+               bch2_btree_and_journal_iter_exit(&iter);
+               bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
 
-                               ret = bch2_gc_btree_init_recurse(c, child,
-                                               journal_keys, target_depth);
-                               six_unlock_read(&child->c.lock);
+               while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+                       struct btree *child;
 
+                       bch2_bkey_buf_reassemble(&cur, c, k);
+                       bch2_btree_and_journal_iter_advance(&iter);
+
+                       child = bch2_btree_node_get_noiter(c, cur.k,
+                                               b->c.btree_id, b->c.level - 1,
+                                               false);
+                       ret = PTR_ERR_OR_ZERO(child);
+
+                       if (fsck_err_on(ret == -EIO, c,
+                                       "unreadable btree node")) {
+                               ret = bch2_journal_key_delete(c, b->c.btree_id,
+                                                             b->c.level, cur.k->k.p);
                                if (ret)
-                                       break;
+                                       return ret;
+
+                               set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+                               continue;
                        }
-               } else {
-                       bch2_btree_and_journal_iter_advance(&iter);
+
+                       if (ret) {
+                               bch_err(c, "%s: error %i getting btree node",
+                                       __func__, ret);
+                               break;
+                       }
+
+                       ret = bch2_gc_btree_init_recurse(c, child,
+                                                        target_depth);
+                       six_unlock_read(&child->c.lock);
+
+                       if (ret)
+                               break;
                }
        }
-
+fsck_err:
+       bch2_bkey_buf_exit(&cur, c);
+       bch2_bkey_buf_exit(&prev, c);
+       bch2_btree_and_journal_iter_exit(&iter);
        return ret;
 }
 
 static int bch2_gc_btree_init(struct bch_fs *c,
-                             struct journal_keys *journal_keys,
-                             enum btree_id btree_id,
-                             bool metadata_only)
+                             enum btree_id btree_id)
 {
        struct btree *b;
-       unsigned target_depth = metadata_only           ? 1
-               : expensive_debug_checks(c)             ? 0
-               : !btree_node_type_needs_gc(btree_id)   ? 1
+       unsigned target_depth = bch2_expensive_debug_checks     ? 0
+               : !btree_node_type_needs_gc(btree_id)           ? 1
                : 0;
        u8 max_stale = 0;
+       char buf[100];
        int ret = 0;
 
        b = c->btree_roots[btree_id].b;
@@ -340,30 +560,30 @@ static int bch2_gc_btree_init(struct bch_fs *c,
                return 0;
 
        six_lock_read(&b->c.lock, NULL, NULL);
-       if (fsck_err_on(bkey_cmp(b->data->min_key, POS_MIN), c,
-                       "btree root with incorrect min_key: %llu:%llu",
-                       b->data->min_key.inode,
-                       b->data->min_key.offset)) {
+       if (fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c,
+                       "btree root with incorrect min_key: %s",
+                       (bch2_bpos_to_text(&PBUF(buf), b->data->min_key), buf))) {
                BUG();
        }
 
-       if (fsck_err_on(bkey_cmp(b->data->max_key, POS_MAX), c,
-                       "btree root with incorrect min_key: %llu:%llu",
-                       b->data->max_key.inode,
-                       b->data->max_key.offset)) {
+       if (fsck_err_on(bpos_cmp(b->data->max_key, POS_MAX), c,
+                       "btree root with incorrect max_key: %s",
+                       (bch2_bpos_to_text(&PBUF(buf), b->data->max_key), buf))) {
                BUG();
        }
 
        if (b->c.level >= target_depth)
-               ret = bch2_gc_btree_init_recurse(c, b,
-                                       journal_keys, target_depth);
+               ret = bch2_gc_btree_init_recurse(c, b, target_depth);
 
        if (!ret)
-               ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key),
+               ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true,
+                                      bkey_i_to_s_c(&b->key),
                                       &max_stale, true);
 fsck_err:
        six_unlock_read(&b->c.lock);
 
+       if (ret)
+               bch_err(c, "%s: ret %i", __func__, ret);
        return ret;
 }
 
@@ -373,8 +593,7 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
                (int) btree_id_to_gc_phase(r);
 }
 
-static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
-                         bool initial, bool metadata_only)
+static int bch2_gc_btrees(struct bch_fs *c, bool initial)
 {
        enum btree_id ids[BTREE_ID_NR];
        unsigned i;
@@ -386,11 +605,12 @@ static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
        for (i = 0; i < BTREE_ID_NR; i++) {
                enum btree_id id = ids[i];
                int ret = initial
-                       ? bch2_gc_btree_init(c, journal_keys,
-                                            id, metadata_only)
-                       : bch2_gc_btree(c, id, initial, metadata_only);
-               if (ret)
+                       ? bch2_gc_btree_init(c, id)
+                       : bch2_gc_btree(c, id, initial);
+               if (ret) {
+                       bch_err(c, "%s: ret %i", __func__, ret);
                        return ret;
+               }
        }
 
        return 0;
@@ -546,8 +766,8 @@ static void bch2_gc_free(struct bch_fs *c)
                        ca->mi.nbuckets * sizeof(struct bucket));
                ca->buckets[1] = NULL;
 
-               free_percpu(ca->usage[1]);
-               ca->usage[1] = NULL;
+               free_percpu(ca->usage_gc);
+               ca->usage_gc = NULL;
        }
 
        free_percpu(c->usage_gc);
@@ -555,13 +775,12 @@ static void bch2_gc_free(struct bch_fs *c)
 }
 
 static int bch2_gc_done(struct bch_fs *c,
-                       bool initial, bool metadata_only)
+                       bool initial)
 {
        struct bch_dev *ca;
-       bool verify = !metadata_only &&
-               (!initial ||
-                (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)));
-       unsigned i;
+       bool verify = (!initial ||
+                      (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)));
+       unsigned i, dev;
        int ret = 0;
 
 #define copy_field(_f, _msg, ...)                                      \
@@ -570,18 +789,17 @@ static int bch2_gc_done(struct bch_fs *c,
                        fsck_err(c, _msg ": got %llu, should be %llu"   \
                                , ##__VA_ARGS__, dst->_f, src->_f);     \
                dst->_f = src->_f;                                      \
-               ret = 1;                                                \
+               set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);            \
        }
 #define copy_stripe_field(_f, _msg, ...)                               \
        if (dst->_f != src->_f) {                                       \
                if (verify)                                             \
                        fsck_err(c, "stripe %zu has wrong "_msg         \
                                ": got %u, should be %u",               \
-                               dst_iter.pos, ##__VA_ARGS__,            \
+                               iter.pos, ##__VA_ARGS__,                \
                                dst->_f, src->_f);                      \
                dst->_f = src->_f;                                      \
-               dst->dirty = true;                                      \
-               ret = 1;                                                \
+               set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);            \
        }
 #define copy_bucket_field(_f)                                          \
        if (dst->b[b].mark._f != src->b[b].mark._f) {                   \
@@ -592,49 +810,46 @@ static int bch2_gc_done(struct bch_fs *c,
                                bch2_data_types[dst->b[b].mark.data_type],\
                                dst->b[b].mark._f, src->b[b].mark._f);  \
                dst->b[b]._mark._f = src->b[b].mark._f;                 \
-               ret = 1;                                                \
+               set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);            \
        }
 #define copy_dev_field(_f, _msg, ...)                                  \
        copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__)
 #define copy_fs_field(_f, _msg, ...)                                   \
        copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__)
 
-       if (!metadata_only) {
-               struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0);
-               struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0);
+       {
+               struct genradix_iter iter = genradix_iter_init(&c->stripes[1], 0);
                struct stripe *dst, *src;
-               unsigned i;
-
-               c->ec_stripes_heap.used = 0;
-
-               while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) &&
-                      (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) {
-                       BUG_ON(src_iter.pos != dst_iter.pos);
 
-                       copy_stripe_field(alive,        "alive");
-                       copy_stripe_field(sectors,      "sectors");
-                       copy_stripe_field(algorithm,    "algorithm");
-                       copy_stripe_field(nr_blocks,    "nr_blocks");
-                       copy_stripe_field(nr_redundant, "nr_redundant");
-                       copy_stripe_field(blocks_nonempty,
-                                         "blocks_nonempty");
+               while ((src = genradix_iter_peek(&iter, &c->stripes[1]))) {
+                       dst = genradix_ptr_alloc(&c->stripes[0], iter.pos, GFP_KERNEL);
+
+                       if (dst->alive          != src->alive ||
+                           dst->sectors        != src->sectors ||
+                           dst->algorithm      != src->algorithm ||
+                           dst->nr_blocks      != src->nr_blocks ||
+                           dst->nr_redundant   != src->nr_redundant) {
+                               bch_err(c, "unexpected stripe inconsistency at bch2_gc_done, confused");
+                               ret = -EINVAL;
+                               goto fsck_err;
+                       }
 
                        for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++)
                                copy_stripe_field(block_sectors[i],
                                                  "block_sectors[%u]", i);
 
-                       if (dst->alive) {
-                               spin_lock(&c->ec_stripes_heap_lock);
-                               bch2_stripes_heap_insert(c, dst, dst_iter.pos);
-                               spin_unlock(&c->ec_stripes_heap_lock);
-                       }
+                       dst->blocks_nonempty = 0;
+                       for (i = 0; i < dst->nr_blocks; i++)
+                               dst->blocks_nonempty += dst->block_sectors[i] != 0;
 
-                       genradix_iter_advance(&dst_iter, &c->stripes[0]);
-                       genradix_iter_advance(&src_iter, &c->stripes[1]);
+                       genradix_iter_advance(&iter, &c->stripes[1]);
                }
        }
 
-       for_each_member_device(ca, c, i) {
+       for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+               bch2_fs_usage_acc_to_base(c, i);
+
+       for_each_member_device(ca, c, dev) {
                struct bucket_array *dst = __bucket_array(ca, 0);
                struct bucket_array *src = __bucket_array(ca, 1);
                size_t b;
@@ -649,12 +864,23 @@ static int bch2_gc_done(struct bch_fs *c,
 
                        dst->b[b].oldest_gen = src->b[b].oldest_gen;
                }
-       };
 
-       bch2_fs_usage_acc_to_base(c, 0);
-       bch2_fs_usage_acc_to_base(c, 1);
+               {
+                       struct bch_dev_usage *dst = ca->usage_base;
+                       struct bch_dev_usage *src = (void *)
+                               bch2_acc_percpu_u64s((void *) ca->usage_gc,
+                                                    dev_usage_u64s());
 
-       bch2_dev_usage_from_buckets(c);
+                       copy_dev_field(buckets_ec,              "buckets_ec");
+                       copy_dev_field(buckets_unavailable,     "buckets_unavailable");
+
+                       for (i = 0; i < BCH_DATA_NR; i++) {
+                               copy_dev_field(d[i].buckets,    "%s buckets", bch2_data_types[i]);
+                               copy_dev_field(d[i].sectors,    "%s sectors", bch2_data_types[i]);
+                               copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]);
+                       }
+               }
+       };
 
        {
                unsigned nr = fs_usage_u64s(c);
@@ -664,28 +890,20 @@ static int bch2_gc_done(struct bch_fs *c,
 
                copy_fs_field(hidden,           "hidden");
                copy_fs_field(btree,            "btree");
+               copy_fs_field(data,     "data");
+               copy_fs_field(cached,   "cached");
+               copy_fs_field(reserved, "reserved");
+               copy_fs_field(nr_inodes,"nr_inodes");
 
-               if (!metadata_only) {
-                       copy_fs_field(data,     "data");
-                       copy_fs_field(cached,   "cached");
-                       copy_fs_field(reserved, "reserved");
-                       copy_fs_field(nr_inodes,"nr_inodes");
-
-                       for (i = 0; i < BCH_REPLICAS_MAX; i++)
-                               copy_fs_field(persistent_reserved[i],
-                                             "persistent_reserved[%i]", i);
-               }
+               for (i = 0; i < BCH_REPLICAS_MAX; i++)
+                       copy_fs_field(persistent_reserved[i],
+                                     "persistent_reserved[%i]", i);
 
                for (i = 0; i < c->replicas.nr; i++) {
                        struct bch_replicas_entry *e =
                                cpu_replicas_entry(&c->replicas, i);
                        char buf[80];
 
-                       if (metadata_only &&
-                           (e->data_type == BCH_DATA_user ||
-                            e->data_type == BCH_DATA_cached))
-                               continue;
-
                        bch2_replicas_entry_to_text(&PBUF(buf), e);
 
                        copy_fs_field(replicas[i], "%s", buf);
@@ -698,11 +916,12 @@ static int bch2_gc_done(struct bch_fs *c,
 #undef copy_stripe_field
 #undef copy_field
 fsck_err:
+       if (ret)
+               bch_err(c, "%s: ret %i", __func__, ret);
        return ret;
 }
 
-static int bch2_gc_start(struct bch_fs *c,
-                        bool metadata_only)
+static int bch2_gc_start(struct bch_fs *c)
 {
        struct bch_dev *ca;
        unsigned i;
@@ -719,7 +938,7 @@ static int bch2_gc_start(struct bch_fs *c,
 
        for_each_member_device(ca, c, i) {
                BUG_ON(ca->buckets[1]);
-               BUG_ON(ca->usage[1]);
+               BUG_ON(ca->usage_gc);
 
                ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) +
                                ca->mi.nbuckets * sizeof(struct bucket),
@@ -730,9 +949,9 @@ static int bch2_gc_start(struct bch_fs *c,
                        return -ENOMEM;
                }
 
-               ca->usage[1] = alloc_percpu(struct bch_dev_usage);
-               if (!ca->usage[1]) {
-                       bch_err(c, "error allocating ca->usage[gc]");
+               ca->usage_gc = alloc_percpu(struct bch_dev_usage);
+               if (!ca->usage_gc) {
+                       bch_err(c, "error allocating ca->usage_gc");
                        percpu_ref_put(&ca->ref);
                        return -ENOMEM;
                }
@@ -766,13 +985,6 @@ static int bch2_gc_start(struct bch_fs *c,
 
                        d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen;
                        d->gen_valid = s->gen_valid;
-
-                       if (metadata_only &&
-                           (s->mark.data_type == BCH_DATA_user ||
-                            s->mark.data_type == BCH_DATA_cached)) {
-                               d->_mark = s->mark;
-                               d->_mark.owned_by_allocator = 0;
-                       }
                }
        };
 
@@ -799,8 +1011,7 @@ static int bch2_gc_start(struct bch_fs *c,
  *    move around - if references move backwards in the ordering GC
  *    uses, GC could skip past them
  */
-int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys,
-           bool initial, bool metadata_only)
+int bch2_gc(struct bch_fs *c, bool initial)
 {
        struct bch_dev *ca;
        u64 start_time = local_clock();
@@ -816,13 +1027,13 @@ int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys,
        closure_wait_event(&c->btree_interior_update_wait,
                           !bch2_btree_interior_updates_nr_pending(c));
 again:
-       ret = bch2_gc_start(c, metadata_only);
+       ret = bch2_gc_start(c);
        if (ret)
                goto out;
 
        bch2_mark_superblocks(c);
 
-       ret = bch2_gc_btrees(c, journal_keys, initial, metadata_only);
+       ret = bch2_gc_btrees(c, initial);
        if (ret)
                goto out;
 
@@ -832,16 +1043,15 @@ again:
        bch2_mark_allocator_buckets(c);
 
        c->gc_count++;
-out:
-       if (!ret &&
-           (test_bit(BCH_FS_FIXED_GENS, &c->flags) ||
-            (!iter && test_restart_gc(c)))) {
+
+       if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) ||
+           (!iter && bch2_test_restart_gc)) {
                /*
                 * XXX: make sure gens we fixed got saved
                 */
                if (iter++ <= 2) {
-                       bch_info(c, "Fixed gens, restarting mark and sweep:");
-                       clear_bit(BCH_FS_FIXED_GENS, &c->flags);
+                       bch_info(c, "Second GC pass needed, restarting:");
+                       clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
                        __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
 
                        percpu_down_write(&c->mark_lock);
@@ -856,12 +1066,12 @@ out:
                bch_info(c, "Unable to fix bucket gens, looping");
                ret = -EINVAL;
        }
-
+out:
        if (!ret) {
                bch2_journal_block(&c->journal);
 
                percpu_down_write(&c->mark_lock);
-               ret = bch2_gc_done(c, initial, metadata_only);
+               ret = bch2_gc_done(c, initial);
 
                bch2_journal_unblock(&c->journal);
        } else {
@@ -931,19 +1141,21 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c k;
-       struct bkey_on_stack sk;
+       struct bkey_buf sk;
        int ret = 0;
 
-       bkey_on_stack_init(&sk);
+       bch2_bkey_buf_init(&sk);
        bch2_trans_init(&trans, c, 0, 0);
 
        iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN,
-                                  BTREE_ITER_PREFETCH);
+                                  BTREE_ITER_PREFETCH|
+                                  BTREE_ITER_NOT_EXTENTS|
+                                  BTREE_ITER_ALL_SNAPSHOTS);
 
        while ((k = bch2_btree_iter_peek(iter)).k &&
               !(ret = bkey_err(k))) {
                if (gc_btree_gens_key(c, k)) {
-                       bkey_on_stack_reassemble(&sk, c, k);
+                       bch2_bkey_buf_reassemble(&sk, c, k);
                        bch2_extent_normalize(c, bkey_i_to_s(sk.k));
 
                        bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k));
@@ -959,11 +1171,12 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
                        }
                }
 
-               bch2_btree_iter_next(iter);
+               bch2_btree_iter_advance(iter);
        }
+       bch2_trans_iter_put(&trans, iter);
 
        bch2_trans_exit(&trans);
-       bkey_on_stack_exit(&sk, c);
+       bch2_bkey_buf_exit(&sk, c);
 
        return ret;
 }
@@ -1061,6 +1274,9 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
        /* Find a format that all keys in @old_nodes can pack into */
        bch2_bkey_format_init(&format_state);
 
+       /*
+        * XXX: this won't correctly take into account the new min/max keys:
+        */
        for (i = 0; i < nr_old_nodes; i++)
                __bch2_btree_calc_format(&format_state, old_nodes[i]);
 
@@ -1075,17 +1291,16 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
                }
 
        if (bch2_keylist_realloc(&keylist, NULL, 0,
-                       (BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) {
+                       BKEY_BTREE_PTR_U64s_MAX * nr_old_nodes)) {
                trace_btree_gc_coalesce_fail(c,
                                BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC);
                return;
        }
 
-       as = bch2_btree_update_start(iter->trans, iter->btree_id,
+       as = bch2_btree_update_start(iter, old_nodes[0]->c.level,
                        btree_update_reserve_required(c, parent) + nr_old_nodes,
                        BTREE_INSERT_NOFAIL|
-                       BTREE_INSERT_USE_RESERVE,
-                       NULL);
+                       BTREE_INSERT_USE_RESERVE);
        if (IS_ERR(as)) {
                trace_btree_gc_coalesce_fail(c,
                                BTREE_GC_COALESCE_FAIL_RESERVE_GET);
@@ -1123,7 +1338,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
                     k < vstruct_last(s2) &&
                     vstruct_blocks_plus(n1->data, c->block_bits,
                                         u64s + k->u64s) <= blocks;
-                    k = bkey_next_skip_noops(k, vstruct_last(s2))) {
+                    k = bkey_next(k)) {
                        last = k;
                        u64s += k->u64s;
                }
@@ -1152,7 +1367,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
                        n1->key.k.p = n1->data->max_key =
                                bkey_unpack_pos(n1, last);
 
-                       n2->data->min_key = bkey_successor(n1->data->max_key);
+                       n2->data->min_key = bpos_successor(n1->data->max_key);
 
                        memcpy_u64s(vstruct_last(s1),
                                    s2->start, u64s);
@@ -1195,7 +1410,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
                unsigned j;
 
                for (j = 0; j < nr_new_nodes; j++)
-                       if (!bkey_cmp(old_nodes[i]->key.k.p,
+                       if (!bpos_cmp(old_nodes[i]->key.k.p,
                                      new_nodes[j]->key.k.p))
                                goto next;
 
@@ -1258,6 +1473,7 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
        struct btree *b;
        bool kthread = (current->flags & PF_KTHREAD) != 0;
        unsigned i;
+       int ret = 0;
 
        /* Sliding window of adjacent btree nodes */
        struct btree *merge[GC_MERGE_NODES];
@@ -1306,8 +1522,8 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
                lock_seq[0] = merge[0]->c.lock.state.seq;
 
                if (kthread && kthread_should_stop()) {
-                       bch2_trans_exit(&trans);
-                       return -ESHUTDOWN;
+                       ret = -ESHUTDOWN;
+                       break;
                }
 
                bch2_trans_cond_resched(&trans);
@@ -1322,7 +1538,9 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
                        memset(merge + 1, 0,
                               (GC_MERGE_NODES - 1) * sizeof(merge[0]));
        }
-       return bch2_trans_exit(&trans);
+       bch2_trans_iter_put(&trans, iter);
+
+       return bch2_trans_exit(&trans) ?: ret;
 }
 
 /**
@@ -1355,7 +1573,7 @@ static int bch2_gc_thread(void *arg)
 {
        struct bch_fs *c = arg;
        struct io_clock *clock = &c->io_clock[WRITE];
-       unsigned long last = atomic_long_read(&clock->now);
+       unsigned long last = atomic64_read(&clock->now);
        unsigned last_kick = atomic_read(&c->kick_gc);
        int ret;
 
@@ -1376,7 +1594,7 @@ static int bch2_gc_thread(void *arg)
                        if (c->btree_gc_periodic) {
                                unsigned long next = last + c->capacity / 16;
 
-                               if (atomic_long_read(&clock->now) >= next)
+                               if (atomic64_read(&clock->now) >= next)
                                        break;
 
                                bch2_io_clock_schedule_timeout(clock, next);
@@ -1388,14 +1606,14 @@ static int bch2_gc_thread(void *arg)
                }
                __set_current_state(TASK_RUNNING);
 
-               last = atomic_long_read(&clock->now);
+               last = atomic64_read(&clock->now);
                last_kick = atomic_read(&c->kick_gc);
 
                /*
                 * Full gc is currently incompatible with btree key cache:
                 */
 #if 0
-               ret = bch2_gc(c, NULL, false, false);
+               ret = bch2_gc(c, false, false);
 #else
                ret = bch2_gc_gens(c);
 #endif
@@ -1425,11 +1643,14 @@ int bch2_gc_thread_start(struct bch_fs *c)
 {
        struct task_struct *p;
 
-       BUG_ON(c->gc_thread);
+       if (c->gc_thread)
+               return 0;
 
-       p = kthread_create(bch2_gc_thread, c, "bch_gc");
-       if (IS_ERR(p))
+       p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name);
+       if (IS_ERR(p)) {
+               bch_err(c, "error creating gc thread: %li", PTR_ERR(p));
                return PTR_ERR(p);
+       }
 
        get_task_struct(p);
        c->gc_thread = p;
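
The bch2_bkey_buf_init() / bch2_bkey_buf_reassemble() / bch2_bkey_buf_exit() calls used throughout the hunks above replace the old bkey_on_stack and BKEY_PADDED temporaries with a single init/copy/exit lifecycle. A minimal stand-alone sketch of that pattern follows; the names, the 128-byte inline size and the helpers are illustrative stand-ins, not the bcachefs API.

/*
 * Illustrative sketch only: a "small buffer on stack, spill to heap" helper
 * modelled on the init/reassemble/exit lifecycle above.  Not the bcachefs API.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct small_buf {
	char	*data;		/* points at onstack[] until a copy doesn't fit */
	size_t	size;
	char	onstack[128];
};

static void small_buf_init(struct small_buf *b)
{
	b->data = b->onstack;
	b->size = sizeof(b->onstack);
}

/* analogue of bch2_bkey_buf_reassemble(): copy @src in, growing the buffer if needed */
static int small_buf_copy(struct small_buf *b, const void *src, size_t len)
{
	if (len > b->size) {
		void *n = malloc(len);

		if (!n)
			return -1;
		if (b->data != b->onstack)
			free(b->data);
		b->data = n;
		b->size = len;
	}
	memcpy(b->data, src, len);
	return 0;
}

/* analogue of bch2_bkey_buf_exit(): free only if we spilled to the heap */
static void small_buf_exit(struct small_buf *b)
{
	if (b->data != b->onstack)
		free(b->data);
}

int main(void)
{
	struct small_buf b;
	const char msg[] = "copied through the buffer";

	small_buf_init(&b);
	if (!small_buf_copy(&b, msg, sizeof(msg)))
		puts(b.data);
	small_buf_exit(&b);
	return 0;
}
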
index 3694a3df62a8c208271e64bdaa3b3439d697d89b..b1362a9f94186e75c0e206293b782aef6ae03d06 100644 (file)
@@ -6,8 +6,7 @@
 
 void bch2_coalesce(struct bch_fs *);
 
-struct journal_keys;
-int bch2_gc(struct bch_fs *, struct journal_keys *, bool, bool);
+int bch2_gc(struct bch_fs *, bool);
 int bch2_gc_gens(struct bch_fs *);
 void bch2_gc_thread_stop(struct bch_fs *);
 int bch2_gc_thread_start(struct bch_fs *);
@@ -46,19 +45,15 @@ static inline struct gc_pos gc_phase(enum gc_phase phase)
 
 static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
 {
-       if (l.phase != r.phase)
-               return l.phase < r.phase ? -1 : 1;
-       if (bkey_cmp(l.pos, r.pos))
-               return bkey_cmp(l.pos, r.pos);
-       if (l.level != r.level)
-               return l.level < r.level ? -1 : 1;
-       return 0;
+       return  cmp_int(l.phase, r.phase) ?:
+               bpos_cmp(l.pos, r.pos) ?:
+               cmp_int(l.level, r.level);
 }
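
The rewritten gc_pos_cmp() above chains comparisons with the GNU C binary "?:" operator: the first cmp_int()/bpos_cmp() result that is non-zero becomes the return value. A self-contained sketch of the same idiom; cmp_int() here is a local stand-in macro rather than the bcachefs one, and the code needs a GNU-C compiler for the binary "?:" extension.

#include <stdio.h>

/* Stand-in for the kernel/bcachefs cmp_int(): evaluates to -1, 0 or 1 */
#define cmp_int(l, r)	(((l) > (r)) - ((l) < (r)))

struct pos3 { int phase, pos, level; };

static int pos3_cmp(struct pos3 l, struct pos3 r)
{
	/* "a ?: b" yields a when a is non-zero, so the first differing field decides */
	return  cmp_int(l.phase, r.phase) ?:
		cmp_int(l.pos,   r.pos) ?:
		cmp_int(l.level, r.level);
}

int main(void)
{
	struct pos3 a = { 1, 5, 0 }, b = { 1, 7, 0 };

	printf("%d\n", pos3_cmp(a, b));		/* prints -1: phases tie, pos decides */
	return 0;
}
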
 
 static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id)
 {
        switch (id) {
-#define x(n, v, s) case BTREE_ID_##n: return GC_PHASE_BTREE_##n;
+#define x(name, v) case BTREE_ID_##name: return GC_PHASE_BTREE_##name;
        BCH_BTREE_IDS()
 #undef x
        default:
index 682f599cbef588d6a912cceaf7627b73a12d48d1..ec1290fa9138191cf845502883aff65266147e07 100644 (file)
@@ -24,8 +24,7 @@
 
 static void verify_no_dups(struct btree *b,
                           struct bkey_packed *start,
-                          struct bkey_packed *end,
-                          bool extents)
+                          struct bkey_packed *end)
 {
 #ifdef CONFIG_BCACHEFS_DEBUG
        struct bkey_packed *k, *p;
@@ -33,16 +32,13 @@ static void verify_no_dups(struct btree *b,
        if (start == end)
                return;
 
-       for (p = start, k = bkey_next_skip_noops(start, end);
+       for (p = start, k = bkey_next(start);
             k != end;
-            p = k, k = bkey_next_skip_noops(k, end)) {
+            p = k, k = bkey_next(k)) {
                struct bkey l = bkey_unpack_key(b, p);
                struct bkey r = bkey_unpack_key(b, k);
 
-               BUG_ON(extents
-                      ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0
-                      : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0);
-               //BUG_ON(bkey_cmp_packed(&b->format, p, k) >= 0);
+               BUG_ON(bpos_cmp(l.p, bkey_start_pos(&r)) >= 0);
        }
 #endif
 }
@@ -51,9 +47,7 @@ static void set_needs_whiteout(struct bset *i, int v)
 {
        struct bkey_packed *k;
 
-       for (k = i->start;
-            k != vstruct_last(i);
-            k = bkey_next_skip_noops(k, vstruct_last(i)))
+       for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
                k->needs_whiteout = v;
 }
 
@@ -102,14 +96,14 @@ static void sort_bkey_ptrs(const struct btree *bt,
                        break;
 
                for (b = a; c = 2 * b + 1, (d = c + 1) < n;)
-                       b = bkey_cmp_packed(bt,
+                       b = bch2_bkey_cmp_packed(bt,
                                            ptrs[c],
                                            ptrs[d]) >= 0 ? c : d;
                if (d == n)
                        b = c;
 
                while (b != a &&
-                      bkey_cmp_packed(bt,
+                      bch2_bkey_cmp_packed(bt,
                                       ptrs[a],
                                       ptrs[b]) >= 0)
                        b = (b - 1) / 2;
@@ -150,8 +144,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
        }
 
        verify_no_dups(b, new_whiteouts,
-                      (void *) ((u64 *) new_whiteouts + b->whiteout_u64s),
-                      btree_node_old_extent_overwrite(b));
+                      (void *) ((u64 *) new_whiteouts + b->whiteout_u64s));
 
        memcpy_u64s(unwritten_whiteouts_start(c, b),
                    new_whiteouts, b->whiteout_u64s);
@@ -176,144 +169,6 @@ static bool should_compact_bset(struct btree *b, struct bset_tree *t,
        }
 }
 
-static bool bch2_compact_extent_whiteouts(struct bch_fs *c,
-                                         struct btree *b,
-                                         enum compact_mode mode)
-{
-       const struct bkey_format *f = &b->format;
-       struct bset_tree *t;
-       struct bkey_packed *whiteouts = NULL;
-       struct bkey_packed *u_start, *u_pos;
-       struct sort_iter sort_iter;
-       unsigned bytes, whiteout_u64s = 0, u64s;
-       bool used_mempool, compacting = false;
-
-       BUG_ON(!btree_node_is_extents(b));
-
-       for_each_bset(b, t)
-               if (should_compact_bset(b, t, whiteout_u64s != 0, mode))
-                       whiteout_u64s += bset_dead_u64s(b, t);
-
-       if (!whiteout_u64s)
-               return false;
-
-       bch2_sort_whiteouts(c, b);
-
-       sort_iter_init(&sort_iter, b);
-
-       whiteout_u64s += b->whiteout_u64s;
-       bytes = whiteout_u64s * sizeof(u64);
-
-       whiteouts = btree_bounce_alloc(c, bytes, &used_mempool);
-       u_start = u_pos = whiteouts;
-
-       memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b),
-                   b->whiteout_u64s);
-       u_pos = (void *) u_pos + b->whiteout_u64s * sizeof(u64);
-
-       sort_iter_add(&sort_iter, u_start, u_pos);
-
-       for_each_bset(b, t) {
-               struct bset *i = bset(b, t);
-               struct bkey_packed *k, *n, *out, *start, *end;
-               struct btree_node_entry *src = NULL, *dst = NULL;
-
-               if (t != b->set && !bset_written(b, i)) {
-                       src = container_of(i, struct btree_node_entry, keys);
-                       dst = max(write_block(b),
-                                 (void *) btree_bkey_last(b, t - 1));
-               }
-
-               if (src != dst)
-                       compacting = true;
-
-               if (!should_compact_bset(b, t, compacting, mode)) {
-                       if (src != dst) {
-                               memmove(dst, src, sizeof(*src) +
-                                       le16_to_cpu(src->keys.u64s) *
-                                       sizeof(u64));
-                               i = &dst->keys;
-                               set_btree_bset(b, t, i);
-                       }
-                       continue;
-               }
-
-               compacting = true;
-               u_start = u_pos;
-               start = i->start;
-               end = vstruct_last(i);
-
-               if (src != dst) {
-                       memmove(dst, src, sizeof(*src));
-                       i = &dst->keys;
-                       set_btree_bset(b, t, i);
-               }
-
-               out = i->start;
-
-               for (k = start; k != end; k = n) {
-                       n = bkey_next_skip_noops(k, end);
-
-                       if (bkey_deleted(k))
-                               continue;
-
-                       BUG_ON(bkey_whiteout(k) &&
-                              k->needs_whiteout &&
-                              bkey_written(b, k));
-
-                       if (bkey_whiteout(k) && !k->needs_whiteout)
-                               continue;
-
-                       if (bkey_whiteout(k)) {
-                               memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k));
-                               set_bkeyp_val_u64s(f, u_pos, 0);
-                               u_pos = bkey_next(u_pos);
-                       } else {
-                               bkey_copy(out, k);
-                               out = bkey_next(out);
-                       }
-               }
-
-               sort_iter_add(&sort_iter, u_start, u_pos);
-
-               i->u64s = cpu_to_le16((u64 *) out - i->_data);
-               set_btree_bset_end(b, t);
-               bch2_bset_set_no_aux_tree(b, t);
-       }
-
-       b->whiteout_u64s = (u64 *) u_pos - (u64 *) whiteouts;
-
-       BUG_ON((void *) unwritten_whiteouts_start(c, b) <
-              (void *) btree_bkey_last(b, bset_tree_last(b)));
-
-       u64s = bch2_sort_extent_whiteouts(unwritten_whiteouts_start(c, b),
-                                         &sort_iter);
-
-       BUG_ON(u64s > b->whiteout_u64s);
-       BUG_ON(u_pos != whiteouts && !u64s);
-
-       if (u64s != b->whiteout_u64s) {
-               void *src = unwritten_whiteouts_start(c, b);
-
-               b->whiteout_u64s = u64s;
-               memmove_u64s_up(unwritten_whiteouts_start(c, b), src, u64s);
-       }
-
-       verify_no_dups(b,
-                      unwritten_whiteouts_start(c, b),
-                      unwritten_whiteouts_end(c, b),
-                      true);
-
-       btree_bounce_free(c, bytes, used_mempool, whiteouts);
-
-       bch2_btree_build_aux_trees(b);
-
-       bch_btree_keys_u64s_remaining(c, b);
-       bch2_verify_btree_nr_keys(b);
-
-       return true;
-}
-
 static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode)
 {
        struct bset_tree *t;
@@ -356,9 +211,9 @@ static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode)
                out = i->start;
 
                for (k = start; k != end; k = n) {
-                       n = bkey_next_skip_noops(k, end);
+                       n = bkey_next(k);
 
-                       if (!bkey_whiteout(k)) {
+                       if (!bkey_deleted(k)) {
                                bkey_copy(out, k);
                                out = bkey_next(out);
                        } else {
@@ -382,9 +237,7 @@ static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode)
 bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
                            enum compact_mode mode)
 {
-       return !btree_node_old_extent_overwrite(b)
-               ? bch2_drop_whiteouts(b, mode)
-               : bch2_compact_extent_whiteouts(c, b, mode);
+       return bch2_drop_whiteouts(b, mode);
 }
 
 static void btree_node_sort(struct bch_fs *c, struct btree *b,
@@ -422,14 +275,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
 
        start_time = local_clock();
 
-       if (btree_node_old_extent_overwrite(b))
-               filter_whiteouts = bset_written(b, start_bset);
-
-       u64s = (btree_node_old_extent_overwrite(b)
-               ? bch2_sort_extents
-               : bch2_sort_keys)(out->keys.start,
-                                 &sort_iter,
-                                 filter_whiteouts);
+       u64s = bch2_sort_keys(out->keys.start, &sort_iter, filter_whiteouts);
 
        out->keys.u64s = cpu_to_le16(u64s);
 
@@ -597,18 +443,30 @@ void bch2_btree_init_next(struct bch_fs *c, struct btree *b,
                bch2_btree_iter_reinit_node(iter, b);
 }
 
-static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
-                         struct btree *b, struct bset *i,
-                         unsigned offset, int write)
+static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c,
+                         struct btree *b)
 {
-       pr_buf(out, "error validating btree node %sat btree %u level %u/%u\n"
-              "pos ",
-              write ? "before write " : "",
-              b->c.btree_id, b->c.level,
+       pr_buf(out, "%s level %u/%u\n  ",
+              bch2_btree_ids[b->c.btree_id],
+              b->c.level,
               c->btree_roots[b->c.btree_id].level);
        bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+}
 
-       pr_buf(out, " node offset %u", b->written);
+static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
+                         struct bch_dev *ca,
+                         struct btree *b, struct bset *i,
+                         unsigned offset, int write)
+{
+       pr_buf(out, "error validating btree node ");
+       if (write)
+               pr_buf(out, "before write ");
+       if (ca)
+               pr_buf(out, "on %s ", ca->name);
+       pr_buf(out, "at btree ");
+       btree_pos_to_text(out, c, b);
+
+       pr_buf(out, "\n  node offset %u", b->written);
        if (i)
                pr_buf(out, " bset u64s %u", le16_to_cpu(i->u64s));
 }
@@ -624,25 +482,30 @@ enum btree_validate_ret {
        BTREE_RETRY_READ = 64,
 };
 
-#define btree_err(type, c, b, i, msg, ...)                             \
+#define btree_err(type, c, ca, b, i, msg, ...)                         \
 ({                                                                     \
        __label__ out;                                                  \
        char _buf[300];                                                 \
+       char *_buf2 = _buf;                                             \
        struct printbuf out = PBUF(_buf);                               \
                                                                        \
-       btree_err_msg(&out, c, b, i, b->written, write);                \
+       _buf2 = kmalloc(4096, GFP_ATOMIC);                              \
+       if (_buf2)                                                      \
+               out = _PBUF(_buf2, 4096);                               \
+                                                                       \
+       btree_err_msg(&out, c, ca, b, i, b->written, write);            \
        pr_buf(&out, ": " msg, ##__VA_ARGS__);                          \
                                                                        \
        if (type == BTREE_ERR_FIXABLE &&                                \
            write == READ &&                                            \
            !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) {             \
-               mustfix_fsck_err(c, "%s", _buf);                        \
+               mustfix_fsck_err(c, "%s", _buf2);                       \
                goto out;                                               \
        }                                                               \
                                                                        \
        switch (write) {                                                \
        case READ:                                                      \
-               bch_err(c, "%s", _buf);                                 \
+               bch_err(c, "%s", _buf2);                                \
                                                                        \
                switch (type) {                                         \
                case BTREE_ERR_FIXABLE:                                 \
@@ -663,7 +526,7 @@ enum btree_validate_ret {
                }                                                       \
                break;                                                  \
        case WRITE:                                                     \
-               bch_err(c, "corrupt metadata before write: %s", _buf);  \
+               bch_err(c, "corrupt metadata before write: %s", _buf2); \
                                                                        \
                if (bch2_fs_inconsistent(c)) {                          \
                        ret = BCH_FSCK_ERRORS_NOT_FIXED;                \
@@ -672,34 +535,62 @@ enum btree_validate_ret {
                break;                                                  \
        }                                                               \
 out:                                                                   \
+       if (_buf2 != _buf)                                              \
+               kfree(_buf2);                                           \
        true;                                                           \
 })
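
The btree_err() macro above tries to kmalloc() a 4096-byte message buffer with GFP_ATOMIC and quietly falls back to the 300-byte stack buffer when that allocation fails, freeing the heap copy on the way out. A plain user-space sketch of that fallback pattern; the function name and sizes are illustrative, not the bcachefs printbuf machinery.

#include <stdio.h>
#include <stdlib.h>

/* Format an error message into a big heap buffer if we can get one, else a stack buffer */
static void report(const char *msg)
{
	char stackbuf[300];
	char *heapbuf = malloc(4096);	/* allocation failure is tolerated */
	char *buf = heapbuf ? heapbuf : stackbuf;
	size_t len = heapbuf ? 4096 : sizeof(stackbuf);

	snprintf(buf, len, "error validating btree node: %s", msg);
	fprintf(stderr, "%s\n", buf);

	free(heapbuf);			/* free(NULL) is a no-op */
}

int main(void)
{
	report("bad magic");
	return 0;
}
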
 
 #define btree_err_on(cond, ...)        ((cond) ? btree_err(__VA_ARGS__) : false)
 
-static int validate_bset(struct bch_fs *c, struct btree *b,
-                        struct bset *i, unsigned sectors,
-                        int write, bool have_retry)
+static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
+                        struct btree *b, struct bset *i,
+                        unsigned sectors, int write, bool have_retry)
 {
        unsigned version = le16_to_cpu(i->version);
        const char *err;
+       char buf1[100];
+       char buf2[100];
        int ret = 0;
 
        btree_err_on((version != BCH_BSET_VERSION_OLD &&
                      version < bcachefs_metadata_version_min) ||
                     version >= bcachefs_metadata_version_max,
-                    BTREE_ERR_FATAL, c, b, i,
+                    BTREE_ERR_FATAL, c, ca, b, i,
                     "unsupported bset version");
 
+       if (btree_err_on(version < c->sb.version_min,
+                        BTREE_ERR_FIXABLE, c, NULL, b, i,
+                        "bset version %u older than superblock version_min %u",
+                        version, c->sb.version_min)) {
+               mutex_lock(&c->sb_lock);
+               c->disk_sb.sb->version_min = cpu_to_le16(version);
+               bch2_write_super(c);
+               mutex_unlock(&c->sb_lock);
+       }
+
+       if (btree_err_on(version > c->sb.version,
+                        BTREE_ERR_FIXABLE, c, NULL, b, i,
+                        "bset version %u newer than superblock version %u",
+                        version, c->sb.version)) {
+               mutex_lock(&c->sb_lock);
+               c->disk_sb.sb->version = cpu_to_le16(version);
+               bch2_write_super(c);
+               mutex_unlock(&c->sb_lock);
+       }
+
+       btree_err_on(BSET_SEPARATE_WHITEOUTS(i),
+                    BTREE_ERR_FATAL, c, ca, b, i,
+                    "BSET_SEPARATE_WHITEOUTS no longer supported");
+
        if (btree_err_on(b->written + sectors > c->opts.btree_node_size,
-                        BTREE_ERR_FIXABLE, c, b, i,
+                        BTREE_ERR_FIXABLE, c, ca, b, i,
                         "bset past end of btree node")) {
                i->u64s = 0;
                return 0;
        }
 
        btree_err_on(b->written && !i->u64s,
-                    BTREE_ERR_FIXABLE, c, b, i,
+                    BTREE_ERR_FIXABLE, c, ca, b, i,
                     "empty bset");
 
        if (!b->written) {
@@ -713,24 +604,18 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
 
                        /* XXX endianness */
                        btree_err_on(bp->seq != bn->keys.seq,
-                                    BTREE_ERR_MUST_RETRY, c, b, NULL,
+                                    BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
                                     "incorrect sequence number (wrong btree node)");
                }
 
                btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id,
-                            BTREE_ERR_MUST_RETRY, c, b, i,
+                            BTREE_ERR_MUST_RETRY, c, ca, b, i,
                             "incorrect btree id");
 
                btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level,
-                            BTREE_ERR_MUST_RETRY, c, b, i,
+                            BTREE_ERR_MUST_RETRY, c, ca, b, i,
                             "incorrect level");
 
-               if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) {
-                       u64 *p = (u64 *) &bn->ptr;
-
-                       *p = swab64(*p);
-               }
-
                if (!write)
                        compat_btree_node(b->c.level, b->c.btree_id, version,
                                          BSET_BIG_ENDIAN(i), write, bn);
@@ -739,42 +624,30 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
                        struct bch_btree_ptr_v2 *bp =
                                &bkey_i_to_btree_ptr_v2(&b->key)->v;
 
-                       btree_err_on(bkey_cmp(b->data->min_key, bp->min_key),
-                                    BTREE_ERR_MUST_RETRY, c, b, NULL,
-                                    "incorrect min_key: got %llu:%llu should be %llu:%llu",
-                                    b->data->min_key.inode,
-                                    b->data->min_key.offset,
-                                    bp->min_key.inode,
-                                    bp->min_key.offset);
+                       if (BTREE_PTR_RANGE_UPDATED(bp)) {
+                               b->data->min_key = bp->min_key;
+                               b->data->max_key = b->key.k.p;
+                       }
+
+                       btree_err_on(bpos_cmp(b->data->min_key, bp->min_key),
+                                    BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
+                                    "incorrect min_key: got %s should be %s",
+                                    (bch2_bpos_to_text(&PBUF(buf1), bn->min_key), buf1),
+                                    (bch2_bpos_to_text(&PBUF(buf2), bp->min_key), buf2));
                }
 
-               btree_err_on(bkey_cmp(bn->max_key, b->key.k.p),
-                            BTREE_ERR_MUST_RETRY, c, b, i,
-                            "incorrect max key %llu:%llu",
-                            bn->max_key.inode,
-                            bn->max_key.offset);
+               btree_err_on(bpos_cmp(bn->max_key, b->key.k.p),
+                            BTREE_ERR_MUST_RETRY, c, ca, b, i,
+                            "incorrect max key %s",
+                            (bch2_bpos_to_text(&PBUF(buf1), bn->max_key), buf1));
 
                if (write)
                        compat_btree_node(b->c.level, b->c.btree_id, version,
                                          BSET_BIG_ENDIAN(i), write, bn);
 
-               /* XXX: ideally we would be validating min_key too */
-#if 0
-               /*
-                * not correct anymore, due to btree node write error
-                * handling
-                *
-                * need to add bn->seq to btree keys and verify
-                * against that
-                */
-               btree_err_on(!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key),
-                                                 bn->ptr),
-                            BTREE_ERR_FATAL, c, b, i,
-                            "incorrect backpointer");
-#endif
                err = bch2_bkey_format_validate(&bn->format);
                btree_err_on(err,
-                            BTREE_ERR_FATAL, c, b, i,
+                            BTREE_ERR_FATAL, c, ca, b, i,
                             "invalid bkey format: %s", err);
 
                compat_bformat(b->c.level, b->c.btree_id, version,
@@ -791,14 +664,8 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
 {
        unsigned version = le16_to_cpu(i->version);
        struct bkey_packed *k, *prev = NULL;
-       bool seen_non_whiteout = false;
        int ret = 0;
 
-       if (!BSET_SEPARATE_WHITEOUTS(i)) {
-               seen_non_whiteout = true;
-               *whiteout_u64s = 0;
-       }
-
        for (k = i->start;
             k != vstruct_last(i);) {
                struct bkey_s u;
@@ -806,14 +673,14 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
                const char *invalid;
 
                if (btree_err_on(bkey_next(k) > vstruct_last(i),
-                                BTREE_ERR_FIXABLE, c, b, i,
+                                BTREE_ERR_FIXABLE, c, NULL, b, i,
                                 "key extends past end of bset")) {
                        i->u64s = cpu_to_le16((u64 *) k - i->_data);
                        break;
                }
 
                if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
-                                BTREE_ERR_FIXABLE, c, b, i,
+                                BTREE_ERR_FIXABLE, c, NULL, b, i,
                                 "invalid bkey format %u", k->format)) {
                        i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
                        memmove_u64s_down(k, bkey_next(k),
@@ -836,8 +703,8 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
                        char buf[160];
 
                        bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c);
-                       btree_err(BTREE_ERR_FIXABLE, c, b, i,
-                                 "invalid bkey:\n%s\n%s", invalid, buf);
+                       btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i,
+                                 "invalid bkey: %s\n%s", invalid, buf);
 
                        i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
                        memmove_u64s_down(k, bkey_next(k),
@@ -850,18 +717,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
                                    BSET_BIG_ENDIAN(i), write,
                                    &b->format, k);
 
-               /*
-                * with the separate whiteouts thing (used for extents), the
-                * second set of keys actually can have whiteouts too, so we
-                * can't solely go off bkey_whiteout()...
-                */
-
-               if (!seen_non_whiteout &&
-                   (!bkey_whiteout(k) ||
-                    (prev && bkey_iter_cmp(b, prev, k) > 0))) {
-                       *whiteout_u64s = k->_data - i->_data;
-                       seen_non_whiteout = true;
-               } else if (prev && bkey_iter_cmp(b, prev, k) > 0) {
+               if (prev && bkey_iter_cmp(b, prev, k) > 0) {
                        char buf1[80];
                        char buf2[80];
                        struct bkey up = bkey_unpack_key(b, prev);
@@ -870,20 +726,26 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
                        bch2_bkey_to_text(&PBUF(buf2), u.k);
 
                        bch2_dump_bset(c, b, i, 0);
-                       btree_err(BTREE_ERR_FATAL, c, b, i,
-                                 "keys out of order: %s > %s",
-                                 buf1, buf2);
-                       /* XXX: repair this */
+
+                       if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i,
+                                     "keys out of order: %s > %s",
+                                     buf1, buf2)) {
+                               i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
+                               memmove_u64s_down(k, bkey_next(k),
+                                                 (u64 *) vstruct_end(i) - (u64 *) k);
+                               continue;
+                       }
                }
 
                prev = k;
-               k = bkey_next_skip_noops(k, vstruct_last(i));
+               k = bkey_next(k);
        }
 fsck_err:
        return ret;
 }
 
-int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry)
+int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
+                             struct btree *b, bool have_retry)
 {
        struct btree_node_entry *bne;
        struct sort_iter *iter;
@@ -895,20 +757,22 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
        unsigned u64s;
        int ret, retry_read = 0, write = READ;
 
+       b->version_ondisk = U16_MAX;
+
        iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
        sort_iter_init(iter, b);
        iter->size = (btree_blocks(c) + 1) * 2;
 
        if (bch2_meta_read_fault("btree"))
-               btree_err(BTREE_ERR_MUST_RETRY, c, b, NULL,
+               btree_err(BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
                          "dynamic fault");
 
        btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
-                    BTREE_ERR_MUST_RETRY, c, b, NULL,
+                    BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
                     "bad magic");
 
        btree_err_on(!b->data->keys.seq,
-                    BTREE_ERR_MUST_RETRY, c, b, NULL,
+                    BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
                     "bad btree header");
 
        if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
@@ -916,7 +780,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
                        &bkey_i_to_btree_ptr_v2(&b->key)->v;
 
                btree_err_on(b->data->keys.seq != bp->seq,
-                            BTREE_ERR_MUST_RETRY, c, b, NULL,
+                            BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
                             "got wrong btree node (seq %llx want %llx)",
                             b->data->keys.seq, bp->seq);
        }
@@ -931,7 +795,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
                        i = &b->data->keys;
 
                        btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
-                                    BTREE_ERR_WANT_RETRY, c, b, i,
+                                    BTREE_ERR_WANT_RETRY, c, ca, b, i,
                                     "unknown checksum type %llu",
                                     BSET_CSUM_TYPE(i));
 
@@ -939,16 +803,15 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
                        csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
 
                        btree_err_on(bch2_crc_cmp(csum, b->data->csum),
-                                    BTREE_ERR_WANT_RETRY, c, b, i,
+                                    BTREE_ERR_WANT_RETRY, c, ca, b, i,
                                     "invalid checksum");
 
                        bset_encrypt(c, i, b->written << 9);
 
-                       if (btree_node_is_extents(b) &&
-                           !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) {
-                               set_btree_node_old_extent_overwrite(b);
-                               set_btree_node_need_rewrite(b);
-                       }
+                       btree_err_on(btree_node_is_extents(b) &&
+                                    !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
+                                    BTREE_ERR_FATAL, c, NULL, b, NULL,
+                                    "btree node does not have NEW_EXTENT_OVERWRITE set");
 
                        sectors = vstruct_sectors(b->data, c->block_bits);
                } else {
@@ -959,7 +822,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
                                break;
 
                        btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
-                                    BTREE_ERR_WANT_RETRY, c, b, i,
+                                    BTREE_ERR_WANT_RETRY, c, ca, b, i,
                                     "unknown checksum type %llu",
                                     BSET_CSUM_TYPE(i));
 
@@ -967,7 +830,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
                        csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
 
                        btree_err_on(bch2_crc_cmp(csum, bne->csum),
-                                    BTREE_ERR_WANT_RETRY, c, b, i,
+                                    BTREE_ERR_WANT_RETRY, c, ca, b, i,
                                     "invalid checksum");
 
                        bset_encrypt(c, i, b->written << 9);
@@ -975,7 +838,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
                        sectors = vstruct_sectors(bne, c->block_bits);
                }
 
-               ret = validate_bset(c, b, i, sectors,
+               b->version_ondisk = min(b->version_ondisk,
+                                       le16_to_cpu(i->version));
+
+               ret = validate_bset(c, ca, b, i, sectors,
                                    READ, have_retry);
                if (ret)
                        goto fsck_err;
@@ -997,7 +863,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
                                        true);
 
                btree_err_on(blacklisted && first,
-                            BTREE_ERR_FIXABLE, c, b, i,
+                            BTREE_ERR_FIXABLE, c, ca, b, i,
                             "first btree node bset has blacklisted journal seq");
                if (blacklisted && !first)
                        continue;
@@ -1014,7 +880,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
             bset_byte_offset(b, bne) < btree_bytes(c);
             bne = (void *) bne + block_bytes(c))
                btree_err_on(bne->keys.seq == b->data->keys.seq,
-                            BTREE_ERR_WANT_RETRY, c, b, NULL,
+                            BTREE_ERR_WANT_RETRY, c, ca, b, NULL,
                             "found bset signature after last bset");
 
        sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool);
@@ -1022,9 +888,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
 
        set_btree_bset(b, b->set, &b->data->keys);
 
-       b->nr = (btree_node_old_extent_overwrite(b)
-                ? bch2_extent_sort_fix_overlapping
-                : bch2_key_sort_fix_overlapping)(c, &sorted->keys, iter);
+       b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter);
 
        u64s = le16_to_cpu(sorted->keys.u64s);
        *sorted = *b->data;
@@ -1044,12 +908,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
                const char *invalid = bch2_bkey_val_invalid(c, u.s_c);
 
                if (invalid ||
-                   (inject_invalid_keys(c) &&
+                   (bch2_inject_invalid_keys &&
                     !bversion_cmp(u.k->version, MAX_VERSION))) {
                        char buf[160];
 
                        bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c);
-                       btree_err(BTREE_ERR_FIXABLE, c, b, i,
+                       btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i,
                                  "invalid bkey %s: %s", buf, invalid);
 
                        btree_keys_account_key_drop(&b->nr, 0, k);
@@ -1067,7 +931,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
                        bp.v->mem_ptr = 0;
                }
 
-               k = bkey_next_skip_noops(k, vstruct_last(i));
+               k = bkey_next(k);
        }
 
        bch2_bset_build_aux_tree(b, b->set, false);
@@ -1079,7 +943,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
        bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
                struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
 
-               if (ca->mi.state != BCH_MEMBER_STATE_RW)
+               if (ca->mi.state != BCH_MEMBER_STATE_rw)
                        set_btree_node_need_rewrite(b);
        }
 out:
@@ -1104,6 +968,8 @@ static void btree_node_read_work(struct work_struct *work)
        struct btree *b         = rb->bio.bi_private;
        struct bio *bio         = &rb->bio;
        struct bch_io_failures failed = { .nr = 0 };
+       char buf[200];
+       struct printbuf out;
        bool can_retry;
 
        goto start;
@@ -1123,8 +989,10 @@ static void btree_node_read_work(struct work_struct *work)
                        bio->bi_status = BLK_STS_REMOVED;
                }
 start:
-               bch2_dev_io_err_on(bio->bi_status, ca, "btree read: %s",
-                                  bch2_blk_status_to_str(bio->bi_status));
+               out = PBUF(buf);
+               btree_pos_to_text(&out, c, b);
+               bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s",
+                                  bch2_blk_status_to_str(bio->bi_status), buf);
                if (rb->have_ioref)
                        percpu_ref_put(&ca->io_ref);
                rb->have_ioref = false;
@@ -1136,7 +1004,7 @@ start:
                                &failed, &rb->pick) > 0;
 
                if (!bio->bi_status &&
-                   !bch2_btree_node_read_done(c, b, can_retry))
+                   !bch2_btree_node_read_done(c, ca, b, can_retry))
                        break;
 
                if (!can_retry) {
@@ -1302,12 +1170,13 @@ static void bch2_btree_node_write_error(struct bch_fs *c,
                                        struct btree_write_bio *wbio)
 {
        struct btree *b         = wbio->wbio.bio.bi_private;
-       __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
+       struct bkey_buf k;
        struct bch_extent_ptr *ptr;
        struct btree_trans trans;
        struct btree_iter *iter;
        int ret;
 
+       bch2_bkey_buf_init(&k);
        bch2_trans_init(&trans, c, 0, 0);
 
        iter = bch2_trans_get_node_iter(&trans, b->c.btree_id, b->key.k.p,
@@ -1326,21 +1195,23 @@ retry:
 
        BUG_ON(!btree_node_hashed(b));
 
-       bkey_copy(&tmp.k, &b->key);
+       bch2_bkey_buf_copy(&k, c, &b->key);
 
-       bch2_bkey_drop_ptrs(bkey_i_to_s(&tmp.k), ptr,
+       bch2_bkey_drop_ptrs(bkey_i_to_s(k.k), ptr,
                bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
 
-       if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&tmp.k)))
+       if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(k.k)))
                goto err;
 
-       ret = bch2_btree_node_update_key(c, iter, b, &tmp.k);
+       ret = bch2_btree_node_update_key(c, iter, b, k.k);
        if (ret == -EINTR)
                goto retry;
        if (ret)
                goto err;
 out:
+       bch2_trans_iter_put(&trans, iter);
        bch2_trans_exit(&trans);
+       bch2_bkey_buf_exit(&k, c);
        bio_put(&wbio->wbio.bio);
        btree_node_write_done(c, b);
        return;
@@ -1408,7 +1279,7 @@ static void btree_node_write_endio(struct bio *bio)
        if (wbio->have_ioref)
                bch2_latency_acct(ca, wbio->submit_time, WRITE);
 
-       if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write: %s",
+       if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write error: %s",
                               bch2_blk_status_to_str(bio->bi_status)) ||
            bch2_meta_write_fault("btree")) {
                spin_lock_irqsave(&c->btree_write_error_lock, flags);
@@ -1437,13 +1308,15 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
        unsigned whiteout_u64s = 0;
        int ret;
 
-       if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_BTREE))
+       if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_btree))
                return -1;
 
-       ret = validate_bset(c, b, i, sectors, WRITE, false) ?:
-               validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false);
-       if (ret)
+       ret = validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false) ?:
+               validate_bset(c, NULL, b, i, sectors, WRITE, false);
+       if (ret) {
                bch2_inconsistent_error(c);
+               dump_stack();
+       }
 
        return ret;
 }
@@ -1456,7 +1329,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
        struct bset *i;
        struct btree_node *bn = NULL;
        struct btree_node_entry *bne = NULL;
-       BKEY_PADDED(key) k;
+       struct bkey_buf k;
        struct bch_extent_ptr *ptr;
        struct sort_iter sort_iter;
        struct nonce nonce;
@@ -1467,6 +1340,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
        bool validate_before_checksum = false;
        void *data;
 
+       bch2_bkey_buf_init(&k);
+
        if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
                return;
 
@@ -1486,6 +1361,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
                if (!btree_node_may_write(b))
                        return;
 
+               if (old & (1 << BTREE_NODE_never_write))
+                       return;
+
                if (old & (1 << BTREE_NODE_write_in_flight)) {
                        btree_node_wait_on_io(b);
                        continue;
@@ -1498,6 +1376,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
                new ^=  (1 << BTREE_NODE_write_idx);
        } while (cmpxchg_acquire(&b->flags, old, new) != old);
 
+       atomic_dec(&c->btree_cache.dirty);
+
        BUG_ON(btree_node_fake(b));
        BUG_ON((b->will_make_reachable != 0) != !b->written);
 
@@ -1530,6 +1410,11 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
                seq = max(seq, le64_to_cpu(i->journal_seq));
        }
 
+       BUG_ON(b->written && !seq);
+
+       /* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */
+       bytes += 8;
+
        data = btree_bounce_alloc(c, bytes, &used_mempool);
 
        if (!b->written) {
@@ -1545,24 +1430,14 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
        i->journal_seq  = cpu_to_le64(seq);
        i->u64s         = 0;
 
-       if (!btree_node_old_extent_overwrite(b)) {
-               sort_iter_add(&sort_iter,
-                             unwritten_whiteouts_start(c, b),
-                             unwritten_whiteouts_end(c, b));
-               SET_BSET_SEPARATE_WHITEOUTS(i, false);
-       } else {
-               memcpy_u64s(i->start,
-                           unwritten_whiteouts_start(c, b),
-                           b->whiteout_u64s);
-               i->u64s = cpu_to_le16(b->whiteout_u64s);
-               SET_BSET_SEPARATE_WHITEOUTS(i, true);
-       }
+       sort_iter_add(&sort_iter,
+                     unwritten_whiteouts_start(c, b),
+                     unwritten_whiteouts_end(c, b));
+       SET_BSET_SEPARATE_WHITEOUTS(i, false);
 
        b->whiteout_u64s = 0;
 
-       u64s = btree_node_old_extent_overwrite(b)
-               ? bch2_sort_extents(vstruct_last(i), &sort_iter, false)
-               : bch2_sort_keys(i->start, &sort_iter, false);
+       u64s = bch2_sort_keys(i->start, &sort_iter, false);
        le16_add_cpu(&i->u64s, u64s);
 
        set_needs_whiteout(i, false);
@@ -1590,7 +1465,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
                validate_before_checksum = true;
 
        /* validate_bset will be modifying: */
-       if (le16_to_cpu(i->version) < bcachefs_metadata_version_max)
+       if (le16_to_cpu(i->version) < bcachefs_metadata_version_current)
                validate_before_checksum = true;
 
        /* if we're going to be encrypting, check metadata validity first: */
@@ -1665,15 +1540,19 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
         * just make all btree node writes FUA to keep things sane.
         */
 
-       bkey_copy(&k.key, &b->key);
+       bch2_bkey_buf_copy(&k, c, &b->key);
 
-       bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&k.key)), ptr)
+       bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(k.k)), ptr)
                ptr->offset += b->written;
 
        b->written += sectors_to_write;
 
+       atomic64_inc(&c->btree_writes_nr);
+       atomic64_add(sectors_to_write, &c->btree_writes_sectors);
+
        /* XXX: submitting IO with btree locks held: */
-       bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, &k.key);
+       bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, k.k);
+       bch2_bkey_buf_exit(&k, c);
        return;
 err:
        set_btree_node_noevict(b);
@@ -1793,23 +1672,6 @@ void bch2_btree_flush_all_writes(struct bch_fs *c)
        __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
 }
 
-void bch2_btree_verify_flushed(struct bch_fs *c)
-{
-       struct bucket_table *tbl;
-       struct rhash_head *pos;
-       struct btree *b;
-       unsigned i;
-
-       rcu_read_lock();
-       for_each_cached_btree(b, c, tbl, i, pos) {
-               unsigned long flags = READ_ONCE(b->flags);
-
-               BUG_ON((flags & (1 << BTREE_NODE_dirty)) ||
-                      (flags & (1 << BTREE_NODE_write_in_flight)));
-       }
-       rcu_read_unlock();
-}
-
 void bch2_dirty_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c)
 {
        struct bucket_table *tbl;
index 626d0f071b7008d7f9f76f89df0a1bda34adb2b0..9c14cd30a09e18d112c94e680cdbe80279b0852b 100644 (file)
@@ -14,6 +14,23 @@ struct btree_write;
 struct btree;
 struct btree_iter;
 
+static inline bool btree_node_dirty(struct btree *b)
+{
+       return test_bit(BTREE_NODE_dirty, &b->flags);
+}
+
+static inline void set_btree_node_dirty(struct bch_fs *c, struct btree *b)
+{
+       if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags))
+               atomic_inc(&c->btree_cache.dirty);
+}
+
+static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b)
+{
+       if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags))
+               atomic_dec(&c->btree_cache.dirty);
+}
+
 struct btree_read_bio {
        struct bch_fs           *c;
        u64                     start_time;
@@ -117,7 +134,8 @@ void bch2_btree_build_aux_trees(struct btree *);
 void bch2_btree_init_next(struct bch_fs *, struct btree *,
                         struct btree_iter *);
 
-int bch2_btree_node_read_done(struct bch_fs *, struct btree *, bool);
+int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *,
+                             struct btree *, bool);
 void bch2_btree_node_read(struct bch_fs *, struct btree *, bool);
 int bch2_btree_root_read(struct bch_fs *, enum btree_id,
                         const struct bkey_i *, unsigned);
@@ -168,20 +186,29 @@ do {                                                                      \
 
 void bch2_btree_flush_all_reads(struct bch_fs *);
 void bch2_btree_flush_all_writes(struct bch_fs *);
-void bch2_btree_verify_flushed(struct bch_fs *);
 void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *);
 
 static inline void compat_bformat(unsigned level, enum btree_id btree_id,
-                                unsigned version, unsigned big_endian,
-                                int write, struct bkey_format *f)
+                                 unsigned version, unsigned big_endian,
+                                 int write, struct bkey_format *f)
 {
        if (version < bcachefs_metadata_version_inode_btree_change &&
-           btree_id == BTREE_ID_INODES) {
+           btree_id == BTREE_ID_inodes) {
                swap(f->bits_per_field[BKEY_FIELD_INODE],
                     f->bits_per_field[BKEY_FIELD_OFFSET]);
                swap(f->field_offset[BKEY_FIELD_INODE],
                     f->field_offset[BKEY_FIELD_OFFSET]);
        }
+
+       if (version < bcachefs_metadata_version_snapshot &&
+           (level || btree_type_has_snapshots(btree_id))) {
+               u64 max_packed =
+                       ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]);
+
+               f->field_offset[BKEY_FIELD_SNAPSHOT] = write
+                       ? 0
+                       : U32_MAX - max_packed;
+       }
 }
 
 static inline void compat_bpos(unsigned level, enum btree_id btree_id,
@@ -192,7 +219,7 @@ static inline void compat_bpos(unsigned level, enum btree_id btree_id,
                bch2_bpos_swab(p);
 
        if (version < bcachefs_metadata_version_inode_btree_change &&
-           btree_id == BTREE_ID_INODES)
+           btree_id == BTREE_ID_inodes)
                swap(p->inode, p->offset);
 }
 
@@ -203,18 +230,26 @@ static inline void compat_btree_node(unsigned level, enum btree_id btree_id,
 {
        if (version < bcachefs_metadata_version_inode_btree_change &&
            btree_node_type_is_extents(btree_id) &&
-           bkey_cmp(bn->min_key, POS_MIN) &&
+           bpos_cmp(bn->min_key, POS_MIN) &&
            write)
-               bn->min_key = bkey_predecessor(bn->min_key);
+               bn->min_key = bpos_nosnap_predecessor(bn->min_key);
+
+       if (version < bcachefs_metadata_version_snapshot &&
+           write)
+               bn->max_key.snapshot = 0;
 
        compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key);
        compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key);
 
+       if (version < bcachefs_metadata_version_snapshot &&
+           !write)
+               bn->max_key.snapshot = U32_MAX;
+
        if (version < bcachefs_metadata_version_inode_btree_change &&
            btree_node_type_is_extents(btree_id) &&
-           bkey_cmp(bn->min_key, POS_MIN) &&
+           bpos_cmp(bn->min_key, POS_MIN) &&
            !write)
-               bn->min_key = bkey_successor(bn->min_key);
+               bn->min_key = bpos_nosnap_successor(bn->min_key);
 }
 
 #endif /* _BCACHEFS_BTREE_IO_H */
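In compat_bformat() above, ~(~0ULL << bits) is the standard idiom for the largest value a bits-wide packed field can hold, and the read-side offset U32_MAX - max_packed biases the snapshot field so that keys from pre-snapshot nodes unpack with snapshot == U32_MAX. A small standalone sketch of that arithmetic, assuming the usual unpack rule (unpacked = field_offset + packed); demo names only, not the real bkey unpack path:

        #include <stdint.h>
        #include <stdio.h>

        /* Largest value representable in a field of `bits` bits (bits < 64) */
        static uint64_t demo_max_packed(unsigned bits)
        {
                return ~(~0ULL << bits);
        }

        int main(void)
        {
                unsigned bits = 0;      /* pre-snapshot formats pack no snapshot bits */
                uint64_t offset = UINT32_MAX - demo_max_packed(bits);

                /* read side: unpacked = field_offset + packed */
                printf("packed 0 unpacks to snapshot %llu\n",
                       (unsigned long long)(offset + 0));  /* 4294967295 == U32_MAX */
                return 0;
        }

The write direction sets the offset back to 0, matching the `write ? 0 : U32_MAX - max_packed` branch above.
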
index 6fab76c3220c55f64687d93299ac942d941cb84d..425c9ad779e326b7b3ea32c01f54046bd0b73891 100644 (file)
@@ -2,18 +2,53 @@
 
 #include "bcachefs.h"
 #include "bkey_methods.h"
+#include "bkey_buf.h"
 #include "btree_cache.h"
 #include "btree_iter.h"
 #include "btree_key_cache.h"
 #include "btree_locking.h"
 #include "btree_update.h"
 #include "debug.h"
+#include "error.h"
 #include "extents.h"
 #include "journal.h"
+#include "replicas.h"
 
 #include <linux/prefetch.h>
 #include <trace/events/bcachefs.h>
 
+static void btree_iter_set_search_pos(struct btree_iter *, struct bpos);
+
+static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
+{
+       EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES);
+
+       /* Are we iterating over keys in all snapshots? */
+       if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
+               p = bpos_successor(p);
+       } else {
+               p = bpos_nosnap_successor(p);
+               p.snapshot = iter->snapshot;
+       }
+
+       return p;
+}
+
+static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
+{
+       EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES);
+
+       /* Are we iterating over keys in all snapshots? */
+       if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
+               p = bpos_predecessor(p);
+       } else {
+               p = bpos_nosnap_predecessor(p);
+               p.snapshot = iter->snapshot;
+       }
+
+       return p;
+}
+
 static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
 {
        return l < BTREE_MAX_DEPTH &&
@@ -26,20 +61,20 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
 
        if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
            bkey_cmp(pos, POS_MAX))
-               pos = bkey_successor(pos);
+               pos = bkey_successor(iter, pos);
        return pos;
 }
 
 static inline bool btree_iter_pos_before_node(struct btree_iter *iter,
                                              struct btree *b)
 {
-       return bkey_cmp(btree_iter_search_key(iter), b->data->min_key) < 0;
+       return bpos_cmp(iter->real_pos, b->data->min_key) < 0;
 }
 
 static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
                                             struct btree *b)
 {
-       return bkey_cmp(b->key.k.p, btree_iter_search_key(iter)) < 0;
+       return bpos_cmp(b->key.k.p, iter->real_pos) < 0;
 }
 
 static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
@@ -197,13 +232,14 @@ static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b,
 bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
                            unsigned level, struct btree_iter *iter,
                            enum six_lock_type type,
-                           six_lock_should_sleep_fn should_sleep_fn,
-                           void *p)
+                           six_lock_should_sleep_fn should_sleep_fn, void *p,
+                           unsigned long ip)
 {
        struct btree_trans *trans = iter->trans;
-       struct btree_iter *linked;
+       struct btree_iter *linked, *deadlock_iter = NULL;
        u64 start_time = local_clock();
-       bool ret = true;
+       unsigned reason = 9;
+       bool ret;
 
        /* Check if it's safe to block: */
        trans_for_each_iter(trans, linked) {
@@ -224,15 +260,33 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
                 */
                if (type == SIX_LOCK_intent &&
                    linked->nodes_locked != linked->nodes_intent_locked) {
-                       if (!(trans->nounlock)) {
-                               linked->locks_want = max_t(unsigned,
-                                               linked->locks_want,
-                                               __fls(linked->nodes_locked) + 1);
-                               if (!btree_iter_get_locks(linked, true, false))
-                                       ret = false;
-                       } else {
-                               ret = false;
+                       linked->locks_want = max_t(unsigned,
+                                       linked->locks_want,
+                                       __fls(linked->nodes_locked) + 1);
+                       if (!btree_iter_get_locks(linked, true, false)) {
+                               deadlock_iter = linked;
+                               reason = 1;
+                       }
+               }
+
+               if (linked->btree_id != iter->btree_id) {
+                       if (linked->btree_id > iter->btree_id) {
+                               deadlock_iter = linked;
+                               reason = 3;
                        }
+                       continue;
+               }
+
+               /*
+                * Within the same btree, cached iterators come before non
+                * cached iterators:
+                */
+               if (btree_iter_is_cached(linked) != btree_iter_is_cached(iter)) {
+                       if (btree_iter_is_cached(iter)) {
+                               deadlock_iter = linked;
+                               reason = 4;
+                       }
+                       continue;
                }
 
                /*
@@ -240,30 +294,24 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
                 * another iterator has possible descendants locked of the node
                 * we're about to lock, it must have the ancestors locked too:
                 */
-               if (linked->btree_id == iter->btree_id &&
-                   level > __fls(linked->nodes_locked)) {
-                       if (!(trans->nounlock)) {
-                               linked->locks_want =
-                                       max(level + 1, max_t(unsigned,
-                                           linked->locks_want,
-                                           iter->locks_want));
-                               if (!btree_iter_get_locks(linked, true, false))
-                                       ret = false;
-                       } else {
-                               ret = false;
+               if (level > __fls(linked->nodes_locked)) {
+                       linked->locks_want =
+                               max(level + 1, max_t(unsigned,
+                                   linked->locks_want,
+                                   iter->locks_want));
+                       if (!btree_iter_get_locks(linked, true, false)) {
+                               deadlock_iter = linked;
+                               reason = 5;
                        }
                }
 
                /* Must lock btree nodes in key order: */
-               if ((cmp_int(iter->btree_id, linked->btree_id) ?:
-                    -cmp_int(btree_iter_type(iter), btree_iter_type(linked))) < 0)
-                       ret = false;
-
-               if (iter->btree_id == linked->btree_id &&
-                   btree_node_locked(linked, level) &&
-                   bkey_cmp(pos, btree_node_pos((void *) linked->l[level].b,
-                                                btree_iter_type(linked))) <= 0)
-                       ret = false;
+               if (btree_node_locked(linked, level) &&
+                   bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b,
+                                                btree_iter_type(linked))) <= 0) {
+                       deadlock_iter = linked;
+                       reason = 7;
+               }
 
                /*
                 * Recheck if this is a node we already have locked - since one
@@ -277,20 +325,36 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
                }
        }
 
-       if (unlikely(!ret)) {
-               trace_trans_restart_would_deadlock(iter->trans->ip);
+       if (unlikely(deadlock_iter)) {
+               trace_trans_restart_would_deadlock(iter->trans->ip, ip,
+                               reason,
+                               deadlock_iter->btree_id,
+                               btree_iter_type(deadlock_iter),
+                               iter->btree_id,
+                               btree_iter_type(iter));
                return false;
        }
 
        if (six_trylock_type(&b->c.lock, type))
                return true;
 
-       if (six_lock_type(&b->c.lock, type, should_sleep_fn, p))
-               return false;
+#ifdef CONFIG_BCACHEFS_DEBUG
+       trans->locking_iter_idx = iter->idx;
+       trans->locking_pos      = pos;
+       trans->locking_btree_id = iter->btree_id;
+       trans->locking_level    = level;
+       trans->locking          = b;
+#endif
 
-       bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)],
-                              start_time);
-       return true;
+       ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0;
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+       trans->locking = NULL;
+#endif
+       if (ret)
+               bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)],
+                                      start_time);
+       return ret;
 }
 
 /* Btree iterator locking: */
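The rewritten __bch2_btree_node_lock() above no longer just returns false on a would-be deadlock: it records which linked iterator conflicts and a numeric reason, emits the expanded trans_restart_would_deadlock tracepoint, and lets the transaction restart. The ordering it enforces on already-held locks is: ascending btree_id, then (within one btree) cached iterators before non-cached ones, then ascending key position at a given level. A hedged standalone comparator sketching that ordering with a made-up struct, not the real btree_iter:

        #include <stdbool.h>
        #include <stdint.h>

        struct demo_lock_pos {
                unsigned btree_id;
                bool     cached;
                uint64_t inode, offset;         /* stand-in for struct bpos */
        };

        static int cmp_u64(uint64_t l, uint64_t r)
        {
                return (l > r) - (l < r);
        }

        /*
         * Locks must be taken in ascending demo_lock_order_cmp() order; trying
         * to take one that sorts before a lock already held is the situation
         * the deadlock_iter/reason bookkeeping above detects.
         */
        static int demo_lock_order_cmp(const struct demo_lock_pos *l,
                                       const struct demo_lock_pos *r)
        {
                return  cmp_u64(l->btree_id, r->btree_id) ?:
                        ((int) !l->cached - (int) !r->cached) ?:  /* cached first */
                        cmp_u64(l->inode, r->inode) ?:
                        cmp_u64(l->offset, r->offset);
        }

This is presumably in the same spirit as the btree_iter_lock_cmp() sort that __btree_iter_traverse_all() uses further down when re-traversing all iterators.
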
@@ -319,7 +383,7 @@ void bch2_btree_trans_verify_locks(struct btree_trans *trans)
 {
        struct btree_iter *iter;
 
-       trans_for_each_iter_all(trans, iter)
+       trans_for_each_iter(trans, iter)
                bch2_btree_iter_verify_locks(iter);
 }
 #else
@@ -360,50 +424,25 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
        return false;
 }
 
-bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *iter,
-                                       unsigned new_locks_want)
+void __bch2_btree_iter_downgrade(struct btree_iter *iter,
+                                unsigned new_locks_want)
 {
-       unsigned l = iter->level;
+       unsigned l;
 
-       EBUG_ON(iter->locks_want >= new_locks_want);
+       EBUG_ON(iter->locks_want < new_locks_want);
 
        iter->locks_want = new_locks_want;
 
-       do {
-               if (!btree_iter_node(iter, l))
-                       break;
-
-               if (!bch2_btree_node_upgrade(iter, l)) {
-                       iter->locks_want = l;
-                       return false;
-               }
-
-               l++;
-       } while (l < iter->locks_want);
-
-       return true;
-}
-
-void __bch2_btree_iter_downgrade(struct btree_iter *iter,
-                                unsigned downgrade_to)
-{
-       unsigned l, new_locks_want = downgrade_to ?:
-               (iter->flags & BTREE_ITER_INTENT ? 1 : 0);
-
-       if (iter->locks_want < downgrade_to) {
-               iter->locks_want = new_locks_want;
-
-               while (iter->nodes_locked &&
-                      (l = __fls(iter->nodes_locked)) >= iter->locks_want) {
-                       if (l > iter->level) {
-                               btree_node_unlock(iter, l);
-                       } else {
-                               if (btree_node_intent_locked(iter, l)) {
-                                       six_lock_downgrade(&iter->l[l].b->c.lock);
-                                       iter->nodes_intent_locked ^= 1 << l;
-                               }
-                               break;
+       while (iter->nodes_locked &&
+              (l = __fls(iter->nodes_locked)) >= iter->locks_want) {
+               if (l > iter->level) {
+                       btree_node_unlock(iter, l);
+               } else {
+                       if (btree_node_intent_locked(iter, l)) {
+                               six_lock_downgrade(&iter->l[l].b->c.lock);
+                               iter->nodes_intent_locked ^= 1 << l;
                        }
+                       break;
                }
        }
 
@@ -423,13 +462,12 @@ void bch2_trans_downgrade(struct btree_trans *trans)
 bool bch2_trans_relock(struct btree_trans *trans)
 {
        struct btree_iter *iter;
-       bool ret = true;
 
        trans_for_each_iter(trans, iter)
-               if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
-                       ret &= bch2_btree_iter_relock(iter, true);
-
-       return ret;
+               if (btree_iter_keep(trans, iter) &&
+                   !bch2_btree_iter_relock(iter, true))
+                       return false;
+       return true;
 }
 
 void bch2_trans_unlock(struct btree_trans *trans)
@@ -463,17 +501,20 @@ static void bch2_btree_iter_verify_cached(struct btree_iter *iter)
 static void bch2_btree_iter_verify_level(struct btree_iter *iter,
                                         unsigned level)
 {
-       struct bpos pos = btree_iter_search_key(iter);
-       struct btree_iter_level *l = &iter->l[level];
-       struct btree_node_iter tmp = l->iter;
-       bool locked = btree_node_locked(iter, level);
+       struct btree_iter_level *l;
+       struct btree_node_iter tmp;
+       bool locked;
        struct bkey_packed *p, *k;
-       char buf1[100], buf2[100];
+       char buf1[100], buf2[100], buf3[100];
        const char *msg;
 
-       if (!debug_check_iterators(iter->trans->c))
+       if (!bch2_debug_check_iterators)
                return;
 
+       l       = &iter->l[level];
+       tmp     = l->iter;
+       locked  = btree_node_locked(iter, level);
+
        if (btree_iter_type(iter) == BTREE_ITER_CACHED) {
                if (!level)
                        bch2_btree_iter_verify_cached(iter);
@@ -488,12 +529,7 @@ static void bch2_btree_iter_verify_level(struct btree_iter *iter,
        if (!bch2_btree_node_relock(iter, level))
                return;
 
-       /*
-        * Ideally this invariant would always be true, and hopefully in the
-        * future it will be, but for now set_pos_same_leaf() breaks it:
-        */
-       BUG_ON(iter->uptodate < BTREE_ITER_NEED_TRAVERSE &&
-              !btree_iter_pos_in_node(iter, l->b));
+       BUG_ON(!btree_iter_pos_in_node(iter, l->b));
 
        /*
         * node iterators don't use leaf node iterator:
@@ -512,16 +548,16 @@ static void bch2_btree_iter_verify_level(struct btree_iter *iter,
         * whiteouts)
         */
        p = level || btree_node_type_is_extents(iter->btree_id)
-               ? bch2_btree_node_iter_prev_filter(&tmp, l->b, KEY_TYPE_discard)
+               ? bch2_btree_node_iter_prev(&tmp, l->b)
                : bch2_btree_node_iter_prev_all(&tmp, l->b);
        k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
 
-       if (p && bkey_iter_pos_cmp(l->b, p, &pos) >= 0) {
+       if (p && bkey_iter_pos_cmp(l->b, p, &iter->real_pos) >= 0) {
                msg = "before";
                goto err;
        }
 
-       if (k && bkey_iter_pos_cmp(l->b, k, &pos) < 0) {
+       if (k && bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) {
                msg = "after";
                goto err;
        }
@@ -530,44 +566,72 @@ unlock:
                btree_node_unlock(iter, level);
        return;
 err:
-       strcpy(buf1, "(none)");
        strcpy(buf2, "(none)");
+       strcpy(buf3, "(none)");
+
+       bch2_bpos_to_text(&PBUF(buf1), iter->real_pos);
 
        if (p) {
                struct bkey uk = bkey_unpack_key(l->b, p);
-               bch2_bkey_to_text(&PBUF(buf1), &uk);
+               bch2_bkey_to_text(&PBUF(buf2), &uk);
        }
 
        if (k) {
                struct bkey uk = bkey_unpack_key(l->b, k);
-               bch2_bkey_to_text(&PBUF(buf2), &uk);
+               bch2_bkey_to_text(&PBUF(buf3), &uk);
        }
 
        panic("iterator should be %s key at level %u:\n"
-             "iter pos %s %llu:%llu\n"
+             "iter pos %s\n"
              "prev key %s\n"
              "cur  key %s\n",
-             msg, level,
-             iter->flags & BTREE_ITER_IS_EXTENTS ? ">" : "=>",
-             iter->pos.inode, iter->pos.offset,
-             buf1, buf2);
+             msg, level, buf1, buf2, buf3);
 }
 
 static void bch2_btree_iter_verify(struct btree_iter *iter)
 {
+       enum btree_iter_type type = btree_iter_type(iter);
        unsigned i;
 
-       bch2_btree_trans_verify_locks(iter->trans);
+       EBUG_ON(iter->btree_id >= BTREE_ID_NR);
+
+       BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+              iter->pos.snapshot != iter->snapshot);
+
+       BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
+              (iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
+
+       BUG_ON(type == BTREE_ITER_NODES &&
+              !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
+
+       BUG_ON(type != BTREE_ITER_NODES &&
+              (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+              !btree_type_has_snapshots(iter->btree_id));
+
+       bch2_btree_iter_verify_locks(iter);
 
        for (i = 0; i < BTREE_MAX_DEPTH; i++)
                bch2_btree_iter_verify_level(iter, i);
 }
 
+static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
+{
+       enum btree_iter_type type = btree_iter_type(iter);
+
+       BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+              iter->pos.snapshot != iter->snapshot);
+
+       BUG_ON((type == BTREE_ITER_KEYS ||
+               type == BTREE_ITER_CACHED) &&
+              (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 ||
+               bkey_cmp(iter->pos, iter->k.p) > 0));
+}
+
 void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b)
 {
        struct btree_iter *iter;
 
-       if (!debug_check_iterators(trans->c))
+       if (!bch2_debug_check_iterators)
                return;
 
        trans_for_each_iter_with_node(trans, b, iter)
@@ -578,6 +642,7 @@ void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b)
 
 static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned l) {}
 static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
+static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
 
 #endif
 
@@ -603,12 +668,11 @@ static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
                                               struct bkey_packed *where)
 {
        struct btree_iter_level *l = &iter->l[b->c.level];
-       struct bpos pos = btree_iter_search_key(iter);
 
        if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
                return;
 
-       if (bkey_iter_pos_cmp(l->b, where, &pos) < 0)
+       if (bkey_iter_pos_cmp(l->b, where, &iter->real_pos) < 0)
                bch2_btree_node_iter_advance(&l->iter, l->b);
 
        btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
@@ -643,7 +707,6 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
        bool iter_current_key_modified =
                orig_iter_pos >= offset &&
                orig_iter_pos <= offset + clobber_u64s;
-       struct bpos iter_pos = btree_iter_search_key(iter);
 
        btree_node_iter_for_each(node_iter, set)
                if (set->end == old_end)
@@ -651,7 +714,7 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
 
        /* didn't find the bset in the iterator - might have to readd it: */
        if (new_u64s &&
-           bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) {
+           bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) {
                bch2_btree_node_iter_push(node_iter, b, where, end);
                goto fixup_done;
        } else {
@@ -666,7 +729,7 @@ found:
                return;
 
        if (new_u64s &&
-           bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) {
+           bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) {
                set->k = offset;
        } else if (set->k < offset + clobber_u64s) {
                set->k = offset + new_u64s;
@@ -739,7 +802,7 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter,
                __bch2_btree_node_iter_fix(iter, b, node_iter, t,
                                           where, clobber_u64s, new_u64s);
 
-               if (debug_check_iterators(iter->trans->c))
+               if (bch2_debug_check_iterators)
                        bch2_btree_node_iter_verify(node_iter, b);
        }
 
@@ -769,45 +832,50 @@ static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter,
 
        ret = bkey_disassemble(l->b, k, u);
 
-       if (debug_check_bkeys(iter->trans->c))
+       if (bch2_debug_check_bkeys)
                bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
 
        return ret;
 }
 
 /* peek_all() doesn't skip deleted keys */
-static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter,
-                                                   struct btree_iter_level *l,
-                                                   struct bkey *u)
+static inline struct bkey_s_c btree_iter_level_peek_all(struct btree_iter *iter,
+                                                       struct btree_iter_level *l,
+                                                       struct bkey *u)
 {
        return __btree_iter_unpack(iter, l, u,
                        bch2_btree_node_iter_peek_all(&l->iter, l->b));
 }
 
-static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter,
-                                               struct btree_iter_level *l)
+static inline struct bkey_s_c btree_iter_level_peek(struct btree_iter *iter,
+                                                   struct btree_iter_level *l)
 {
-       return __btree_iter_unpack(iter, l, &iter->k,
+       struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k,
                        bch2_btree_node_iter_peek(&l->iter, l->b));
+
+       iter->real_pos = k.k ? k.k->p : l->b->key.k.p;
+       return k;
 }
 
-static inline struct bkey_s_c __btree_iter_prev(struct btree_iter *iter,
-                                               struct btree_iter_level *l)
+static inline struct bkey_s_c btree_iter_level_prev(struct btree_iter *iter,
+                                                   struct btree_iter_level *l)
 {
-       return __btree_iter_unpack(iter, l, &iter->k,
+       struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k,
                        bch2_btree_node_iter_prev(&l->iter, l->b));
+
+       iter->real_pos = k.k ? k.k->p : l->b->data->min_key;
+       return k;
 }
 
 static inline bool btree_iter_advance_to_pos(struct btree_iter *iter,
                                             struct btree_iter_level *l,
                                             int max_advance)
 {
-       struct bpos pos = btree_iter_search_key(iter);
        struct bkey_packed *k;
        int nr_advanced = 0;
 
        while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
-              bkey_iter_pos_cmp(l->b, k, &pos) < 0) {
+              bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) {
                if (max_advance > 0 && nr_advanced >= max_advance)
                        return false;
 
@@ -845,12 +913,23 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
        if (!k ||
            bkey_deleted(k) ||
            bkey_cmp_left_packed(l->b, k, &b->key.k.p)) {
-               char buf[100];
+               char buf1[100];
+               char buf2[100];
+               char buf3[100];
+               char buf4[100];
                struct bkey uk = bkey_unpack_key(b, k);
 
-               bch2_bkey_to_text(&PBUF(buf), &uk);
-               panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n",
-                     buf, b->key.k.p.inode, b->key.k.p.offset);
+               bch2_dump_btree_node(iter->trans->c, l->b);
+               bch2_bpos_to_text(&PBUF(buf1), iter->real_pos);
+               bch2_bkey_to_text(&PBUF(buf2), &uk);
+               bch2_bpos_to_text(&PBUF(buf3), b->data->min_key);
+               bch2_bpos_to_text(&PBUF(buf4), b->data->max_key);
+               panic("parent iter doesn't point to new node:\n"
+                     "iter pos %s %s\n"
+                     "iter key %s\n"
+                     "new node %s-%s\n",
+                     bch2_btree_ids[iter->btree_id], buf1,
+                     buf2, buf3, buf4);
        }
 
        if (!parent_locked)
@@ -860,10 +939,16 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
 static inline void __btree_iter_init(struct btree_iter *iter,
                                     unsigned level)
 {
-       struct bpos pos = btree_iter_search_key(iter);
        struct btree_iter_level *l = &iter->l[level];
 
-       bch2_btree_node_iter_init(&l->iter, l->b, &pos);
+       bch2_btree_node_iter_init(&l->iter, l->b, &iter->real_pos);
+
+       /*
+        * Iterators to interior nodes should always be pointed at the first non
+        * whiteout:
+        */
+       if (level)
+               bch2_btree_node_iter_peek(&l->iter, l->b);
 
        btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
 }
@@ -919,7 +1004,7 @@ void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
 
        trans_for_each_iter(iter->trans, linked)
                if (linked->l[level].b == b) {
-                       __btree_node_unlock(linked, level);
+                       btree_node_unlock(linked, level);
                        linked->l[level].b = BTREE_ITER_NO_NODE_DROP;
                }
 }
@@ -945,7 +1030,8 @@ static int lock_root_check_fn(struct six_lock *lock, void *p)
 }
 
 static inline int btree_iter_lock_root(struct btree_iter *iter,
-                                      unsigned depth_want)
+                                      unsigned depth_want,
+                                      unsigned long trace_ip)
 {
        struct bch_fs *c = iter->trans->c;
        struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b;
@@ -974,7 +1060,8 @@ static inline int btree_iter_lock_root(struct btree_iter *iter,
                lock_type = __btree_lock_want(iter, iter->level);
                if (unlikely(!btree_node_lock(b, POS_MAX, iter->level,
                                              iter, lock_type,
-                                             lock_root_check_fn, rootp)))
+                                             lock_root_check_fn, rootp,
+                                             trace_ip)))
                        return -EINTR;
 
                if (likely(b == READ_ONCE(*rootp) &&
@@ -1002,27 +1089,32 @@ static void btree_iter_prefetch(struct btree_iter *iter)
        struct btree_iter_level *l = &iter->l[iter->level];
        struct btree_node_iter node_iter = l->iter;
        struct bkey_packed *k;
-       BKEY_PADDED(k) tmp;
+       struct bkey_buf tmp;
        unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
                ? (iter->level > 1 ? 0 :  2)
                : (iter->level > 1 ? 1 : 16);
        bool was_locked = btree_node_locked(iter, iter->level);
 
+       bch2_bkey_buf_init(&tmp);
+
        while (nr) {
                if (!bch2_btree_node_relock(iter, iter->level))
-                       return;
+                       break;
 
                bch2_btree_node_iter_advance(&node_iter, l->b);
                k = bch2_btree_node_iter_peek(&node_iter, l->b);
                if (!k)
                        break;
 
-               bch2_bkey_unpack(l->b, &tmp.k, k);
-               bch2_btree_node_prefetch(c, iter, &tmp.k, iter->level - 1);
+               bch2_bkey_buf_unpack(&tmp, c, l->b, k);
+               bch2_btree_node_prefetch(c, iter, tmp.k, iter->btree_id,
+                                        iter->level - 1);
        }
 
        if (!was_locked)
                btree_node_unlock(iter, iter->level);
+
+       bch2_bkey_buf_exit(&tmp, c);
 }
 
 static noinline void btree_node_mem_ptr_set(struct btree_iter *iter,
@@ -1046,45 +1138,45 @@ static noinline void btree_node_mem_ptr_set(struct btree_iter *iter,
                btree_node_unlock(iter, plevel);
 }
 
-static __always_inline int btree_iter_down(struct btree_iter *iter)
+static __always_inline int btree_iter_down(struct btree_iter *iter,
+                                          unsigned long trace_ip)
 {
        struct bch_fs *c = iter->trans->c;
        struct btree_iter_level *l = &iter->l[iter->level];
        struct btree *b;
        unsigned level = iter->level - 1;
        enum six_lock_type lock_type = __btree_lock_want(iter, level);
-       BKEY_PADDED(k) tmp;
+       struct bkey_buf tmp;
+       int ret;
 
        EBUG_ON(!btree_node_locked(iter, iter->level));
 
-       bch2_bkey_unpack(l->b, &tmp.k,
+       bch2_bkey_buf_init(&tmp);
+       bch2_bkey_buf_unpack(&tmp, c, l->b,
                         bch2_btree_node_iter_peek(&l->iter, l->b));
 
-       b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type);
-       if (unlikely(IS_ERR(b)))
-               return PTR_ERR(b);
+       b = bch2_btree_node_get(c, iter, tmp.k, level, lock_type, trace_ip);
+       ret = PTR_ERR_OR_ZERO(b);
+       if (unlikely(ret))
+               goto err;
 
        mark_btree_node_locked(iter, level, lock_type);
        btree_iter_node_set(iter, b);
 
-       if (tmp.k.k.type == KEY_TYPE_btree_ptr_v2 &&
-           unlikely(b != btree_node_mem_ptr(&tmp.k)))
+       if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 &&
+           unlikely(b != btree_node_mem_ptr(tmp.k)))
                btree_node_mem_ptr_set(iter, level + 1, b);
 
        if (iter->flags & BTREE_ITER_PREFETCH)
                btree_iter_prefetch(iter);
 
        iter->level = level;
-
-       return 0;
-}
-
-static void btree_iter_up(struct btree_iter *iter)
-{
-       btree_node_unlock(iter, iter->level++);
+err:
+       bch2_bkey_buf_exit(&tmp, c);
+       return ret;
 }
 
-static int btree_iter_traverse_one(struct btree_iter *);
+static int btree_iter_traverse_one(struct btree_iter *, unsigned long);
 
 static int __btree_iter_traverse_all(struct btree_trans *trans, int ret)
 {
@@ -1104,11 +1196,12 @@ retry_all:
                sorted[nr_sorted++] = iter->idx;
 
 #define btree_iter_cmp_by_idx(_l, _r)                          \
-               btree_iter_cmp(&trans->iters[_l], &trans->iters[_r])
+               btree_iter_lock_cmp(&trans->iters[_l], &trans->iters[_r])
 
        bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx);
 #undef btree_iter_cmp_by_idx
        bch2_trans_unlock(trans);
+       cond_resched();
 
        if (unlikely(ret == -ENOMEM)) {
                struct closure cl;
@@ -1139,7 +1232,7 @@ retry_all:
                if (!(trans->iters_linked & (1ULL << idx)))
                        continue;
 
-               ret = btree_iter_traverse_one(&trans->iters[idx]);
+               ret = btree_iter_traverse_one(&trans->iters[idx], _THIS_IP_);
                if (ret)
                        goto retry_all;
        }
@@ -1171,9 +1264,9 @@ static inline bool btree_iter_good_node(struct btree_iter *iter,
            !bch2_btree_node_relock(iter, l))
                return false;
 
-       if (check_pos <= 0 && btree_iter_pos_before_node(iter, iter->l[l].b))
+       if (check_pos < 0 && btree_iter_pos_before_node(iter, iter->l[l].b))
                return false;
-       if (check_pos >= 0 && btree_iter_pos_after_node(iter, iter->l[l].b))
+       if (check_pos > 0 && btree_iter_pos_after_node(iter, iter->l[l].b))
                return false;
        return true;
 }
@@ -1202,7 +1295,8 @@ static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter,
  * On error, caller (peek_node()/peek_key()) must return NULL; the error is
  * stashed in the iterator and returned from bch2_trans_exit().
  */
-static int btree_iter_traverse_one(struct btree_iter *iter)
+static int btree_iter_traverse_one(struct btree_iter *iter,
+                                  unsigned long trace_ip)
 {
        unsigned depth_want = iter->level;
 
@@ -1223,24 +1317,8 @@ static int btree_iter_traverse_one(struct btree_iter *iter)
        if (unlikely(iter->level >= BTREE_MAX_DEPTH))
                return 0;
 
-       /*
-        * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos
-        * here unnecessary
-        */
        iter->level = btree_iter_up_until_good_node(iter, 0);
 
-       /*
-        * If we've got a btree node locked (i.e. we aren't about to relock the
-        * root) - advance its node iterator if necessary:
-        *
-        * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary
-        */
-       if (is_btree_node(iter, iter->level)) {
-               BUG_ON(!btree_iter_pos_in_node(iter, iter->l[iter->level].b));
-
-               btree_iter_advance_to_pos(iter, &iter->l[iter->level], -1);
-       }
-
        /*
         * Note: iter->nodes[iter->level] may be temporarily NULL here - that
         * would indicate to other code that we got to the end of the btree,
@@ -1249,8 +1327,8 @@ static int btree_iter_traverse_one(struct btree_iter *iter)
         */
        while (iter->level > depth_want) {
                int ret = btree_iter_node(iter, iter->level)
-                       ? btree_iter_down(iter)
-                       : btree_iter_lock_root(iter, depth_want);
+                       ? btree_iter_down(iter, trace_ip)
+                       : btree_iter_lock_root(iter, depth_want, trace_ip);
                if (unlikely(ret)) {
                        if (ret == 1)
                                return 0;
@@ -1275,32 +1353,41 @@ static int btree_iter_traverse_one(struct btree_iter *iter)
        return 0;
 }
 
-int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
+static int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
 {
        struct btree_trans *trans = iter->trans;
        int ret;
 
        ret =   bch2_trans_cond_resched(trans) ?:
-               btree_iter_traverse_one(iter);
+               btree_iter_traverse_one(iter, _RET_IP_);
        if (unlikely(ret))
                ret = __btree_iter_traverse_all(trans, ret);
 
        return ret;
 }
 
-static inline void bch2_btree_iter_checks(struct btree_iter *iter)
+/*
+ * Note:
+ * bch2_btree_iter_traverse() is for external users, btree_iter_traverse() is
+ * for internal btree iterator users
+ *
+ * bch2_btree_iter_traverse sets iter->real_pos to iter->pos,
+ * btree_iter_traverse() does not:
+ */
+static inline int __must_check
+btree_iter_traverse(struct btree_iter *iter)
 {
-       enum btree_iter_type type = btree_iter_type(iter);
-
-       EBUG_ON(iter->btree_id >= BTREE_ID_NR);
+       return iter->uptodate >= BTREE_ITER_NEED_RELOCK
+               ? __bch2_btree_iter_traverse(iter)
+               : 0;
+}
 
-       BUG_ON((type == BTREE_ITER_KEYS ||
-               type == BTREE_ITER_CACHED) &&
-              (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 ||
-               bkey_cmp(iter->pos, iter->k.p) > 0));
+int __must_check
+bch2_btree_iter_traverse(struct btree_iter *iter)
+{
+       btree_iter_set_search_pos(iter, btree_iter_search_key(iter));
 
-       bch2_btree_iter_verify_locks(iter);
-       bch2_btree_iter_verify_level(iter, iter->level);
+       return btree_iter_traverse(iter);
 }
 
 /* Iterate across nodes (leaf and interior nodes) */
@@ -1311,12 +1398,9 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
        int ret;
 
        EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES);
-       bch2_btree_iter_checks(iter);
-
-       if (iter->uptodate == BTREE_ITER_UPTODATE)
-               return iter->l[iter->level].b;
+       bch2_btree_iter_verify(iter);
 
-       ret = bch2_btree_iter_traverse(iter);
+       ret = btree_iter_traverse(iter);
        if (ret)
                return NULL;
 
@@ -1324,10 +1408,9 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
        if (!b)
                return NULL;
 
-       BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0);
+       BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0);
 
-       iter->pos = b->key.k.p;
-       iter->uptodate = BTREE_ITER_UPTODATE;
+       iter->pos = iter->real_pos = b->key.k.p;
 
        bch2_btree_iter_verify(iter);
 
@@ -1340,7 +1423,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
        int ret;
 
        EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES);
-       bch2_btree_iter_checks(iter);
+       bch2_btree_iter_verify(iter);
 
        /* already got to end? */
        if (!btree_iter_node(iter, iter->level))
@@ -1348,12 +1431,12 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
 
        bch2_trans_cond_resched(iter->trans);
 
-       btree_iter_up(iter);
+       btree_node_unlock(iter, iter->level);
+       iter->l[iter->level].b = BTREE_ITER_NO_NODE_UP;
+       iter->level++;
 
-       if (!bch2_btree_node_relock(iter, iter->level))
-               btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
-
-       ret = bch2_btree_iter_traverse(iter);
+       btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
+       ret = btree_iter_traverse(iter);
        if (ret)
                return NULL;
 
@@ -1362,34 +1445,28 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
        if (!b)
                return NULL;
 
-       if (bkey_cmp(iter->pos, b->key.k.p) < 0) {
+       if (bpos_cmp(iter->pos, b->key.k.p) < 0) {
                /*
                 * Haven't gotten to the end of the parent node: go back down to
                 * the next child node
                 */
+               btree_iter_set_search_pos(iter, bpos_successor(iter->pos));
 
-               /*
-                * We don't really want to be unlocking here except we can't
-                * directly tell btree_iter_traverse() "traverse to this level"
-                * except by setting iter->level, so we have to unlock so we
-                * don't screw up our lock invariants:
-                */
-               if (btree_node_read_locked(iter, iter->level))
-                       btree_node_unlock(iter, iter->level);
-
-               iter->pos       = bkey_successor(iter->pos);
-               iter->level     = iter->min_depth;
+               /* Unlock to avoid screwing up our lock invariants: */
+               btree_node_unlock(iter, iter->level);
 
+               iter->level = iter->min_depth;
                btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
-               ret = bch2_btree_iter_traverse(iter);
+               bch2_btree_iter_verify(iter);
+
+               ret = btree_iter_traverse(iter);
                if (ret)
                        return NULL;
 
                b = iter->l[iter->level].b;
        }
 
-       iter->pos = b->key.k.p;
-       iter->uptodate = BTREE_ITER_UPTODATE;
+       iter->pos = iter->real_pos = b->key.k.p;
 
        bch2_btree_iter_verify(iter);
 
@@ -1398,43 +1475,16 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
 
 /* Iterate across keys (in leaf nodes only) */
 
-void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos)
-{
-       struct btree_iter_level *l = &iter->l[0];
-
-       EBUG_ON(iter->level != 0);
-       EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0);
-       EBUG_ON(!btree_node_locked(iter, 0));
-       EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0);
-
-       bkey_init(&iter->k);
-       iter->k.p = iter->pos = new_pos;
-       btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
-
-       btree_iter_advance_to_pos(iter, l, -1);
-
-       /*
-        * XXX:
-        * keeping a node locked that's outside (even just outside) iter->pos
-        * breaks __bch2_btree_node_lock(). This seems to only affect
-        * bch2_btree_node_get_sibling so for now it's fixed there, but we
-        * should try to get rid of this corner case.
-        *
-        * (this behaviour is currently needed for BTREE_INSERT_NOUNLOCK)
-        */
-
-       if (bch2_btree_node_iter_end(&l->iter) &&
-           btree_iter_pos_after_node(iter, l->b))
-               btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
-}
-
-static void btree_iter_pos_changed(struct btree_iter *iter, int cmp)
+static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos)
 {
+       int cmp = bpos_cmp(new_pos, iter->real_pos);
        unsigned l = iter->level;
 
        if (!cmp)
                goto out;
 
+       iter->real_pos = new_pos;
+
        if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) {
                btree_node_unlock(iter, 0);
                iter->l[0].b = BTREE_ITER_NO_NODE_UP;
@@ -1464,245 +1514,162 @@ out:
                btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
        else
                btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
+
+       bch2_btree_iter_verify(iter);
 }
 
-void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos,
-                              bool strictly_greater)
+inline bool bch2_btree_iter_advance(struct btree_iter *iter)
 {
-       struct bpos old = btree_iter_search_key(iter);
-       int cmp;
-
-       iter->flags &= ~BTREE_ITER_IS_EXTENTS;
-       iter->flags |= strictly_greater ? BTREE_ITER_IS_EXTENTS : 0;
+       struct bpos pos = iter->k.p;
+       bool ret = bpos_cmp(pos, POS_MAX) != 0;
 
-       bkey_init(&iter->k);
-       iter->k.p = iter->pos = new_pos;
-
-       cmp = bkey_cmp(btree_iter_search_key(iter), old);
-
-       btree_iter_pos_changed(iter, cmp);
+       if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+               pos = bkey_successor(iter, pos);
+       bch2_btree_iter_set_pos(iter, pos);
+       return ret;
 }
 
-void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
+inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
 {
-       int cmp = bkey_cmp(new_pos, iter->pos);
-
-       bkey_init(&iter->k);
-       iter->k.p = iter->pos = new_pos;
+       struct bpos pos = bkey_start_pos(&iter->k);
+       bool ret = bpos_cmp(pos, POS_MIN) != 0;
 
-       btree_iter_pos_changed(iter, cmp);
+       if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+               pos = bkey_predecessor(iter, pos);
+       bch2_btree_iter_set_pos(iter, pos);
+       return ret;
 }
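
The new advance/rewind helpers boil down to "step the position and report whether we were already at the end sentinel". A standalone toy version, using plain integers in place of struct bpos and ignoring the extents special case:

/* toy_advance.c - cc -std=c11 -Wall toy_advance.c */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_POS_MAX UINT64_MAX
#define TOY_POS_MIN 0

static bool toy_advance(uint64_t *pos)
{
        if (*pos == TOY_POS_MAX)
                return false;           /* already at the end */
        (*pos)++;                       /* bkey_successor() analogue */
        return true;
}

static bool toy_rewind(uint64_t *pos)
{
        if (*pos == TOY_POS_MIN)
                return false;
        (*pos)--;                       /* bkey_predecessor() analogue */
        return true;
}

int main(void)
{
        uint64_t pos = 3;
        bool ok = toy_advance(&pos);

        printf("advance: %d, pos now %llu\n", ok, (unsigned long long) pos);

        pos = TOY_POS_MAX;
        printf("advance at end: %d\n", toy_advance(&pos));

        pos = TOY_POS_MIN;
        printf("rewind at start: %d\n", toy_rewind(&pos));
        return 0;
}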
 
 static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter)
 {
-       struct btree_iter_level *l = &iter->l[0];
-       bool ret;
-
-       bkey_init(&iter->k);
-       iter->k.p = iter->pos = l->b->key.k.p;
+       struct bpos next_pos = iter->l[0].b->key.k.p;
+       bool ret = bpos_cmp(next_pos, POS_MAX) != 0;
 
-       ret = bkey_cmp(iter->pos, POS_MAX) != 0;
-       if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
-               iter->k.p = iter->pos = bkey_successor(iter->pos);
+       /*
+        * Typically, we don't want to modify iter->pos here, since that
+        * indicates where we searched from - unless we got to the end of the
+        * btree, in which case we want iter->pos to reflect that:
+        */
+       if (ret)
+               btree_iter_set_search_pos(iter, bpos_successor(next_pos));
+       else
+               bch2_btree_iter_set_pos(iter, POS_MAX);
 
-       btree_iter_pos_changed(iter, 1);
        return ret;
 }
 
 static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter)
 {
-       struct btree_iter_level *l = &iter->l[0];
-       bool ret;
-
-       bkey_init(&iter->k);
-       iter->k.p = iter->pos = l->b->data->min_key;
-       iter->uptodate  = BTREE_ITER_NEED_TRAVERSE;
-
-       ret = bkey_cmp(iter->pos, POS_MIN) != 0;
-       if (ret) {
-               iter->k.p = iter->pos = bkey_predecessor(iter->pos);
+       struct bpos next_pos = iter->l[0].b->data->min_key;
+       bool ret = bpos_cmp(next_pos, POS_MIN) != 0;
 
-               if (iter->flags & BTREE_ITER_IS_EXTENTS)
-                       iter->k.p = iter->pos = bkey_predecessor(iter->pos);
-       }
+       if (ret)
+               btree_iter_set_search_pos(iter, bpos_predecessor(next_pos));
+       else
+               bch2_btree_iter_set_pos(iter, POS_MIN);
 
-       btree_iter_pos_changed(iter, -1);
        return ret;
 }
 
-/**
- * btree_iter_peek_uptodate - given an iterator that is uptodate, return the key
- * it currently points to
- */
-static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter)
+static struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans,
+                                              enum btree_id btree_id, struct bpos pos)
 {
-       struct btree_iter_level *l = &iter->l[0];
-       struct bkey_s_c ret = { .k = &iter->k };
-
-       if (!bkey_deleted(&iter->k)) {
-               struct bkey_packed *_k =
-                       __bch2_btree_node_iter_peek_all(&l->iter, l->b);
-
-               ret.v = bkeyp_val(&l->b->format, _k);
-
-               if (debug_check_iterators(iter->trans->c)) {
-                       struct bkey k = bkey_unpack_key(l->b, _k);
+       struct btree_insert_entry *i;
 
-                       BUG_ON(memcmp(&k, &iter->k, sizeof(k)));
+       trans_for_each_update2(trans, i)
+               if ((cmp_int(btree_id,  i->iter->btree_id) ?:
+                    bkey_cmp(pos,      i->k->k.p)) <= 0) {
+                       if (btree_id == i->iter->btree_id)
+                               return i->k;
+                       break;
                }
 
-               if (debug_check_bkeys(iter->trans->c))
-                       bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
-       }
-
-       return ret;
+       return NULL;
 }
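
btree_trans_peek_updates() scans the transaction's pending updates, kept sorted by (btree id, position), and returns the first one in the same btree at or after the search position. The same idea over a plain sorted array (toy types, illustrative only):

/* toy_peek_updates.c - cc -std=c11 -Wall toy_peek_updates.c */
#include <stdio.h>

struct toy_update {
        int                     btree_id;
        unsigned long long      pos;
        const char              *val;
};

static const struct toy_update *
toy_peek_updates(const struct toy_update *u, int n, int btree_id,
                 unsigned long long pos)
{
        int i;

        for (i = 0; i < n; i++) {
                int c = (u[i].btree_id > btree_id) - (u[i].btree_id < btree_id);

                /* first entry at or after (btree_id, pos) decides the answer */
                if (c > 0 || (c == 0 && u[i].pos >= pos))
                        return c == 0 ? &u[i] : NULL;
        }
        return NULL;
}

int main(void)
{
        /* sorted by (btree_id, pos), as the transaction keeps them */
        const struct toy_update updates[] = {
                { 0, 5, "a" }, { 0, 9, "b" }, { 1, 2, "c" },
        };
        const struct toy_update *u = toy_peek_updates(updates, 3, 0, 6);

        printf("next update: %s at %llu\n", u ? u->val : "none", u ? u->pos : 0ULL);
        return 0;
}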
 
-/**
- * bch2_btree_iter_peek: returns first key greater than or equal to iterator's
- * current position
- */
-struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
+static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bool with_updates)
 {
-       struct btree_iter_level *l = &iter->l[0];
+       struct bpos search_key = btree_iter_search_key(iter);
+       struct bkey_i *next_update = with_updates
+               ? btree_trans_peek_updates(iter->trans, iter->btree_id, search_key)
+               : NULL;
        struct bkey_s_c k;
        int ret;
 
        EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
-       bch2_btree_iter_checks(iter);
+       bch2_btree_iter_verify(iter);
+       bch2_btree_iter_verify_entry_exit(iter);
 
-       if (iter->uptodate == BTREE_ITER_UPTODATE &&
-           !bkey_deleted(&iter->k))
-               return btree_iter_peek_uptodate(iter);
+       btree_iter_set_search_pos(iter, search_key);
 
        while (1) {
-               ret = bch2_btree_iter_traverse(iter);
+               ret = btree_iter_traverse(iter);
                if (unlikely(ret))
                        return bkey_s_c_err(ret);
 
-               k = __btree_iter_peek(iter, l);
-               if (likely(k.k))
+               k = btree_iter_level_peek(iter, &iter->l[0]);
+
+               if (next_update &&
+                   bpos_cmp(next_update->k.p, iter->real_pos) <= 0)
+                       k = bkey_i_to_s_c(next_update);
+
+               if (likely(k.k)) {
+                       if (bkey_deleted(k.k)) {
+                               btree_iter_set_search_pos(iter,
+                                               bkey_successor(iter, k.k->p));
+                               continue;
+                       }
+
                        break;
+               }
 
                if (!btree_iter_set_pos_to_next_leaf(iter))
                        return bkey_s_c_null;
        }
 
        /*
-        * iter->pos should always be equal to the key we just
-        * returned - except extents can straddle iter->pos:
+        * iter->pos should be monotonically increasing, and always be equal to
+        * the key we just returned - except extents can straddle iter->pos:
         */
-       if (!(iter->flags & BTREE_ITER_IS_EXTENTS) ||
-           bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
+       if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
                iter->pos = bkey_start_pos(k.k);
 
-       iter->uptodate = BTREE_ITER_UPTODATE;
-
-       bch2_btree_iter_verify_level(iter, 0);
+       bch2_btree_iter_verify_entry_exit(iter);
+       bch2_btree_iter_verify(iter);
        return k;
 }
 
+/**
+ * bch2_btree_iter_peek: returns first key greater than or equal to iterator's
+ * current position
+ */
+struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
+{
+       return __btree_iter_peek(iter, false);
+}
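
The unified peek loop above skips deleted keys (whiteouts) by restarting the search just past them. A standalone sketch of that control flow, with a toy key type in place of struct bkey_s_c:

/* toy_peek.c - cc -std=c11 -Wall toy_peek.c */
#include <stdbool.h>
#include <stdio.h>

struct toy_key { unsigned long long pos; bool deleted; };

static const struct toy_key *toy_peek(const struct toy_key *keys, int n,
                                      unsigned long long search)
{
        int i;
again:
        for (i = 0; i < n; i++) {
                if (keys[i].pos < search)
                        continue;
                if (keys[i].deleted) {
                        search = keys[i].pos + 1;       /* bkey_successor() analogue */
                        goto again;
                }
                return &keys[i];
        }
        return NULL;                    /* ran off the end of the btree */
}

int main(void)
{
        const struct toy_key keys[] = {
                { 1, false }, { 4, true }, { 7, false },
        };
        const struct toy_key *k = toy_peek(keys, 3, 2);

        printf("peek(2) -> %llu\n", k ? k->pos : ~0ULL); /* 7: the whiteout at 4 is skipped */
        return 0;
}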
+
 /**
  * bch2_btree_iter_next: returns first key greater than iterator's current
  * position
  */
 struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
 {
-       if (unlikely(!bkey_cmp(iter->k.p, POS_MAX)))
+       if (!bch2_btree_iter_advance(iter))
                return bkey_s_c_null;
 
-       bch2_btree_iter_set_pos(iter,
-               (iter->flags & BTREE_ITER_IS_EXTENTS)
-               ? iter->k.p
-               : bkey_successor(iter->k.p));
-
        return bch2_btree_iter_peek(iter);
 }
 
-static struct bkey_s_c __btree_trans_updates_peek(struct btree_iter *iter)
-{
-       struct bpos pos = btree_iter_search_key(iter);
-       struct btree_trans *trans = iter->trans;
-       struct btree_insert_entry *i;
-
-       trans_for_each_update2(trans, i)
-               if ((cmp_int(iter->btree_id,    i->iter->btree_id) ?:
-                    bkey_cmp(pos,              i->k->k.p)) <= 0)
-                       break;
-
-       return i < trans->updates2 + trans->nr_updates2 &&
-               iter->btree_id == i->iter->btree_id
-               ? bkey_i_to_s_c(i->k)
-               : bkey_s_c_null;
-}
-
-static struct bkey_s_c __bch2_btree_iter_peek_with_updates(struct btree_iter *iter)
-{
-       struct btree_iter_level *l = &iter->l[0];
-       struct bkey_s_c k = __btree_iter_peek(iter, l);
-       struct bkey_s_c u = __btree_trans_updates_peek(iter);
-
-       if (k.k && (!u.k || bkey_cmp(k.k->p, u.k->p) < 0))
-               return k;
-       if (u.k && bkey_cmp(u.k->p, l->b->key.k.p) <= 0) {
-               iter->k = *u.k;
-               return u;
-       }
-       return bkey_s_c_null;
-}
-
 struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter)
 {
-       struct bkey_s_c k;
-       int ret;
-
-       EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
-       bch2_btree_iter_checks(iter);
-
-       while (1) {
-               ret = bch2_btree_iter_traverse(iter);
-               if (unlikely(ret))
-                       return bkey_s_c_err(ret);
-
-               k = __bch2_btree_iter_peek_with_updates(iter);
-
-               if (k.k && bkey_deleted(k.k)) {
-                       bch2_btree_iter_set_pos(iter,
-                               (iter->flags & BTREE_ITER_IS_EXTENTS)
-                               ? iter->k.p
-                               : bkey_successor(iter->k.p));
-                       continue;
-               }
-
-               if (likely(k.k))
-                       break;
-
-               if (!btree_iter_set_pos_to_next_leaf(iter))
-                       return bkey_s_c_null;
-       }
-
-       /*
-        * iter->pos should always be equal to the key we just
-        * returned - except extents can straddle iter->pos:
-        */
-       if (!(iter->flags & BTREE_ITER_IS_EXTENTS) ||
-           bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
-               iter->pos = bkey_start_pos(k.k);
-
-       iter->uptodate = BTREE_ITER_UPTODATE;
-       return k;
+       return __btree_iter_peek(iter, true);
 }
 
 struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter)
 {
-       if (unlikely(!bkey_cmp(iter->k.p, POS_MAX)))
+       if (!bch2_btree_iter_advance(iter))
                return bkey_s_c_null;
 
-       bch2_btree_iter_set_pos(iter,
-               (iter->flags & BTREE_ITER_IS_EXTENTS)
-               ? iter->k.p
-               : bkey_successor(iter->k.p));
-
        return bch2_btree_iter_peek_with_updates(iter);
 }
 
@@ -1712,38 +1679,57 @@ struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter)
  */
 struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
 {
-       struct bpos pos = iter->pos;
        struct btree_iter_level *l = &iter->l[0];
        struct bkey_s_c k;
        int ret;
 
        EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
-       bch2_btree_iter_checks(iter);
+       bch2_btree_iter_verify(iter);
+       bch2_btree_iter_verify_entry_exit(iter);
 
-       if (iter->uptodate == BTREE_ITER_UPTODATE &&
-           !bkey_deleted(&iter->k))
-               return btree_iter_peek_uptodate(iter);
+       btree_iter_set_search_pos(iter, iter->pos);
 
        while (1) {
-               ret = bch2_btree_iter_traverse(iter);
-               if (unlikely(ret))
-                       return bkey_s_c_err(ret);
+               ret = btree_iter_traverse(iter);
+               if (unlikely(ret)) {
+                       k = bkey_s_c_err(ret);
+                       goto no_key;
+               }
 
-               k = __btree_iter_peek(iter, l);
-               if (!k.k || bkey_cmp(bkey_start_pos(k.k), pos) > 0)
-                       k = __btree_iter_prev(iter, l);
+               k = btree_iter_level_peek(iter, l);
+               if (!k.k ||
+                   ((iter->flags & BTREE_ITER_IS_EXTENTS)
+                    ? bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0
+                    : bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0))
+                       k = btree_iter_level_prev(iter, l);
 
                if (likely(k.k))
                        break;
 
-               if (!btree_iter_set_pos_to_prev_leaf(iter))
-                       return bkey_s_c_null;
+               if (!btree_iter_set_pos_to_prev_leaf(iter)) {
+                       k = bkey_s_c_null;
+                       goto no_key;
+               }
        }
 
-       EBUG_ON(bkey_cmp(bkey_start_pos(k.k), pos) > 0);
-       iter->pos       = bkey_start_pos(k.k);
-       iter->uptodate  = BTREE_ITER_UPTODATE;
+       EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0);
+
+       /* Extents can straddle iter->pos: */
+       if (bkey_cmp(k.k->p, iter->pos) < 0)
+               iter->pos = k.k->p;
+out:
+       bch2_btree_iter_verify_entry_exit(iter);
+       bch2_btree_iter_verify(iter);
        return k;
+no_key:
+       /*
+        * btree_iter_level_peek() may have set iter->k to a key we didn't want, and
+        * then we errored going to the previous leaf - make sure it's
+        * consistent with iter->pos:
+        */
+       bkey_init(&iter->k);
+       iter->k.p = iter->pos;
+       goto out;
 }
 
 /**
@@ -1752,81 +1738,52 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
  */
 struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
 {
-       struct bpos pos = bkey_start_pos(&iter->k);
-
-       EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
-       bch2_btree_iter_checks(iter);
-
-       if (unlikely(!bkey_cmp(pos, POS_MIN)))
+       if (!bch2_btree_iter_rewind(iter))
                return bkey_s_c_null;
 
-       bch2_btree_iter_set_pos(iter, bkey_predecessor(pos));
-
        return bch2_btree_iter_peek_prev(iter);
 }
 
 static inline struct bkey_s_c
 __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter)
 {
-       struct btree_iter_level *l = &iter->l[0];
-       struct btree_node_iter node_iter;
        struct bkey_s_c k;
-       struct bkey n;
-       int ret;
+       struct bpos pos, next_start;
 
        /* keys & holes can't span inode numbers: */
        if (iter->pos.offset == KEY_OFFSET_MAX) {
                if (iter->pos.inode == KEY_INODE_MAX)
                        return bkey_s_c_null;
 
-               bch2_btree_iter_set_pos(iter, bkey_successor(iter->pos));
-
-               ret = bch2_btree_iter_traverse(iter);
-               if (unlikely(ret))
-                       return bkey_s_c_err(ret);
+               bch2_btree_iter_set_pos(iter, bkey_successor(iter, iter->pos));
        }
 
-       /*
-        * iterator is now at the correct position for inserting at iter->pos,
-        * but we need to keep iterating until we find the first non whiteout so
-        * we know how big a hole we have, if any:
-        */
-
-       node_iter = l->iter;
-       k = __btree_iter_unpack(iter, l, &iter->k,
-               bch2_btree_node_iter_peek(&node_iter, l->b));
-
-       if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) {
-               /*
-                * We're not setting iter->uptodate because the node iterator
-                * doesn't necessarily point at the key we're returning:
-                */
+       pos = iter->pos;
+       k = bch2_btree_iter_peek(iter);
+       iter->pos = pos;
 
-               EBUG_ON(bkey_cmp(k.k->p, iter->pos) <= 0);
-               bch2_btree_iter_verify_level(iter, 0);
+       if (bkey_err(k))
                return k;
-       }
 
-       /* hole */
+       if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0)
+               return k;
 
-       if (!k.k)
-               k.k = &l->b->key.k;
+       next_start = k.k ? bkey_start_pos(k.k) : POS_MAX;
 
-       bkey_init(&n);
-       n.p = iter->pos;
-       bch2_key_resize(&n,
+       bkey_init(&iter->k);
+       iter->k.p = iter->pos;
+       bch2_key_resize(&iter->k,
                        min_t(u64, KEY_SIZE_MAX,
-                             (k.k->p.inode == n.p.inode
-                              ? bkey_start_offset(k.k)
+                             (next_start.inode == iter->pos.inode
+                              ? next_start.offset
                               : KEY_OFFSET_MAX) -
-                             n.p.offset));
+                             iter->pos.offset));
 
-       EBUG_ON(!n.size);
+       EBUG_ON(!iter->k.size);
 
-       iter->k = n;
-       iter->uptodate = BTREE_ITER_UPTODATE;
+       bch2_btree_iter_verify_entry_exit(iter);
+       bch2_btree_iter_verify(iter);
 
-       bch2_btree_iter_verify_level(iter, 0);
        return (struct bkey_s_c) { &iter->k, NULL };
 }
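
When no extent covers the search position, the code above synthesizes a zero-valued "hole" key running from iter->pos up to the start of the next extent, clamped to the current inode and to KEY_SIZE_MAX. The size arithmetic in isolation, with toy constants instead of the real bkey helpers:

/* toy_hole.c - cc -std=c11 -Wall toy_hole.c */
#include <stdio.h>

#define TOY_KEY_SIZE_MAX   ((1ULL << 32) - 1)
#define TOY_KEY_OFFSET_MAX (~0ULL)

struct toy_hole { unsigned long long offset, size; };

static struct toy_hole toy_make_hole(unsigned long long inode,
                                     unsigned long long offset,
                                     unsigned long long next_inode,
                                     unsigned long long next_start)
{
        /* holes never span inode numbers */
        unsigned long long end = next_inode == inode ? next_start : TOY_KEY_OFFSET_MAX;
        unsigned long long size = end - offset;

        if (size > TOY_KEY_SIZE_MAX)
                size = TOY_KEY_SIZE_MAX;

        return (struct toy_hole) { offset, size };
}

int main(void)
{
        struct toy_hole h = toy_make_hole(42, 100, 42, 1000);

        printf("hole at %llu, size %llu\n", h.offset, h.size); /* 100, 900 */
        return 0;
}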
 
@@ -1837,19 +1794,19 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
        int ret;
 
        EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
-       bch2_btree_iter_checks(iter);
+       bch2_btree_iter_verify(iter);
+       bch2_btree_iter_verify_entry_exit(iter);
 
-       if (iter->uptodate == BTREE_ITER_UPTODATE)
-               return btree_iter_peek_uptodate(iter);
-
-       ret = bch2_btree_iter_traverse(iter);
-       if (unlikely(ret))
-               return bkey_s_c_err(ret);
+       btree_iter_set_search_pos(iter, btree_iter_search_key(iter));
 
        if (iter->flags & BTREE_ITER_IS_EXTENTS)
                return __bch2_btree_iter_peek_slot_extents(iter);
 
-       k = __btree_iter_peek_all(iter, l, &iter->k);
+       ret = btree_iter_traverse(iter);
+       if (unlikely(ret))
+               return bkey_s_c_err(ret);
+
+       k = btree_iter_level_peek_all(iter, l, &iter->k);
 
        EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0);
 
@@ -1860,20 +1817,23 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
                k = (struct bkey_s_c) { &iter->k, NULL };
        }
 
-       iter->uptodate = BTREE_ITER_UPTODATE;
-       bch2_btree_iter_verify_level(iter, 0);
+       bch2_btree_iter_verify_entry_exit(iter);
+       bch2_btree_iter_verify(iter);
        return k;
 }
 
 struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
 {
-       if (unlikely(!bkey_cmp(iter->k.p, POS_MAX)))
+       if (!bch2_btree_iter_advance(iter))
                return bkey_s_c_null;
 
-       bch2_btree_iter_set_pos(iter,
-               (iter->flags & BTREE_ITER_IS_EXTENTS)
-               ? iter->k.p
-               : bkey_successor(iter->k.p));
+       return bch2_btree_iter_peek_slot(iter);
+}
+
+struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
+{
+       if (!bch2_btree_iter_rewind(iter))
+               return bkey_s_c_null;
 
        return bch2_btree_iter_peek_slot(iter);
 }
@@ -1884,9 +1844,9 @@ struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter)
        int ret;
 
        EBUG_ON(btree_iter_type(iter) != BTREE_ITER_CACHED);
-       bch2_btree_iter_checks(iter);
+       bch2_btree_iter_verify(iter);
 
-       ret = bch2_btree_iter_traverse(iter);
+       ret = btree_iter_traverse(iter);
        if (unlikely(ret))
                return bkey_s_c_err(ret);
 
@@ -1900,26 +1860,17 @@ struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter)
 }
 
 static inline void bch2_btree_iter_init(struct btree_trans *trans,
-                       struct btree_iter *iter, enum btree_id btree_id,
-                       struct bpos pos, unsigned flags)
+                       struct btree_iter *iter, enum btree_id btree_id)
 {
        struct bch_fs *c = trans->c;
        unsigned i;
 
-       if (btree_node_type_is_extents(btree_id) &&
-           !(flags & BTREE_ITER_NODES))
-               flags |= BTREE_ITER_IS_EXTENTS;
-
        iter->trans                     = trans;
-       iter->pos                       = pos;
-       bkey_init(&iter->k);
-       iter->k.p                       = pos;
-       iter->flags                     = flags;
        iter->uptodate                  = BTREE_ITER_NEED_TRAVERSE;
        iter->btree_id                  = btree_id;
        iter->level                     = 0;
        iter->min_depth                 = 0;
-       iter->locks_want                = flags & BTREE_ITER_INTENT ? 1 : 0;
+       iter->locks_want                = 0;
        iter->nodes_locked              = 0;
        iter->nodes_intent_locked       = 0;
        for (i = 0; i < ARRAY_SIZE(iter->l); i++)
@@ -1948,6 +1899,7 @@ int bch2_trans_iter_put(struct btree_trans *trans,
                return 0;
 
        BUG_ON(trans->iters + iter->idx != iter);
+       BUG_ON(!btree_iter_live(trans, iter));
 
        ret = btree_iter_err(iter);
 
@@ -1965,115 +1917,51 @@ int bch2_trans_iter_free(struct btree_trans *trans,
        if (IS_ERR_OR_NULL(iter))
                return 0;
 
-       trans->iters_touched &= ~(1ULL << iter->idx);
+       set_btree_iter_dontneed(trans, iter);
 
        return bch2_trans_iter_put(trans, iter);
 }
 
-static int bch2_trans_realloc_iters(struct btree_trans *trans,
-                                   unsigned new_size)
+noinline __cold
+static void btree_trans_iter_alloc_fail(struct btree_trans *trans)
 {
-       void *p, *new_iters, *new_updates, *new_updates2;
-       size_t iters_bytes;
-       size_t updates_bytes;
-
-       new_size = roundup_pow_of_two(new_size);
-
-       BUG_ON(new_size > BTREE_ITER_MAX);
-
-       if (new_size <= trans->size)
-               return 0;
-
-       BUG_ON(trans->used_mempool);
 
-       bch2_trans_unlock(trans);
-
-       iters_bytes     = sizeof(struct btree_iter) * new_size;
-       updates_bytes   = sizeof(struct btree_insert_entry) * new_size;
-
-       p = kmalloc(iters_bytes +
-                   updates_bytes +
-                   updates_bytes, GFP_NOFS);
-       if (p)
-               goto success;
-
-       p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
-       new_size = BTREE_ITER_MAX;
-
-       trans->used_mempool = true;
-success:
-       new_iters       = p; p += iters_bytes;
-       new_updates     = p; p += updates_bytes;
-       new_updates2    = p; p += updates_bytes;
-
-       memcpy(new_iters, trans->iters,
-              sizeof(struct btree_iter) * trans->nr_iters);
-       memcpy(new_updates, trans->updates,
-              sizeof(struct btree_insert_entry) * trans->nr_updates);
-       memcpy(new_updates2, trans->updates2,
-              sizeof(struct btree_insert_entry) * trans->nr_updates2);
-
-       if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
-               memset(trans->iters, POISON_FREE,
-                      sizeof(struct btree_iter) * trans->nr_iters +
-                      sizeof(struct btree_insert_entry) * trans->nr_iters);
-
-       if (trans->iters != trans->iters_onstack)
-               kfree(trans->iters);
-
-       trans->iters            = new_iters;
-       trans->updates          = new_updates;
-       trans->updates2         = new_updates2;
-       trans->size             = new_size;
+       struct btree_iter *iter;
+       struct btree_insert_entry *i;
+       char buf[100];
 
-       if (trans->iters_live) {
-               trace_trans_restart_iters_realloced(trans->ip, trans->size);
-               return -EINTR;
+       trans_for_each_iter(trans, iter)
+               printk(KERN_ERR "iter: btree %s pos %s%s%s%s %pS\n",
+                      bch2_btree_ids[iter->btree_id],
+                      (bch2_bpos_to_text(&PBUF(buf), iter->pos), buf),
+                      btree_iter_live(trans, iter) ? " live" : "",
+                      (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "",
+                      iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "",
+                      (void *) iter->ip_allocated);
+
+       trans_for_each_update(trans, i) {
+               char buf[300];
+
+               bch2_bkey_val_to_text(&PBUF(buf), trans->c, bkey_i_to_s_c(i->k));
+               printk(KERN_ERR "update: btree %s %s\n",
+                      bch2_btree_ids[i->iter->btree_id], buf);
        }
-
-       return 0;
+       panic("trans iter oveflow\n");
 }
 
 static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans)
 {
-       unsigned idx = __ffs64(~trans->iters_linked);
+       unsigned idx;
 
-       if (idx < trans->nr_iters)
-               goto got_slot;
+       if (unlikely(trans->iters_linked ==
+                    ~((~0ULL << 1) << (BTREE_ITER_MAX - 1))))
+               btree_trans_iter_alloc_fail(trans);
 
-       if (trans->nr_iters == trans->size) {
-               int ret;
-
-               if (trans->nr_iters >= BTREE_ITER_MAX) {
-                       struct btree_iter *iter;
-
-                       trans_for_each_iter(trans, iter) {
-                               pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps",
-                                      bch2_btree_ids[iter->btree_id],
-                                      iter->pos.inode,
-                                      iter->pos.offset,
-                                      (trans->iters_live & (1ULL << iter->idx)) ? " live" : "",
-                                      (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "",
-                                      iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "",
-                                      (void *) iter->ip_allocated);
-                       }
-
-                       panic("trans iter oveflow\n");
-               }
+       idx = __ffs64(~trans->iters_linked);
 
-               ret = bch2_trans_realloc_iters(trans, trans->size * 2);
-               if (ret)
-                       return ERR_PTR(ret);
-       }
-
-       idx = trans->nr_iters++;
-       BUG_ON(trans->nr_iters > trans->size);
-
-       trans->iters[idx].idx = idx;
-got_slot:
-       BUG_ON(trans->iters_linked & (1ULL << idx));
-       trans->iters_linked |= 1ULL << idx;
-       trans->iters[idx].flags = 0;
+       trans->iters_linked     |= 1ULL << idx;
+       trans->iters[idx].idx    = idx;
+       trans->iters[idx].flags  = 0;
        return &trans->iters[idx];
 }
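
Iterator slots now come from a fixed 64-entry array, and iters_linked is a bitmap of the slots in use: the first clear bit is the next free slot. A userspace sketch of the same technique, with __builtin_ctzll() standing in for the kernel's __ffs64():

/* toy_slot_alloc.c - cc -std=c11 -Wall toy_slot_alloc.c */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static unsigned toy_alloc_slot(uint64_t *linked)
{
        unsigned idx;

        /* all 64 slots in use: the real code dumps its iterators and panics */
        assert(*linked != ~0ULL);

        idx = (unsigned) __builtin_ctzll(~*linked);     /* __ffs64(~linked) */
        *linked |= 1ULL << idx;
        return idx;
}

int main(void)
{
        uint64_t linked = 0xb;  /* binary 1011: slots 0, 1 and 3 already taken */

        printf("allocated slot %u\n", toy_alloc_slot(&linked)); /* 2 */
        printf("allocated slot %u\n", toy_alloc_slot(&linked)); /* 4 */
        return 0;
}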
 
@@ -2095,21 +1983,21 @@ static inline void btree_iter_copy(struct btree_iter *dst,
        dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT;
 }
 
-static inline struct bpos bpos_diff(struct bpos l, struct bpos r)
-{
-       if (bkey_cmp(l, r) > 0)
-               swap(l, r);
-
-       return POS(r.inode - l.inode, r.offset - l.offset);
-}
-
-static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans,
-                                                unsigned btree_id, struct bpos pos,
-                                                unsigned flags)
+struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
+                                        unsigned btree_id, struct bpos pos,
+                                        unsigned locks_want,
+                                        unsigned depth,
+                                        unsigned flags)
 {
        struct btree_iter *iter, *best = NULL;
 
-       BUG_ON(trans->nr_iters > BTREE_ITER_MAX);
+       if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES &&
+           !btree_type_has_snapshots(btree_id))
+               flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
+
+       if (!(flags & BTREE_ITER_ALL_SNAPSHOTS))
+               pos.snapshot = btree_type_has_snapshots(btree_id)
+                       ? U32_MAX : 0;
 
        trans_for_each_iter(trans, iter) {
                if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE))
@@ -2119,8 +2007,8 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans,
                        continue;
 
                if (best &&
-                   bkey_cmp(bpos_diff(best->pos, pos),
-                            bpos_diff(iter->pos, pos)) < 0)
+                   bkey_cmp(bpos_diff(best->real_pos, pos),
+                            bpos_diff(iter->real_pos, pos)) > 0)
                        continue;
 
                best = iter;
@@ -2128,52 +2016,50 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans,
 
        if (!best) {
                iter = btree_trans_iter_alloc(trans);
-               if (IS_ERR(iter))
-                       return iter;
-
-               bch2_btree_iter_init(trans, iter, btree_id, pos, flags);
-       } else if ((trans->iters_live & (1ULL << best->idx)) ||
-                  (best->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) {
+               bch2_btree_iter_init(trans, iter, btree_id);
+       } else if (btree_iter_keep(trans, best)) {
                iter = btree_trans_iter_alloc(trans);
-               if (IS_ERR(iter))
-                       return iter;
-
                btree_iter_copy(iter, best);
        } else {
                iter = best;
        }
 
-       iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
-       iter->flags &= ~BTREE_ITER_USER_FLAGS;
-       iter->flags |= flags & BTREE_ITER_USER_FLAGS;
+       trans->iters_live       |= 1ULL << iter->idx;
+       trans->iters_touched    |= 1ULL << iter->idx;
 
-       if (iter->flags & BTREE_ITER_INTENT)
-               bch2_btree_iter_upgrade(iter, 1);
-       else
-               bch2_btree_iter_downgrade(iter);
+       if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES &&
+           btree_node_type_is_extents(btree_id) &&
+           !(flags & BTREE_ITER_NOT_EXTENTS) &&
+           !(flags & BTREE_ITER_ALL_SNAPSHOTS))
+               flags |= BTREE_ITER_IS_EXTENTS;
 
-       BUG_ON(iter->btree_id != btree_id);
-       BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE);
-       BUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT);
-       BUG_ON(iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT);
-       BUG_ON(trans->iters_live & (1ULL << iter->idx));
+       iter->flags = flags;
 
-       trans->iters_live       |= 1ULL << iter->idx;
-       trans->iters_touched    |= 1ULL << iter->idx;
+       iter->snapshot = pos.snapshot;
 
-       return iter;
-}
+       locks_want = min(locks_want, BTREE_MAX_DEPTH);
 
-struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
-                                        enum btree_id btree_id,
-                                        struct bpos pos, unsigned flags)
-{
-       struct btree_iter *iter =
-               __btree_trans_get_iter(trans, btree_id, pos, flags);
+       if (locks_want > iter->locks_want) {
+               iter->locks_want = locks_want;
+               btree_iter_get_locks(iter, true, false);
+       } else if (locks_want < iter->locks_want) {
+               __bch2_btree_iter_downgrade(iter, locks_want);
+       }
+
+       while (iter->level < depth) {
+               btree_node_unlock(iter, iter->level);
+               iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT;
+               iter->level++;
+       }
+
+       while (iter->level > depth)
+               iter->l[--iter->level].b = BTREE_ITER_NO_NODE_INIT;
+
+       iter->min_depth = depth;
+
+       bch2_btree_iter_set_pos(iter, pos);
+       btree_iter_set_search_pos(iter, btree_iter_search_key(iter));
 
-       if (!IS_ERR(iter))
-               __bch2_btree_iter_set_pos(iter, pos,
-                       btree_node_type_is_extents(btree_id));
        return iter;
 }
 
@@ -2185,20 +2071,18 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans,
                                            unsigned flags)
 {
        struct btree_iter *iter =
-               __btree_trans_get_iter(trans, btree_id, pos,
-                                      flags|BTREE_ITER_NODES);
-       unsigned i;
+               __bch2_trans_get_iter(trans, btree_id, pos,
+                                     locks_want, depth,
+                                     BTREE_ITER_NODES|
+                                     BTREE_ITER_NOT_EXTENTS|
+                                     BTREE_ITER_ALL_SNAPSHOTS|
+                                     flags);
 
-       BUG_ON(IS_ERR(iter));
        BUG_ON(bkey_cmp(iter->pos, pos));
-
-       iter->locks_want = locks_want;
-       iter->level     = depth;
-       iter->min_depth = depth;
-
-       for (i = 0; i < ARRAY_SIZE(iter->l); i++)
-               iter->l[i].b            = NULL;
-       iter->l[iter->level].b          = BTREE_ITER_NO_NODE_INIT;
+       BUG_ON(iter->locks_want != min(locks_want, BTREE_MAX_DEPTH));
+       BUG_ON(iter->level      != depth);
+       BUG_ON(iter->min_depth  != depth);
+       iter->ip_allocated = _RET_IP_;
 
        return iter;
 }
@@ -2209,9 +2093,6 @@ struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans,
        struct btree_iter *iter;
 
        iter = btree_trans_iter_alloc(trans);
-       if (IS_ERR(iter))
-               return iter;
-
        btree_iter_copy(iter, src);
 
        trans->iters_live |= 1ULL << iter->idx;
@@ -2219,7 +2100,7 @@ struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans,
         * We don't need to preserve this iter since it's cheap to copy it
         * again - this will cause trans_iter_put() to free it right away:
         */
-       trans->iters_touched &= ~(1ULL << iter->idx);
+       set_btree_iter_dontneed(trans, iter);
 
        return iter;
 }
@@ -2286,11 +2167,11 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
 
        trans->iters_touched &= trans->iters_live;
 
-       trans->need_reset               = 0;
        trans->nr_updates               = 0;
        trans->nr_updates2              = 0;
        trans->mem_top                  = 0;
 
+       trans->hooks                    = NULL;
        trans->extra_journal_entries    = NULL;
        trans->extra_journal_entry_u64s = 0;
 
@@ -2301,35 +2182,52 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
                       (void *) &trans->fs_usage_deltas->memset_start);
        }
 
+       if (!(flags & TRANS_RESET_NOUNLOCK))
+               bch2_trans_cond_resched(trans);
+
        if (!(flags & TRANS_RESET_NOTRAVERSE))
                bch2_btree_iter_traverse_all(trans);
 }
 
+static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c)
+{
+       size_t iters_bytes      = sizeof(struct btree_iter) * BTREE_ITER_MAX;
+       size_t updates_bytes    = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX;
+       void *p = NULL;
+
+       BUG_ON(trans->used_mempool);
+
+#ifdef __KERNEL__
+       p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL);
+#endif
+       if (!p)
+               p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
+
+       trans->iters            = p; p += iters_bytes;
+       trans->updates          = p; p += updates_bytes;
+       trans->updates2         = p; p += updates_bytes;
+}
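
bch2_trans_alloc_iters() first tries to grab a previously freed iterator buffer from a per-CPU stash via this_cpu_xchg(), falling back to the mempool only when the stash is empty. A userspace analogue of that single-slot cache, where a C11 atomic exchange stands in for this_cpu_xchg(), malloc() for the mempool, and the per-CPU aspect is not modelled:

/* toy_iter_buf.c - cc -std=c11 -Wall toy_iter_buf.c */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

static _Atomic(void *) iter_buf_cache = NULL;

static void *iter_buf_alloc(size_t size)
{
        void *p = atomic_exchange(&iter_buf_cache, NULL);

        return p ? p : malloc(size);    /* malloc() stands in for the mempool */
}

static void iter_buf_free(void *p)
{
        p = atomic_exchange(&iter_buf_cache, p);        /* park it for the next caller */
        free(p);                                        /* free whatever was parked before */
}

int main(void)
{
        void *a = iter_buf_alloc(4096);
        void *b;

        iter_buf_free(a);
        b = iter_buf_alloc(4096);
        printf("buffer reused: %d\n", a == b);  /* 1 */
        free(b);
        return 0;
}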
+
 void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
                     unsigned expected_nr_iters,
                     size_t expected_mem_bytes)
 {
-       memset(trans, 0, offsetof(struct btree_trans, iters_onstack));
+       memset(trans, 0, sizeof(*trans));
+       trans->c                = c;
+       trans->ip               = _RET_IP_;
 
        /*
         * reallocating iterators currently completely breaks
-        * bch2_trans_iter_put():
+        * bch2_trans_iter_put(), so we always allocate the max:
         */
-       expected_nr_iters = BTREE_ITER_MAX;
-
-       trans->c                = c;
-       trans->ip               = _RET_IP_;
-       trans->size             = ARRAY_SIZE(trans->iters_onstack);
-       trans->iters            = trans->iters_onstack;
-       trans->updates          = trans->updates_onstack;
-       trans->updates2         = trans->updates2_onstack;
-       trans->fs_usage_deltas  = NULL;
+       bch2_trans_alloc_iters(trans, c);
 
-       if (expected_nr_iters > trans->size)
-               bch2_trans_realloc_iters(trans, expected_nr_iters);
+       if (expected_mem_bytes) {
+               trans->mem_bytes = roundup_pow_of_two(expected_mem_bytes);
+               trans->mem = kmalloc(trans->mem_bytes, GFP_KERNEL|__GFP_NOFAIL);
+       }
 
-       if (expected_mem_bytes)
-               bch2_trans_preload_mem(trans, expected_mem_bytes);
+       trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
 
 #ifdef CONFIG_BCACHEFS_DEBUG
        trans->pid = current->pid;
@@ -2341,37 +2239,74 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
 
 int bch2_trans_exit(struct btree_trans *trans)
 {
+       struct bch_fs *c = trans->c;
+
        bch2_trans_unlock(trans);
 
 #ifdef CONFIG_BCACHEFS_DEBUG
+       if (trans->iters_live) {
+               struct btree_iter *iter;
+
+               bch_err(c, "btree iterators leaked!");
+               trans_for_each_iter(trans, iter)
+                       if (btree_iter_live(trans, iter))
+                               printk(KERN_ERR "  btree %s allocated at %pS\n",
+                                      bch2_btree_ids[iter->btree_id],
+                                      (void *) iter->ip_allocated);
+               /* Be noisy about this: */
+               bch2_fatal_error(c);
+       }
+
        mutex_lock(&trans->c->btree_trans_lock);
        list_del(&trans->list);
        mutex_unlock(&trans->c->btree_trans_lock);
 #endif
 
+       srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
+
        bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres);
 
        kfree(trans->fs_usage_deltas);
        kfree(trans->mem);
-       if (trans->used_mempool)
+
+#ifdef __KERNEL__
+       /*
+        * Userspace doesn't have a real percpu implementation:
+        */
+       trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters);
+#endif
+       if (trans->iters)
                mempool_free(trans->iters, &trans->c->btree_iters_pool);
-       else if (trans->iters != trans->iters_onstack)
-               kfree(trans->iters);
+
        trans->mem      = (void *) 0x1;
        trans->iters    = (void *) 0x1;
 
        return trans->error ? -EIO : 0;
 }
 
-static void bch2_btree_iter_node_to_text(struct printbuf *out,
-                                struct btree_bkey_cached_common *_b,
-                                enum btree_iter_type type)
+static void __maybe_unused
+bch2_btree_iter_node_to_text(struct printbuf *out,
+                            struct btree_bkey_cached_common *_b,
+                            enum btree_iter_type type)
 {
-       pr_buf(out, "    %px l=%u %s:",
-              _b, _b->level, bch2_btree_ids[_b->btree_id]);
+       pr_buf(out, "    l=%u %s:",
+              _b->level, bch2_btree_ids[_b->btree_id]);
        bch2_bpos_to_text(out, btree_node_pos(_b, type));
 }
 
+#ifdef CONFIG_BCACHEFS_DEBUG
+static bool trans_has_btree_nodes_locked(struct btree_trans *trans)
+{
+       struct btree_iter *iter;
+
+       trans_for_each_iter(trans, iter)
+               if (btree_iter_type(iter) != BTREE_ITER_CACHED &&
+                   iter->nodes_locked)
+                       return true;
+       return false;
+}
+#endif
+
 void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
 {
 #ifdef CONFIG_BCACHEFS_DEBUG
@@ -2382,14 +2317,18 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
 
        mutex_lock(&c->btree_trans_lock);
        list_for_each_entry(trans, &c->btree_trans_list, list) {
-               pr_buf(out, "%i %px %ps\n", trans->pid, trans, (void *) trans->ip);
+               if (!trans_has_btree_nodes_locked(trans))
+                       continue;
+
+               pr_buf(out, "%i %ps\n", trans->pid, (void *) trans->ip);
 
                trans_for_each_iter(trans, iter) {
                        if (!iter->nodes_locked)
                                continue;
 
-                       pr_buf(out, "  iter %u %s:",
+                       pr_buf(out, "  iter %u %c %s:",
                               iter->idx,
+                              btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b',
                               bch2_btree_ids[iter->btree_id]);
                        bch2_bpos_to_text(out, iter->pos);
                        pr_buf(out, "\n");
@@ -2408,17 +2347,18 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
 
                b = READ_ONCE(trans->locking);
                if (b) {
-                       pr_buf(out, "  locking iter %u l=%u %s:",
+                       iter = &trans->iters[trans->locking_iter_idx];
+                       pr_buf(out, "  locking iter %u %c l=%u %s:",
                               trans->locking_iter_idx,
+                              btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b',
                               trans->locking_level,
                               bch2_btree_ids[trans->locking_btree_id]);
                        bch2_bpos_to_text(out, trans->locking_pos);
 
-
                        pr_buf(out, " node ");
                        bch2_btree_iter_node_to_text(out,
                                        (void *) b,
-                                       btree_iter_type(&trans->iters[trans->locking_iter_idx]));
+                                       btree_iter_type(iter));
                        pr_buf(out, "\n");
                }
        }
@@ -2429,6 +2369,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
 void bch2_fs_btree_iter_exit(struct bch_fs *c)
 {
        mempool_exit(&c->btree_iters_pool);
+       cleanup_srcu_struct(&c->btree_trans_barrier);
 }
 
 int bch2_fs_btree_iter_init(struct bch_fs *c)
@@ -2438,7 +2379,8 @@ int bch2_fs_btree_iter_init(struct bch_fs *c)
        INIT_LIST_HEAD(&c->btree_trans_list);
        mutex_init(&c->btree_trans_lock);
 
-       return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
+       return  init_srcu_struct(&c->btree_trans_barrier) ?:
+               mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
                        sizeof(struct btree_iter) * nr +
                        sizeof(struct btree_insert_entry) * nr +
                        sizeof(struct btree_insert_entry) * nr);
index bd9ec3ec9a92a2809128f6ae799dfeae44bc9ca9..07d9b6d36e51fa5c1011648cfe0eefae8753fd7c 100644 (file)
@@ -48,21 +48,22 @@ static inline int btree_iter_err(const struct btree_iter *iter)
 
 /* Iterate over iters within a transaction: */
 
-#define trans_for_each_iter_all(_trans, _iter)                         \
-       for (_iter = (_trans)->iters;                                   \
-            _iter < (_trans)->iters + (_trans)->nr_iters;              \
-            _iter++)
-
 static inline struct btree_iter *
 __trans_next_iter(struct btree_trans *trans, unsigned idx)
 {
-       EBUG_ON(idx < trans->nr_iters && trans->iters[idx].idx != idx);
+       u64 l;
+
+       if (idx == BTREE_ITER_MAX)
+               return NULL;
 
-       for (; idx < trans->nr_iters; idx++)
-               if (trans->iters_linked & (1ULL << idx))
-                       return &trans->iters[idx];
+       l = trans->iters_linked >> idx;
+       if (!l)
+               return NULL;
 
-       return NULL;
+       idx += __ffs64(l);
+       EBUG_ON(idx >= BTREE_ITER_MAX);
+       EBUG_ON(trans->iters[idx].idx != idx);
+       return &trans->iters[idx];
 }
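
__trans_next_iter() now jumps straight to the next linked iterator by shifting the iters_linked bitmap and taking its lowest set bit, rather than testing every slot. A standalone sketch of that walk, again with __builtin_ctzll() in place of __ffs64():

/* toy_iter_walk.c - cc -std=c11 -Wall toy_iter_walk.c */
#include <stdint.h>
#include <stdio.h>

#define TOY_ITER_MAX 64

static int toy_next_linked(uint64_t linked, unsigned idx)
{
        uint64_t l;

        if (idx >= TOY_ITER_MAX)
                return -1;
        l = linked >> idx;
        if (!l)
                return -1;
        return (int) (idx + __builtin_ctzll(l));
}

int main(void)
{
        uint64_t linked = (1ULL << 3) | (1ULL << 17) | (1ULL << 40);
        int idx;

        for (idx = toy_next_linked(linked, 0);
             idx >= 0;
             idx = toy_next_linked(linked, idx + 1))
                printf("linked iterator in slot %d\n", idx);
        return 0;
}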
 
 #define trans_for_each_iter(_trans, _iter)                             \
@@ -115,7 +116,6 @@ bool bch2_trans_relock(struct btree_trans *);
 void bch2_trans_unlock(struct btree_trans *);
 
 bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned);
-bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *, unsigned);
 
 static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter,
                                           unsigned new_locks_want)
@@ -123,9 +123,7 @@ static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter,
        new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
 
        return iter->locks_want < new_locks_want
-               ? (!iter->trans->nounlock
-                  ? __bch2_btree_iter_upgrade(iter, new_locks_want)
-                  : __bch2_btree_iter_upgrade_nounlock(iter, new_locks_want))
+               ? __bch2_btree_iter_upgrade(iter, new_locks_want)
                : iter->uptodate <= BTREE_ITER_NEED_PEEK;
 }
 
@@ -133,8 +131,10 @@ void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned);
 
 static inline void bch2_btree_iter_downgrade(struct btree_iter *iter)
 {
-       if (iter->locks_want > (iter->flags & BTREE_ITER_INTENT) ? 1 : 0)
-               __bch2_btree_iter_downgrade(iter, 0);
+       unsigned new_locks_want = (iter->flags & BTREE_ITER_INTENT ? 1 : 0);
+
+       if (iter->locks_want > new_locks_want)
+               __bch2_btree_iter_downgrade(iter, new_locks_want);
 }
 
 void bch2_trans_downgrade(struct btree_trans *);
@@ -144,15 +144,7 @@ void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *);
 
 void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *);
 
-int __must_check __bch2_btree_iter_traverse(struct btree_iter *);
-
-static inline int __must_check
-bch2_btree_iter_traverse(struct btree_iter *iter)
-{
-       return iter->uptodate >= BTREE_ITER_NEED_RELOCK
-               ? __bch2_btree_iter_traverse(iter)
-               : 0;
-}
+int __must_check bch2_btree_iter_traverse(struct btree_iter *);
 
 int bch2_btree_iter_traverse_all(struct btree_trans *);
 
@@ -170,18 +162,31 @@ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *);
 
 struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *);
 struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *);
 
 struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *);
 
-void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos);
-void __bch2_btree_iter_set_pos(struct btree_iter *, struct bpos, bool);
-void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos);
+bool bch2_btree_iter_advance(struct btree_iter *);
+bool bch2_btree_iter_rewind(struct btree_iter *);
 
-static inline int btree_iter_cmp(const struct btree_iter *l,
-                                const struct btree_iter *r)
+static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
+{
+       if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
+               new_pos.snapshot = iter->snapshot;
+
+       iter->k.type = KEY_TYPE_deleted;
+       iter->k.p.inode         = iter->pos.inode       = new_pos.inode;
+       iter->k.p.offset        = iter->pos.offset      = new_pos.offset;
+       iter->k.p.snapshot      = iter->pos.snapshot    = new_pos.snapshot;
+       iter->k.size = 0;
+}
+
+/* Sort order for locking btree iterators: */
+static inline int btree_iter_lock_cmp(const struct btree_iter *l,
+                                     const struct btree_iter *r)
 {
        return   cmp_int(l->btree_id, r->btree_id) ?:
-               -cmp_int(btree_iter_type(l), btree_iter_type(r)) ?:
+               -cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?:
                 bkey_cmp(l->pos, r->pos);
 }
 
@@ -239,14 +244,11 @@ static inline int bkey_err(struct bkey_s_c k)
 
 #define for_each_btree_key(_trans, _iter, _btree_id,                   \
                           _start, _flags, _k, _ret)                    \
-       for ((_ret) = PTR_ERR_OR_ZERO((_iter) =                         \
-                       bch2_trans_get_iter((_trans), (_btree_id),      \
-                                           (_start), (_flags))) ?:     \
-                     PTR_ERR_OR_ZERO(((_k) =                           \
-                       __bch2_btree_iter_peek(_iter, _flags)).k);      \
-            !_ret && (_k).k;                                           \
-            (_ret) = PTR_ERR_OR_ZERO(((_k) =                           \
-                       __bch2_btree_iter_next(_iter, _flags)).k))
+       for ((_iter) = bch2_trans_get_iter((_trans), (_btree_id),       \
+                                          (_start), (_flags)),         \
+            (_k) = __bch2_btree_iter_peek(_iter, _flags);              \
+            !((_ret) = bkey_err(_k)) && (_k).k;                        \
+            (_k) = __bch2_btree_iter_next(_iter, _flags))
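
A minimal usage sketch of the reworked for_each_btree_key(): since btree_trans_iter_alloc() can no longer fail (it panics on overflow instead of returning an error pointer), the macro drops its PTR_ERR_OR_ZERO() wrapping and callers need no IS_ERR() checks. The btree id (written with this snapshot's lowercase naming), the zero flags and the trivial loop body are illustrative only, and error handling is simplified:

static u64 example_count_extents(struct bch_fs *c)
{
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c k;
        u64 nr = 0;
        int ret;

        bch2_trans_init(&trans, c, 0, 0);

        for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, 0, k, ret)
                nr++;

        bch2_trans_iter_put(&trans, iter);
        bch2_trans_exit(&trans);

        return ret ? 0 : nr;
}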
 
 #define for_each_btree_key_continue(_iter, _flags, _k, _ret)           \
        for ((_k) = __bch2_btree_iter_peek(_iter, _flags);              \
@@ -261,17 +263,18 @@ int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *);
 void bch2_trans_unlink_iters(struct btree_trans *);
 
 struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id,
-                                        struct bpos, unsigned);
+                                        struct bpos, unsigned,
+                                        unsigned, unsigned);
 
 static inline struct btree_iter *
 bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id,
                    struct bpos pos, unsigned flags)
 {
        struct btree_iter *iter =
-               __bch2_trans_get_iter(trans, btree_id, pos, flags);
-
-       if (!IS_ERR(iter))
-               iter->ip_allocated = _THIS_IP_;
+               __bch2_trans_get_iter(trans, btree_id, pos,
+                                     (flags & BTREE_ITER_INTENT) != 0, 0,
+                                     flags);
+       iter->ip_allocated = _THIS_IP_;
        return iter;
 }
 
@@ -283,17 +286,32 @@ bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src)
        struct btree_iter *iter =
                __bch2_trans_copy_iter(trans, src);
 
-       if (!IS_ERR(iter))
-               iter->ip_allocated = _THIS_IP_;
+       iter->ip_allocated = _THIS_IP_;
        return iter;
-
 }
 
 struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *,
                                enum btree_id, struct bpos,
                                unsigned, unsigned, unsigned);
 
+static inline bool btree_iter_live(struct btree_trans *trans, struct btree_iter *iter)
+{
+       return (trans->iters_live & (1ULL << iter->idx)) != 0;
+}
+
+static inline bool btree_iter_keep(struct btree_trans *trans, struct btree_iter *iter)
+{
+       return btree_iter_live(trans, iter) ||
+               (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT);
+}
+
+static inline void set_btree_iter_dontneed(struct btree_trans *trans, struct btree_iter *iter)
+{
+       trans->iters_touched &= ~(1ULL << iter->idx);
+}
+
 #define TRANS_RESET_NOTRAVERSE         (1 << 0)
+#define TRANS_RESET_NOUNLOCK           (1 << 1)
 
 void bch2_trans_reset(struct btree_trans *, unsigned);
 
index 61662750dfc046583d2d4c6092854e6a2c948335..0d3c0a40d64964fd5738f3a010cd5d841b7b2ec9 100644 (file)
@@ -9,8 +9,11 @@
 #include "journal.h"
 #include "journal_reclaim.h"
 
+#include <linux/sched/mm.h>
 #include <trace/events/bcachefs.h>
 
+static struct kmem_cache *bch2_key_cache;
+
 static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
                                       const void *obj)
 {
@@ -18,7 +21,7 @@ static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
        const struct bkey_cached_key *key = arg->key;
 
        return cmp_int(ck->key.btree_id, key->btree_id) ?:
-               bkey_cmp(ck->key.pos, key->pos);
+               bpos_cmp(ck->key.pos, key->pos);
 }
 
 static const struct rhashtable_params bch2_btree_key_cache_params = {
@@ -29,8 +32,8 @@ static const struct rhashtable_params bch2_btree_key_cache_params = {
 };
 
 __flatten
-static inline struct bkey_cached *
-btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
+inline struct bkey_cached *
+bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
 {
        struct bkey_cached_key key = {
                .btree_id       = btree_id,
@@ -66,12 +69,22 @@ static void bkey_cached_evict(struct btree_key_cache *c,
        BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
                                      bch2_btree_key_cache_params));
        memset(&ck->key, ~0, sizeof(ck->key));
+
+       atomic_long_dec(&c->nr_keys);
 }
 
-static void bkey_cached_free(struct btree_key_cache *c,
+static void bkey_cached_free(struct btree_key_cache *bc,
                             struct bkey_cached *ck)
 {
-       list_move(&ck->list, &c->freed);
+       struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+
+       BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
+
+       ck->btree_trans_barrier_seq =
+               start_poll_synchronize_srcu(&c->btree_trans_barrier);
+
+       list_move_tail(&ck->list, &bc->freed);
+       bc->nr_freed++;
 
        kfree(ck->k);
        ck->k           = NULL;
@@ -86,26 +99,50 @@ bkey_cached_alloc(struct btree_key_cache *c)
 {
        struct bkey_cached *ck;
 
-       list_for_each_entry(ck, &c->freed, list)
-               if (bkey_cached_lock_for_evict(ck))
-                       return ck;
+       ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO);
+       if (likely(ck)) {
+               INIT_LIST_HEAD(&ck->list);
+               six_lock_init(&ck->c.lock);
+               BUG_ON(!six_trylock_intent(&ck->c.lock));
+               BUG_ON(!six_trylock_write(&ck->c.lock));
+               return ck;
+       }
+
+       return NULL;
+}
 
-       list_for_each_entry(ck, &c->clean, list)
+static struct bkey_cached *
+bkey_cached_reuse(struct btree_key_cache *c)
+{
+       struct bucket_table *tbl;
+       struct rhash_head *pos;
+       struct bkey_cached *ck;
+       unsigned i;
+
+       mutex_lock(&c->lock);
+       list_for_each_entry_reverse(ck, &c->freed, list)
                if (bkey_cached_lock_for_evict(ck)) {
-                       bkey_cached_evict(c, ck);
+                       c->nr_freed--;
+                       list_del(&ck->list);
+                       mutex_unlock(&c->lock);
                        return ck;
                }
+       mutex_unlock(&c->lock);
 
-       ck = kzalloc(sizeof(*ck), GFP_NOFS);
-       if (!ck)
-               return NULL;
-
-       INIT_LIST_HEAD(&ck->list);
-       six_lock_init(&ck->c.lock);
-       BUG_ON(!six_trylock_intent(&ck->c.lock));
-       BUG_ON(!six_trylock_write(&ck->c.lock));
+       rcu_read_lock();
+       tbl = rht_dereference_rcu(c->table.tbl, &c->table);
+       for (i = 0; i < tbl->size; i++)
+               rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
+                       if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
+                           bkey_cached_lock_for_evict(ck)) {
+                               bkey_cached_evict(c, ck);
+                               rcu_read_unlock();
+                               return ck;
+                       }
+               }
+       rcu_read_unlock();
 
-       return ck;
+       return NULL;
 }
 
 static struct bkey_cached *
@@ -114,28 +151,45 @@ btree_key_cache_create(struct btree_key_cache *c,
                       struct bpos pos)
 {
        struct bkey_cached *ck;
+       bool was_new = true;
 
        ck = bkey_cached_alloc(c);
-       if (!ck)
-               return ERR_PTR(-ENOMEM);
+
+       if (unlikely(!ck)) {
+               ck = bkey_cached_reuse(c);
+               if (unlikely(!ck))
+                       return ERR_PTR(-ENOMEM);
+
+               was_new = false;
+       }
 
        ck->c.level             = 0;
        ck->c.btree_id          = btree_id;
        ck->key.btree_id        = btree_id;
        ck->key.pos             = pos;
        ck->valid               = false;
+       ck->flags               = 1U << BKEY_CACHED_ACCESSED;
 
-       BUG_ON(ck->flags);
-
-       if (rhashtable_lookup_insert_fast(&c->table,
+       if (unlikely(rhashtable_lookup_insert_fast(&c->table,
                                          &ck->hash,
-                                         bch2_btree_key_cache_params)) {
+                                         bch2_btree_key_cache_params))) {
                /* We raced with another fill: */
-               bkey_cached_free(c, ck);
+
+               if (likely(was_new)) {
+                       six_unlock_write(&ck->c.lock);
+                       six_unlock_intent(&ck->c.lock);
+                       kfree(ck);
+               } else {
+                       mutex_lock(&c->lock);
+                       bkey_cached_free(c, ck);
+                       mutex_unlock(&c->lock);
+               }
+
                return NULL;
        }
 
-       list_move(&ck->list, &c->clean);
+       atomic_long_inc(&c->nr_keys);
+
        six_unlock_write(&ck->c.lock);
 
        return ck;
@@ -153,28 +207,23 @@ static int btree_key_cache_fill(struct btree_trans *trans,
 
        iter = bch2_trans_get_iter(trans, ck->key.btree_id,
                                   ck->key.pos, BTREE_ITER_SLOTS);
-       if (IS_ERR(iter))
-               return PTR_ERR(iter);
-
        k = bch2_btree_iter_peek_slot(iter);
        ret = bkey_err(k);
-       if (ret) {
-               bch2_trans_iter_put(trans, iter);
-               return ret;
-       }
+       if (ret)
+               goto err;
 
        if (!bch2_btree_node_relock(ck_iter, 0)) {
-               bch2_trans_iter_put(trans, iter);
                trace_transaction_restart_ip(trans->ip, _THIS_IP_);
-               return -EINTR;
+               ret = -EINTR;
+               goto err;
        }
 
        if (k.k->u64s > ck->u64s) {
                new_u64s = roundup_pow_of_two(k.k->u64s);
                new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS);
                if (!new_k) {
-                       bch2_trans_iter_put(trans, iter);
-                       return -ENOMEM;
+                       ret = -ENOMEM;
+                       goto err;
                }
        }
 
@@ -190,9 +239,10 @@ static int btree_key_cache_fill(struct btree_trans *trans,
        bch2_btree_node_unlock_write(ck_iter->l[0].b, ck_iter);
 
        /* We're not likely to need this iterator again: */
-       bch2_trans_iter_free(trans, iter);
-
-       return 0;
+       set_btree_iter_dontneed(trans, iter);
+err:
+       bch2_trans_iter_put(trans, iter);
+       return ret;
 }
 
 static int bkey_cached_check_fn(struct six_lock *lock, void *p)
@@ -201,9 +251,10 @@ static int bkey_cached_check_fn(struct six_lock *lock, void *p)
        const struct btree_iter *iter = p;
 
        return ck->key.btree_id == iter->btree_id &&
-               !bkey_cmp(ck->key.pos, iter->pos) ? 0 : -1;
+               !bpos_cmp(ck->key.pos, iter->pos) ? 0 : -1;
 }
 
+__flatten
 int bch2_btree_iter_traverse_cached(struct btree_iter *iter)
 {
        struct btree_trans *trans = iter->trans;
@@ -218,18 +269,15 @@ int bch2_btree_iter_traverse_cached(struct btree_iter *iter)
                goto fill;
        }
 retry:
-       ck = btree_key_cache_find(c, iter->btree_id, iter->pos);
+       ck = bch2_btree_key_cache_find(c, iter->btree_id, iter->pos);
        if (!ck) {
                if (iter->flags & BTREE_ITER_CACHED_NOCREATE) {
                        iter->l[0].b = NULL;
                        return 0;
                }
 
-               mutex_lock(&c->btree_key_cache.lock);
                ck = btree_key_cache_create(&c->btree_key_cache,
                                            iter->btree_id, iter->pos);
-               mutex_unlock(&c->btree_key_cache.lock);
-
                ret = PTR_ERR_OR_ZERO(ck);
                if (ret)
                        goto err;
@@ -242,9 +290,9 @@ retry:
                enum six_lock_type lock_want = __btree_lock_want(iter, 0);
 
                if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want,
-                                    bkey_cached_check_fn, iter)) {
+                                    bkey_cached_check_fn, iter, _THIS_IP_)) {
                        if (ck->key.btree_id != iter->btree_id ||
-                           bkey_cmp(ck->key.pos, iter->pos)) {
+                           bpos_cmp(ck->key.pos, iter->pos)) {
                                goto retry;
                        }
 
@@ -254,7 +302,7 @@ retry:
                }
 
                if (ck->key.btree_id != iter->btree_id ||
-                   bkey_cmp(ck->key.pos, iter->pos)) {
+                   bpos_cmp(ck->key.pos, iter->pos)) {
                        six_unlock_type(&ck->c.lock, lock_want);
                        goto retry;
                }
@@ -279,8 +327,18 @@ fill:
                        goto err;
        }
 
+       if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
+               set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
+
        iter->uptodate = BTREE_ITER_NEED_PEEK;
-       bch2_btree_iter_downgrade(iter);
+
+       if (!(iter->flags & BTREE_ITER_INTENT))
+               bch2_btree_iter_downgrade(iter);
+       else if (!iter->locks_want) {
+               if (!__bch2_btree_iter_upgrade(iter, 1))
+                       ret = -EINTR;
+       }
+
        return ret;
 err:
        if (ret != -EINTR) {
@@ -294,29 +352,23 @@ err:
 static int btree_key_cache_flush_pos(struct btree_trans *trans,
                                     struct bkey_cached_key key,
                                     u64 journal_seq,
+                                    unsigned commit_flags,
                                     bool evict)
 {
        struct bch_fs *c = trans->c;
        struct journal *j = &c->journal;
        struct btree_iter *c_iter = NULL, *b_iter = NULL;
-       struct bkey_cached *ck;
+       struct bkey_cached *ck = NULL;
        int ret;
 
        b_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos,
                                     BTREE_ITER_SLOTS|
                                     BTREE_ITER_INTENT);
-       ret = PTR_ERR_OR_ZERO(b_iter);
-       if (ret)
-               goto out;
-
        c_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos,
                                     BTREE_ITER_CACHED|
                                     BTREE_ITER_CACHED_NOFILL|
                                     BTREE_ITER_CACHED_NOCREATE|
                                     BTREE_ITER_INTENT);
-       ret = PTR_ERR_OR_ZERO(c_iter);
-       if (ret)
-               goto out;
 retry:
        ret = bch2_btree_iter_traverse(c_iter);
        if (ret)
@@ -339,27 +391,33 @@ retry:
                                  BTREE_INSERT_NOUNLOCK|
                                  BTREE_INSERT_NOCHECK_RW|
                                  BTREE_INSERT_NOFAIL|
-                                 BTREE_INSERT_USE_RESERVE|
-                                 BTREE_INSERT_USE_ALLOC_RESERVE|
-                                 BTREE_INSERT_JOURNAL_RESERVED|
-                                 BTREE_INSERT_JOURNAL_RECLAIM);
+                                 (ck->journal.seq == journal_last_seq(j)
+                                  ? BTREE_INSERT_JOURNAL_RESERVED
+                                  : 0)|
+                                 commit_flags);
 err:
        if (ret == -EINTR)
                goto retry;
 
-       BUG_ON(ret && !bch2_journal_error(j));
+       if (ret == -EAGAIN)
+               goto out;
 
-       if (ret)
+       if (ret) {
+               bch2_fs_fatal_err_on(!bch2_journal_error(j), c,
+                       "error flushing key cache: %i", ret);
                goto out;
+       }
 
        bch2_journal_pin_drop(j, &ck->journal);
        bch2_journal_preres_put(j, &ck->res);
-       clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
+
+       BUG_ON(!btree_node_locked(c_iter, 0));
 
        if (!evict) {
-               mutex_lock(&c->btree_key_cache.lock);
-               list_move_tail(&ck->list, &c->btree_key_cache.clean);
-               mutex_unlock(&c->btree_key_cache.lock);
+               if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+                       clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
+                       atomic_long_dec(&c->btree_key_cache.nr_dirty);
+               }
        } else {
 evict:
                BUG_ON(!btree_node_intent_locked(c_iter, 0));
@@ -369,8 +427,14 @@ evict:
 
                six_lock_write(&ck->c.lock, NULL, NULL);
 
-               mutex_lock(&c->btree_key_cache.lock);
+               if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+                       clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
+                       atomic_long_dec(&c->btree_key_cache.nr_dirty);
+               }
+
                bkey_cached_evict(&c->btree_key_cache, ck);
+
+               mutex_lock(&c->btree_key_cache.lock);
                bkey_cached_free(&c->btree_key_cache, ck);
                mutex_unlock(&c->btree_key_cache.lock);
        }
@@ -380,15 +444,17 @@ out:
        return ret;
 }
 
-static void btree_key_cache_journal_flush(struct journal *j,
-                                         struct journal_entry_pin *pin,
-                                         u64 seq)
+int bch2_btree_key_cache_journal_flush(struct journal *j,
+                               struct journal_entry_pin *pin, u64 seq)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bkey_cached *ck =
                container_of(pin, struct bkey_cached, journal);
        struct bkey_cached_key key;
        struct btree_trans trans;
+       int ret = 0;
+
+       int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
 
        six_lock_read(&ck->c.lock, NULL, NULL);
        key = ck->key;
@@ -396,13 +462,18 @@ static void btree_key_cache_journal_flush(struct journal *j,
        if (ck->journal.seq != seq ||
            !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
                six_unlock_read(&ck->c.lock);
-               return;
+               goto unlock;
        }
        six_unlock_read(&ck->c.lock);
 
        bch2_trans_init(&trans, c, 0, 0);
-       btree_key_cache_flush_pos(&trans, key, seq, false);
+       ret = btree_key_cache_flush_pos(&trans, key, seq,
+                                 BTREE_INSERT_JOURNAL_RECLAIM, false);
        bch2_trans_exit(&trans);
+unlock:
+       srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
+
+       return ret;
 }
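/*
 * Illustrative sketch, not part of this diff: the freed-key path in this file
 * pairs start_poll_synchronize_srcu() (when an entry goes onto the freed
 * list) with poll_state_synchronize_srcu() (before the shrinker actually
 * frees it), so memory is only reclaimed once every btree_trans SRCU reader
 * that might still see the entry has finished.  A minimal version of that
 * pattern, with hypothetical toy_* names:
 */
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/srcu.h>

struct toy_obj {
	struct list_head	list;
	unsigned long		srcu_seq;	/* grace period cookie */
};

static void toy_defer_free(struct srcu_struct *ssp, struct list_head *freed,
			   struct toy_obj *obj)
{
	obj->srcu_seq = start_poll_synchronize_srcu(ssp);
	list_add_tail(&obj->list, freed);		/* newest at the tail */
}

static void toy_reap(struct srcu_struct *ssp, struct list_head *freed)
{
	struct toy_obj *obj, *t;

	/* oldest entries first; stop at the first one readers may still see */
	list_for_each_entry_safe(obj, t, freed, list) {
		if (!poll_state_synchronize_srcu(ssp, obj->srcu_seq))
			break;
		list_del(&obj->list);
		kfree(obj);
	}
}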
 
 /*
@@ -415,10 +486,10 @@ int bch2_btree_key_cache_flush(struct btree_trans *trans,
        struct bkey_cached_key key = { id, pos };
 
        /* Fastpath - assume it won't be found: */
-       if (!btree_key_cache_find(c, id, pos))
+       if (!bch2_btree_key_cache_find(c, id, pos))
                return 0;
 
-       return btree_key_cache_flush_pos(trans, key, 0, true);
+       return btree_key_cache_flush_pos(trans, key, 0, 0, true);
 }
 
 bool bch2_btree_insert_key_cached(struct btree_trans *trans,
@@ -427,6 +498,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
 {
        struct bch_fs *c = trans->c;
        struct bkey_cached *ck = (void *) iter->l[0].b;
+       bool kick_reclaim = false;
 
        BUG_ON(insert->u64s > ck->u64s);
 
@@ -446,15 +518,18 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
        ck->valid = true;
 
        if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
-               mutex_lock(&c->btree_key_cache.lock);
-               list_del_init(&ck->list);
-
                set_bit(BKEY_CACHED_DIRTY, &ck->flags);
-               mutex_unlock(&c->btree_key_cache.lock);
+               atomic_long_inc(&c->btree_key_cache.nr_dirty);
+
+               if (bch2_nr_btree_keys_need_flush(c))
+                       kick_reclaim = true;
        }
 
        bch2_journal_pin_update(&c->journal, trans->journal_res.seq,
-                               &ck->journal, btree_key_cache_journal_flush);
+                               &ck->journal, bch2_btree_key_cache_journal_flush);
+
+       if (kick_reclaim)
+               journal_reclaim_kick(&c->journal);
        return true;
 }
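/*
 * Illustrative sketch, not part of this diff: the insert path above only
 * bumps nr_dirty on the clean->dirty transition, then kicks journal reclaim
 * once the dirty count crosses a threshold.  A toy version (toy_* names are
 * hypothetical; the real threshold lives in bch2_nr_btree_keys_need_flush()):
 */
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/types.h>

#define TOY_DIRTY_BIT	0

struct toy_cache {
	atomic_long_t	nr_keys;
	atomic_long_t	nr_dirty;
};

/* returns true if the caller should wake the background flusher */
static bool toy_mark_dirty(struct toy_cache *c, unsigned long *flags)
{
	if (test_and_set_bit(TOY_DIRTY_BIT, flags))
		return false;		/* already dirty: nothing to account */

	atomic_long_inc(&c->nr_dirty);

	/* same shape as the patch: some slack plus half of all cached keys */
	return atomic_long_read(&c->nr_dirty) >
		1024 + atomic_long_read(&c->nr_keys) / 2;
}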
 
@@ -462,58 +537,192 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
 void bch2_btree_key_cache_verify_clean(struct btree_trans *trans,
                               enum btree_id id, struct bpos pos)
 {
-       BUG_ON(btree_key_cache_find(trans->c, id, pos));
+       BUG_ON(bch2_btree_key_cache_find(trans->c, id, pos));
 }
 #endif
 
-void bch2_fs_btree_key_cache_exit(struct btree_key_cache *c)
+static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
+                                          struct shrink_control *sc)
+{
+       struct bch_fs *c = container_of(shrink, struct bch_fs,
+                                       btree_key_cache.shrink);
+       struct btree_key_cache *bc = &c->btree_key_cache;
+       struct bucket_table *tbl;
+       struct bkey_cached *ck, *t;
+       size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
+       unsigned start, flags;
+       int srcu_idx;
+
+       /* Return -1 if we can't do anything right now */
+       if (sc->gfp_mask & __GFP_FS)
+               mutex_lock(&bc->lock);
+       else if (!mutex_trylock(&bc->lock))
+               return -1;
+
+       srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
+       flags = memalloc_nofs_save();
+
+       /*
+        * Newest freed entries are at the end of the list - once we hit one
+        * that's too new to be freed, we can bail out:
+        */
+       list_for_each_entry_safe(ck, t, &bc->freed, list) {
+               if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
+                                                ck->btree_trans_barrier_seq))
+                       break;
+
+               list_del(&ck->list);
+               kmem_cache_free(bch2_key_cache, ck);
+               bc->nr_freed--;
+               scanned++;
+               freed++;
+       }
+
+       if (scanned >= nr)
+               goto out;
+
+       rcu_read_lock();
+       tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
+       if (bc->shrink_iter >= tbl->size)
+               bc->shrink_iter = 0;
+       start = bc->shrink_iter;
+
+       do {
+               struct rhash_head *pos, *next;
+
+               pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter));
+
+               while (!rht_is_a_nulls(pos)) {
+                       next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
+                       ck = container_of(pos, struct bkey_cached, hash);
+
+                       if (test_bit(BKEY_CACHED_DIRTY, &ck->flags))
+                               goto next;
+
+                       if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
+                               clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
+                       else if (bkey_cached_lock_for_evict(ck)) {
+                               bkey_cached_evict(bc, ck);
+                               bkey_cached_free(bc, ck);
+                       }
+
+                       scanned++;
+                       if (scanned >= nr)
+                               break;
+next:
+                       pos = next;
+               }
+
+               bc->shrink_iter++;
+               if (bc->shrink_iter >= tbl->size)
+                       bc->shrink_iter = 0;
+       } while (scanned < nr && bc->shrink_iter != start);
+
+       rcu_read_unlock();
+out:
+       memalloc_nofs_restore(flags);
+       srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
+       mutex_unlock(&bc->lock);
+
+       return freed;
+}
+
+static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
+                                           struct shrink_control *sc)
 {
+       struct bch_fs *c = container_of(shrink, struct bch_fs,
+                                       btree_key_cache.shrink);
+       struct btree_key_cache *bc = &c->btree_key_cache;
+
+       return atomic_long_read(&bc->nr_keys);
+}
+
+void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
+{
+       struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+       struct bucket_table *tbl;
        struct bkey_cached *ck, *n;
+       struct rhash_head *pos;
+       unsigned i;
 
-       mutex_lock(&c->lock);
-       list_for_each_entry_safe(ck, n, &c->clean, list) {
+       if (bc->shrink.list.next)
+               unregister_shrinker(&bc->shrink);
+
+       mutex_lock(&bc->lock);
+
+       rcu_read_lock();
+       tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
+       for (i = 0; i < tbl->size; i++)
+               rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
+                       bkey_cached_evict(bc, ck);
+                       list_add(&ck->list, &bc->freed);
+               }
+       rcu_read_unlock();
+
+       list_for_each_entry_safe(ck, n, &bc->freed, list) {
+               cond_resched();
+
+               bch2_journal_pin_drop(&c->journal, &ck->journal);
+               bch2_journal_preres_put(&c->journal, &ck->res);
+
+               list_del(&ck->list);
                kfree(ck->k);
-               kfree(ck);
+               kmem_cache_free(bch2_key_cache, ck);
        }
-       list_for_each_entry_safe(ck, n, &c->freed, list)
-               kfree(ck);
-       mutex_unlock(&c->lock);
 
-       rhashtable_destroy(&c->table);
+       BUG_ON(atomic_long_read(&bc->nr_dirty) && !bch2_journal_error(&c->journal));
+       BUG_ON(atomic_long_read(&bc->nr_keys));
+
+       mutex_unlock(&bc->lock);
+
+       if (bc->table_init_done)
+               rhashtable_destroy(&bc->table);
 }
 
 void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
 {
        mutex_init(&c->lock);
        INIT_LIST_HEAD(&c->freed);
-       INIT_LIST_HEAD(&c->clean);
 }
 
 int bch2_fs_btree_key_cache_init(struct btree_key_cache *c)
 {
-       return rhashtable_init(&c->table, &bch2_btree_key_cache_params);
+       int ret;
+
+       c->shrink.seeks                 = 1;
+       c->shrink.count_objects         = bch2_btree_key_cache_count;
+       c->shrink.scan_objects          = bch2_btree_key_cache_scan;
+
+       ret = register_shrinker(&c->shrink);
+       if (ret)
+               return ret;
+
+       ret = rhashtable_init(&c->table, &bch2_btree_key_cache_params);
+       if (ret)
+               return ret;
+
+       c->table_init_done = true;
+       return 0;
 }
 
 void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
 {
-       struct bucket_table *tbl;
-       struct bkey_cached *ck;
-       struct rhash_head *pos;
-       size_t i;
+       pr_buf(out, "nr_freed:\t%zu\n", c->nr_freed);
+       pr_buf(out, "nr_keys:\t%zu\n",  atomic_long_read(&c->nr_keys));
+       pr_buf(out, "nr_dirty:\t%zu\n", atomic_long_read(&c->nr_dirty));
+}
 
-       mutex_lock(&c->lock);
-       tbl = rht_dereference_rcu(c->table.tbl, &c->table);
+void bch2_btree_key_cache_exit(void)
+{
+       if (bch2_key_cache)
+               kmem_cache_destroy(bch2_key_cache);
+}
 
-       for (i = 0; i < tbl->size; i++) {
-               rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
-                       pr_buf(out, "%s:",
-                              bch2_btree_ids[ck->key.btree_id]);
-                       bch2_bpos_to_text(out, ck->key.pos);
+int __init bch2_btree_key_cache_init(void)
+{
+       bch2_key_cache = KMEM_CACHE(bkey_cached, 0);
+       if (!bch2_key_cache)
+               return -ENOMEM;
 
-                       if (test_bit(BKEY_CACHED_DIRTY, &ck->flags))
-                               pr_buf(out, " journal seq %llu", ck->journal.seq);
-                       pr_buf(out, "\n");
-               }
-       }
-       mutex_unlock(&c->lock);
+       return 0;
 }
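/*
 * Illustrative sketch, not part of this diff: the shrinker wired up above
 * follows the usual two-callback shape - count_objects() reports how many
 * entries could be freed, scan_objects() frees up to sc->nr_to_scan of them,
 * and each entry gets a second chance via its ACCESSED bit before being
 * evicted.  Minimal skeleton with hypothetical toy_* names, using the same
 * register_shrinker() API the patch uses:
 */
#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/shrinker.h>

struct toy_cache {
	struct shrinker	shrink;
	atomic_long_t	nr_keys;
};

static unsigned long toy_count(struct shrinker *shrink, struct shrink_control *sc)
{
	struct toy_cache *c = container_of(shrink, struct toy_cache, shrink);

	return atomic_long_read(&c->nr_keys);
}

static unsigned long toy_scan(struct shrinker *shrink, struct shrink_control *sc)
{
	/*
	 * Walk the cache: clear the accessed bit on the first pass, evict on
	 * the second; return how many objects were actually freed.
	 */
	return 0;
}

static int toy_cache_init(struct toy_cache *c)
{
	c->shrink.seeks		= 1;	/* relative cost of recreating an object */
	c->shrink.count_objects	= toy_count;
	c->shrink.scan_objects	= toy_scan;

	return register_shrinker(&c->shrink);
}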
libbcachefs/btree_key_cache.h
index b1756c6c622cec53fbad64f08b2600982ec38167..7e2b0a08f745255b3b5c6986f4837d1674e00ec1 100644
@@ -1,6 +1,31 @@
 #ifndef _BCACHEFS_BTREE_KEY_CACHE_H
 #define _BCACHEFS_BTREE_KEY_CACHE_H
 
+static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c)
+{
+       size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
+       size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
+       size_t max_dirty = 1024 + nr_keys  / 2;
+
+       return max_t(ssize_t, 0, nr_dirty - max_dirty);
+}
+
+static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c)
+{
+       size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
+       size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
+       size_t max_dirty = 4096 + (nr_keys * 3) / 4;
+
+       return nr_dirty > max_dirty &&
+               test_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags);
+}
+
+int bch2_btree_key_cache_journal_flush(struct journal *,
+                               struct journal_entry_pin *, u64);
+
+struct bkey_cached *
+bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos);
+
 int bch2_btree_iter_traverse_cached(struct btree_iter *);
 
 bool bch2_btree_insert_key_cached(struct btree_trans *,
@@ -22,4 +47,7 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *);
 
 void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *);
 
+void bch2_btree_key_cache_exit(void);
+int __init bch2_btree_key_cache_init(void);
+
 #endif /* _BCACHEFS_BTREE_KEY_CACHE_H */
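/*
 * Worked example, not part of this diff: with nr_keys = 1,000,000 cached
 * keys, bch2_nr_btree_keys_need_flush() starts reporting work once nr_dirty
 * exceeds 1024 + 500,000 = 501,024 (and returns the excess), while
 * bch2_btree_key_cache_must_wait() only throttles updates once nr_dirty
 * exceeds 4096 + 750,000 = 754,096 and journal reclaim has started - so the
 * soft flush threshold is hit well before writers are ever made to wait.
 */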
libbcachefs/btree_locking.h
index 81fbf3e186473da3ef818ffcfb2d07df2141e5fb..7532bcdef96732b44bafac02aeb77ae92a82ebbd 100644
@@ -95,7 +95,7 @@ btree_lock_want(struct btree_iter *iter, int level)
        return BTREE_NODE_UNLOCKED;
 }
 
-static inline void __btree_node_unlock(struct btree_iter *iter, unsigned level)
+static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
 {
        int lock_type = btree_node_locked_type(iter, level);
 
@@ -106,13 +106,6 @@ static inline void __btree_node_unlock(struct btree_iter *iter, unsigned level)
        mark_btree_node_unlocked(iter, level);
 }
 
-static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
-{
-       EBUG_ON(!level && iter->trans->nounlock);
-
-       __btree_node_unlock(iter, level);
-}
-
 static inline void __bch2_btree_iter_unlock(struct btree_iter *iter)
 {
        btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
@@ -176,36 +169,25 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans,
 
 bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned,
                            struct btree_iter *, enum six_lock_type,
-                           six_lock_should_sleep_fn, void *);
+                           six_lock_should_sleep_fn, void *,
+                           unsigned long);
 
 static inline bool btree_node_lock(struct btree *b,
                        struct bpos pos, unsigned level,
                        struct btree_iter *iter,
                        enum six_lock_type type,
-                       six_lock_should_sleep_fn should_sleep_fn, void *p)
+                       six_lock_should_sleep_fn should_sleep_fn, void *p,
+                       unsigned long ip)
 {
        struct btree_trans *trans = iter->trans;
-       bool ret;
 
        EBUG_ON(level >= BTREE_MAX_DEPTH);
        EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx)));
 
-#ifdef CONFIG_BCACHEFS_DEBUG
-       trans->locking          = b;
-       trans->locking_iter_idx = iter->idx;
-       trans->locking_pos      = pos;
-       trans->locking_btree_id = iter->btree_id;
-       trans->locking_level    = level;
-#endif
-       ret   = likely(six_trylock_type(&b->c.lock, type)) ||
+       return likely(six_trylock_type(&b->c.lock, type)) ||
                btree_node_lock_increment(trans, b, level, type) ||
                __bch2_btree_node_lock(b, pos, level, iter, type,
-                                      should_sleep_fn, p);
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-       trans->locking = NULL;
-#endif
-       return ret;
+                                      should_sleep_fn, p, ip);
 }
 
 bool __bch2_btree_node_relock(struct btree_iter *, unsigned);
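/*
 * Illustrative sketch, not part of this diff: the extra "unsigned long ip"
 * argument threads the call site's instruction pointer down to the lock slow
 * path, so a tracepoint there can name which caller blocked.  _THIS_IP_ is
 * the kernel macro the callers pass; toy_* names below are hypothetical.
 */
#include <linux/kernel.h>
#include <linux/printk.h>

static void toy_lock_slowpath(unsigned long ip)
{
	/* %pS resolves the saved address back to symbol+offset */
	pr_debug("lock contended at %pS\n", (void *) ip);
}

static inline void toy_lock(void)
{
	toy_lock_slowpath(_THIS_IP_);	/* capture where the caller blocked */
}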
libbcachefs/btree_types.h
index cc01baeec138daaf5e06c3e030e095e4e20b7443..39e93da10ac4b455ea65bdc347da486fc7a1c5b8 100644
@@ -47,8 +47,6 @@ struct bset_tree {
        u16                     data_offset;
        u16                     aux_data_offset;
        u16                     end_offset;
-
-       struct bpos             max_key;
 };
 
 struct btree_write {
@@ -57,7 +55,7 @@ struct btree_write {
 
 struct btree_alloc {
        struct open_buckets     ob;
-       BKEY_PADDED(k);
+       __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX);
 };
 
 struct btree_bkey_cached_common {
@@ -76,6 +74,7 @@ struct btree {
        u16                     written;
        u8                      nsets;
        u8                      nr_key_bits;
+       u16                     version_ondisk;
 
        struct bkey_format      format;
 
@@ -97,6 +96,11 @@ struct btree {
        u8                      byte_order;
        u8                      unpack_fn_len;
 
+       struct btree_write      writes[2];
+
+       /* Key/pointer for this btree node */
+       __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+
        /*
         * XXX: add a delete sequence number, so when bch2_btree_node_relock()
         * fails because the lock sequence number has changed - i.e. the
@@ -127,15 +131,6 @@ struct btree {
 
        /* lru list */
        struct list_head        list;
-
-       struct btree_write      writes[2];
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-       bool                    *expensive_debug_checks;
-#endif
-
-       /* Key/pointer for this btree node */
-       __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
 };
 
 struct btree_cache {
@@ -162,6 +157,7 @@ struct btree_cache {
        /* Number of elements in live + freeable lists */
        unsigned                used;
        unsigned                reserve;
+       atomic_t                dirty;
        struct shrinker         shrink;
 
        /*
@@ -217,13 +213,8 @@ enum btree_iter_type {
 #define BTREE_ITER_SET_POS_AFTER_COMMIT        (1 << 8)
 #define BTREE_ITER_CACHED_NOFILL       (1 << 9)
 #define BTREE_ITER_CACHED_NOCREATE     (1 << 10)
-
-#define BTREE_ITER_USER_FLAGS                          \
-       (BTREE_ITER_SLOTS                               \
-       |BTREE_ITER_INTENT                              \
-       |BTREE_ITER_PREFETCH                            \
-       |BTREE_ITER_CACHED_NOFILL                       \
-       |BTREE_ITER_CACHED_NOCREATE)
+#define BTREE_ITER_NOT_EXTENTS         (1 << 11)
+#define BTREE_ITER_ALL_SNAPSHOTS       (1 << 12)
 
 enum btree_iter_uptodate {
        BTREE_ITER_UPTODATE             = 0,
@@ -250,7 +241,11 @@ enum btree_iter_uptodate {
 struct btree_iter {
        struct btree_trans      *trans;
        struct bpos             pos;
+       /* what we're searching for/what the iterator actually points to: */
+       struct bpos             real_pos;
        struct bpos             pos_after_commit;
+       /* When we're filtering by snapshot, the snapshot ID we're looking for: */
+       unsigned                snapshot;
 
        u16                     flags;
        u8                      idx;
@@ -283,6 +278,11 @@ btree_iter_type(const struct btree_iter *iter)
        return iter->flags & BTREE_ITER_TYPE;
 }
 
+static inline bool btree_iter_is_cached(const struct btree_iter *iter)
+{
+       return btree_iter_type(iter) == BTREE_ITER_CACHED;
+}
+
 static inline struct btree_iter_level *iter_l(struct btree_iter *iter)
 {
        return iter->l + iter->level;
@@ -291,8 +291,14 @@ static inline struct btree_iter_level *iter_l(struct btree_iter *iter)
 struct btree_key_cache {
        struct mutex            lock;
        struct rhashtable       table;
+       bool                    table_init_done;
        struct list_head        freed;
-       struct list_head        clean;
+       struct shrinker         shrink;
+       unsigned                shrink_iter;
+
+       size_t                  nr_freed;
+       atomic_long_t           nr_keys;
+       atomic_long_t           nr_dirty;
 };
 
 struct bkey_cached_key {
@@ -300,7 +306,8 @@ struct bkey_cached_key {
        struct bpos             pos;
 } __attribute__((packed, aligned(4)));
 
-#define BKEY_CACHED_DIRTY              0
+#define BKEY_CACHED_ACCESSED           0
+#define BKEY_CACHED_DIRTY              1
 
 struct bkey_cached {
        struct btree_bkey_cached_common c;
@@ -308,6 +315,7 @@ struct bkey_cached {
        unsigned long           flags;
        u8                      u64s;
        bool                    valid;
+       u32                     btree_trans_barrier_seq;
        struct bkey_cached_key  key;
 
        struct rhash_head       hash;
@@ -321,7 +329,11 @@ struct bkey_cached {
 
 struct btree_insert_entry {
        unsigned                trigger_flags;
+       u8                      bkey_type;
+       enum btree_id           btree_id:8;
+       u8                      level;
        unsigned                trans_triggers_run:1;
+       unsigned                is_extent:1;
        struct bkey_i           *k;
        struct btree_iter       *iter;
 };
@@ -332,6 +344,14 @@ struct btree_insert_entry {
 #define BTREE_ITER_MAX         32
 #endif
 
+struct btree_trans_commit_hook;
+typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *);
+
+struct btree_trans_commit_hook {
+       btree_trans_commit_hook_fn      *fn;
+       struct btree_trans_commit_hook  *next;
+};
+
 struct btree_trans {
        struct bch_fs           *c;
 #ifdef CONFIG_BCACHEFS_DEBUG
@@ -344,21 +364,18 @@ struct btree_trans {
        pid_t                   pid;
 #endif
        unsigned long           ip;
+       int                     srcu_idx;
 
-       u64                     iters_linked;
-       u64                     iters_live;
-       u64                     iters_touched;
-
-       u8                      nr_iters;
        u8                      nr_updates;
        u8                      nr_updates2;
-       u8                      size;
        unsigned                used_mempool:1;
        unsigned                error:1;
-       unsigned                nounlock:1;
-       unsigned                need_reset:1;
        unsigned                in_traverse_all:1;
 
+       u64                     iters_linked;
+       u64                     iters_live;
+       u64                     iters_touched;
+
        unsigned                mem_top;
        unsigned                mem_bytes;
        void                    *mem;
@@ -368,6 +385,7 @@ struct btree_trans {
        struct btree_insert_entry *updates2;
 
        /* update path: */
+       struct btree_trans_commit_hook *hooks;
        struct jset_entry       *extra_journal_entries;
        unsigned                extra_journal_entry_u64s;
        struct journal_entry_pin *journal_pin;
@@ -380,10 +398,6 @@ struct btree_trans {
        unsigned                journal_u64s;
        unsigned                journal_preres_u64s;
        struct replicas_delta_list *fs_usage_deltas;
-
-       struct btree_iter       iters_onstack[2];
-       struct btree_insert_entry updates_onstack[2];
-       struct btree_insert_entry updates2_onstack[2];
 };
 
 #define BTREE_FLAG(flag)                                               \
@@ -408,13 +422,12 @@ enum btree_flags {
        BTREE_NODE_just_written,
        BTREE_NODE_dying,
        BTREE_NODE_fake,
-       BTREE_NODE_old_extent_overwrite,
        BTREE_NODE_need_rewrite,
+       BTREE_NODE_never_write,
 };
 
 BTREE_FLAG(read_in_flight);
 BTREE_FLAG(read_error);
-BTREE_FLAG(dirty);
 BTREE_FLAG(need_write);
 BTREE_FLAG(noevict);
 BTREE_FLAG(write_idx);
@@ -423,8 +436,8 @@ BTREE_FLAG(write_in_flight);
 BTREE_FLAG(just_written);
 BTREE_FLAG(dying);
 BTREE_FLAG(fake);
-BTREE_FLAG(old_extent_overwrite);
 BTREE_FLAG(need_rewrite);
+BTREE_FLAG(never_write);
 
 static inline struct btree_write *btree_current_write(struct btree *b)
 {
@@ -536,16 +549,16 @@ static inline unsigned bset_byte_offset(struct btree *b, void *i)
 }
 
 enum btree_node_type {
-#define x(kwd, val, name) BKEY_TYPE_##kwd = val,
+#define x(kwd, val) BKEY_TYPE_##kwd = val,
        BCH_BTREE_IDS()
 #undef x
-       BKEY_TYPE_BTREE,
+       BKEY_TYPE_btree,
 };
 
 /* Type of a key in btree @id at level @level: */
 static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id)
 {
-       return level ? BKEY_TYPE_BTREE : (enum btree_node_type) id;
+       return level ? BKEY_TYPE_btree : (enum btree_node_type) id;
 }
 
 /* Type of keys @b contains: */
@@ -557,8 +570,8 @@ static inline enum btree_node_type btree_node_type(struct btree *b)
 static inline bool btree_node_type_is_extents(enum btree_node_type type)
 {
        switch (type) {
-       case BKEY_TYPE_EXTENTS:
-       case BKEY_TYPE_REFLINK:
+       case BKEY_TYPE_extents:
+       case BKEY_TYPE_reflink:
                return true;
        default:
                return false;
@@ -580,19 +593,31 @@ static inline bool btree_iter_is_extents(struct btree_iter *iter)
        return btree_node_type_is_extents(btree_iter_key_type(iter));
 }
 
+#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS             \
+       ((1U << BKEY_TYPE_extents)|                     \
+        (1U << BKEY_TYPE_inodes)|                      \
+        (1U << BKEY_TYPE_stripes)|                     \
+        (1U << BKEY_TYPE_reflink)|                     \
+        (1U << BKEY_TYPE_btree))
+
+#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS               \
+       ((1U << BKEY_TYPE_alloc)|                       \
+        (1U << BKEY_TYPE_stripes))
+
 #define BTREE_NODE_TYPE_HAS_TRIGGERS                   \
-       ((1U << BKEY_TYPE_EXTENTS)|                     \
-        (1U << BKEY_TYPE_ALLOC)|                       \
-        (1U << BKEY_TYPE_INODES)|                      \
-        (1U << BKEY_TYPE_REFLINK)|                     \
-        (1U << BKEY_TYPE_EC)|                          \
-        (1U << BKEY_TYPE_BTREE))
+       (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS|            \
+        BTREE_NODE_TYPE_HAS_MEM_TRIGGERS)
 
-#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS             \
-       ((1U << BKEY_TYPE_EXTENTS)|                     \
-        (1U << BKEY_TYPE_INODES)|                      \
-        (1U << BKEY_TYPE_EC)|                          \
-        (1U << BKEY_TYPE_REFLINK))
+#define BTREE_ID_HAS_SNAPSHOTS                         \
+       ((1U << BTREE_ID_extents)|                      \
+        (1U << BTREE_ID_inodes)|                       \
+        (1U << BTREE_ID_dirents)|                      \
+        (1U << BTREE_ID_xattrs))
+
+static inline bool btree_type_has_snapshots(enum btree_id id)
+{
+       return (1 << id) & BTREE_ID_HAS_SNAPSHOTS;
+}
 
 enum btree_trigger_flags {
        __BTREE_TRIGGER_NORUN,          /* Don't run triggers at all */
@@ -643,6 +668,7 @@ enum btree_insert_ret {
        BTREE_INSERT_ENOSPC,
        BTREE_INSERT_NEED_MARK_REPLICAS,
        BTREE_INSERT_NEED_JOURNAL_RES,
+       BTREE_INSERT_NEED_JOURNAL_RECLAIM,
 };
 
 enum btree_gc_coalesce_fail_reason {
libbcachefs/btree_update.h
index e0b1bde37484d990528aa9f78237d5b76bf2d440..4ce12ae29a556ff3edcfe881c40787c071996dd8 100644
@@ -20,7 +20,6 @@ enum btree_insert_flags {
        __BTREE_INSERT_NOCHECK_RW,
        __BTREE_INSERT_LAZY_RW,
        __BTREE_INSERT_USE_RESERVE,
-       __BTREE_INSERT_USE_ALLOC_RESERVE,
        __BTREE_INSERT_JOURNAL_REPLAY,
        __BTREE_INSERT_JOURNAL_RESERVED,
        __BTREE_INSERT_JOURNAL_RECLAIM,
@@ -43,7 +42,6 @@ enum btree_insert_flags {
 
 /* for copygc, or when merging btree nodes */
 #define BTREE_INSERT_USE_RESERVE       (1 << __BTREE_INSERT_USE_RESERVE)
-#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE)
 
 /* Insert is for journal replay - don't get journal reservations: */
 #define BTREE_INSERT_JOURNAL_REPLAY    (1 << __BTREE_INSERT_JOURNAL_REPLAY)
@@ -67,8 +65,8 @@ int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *);
 int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
                     struct disk_reservation *, u64 *, int flags);
 
-int bch2_btree_delete_at_range(struct btree_trans *, struct btree_iter *,
-                              struct bpos, u64 *);
+int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id,
+                                 struct bpos, struct bpos, u64 *);
 int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
                            struct bpos, struct bpos, u64 *);
 
@@ -79,6 +77,8 @@ int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *,
 
 int bch2_trans_update(struct btree_trans *, struct btree_iter *,
                      struct bkey_i *, enum btree_trigger_flags);
+void bch2_trans_commit_hook(struct btree_trans *,
+                           struct btree_trans_commit_hook *);
 int __bch2_trans_commit(struct btree_trans *);
 
 /**
libbcachefs/btree_update_interior.c
index a2604b0ce2d83eb91bba44b78f3350703e112ea0..00144707988f6b475d1b89c560e05fa72f072d4d 100644
@@ -11,6 +11,7 @@
 #include "btree_iter.h"
 #include "btree_locking.h"
 #include "buckets.h"
+#include "error.h"
 #include "extents.h"
 #include "journal.h"
 #include "journal_reclaim.h"
@@ -34,6 +35,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
        struct bkey_s_c k;
        struct bkey_s_c_btree_ptr_v2 bp;
        struct bkey unpacked;
+       char buf1[100], buf2[100];
 
        BUG_ON(!b->c.level);
 
@@ -48,16 +50,26 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
                        break;
                bp = bkey_s_c_to_btree_ptr_v2(k);
 
-               BUG_ON(bkey_cmp(next_node, bp.v->min_key));
+               if (bpos_cmp(next_node, bp.v->min_key)) {
+                       bch2_dump_btree_node(c, b);
+                       panic("expected next min_key %s got %s\n",
+                             (bch2_bpos_to_text(&PBUF(buf1), next_node), buf1),
+                             (bch2_bpos_to_text(&PBUF(buf2), bp.v->min_key), buf2));
+               }
 
                bch2_btree_node_iter_advance(&iter, b);
 
                if (bch2_btree_node_iter_end(&iter)) {
-                       BUG_ON(bkey_cmp(k.k->p, b->key.k.p));
+                       if (bpos_cmp(k.k->p, b->key.k.p)) {
+                               bch2_dump_btree_node(c, b);
+                               panic("expected end %s got %s\n",
+                                     (bch2_bpos_to_text(&PBUF(buf1), b->key.k.p), buf1),
+                                     (bch2_bpos_to_text(&PBUF(buf2), k.k->p), buf2));
+                       }
                        break;
                }
 
-               next_node = bkey_successor(k.k->p);
+               next_node = bpos_successor(k.k->p);
        }
 #endif
 }
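/*
 * Illustrative sketch, not part of this diff: the panic() calls above use the
 * comma operator to format a bpos into a stack buffer and hand that buffer to
 * the format string in one expression:
 *
 *	(bch2_bpos_to_text(&PBUF(buf1), pos), buf1)
 *
 * i.e. "call the formatter for its side effect, then evaluate to buf1".
 * A generic toy version of the idiom:
 */
#include <stdio.h>

#define TOY_FMT(buf, val)	(snprintf(buf, sizeof(buf), "%d", (val)), (buf))

static void toy_report(int expected, int got)
{
	char buf1[16], buf2[16];

	printf("expected %s got %s\n", TOY_FMT(buf1, expected), TOY_FMT(buf2, got));
}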
@@ -70,11 +82,9 @@ void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
        struct bset_tree *t;
        struct bkey uk;
 
-       bch2_bkey_format_add_pos(s, b->data->min_key);
-
        for_each_bset(b, t)
                bset_tree_for_each_key(b, t, k)
-                       if (!bkey_whiteout(k)) {
+                       if (!bkey_deleted(k)) {
                                uk = bkey_unpack_key(b, k);
                                bch2_bkey_format_add_key(s, &uk);
                        }
@@ -85,6 +95,8 @@ static struct bkey_format bch2_btree_calc_format(struct btree *b)
        struct bkey_format_state s;
 
        bch2_bkey_format_init(&s);
+       bch2_bkey_format_add_pos(&s, b->data->min_key);
+       bch2_bkey_format_add_pos(&s, b->data->max_key);
        __bch2_btree_calc_format(&s, b);
 
        return bch2_bkey_format_done(&s);
@@ -149,7 +161,7 @@ void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
 
        b->ob.nr = 0;
 
-       clear_btree_node_dirty(b);
+       clear_btree_node_dirty(c, b);
 
        btree_node_lock_type(c, b, SIX_LOCK_write);
        __btree_node_free(c, b);
@@ -179,21 +191,18 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
 {
        struct write_point *wp;
        struct btree *b;
-       BKEY_PADDED(k) tmp;
+       __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
        struct open_buckets ob = { .nr = 0 };
        struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
        unsigned nr_reserve;
        enum alloc_reserve alloc_reserve;
 
-       if (flags & BTREE_INSERT_USE_ALLOC_RESERVE) {
+       if (flags & BTREE_INSERT_USE_RESERVE) {
                nr_reserve      = 0;
-               alloc_reserve   = RESERVE_ALLOC;
-       } else if (flags & BTREE_INSERT_USE_RESERVE) {
-               nr_reserve      = BTREE_NODE_RESERVE / 2;
-               alloc_reserve   = RESERVE_BTREE;
+               alloc_reserve   = RESERVE_BTREE_MOVINGGC;
        } else {
                nr_reserve      = BTREE_NODE_RESERVE;
-               alloc_reserve   = RESERVE_NONE;
+               alloc_reserve   = RESERVE_BTREE;
        }
 
        mutex_lock(&c->btree_reserve_cache_lock);
@@ -209,7 +218,10 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
        mutex_unlock(&c->btree_reserve_cache_lock);
 
 retry:
-       wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, 0,
+       wp = bch2_alloc_sectors_start(c,
+                                     c->opts.metadata_target ?:
+                                     c->opts.foreground_target,
+                                     0,
                                      writepoint_ptr(&c->btree_write_point),
                                      &devs_have,
                                      res->nr_replicas,
@@ -264,19 +276,19 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
        b = as->prealloc_nodes[--as->nr_prealloc_nodes];
 
        set_btree_node_accessed(b);
-       set_btree_node_dirty(b);
+       set_btree_node_dirty(c, b);
        set_btree_node_need_write(b);
 
        bch2_bset_init_first(b, &b->data->keys);
        b->c.level      = level;
        b->c.btree_id   = as->btree_id;
+       b->version_ondisk = c->sb.version;
 
        memset(&b->nr, 0, sizeof(b->nr));
        b->data->magic = cpu_to_le64(bset_magic(c));
        b->data->flags = 0;
        SET_BTREE_NODE_ID(b->data, as->btree_id);
        SET_BTREE_NODE_LEVEL(b->data, level);
-       b->data->ptr = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)).start->ptr;
 
        if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
                struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key);
@@ -284,17 +296,9 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
                bp->v.mem_ptr           = 0;
                bp->v.seq               = b->data->keys.seq;
                bp->v.sectors_written   = 0;
-               bp->v.sectors           = cpu_to_le16(c->opts.btree_node_size);
        }
 
-       if (c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))
-               SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true);
-
-       if (btree_node_is_extents(b) &&
-           !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) {
-               set_btree_node_old_extent_overwrite(b);
-               set_btree_node_need_rewrite(b);
-       }
+       SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true);
 
        bch2_btree_build_aux_trees(b);
 
@@ -433,10 +437,6 @@ static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes,
                        goto err_free;
                }
 
-               ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key));
-               if (ret)
-                       goto err_free;
-
                as->prealloc_nodes[as->nr_prealloc_nodes++] = b;
        }
 
@@ -454,6 +454,10 @@ static void bch2_btree_update_free(struct btree_update *as)
 {
        struct bch_fs *c = as->c;
 
+       if (as->took_gc_lock)
+               up_read(&c->gc_lock);
+       as->took_gc_lock = false;
+
        bch2_journal_preres_put(&c->journal, &as->journal_preres);
 
        bch2_journal_pin_drop(&c->journal, &as->journal);
@@ -503,14 +507,18 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans,
        trans->journal_pin = &as->journal;
 
        for_each_keylist_key(&as->new_keys, k) {
-               ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k),
+               ret = bch2_trans_mark_key(trans,
+                                         bkey_s_c_null,
+                                         bkey_i_to_s_c(k),
                                          0, 0, BTREE_TRIGGER_INSERT);
                if (ret)
                        return ret;
        }
 
        for_each_keylist_key(&as->old_keys, k) {
-               ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k),
+               ret = bch2_trans_mark_key(trans,
+                                         bkey_i_to_s_c(k),
+                                         bkey_s_c_null,
                                          0, 0, BTREE_TRIGGER_OVERWRITE);
                if (ret)
                        return ret;
@@ -523,10 +531,24 @@ static void btree_update_nodes_written(struct btree_update *as)
 {
        struct bch_fs *c = as->c;
        struct btree *b = as->b;
+       struct btree_trans trans;
        u64 journal_seq = 0;
        unsigned i;
        int ret;
 
+       /*
+        * If we're already in an error state, it might be because a btree node
+        * was never written, and we might be trying to free that same btree
+        * node here, but it won't have been marked as allocated and we'll see
+        * spurious disk usage inconsistencies in the transactional part below
+        * if we don't skip it:
+        */
+       ret = bch2_journal_error(&c->journal);
+       if (ret)
+               goto err;
+
+       BUG_ON(!journal_pin_active(&as->journal));
+
        /*
         * We did an update to a parent node where the pointers we added pointed
         * to child nodes that weren't written yet: now, the child nodes have
@@ -540,16 +562,18 @@ static void btree_update_nodes_written(struct btree_update *as)
         * journal reclaim does btree updates when flushing bkey_cached entries,
         * which may require allocations as well.
         */
-       ret = bch2_trans_do(c, &as->disk_res, &journal_seq,
-                           BTREE_INSERT_NOFAIL|
-                           BTREE_INSERT_USE_RESERVE|
-                           BTREE_INSERT_USE_ALLOC_RESERVE|
-                           BTREE_INSERT_NOCHECK_RW|
-                           BTREE_INSERT_JOURNAL_RECLAIM|
-                           BTREE_INSERT_JOURNAL_RESERVED,
-                           btree_update_nodes_written_trans(&trans, as));
-       BUG_ON(ret && !bch2_journal_error(&c->journal));
-
+       bch2_trans_init(&trans, c, 0, 512);
+       ret = __bch2_trans_do(&trans, &as->disk_res, &journal_seq,
+                             BTREE_INSERT_NOFAIL|
+                             BTREE_INSERT_NOCHECK_RW|
+                             BTREE_INSERT_JOURNAL_RECLAIM|
+                             BTREE_INSERT_JOURNAL_RESERVED,
+                             btree_update_nodes_written_trans(&trans, as));
+       bch2_trans_exit(&trans);
+
+       bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
+                            "error %i in btree_update_nodes_written()", ret);
+err:
        if (b) {
                /*
                 * @b is the node we did the final insert into:
@@ -569,17 +593,30 @@ static void btree_update_nodes_written(struct btree_update *as)
 
                list_del(&as->write_blocked_list);
 
-               if (!ret && as->b == b) {
+               /*
+                * Node might have been freed, recheck under
+                * btree_interior_update_lock:
+                */
+               if (as->b == b) {
                        struct bset *i = btree_bset_last(b);
 
                        BUG_ON(!b->c.level);
                        BUG_ON(!btree_node_dirty(b));
 
-                       i->journal_seq = cpu_to_le64(
-                               max(journal_seq,
-                                   le64_to_cpu(i->journal_seq)));
-
-                       bch2_btree_add_journal_pin(c, b, journal_seq);
+                       if (!ret) {
+                               i->journal_seq = cpu_to_le64(
+                                       max(journal_seq,
+                                           le64_to_cpu(i->journal_seq)));
+
+                               bch2_btree_add_journal_pin(c, b, journal_seq);
+                       } else {
+                               /*
+                                * If we didn't get a journal sequence number we
+                                * can't write this btree node, because recovery
+                                * won't know to ignore this write:
+                                */
+                               set_btree_node_never_write(b);
+                       }
                }
 
                mutex_unlock(&c->btree_interior_update_lock);
@@ -680,17 +717,7 @@ static void btree_update_reparent(struct btree_update *as,
        child->b = NULL;
        child->mode = BTREE_INTERIOR_UPDATING_AS;
 
-       /*
-        * When we write a new btree root, we have to drop our journal pin
-        * _before_ the new nodes are technically reachable; see
-        * btree_update_nodes_written().
-        *
-        * This goes for journal pins that are recursively blocked on us - so,
-        * just transfer the journal pin to the new interior update so
-        * btree_update_nodes_written() can drop it.
-        */
        bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL);
-       bch2_journal_pin_drop(&c->journal, &child->journal);
 }
 
 static void btree_update_updated_root(struct btree_update *as, struct btree *b)
@@ -827,7 +854,7 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
                closure_wake_up(&c->btree_interior_update_wait);
        }
 
-       clear_btree_node_dirty(b);
+       clear_btree_node_dirty(c, b);
        clear_btree_node_need_write(b);
 
        /*
@@ -866,24 +893,33 @@ void bch2_btree_update_done(struct btree_update *as)
 {
        BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE);
 
+       if (as->took_gc_lock)
+               up_read(&as->c->gc_lock);
+       as->took_gc_lock = false;
+
        bch2_btree_reserve_put(as);
 
        continue_at(&as->cl, btree_update_set_nodes_written, system_freezable_wq);
 }
 
 struct btree_update *
-bch2_btree_update_start(struct btree_trans *trans, enum btree_id id,
-                       unsigned nr_nodes, unsigned flags,
-                       struct closure *cl)
+bch2_btree_update_start(struct btree_iter *iter, unsigned level,
+                       unsigned nr_nodes, unsigned flags)
 {
+       struct btree_trans *trans = iter->trans;
        struct bch_fs *c = trans->c;
        struct btree_update *as;
+       struct closure cl;
        int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
                ? BCH_DISK_RESERVATION_NOFAIL : 0;
-       int journal_flags = (flags & BTREE_INSERT_JOURNAL_RESERVED)
-               ? JOURNAL_RES_GET_RECLAIM : 0;
+       int journal_flags = 0;
        int ret = 0;
 
+       if (flags & BTREE_INSERT_JOURNAL_RESERVED)
+               journal_flags |= JOURNAL_RES_GET_RESERVED;
+
+       closure_init_stack(&cl);
+retry:
        /*
         * This check isn't necessary for correctness - it's just to potentially
         * prevent us from doing a lot of work that'll end up being wasted:
@@ -892,12 +928,36 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id,
        if (ret)
                return ERR_PTR(ret);
 
+       /*
+        * XXX: figure out how far we might need to split,
+        * instead of locking/reserving all the way to the root:
+        */
+       if (!bch2_btree_iter_upgrade(iter, U8_MAX)) {
+               trace_trans_restart_iter_upgrade(trans->ip);
+               return ERR_PTR(-EINTR);
+       }
+
+       if (flags & BTREE_INSERT_GC_LOCK_HELD)
+               lockdep_assert_held(&c->gc_lock);
+       else if (!down_read_trylock(&c->gc_lock)) {
+               if (flags & BTREE_INSERT_NOUNLOCK)
+                       return ERR_PTR(-EINTR);
+
+               bch2_trans_unlock(trans);
+               down_read(&c->gc_lock);
+               if (!bch2_trans_relock(trans)) {
+                       up_read(&c->gc_lock);
+                       return ERR_PTR(-EINTR);
+               }
+       }
+
        as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO);
        memset(as, 0, sizeof(*as));
        closure_init(&as->cl, NULL);
        as->c           = c;
        as->mode        = BTREE_INTERIOR_NO_UPDATE;
-       as->btree_id    = id;
+       as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD);
+       as->btree_id    = iter->btree_id;
        INIT_LIST_HEAD(&as->list);
        INIT_LIST_HEAD(&as->unwritten_list);
        INIT_LIST_HEAD(&as->write_blocked_list);
@@ -909,16 +969,25 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id,
                                      BTREE_UPDATE_JOURNAL_RES,
                                      journal_flags|JOURNAL_RES_GET_NONBLOCK);
        if (ret == -EAGAIN) {
-               if (flags & BTREE_INSERT_NOUNLOCK)
-                       return ERR_PTR(-EINTR);
+               /*
+                * this would be cleaner if bch2_journal_preres_get() took a
+                * closure argument
+                */
+               if (flags & BTREE_INSERT_NOUNLOCK) {
+                       ret = -EINTR;
+                       goto err;
+               }
 
                bch2_trans_unlock(trans);
 
+               if (flags & BTREE_INSERT_JOURNAL_RECLAIM)
+                       goto err;
+
                ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
                                BTREE_UPDATE_JOURNAL_RES,
                                journal_flags);
                if (ret)
-                       return ERR_PTR(ret);
+                       goto err;
 
                if (!bch2_trans_relock(trans)) {
                        ret = -EINTR;
@@ -933,10 +1002,15 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id,
        if (ret)
                goto err;
 
-       ret = bch2_btree_reserve_get(as, nr_nodes, flags, cl);
+       ret = bch2_btree_reserve_get(as, nr_nodes, flags,
+               !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
        if (ret)
                goto err;
 
+       bch2_journal_pin_add(&c->journal,
+                            atomic64_read(&c->journal.seq),
+                            &as->journal, NULL);
+
        mutex_lock(&c->btree_interior_update_lock);
        list_add_tail(&as->list, &c->btree_interior_update_list);
        mutex_unlock(&c->btree_interior_update_lock);
@@ -944,6 +1018,18 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id,
        return as;
 err:
        bch2_btree_update_free(as);
+
+       if (ret == -EAGAIN) {
+               BUG_ON(flags & BTREE_INSERT_NOUNLOCK);
+
+               bch2_trans_unlock(trans);
+               closure_sync(&cl);
+               ret = -EINTR;
+       }
+
+       if (ret == -EINTR && bch2_trans_relock(trans))
+               goto retry;
+
        return ERR_PTR(ret);
 }
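
The rewritten bch2_btree_update_start() above now owns the unlock/relock retry loop itself: an -EAGAIN from the reserve path is turned into -EINTR after waiting on the closure, and any -EINTR retries from the top once bch2_trans_relock() succeeds. A minimal standalone sketch of that retry shape, with hypothetical relock()/try_operation() helpers standing in for the real calls:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for bch2_trans_relock() and the guarded work: */
static bool relock(void)
{
        return true;
}

static int try_operation(int attempt)
{
        /* Pretend the first attempt races with another thread: */
        return attempt == 0 ? -EINTR : 0;
}

/*
 * Shape of the retry loop used above: on -EINTR, retry only if the
 * locks can be retaken; otherwise hand the error back so the caller
 * restarts the whole transaction.
 */
static int do_update(void)
{
        int attempt = 0, ret;
retry:
        ret = try_operation(attempt++);
        if (ret == -EINTR && relock())
                goto retry;
        return ret;
}

int main(void)
{
        printf("do_update() = %d\n", do_update());
        return 0;
}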
 
@@ -956,6 +1042,11 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
        list_del_init(&b->list);
        mutex_unlock(&c->btree_cache.lock);
 
+       if (b->c.level)
+               six_lock_pcpu_alloc(&b->c.lock);
+       else
+               six_lock_pcpu_free(&b->c.lock);
+
        mutex_lock(&c->btree_root_lock);
        BUG_ON(btree_node_root(c, b) &&
               (b->c.level < btree_node_root(c, b)->c.level ||
@@ -1018,7 +1109,19 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
                                        struct bkey_i *insert,
                                        struct btree_node_iter *node_iter)
 {
+       struct bch_fs *c = as->c;
        struct bkey_packed *k;
+       const char *invalid;
+
+       invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?:
+               bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert));
+       if (invalid) {
+               char buf[160];
+
+               bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert));
+               bch2_fs_inconsistent(c, "inserting invalid bkey %s: %s", buf, invalid);
+               dump_stack();
+       }
 
        BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
               ARRAY_SIZE(as->journal_entries));
@@ -1034,7 +1137,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
                bch2_btree_node_iter_advance(node_iter, b);
 
        bch2_btree_bset_insert_key(iter, b, node_iter, insert);
-       set_btree_node_dirty(b);
+       set_btree_node_dirty(c, b);
        set_btree_node_need_write(b);
 }
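
bch2_insert_fixup_btree_ptr() now validates the key it is about to insert and, rather than crashing, reports the inconsistency with a formatted description and a stack dump. A self-contained sketch of that validate/log/continue pattern (the struct, field names and validate_key() are invented for the illustration):

#include <stdio.h>

struct key {
        unsigned long long offset;
        unsigned int size;
};

/*
 * Returns NULL when @k looks sane, or a human-readable reason when it
 * does not -- mirroring the "const char *invalid" convention above.
 */
static const char *validate_key(const struct key *k)
{
        if (!k->size)
                return "zero size";
        if (k->offset + k->size < k->offset)
                return "offset + size overflows";
        return NULL;
}

static void insert_key(const struct key *k)
{
        const char *invalid = validate_key(k);

        if (invalid) {
                char buf[160];

                snprintf(buf, sizeof(buf), "offset %llu size %u",
                         k->offset, k->size);
                fprintf(stderr, "inserting invalid key %s: %s\n",
                        buf, invalid);
                /* in the kernel, a dump_stack() would go here */
        }

        /* ... the insert itself proceeds, as in the patch above ... */
}

int main(void)
{
        struct key bad = { .offset = ~0ULL, .size = 16 };

        insert_key(&bad);
        return 0;
}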
 
@@ -1046,10 +1149,12 @@ static struct btree *__btree_split_node(struct btree_update *as,
                                        struct btree *n1,
                                        struct btree_iter *iter)
 {
+       struct bkey_format_state s;
        size_t nr_packed = 0, nr_unpacked = 0;
        struct btree *n2;
        struct bset *set1, *set2;
-       struct bkey_packed *k, *prev = NULL;
+       struct bkey_packed *k, *set2_start, *set2_end, *out, *prev = NULL;
+       struct bpos n1_pos;
 
        n2 = bch2_btree_node_alloc(as, n1->c.level);
        bch2_btree_update_add_new_node(as, n2);
@@ -1059,8 +1164,6 @@ static struct btree *__btree_split_node(struct btree_update *as,
        SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data));
        n2->key.k.p = n1->key.k.p;
 
-       btree_node_set_format(n2, n2->data->format);
-
        set1 = btree_bset_first(n1);
        set2 = btree_bset_first(n2);
 
@@ -1070,7 +1173,7 @@ static struct btree *__btree_split_node(struct btree_update *as,
         */
        k = set1->start;
        while (1) {
-               struct bkey_packed *n = bkey_next_skip_noops(k, vstruct_last(set1));
+               struct bkey_packed *n = bkey_next(k);
 
                if (n == vstruct_last(set1))
                        break;
@@ -1087,33 +1190,53 @@ static struct btree *__btree_split_node(struct btree_update *as,
        }
 
        BUG_ON(!prev);
+       set2_start      = k;
+       set2_end        = vstruct_last(set1);
 
-       btree_set_max(n1, bkey_unpack_pos(n1, prev));
-       btree_set_min(n2, bkey_successor(n1->key.k.p));
-
-       set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k);
-       set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s));
-
+       set1->u64s = cpu_to_le16((u64 *) set2_start - set1->_data);
        set_btree_bset_end(n1, n1->set);
-       set_btree_bset_end(n2, n2->set);
-
-       n2->nr.live_u64s        = le16_to_cpu(set2->u64s);
-       n2->nr.bset_u64s[0]     = le16_to_cpu(set2->u64s);
-       n2->nr.packed_keys      = n1->nr.packed_keys - nr_packed;
-       n2->nr.unpacked_keys    = n1->nr.unpacked_keys - nr_unpacked;
 
        n1->nr.live_u64s        = le16_to_cpu(set1->u64s);
        n1->nr.bset_u64s[0]     = le16_to_cpu(set1->u64s);
        n1->nr.packed_keys      = nr_packed;
        n1->nr.unpacked_keys    = nr_unpacked;
 
+       n1_pos = bkey_unpack_pos(n1, prev);
+       if (as->c->sb.version < bcachefs_metadata_version_snapshot)
+               n1_pos.snapshot = U32_MAX;
+
+       btree_set_max(n1, n1_pos);
+       btree_set_min(n2, bpos_successor(n1->key.k.p));
+
+       bch2_bkey_format_init(&s);
+       bch2_bkey_format_add_pos(&s, n2->data->min_key);
+       bch2_bkey_format_add_pos(&s, n2->data->max_key);
+
+       for (k = set2_start; k != set2_end; k = bkey_next(k)) {
+               struct bkey uk = bkey_unpack_key(n1, k);
+               bch2_bkey_format_add_key(&s, &uk);
+       }
+
+       n2->data->format = bch2_bkey_format_done(&s);
+       btree_node_set_format(n2, n2->data->format);
+
+       out = set2->start;
+       memset(&n2->nr, 0, sizeof(n2->nr));
+
+       for (k = set2_start; k != set2_end; k = bkey_next(k)) {
+               BUG_ON(!bch2_bkey_transform(&n2->format, out, bkey_packed(k)
+                                      ? &n1->format : &bch2_bkey_format_current, k));
+               out->format = KEY_FORMAT_LOCAL_BTREE;
+               btree_keys_account_key_add(&n2->nr, 0, out);
+               out = bkey_next(out);
+       }
+
+       set2->u64s = cpu_to_le16((u64 *) out - set2->_data);
+       set_btree_bset_end(n2, n2->set);
+
        BUG_ON(!set1->u64s);
        BUG_ON(!set2->u64s);
 
-       memcpy_u64s(set2->start,
-                   vstruct_end(set1),
-                   le16_to_cpu(set2->u64s));
-
        btree_node_reset_sib_u64s(n1);
        btree_node_reset_sib_u64s(n2);
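
With this change __btree_split_node() no longer memcpy()s the tail of set1 into the new node; it derives a fresh bkey format from n2's own min/max keys and repacks every key with bch2_bkey_transform(), so the right-hand node can use a tighter encoding. The toy program below shows the same idea on plain integers: the right half of a split gets its own (base, bit-width) "format" computed from its key range, and keys are re-encoded relative to it. Names and the packing scheme are invented for illustration.

#include <stdint.h>
#include <stdio.h>

struct fmt {
        uint64_t base;
        unsigned bits;
};

static unsigned bits_needed(uint64_t v)
{
        unsigned b = 0;

        while (v) {
                b++;
                v >>= 1;
        }
        return b ? b : 1;
}

/* Compute a packed "format" from the key range of one node: */
static struct fmt fmt_for_range(uint64_t min, uint64_t max)
{
        return (struct fmt) { .base = min, .bits = bits_needed(max - min) };
}

static uint64_t pack(struct fmt f, uint64_t k)   { return k - f.base; }
static uint64_t unpack(struct fmt f, uint64_t p) { return p + f.base; }

int main(void)
{
        uint64_t keys[] = { 10, 20, 1000000, 1000005, 1000009 };
        unsigned split = 2, i;
        /* The right half of the split gets its own format, like n2: */
        struct fmt f2 = fmt_for_range(keys[split], keys[4]);

        printf("right-half format: base=%llu bits=%u\n",
               (unsigned long long) f2.base, f2.bits);

        for (i = split; i < 5; i++) {
                uint64_t p = pack(f2, keys[i]);

                printf("key %llu -> packed %llu -> unpacked %llu\n",
                       (unsigned long long) keys[i],
                       (unsigned long long) p,
                       (unsigned long long) unpack(f2, p));
        }
        return 0;
}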
 
@@ -1148,7 +1271,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
        struct bkey_packed *src, *dst, *n;
        struct bset *i;
 
-       BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE);
+       BUG_ON(btree_node_type(b) != BKEY_TYPE_btree);
 
        bch2_btree_node_iter_init(&node_iter, b, &k->k.p);
 
@@ -1167,7 +1290,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
        i = btree_bset_first(b);
        src = dst = i->start;
        while (src != vstruct_last(i)) {
-               n = bkey_next_skip_noops(src, vstruct_last(i));
+               n = bkey_next(src);
                if (!bkey_deleted(src)) {
                        memmove_u64s_down(dst, src, src->u64s);
                        dst = bkey_next(dst);
@@ -1175,6 +1298,9 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
                src = n;
        }
 
+       /* Also clear out the unwritten whiteouts area: */
+       b->whiteout_u64s = 0;
+
        i->u64s = cpu_to_le16((u64 *) dst - i->_data);
        set_btree_bset_end(b, b->set);
 
@@ -1313,7 +1439,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
         * the node the iterator points to:
         */
        while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
-              (bkey_cmp_packed(b, k, &insert->k) >= 0))
+              (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0))
                ;
 
        for_each_keylist_key(keys, insert)
@@ -1348,14 +1474,12 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
        int old_live_u64s = b->nr.live_u64s;
        int live_u64s_added, u64s_added;
 
+       lockdep_assert_held(&c->gc_lock);
        BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level));
        BUG_ON(!b->c.level);
        BUG_ON(!as || as->b);
        bch2_verify_keylist_sorted(keys);
 
-       if (as->must_rewrite)
-               goto split;
-
        bch2_btree_node_lock_for_insert(c, b, iter);
 
        if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) {
@@ -1363,6 +1487,8 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
                goto split;
        }
 
+       btree_node_interior_verify(c, b);
+
        bch2_btree_insert_keys_interior(as, b, iter, keys);
 
        live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
@@ -1380,14 +1506,6 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
        bch2_btree_node_unlock_write(b, iter);
 
        btree_node_interior_verify(c, b);
-
-       /*
-        * when called from the btree_split path the new nodes aren't added to
-        * the btree iterator yet, so the merge path's unlock/wait/relock dance
-        * won't work:
-        */
-       bch2_foreground_maybe_merge(c, iter, b->c.level,
-                                   flags|BTREE_INSERT_NOUNLOCK);
        return;
 split:
        btree_split(as, b, iter, keys, flags);
@@ -1396,118 +1514,73 @@ split:
 int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
                          unsigned flags)
 {
-       struct btree_trans *trans = iter->trans;
        struct btree *b = iter_l(iter)->b;
        struct btree_update *as;
-       struct closure cl;
+       unsigned l;
        int ret = 0;
-       struct btree_insert_entry *i;
-
-       /*
-        * We already have a disk reservation and open buckets pinned; this
-        * allocation must not block:
-        */
-       trans_for_each_update(trans, i)
-               if (btree_node_type_needs_gc(i->iter->btree_id))
-                       flags |= BTREE_INSERT_USE_RESERVE;
-
-       closure_init_stack(&cl);
-
-       /* Hack, because gc and splitting nodes doesn't mix yet: */
-       if (!(flags & BTREE_INSERT_GC_LOCK_HELD) &&
-           !down_read_trylock(&c->gc_lock)) {
-               if (flags & BTREE_INSERT_NOUNLOCK) {
-                       trace_transaction_restart_ip(trans->ip, _THIS_IP_);
-                       return -EINTR;
-               }
-
-               bch2_trans_unlock(trans);
-               down_read(&c->gc_lock);
 
-               if (!bch2_trans_relock(trans))
-                       ret = -EINTR;
-       }
-
-       /*
-        * XXX: figure out how far we might need to split,
-        * instead of locking/reserving all the way to the root:
-        */
-       if (!bch2_btree_iter_upgrade(iter, U8_MAX)) {
-               trace_trans_restart_iter_upgrade(trans->ip);
-               ret = -EINTR;
-               goto out;
-       }
-
-       as = bch2_btree_update_start(trans, iter->btree_id,
-               btree_update_reserve_required(c, b), flags,
-               !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
-       if (IS_ERR(as)) {
-               ret = PTR_ERR(as);
-               if (ret == -EAGAIN) {
-                       BUG_ON(flags & BTREE_INSERT_NOUNLOCK);
-                       bch2_trans_unlock(trans);
-                       ret = -EINTR;
-
-                       trace_transaction_restart_ip(trans->ip, _THIS_IP_);
-               }
-               goto out;
-       }
+       as = bch2_btree_update_start(iter, iter->level,
+               btree_update_reserve_required(c, b), flags);
+       if (IS_ERR(as))
+               return PTR_ERR(as);
 
        btree_split(as, b, iter, NULL, flags);
        bch2_btree_update_done(as);
 
-       /*
-        * We haven't successfully inserted yet, so don't downgrade all the way
-        * back to read locks;
-        */
-       __bch2_btree_iter_downgrade(iter, 1);
-out:
-       if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
-               up_read(&c->gc_lock);
-       closure_sync(&cl);
+       for (l = iter->level + 1; btree_iter_node(iter, l) && !ret; l++)
+               ret = bch2_foreground_maybe_merge(c, iter, l, flags);
+
        return ret;
 }
 
-void __bch2_foreground_maybe_merge(struct bch_fs *c,
-                                  struct btree_iter *iter,
-                                  unsigned level,
-                                  unsigned flags,
-                                  enum btree_node_sibling sib)
+int __bch2_foreground_maybe_merge(struct bch_fs *c,
+                                 struct btree_iter *iter,
+                                 unsigned level,
+                                 unsigned flags,
+                                 enum btree_node_sibling sib)
 {
        struct btree_trans *trans = iter->trans;
+       struct btree_iter *sib_iter = NULL;
        struct btree_update *as;
        struct bkey_format_state new_s;
        struct bkey_format new_f;
        struct bkey_i delete;
        struct btree *b, *m, *n, *prev, *next, *parent;
-       struct closure cl;
+       struct bpos sib_pos;
        size_t sib_u64s;
-       int ret = 0;
+       int ret = 0, ret2 = 0;
 
        BUG_ON(!btree_node_locked(iter, level));
-
-       closure_init_stack(&cl);
 retry:
+       ret = bch2_btree_iter_traverse(iter);
+       if (ret)
+               goto err;
+
        BUG_ON(!btree_node_locked(iter, level));
 
        b = iter->l[level].b;
 
-       parent = btree_node_parent(iter, b);
-       if (!parent)
+       if ((sib == btree_prev_sib && !bpos_cmp(b->data->min_key, POS_MIN)) ||
+           (sib == btree_next_sib && !bpos_cmp(b->data->max_key, POS_MAX))) {
+               b->sib_u64s[sib] = U16_MAX;
                goto out;
+       }
 
-       if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c))
-               goto out;
+       sib_pos = sib == btree_prev_sib
+               ? bpos_predecessor(b->data->min_key)
+               : bpos_successor(b->data->max_key);
 
-       /* XXX: can't be holding read locks */
-       m = bch2_btree_node_get_sibling(c, iter, b, sib);
-       if (IS_ERR(m)) {
-               ret = PTR_ERR(m);
+       sib_iter = bch2_trans_get_node_iter(trans, iter->btree_id,
+                                           sib_pos, U8_MAX, level,
+                                           BTREE_ITER_INTENT);
+       ret = bch2_btree_iter_traverse(sib_iter);
+       if (ret)
                goto err;
-       }
 
-       /* NULL means no sibling: */
-       if (!m) {
+       m = sib_iter->l[level].b;
+
+       if (btree_node_parent(iter, b) !=
+           btree_node_parent(sib_iter, m)) {
                b->sib_u64s[sib] = U16_MAX;
                goto out;
        }
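
The merge path now locates the sibling purely by position arithmetic: bpos_predecessor() of the node's min_key or bpos_successor() of its max_key. Since a btree position is a multi-field key, successor/predecessor must carry or borrow across fields. The sketch below assumes a simplified (inode, offset, snapshot) layout, consistent with the snapshot field visible earlier in this patch; the real struct bpos and helpers may differ.

#include <stdint.h>
#include <stdio.h>

/* Simplified position; the real struct bpos may differ (assumption). */
struct pos {
        uint64_t inode;
        uint64_t offset;
        uint32_t snapshot;
};

/* Successor: increment the least-significant field, carrying upward. */
static struct pos pos_successor(struct pos p)
{
        if (++p.snapshot)
                return p;
        if (++p.offset)
                return p;
        ++p.inode;
        return p;
}

/* Predecessor: decrement with borrow, mirroring the above. */
static struct pos pos_predecessor(struct pos p)
{
        if (p.snapshot--)
                return p;
        if (p.offset--)
                return p;
        --p.inode;
        return p;
}

int main(void)
{
        struct pos a = { 42, UINT64_MAX, UINT32_MAX };
        struct pos s = pos_successor(a);
        struct pos b = { 1, 0, 0 };
        struct pos q = pos_predecessor(b);

        printf("successor of   (42, MAX, MAX) = (%llu, %llu, %u)\n",
               (unsigned long long) s.inode,
               (unsigned long long) s.offset, s.snapshot);
        printf("predecessor of (1, 0, 0)      = (%llu, %llu, %u)\n",
               (unsigned long long) q.inode,
               (unsigned long long) q.offset, q.snapshot);
        return 0;
}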
@@ -1520,9 +1593,13 @@ retry:
                next = m;
        }
 
+       BUG_ON(bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key));
+
        bch2_bkey_format_init(&new_s);
-       __bch2_btree_calc_format(&new_s, b);
-       __bch2_btree_calc_format(&new_s, m);
+       bch2_bkey_format_add_pos(&new_s, prev->data->min_key);
+       __bch2_btree_calc_format(&new_s, prev);
+       __bch2_btree_calc_format(&new_s, next);
+       bch2_bkey_format_add_pos(&new_s, next->data->max_key);
        new_f = bch2_bkey_format_done(&new_s);
 
        sib_u64s = btree_node_u64s_with_format(b, &new_f) +
@@ -1535,33 +1612,21 @@ retry:
        }
 
        sib_u64s = min(sib_u64s, btree_max_u64s(c));
+       sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1);
        b->sib_u64s[sib] = sib_u64s;
 
-       if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) {
-               six_unlock_intent(&m->c.lock);
+       if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
                goto out;
-       }
 
-       /* We're changing btree topology, doesn't mix with gc: */
-       if (!(flags & BTREE_INSERT_GC_LOCK_HELD) &&
-           !down_read_trylock(&c->gc_lock))
-               goto err_cycle_gc_lock;
-
-       if (!bch2_btree_iter_upgrade(iter, U8_MAX)) {
-               ret = -EINTR;
-               goto err_unlock;
-       }
-
-       as = bch2_btree_update_start(trans, iter->btree_id,
+       parent = btree_node_parent(iter, b);
+       as = bch2_btree_update_start(iter, level,
                         btree_update_reserve_required(c, parent) + 1,
                         flags|
                         BTREE_INSERT_NOFAIL|
-                        BTREE_INSERT_USE_RESERVE,
-                        !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
-       if (IS_ERR(as)) {
-               ret = PTR_ERR(as);
-               goto err_unlock;
-       }
+                        BTREE_INSERT_USE_RESERVE);
+       ret = PTR_ERR_OR_ZERO(as);
+       if (ret)
+               goto err;
 
        trace_btree_merge(c, b);
 
@@ -1595,6 +1660,7 @@ retry:
        bch2_btree_update_get_open_buckets(as, n);
 
        six_lock_increment(&b->c.lock, SIX_LOCK_intent);
+       six_lock_increment(&m->c.lock, SIX_LOCK_intent);
        bch2_btree_iter_node_drop(iter, b);
        bch2_btree_iter_node_drop(iter, m);
 
@@ -1608,11 +1674,9 @@ retry:
        six_unlock_intent(&n->c.lock);
 
        bch2_btree_update_done(as);
-
-       if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
-               up_read(&c->gc_lock);
 out:
        bch2_btree_trans_verify_locks(trans);
+       bch2_trans_iter_free(trans, sib_iter);
 
        /*
         * Don't downgrade locks here: we're called after successful insert,
@@ -1623,58 +1687,56 @@ out:
         * split path, and downgrading to read locks in there is potentially
         * confusing:
         */
-       closure_sync(&cl);
-       return;
-
-err_cycle_gc_lock:
-       six_unlock_intent(&m->c.lock);
-
-       if (flags & BTREE_INSERT_NOUNLOCK)
-               goto out;
-
-       bch2_trans_unlock(trans);
-
-       down_read(&c->gc_lock);
-       up_read(&c->gc_lock);
-       ret = -EINTR;
-       goto err;
-
-err_unlock:
-       six_unlock_intent(&m->c.lock);
-       if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
-               up_read(&c->gc_lock);
+       return ret ?: ret2;
 err:
-       BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK));
-
-       if ((ret == -EAGAIN || ret == -EINTR) &&
-           !(flags & BTREE_INSERT_NOUNLOCK)) {
-               bch2_trans_unlock(trans);
-               closure_sync(&cl);
-               ret = bch2_btree_iter_traverse(iter);
-               if (ret)
-                       goto out;
+       bch2_trans_iter_put(trans, sib_iter);
+       sib_iter = NULL;
 
+       if (ret == -EINTR && bch2_trans_relock(trans))
                goto retry;
+
+       if (ret == -EINTR && !(flags & BTREE_INSERT_NOUNLOCK)) {
+               ret2 = ret;
+               ret = bch2_btree_iter_traverse_all(trans);
+               if (!ret)
+                       goto retry;
        }
 
        goto out;
 }
 
-static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
-                               struct btree *b, unsigned flags,
-                               struct closure *cl)
+/**
+ * bch2_btree_node_rewrite - Rewrite/move a btree node
+ */
+int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
+                           __le64 seq, unsigned flags)
 {
-       struct btree *n, *parent = btree_node_parent(iter, b);
+       struct btree *b, *n, *parent;
        struct btree_update *as;
+       int ret;
 
-       as = bch2_btree_update_start(iter->trans, iter->btree_id,
+       flags |= BTREE_INSERT_NOFAIL;
+retry:
+       ret = bch2_btree_iter_traverse(iter);
+       if (ret)
+               goto out;
+
+       b = bch2_btree_iter_peek_node(iter);
+       if (!b || b->data->keys.seq != seq)
+               goto out;
+
+       parent = btree_node_parent(iter, b);
+       as = bch2_btree_update_start(iter, b->c.level,
                (parent
                 ? btree_update_reserve_required(c, parent)
                 : 0) + 1,
-               flags, cl);
-       if (IS_ERR(as)) {
+               flags);
+       ret = PTR_ERR_OR_ZERO(as);
+       if (ret == -EINTR)
+               goto retry;
+       if (ret) {
                trace_btree_gc_rewrite_node_fail(c, b);
-               return PTR_ERR(as);
+               goto out;
        }
 
        bch2_btree_interior_update_will_free_node(as, b);
@@ -1705,60 +1767,8 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
        six_unlock_intent(&n->c.lock);
 
        bch2_btree_update_done(as);
-       return 0;
-}
-
-/**
- * bch_btree_node_rewrite - Rewrite/move a btree node
- *
- * Returns 0 on success, -EINTR or -EAGAIN on failure (i.e.
- * btree_check_reserve() has to wait)
- */
-int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
-                           __le64 seq, unsigned flags)
-{
-       struct btree_trans *trans = iter->trans;
-       struct closure cl;
-       struct btree *b;
-       int ret;
-
-       flags |= BTREE_INSERT_NOFAIL;
-
-       closure_init_stack(&cl);
-
-       bch2_btree_iter_upgrade(iter, U8_MAX);
-
-       if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) {
-               if (!down_read_trylock(&c->gc_lock)) {
-                       bch2_trans_unlock(trans);
-                       down_read(&c->gc_lock);
-               }
-       }
-
-       while (1) {
-               ret = bch2_btree_iter_traverse(iter);
-               if (ret)
-                       break;
-
-               b = bch2_btree_iter_peek_node(iter);
-               if (!b || b->data->keys.seq != seq)
-                       break;
-
-               ret = __btree_node_rewrite(c, iter, b, flags, &cl);
-               if (ret != -EAGAIN &&
-                   ret != -EINTR)
-                       break;
-
-               bch2_trans_unlock(trans);
-               closure_sync(&cl);
-       }
-
+out:
        bch2_btree_iter_downgrade(iter);
-
-       if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
-               up_read(&c->gc_lock);
-
-       closure_sync(&cl);
        return ret;
 }
 
@@ -1829,74 +1839,34 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
        struct btree_update *as = NULL;
        struct btree *new_hash = NULL;
        struct closure cl;
-       int ret;
+       int ret = 0;
 
        closure_init_stack(&cl);
 
-       if (!bch2_btree_iter_upgrade(iter, U8_MAX))
-               return -EINTR;
-
-       if (!down_read_trylock(&c->gc_lock)) {
-               bch2_trans_unlock(iter->trans);
-               down_read(&c->gc_lock);
-
-               if (!bch2_trans_relock(iter->trans)) {
-                       ret = -EINTR;
-                       goto err;
-               }
-       }
-
        /*
         * check btree_ptr_hash_val() after @b is locked by
         * btree_iter_traverse():
         */
        if (btree_ptr_hash_val(new_key) != b->hash_val) {
-               /* bch2_btree_reserve_get will unlock */
                ret = bch2_btree_cache_cannibalize_lock(c, &cl);
                if (ret) {
                        bch2_trans_unlock(iter->trans);
-                       up_read(&c->gc_lock);
                        closure_sync(&cl);
-                       down_read(&c->gc_lock);
-
-                       if (!bch2_trans_relock(iter->trans)) {
-                               ret = -EINTR;
-                               goto err;
-                       }
+                       if (!bch2_trans_relock(iter->trans))
+                               return -EINTR;
                }
 
                new_hash = bch2_btree_node_mem_alloc(c);
        }
-retry:
-       as = bch2_btree_update_start(iter->trans, iter->btree_id,
-               parent ? btree_update_reserve_required(c, parent) : 0,
-               BTREE_INSERT_NOFAIL|
-               BTREE_INSERT_USE_RESERVE|
-               BTREE_INSERT_USE_ALLOC_RESERVE,
-               &cl);
 
+       as = bch2_btree_update_start(iter, b->c.level,
+               parent ? btree_update_reserve_required(c, parent) : 0,
+               BTREE_INSERT_NOFAIL);
        if (IS_ERR(as)) {
                ret = PTR_ERR(as);
-               if (ret == -EAGAIN)
-                       ret = -EINTR;
-
-               if (ret == -EINTR) {
-                       bch2_trans_unlock(iter->trans);
-                       up_read(&c->gc_lock);
-                       closure_sync(&cl);
-                       down_read(&c->gc_lock);
-
-                       if (bch2_trans_relock(iter->trans))
-                               goto retry;
-               }
-
                goto err;
        }
 
-       ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(new_key));
-       if (ret)
-               goto err_free_update;
-
        __bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key);
 
        bch2_btree_iter_downgrade(iter);
@@ -1909,12 +1879,9 @@ err:
                six_unlock_write(&new_hash->c.lock);
                six_unlock_intent(&new_hash->c.lock);
        }
-       up_read(&c->gc_lock);
        closure_sync(&cl);
+       bch2_btree_cache_cannibalize_unlock(c);
        return ret;
-err_free_update:
-       bch2_btree_update_free(as);
-       goto err;
 }
 
 /* Init code: */
index 7668225e72c66b386aabdb8ab6778497094c27c7..f2925b0d7f17eacfd7773cc4a7e57b3180cc4024 100644 (file)
@@ -47,8 +47,8 @@ struct btree_update {
                BTREE_INTERIOR_UPDATING_AS,
        } mode;
 
-       unsigned                        must_rewrite:1;
        unsigned                        nodes_written:1;
+       unsigned                        took_gc_lock:1;
 
        enum btree_id                   btree_id;
 
@@ -121,8 +121,7 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
 
 void bch2_btree_update_done(struct btree_update *);
 struct btree_update *
-bch2_btree_update_start(struct btree_trans *, enum btree_id, unsigned,
-                       unsigned, struct closure *);
+bch2_btree_update_start(struct btree_iter *, unsigned, unsigned, unsigned);
 
 void bch2_btree_interior_update_will_free_node(struct btree_update *,
                                               struct btree *);
@@ -133,10 +132,10 @@ void bch2_btree_insert_node(struct btree_update *, struct btree *,
                            unsigned);
 int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned);
 
-void __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *,
-                                  unsigned, unsigned, enum btree_node_sibling);
+int __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *,
+                                 unsigned, unsigned, enum btree_node_sibling);
 
-static inline void bch2_foreground_maybe_merge_sibling(struct bch_fs *c,
+static inline int bch2_foreground_maybe_merge_sibling(struct bch_fs *c,
                                        struct btree_iter *iter,
                                        unsigned level, unsigned flags,
                                        enum btree_node_sibling sib)
@@ -144,27 +143,27 @@ static inline void bch2_foreground_maybe_merge_sibling(struct bch_fs *c,
        struct btree *b;
 
        if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE)
-               return;
+               return 0;
 
        if (!bch2_btree_node_relock(iter, level))
-               return;
+               return 0;
 
        b = iter->l[level].b;
        if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
-               return;
+               return 0;
 
-       __bch2_foreground_maybe_merge(c, iter, level, flags, sib);
+       return __bch2_foreground_maybe_merge(c, iter, level, flags, sib);
 }
 
-static inline void bch2_foreground_maybe_merge(struct bch_fs *c,
+static inline int bch2_foreground_maybe_merge(struct bch_fs *c,
                                               struct btree_iter *iter,
                                               unsigned level,
                                               unsigned flags)
 {
-       bch2_foreground_maybe_merge_sibling(c, iter, level, flags,
-                                           btree_prev_sib);
-       bch2_foreground_maybe_merge_sibling(c, iter, level, flags,
-                                           btree_next_sib);
+       return  bch2_foreground_maybe_merge_sibling(c, iter, level, flags,
+                                                   btree_prev_sib) ?:
+               bch2_foreground_maybe_merge_sibling(c, iter, level, flags,
+                                                   btree_next_sib);
 }
 
 void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);
@@ -237,6 +236,9 @@ static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
                b->whiteout_u64s;
        ssize_t total = c->opts.btree_node_size << 6;
 
+       /* Always leave one extra u64 for bch2_varint_decode: */
+       used++;
+
        return total - used;
 }
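
The new `used++` reserves one extra u64 of headroom and the comment credits bch2_varint_decode() for it. One common reason a varint decoder needs such slack is that, for speed, it loads a full 8-byte word at the current position and masks off the bits it wants, so the last key must not sit flush against the end of the buffer. The toy format below (little-endian only, values up to 2^61) is an invented illustration of that style, not the bcachefs encoding.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * Toy varint: the low 3 bits of the first byte give the number of
 * extra bytes (0-7); the value occupies the remaining bits.  Both
 * sides work on whole 8-byte words, which is exactly why callers must
 * leave slack after the last value -- analogous to the extra u64
 * reserved above.
 */
static unsigned toy_varint_encode(uint8_t *out, uint64_t v)
{
        unsigned extra = 0;
        uint64_t word;

        while (extra < 7 && (v >> (5 + 8 * extra)))
                extra++;

        word = (v << 3) | extra;
        memcpy(out, &word, 8);          /* writes past the used bytes */
        return extra + 1;
}

static unsigned toy_varint_decode(const uint8_t *in, uint64_t *v)
{
        uint64_t word;
        unsigned extra;

        memcpy(&word, in, 8);           /* full-word load: needs slack */
        extra = word & 7;
        *v = (word >> 3) & (~0ULL >> (59 - 8 * extra));
        return extra + 1;
}

int main(void)
{
        uint8_t buf[16] = { 0 };        /* 8+ bytes of slack after the value */
        uint64_t v = 123456789, out;
        unsigned n = toy_varint_encode(buf, v);

        toy_varint_decode(buf, &out);
        printf("encoded %llu in %u byte(s), decoded %llu\n",
               (unsigned long long) v, n, (unsigned long long) out);
        return 0;
}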
 
index 49995cd00c16c26c1f4f77c9d9b5eb4da83d3634..e258cf893076853b2f800b51bb02e71c80583cc4 100644 (file)
 #include <linux/sort.h>
 #include <trace/events/bcachefs.h>
 
+static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
+                                        const struct btree_insert_entry *r)
+{
+       return   cmp_int(l->btree_id,   r->btree_id) ?:
+                -cmp_int(l->level,     r->level) ?:
+                bpos_cmp(l->k->k.p,    r->k->k.p);
+}
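
btree_insert_entry_cmp() above chains three comparisons with the GCC/Clang `?:` extension: the first nonzero result wins, so entries sort by btree_id, then by descending level, then by key position. A standalone version of the same idiom, with a local cmp_int() macro standing in for the kernel's:

#include <stdio.h>
#include <stdlib.h>

/* Like the kernel's cmp_int(): -1, 0 or 1, with no overflow risk. */
#define cmp_int(l, r)   (((l) > (r)) - ((l) < (r)))

struct entry {
        int btree_id;
        unsigned level;
        unsigned long long pos;
};

static int entry_cmp(const void *_l, const void *_r)
{
        const struct entry *l = _l, *r = _r;

        /* `a ?: b` evaluates to the first nonzero comparison result: */
        return   cmp_int(l->btree_id, r->btree_id) ?:
                -cmp_int(l->level,    r->level) ?:      /* higher levels first */
                 cmp_int(l->pos,      r->pos);
}

int main(void)
{
        struct entry e[] = {
                { 1, 0, 100 }, { 0, 0, 5 }, { 1, 2, 50 }, { 1, 0, 10 },
        };
        unsigned i;

        qsort(e, 4, sizeof(e[0]), entry_cmp);
        for (i = 0; i < 4; i++)
                printf("id=%d level=%u pos=%llu\n",
                       e[i].btree_id, e[i].level, e[i].pos);
        return 0;
}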
+
 static inline bool same_leaf_as_prev(struct btree_trans *trans,
                                     struct btree_insert_entry *i)
 {
@@ -62,27 +70,24 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
        EBUG_ON(btree_node_just_written(b));
        EBUG_ON(bset_written(b, btree_bset_last(b)));
        EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
-       EBUG_ON(bkey_cmp(b->data->min_key, POS_MIN) &&
-               bkey_cmp(bkey_start_pos(&insert->k),
-                        bkey_predecessor(b->data->min_key)) < 0);
-       EBUG_ON(bkey_cmp(insert->k.p, b->data->min_key) < 0);
-       EBUG_ON(bkey_cmp(insert->k.p, b->data->max_key) > 0);
+       EBUG_ON(bpos_cmp(insert->k.p, b->data->min_key) < 0);
+       EBUG_ON(bpos_cmp(insert->k.p, b->data->max_key) > 0);
        EBUG_ON(insert->k.u64s >
                bch_btree_keys_u64s_remaining(iter->trans->c, b));
        EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
 
        k = bch2_btree_node_iter_peek_all(node_iter, b);
-       if (k && bkey_cmp_packed(b, k, &insert->k))
+       if (k && bkey_cmp_left_packed(b, k, &insert->k.p))
                k = NULL;
 
        /* @k is the key being overwritten/deleted, if any: */
-       EBUG_ON(k && bkey_whiteout(k));
+       EBUG_ON(k && bkey_deleted(k));
 
        /* Deleting, but not found? nothing to do: */
-       if (bkey_whiteout(&insert->k) && !k)
+       if (bkey_deleted(&insert->k) && !k)
                return false;
 
-       if (bkey_whiteout(&insert->k)) {
+       if (bkey_deleted(&insert->k)) {
                /* Deleting: */
                btree_account_key_drop(b, k);
                k->type = KEY_TYPE_deleted;
@@ -129,7 +134,7 @@ fix_iter:
        return true;
 }
 
-static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
+static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
                               unsigned i, u64 seq)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
@@ -140,14 +145,15 @@ static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
        bch2_btree_node_write_cond(c, b,
                (btree_current_write(b) == w && w->journal.seq == seq));
        six_unlock_read(&b->c.lock);
+       return 0;
 }
 
-static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
+static int btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
 {
        return __btree_node_flush(j, pin, 0, seq);
 }
 
-static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
+static int btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
 {
        return __btree_node_flush(j, pin, 1, seq);
 }
@@ -191,7 +197,7 @@ static bool btree_insert_key_leaf(struct btree_trans *trans,
        bch2_btree_add_journal_pin(c, b, trans->journal_res.seq);
 
        if (unlikely(!btree_node_dirty(b)))
-               set_btree_node_dirty(b);
+               set_btree_node_dirty(c, b);
 
        live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
        u64s_added = (int) bset_u64s(t) - old_u64s;
@@ -214,15 +220,23 @@ static bool btree_insert_key_leaf(struct btree_trans *trans,
 /* Normal update interface: */
 
 static inline void btree_insert_entry_checks(struct btree_trans *trans,
-                                            struct btree_iter *iter,
-                                            struct bkey_i *insert)
+                                            struct btree_insert_entry *i)
 {
        struct bch_fs *c = trans->c;
 
-       BUG_ON(bkey_cmp(insert->k.p, iter->pos));
-       BUG_ON(debug_check_bkeys(c) &&
-              bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
-                                __btree_node_type(iter->level, iter->btree_id)));
+       if (bch2_debug_check_bkeys) {
+               const char *invalid = bch2_bkey_invalid(c,
+                               bkey_i_to_s_c(i->k), i->bkey_type);
+               if (invalid) {
+                       char buf[200];
+
+                       bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
+                       panic("invalid bkey %s on insert: %s\n", buf, invalid);
+               }
+       }
+       BUG_ON(!i->is_extent && bpos_cmp(i->k->k.p, i->iter->real_pos));
+       BUG_ON(i->level         != i->iter->level);
+       BUG_ON(i->btree_id      != i->iter->btree_id);
 }
 
 static noinline int
@@ -286,6 +300,11 @@ btree_key_can_insert_cached(struct btree_trans *trans,
 
        BUG_ON(iter->level);
 
+       if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
+           bch2_btree_key_cache_must_wait(trans->c) &&
+           !(trans->flags & BTREE_INSERT_JOURNAL_RECLAIM))
+               return BTREE_INSERT_NEED_JOURNAL_RECLAIM;
+
        if (u64s <= ck->u64s)
                return BTREE_INSERT_OK;
 
@@ -330,19 +349,6 @@ static inline void do_btree_insert_one(struct btree_trans *trans,
        }
 }
 
-static inline bool iter_has_trans_triggers(struct btree_iter *iter)
-{
-       return BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << iter->btree_id);
-}
-
-static inline bool iter_has_nontrans_triggers(struct btree_iter *iter)
-{
-       return (((BTREE_NODE_TYPE_HAS_TRIGGERS &
-                 ~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS)) |
-               (1U << BTREE_ID_EC)) &
-               (1U << iter->btree_id);
-}
-
 static noinline void bch2_btree_iter_unlock_noinline(struct btree_iter *iter)
 {
        __bch2_btree_iter_unlock(iter);
@@ -370,8 +376,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
                               struct btree_insert_entry **stopped_at)
 {
        struct bch_fs *c = trans->c;
-       struct bch_fs_usage *fs_usage = NULL;
        struct btree_insert_entry *i;
+       struct btree_trans_commit_hook *h;
        unsigned u64s = 0;
        bool marking = false;
        int ret;
@@ -389,6 +395,14 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
 
        prefetch(&trans->c->journal.flags);
 
+       h = trans->hooks;
+       while (h) {
+               ret = h->fn(trans, h);
+               if (ret)
+                       return ret;
+               h = h->next;
+       }
+
        trans_for_each_update2(trans, i) {
                /* Multiple inserts might go to same leaf: */
                if (!same_leaf_as_prev(trans, i))
@@ -403,13 +417,19 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
                        return ret;
                }
 
-               if (btree_node_type_needs_gc(i->iter->btree_id))
+               if (btree_node_type_needs_gc(i->bkey_type))
                        marking = true;
        }
 
        if (marking) {
                percpu_down_read(&c->mark_lock);
-               fs_usage = bch2_fs_usage_scratch_get(c);
+       }
+
+       /* Must be called under mark_lock: */
+       if (marking && trans->fs_usage_deltas &&
+           !bch2_replicas_delta_list_marked(c, trans->fs_usage_deltas)) {
+               ret = BTREE_INSERT_NEED_MARK_REPLICAS;
+               goto err;
        }
 
        /*
@@ -440,29 +460,21 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
         */
 
        if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) {
-               if (journal_seq_verify(c))
+               if (bch2_journal_seq_verify)
                        trans_for_each_update2(trans, i)
                                i->k->k.version.lo = trans->journal_res.seq;
-               else if (inject_invalid_keys(c))
+               else if (bch2_inject_invalid_keys)
                        trans_for_each_update2(trans, i)
                                i->k->k.version = MAX_VERSION;
        }
 
-       /* Must be called under mark_lock: */
-       if (marking && trans->fs_usage_deltas &&
-           bch2_replicas_delta_list_apply(c, fs_usage,
-                                          trans->fs_usage_deltas)) {
-               ret = BTREE_INSERT_NEED_MARK_REPLICAS;
-               goto err;
-       }
-
        trans_for_each_update(trans, i)
-               if (iter_has_nontrans_triggers(i->iter))
+               if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type))
                        bch2_mark_update(trans, i->iter, i->k,
-                                        fs_usage, i->trigger_flags);
+                                        NULL, i->trigger_flags);
 
-       if (marking)
-               bch2_trans_fs_usage_apply(trans, fs_usage);
+       if (marking && trans->fs_usage_deltas)
+               bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas);
 
        if (unlikely(c->gc_pos.phase))
                bch2_trans_mark_gc(trans);
@@ -471,31 +483,85 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
                do_btree_insert_one(trans, i->iter, i->k);
 err:
        if (marking) {
-               bch2_fs_usage_scratch_put(c, fs_usage);
                percpu_up_read(&c->mark_lock);
        }
 
        return ret;
 }
 
+static noinline int maybe_do_btree_merge(struct btree_trans *trans, struct btree_iter *iter)
+{
+       struct btree_insert_entry *i;
+       struct btree *b = iter_l(iter)->b;
+       struct bkey_s_c old;
+       int u64s_delta = 0;
+       int ret;
+
+       /*
+        * Inserting directly into interior nodes is an uncommon operation with
+        * various weird edge cases: also, a lot of things about
+        * BTREE_ITER_NODES iters need to be audited
+        */
+       if (unlikely(btree_iter_type(iter) != BTREE_ITER_KEYS))
+               return 0;
+
+       BUG_ON(iter->level);
+
+       trans_for_each_update2(trans, i) {
+               if (iter_l(i->iter)->b != b)
+                       continue;
+
+               old = bch2_btree_iter_peek_slot(i->iter);
+               ret = bkey_err(old);
+               if (ret)
+                       return ret;
+
+               u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0;
+               u64s_delta -= !bkey_deleted(old.k) ? old.k->u64s : 0;
+       }
+
+       return u64s_delta <= 0
+               ? (bch2_foreground_maybe_merge(trans->c, iter, iter->level,
+                               trans->flags & ~BTREE_INSERT_NOUNLOCK) ?: -EINTR)
+               : 0;
+}
+
 /*
  * Get journal reservation, take write locks, and attempt to do btree update(s):
  */
 static inline int do_bch2_trans_commit(struct btree_trans *trans,
                                       struct btree_insert_entry **stopped_at)
 {
+       struct bch_fs *c = trans->c;
        struct btree_insert_entry *i;
        struct btree_iter *iter;
        int ret;
 
+       trans_for_each_update2(trans, i) {
+               struct btree *b;
+
+               BUG_ON(!btree_node_intent_locked(i->iter, i->level));
+
+               if (btree_iter_type(i->iter) == BTREE_ITER_CACHED)
+                       continue;
+
+               b = iter_l(i->iter)->b;
+               if (b->sib_u64s[0] < c->btree_foreground_merge_threshold ||
+                   b->sib_u64s[1] < c->btree_foreground_merge_threshold) {
+                       ret = maybe_do_btree_merge(trans, i->iter);
+                       if (unlikely(ret))
+                               return ret;
+               }
+       }
+
        trans_for_each_update2(trans, i)
-               BUG_ON(!btree_node_intent_locked(i->iter, i->iter->level));
+               BUG_ON(!btree_node_intent_locked(i->iter, i->level));
 
-       ret = bch2_journal_preres_get(&trans->c->journal,
+       ret = bch2_journal_preres_get(&c->journal,
                        &trans->journal_preres, trans->journal_preres_u64s,
                        JOURNAL_RES_GET_NONBLOCK|
-                       ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM)
-                        ? JOURNAL_RES_GET_RECLAIM : 0));
+                       ((trans->flags & BTREE_INSERT_JOURNAL_RESERVED)
+                        ? JOURNAL_RES_GET_RESERVED : 0));
        if (unlikely(ret == -EAGAIN))
                ret = bch2_trans_journal_preres_get_cold(trans,
                                                trans->journal_preres_u64s);
@@ -504,6 +570,10 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
 
        /*
         * Can't be holding any read locks when we go to take write locks:
+        * another thread could be holding an intent lock on the same node we
+        * have a read lock on, and it'll block trying to take a write lock
+        * (because we hold a read lock) and it could be blocking us by holding
+        * its own read lock (while we're trying to take write locks).
         *
         * note - this must be done after bch2_trans_journal_preres_get_cold()
         * or anything else that might call bch2_trans_relock(), since that
@@ -511,20 +581,25 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
         */
        trans_for_each_iter(trans, iter) {
                if (iter->nodes_locked != iter->nodes_intent_locked) {
-                       EBUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT);
-                       EBUG_ON(trans->iters_live & (1ULL << iter->idx));
-                       bch2_btree_iter_unlock_noinline(iter);
+                       if (btree_iter_keep(trans, iter)) {
+                               if (!bch2_btree_iter_upgrade(iter, 1)) {
+                                       trace_trans_restart_upgrade(trans->ip);
+                                       return -EINTR;
+                               }
+                       } else {
+                               bch2_btree_iter_unlock_noinline(iter);
+                       }
                }
        }
 
        if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
                trans_for_each_update2(trans, i)
-                       btree_insert_entry_checks(trans, i->iter, i->k);
+                       btree_insert_entry_checks(trans, i);
        bch2_btree_trans_verify_locks(trans);
 
        trans_for_each_update2(trans, i)
                if (!same_leaf_as_prev(trans, i))
-                       bch2_btree_node_lock_for_insert(trans->c,
+                       bch2_btree_node_lock_for_insert(c,
                                        iter_l(i->iter)->b, i->iter);
 
        ret = bch2_trans_commit_write_locked(trans, stopped_at);
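
The expanded comment in do_bch2_trans_commit() describes the classic read-held/write-wanted cycle: two threads each hold a read lock that the other needs to write-lock, so neither can proceed. The pthreads demo below reproduces that shape with two plain rwlocks and generic names (the real code uses six locks with intent states, which are more involved); trywrlock() is used so the program terminates instead of actually deadlocking. Build with -pthread.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_rwlock_t node_a = PTHREAD_RWLOCK_INITIALIZER;
static pthread_rwlock_t node_b = PTHREAD_RWLOCK_INITIALIZER;

/*
 * Each thread holds a read lock on one node and then wants a write
 * lock on the other -- the situation the comment above warns about.
 * With blocking wrlock() calls, both threads would deadlock.
 */
static void *thread_fn(void *arg)
{
        int id = *(int *) arg;
        pthread_rwlock_t *held = id ? &node_b : &node_a;
        pthread_rwlock_t *want = id ? &node_a : &node_b;

        pthread_rwlock_rdlock(held);
        sleep(1);       /* give the other thread time to take its read lock */

        if (pthread_rwlock_trywrlock(want))
                printf("thread %d: write lock unavailable, would block forever\n", id);
        else
                pthread_rwlock_unlock(want);

        pthread_rwlock_unlock(held);
        return NULL;
}

int main(void)
{
        pthread_t t[2];
        int ids[2] = { 0, 1 };

        pthread_create(&t[0], NULL, thread_fn, &ids[0]);
        pthread_create(&t[1], NULL, thread_fn, &ids[1]);
        pthread_join(t[0], NULL);
        pthread_join(t[1], NULL);
        return 0;
}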
@@ -535,32 +610,43 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
                                                             i->iter);
 
        if (!ret && trans->journal_pin)
-               bch2_journal_pin_add(&trans->c->journal, trans->journal_res.seq,
+               bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
                                     trans->journal_pin, NULL);
 
        /*
         * Drop journal reservation after dropping write locks, since dropping
         * the journal reservation may kick off a journal write:
         */
-       bch2_journal_res_put(&trans->c->journal, &trans->journal_res);
+       bch2_journal_res_put(&c->journal, &trans->journal_res);
 
        if (unlikely(ret))
                return ret;
 
-       if (trans->flags & BTREE_INSERT_NOUNLOCK)
-               trans->nounlock = true;
+       bch2_trans_downgrade(trans);
 
-       trans_for_each_update2(trans, i)
-               if (btree_iter_type(i->iter) != BTREE_ITER_CACHED &&
-                   !same_leaf_as_prev(trans, i))
-                       bch2_foreground_maybe_merge(trans->c, i->iter,
-                                                   0, trans->flags);
+       return 0;
+}
 
-       trans->nounlock = false;
+static int journal_reclaim_wait_done(struct bch_fs *c)
+{
+       int ret;
 
-       bch2_trans_downgrade(trans);
+       ret = bch2_journal_error(&c->journal);
+       if (ret)
+               return ret;
 
-       return 0;
+       ret = !bch2_btree_key_cache_must_wait(c);
+       if (ret)
+               return ret;
+
+       if (mutex_trylock(&c->journal.reclaim_lock)) {
+               ret = bch2_journal_reclaim(&c->journal);
+               mutex_unlock(&c->journal.reclaim_lock);
+       }
+
+       if (!ret)
+               ret = !bch2_btree_key_cache_must_wait(c);
+       return ret;
 }
 
 static noinline
@@ -617,11 +703,9 @@ int bch2_trans_commit_error(struct btree_trans *trans,
        case BTREE_INSERT_NEED_MARK_REPLICAS:
                bch2_trans_unlock(trans);
 
-               trans_for_each_update(trans, i) {
-                       ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(i->k));
-                       if (ret)
-                               return ret;
-               }
+               ret = bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas);
+               if (ret)
+                       return ret;
 
                if (bch2_trans_relock(trans))
                        return 0;
@@ -632,6 +716,10 @@ int bch2_trans_commit_error(struct btree_trans *trans,
        case BTREE_INSERT_NEED_JOURNAL_RES:
                bch2_trans_unlock(trans);
 
+               if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
+                   !(trans->flags & BTREE_INSERT_JOURNAL_RESERVED))
+                       return -EAGAIN;
+
                ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_CHECK);
                if (ret)
                        return ret;
@@ -642,20 +730,21 @@ int bch2_trans_commit_error(struct btree_trans *trans,
                trace_trans_restart_journal_res_get(trans->ip);
                ret = -EINTR;
                break;
-       default:
-               BUG_ON(ret >= 0);
-               break;
-       }
+       case BTREE_INSERT_NEED_JOURNAL_RECLAIM:
+               bch2_trans_unlock(trans);
 
-       if (ret == -EINTR) {
-               int ret2 = bch2_btree_iter_traverse_all(trans);
+               wait_event(c->journal.reclaim_wait,
+                          (ret = journal_reclaim_wait_done(c)));
 
-               if (ret2) {
-                       trace_trans_restart_traverse(trans->ip);
-                       return ret2;
-               }
+               if (!ret && bch2_trans_relock(trans))
+                       return 0;
 
-               trace_trans_restart_atomic(trans->ip);
+               trace_trans_restart_journal_reclaim(trans->ip);
+               ret = -EINTR;
+               break;
+       default:
+               BUG_ON(ret >= 0);
+               break;
        }
 
        return ret;
@@ -680,137 +769,134 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans)
        return 0;
 }
 
-static void bch2_trans_update2(struct btree_trans *trans,
-                              struct btree_iter *iter,
-                              struct bkey_i *insert)
+static void __bch2_trans_update2(struct btree_trans *trans,
+                                struct btree_insert_entry n)
 {
-       struct btree_insert_entry *i, n = (struct btree_insert_entry) {
-               .iter = iter, .k = insert
-       };
-
-       btree_insert_entry_checks(trans, n.iter, n.k);
-
-       BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
+       struct btree_insert_entry *i;
 
-       EBUG_ON(trans->nr_updates2 >= trans->nr_iters);
+       btree_insert_entry_checks(trans, &n);
 
-       iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
+       EBUG_ON(trans->nr_updates2 >= BTREE_ITER_MAX);
 
-       trans_for_each_update2(trans, i) {
-               if (btree_iter_cmp(n.iter, i->iter) == 0) {
-                       *i = n;
-                       return;
-               }
+       n.iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
 
-               if (btree_iter_cmp(n.iter, i->iter) <= 0)
+       trans_for_each_update2(trans, i)
+               if (btree_insert_entry_cmp(&n, i) <= 0)
                        break;
-       }
 
-       array_insert_item(trans->updates2, trans->nr_updates2,
-                         i - trans->updates2, n);
+       if (i < trans->updates2 + trans->nr_updates2 &&
+           !btree_insert_entry_cmp(&n, i))
+               *i = n;
+       else
+               array_insert_item(trans->updates2, trans->nr_updates2,
+                                 i - trans->updates2, n);
+}
+
+static void bch2_trans_update2(struct btree_trans *trans,
+                              struct btree_iter *iter,
+                              struct bkey_i *insert)
+{
+       __bch2_trans_update2(trans, (struct btree_insert_entry) {
+               .bkey_type      = __btree_node_type(iter->level, iter->btree_id),
+               .btree_id       = iter->btree_id,
+               .level          = iter->level,
+               .iter           = iter,
+               .k              = insert,
+       });
 }
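
__bch2_trans_update2() keeps trans->updates2 sorted by btree_insert_entry_cmp() and either overwrites an entry that compares equal or shifts the tail up to make room. A small self-contained version of that insert-or-replace-into-a-sorted-array pattern (array, struct and limits invented for the example):

#include <assert.h>
#include <stdio.h>
#include <string.h>

#define MAX_ENTRIES 8

struct entry {
        int key;
        int val;
};

static struct entry entries[MAX_ENTRIES];
static unsigned nr;

static void insert_or_replace(struct entry n)
{
        unsigned i;

        /* Find the first slot whose key is >= the new key: */
        for (i = 0; i < nr; i++)
                if (entries[i].key >= n.key)
                        break;

        if (i < nr && entries[i].key == n.key) {
                entries[i] = n;         /* equal keys: replace in place */
                return;
        }

        /* Otherwise shift the tail up and insert, keeping sorted order: */
        assert(nr < MAX_ENTRIES);
        memmove(&entries[i + 1], &entries[i], (nr - i) * sizeof(n));
        entries[i] = n;
        nr++;
}

int main(void)
{
        unsigned i;

        insert_or_replace((struct entry) { 3, 30 });
        insert_or_replace((struct entry) { 1, 10 });
        insert_or_replace((struct entry) { 3, 31 });    /* replaces key 3 */
        insert_or_replace((struct entry) { 2, 20 });

        for (i = 0; i < nr; i++)
                printf("%d -> %d\n", entries[i].key, entries[i].val);
        return 0;
}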
 
 static int extent_update_to_keys(struct btree_trans *trans,
-                                struct btree_iter *orig_iter,
-                                struct bkey_i *insert)
+                                struct btree_insert_entry n)
 {
-       struct btree_iter *iter;
        int ret;
 
-       ret = bch2_extent_can_insert(trans, orig_iter, insert);
+       if (bkey_deleted(&n.k->k))
+               return 0;
+
+       ret = bch2_extent_can_insert(trans, n.iter, n.k);
        if (ret)
                return ret;
 
-       if (bkey_deleted(&insert->k))
-               return 0;
-
-       iter = bch2_trans_copy_iter(trans, orig_iter);
-       if (IS_ERR(iter))
-               return PTR_ERR(iter);
+       n.iter = bch2_trans_get_iter(trans, n.iter->btree_id, n.k->k.p,
+                                    BTREE_ITER_INTENT|
+                                    BTREE_ITER_NOT_EXTENTS);
+       n.is_extent = false;
 
-       iter->flags |= BTREE_ITER_INTENT;
-       __bch2_btree_iter_set_pos(iter, insert->k.p, false);
-       bch2_trans_update2(trans, iter, insert);
-       bch2_trans_iter_put(trans, iter);
+       __bch2_trans_update2(trans, n);
+       bch2_trans_iter_put(trans, n.iter);
        return 0;
 }
 
 static int extent_handle_overwrites(struct btree_trans *trans,
                                    enum btree_id btree_id,
-                                   struct bpos start, struct bpos end)
+                                   struct bkey_i *insert)
 {
-       struct btree_iter *iter = NULL, *update_iter;
+       struct btree_iter *iter, *update_iter;
+       struct bpos start = bkey_start_pos(&insert->k);
        struct bkey_i *update;
        struct bkey_s_c k;
        int ret = 0;
 
-       iter = bch2_trans_get_iter(trans, btree_id, start, BTREE_ITER_INTENT);
-       ret = PTR_ERR_OR_ZERO(iter);
-       if (ret)
-               return ret;
-
+       iter = bch2_trans_get_iter(trans, btree_id, start,
+                                  BTREE_ITER_INTENT);
        k = bch2_btree_iter_peek_with_updates(iter);
 
        while (k.k && !(ret = bkey_err(k))) {
-               if (bkey_cmp(end, bkey_start_pos(k.k)) <= 0)
+               if (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0)
                        break;
 
                if (bkey_cmp(bkey_start_pos(k.k), start) < 0) {
-                       update_iter = bch2_trans_copy_iter(trans, iter);
-                       if ((ret = PTR_ERR_OR_ZERO(update_iter)))
-                               goto err;
-
                        update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
                        if ((ret = PTR_ERR_OR_ZERO(update)))
-                               goto err;
+                               break;
 
                        bkey_reassemble(update, k);
+
                        bch2_cut_back(start, update);
 
-                       __bch2_btree_iter_set_pos(update_iter, update->k.p, false);
+                       update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p,
+                                                         BTREE_ITER_NOT_EXTENTS|
+                                                         BTREE_ITER_INTENT);
                        bch2_trans_update2(trans, update_iter, update);
                        bch2_trans_iter_put(trans, update_iter);
                }
 
-               if (bkey_cmp(k.k->p, end) > 0) {
-                       update_iter = bch2_trans_copy_iter(trans, iter);
-                       if ((ret = PTR_ERR_OR_ZERO(update_iter)))
-                               goto err;
-
-                       update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+               if (bkey_cmp(k.k->p, insert->k.p) < 0 ||
+                   (!bkey_cmp(k.k->p, insert->k.p) && bkey_deleted(&insert->k))) {
+                       update = bch2_trans_kmalloc(trans, sizeof(struct bkey));
                        if ((ret = PTR_ERR_OR_ZERO(update)))
-                               goto err;
+                               break;
 
-                       bkey_reassemble(update, k);
-                       bch2_cut_front(end, update);
+                       bkey_init(&update->k);
+                       update->k.p = k.k->p;
 
-                       __bch2_btree_iter_set_pos(update_iter, update->k.p, false);
+                       update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p,
+                                                         BTREE_ITER_NOT_EXTENTS|
+                                                         BTREE_ITER_INTENT);
                        bch2_trans_update2(trans, update_iter, update);
                        bch2_trans_iter_put(trans, update_iter);
-               } else {
-                       update_iter = bch2_trans_copy_iter(trans, iter);
-                       if ((ret = PTR_ERR_OR_ZERO(update_iter)))
-                               goto err;
+               }
 
-                       update = bch2_trans_kmalloc(trans, sizeof(struct bkey));
+               if (bkey_cmp(k.k->p, insert->k.p) > 0) {
+                       update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
                        if ((ret = PTR_ERR_OR_ZERO(update)))
-                               goto err;
+                               break;
 
-                       update->k = *k.k;
-                       set_bkey_val_u64s(&update->k, 0);
-                       update->k.type = KEY_TYPE_deleted;
-                       update->k.size = 0;
+                       bkey_reassemble(update, k);
+                       bch2_cut_front(insert->k.p, update);
 
-                       __bch2_btree_iter_set_pos(update_iter, update->k.p, false);
+                       update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p,
+                                                         BTREE_ITER_NOT_EXTENTS|
+                                                         BTREE_ITER_INTENT);
                        bch2_trans_update2(trans, update_iter, update);
                        bch2_trans_iter_put(trans, update_iter);
+                       break;
                }
 
                k = bch2_btree_iter_next_with_updates(iter);
        }
-err:
-       if (!IS_ERR_OR_NULL(iter))
-               bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_put(trans, iter);
+
        return ret;
 }
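
The loop above trims whatever the new extent overwrites: an existing key that sticks out in front of the insert is cut back to the insert's start, a key wholly covered by the insert is replaced with an empty (deleted) key at the same position, and a key that sticks out past the insert has its front cut off. A small standalone sketch of that trimming on plain half-open ranges, with illustrative names only (not the bcachefs helpers):

#include <stdio.h>

/* A half-open extent [start, end), loosely standing in for a bkey extent. */
struct extent { unsigned long long start, end; };

/*
 * Split an existing extent around an insert covering [ins_start, ins_end):
 * writes the surviving pieces to out[] and returns how many there are.
 * Mirrors the cut_back / whiteout / cut_front cases above, on plain ranges.
 */
static int trim_around_insert(struct extent old,
                              unsigned long long ins_start,
                              unsigned long long ins_end,
                              struct extent out[2])
{
        int nr = 0;

        if (old.start < ins_start)              /* front piece survives */
                out[nr++] = (struct extent) { old.start, ins_start };
        if (old.end > ins_end)                  /* back piece survives */
                out[nr++] = (struct extent) { ins_end, old.end };
        return nr;                              /* 0 pieces == fully covered */
}

int main(void)
{
        struct extent out[2];
        int nr = trim_around_insert((struct extent) { 0, 100 }, 20, 60, out);

        for (int i = 0; i < nr; i++)
                printf("surviving piece: [%llu, %llu)\n", out[i].start, out[i].end);
        return 0;
}

In the real function the fully covered case still inserts the empty key through bch2_trans_update2, since with BTREE_ITER_NOT_EXTENTS the old key only goes away when overwritten by a deletion.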
 
@@ -819,13 +905,11 @@ int __bch2_trans_commit(struct btree_trans *trans)
        struct btree_insert_entry *i = NULL;
        struct btree_iter *iter;
        bool trans_trigger_run;
-       unsigned u64s;
+       unsigned u64s, reset_flags = 0;
        int ret = 0;
 
-       BUG_ON(trans->need_reset);
-
        if (!trans->nr_updates)
-               goto out_noupdates;
+               goto out_reset;
 
        if (trans->flags & BTREE_INSERT_GC_LOCK_HELD)
                lockdep_assert_held(&trans->c->gc_lock);
@@ -839,7 +923,7 @@ int __bch2_trans_commit(struct btree_trans *trans)
            unlikely(!percpu_ref_tryget(&trans->c->writes))) {
                ret = bch2_trans_commit_get_rw_cold(trans);
                if (ret)
-                       return ret;
+                       goto out_reset;
        }
 
 #ifdef CONFIG_BCACHEFS_DEBUG
@@ -847,7 +931,7 @@ int __bch2_trans_commit(struct btree_trans *trans)
                if (btree_iter_type(i->iter) != BTREE_ITER_CACHED &&
                    !(i->trigger_flags & BTREE_TRIGGER_NORUN))
                        bch2_btree_key_cache_verify_clean(trans,
-                                       i->iter->btree_id, i->iter->pos);
+                                       i->btree_id, i->k->k.p);
 #endif
 
        /*
@@ -858,24 +942,7 @@ int __bch2_trans_commit(struct btree_trans *trans)
                trans_trigger_run = false;
 
                trans_for_each_update(trans, i) {
-                       if (unlikely(i->iter->uptodate > BTREE_ITER_NEED_PEEK &&
-                                    (ret = bch2_btree_iter_traverse(i->iter)))) {
-                               trace_trans_restart_traverse(trans->ip);
-                               goto out;
-                       }
-
-                       /*
-                        * We're not using bch2_btree_iter_upgrade here because
-                        * we know trans->nounlock can't be set:
-                        */
-                       if (unlikely(i->iter->locks_want < 1 &&
-                                    !__bch2_btree_iter_upgrade(i->iter, 1))) {
-                               trace_trans_restart_upgrade(trans->ip);
-                               ret = -EINTR;
-                               goto out;
-                       }
-
-                       if (iter_has_trans_triggers(i->iter) &&
+                       if ((BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) &&
                            !i->trans_triggers_run) {
                                i->trans_triggers_run = true;
                                trans_trigger_run = true;
@@ -893,33 +960,34 @@ int __bch2_trans_commit(struct btree_trans *trans)
 
        /* Turn extents updates into keys: */
        trans_for_each_update(trans, i)
-               if (i->iter->flags & BTREE_ITER_IS_EXTENTS) {
-                       struct bpos start = bkey_start_pos(&i->k->k);
-
-                       while (i + 1 < trans->updates + trans->nr_updates &&
-                              i[0].iter->btree_id == i[1].iter->btree_id &&
-                              !bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k)))
-                               i++;
-
-                       ret = extent_handle_overwrites(trans, i->iter->btree_id,
-                                                      start, i->k->k.p);
-                       if (ret)
+               if (i->is_extent) {
+                       ret = extent_handle_overwrites(trans, i->btree_id, i->k);
+                       if (unlikely(ret))
                                goto out;
                }
 
        trans_for_each_update(trans, i) {
-               if (i->iter->flags & BTREE_ITER_IS_EXTENTS) {
-                       ret = extent_update_to_keys(trans, i->iter, i->k);
-                       if (ret)
-                               goto out;
-               } else {
-                       bch2_trans_update2(trans, i->iter, i->k);
-               }
+               ret = i->is_extent
+                       ? extent_update_to_keys(trans, *i)
+                       : (__bch2_trans_update2(trans, *i), 0);
+               if (unlikely(ret))
+                       goto out;
        }
 
        trans_for_each_update2(trans, i) {
-               BUG_ON(i->iter->uptodate > BTREE_ITER_NEED_PEEK);
-               BUG_ON(i->iter->locks_want < 1);
+               ret = bch2_btree_iter_traverse(i->iter);
+               if (unlikely(ret)) {
+                       trace_trans_restart_traverse(trans->ip);
+                       goto out;
+               }
+
+               if (unlikely(!bch2_btree_iter_upgrade(i->iter, i->level + 1))) {
+                       trace_trans_restart_upgrade(trans->ip);
+                       ret = -EINTR;
+                       goto out;
+               }
+
+               BUG_ON(!btree_node_intent_locked(i->iter, i->level));
 
                u64s = jset_u64s(i->k->k.u64s);
                if (btree_iter_type(i->iter) == BTREE_ITER_CACHED &&
@@ -939,20 +1007,20 @@ retry:
                goto err;
 
        trans_for_each_iter(trans, iter)
-               if ((trans->iters_live & (1ULL << iter->idx)) &&
-                   (iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT)) {
-                       if (trans->flags & BTREE_INSERT_NOUNLOCK)
-                               bch2_btree_iter_set_pos_same_leaf(iter, iter->pos_after_commit);
-                       else
-                               bch2_btree_iter_set_pos(iter, iter->pos_after_commit);
-               }
+               if (btree_iter_live(trans, iter) &&
+                   (iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT))
+                       bch2_btree_iter_set_pos(iter, iter->pos_after_commit);
 out:
        bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres);
 
        if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
                percpu_ref_put(&trans->c->writes);
-out_noupdates:
-       bch2_trans_reset(trans, !ret ? TRANS_RESET_NOTRAVERSE : 0);
+out_reset:
+       if (!ret)
+               reset_flags |= TRANS_RESET_NOTRAVERSE;
+       if (!ret && (trans->flags & BTREE_INSERT_NOUNLOCK))
+               reset_flags |= TRANS_RESET_NOUNLOCK;
+       bch2_trans_reset(trans, reset_flags);
 
        return ret;
 err:
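
Earlier in __bch2_trans_commit, transactional triggers are run to a fixed point: a trigger may queue further updates, so the update list is swept repeatedly (tracking trans_trigger_run) until a full pass runs nothing new. A minimal sketch of that loop with illustrative names, not the real trigger machinery:

#include <stdbool.h>
#include <stdio.h>

struct upd { bool has_trigger; bool trigger_run; };

/* Hypothetical trigger: may append more updates to the list. */
static void run_trigger(struct upd *u, struct upd *list, unsigned *nr)
{
        (void) u; (void) list; (void) nr;       /* this toy trigger adds nothing */
}

static void run_triggers_to_fixpoint(struct upd *list, unsigned *nr)
{
        bool ran;

        do {
                ran = false;
                for (unsigned i = 0; i < *nr; i++)
                        if (list[i].has_trigger && !list[i].trigger_run) {
                                list[i].trigger_run = true;
                                run_trigger(&list[i], list, nr);
                                ran = true;
                        }
        } while (ran);                  /* stop once a full pass ran nothing */
}

int main(void)
{
        struct upd list[4] = { { true, false }, { false, false } };
        unsigned nr = 2;

        run_triggers_to_fixpoint(list, &nr);
        printf("triggers done, %u updates\n", nr);
        return 0;
}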
@@ -967,75 +1035,111 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
                      struct bkey_i *k, enum btree_trigger_flags flags)
 {
        struct btree_insert_entry *i, n = (struct btree_insert_entry) {
-               .trigger_flags = flags, .iter = iter, .k = k
+               .trigger_flags  = flags,
+               .bkey_type      = __btree_node_type(iter->level, iter->btree_id),
+               .btree_id       = iter->btree_id,
+               .level          = iter->level,
+               .is_extent      = (iter->flags & BTREE_ITER_IS_EXTENTS) != 0,
+               .iter           = iter,
+               .k              = k
        };
 
-       EBUG_ON(bkey_cmp(iter->pos,
-                        (iter->flags & BTREE_ITER_IS_EXTENTS)
-                        ? bkey_start_pos(&k->k)
-                        : k->k.p));
+       BUG_ON(trans->nr_updates >= BTREE_ITER_MAX);
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+       BUG_ON(bkey_cmp(iter->pos,
+                       n.is_extent ? bkey_start_pos(&k->k) : k->k.p));
+
+       trans_for_each_update(trans, i) {
+               BUG_ON(bkey_cmp(i->iter->pos,
+                               i->is_extent ? bkey_start_pos(&i->k->k) : i->k->k.p));
+
+               BUG_ON(i != trans->updates &&
+                      btree_insert_entry_cmp(i - 1, i) >= 0);
+       }
+#endif
 
        iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
 
-       if (btree_node_type_is_extents(iter->btree_id)) {
+       if (n.is_extent) {
                iter->pos_after_commit = k->k.p;
                iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT;
        }
 
        /*
-        * Pending updates are kept sorted: first, find position of new update:
+        * Pending updates are kept sorted: first, find position of new update,
+        * then delete/trim any updates the new update overwrites:
         */
-       trans_for_each_update(trans, i)
-               if (btree_iter_cmp(iter, i->iter) <= 0)
-                       break;
+       if (!n.is_extent) {
+               trans_for_each_update(trans, i)
+                       if (btree_insert_entry_cmp(&n, i) <= 0)
+                               break;
 
-       /*
-        * Now delete/trim any updates the new update overwrites:
-        */
-       if (i > trans->updates &&
-           i[-1].iter->btree_id == iter->btree_id &&
-           bkey_cmp(iter->pos, i[-1].k->k.p) < 0)
-               bch2_cut_back(n.iter->pos, i[-1].k);
-
-       while (i < trans->updates + trans->nr_updates &&
-              iter->btree_id == i->iter->btree_id &&
-              bkey_cmp(n.k->k.p, i->k->k.p) >= 0)
-               array_remove_item(trans->updates, trans->nr_updates,
-                                 i - trans->updates);
-
-       if (i < trans->updates + trans->nr_updates &&
-           iter->btree_id == i->iter->btree_id &&
-           bkey_cmp(n.k->k.p, i->iter->pos) > 0) {
-               /*
-                * When we have an extent that overwrites the start of another
-                * update, trimming that extent will mean the iterator's
-                * position has to change since the iterator position has to
-                * match the extent's start pos - but we don't want to change
-                * the iterator pos if some other code is using it, so we may
-                * need to clone it:
-                */
-               if (trans->iters_live & (1ULL << i->iter->idx)) {
-                       i->iter = bch2_trans_copy_iter(trans, i->iter);
-                       if (IS_ERR(i->iter)) {
-                               trans->need_reset = true;
-                               return PTR_ERR(i->iter);
+               if (i < trans->updates + trans->nr_updates &&
+                   !btree_insert_entry_cmp(&n, i))
+                       *i = n;
+               else
+                       array_insert_item(trans->updates, trans->nr_updates,
+                                         i - trans->updates, n);
+       } else {
+               trans_for_each_update(trans, i)
+                       if (btree_insert_entry_cmp(&n, i) < 0)
+                               break;
+
+               while (i > trans->updates &&
+                      i[-1].btree_id == n.btree_id &&
+                      bkey_cmp(bkey_start_pos(&n.k->k),
+                               bkey_start_pos(&i[-1].k->k)) <= 0) {
+                       --i;
+                       array_remove_item(trans->updates, trans->nr_updates,
+                                         i - trans->updates);
+               }
+
+               if (i > trans->updates &&
+                   i[-1].btree_id == n.btree_id &&
+                   bkey_cmp(bkey_start_pos(&n.k->k), i[-1].k->k.p) < 0)
+                       bch2_cut_back(bkey_start_pos(&n.k->k), i[-1].k);
+
+               if (i < trans->updates + trans->nr_updates &&
+                   i->btree_id == n.btree_id &&
+                   bkey_cmp(n.k->k.p, bkey_start_pos(&i->k->k)) > 0) {
+                       /* We don't handle splitting extents here: */
+                       BUG_ON(bkey_cmp(bkey_start_pos(&n.k->k),
+                                       bkey_start_pos(&i->k->k)) > 0);
+
+                       /*
+                        * When we have an extent that overwrites the start of another
+                        * update, trimming that extent will mean the iterator's
+                        * position has to change since the iterator position has to
+                        * match the extent's start pos - but we don't want to change
+                        * the iterator pos if some other code is using it, so we may
+                        * need to clone it:
+                        */
+                       if (btree_iter_live(trans, i->iter)) {
+                               i->iter = bch2_trans_copy_iter(trans, i->iter);
+
+                               i->iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
+                               bch2_trans_iter_put(trans, i->iter);
                        }
 
-                       i->iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
-                       bch2_trans_iter_put(trans, i->iter);
+                       bch2_cut_front(n.k->k.p, i->k);
+                       bch2_btree_iter_set_pos(i->iter, n.k->k.p);
                }
 
-               bch2_cut_front(n.k->k.p, i->k);
-               bch2_btree_iter_set_pos(i->iter, n.k->k.p);
+               array_insert_item(trans->updates, trans->nr_updates,
+                                 i - trans->updates, n);
        }
 
-       EBUG_ON(trans->nr_updates >= trans->nr_iters);
-
-       array_insert_item(trans->updates, trans->nr_updates,
-                         i - trans->updates, n);
        return 0;
 }
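
For the non-extent path just above, the pending updates array is kept sorted by btree_insert_entry_cmp, and an update to the same key simply replaces the existing entry. A standalone sketch of that find-position, replace-or-insert step over a plain array (hypothetical types, not the real btree_insert_entry):

#include <string.h>
#include <stdio.h>

struct update { int btree_id; long long pos; int val; };

/* Compare by (btree_id, pos), in the spirit of btree_insert_entry_cmp. */
static int update_cmp(const struct update *l, const struct update *r)
{
        if (l->btree_id != r->btree_id)
                return l->btree_id < r->btree_id ? -1 : 1;
        if (l->pos != r->pos)
                return l->pos < r->pos ? -1 : 1;
        return 0;
}

/* Insert n into the sorted array, overwriting an entry with an equal key. */
static void insert_update(struct update *a, unsigned *nr, struct update n)
{
        unsigned i;

        for (i = 0; i < *nr; i++)
                if (update_cmp(&n, &a[i]) <= 0)
                        break;

        if (i < *nr && !update_cmp(&n, &a[i])) {
                a[i] = n;                       /* same key: newest update wins */
        } else {
                memmove(&a[i + 1], &a[i], (*nr - i) * sizeof(*a));
                a[i] = n;
                (*nr)++;
        }
}

int main(void)
{
        struct update a[8];
        unsigned nr = 0;

        insert_update(a, &nr, (struct update) { 1, 10, 100 });
        insert_update(a, &nr, (struct update) { 1,  5, 200 });
        insert_update(a, &nr, (struct update) { 1, 10, 300 });  /* replaces val 100 */

        for (unsigned i = 0; i < nr; i++)
                printf("btree %d pos %lld val %d\n", a[i].btree_id, a[i].pos, a[i].val);
        return 0;
}

Keeping the list sorted is what lets the extent branch above trim or drop overlapped entries by looking only at the immediate neighbours of the insertion point.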
 
+void bch2_trans_commit_hook(struct btree_trans *trans,
+                           struct btree_trans_commit_hook *h)
+{
+       h->next = trans->hooks;
+       trans->hooks = h;
+}
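
bch2_trans_commit_hook just pushes a hook onto a singly linked list hanging off the transaction, presumably to be walked by the commit path. A tiny sketch of the same intrusive-list pattern, with a hypothetical callback signature and run_hooks() (neither is shown in this diff):

#include <stdio.h>

struct commit_hook {
        struct commit_hook *next;
        void (*fn)(struct commit_hook *);
};

struct trans { struct commit_hook *hooks; };

/* Prepend, exactly like bch2_trans_commit_hook above. */
static void add_hook(struct trans *t, struct commit_hook *h)
{
        h->next = t->hooks;
        t->hooks = h;
}

/* Hypothetical: run every registered hook at commit time. */
static void run_hooks(struct trans *t)
{
        for (struct commit_hook *h = t->hooks; h; h = h->next)
                h->fn(h);
}

static void say_hi(struct commit_hook *h) { (void) h; printf("hook ran\n"); }

int main(void)
{
        struct trans t = { 0 };
        struct commit_hook h = { .fn = say_hi };

        add_hook(&t, &h);
        run_hooks(&t);
        return 0;
}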
+
 int __bch2_btree_insert(struct btree_trans *trans,
                        enum btree_id id, struct bkey_i *k)
 {
@@ -1044,8 +1148,6 @@ int __bch2_btree_insert(struct btree_trans *trans,
 
        iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k),
                                   BTREE_ITER_INTENT);
-       if (IS_ERR(iter))
-               return PTR_ERR(iter);
 
        ret   = bch2_btree_iter_traverse(iter) ?:
                bch2_trans_update(trans, iter, k, 0);
@@ -1069,13 +1171,28 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id,
                             __bch2_btree_insert(&trans, id, k));
 }
 
-int bch2_btree_delete_at_range(struct btree_trans *trans,
-                              struct btree_iter *iter,
-                              struct bpos end,
-                              u64 *journal_seq)
+int bch2_btree_delete_at(struct btree_trans *trans,
+                        struct btree_iter *iter, unsigned flags)
+{
+       struct bkey_i k;
+
+       bkey_init(&k.k);
+       k.k.p = iter->pos;
+
+       bch2_trans_update(trans, iter, &k, 0);
+       return bch2_trans_commit(trans, NULL, NULL,
+                                BTREE_INSERT_NOFAIL|flags);
+}
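
bch2_btree_delete_at expresses deletion as an ordinary update: an empty (zero-size, deleted) key at the iterator's position, committed like any insert. A toy sketch of the same tombstone idea over a flat array (illustrative names only):

#include <stdbool.h>
#include <stdio.h>

struct kv { long long key; int val; bool deleted; };

/* Deletion is just an update whose value is a tombstone. */
static struct kv make_tombstone(long long key)
{
        return (struct kv) { .key = key, .deleted = true };
}

/* Lookups skip tombstones, as if the key were never there. */
static const struct kv *lookup(const struct kv *a, unsigned nr, long long key)
{
        for (unsigned i = 0; i < nr; i++)
                if (a[i].key == key)
                        return a[i].deleted ? NULL : &a[i];
        return NULL;
}

int main(void)
{
        struct kv a[] = { { 1, 10, false }, { 2, 20, false } };

        a[1] = make_tombstone(2);               /* "delete" key 2 */
        printf("key 2 %s\n", lookup(a, 2, 2) ? "present" : "deleted");
        return 0;
}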
+
+int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
+                                 struct bpos start, struct bpos end,
+                                 u64 *journal_seq)
 {
+       struct btree_iter *iter;
        struct bkey_s_c k;
        int ret = 0;
+
+       iter = bch2_trans_get_iter(trans, id, start, BTREE_ITER_INTENT);
 retry:
        while ((k = bch2_btree_iter_peek(iter)).k &&
               !(ret = bkey_err(k)) &&
@@ -1086,6 +1203,10 @@ retry:
 
                bkey_init(&delete.k);
 
+               /*
+                * This could probably be more efficient for extents:
+                */
+
                /*
                 * For extents, iter.pos won't necessarily be the same as
                 * bkey_start_pos(k.k) (for non extents they always will be the
@@ -1125,22 +1246,8 @@ retry:
                goto retry;
        }
 
+       bch2_trans_iter_free(trans, iter);
        return ret;
-
-}
-
-int bch2_btree_delete_at(struct btree_trans *trans,
-                        struct btree_iter *iter, unsigned flags)
-{
-       struct bkey_i k;
-
-       bkey_init(&k.k);
-       k.k.p = iter->pos;
-
-       bch2_trans_update(trans, iter, &k, 0);
-       return bch2_trans_commit(trans, NULL, NULL,
-                                BTREE_INSERT_NOFAIL|
-                                BTREE_INSERT_USE_RESERVE|flags);
 }
 
 /*
@@ -1152,21 +1259,6 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
                            struct bpos start, struct bpos end,
                            u64 *journal_seq)
 {
-       struct btree_trans trans;
-       struct btree_iter *iter;
-       int ret = 0;
-
-       /*
-        * XXX: whether we need mem/more iters depends on whether this btree id
-        * has triggers
-        */
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512);
-
-       iter = bch2_trans_get_iter(&trans, id, start, BTREE_ITER_INTENT);
-
-       ret = bch2_btree_delete_at_range(&trans, iter, end, journal_seq);
-       ret = bch2_trans_exit(&trans) ?: ret;
-
-       BUG_ON(ret == -EINTR);
-       return ret;
+       return bch2_trans_do(c, NULL, journal_seq, 0,
+                            bch2_btree_delete_range_trans(&trans, id, start, end, journal_seq));
 }
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 82f1cc4ca6931f47a72f8b9dc25a588f34bbbc19..31f7617e85122f70608147b913bd8dca94d80158 100644 (file)
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -137,13 +137,14 @@ void bch2_bucket_seq_cleanup(struct bch_fs *c)
 void bch2_fs_usage_initialize(struct bch_fs *c)
 {
        struct bch_fs_usage *usage;
+       struct bch_dev *ca;
        unsigned i;
 
        percpu_down_write(&c->mark_lock);
        usage = c->usage_base;
 
-       bch2_fs_usage_acc_to_base(c, 0);
-       bch2_fs_usage_acc_to_base(c, 1);
+       for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+               bch2_fs_usage_acc_to_base(c, i);
 
        for (i = 0; i < BCH_REPLICAS_MAX; i++)
                usage->reserved += usage->persistent_reserved[i];
@@ -155,48 +156,38 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
                fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
        }
 
-       percpu_up_write(&c->mark_lock);
-}
+       for_each_member_device(ca, c, i) {
+               struct bch_dev_usage dev = bch2_dev_usage_read(ca);
 
-void bch2_fs_usage_scratch_put(struct bch_fs *c, struct bch_fs_usage *fs_usage)
-{
-       if (fs_usage == c->usage_scratch)
-               mutex_unlock(&c->usage_scratch_lock);
-       else
-               kfree(fs_usage);
+               usage->hidden += (dev.d[BCH_DATA_sb].buckets +
+                                 dev.d[BCH_DATA_journal].buckets) *
+                       ca->mi.bucket_size;
+       }
+
+       percpu_up_write(&c->mark_lock);
 }
 
-struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *c)
+static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca,
+                                                 unsigned journal_seq,
+                                                 bool gc)
 {
-       struct bch_fs_usage *ret;
-       unsigned bytes = fs_usage_u64s(c) * sizeof(u64);
-
-       ret = kzalloc(bytes, GFP_NOWAIT|__GFP_NOWARN);
-       if (ret)
-               return ret;
-
-       if (mutex_trylock(&c->usage_scratch_lock))
-               goto out_pool;
-
-       ret = kzalloc(bytes, GFP_NOFS);
-       if (ret)
-               return ret;
-
-       mutex_lock(&c->usage_scratch_lock);
-out_pool:
-       ret = c->usage_scratch;
-       memset(ret, 0, bytes);
-       return ret;
+       return this_cpu_ptr(gc
+                           ? ca->usage_gc
+                           : ca->usage[journal_seq & JOURNAL_BUF_MASK]);
 }
 
 struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
 {
+       struct bch_fs *c = ca->fs;
        struct bch_dev_usage ret;
+       unsigned seq, i, u64s = dev_usage_u64s();
 
-       memset(&ret, 0, sizeof(ret));
-       acc_u64s_percpu((u64 *) &ret,
-                       (u64 __percpu *) ca->usage[0],
-                       sizeof(ret) / sizeof(u64));
+       do {
+               seq = read_seqcount_begin(&c->usage_lock);
+               memcpy(&ret, ca->usage_base, u64s * sizeof(u64));
+               for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
+                       acc_u64s_percpu((u64 *) &ret, (u64 __percpu *) ca->usage[i], u64s);
+       } while (read_seqcount_retry(&c->usage_lock, seq));
 
        return ret;
 }
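
bch2_dev_usage_read now sums the device's base counters plus every per-cpu staging buffer under c->usage_lock's seqcount: if bch2_fs_usage_acc_to_base runs concurrently, the sequence number changes and the read retries. A simplified standalone sketch of that read-retry pattern using C11 atomics (not the kernel seqcount API; under real concurrency the data accesses would also need fences or atomics, this only shows the control flow):

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned seq;            /* even = stable, odd = write in progress */
static unsigned long long base_sectors;

static void writer_add(unsigned long long delta)
{
        atomic_fetch_add_explicit(&seq, 1, memory_order_acq_rel);      /* -> odd */
        base_sectors += delta;
        atomic_fetch_add_explicit(&seq, 1, memory_order_acq_rel);      /* -> even */
}

static unsigned long long read_sectors(void)
{
        unsigned long long ret;
        unsigned s;

        do {
                s = atomic_load_explicit(&seq, memory_order_acquire);
                if (s & 1)
                        continue;               /* writer active, retry */
                ret = base_sectors;
        } while (atomic_load_explicit(&seq, memory_order_acquire) != s);

        return ret;
}

int main(void)
{
        writer_add(4096);
        printf("sectors: %llu\n", read_sectors());
        return 0;
}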
@@ -207,13 +198,13 @@ static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c,
 {
        return this_cpu_ptr(gc
                            ? c->usage_gc
-                           : c->usage[journal_seq & 1]);
+                           : c->usage[journal_seq & JOURNAL_BUF_MASK]);
 }
 
 u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
 {
        ssize_t offset = v - (u64 *) c->usage_base;
-       unsigned seq;
+       unsigned i, seq;
        u64 ret;
 
        BUG_ON(offset < 0 || offset >= fs_usage_u64s(c));
@@ -221,38 +212,37 @@ u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
 
        do {
                seq = read_seqcount_begin(&c->usage_lock);
-               ret = *v +
-                       percpu_u64_get((u64 __percpu *) c->usage[0] + offset) +
-                       percpu_u64_get((u64 __percpu *) c->usage[1] + offset);
+               ret = *v;
+
+               for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+                       ret += percpu_u64_get((u64 __percpu *) c->usage[i] + offset);
        } while (read_seqcount_retry(&c->usage_lock, seq));
 
        return ret;
 }
 
-struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c)
+struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c)
 {
-       struct bch_fs_usage *ret;
-       unsigned seq, v, u64s = fs_usage_u64s(c);
-retry:
-       ret = kmalloc(u64s * sizeof(u64), GFP_NOFS);
-       if (unlikely(!ret))
-               return NULL;
+       struct bch_fs_usage_online *ret;
+       unsigned seq, i, u64s;
 
        percpu_down_read(&c->mark_lock);
 
-       v = fs_usage_u64s(c);
-       if (unlikely(u64s != v)) {
-               u64s = v;
+       ret = kmalloc(sizeof(struct bch_fs_usage_online) +
+                     sizeof(u64) + c->replicas.nr, GFP_NOFS);
+       if (unlikely(!ret)) {
                percpu_up_read(&c->mark_lock);
-               kfree(ret);
-               goto retry;
+               return NULL;
        }
 
+       ret->online_reserved = percpu_u64_get(c->online_reserved);
+
+       u64s = fs_usage_u64s(c);
        do {
                seq = read_seqcount_begin(&c->usage_lock);
-               memcpy(ret, c->usage_base, u64s * sizeof(u64));
-               acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[0], u64s);
-               acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[1], u64s);
+               memcpy(&ret->u, c->usage_base, u64s * sizeof(u64));
+               for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+                       acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i], u64s);
        } while (read_seqcount_retry(&c->usage_lock, seq));
 
        return ret;
@@ -260,9 +250,10 @@ retry:
 
 void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
 {
-       unsigned u64s = fs_usage_u64s(c);
+       struct bch_dev *ca;
+       unsigned i, u64s = fs_usage_u64s(c);
 
-       BUG_ON(idx >= 2);
+       BUG_ON(idx >= ARRAY_SIZE(c->usage));
 
        preempt_disable();
        write_seqcount_begin(&c->usage_lock);
@@ -271,37 +262,47 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
                        (u64 __percpu *) c->usage[idx], u64s);
        percpu_memset(c->usage[idx], 0, u64s * sizeof(u64));
 
+       rcu_read_lock();
+       for_each_member_device_rcu(ca, c, i, NULL) {
+               u64s = dev_usage_u64s();
+
+               acc_u64s_percpu((u64 *) ca->usage_base,
+                               (u64 __percpu *) ca->usage[idx], u64s);
+               percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64));
+       }
+       rcu_read_unlock();
+
        write_seqcount_end(&c->usage_lock);
        preempt_enable();
 }
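
Usage deltas are now staged in one per-cpu buffer per journal write buffer (indexed by journal_seq & JOURNAL_BUF_MASK rather than & 1), and bch2_fs_usage_acc_to_base folds a chosen buffer into the base counters and zeroes it, for the filesystem and now for each member device as well. A tiny sketch of that accumulate-then-reset scheme over plain arrays (no per-cpu or locking machinery; NR_BUFS is a stand-in for the real buffer count):

#include <stdio.h>
#include <string.h>

#define NR_BUFS 2               /* assumed stand-in for the journal buffer count */
#define NR_CTRS 4

static long long base[NR_CTRS];
static long long bufs[NR_BUFS][NR_CTRS];

/* Stage a delta in the buffer chosen by the journal sequence number. */
static void account(unsigned long long journal_seq, unsigned ctr, long long delta)
{
        bufs[journal_seq & (NR_BUFS - 1)][ctr] += delta;
}

/* Fold one staging buffer into the base counters and reset it. */
static void acc_to_base(unsigned idx)
{
        for (unsigned i = 0; i < NR_CTRS; i++)
                base[i] += bufs[idx][i];
        memset(bufs[idx], 0, sizeof(bufs[idx]));
}

/* A consistent read sums the base plus every staging buffer. */
static long long read_ctr(unsigned ctr)
{
        long long ret = base[ctr];

        for (unsigned i = 0; i < NR_BUFS; i++)
                ret += bufs[i][ctr];
        return ret;
}

int main(void)
{
        account(41, 0, 100);
        account(42, 0, 50);
        acc_to_base(41 & (NR_BUFS - 1));
        printf("ctr 0 = %lld\n", read_ctr(0));  /* 150 before or after folding */
        return 0;
}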
 
 void bch2_fs_usage_to_text(struct printbuf *out,
                           struct bch_fs *c,
-                          struct bch_fs_usage *fs_usage)
+                          struct bch_fs_usage_online *fs_usage)
 {
        unsigned i;
 
        pr_buf(out, "capacity:\t\t\t%llu\n", c->capacity);
 
        pr_buf(out, "hidden:\t\t\t\t%llu\n",
-              fs_usage->hidden);
+              fs_usage->u.hidden);
        pr_buf(out, "data:\t\t\t\t%llu\n",
-              fs_usage->data);
+              fs_usage->u.data);
        pr_buf(out, "cached:\t\t\t\t%llu\n",
-              fs_usage->cached);
+              fs_usage->u.cached);
        pr_buf(out, "reserved:\t\t\t%llu\n",
-              fs_usage->reserved);
+              fs_usage->u.reserved);
        pr_buf(out, "nr_inodes:\t\t\t%llu\n",
-              fs_usage->nr_inodes);
+              fs_usage->u.nr_inodes);
        pr_buf(out, "online reserved:\t\t%llu\n",
               fs_usage->online_reserved);
 
        for (i = 0;
-            i < ARRAY_SIZE(fs_usage->persistent_reserved);
+            i < ARRAY_SIZE(fs_usage->u.persistent_reserved);
             i++) {
                pr_buf(out, "%u replicas:\n", i + 1);
                pr_buf(out, "\treserved:\t\t%llu\n",
-                      fs_usage->persistent_reserved[i]);
+                      fs_usage->u.persistent_reserved[i]);
        }
 
        for (i = 0; i < c->replicas.nr; i++) {
@@ -310,7 +311,7 @@ void bch2_fs_usage_to_text(struct printbuf *out,
 
                pr_buf(out, "\t");
                bch2_replicas_entry_to_text(out, e);
-               pr_buf(out, ":\t%llu\n", fs_usage->replicas[i]);
+               pr_buf(out, ":\t%llu\n", fs_usage->u.replicas[i]);
        }
 }
 
@@ -323,15 +324,15 @@ static u64 reserve_factor(u64 r)
 
 static u64 avail_factor(u64 r)
 {
-       return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1);
+       return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1);
 }
 
-u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage *fs_usage)
+u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage)
 {
-       return min(fs_usage->hidden +
-                  fs_usage->btree +
-                  fs_usage->data +
-                  reserve_factor(fs_usage->reserved +
+       return min(fs_usage->u.hidden +
+                  fs_usage->u.btree +
+                  fs_usage->u.data +
+                  reserve_factor(fs_usage->u.reserved +
                                  fs_usage->online_reserved),
                   c->capacity);
 }
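
avail_factor() is the approximate inverse of reserve_factor() just above (which, in the surrounding source, presumably pads a value by r >> RESERVE_FACTOR); switching to div_u64 keeps the 64-bit division usable on 32-bit builds. A quick worked check of that inverse relationship, assuming a factor of 6 since the real RESERVE_FACTOR value isn't visible in this hunk:

#include <stdio.h>
#include <stdint.h>

#define RESERVE_FACTOR 6        /* assumed value for illustration */

/* Presumed shape of reserve_factor(): pad by 1/2^RESERVE_FACTOR. */
static uint64_t reserve_factor(uint64_t r)
{
        return r + (r >> RESERVE_FACTOR);
}

/* As in the hunk above: the (approximate) inverse of reserve_factor(). */
static uint64_t avail_factor(uint64_t r)
{
        return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1);
}

int main(void)
{
        uint64_t r = 1000000;

        /* avail_factor(reserve_factor(r)) == r, modulo integer truncation */
        printf("%llu -> padded %llu -> back to %llu\n",
               (unsigned long long) r,
               (unsigned long long) reserve_factor(r),
               (unsigned long long) avail_factor(reserve_factor(r)));
        return 0;
}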
@@ -348,7 +349,7 @@ __bch2_fs_usage_read_short(struct bch_fs *c)
        data            = bch2_fs_usage_read_one(c, &c->usage_base->data) +
                bch2_fs_usage_read_one(c, &c->usage_base->btree);
        reserved        = bch2_fs_usage_read_one(c, &c->usage_base->reserved) +
-               bch2_fs_usage_read_one(c, &c->usage_base->online_reserved);
+               percpu_u64_get(c->online_reserved);
 
        ret.used        = min(ret.capacity, data + reserve_factor(reserved));
        ret.free        = ret.capacity - ret.used;
@@ -375,15 +376,12 @@ static inline int is_unavailable_bucket(struct bucket_mark m)
        return !is_available_bucket(m);
 }
 
-static inline int is_fragmented_bucket(struct bucket_mark m,
-                                      struct bch_dev *ca)
+static inline int bucket_sectors_fragmented(struct bch_dev *ca,
+                                           struct bucket_mark m)
 {
-       if (!m.owned_by_allocator &&
-           m.data_type == BCH_DATA_user &&
-           bucket_sectors_used(m))
-               return max_t(int, 0, (int) ca->mi.bucket_size -
-                            bucket_sectors_used(m));
-       return 0;
+       return bucket_sectors_used(m)
+               ? max(0, (int) ca->mi.bucket_size - (int) bucket_sectors_used(m))
+               : 0;
 }
 
 static inline int is_stripe_data_bucket(struct bucket_mark m)
@@ -391,11 +389,6 @@ static inline int is_stripe_data_bucket(struct bucket_mark m)
        return m.stripe && m.data_type != BCH_DATA_parity;
 }
 
-static inline int bucket_stripe_sectors(struct bucket_mark m)
-{
-       return is_stripe_data_bucket(m) ? m.dirty_sectors : 0;
-}
-
 static inline enum bch_data_type bucket_type(struct bucket_mark m)
 {
        return m.cached_sectors && !m.dirty_sectors
@@ -410,43 +403,6 @@ static bool bucket_became_unavailable(struct bucket_mark old,
               !is_available_bucket(new);
 }
 
-int bch2_fs_usage_apply(struct bch_fs *c,
-                       struct bch_fs_usage *fs_usage,
-                       struct disk_reservation *disk_res,
-                       unsigned journal_seq)
-{
-       s64 added = fs_usage->data + fs_usage->reserved;
-       s64 should_not_have_added;
-       int ret = 0;
-
-       percpu_rwsem_assert_held(&c->mark_lock);
-
-       /*
-        * Not allowed to reduce sectors_available except by getting a
-        * reservation:
-        */
-       should_not_have_added = added - (s64) (disk_res ? disk_res->sectors : 0);
-       if (WARN_ONCE(should_not_have_added > 0,
-                     "disk usage increased by %lli more than reservation of %llu",
-                     added, disk_res ? disk_res->sectors : 0)) {
-               atomic64_sub(should_not_have_added, &c->sectors_available);
-               added -= should_not_have_added;
-               ret = -1;
-       }
-
-       if (added > 0) {
-               disk_res->sectors               -= added;
-               fs_usage->online_reserved       -= added;
-       }
-
-       preempt_disable();
-       acc_u64s((u64 *) fs_usage_ptr(c, journal_seq, false),
-                (u64 *) fs_usage, fs_usage_u64s(c));
-       preempt_enable();
-
-       return ret;
-}
-
 static inline void account_bucket(struct bch_fs_usage *fs_usage,
                                  struct bch_dev_usage *dev_usage,
                                  enum bch_data_type type,
@@ -455,20 +411,22 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage,
        if (type == BCH_DATA_sb || type == BCH_DATA_journal)
                fs_usage->hidden        += size;
 
-       dev_usage->buckets[type]        += nr;
+       dev_usage->d[type].buckets      += nr;
 }
 
 static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
                                  struct bch_fs_usage *fs_usage,
                                  struct bucket_mark old, struct bucket_mark new,
-                                 bool gc)
+                                 u64 journal_seq, bool gc)
 {
        struct bch_dev_usage *u;
 
        percpu_rwsem_assert_held(&c->mark_lock);
 
        preempt_disable();
-       u = this_cpu_ptr(ca->usage[gc]);
+       if (!fs_usage)
+               fs_usage = fs_usage_ptr(c, journal_seq, gc);
+       u = dev_usage_ptr(ca, journal_seq, gc);
 
        if (bucket_type(old))
                account_bucket(fs_usage, u, bucket_type(old),
@@ -478,68 +436,35 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
                account_bucket(fs_usage, u, bucket_type(new),
                               1, ca->mi.bucket_size);
 
-       u->buckets_alloc +=
-               (int) new.owned_by_allocator - (int) old.owned_by_allocator;
+       u->buckets_ec += (int) new.stripe - (int) old.stripe;
        u->buckets_unavailable +=
                is_unavailable_bucket(new) - is_unavailable_bucket(old);
 
-       u->buckets_ec += (int) new.stripe - (int) old.stripe;
-       u->sectors_ec += bucket_stripe_sectors(new) -
-                        bucket_stripe_sectors(old);
-
-       u->sectors[old.data_type] -= old.dirty_sectors;
-       u->sectors[new.data_type] += new.dirty_sectors;
-       u->sectors[BCH_DATA_cached] +=
+       u->d[old.data_type].sectors -= old.dirty_sectors;
+       u->d[new.data_type].sectors += new.dirty_sectors;
+       u->d[BCH_DATA_cached].sectors +=
                (int) new.cached_sectors - (int) old.cached_sectors;
-       u->sectors_fragmented +=
-               is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca);
+
+       u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old);
+       u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new);
+
        preempt_enable();
 
        if (!is_available_bucket(old) && is_available_bucket(new))
                bch2_wake_allocator(ca);
 }
 
-__flatten
-void bch2_dev_usage_from_buckets(struct bch_fs *c)
-{
-       struct bch_dev *ca;
-       struct bucket_mark old = { .v.counter = 0 };
-       struct bucket_array *buckets;
-       struct bucket *g;
-       unsigned i;
-       int cpu;
-
-       c->usage_base->hidden = 0;
-
-       for_each_member_device(ca, c, i) {
-               for_each_possible_cpu(cpu)
-                       memset(per_cpu_ptr(ca->usage[0], cpu), 0,
-                              sizeof(*ca->usage[0]));
-
-               buckets = bucket_array(ca);
-
-               for_each_bucket(g, buckets)
-                       bch2_dev_usage_update(c, ca, c->usage_base,
-                                             old, g->mark, false);
-       }
-}
-
-static inline int update_replicas(struct bch_fs *c,
-                                 struct bch_fs_usage *fs_usage,
-                                 struct bch_replicas_entry *r,
-                                 s64 sectors)
+static inline void update_replicas(struct bch_fs *c,
+                                  struct bch_fs_usage *fs_usage,
+                                  struct bch_replicas_entry *r,
+                                  s64 sectors)
 {
        int idx = bch2_replicas_entry_idx(c, r);
 
-       if (idx < 0)
-               return -1;
-
-       if (!fs_usage)
-               return 0;
+       BUG_ON(idx < 0);
 
        fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
        fs_usage->replicas[idx]         += sectors;
-       return 0;
 }
 
 static inline void update_cached_sectors(struct bch_fs *c,
@@ -586,6 +511,7 @@ static inline void update_replicas_list(struct btree_trans *trans,
        n = (void *) d->d + d->used;
        n->delta = sectors;
        memcpy(&n->r, r, replicas_entry_bytes(r));
+       bch2_replicas_entry_sort(&n->r);
        d->used += b;
 }
 
@@ -599,43 +525,6 @@ static inline void update_cached_sectors_list(struct btree_trans *trans,
        update_replicas_list(trans, &r.e, sectors);
 }
 
-static inline struct replicas_delta *
-replicas_delta_next(struct replicas_delta *d)
-{
-       return (void *) d + replicas_entry_bytes(&d->r) + 8;
-}
-
-int bch2_replicas_delta_list_apply(struct bch_fs *c,
-                                  struct bch_fs_usage *fs_usage,
-                                  struct replicas_delta_list *r)
-{
-       struct replicas_delta *d = r->d;
-       struct replicas_delta *top = (void *) r->d + r->used;
-       unsigned i;
-
-       for (d = r->d; d != top; d = replicas_delta_next(d))
-               if (update_replicas(c, fs_usage, &d->r, d->delta)) {
-                       top = d;
-                       goto unwind;
-               }
-
-       if (!fs_usage)
-               return 0;
-
-       fs_usage->nr_inodes += r->nr_inodes;
-
-       for (i = 0; i < BCH_REPLICAS_MAX; i++) {
-               fs_usage->reserved += r->persistent_reserved[i];
-               fs_usage->persistent_reserved[i] += r->persistent_reserved[i];
-       }
-
-       return 0;
-unwind:
-       for (d = r->d; d != top; d = replicas_delta_next(d))
-               update_replicas(c, fs_usage, &d->r, -d->delta);
-       return -1;
-}
-
 #define do_mark_fn(fn, c, pos, flags, ...)                             \
 ({                                                                     \
        int gc, ret = 0;                                                \
@@ -649,51 +538,10 @@ unwind:
        ret;                                                            \
 })
 
-static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
-                                   size_t b, struct bucket_mark *ret,
-                                   bool gc)
-{
-       struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc);
-       struct bucket *g = __bucket(ca, b, gc);
-       struct bucket_mark old, new;
-
-       old = bucket_cmpxchg(g, new, ({
-               BUG_ON(!is_available_bucket(new));
-
-               new.owned_by_allocator  = true;
-               new.data_type           = 0;
-               new.cached_sectors      = 0;
-               new.dirty_sectors       = 0;
-               new.gen++;
-       }));
-
-       bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
-
-       if (old.cached_sectors)
-               update_cached_sectors(c, fs_usage, ca->dev_idx,
-                                     -((s64) old.cached_sectors));
-
-       if (!gc)
-               *ret = old;
-       return 0;
-}
-
-void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
-                           size_t b, struct bucket_mark *old)
-{
-       do_mark_fn(__bch2_invalidate_bucket, c, gc_phase(GC_PHASE_START), 0,
-                  ca, b, old);
-
-       if (!old->owned_by_allocator && old->cached_sectors)
-               trace_invalidate(ca, bucket_to_sector(ca, b),
-                                old->cached_sectors);
-}
-
 static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
                                    size_t b, bool owned_by_allocator,
                                    bool gc)
 {
-       struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc);
        struct bucket *g = __bucket(ca, b, gc);
        struct bucket_mark old, new;
 
@@ -701,8 +549,6 @@ static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
                new.owned_by_allocator  = owned_by_allocator;
        }));
 
-       bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
-
        BUG_ON(!gc &&
               !owned_by_allocator && !old.owned_by_allocator);
 
@@ -733,7 +579,8 @@ static int bch2_mark_alloc(struct bch_fs *c,
        struct bucket_mark old_m, m;
 
        /* We don't do anything for deletions - do we?: */
-       if (new.k->type != KEY_TYPE_alloc)
+       if (new.k->type != KEY_TYPE_alloc &&
+           new.k->type != KEY_TYPE_alloc_v2)
                return 0;
 
        /*
@@ -756,6 +603,7 @@ static int bch2_mark_alloc(struct bch_fs *c,
                m.data_type             = u.data_type;
                m.dirty_sectors         = u.dirty_sectors;
                m.cached_sectors        = u.cached_sectors;
+               m.stripe                = u.stripe != 0;
 
                if (journal_seq) {
                        m.journal_seq_valid     = 1;
@@ -763,12 +611,14 @@ static int bch2_mark_alloc(struct bch_fs *c,
                }
        }));
 
-       bch2_dev_usage_update(c, ca, fs_usage, old_m, m, gc);
+       bch2_dev_usage_update(c, ca, fs_usage, old_m, m, journal_seq, gc);
 
        g->io_time[READ]        = u.read_time;
        g->io_time[WRITE]       = u.write_time;
        g->oldest_gen           = u.oldest_gen;
        g->gen_valid            = 1;
+       g->stripe               = u.stripe;
+       g->stripe_redundancy    = u.stripe_redundancy;
 
        /*
         * need to know if we're getting called from the invalidate path or
@@ -826,7 +676,7 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
 
        if (c)
                bch2_dev_usage_update(c, ca, fs_usage_ptr(c, 0, gc),
-                                     old, new, gc);
+                                     old, new, 0, gc);
 
        return 0;
 }
@@ -963,11 +813,10 @@ static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k,
        return 0;
 }
 
-static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
+static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k,
                             unsigned ptr_idx,
                             struct bch_fs_usage *fs_usage,
-                            u64 journal_seq, unsigned flags,
-                            bool enabled)
+                            u64 journal_seq, unsigned flags)
 {
        const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
        unsigned nr_data = s->nr_blocks - s->nr_redundant;
@@ -980,8 +829,13 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
        char buf[200];
        int ret;
 
-       if (enabled)
-               g->ec_redundancy = s->nr_redundant;
+       if (g->stripe && g->stripe != k.k->p.offset) {
+               bch2_fs_inconsistent(c,
+                             "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
+                             ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen,
+                             (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
+               return -EINVAL;
+       }
 
        old = bucket_cmpxchg(g, new, ({
                ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type,
@@ -989,23 +843,9 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
                if (ret)
                        return ret;
 
-               if (new.stripe && enabled)
-                       bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
-                                     "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
-                                     ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen,
-                                     (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
-
-               if (!new.stripe && !enabled)
-                       bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
-                                     "bucket %u:%zu gen %u: deleting stripe but not marked\n%s",
-                                     ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen,
-                                     (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
-
-               new.stripe                      = enabled;
-
-               if ((flags & BTREE_TRIGGER_GC) && parity) {
-                       new.data_type = enabled ? BCH_DATA_parity : 0;
-                       new.dirty_sectors = enabled ? le16_to_cpu(s->sectors): 0;
+               if (parity) {
+                       new.data_type           = BCH_DATA_parity;
+                       new.dirty_sectors       = le16_to_cpu(s->sectors);
                }
 
                if (journal_seq) {
@@ -1014,10 +854,10 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
                }
        }));
 
-       if (!enabled)
-               g->ec_redundancy = 0;
+       g->stripe               = k.k->p.offset;
+       g->stripe_redundancy    = s->nr_redundant;
 
-       bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
+       bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc);
        return 0;
 }
 
@@ -1084,7 +924,7 @@ static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k,
                              old.v.counter,
                              new.v.counter)) != old.v.counter);
 
-       bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
+       bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc);
 
        BUG_ON(!gc && bucket_became_unavailable(old, new));
 
@@ -1211,6 +1051,8 @@ static int bch2_mark_stripe(struct bch_fs *c,
        unsigned i;
        int ret;
 
+       BUG_ON(gc && old_s);
+
        if (!m || (old_s && !m->alive)) {
                bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
                                    idx);
@@ -1218,48 +1060,12 @@ static int bch2_mark_stripe(struct bch_fs *c,
        }
 
        if (!new_s) {
-               /* Deleting: */
-               for (i = 0; i < old_s->nr_blocks; i++) {
-                       ret = bucket_set_stripe(c, old, i, fs_usage,
-                                               journal_seq, flags, false);
-                       if (ret)
-                               return ret;
-               }
-
-               if (!gc && m->on_heap) {
-                       spin_lock(&c->ec_stripes_heap_lock);
-                       bch2_stripes_heap_del(c, m, idx);
-                       spin_unlock(&c->ec_stripes_heap_lock);
-               }
-
-               if (gc)
-                       update_replicas(c, fs_usage, &m->r.e,
-                                       -((s64) m->sectors * m->nr_redundant));
+               spin_lock(&c->ec_stripes_heap_lock);
+               bch2_stripes_heap_del(c, m, idx);
+               spin_unlock(&c->ec_stripes_heap_lock);
 
                memset(m, 0, sizeof(*m));
        } else {
-               BUG_ON(old_s && new_s->nr_blocks != old_s->nr_blocks);
-               BUG_ON(old_s && new_s->nr_redundant != old_s->nr_redundant);
-
-               for (i = 0; i < new_s->nr_blocks; i++) {
-                       if (!old_s ||
-                           memcmp(new_s->ptrs + i,
-                                  old_s->ptrs + i,
-                                  sizeof(struct bch_extent_ptr))) {
-
-                               if (old_s) {
-                                       bucket_set_stripe(c, old, i, fs_usage,
-                                                         journal_seq, flags, false);
-                                       if (ret)
-                                               return ret;
-                               }
-                               ret = bucket_set_stripe(c, new, i, fs_usage,
-                                                       journal_seq, flags, true);
-                               if (ret)
-                                       return ret;
-                       }
-               }
-
                m->alive        = true;
                m->sectors      = le16_to_cpu(new_s->sectors);
                m->algorithm    = new_s->algorithm;
@@ -1271,18 +1077,12 @@ static int bch2_mark_stripe(struct bch_fs *c,
                        m->block_sectors[i] =
                                stripe_blockcount_get(new_s, i);
                        m->blocks_nonempty += !!m->block_sectors[i];
-               }
 
-               if (gc && old_s)
-                       update_replicas(c, fs_usage, &m->r.e,
-                                       -((s64) m->sectors * m->nr_redundant));
+                       m->ptrs[i] = new_s->ptrs[i];
+               }
 
                bch2_bkey_to_replicas(&m->r.e, new);
 
-               if (gc)
-                       update_replicas(c, fs_usage, &m->r.e,
-                                       ((s64) m->sectors * m->nr_redundant));
-
                if (!gc) {
                        spin_lock(&c->ec_stripes_heap_lock);
                        bch2_stripes_heap_update(c, m, idx);
@@ -1290,6 +1090,25 @@ static int bch2_mark_stripe(struct bch_fs *c,
                }
        }
 
+       if (gc) {
+               /*
+                * gc recalculates this field from stripe ptr
+                * references:
+                */
+               memset(m->block_sectors, 0, sizeof(m->block_sectors));
+               m->blocks_nonempty = 0;
+
+               for (i = 0; i < new_s->nr_blocks; i++) {
+                       ret = mark_stripe_bucket(c, new, i, fs_usage,
+                                                journal_seq, flags);
+                       if (ret)
+                               return ret;
+               }
+
+               update_replicas(c, fs_usage, &m->r.e,
+                               ((s64) m->sectors * m->nr_redundant));
+       }
+
        return 0;
 }
 
@@ -1313,6 +1132,7 @@ static int bch2_mark_key_locked(struct bch_fs *c,
 
        switch (k.k->type) {
        case KEY_TYPE_alloc:
+       case KEY_TYPE_alloc_v2:
                ret = bch2_mark_alloc(c, old, new, fs_usage, journal_seq, flags);
                break;
        case KEY_TYPE_btree_ptr:
@@ -1333,10 +1153,8 @@ static int bch2_mark_key_locked(struct bch_fs *c,
                ret = bch2_mark_stripe(c, old, new, fs_usage, journal_seq, flags);
                break;
        case KEY_TYPE_inode:
-               if (!(flags & BTREE_TRIGGER_OVERWRITE))
-                       fs_usage->nr_inodes++;
-               else
-                       fs_usage->nr_inodes--;
+               fs_usage->nr_inodes += new.k->type == KEY_TYPE_inode;
+               fs_usage->nr_inodes -= old.k->type == KEY_TYPE_inode;
                break;
        case KEY_TYPE_reservation: {
                unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
@@ -1383,9 +1201,6 @@ int bch2_mark_update(struct btree_trans *trans,
                     unsigned flags)
 {
        struct bch_fs           *c = trans->c;
-       struct btree            *b = iter_l(iter)->b;
-       struct btree_node_iter  node_iter = iter_l(iter)->iter;
-       struct bkey_packed      *_old;
        struct bkey_s_c         old;
        struct bkey             unpacked;
        int ret = 0;
@@ -1400,10 +1215,10 @@ int bch2_mark_update(struct btree_trans *trans,
        old = (struct bkey_s_c) { &unpacked, NULL };
 
        if (!btree_node_type_is_extents(iter->btree_id)) {
+               /* iterators should be uptodate, shouldn't get errors here: */
                if (btree_iter_type(iter) != BTREE_ITER_CACHED) {
-                       _old = bch2_btree_node_iter_peek(&node_iter, b);
-                       if (_old)
-                               old = bkey_disassemble(b, _old, &unpacked);
+                       old = bch2_btree_iter_peek_slot(iter);
+                       BUG_ON(bkey_err(old));
                } else {
                        struct bkey_cached *ck = (void *) iter->l[0].b;
 
@@ -1425,23 +1240,24 @@ int bch2_mark_update(struct btree_trans *trans,
                                BTREE_TRIGGER_OVERWRITE|flags);
                }
        } else {
+               struct btree_iter *copy;
+
                BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED);
                bch2_mark_key_locked(c, old, bkey_i_to_s_c(new),
                        0, new->k.size,
                        fs_usage, trans->journal_res.seq,
                        BTREE_TRIGGER_INSERT|flags);
 
-               while ((_old = bch2_btree_node_iter_peek(&node_iter, b))) {
-                       unsigned offset = 0;
-                       s64 sectors;
+               copy = bch2_trans_copy_iter(trans, iter);
 
-                       old = bkey_disassemble(b, _old, &unpacked);
-                       sectors = -((s64) old.k->size);
+               for_each_btree_key_continue(copy, 0, old, ret) {
+                       unsigned offset = 0;
+                       s64 sectors = -((s64) old.k->size);
 
                        flags |= BTREE_TRIGGER_OVERWRITE;
 
                        if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0)
-                               return 0;
+                               break;
 
                        switch (bch2_extent_overlap(&new->k, old.k)) {
                        case BCH_EXTENT_OVERLAP_ALL:
@@ -1474,30 +1290,22 @@ int bch2_mark_update(struct btree_trans *trans,
                                        trans->journal_res.seq, flags) ?: 1;
                        if (ret <= 0)
                                break;
-
-                       bch2_btree_node_iter_advance(&node_iter, b);
                }
+               bch2_trans_iter_put(trans, copy);
        }
 
        return ret;
 }
 
-void bch2_trans_fs_usage_apply(struct btree_trans *trans,
-                              struct bch_fs_usage *fs_usage)
+static noinline __cold
+void fs_usage_apply_warn(struct btree_trans *trans,
+                        unsigned disk_res_sectors)
 {
        struct bch_fs *c = trans->c;
        struct btree_insert_entry *i;
-       static int warned_disk_usage = 0;
-       u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
        char buf[200];
 
-       if (!bch2_fs_usage_apply(c, fs_usage, trans->disk_res,
-                                trans->journal_res.seq) ||
-           warned_disk_usage ||
-           xchg(&warned_disk_usage, 1))
-               return;
-
-       bch_err(c, "disk usage increased more than %llu sectors reserved",
+       bch_err(c, "disk usage increased more than %u sectors reserved",
                disk_res_sectors);
 
        trans_for_each_update(trans, i) {
@@ -1507,27 +1315,20 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans,
                pr_err("overlapping with");
 
                if (btree_iter_type(i->iter) != BTREE_ITER_CACHED) {
-                       struct btree            *b = iter_l(i->iter)->b;
-                       struct btree_node_iter  node_iter = iter_l(i->iter)->iter;
-                       struct bkey_packed      *_k;
-
-                       while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) {
-                               struct bkey             unpacked;
-                               struct bkey_s_c         k;
-
-                               pr_info("_k %px format %u", _k, _k->format);
-                               k = bkey_disassemble(b, _k, &unpacked);
+                       struct btree_iter *copy = bch2_trans_copy_iter(trans, i->iter);
+                       struct bkey_s_c k;
+                       int ret;
 
-                               if (btree_node_is_extents(b)
+                       for_each_btree_key_continue(copy, 0, k, ret) {
+                               if (btree_node_type_is_extents(i->iter->btree_id)
                                    ? bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) <= 0
                                    : bkey_cmp(i->k->k.p, k.k->p))
                                        break;
 
                                bch2_bkey_val_to_text(&PBUF(buf), c, k);
                                pr_err("%s", buf);
-
-                               bch2_btree_node_iter_advance(&node_iter, b);
                        }
+                       bch2_trans_iter_put(trans, copy);
                } else {
                        struct bkey_cached *ck = (void *) i->iter->l[0].b;
 
@@ -1539,6 +1340,65 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans,
        }
 }
 
+void bch2_trans_fs_usage_apply(struct btree_trans *trans,
+                              struct replicas_delta_list *deltas)
+{
+       struct bch_fs *c = trans->c;
+       static int warned_disk_usage = 0;
+       bool warn = false;
+       unsigned disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
+       struct replicas_delta *d = deltas->d;
+       struct replicas_delta *top = (void *) deltas->d + deltas->used;
+       struct bch_fs_usage *dst;
+       s64 added = 0, should_not_have_added;
+       unsigned i;
+
+       percpu_rwsem_assert_held(&c->mark_lock);
+
+       preempt_disable();
+       dst = fs_usage_ptr(c, trans->journal_res.seq, false);
+
+       for (d = deltas->d; d != top; d = replicas_delta_next(d)) {
+               switch (d->r.data_type) {
+               case BCH_DATA_btree:
+               case BCH_DATA_user:
+               case BCH_DATA_parity:
+                       added += d->delta;
+               }
+
+               update_replicas(c, dst, &d->r, d->delta);
+       }
+
+       dst->nr_inodes += deltas->nr_inodes;
+
+       for (i = 0; i < BCH_REPLICAS_MAX; i++) {
+               added                           += deltas->persistent_reserved[i];
+               dst->reserved                   += deltas->persistent_reserved[i];
+               dst->persistent_reserved[i]     += deltas->persistent_reserved[i];
+       }
+
+       /*
+        * Not allowed to reduce sectors_available except by getting a
+        * reservation:
+        */
+       should_not_have_added = added - (s64) disk_res_sectors;
+       if (unlikely(should_not_have_added > 0)) {
+               atomic64_sub(should_not_have_added, &c->sectors_available);
+               added -= should_not_have_added;
+               warn = true;
+       }
+
+       if (added > 0) {
+               trans->disk_res->sectors -= added;
+               this_cpu_sub(*c->online_reserved, added);
+       }
+
+       preempt_enable();
+
+       if (unlikely(warn) && !xchg(&warned_disk_usage, 1))
+               fs_usage_apply_warn(trans, disk_res_sectors);
+}
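
bch2_trans_fs_usage_apply totals the transaction's data and reserved deltas and enforces the rule in the comment above: usage may only grow by what was reserved, so any excess is pulled straight back out of sectors_available and warned about once, while the legitimate growth is charged to the disk reservation and released from online_reserved. A small sketch of that accounting rule on plain integers (the globals here are hypothetical stand-ins):

#include <stdio.h>

struct disk_reservation { long long sectors; };

static long long sectors_available = 1000;
static long long online_reserved = 8;   /* counted when the reservation was taken */

/*
 * Apply 'added' sectors of new usage against a reservation: growth beyond
 * the reservation is not allowed, so the excess is subtracted straight from
 * sectors_available and reported; the rest consumes the reservation.
 */
static int usage_apply(struct disk_reservation *res, long long added)
{
        long long should_not_have_added = added - res->sectors;
        int warn = 0;

        if (should_not_have_added > 0) {
                sectors_available -= should_not_have_added;
                added -= should_not_have_added;
                warn = 1;
        }

        if (added > 0) {
                res->sectors -= added;
                online_reserved -= added;
        }

        return warn;
}

int main(void)
{
        struct disk_reservation res = { .sectors = 8 };

        if (usage_apply(&res, 12))      /* grew by 4 more than reserved */
                printf("disk usage increased more than reserved\n");
        printf("reservation left %lld, available %lld\n",
               res.sectors, sectors_available);
        return 0;
}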
+
 /* trans_mark: */
 
 static struct btree_iter *trans_get_update(struct btree_trans *trans,
@@ -1554,6 +1414,10 @@ static struct btree_iter *trans_get_update(struct btree_trans *trans,
                       bkey_cmp(pos, i->k->k.p) < 0
                     : !bkey_cmp(pos, i->iter->pos))) {
                        *k = bkey_i_to_s_c(i->k);
+
+                       /* ugly hack.. */
+                       BUG_ON(btree_iter_live(trans, i->iter));
+                       trans->iters_live |= 1ULL << i->iter->idx;
                        return i->iter;
                }
 
@@ -1565,7 +1429,7 @@ static int trans_get_key(struct btree_trans *trans,
                         struct btree_iter **iter,
                         struct bkey_s_c *k)
 {
-       unsigned flags = btree_id != BTREE_ID_ALLOC
+       unsigned flags = btree_id != BTREE_ID_alloc
                ? BTREE_ITER_SLOTS
                : BTREE_ITER_CACHED;
        int ret;
@@ -1576,9 +1440,6 @@ static int trans_get_key(struct btree_trans *trans,
 
        *iter = bch2_trans_get_iter(trans, btree_id, pos,
                                    flags|BTREE_ITER_INTENT);
-       if (IS_ERR(*iter))
-               return PTR_ERR(*iter);
-
        *k = __bch2_btree_iter_peek(*iter, flags);
        ret = bkey_err(*k);
        if (ret)
@@ -1586,9 +1447,10 @@ static int trans_get_key(struct btree_trans *trans,
        return ret;
 }
 
-static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter,
-                                        const struct bch_extent_ptr *ptr,
-                                        struct bkey_alloc_unpacked *u)
+static struct bkey_alloc_buf *
+bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter,
+                             const struct bch_extent_ptr *ptr,
+                             struct bkey_alloc_unpacked *u)
 {
        struct bch_fs *c = trans->c;
        struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
@@ -1596,33 +1458,35 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree
        struct bucket *g;
        struct btree_iter *iter;
        struct bkey_s_c k;
+       struct bkey_alloc_buf *a;
        int ret;
 
-       iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k);
+       a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
+       if (IS_ERR(a))
+               return a;
+
+       iter = trans_get_update(trans, BTREE_ID_alloc, pos, &k);
        if (iter) {
                *u = bch2_alloc_unpack(k);
        } else {
-               iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, pos,
+               iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, pos,
                                           BTREE_ITER_CACHED|
                                           BTREE_ITER_CACHED_NOFILL|
                                           BTREE_ITER_INTENT);
-               if (IS_ERR(iter))
-                       return PTR_ERR(iter);
-
                ret = bch2_btree_iter_traverse(iter);
                if (ret) {
                        bch2_trans_iter_put(trans, iter);
-                       return ret;
+                       return ERR_PTR(ret);
                }
 
                percpu_down_read(&c->mark_lock);
                g = bucket(ca, pos.offset);
-               *u = alloc_mem_to_key(g, READ_ONCE(g->mark));
+               *u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark));
                percpu_up_read(&c->mark_lock);
        }
 
        *_iter = iter;
-       return 0;
+       return a;
 }
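
bch2_trans_start_alloc_update now reports failure through the returned pointer itself, using the kernel's ERR_PTR()/IS_ERR()/PTR_ERR() convention, rather than through a separate int. For readers unfamiliar with that encoding, here is a small self-contained sketch of the idea in plain C (negative errno values live in the top 4095 values of the address space); err_ptr/is_err/ptr_err are hypothetical stand-ins, not the kernel's include/linux/err.h.

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define MAX_ERRNO 4095

    /* Encode a negative errno in a pointer value that can never be a
     * valid allocation, and test/decode it again on the caller side. */
    static inline void *err_ptr(long err)      { return (void *) err; }
    static inline long  ptr_err(const void *p) { return (long) p; }
    static inline int   is_err(const void *p)
    {
        return (unsigned long) p >= (unsigned long) -MAX_ERRNO;
    }

    static void *alloc_buf(size_t n)
    {
        void *p = malloc(n);

        return p ? p : err_ptr(-ENOMEM);
    }

    int main(void)
    {
        void *buf = alloc_buf(64);

        if (is_err(buf)) {
            fprintf(stderr, "allocation failed: %ld\n", ptr_err(buf));
            return 1;
        }
        free(buf);
        return 0;
    }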
 
 static int bch2_trans_mark_pointer(struct btree_trans *trans,
@@ -1632,34 +1496,27 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans,
        struct bch_fs *c = trans->c;
        struct btree_iter *iter;
        struct bkey_alloc_unpacked u;
-       struct bkey_i_alloc *a;
+       struct bkey_alloc_buf *a;
        int ret;
 
-       ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u);
-       if (ret)
-               return ret;
+       a = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u);
+       if (IS_ERR(a))
+               return PTR_ERR(a);
 
        ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, u.gen, &u.data_type,
                             &u.dirty_sectors, &u.cached_sectors);
        if (ret)
                goto out;
 
-       a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
-       ret = PTR_ERR_OR_ZERO(a);
-       if (ret)
-               goto out;
-
-       bkey_alloc_init(&a->k_i);
-       a->k.p = iter->pos;
-       bch2_alloc_pack(a, u);
-       bch2_trans_update(trans, iter, &a->k_i, 0);
+       bch2_alloc_pack(c, a, u);
+       bch2_trans_update(trans, iter, &a->k, 0);
 out:
        bch2_trans_iter_put(trans, iter);
        return ret;
 }
 
 static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
-                       struct bch_extent_stripe_ptr p,
+                       struct extent_ptr_decoded p,
                        s64 sectors, enum bch_data_type data_type)
 {
        struct bch_fs *c = trans->c;
@@ -1669,14 +1526,22 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
        struct bch_replicas_padded r;
        int ret = 0;
 
-       ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k);
+       ret = trans_get_key(trans, BTREE_ID_stripes, POS(0, p.ec.idx), &iter, &k);
        if (ret < 0)
                return ret;
 
        if (k.k->type != KEY_TYPE_stripe) {
                bch2_fs_inconsistent(c,
                        "pointer to nonexistent stripe %llu",
-                       (u64) p.idx);
+                       (u64) p.ec.idx);
+               ret = -EIO;
+               goto out;
+       }
+
+       if (!bch2_ptr_matches_stripe(bkey_s_c_to_stripe(k).v, p)) {
+               bch2_fs_inconsistent(c,
+                       "stripe pointer doesn't match stripe %llu",
+                       (u64) p.ec.idx);
                ret = -EIO;
                goto out;
        }
@@ -1687,8 +1552,8 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
                goto out;
 
        bkey_reassemble(&s->k_i, k);
-       stripe_blockcount_set(&s->v, p.block,
-               stripe_blockcount_get(&s->v, p.block) +
+       stripe_blockcount_set(&s->v, p.ec.block,
+               stripe_blockcount_get(&s->v, p.ec.block) +
                sectors);
        bch2_trans_update(trans, iter, &s->k_i, 0);
 
@@ -1739,7 +1604,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
                        dirty_sectors          += disk_sectors;
                        r.e.devs[r.e.nr_devs++] = p.ptr.dev;
                } else {
-                       ret = bch2_trans_mark_stripe_ptr(trans, p.ec,
+                       ret = bch2_trans_mark_stripe_ptr(trans, p,
                                        disk_sectors, data_type);
                        if (ret)
                                return ret;
@@ -1754,59 +1619,108 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
        return 0;
 }
 
+static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans,
+                                           struct bkey_s_c_stripe s,
+                                           unsigned idx, bool deleting)
+{
+       struct bch_fs *c = trans->c;
+       const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
+       struct bkey_alloc_buf *a;
+       struct btree_iter *iter;
+       struct bkey_alloc_unpacked u;
+       bool parity = idx >= s.v->nr_blocks - s.v->nr_redundant;
+       int ret = 0;
+
+       a = bch2_trans_start_alloc_update(trans, &iter, ptr, &u);
+       if (IS_ERR(a))
+               return PTR_ERR(a);
+
+       if (parity) {
+               s64 sectors = le16_to_cpu(s.v->sectors);
+
+               if (deleting)
+                       sectors = -sectors;
+
+               u.dirty_sectors += sectors;
+               u.data_type = u.dirty_sectors
+                       ? BCH_DATA_parity
+                       : 0;
+       }
+
+       if (!deleting) {
+               if (bch2_fs_inconsistent_on(u.stripe && u.stripe != s.k->p.offset, c,
+                               "bucket %llu:%llu gen %u: multiple stripes using same bucket (%u, %llu)",
+                               iter->pos.inode, iter->pos.offset, u.gen,
+                               u.stripe, s.k->p.offset)) {
+                       ret = -EIO;
+                       goto err;
+               }
+
+               u.stripe                = s.k->p.offset;
+               u.stripe_redundancy     = s.v->nr_redundant;
+       } else {
+               u.stripe                = 0;
+               u.stripe_redundancy     = 0;
+       }
+
+       bch2_alloc_pack(c, a, u);
+       bch2_trans_update(trans, iter, &a->k, 0);
+err:
+       bch2_trans_iter_put(trans, iter);
+       return ret;
+}
+
 static int bch2_trans_mark_stripe(struct btree_trans *trans,
-                                 struct bkey_s_c k,
+                                 struct bkey_s_c old, struct bkey_s_c new,
                                  unsigned flags)
 {
-       const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
-       unsigned nr_data = s->nr_blocks - s->nr_redundant;
+       struct bkey_s_c_stripe old_s = { NULL };
+       struct bkey_s_c_stripe new_s = { NULL };
        struct bch_replicas_padded r;
-       struct bkey_alloc_unpacked u;
-       struct bkey_i_alloc *a;
-       struct btree_iter *iter;
-       bool deleting = flags & BTREE_TRIGGER_OVERWRITE;
-       s64 sectors = le16_to_cpu(s->sectors);
        unsigned i;
        int ret = 0;
 
-       if (deleting)
-               sectors = -sectors;
-
-       bch2_bkey_to_replicas(&r.e, k);
-       update_replicas_list(trans, &r.e, sectors * s->nr_redundant);
+       if (old.k->type == KEY_TYPE_stripe)
+               old_s = bkey_s_c_to_stripe(old);
+       if (new.k->type == KEY_TYPE_stripe)
+               new_s = bkey_s_c_to_stripe(new);
 
        /*
-        * The allocator code doesn't necessarily update bucket gens in the
-        * btree when incrementing them, right before handing out new buckets -
-        * we just need to persist those updates here along with the new stripe:
+        * If the pointers aren't changing, we don't need to do anything:
         */
+       if (new_s.k && old_s.k &&
+           new_s.v->nr_blocks          == old_s.v->nr_blocks &&
+           new_s.v->nr_redundant       == old_s.v->nr_redundant &&
+           !memcmp(old_s.v->ptrs, new_s.v->ptrs,
+                   new_s.v->nr_blocks * sizeof(struct bch_extent_ptr)))
+               return 0;
 
-       for (i = 0; i < s->nr_blocks && !ret; i++) {
-               bool parity = i >= nr_data;
+       if (new_s.k) {
+               s64 sectors = le16_to_cpu(new_s.v->sectors);
 
-               ret = bch2_trans_start_alloc_update(trans, &iter,
-                                                   &s->ptrs[i], &u);
-               if (ret)
-                       break;
+               bch2_bkey_to_replicas(&r.e, new);
+               update_replicas_list(trans, &r.e, sectors * new_s.v->nr_redundant);
 
-               if (parity) {
-                       u.dirty_sectors += sectors;
-                       u.data_type = u.dirty_sectors
-                               ? BCH_DATA_parity
-                               : 0;
+               for (i = 0; i < new_s.v->nr_blocks; i++) {
+                       ret = bch2_trans_mark_stripe_alloc_ref(trans, new_s,
+                                                              i, false);
+                       if (ret)
+                               return ret;
                }
+       }
 
-               a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
-               ret = PTR_ERR_OR_ZERO(a);
-               if (ret)
-                       goto put_iter;
-
-               bkey_alloc_init(&a->k_i);
-               a->k.p = iter->pos;
-               bch2_alloc_pack(a, u);
-               bch2_trans_update(trans, iter, &a->k_i, 0);
-put_iter:
-               bch2_trans_iter_put(trans, iter);
+       if (old_s.k) {
+               s64 sectors = -((s64) le16_to_cpu(old_s.v->sectors));
+
+               bch2_bkey_to_replicas(&r.e, old);
+               update_replicas_list(trans, &r.e, sectors * old_s.v->nr_redundant);
+
+               for (i = 0; i < old_s.v->nr_blocks; i++) {
+                       ret = bch2_trans_mark_stripe_alloc_ref(trans, old_s,
+                                                              i, true);
+                       if (ret)
+                               return ret;
+               }
        }
 
        return ret;
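
The reworked stripe trigger is now handed both the old and the new key: it bails out early when the block pointers are unchanged, and otherwise credits the replicas accounting for the new stripe and debits it for the old one. A compact sketch of that old/new delta pattern follows, on a made-up value type; struct stripe_val and replicated_sectors are stand-ins, not bcachefs types.

    #include <stdint.h>
    #include <string.h>

    struct stripe_val {
        uint16_t sectors;
        uint8_t  nr_blocks;
        uint8_t  nr_redundant;
        uint64_t block[8];          /* stand-in for the per-block pointers */
    };

    static int64_t replicated_sectors; /* stand-in for the replicas accounting */

    /* Trigger-style update: account the new value, un-account the old one,
     * and do nothing at all when the layout did not actually change. */
    static void mark_stripe(const struct stripe_val *old, const struct stripe_val *new)
    {
        if (old && new &&
            old->nr_blocks == new->nr_blocks &&
            old->nr_redundant == new->nr_redundant &&
            !memcmp(old->block, new->block,
                    new->nr_blocks * sizeof(old->block[0])))
            return;

        if (new)
            replicated_sectors += (int64_t) new->sectors * new->nr_redundant;
        if (old)
            replicated_sectors -= (int64_t) old->sectors * old->nr_redundant;
    }

    int main(void)
    {
        struct stripe_val a = { .sectors = 256, .nr_blocks = 6, .nr_redundant = 2 };
        struct stripe_val b = a;

        mark_stripe(NULL, &a);      /* create: +512 */
        mark_stripe(&a, &b);        /* pointers unchanged: no-op */
        mark_stripe(&b, NULL);      /* delete: -512 */
        return replicated_sectors != 0;
    }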
@@ -1836,7 +1750,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
        __le64 *refcount;
        s64 ret;
 
-       ret = trans_get_key(trans, BTREE_ID_REFLINK,
+       ret = trans_get_key(trans, BTREE_ID_reflink,
                            POS(0, idx), &iter, &k);
        if (ret < 0)
                return ret;
@@ -1872,8 +1786,6 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
        }
 
        bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k));
-       BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
-
        bch2_trans_update(trans, iter, n, 0);
 out:
        ret = sectors;
@@ -1905,11 +1817,16 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans,
        return ret;
 }
 
-int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k,
+int bch2_trans_mark_key(struct btree_trans *trans,
+                       struct bkey_s_c old,
+                       struct bkey_s_c new,
                        unsigned offset, s64 sectors, unsigned flags)
 {
-       struct replicas_delta_list *d;
        struct bch_fs *c = trans->c;
+       struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old;
+       struct replicas_delta_list *d;
+
+       BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)));
 
        switch (k.k->type) {
        case KEY_TYPE_btree_ptr:
@@ -1925,15 +1842,18 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k,
                return bch2_trans_mark_extent(trans, k, offset, sectors,
                                              flags, BCH_DATA_user);
        case KEY_TYPE_stripe:
-               return bch2_trans_mark_stripe(trans, k, flags);
-       case KEY_TYPE_inode:
-               d = replicas_deltas_realloc(trans, 0);
+               return bch2_trans_mark_stripe(trans, old, new, flags);
+       case KEY_TYPE_inode: {
+               int nr = (new.k->type == KEY_TYPE_inode) -
+                        (old.k->type == KEY_TYPE_inode);
+
+               if (nr) {
+                       d = replicas_deltas_realloc(trans, 0);
+                       d->nr_inodes += nr;
+               }
 
-               if (!(flags & BTREE_TRIGGER_OVERWRITE))
-                       d->nr_inodes++;
-               else
-                       d->nr_inodes--;
                return 0;
+       }
        case KEY_TYPE_reservation: {
                unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
 
@@ -1957,12 +1877,10 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k,
 
 int bch2_trans_mark_update(struct btree_trans *trans,
                           struct btree_iter *iter,
-                          struct bkey_i *insert,
+                          struct bkey_i *new,
                           unsigned flags)
 {
-       struct btree            *b = iter_l(iter)->b;
-       struct btree_node_iter  node_iter = iter_l(iter)->iter;
-       struct bkey_packed      *_k;
+       struct bkey_s_c old;
        int ret;
 
        if (unlikely(flags & BTREE_TRIGGER_NORUN))
@@ -1971,93 +1889,251 @@ int bch2_trans_mark_update(struct btree_trans *trans,
        if (!btree_node_type_needs_gc(iter->btree_id))
                return 0;
 
-       ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(insert),
-                       0, insert->k.size, BTREE_TRIGGER_INSERT);
-       if (ret)
-               return ret;
+       if (!btree_node_type_is_extents(iter->btree_id)) {
+               /* iterators should be uptodate, shouldn't get errors here: */
+               if (btree_iter_type(iter) != BTREE_ITER_CACHED) {
+                       old = bch2_btree_iter_peek_slot(iter);
+                       BUG_ON(bkey_err(old));
+               } else {
+                       struct bkey_cached *ck = (void *) iter->l[0].b;
 
-       if (btree_iter_type(iter) == BTREE_ITER_CACHED) {
-               struct bkey_cached *ck = (void *) iter->l[0].b;
+                       BUG_ON(!ck->valid);
+                       old = bkey_i_to_s_c(ck->k);
+               }
 
-               return bch2_trans_mark_key(trans, bkey_i_to_s_c(ck->k),
-                                          0, 0, BTREE_TRIGGER_OVERWRITE);
-       }
+               if (old.k->type == new->k.type) {
+                       ret   = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0,
+                                       BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
+               } else {
+                       ret   = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0,
+                                       BTREE_TRIGGER_INSERT|flags) ?:
+                               bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0,
+                                       BTREE_TRIGGER_OVERWRITE|flags);
+               }
+       } else {
+               struct btree_iter *copy;
+               struct bkey _old;
 
-       while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) {
-               struct bkey             unpacked;
-               struct bkey_s_c         k;
-               unsigned                offset = 0;
-               s64                     sectors = 0;
-               unsigned                flags = BTREE_TRIGGER_OVERWRITE;
+               EBUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED);
 
-               k = bkey_disassemble(b, _k, &unpacked);
+               bkey_init(&_old);
+               old = (struct bkey_s_c) { &_old, NULL };
 
-               if (btree_node_is_extents(b)
-                   ? bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0
-                   : bkey_cmp(insert->k.p, k.k->p))
-                       break;
+               ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new),
+                                         0, new->k.size,
+                                         BTREE_TRIGGER_INSERT);
+               if (ret)
+                       return ret;
 
-               if (btree_node_is_extents(b)) {
-                       switch (bch2_extent_overlap(&insert->k, k.k)) {
+               copy = bch2_trans_copy_iter(trans, iter);
+
+               for_each_btree_key_continue(copy, 0, old, ret) {
+                       unsigned offset = 0;
+                       s64 sectors = -((s64) old.k->size);
+
+                       flags |= BTREE_TRIGGER_OVERWRITE;
+
+                       if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0)
+                               break;
+
+                       switch (bch2_extent_overlap(&new->k, old.k)) {
                        case BCH_EXTENT_OVERLAP_ALL:
                                offset = 0;
-                               sectors = -((s64) k.k->size);
+                               sectors = -((s64) old.k->size);
                                break;
                        case BCH_EXTENT_OVERLAP_BACK:
-                               offset = bkey_start_offset(&insert->k) -
-                                       bkey_start_offset(k.k);
-                               sectors = bkey_start_offset(&insert->k) -
-                                       k.k->p.offset;
+                               offset = bkey_start_offset(&new->k) -
+                                       bkey_start_offset(old.k);
+                               sectors = bkey_start_offset(&new->k) -
+                                       old.k->p.offset;
                                break;
                        case BCH_EXTENT_OVERLAP_FRONT:
                                offset = 0;
-                               sectors = bkey_start_offset(k.k) -
-                                       insert->k.p.offset;
+                               sectors = bkey_start_offset(old.k) -
+                                       new->k.p.offset;
                                break;
                        case BCH_EXTENT_OVERLAP_MIDDLE:
-                               offset = bkey_start_offset(&insert->k) -
-                                       bkey_start_offset(k.k);
-                               sectors = -((s64) insert->k.size);
+                               offset = bkey_start_offset(&new->k) -
+                                       bkey_start_offset(old.k);
+                               sectors = -((s64) new->k.size);
                                flags |= BTREE_TRIGGER_OVERWRITE_SPLIT;
                                break;
                        }
 
                        BUG_ON(sectors >= 0);
+
+                       ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new),
+                                       offset, sectors, flags);
+                       if (ret)
+                               break;
                }
+               bch2_trans_iter_put(trans, copy);
+       }
 
-               ret = bch2_trans_mark_key(trans, k, offset, sectors, flags);
-               if (ret)
-                       return ret;
+       return ret;
+}
+
+static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
+                                   struct bch_dev *ca, size_t b,
+                                   enum bch_data_type type,
+                                   unsigned sectors)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter *iter;
+       struct bkey_alloc_unpacked u;
+       struct bkey_alloc_buf *a;
+       struct bch_extent_ptr ptr = {
+               .dev = ca->dev_idx,
+               .offset = bucket_to_sector(ca, b),
+       };
+       int ret = 0;
+
+       a = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u);
+       if (IS_ERR(a))
+               return PTR_ERR(a);
 
-               bch2_btree_node_iter_advance(&node_iter, b);
+       if (u.data_type && u.data_type != type) {
+               bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+                       "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
+                       "while marking %s",
+                       iter->pos.inode, iter->pos.offset, u.gen,
+                       bch2_data_types[u.data_type],
+                       bch2_data_types[type],
+                       bch2_data_types[type]);
+               ret = -EIO;
+               goto out;
        }
 
-       return 0;
+       if ((unsigned) (u.dirty_sectors + sectors) > ca->mi.bucket_size) {
+               bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+                       "bucket %llu:%llu gen %u data type %s sector count overflow: %u + %u > %u\n"
+                       "while marking %s",
+                       iter->pos.inode, iter->pos.offset, u.gen,
+                       bch2_data_types[u.data_type ?: type],
+                       u.dirty_sectors, sectors, ca->mi.bucket_size,
+                       bch2_data_types[type]);
+               ret = -EIO;
+               goto out;
+       }
+
+       if (u.data_type         == type &&
+           u.dirty_sectors     == sectors)
+               goto out;
+
+       u.data_type     = type;
+       u.dirty_sectors = sectors;
+
+       bch2_alloc_pack(c, a, u);
+       bch2_trans_update(trans, iter, &a->k, 0);
+out:
+       bch2_trans_iter_put(trans, iter);
+       return ret;
 }
 
-/* Disk reservations: */
+int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
+                                   struct disk_reservation *res,
+                                   struct bch_dev *ca, size_t b,
+                                   enum bch_data_type type,
+                                   unsigned sectors)
+{
+       return __bch2_trans_do(trans, res, NULL, 0,
+                       __bch2_trans_mark_metadata_bucket(trans, ca, b,
+                                                         type, sectors));
 
-static u64 bch2_recalc_sectors_available(struct bch_fs *c)
+}
+
+static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
+                                           struct disk_reservation *res,
+                                           struct bch_dev *ca,
+                                           u64 start, u64 end,
+                                           enum bch_data_type type,
+                                           u64 *bucket, unsigned *bucket_sectors)
 {
-       percpu_u64_set(&c->pcpu->sectors_available, 0);
+       int ret = 0;
+
+       do {
+               u64 b = sector_to_bucket(ca, start);
+               unsigned sectors =
+                       min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
+
+               if (b != *bucket) {
+                       if (*bucket_sectors) {
+                               ret = bch2_trans_mark_metadata_bucket(trans, res, ca,
+                                               *bucket, type, *bucket_sectors);
+                               if (ret)
+                                       return ret;
+                       }
+
+                       *bucket         = b;
+                       *bucket_sectors = 0;
+               }
+
+               *bucket_sectors += sectors;
+               start += sectors;
+       } while (!ret && start < end);
 
-       return avail_factor(__bch2_fs_usage_read_short(c).free);
+       return 0;
 }
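
bch2_trans_mark_metadata_sectors walks a sector range, accumulating sectors into the current bucket and flushing the running total whenever the range crosses a bucket boundary. The standalone sketch below shows just that flush-on-boundary loop with a fixed bucket size; flush_bucket() and the 128-sector bucket are invented for the example.

    #include <stdint.h>
    #include <stdio.h>

    #define BUCKET_SECTORS 128ULL

    static void flush_bucket(uint64_t bucket, unsigned sectors)
    {
        printf("bucket %llu: %u sectors\n", (unsigned long long) bucket, sectors);
    }

    /* Accumulate [start, end) into per-bucket totals, flushing a bucket's
     * total as soon as the range moves past that bucket. */
    static void mark_sectors(uint64_t start, uint64_t end,
                             uint64_t *bucket, unsigned *bucket_sectors)
    {
        while (start < end) {
            uint64_t b = start / BUCKET_SECTORS;
            uint64_t bucket_end = (b + 1) * BUCKET_SECTORS;
            unsigned sectors = (bucket_end < end ? bucket_end : end) - start;

            if (b != *bucket) {
                if (*bucket_sectors)
                    flush_bucket(*bucket, *bucket_sectors);
                *bucket = b;
                *bucket_sectors = 0;
            }

            *bucket_sectors += sectors;
            start += sectors;
        }
    }

    int main(void)
    {
        uint64_t bucket = 0;
        unsigned bucket_sectors = 0;

        mark_sectors(8, 300, &bucket, &bucket_sectors); /* spans three buckets */
        if (bucket_sectors)
            flush_bucket(bucket, bucket_sectors);       /* final partial bucket */
        return 0;
    }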
 
-void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
+static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
+                            struct disk_reservation *res,
+                            struct bch_dev *ca)
 {
-       percpu_down_read(&c->mark_lock);
-       this_cpu_sub(c->usage[0]->online_reserved,
-                    res->sectors);
-       percpu_up_read(&c->mark_lock);
+       struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
+       u64 bucket = 0;
+       unsigned i, bucket_sectors = 0;
+       int ret;
+
+       for (i = 0; i < layout->nr_superblocks; i++) {
+               u64 offset = le64_to_cpu(layout->sb_offset[i]);
+
+               if (offset == BCH_SB_SECTOR) {
+                       ret = bch2_trans_mark_metadata_sectors(trans, res, ca,
+                                               0, BCH_SB_SECTOR,
+                                               BCH_DATA_sb, &bucket, &bucket_sectors);
+                       if (ret)
+                               return ret;
+               }
 
-       res->sectors = 0;
+               ret = bch2_trans_mark_metadata_sectors(trans, res, ca, offset,
+                                     offset + (1 << layout->sb_max_size_bits),
+                                     BCH_DATA_sb, &bucket, &bucket_sectors);
+               if (ret)
+                       return ret;
+       }
+
+       if (bucket_sectors) {
+               ret = bch2_trans_mark_metadata_bucket(trans, res, ca,
+                               bucket, BCH_DATA_sb, bucket_sectors);
+               if (ret)
+                       return ret;
+       }
+
+       for (i = 0; i < ca->journal.nr; i++) {
+               ret = bch2_trans_mark_metadata_bucket(trans, res, ca,
+                               ca->journal.buckets[i],
+                               BCH_DATA_journal, ca->mi.bucket_size);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
 }
 
+int bch2_trans_mark_dev_sb(struct bch_fs *c,
+                          struct disk_reservation *res,
+                          struct bch_dev *ca)
+{
+       return bch2_trans_do(c, res, NULL, 0,
+                       __bch2_trans_mark_dev_sb(&trans, res, ca));
+}
+
+/* Disk reservations: */
+
 #define SECTORS_CACHE  1024
 
 int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
-                             unsigned sectors, int flags)
+                             u64 sectors, int flags)
 {
        struct bch_fs_pcpu *pcpu;
        u64 old, v, get;
@@ -2078,7 +2154,6 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
 
                if (get < sectors) {
                        preempt_enable();
-                       percpu_up_read(&c->mark_lock);
                        goto recalculate;
                }
        } while ((v = atomic64_cmpxchg(&c->sectors_available,
@@ -2088,7 +2163,7 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
 
 out:
        pcpu->sectors_available         -= sectors;
-       this_cpu_add(c->usage[0]->online_reserved, sectors);
+       this_cpu_add(*c->online_reserved, sectors);
        res->sectors                    += sectors;
 
        preempt_enable();
@@ -2096,15 +2171,16 @@ out:
        return 0;
 
 recalculate:
-       percpu_down_write(&c->mark_lock);
+       mutex_lock(&c->sectors_available_lock);
 
-       sectors_available = bch2_recalc_sectors_available(c);
+       percpu_u64_set(&c->pcpu->sectors_available, 0);
+       sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free);
 
        if (sectors <= sectors_available ||
            (flags & BCH_DISK_RESERVATION_NOFAIL)) {
                atomic64_set(&c->sectors_available,
                             max_t(s64, 0, sectors_available - sectors));
-               this_cpu_add(c->usage[0]->online_reserved, sectors);
+               this_cpu_add(*c->online_reserved, sectors);
                res->sectors                    += sectors;
                ret = 0;
        } else {
@@ -2112,7 +2188,8 @@ recalculate:
                ret = -ENOSPC;
        }
 
-       percpu_up_write(&c->mark_lock);
+       mutex_unlock(&c->sectors_available_lock);
+       percpu_up_read(&c->mark_lock);
 
        return ret;
 }
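
bch2_disk_reservation_add now takes sectors out of a per-CPU cache first and refills that cache from the global c->sectors_available counter with a compare-and-swap loop, dropping to the mutex-protected recalculation only when the pool looks exhausted. The sketch below shows just the fast path in userspace C11; a thread-local counter stands in for the per-CPU one, a false return stands in for the slow path, and all names are hypothetical.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define SECTORS_CACHE 1024

    static _Atomic uint64_t sectors_available = 1 << 16;
    static _Thread_local uint64_t cached;   /* stands in for the per-CPU counter */

    static bool reserve_sectors(uint64_t sectors)
    {
        uint64_t old, new, get;

        if (sectors <= cached)
            goto out;

        /* Refill the local cache from the global pool with a CAS loop,
         * grabbing a bit extra so the next few calls stay local. */
        old = atomic_load(&sectors_available);
        do {
            get = sectors - cached;
            if (get < SECTORS_CACHE)
                get = SECTORS_CACHE;
            if (get > old)
                return false;               /* pool looks empty: slow path */
            new = old - get;
        } while (!atomic_compare_exchange_weak(&sectors_available, &old, new));

        cached += get;
    out:
        cached -= sectors;
        return true;
    }

    int main(void)
    {
        bool ok = reserve_sectors(100);

        printf("reserved: %d, cached afterwards: %llu\n",
               ok, (unsigned long long) cached);
        return 0;
    }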
@@ -2141,7 +2218,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
                             ca->mi.bucket_size / c->opts.btree_node_size);
        /* XXX: these should be tunable */
        size_t reserve_none     = max_t(size_t, 1, nbuckets >> 9);
-       size_t copygc_reserve   = max_t(size_t, 2, nbuckets >> 7);
+       size_t copygc_reserve   = max_t(size_t, 2, nbuckets >> 6);
        size_t free_inc_nr      = max(max_t(size_t, 1, nbuckets >> 12),
                                      btree_reserve * 2);
        bool resize = ca->buckets[0] != NULL;
@@ -2158,7 +2235,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
            !(buckets_nouse     = kvpmalloc(BITS_TO_LONGS(nbuckets) *
                                            sizeof(unsigned long),
                                            GFP_KERNEL|__GFP_ZERO)) ||
-           !init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) ||
            !init_fifo(&free[RESERVE_MOVINGGC],
                       copygc_reserve, GFP_KERNEL) ||
            !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
@@ -2245,13 +2321,24 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
                sizeof(struct bucket_array) +
                ca->mi.nbuckets * sizeof(struct bucket));
 
-       free_percpu(ca->usage[0]);
+       for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
+               free_percpu(ca->usage[i]);
+       kfree(ca->usage_base);
 }
 
 int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
 {
-       if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage)))
+       unsigned i;
+
+       ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
+       if (!ca->usage_base)
                return -ENOMEM;
 
+       for (i = 0; i < ARRAY_SIZE(ca->usage); i++) {
+               ca->usage[i] = alloc_percpu(struct bch_dev_usage);
+               if (!ca->usage[i])
+                       return -ENOMEM;
+       }
+
        return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
 }
index a3873becbb70111b173b6c42e369e2bc5012027f..54dcc82798e58019e8e9ec4c17dbda9aff9fb74c 100644 (file)
@@ -58,20 +58,13 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
        return __bucket(ca, b, false);
 }
 
-static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw)
-{
-       return c->bucket_clock[rw].hand - g->io_time[rw];
-}
-
 /*
  * bucket_gc_gen() returns the difference between the bucket's current gen and
  * the oldest gen of any pointer into that bucket in the btree.
  */
 
-static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b)
+static inline u8 bucket_gc_gen(struct bucket *g)
 {
-       struct bucket *g = bucket(ca, b);
-
        return g->mark.gen - g->oldest_gen;
 }
 
@@ -153,18 +146,9 @@ static inline unsigned bucket_sectors_used(struct bucket_mark mark)
        return mark.dirty_sectors + mark.cached_sectors;
 }
 
-static inline bool bucket_unused(struct bucket_mark mark)
-{
-       return !mark.owned_by_allocator &&
-               !mark.data_type &&
-               !bucket_sectors_used(mark);
-}
-
 static inline bool is_available_bucket(struct bucket_mark mark)
 {
-       return (!mark.owned_by_allocator &&
-               !mark.dirty_sectors &&
-               !mark.stripe);
+       return !mark.dirty_sectors && !mark.stripe;
 }
 
 static inline bool bucket_needs_journal_commit(struct bucket_mark m,
@@ -178,8 +162,6 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m,
 
 struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *);
 
-void bch2_dev_usage_from_buckets(struct bch_fs *);
-
 static inline u64 __dev_buckets_available(struct bch_dev *ca,
                                          struct bch_dev_usage stats)
 {
@@ -223,19 +205,21 @@ static inline unsigned fs_usage_u64s(struct bch_fs *c)
                READ_ONCE(c->replicas.nr);
 }
 
-void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage *);
-struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *);
+static inline unsigned dev_usage_u64s(void)
+{
+       return sizeof(struct bch_dev_usage) / sizeof(u64);
+}
 
 u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *);
 
-struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *);
+struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *);
 
 void bch2_fs_usage_acc_to_base(struct bch_fs *, unsigned);
 
 void bch2_fs_usage_to_text(struct printbuf *,
-                          struct bch_fs *, struct bch_fs_usage *);
+                          struct bch_fs *, struct bch_fs_usage_online *);
 
-u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage *);
+u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage_online *);
 
 struct bch_fs_usage_short
 bch2_fs_usage_read_short(struct bch_fs *);
@@ -245,8 +229,6 @@ bch2_fs_usage_read_short(struct bch_fs *);
 void bch2_bucket_seq_cleanup(struct bch_fs *);
 void bch2_fs_usage_initialize(struct bch_fs *);
 
-void bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *,
-                           size_t, struct bucket_mark *);
 void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *,
                            size_t, bool, struct gc_pos, unsigned);
 void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
@@ -255,37 +237,36 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
 
 int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned,
                  s64, struct bch_fs_usage *, u64, unsigned);
-int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
-                       struct disk_reservation *, unsigned);
 
 int bch2_mark_update(struct btree_trans *, struct btree_iter *,
                     struct bkey_i *, struct bch_fs_usage *, unsigned);
 
-int bch2_replicas_delta_list_apply(struct bch_fs *,
-                                  struct bch_fs_usage *,
-                                  struct replicas_delta_list *);
-int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c,
+int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c,
                        unsigned, s64, unsigned);
 int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter,
                           struct bkey_i *insert, unsigned);
-void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *);
+void bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
 
-/* disk reservations: */
+int bch2_trans_mark_metadata_bucket(struct btree_trans *,
+                       struct disk_reservation *, struct bch_dev *,
+                       size_t, enum bch_data_type, unsigned);
+int bch2_trans_mark_dev_sb(struct bch_fs *, struct disk_reservation *,
+                          struct bch_dev *);
 
-void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *);
+/* disk reservations: */
 
 static inline void bch2_disk_reservation_put(struct bch_fs *c,
                                             struct disk_reservation *res)
 {
-       if (res->sectors)
-               __bch2_disk_reservation_put(c, res);
+       this_cpu_sub(*c->online_reserved, res->sectors);
+       res->sectors = 0;
 }
 
 #define BCH_DISK_RESERVATION_NOFAIL            (1 << 0)
 
 int bch2_disk_reservation_add(struct bch_fs *,
-                            struct disk_reservation *,
-                            unsigned, int);
+                             struct disk_reservation *,
+                             u64, int);
 
 static inline struct disk_reservation
 bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas)
@@ -302,8 +283,7 @@ bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas)
 
 static inline int bch2_disk_reservation_get(struct bch_fs *c,
                                            struct disk_reservation *res,
-                                           unsigned sectors,
-                                           unsigned nr_replicas,
+                                           u64 sectors, unsigned nr_replicas,
                                            int flags)
 {
        *res = bch2_disk_reservation_init(c, nr_replicas);
index d6057d22b18e5b14e4bf56a194bb48d7d4953b37..588b1a72adaec00ddc17b24df784b6540a7935b1 100644 (file)
@@ -37,11 +37,12 @@ struct bucket {
                const struct bucket_mark mark;
        };
 
-       u16                             io_time[2];
+       u64                             io_time[2];
        u8                              oldest_gen;
        u8                              gc_gen;
        unsigned                        gen_valid:1;
-       u8                              ec_redundancy;
+       u8                              stripe_redundancy;
+       u32                             stripe;
 };
 
 struct bucket_array {
@@ -52,26 +53,18 @@ struct bucket_array {
 };
 
 struct bch_dev_usage {
-       u64                     buckets[BCH_DATA_NR];
-       u64                     buckets_alloc;
+       u64                     buckets_ec;
        u64                     buckets_unavailable;
 
-       /* _compressed_ sectors: */
-       u64                     sectors[BCH_DATA_NR];
-       u64                     sectors_fragmented;
-
-       u64                     buckets_ec;
-       u64                     sectors_ec;
+       struct {
+               u64             buckets;
+               u64             sectors; /* _compressed_ sectors: */
+               u64             fragmented;
+       }                       d[BCH_DATA_NR];
 };
 
 struct bch_fs_usage {
        /* all fields are in units of 512 byte sectors: */
-
-       u64                     online_reserved;
-
-       /* fields after online_reserved are cleared/recalculated by gc: */
-       u64                     gc_start[0];
-
        u64                     hidden;
        u64                     btree;
        u64                     data;
@@ -91,6 +84,11 @@ struct bch_fs_usage {
        u64                     replicas[];
 };
 
+struct bch_fs_usage_online {
+       u64                     online_reserved;
+       struct bch_fs_usage     u;
+};
+
 struct bch_fs_usage_short {
        u64                     capacity;
        u64                     used;
@@ -98,22 +96,6 @@ struct bch_fs_usage_short {
        u64                     nr_inodes;
 };
 
-struct replicas_delta {
-       s64                     delta;
-       struct bch_replicas_entry r;
-} __packed;
-
-struct replicas_delta_list {
-       unsigned                size;
-       unsigned                used;
-
-       struct                  {} memset_start;
-       u64                     nr_inodes;
-       u64                     persistent_reserved[BCH_REPLICAS_MAX];
-       struct                  {} memset_end;
-       struct replicas_delta   d[0];
-};
-
 /*
  * A reservation for space on disk:
  */
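
struct bch_fs_usage above ends in a flexible array member, u64 replicas[], so its allocation size depends on the current number of replicas entries (fs_usage_u64s() in buckets.h accounts for that). As a reminder of how such a structure is sized and allocated in plain C, here is a generic sketch with made-up field names; it is not the actual bcachefs allocation path.

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    struct usage {
        uint64_t hidden;
        uint64_t btree;
        uint64_t data;
        uint64_t replicas[];        /* one counter per replicas-table entry */
    };

    static struct usage *usage_alloc(unsigned nr_replicas)
    {
        size_t bytes = sizeof(struct usage) + nr_replicas * sizeof(uint64_t);
        struct usage *u = malloc(bytes);

        if (u)
            memset(u, 0, bytes);
        return u;
    }

    int main(void)
    {
        struct usage *u = usage_alloc(8);

        if (!u)
            return 1;
        u->replicas[7] += 128;      /* last of the 8 allocated entries */
        free(u);
        return 0;
    }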
index 0377f9018d27a2d4cbd3d941db6b9f2080584932..c61601476c0d168a180cad686c6fc788aefaa518 100644 (file)
@@ -5,6 +5,7 @@
 #include "bcachefs_ioctl.h"
 #include "buckets.h"
 #include "chardev.h"
+#include "journal.h"
 #include "move.h"
 #include "replicas.h"
 #include "super.h"
@@ -340,7 +341,8 @@ static long bch2_ioctl_data(struct bch_fs *c,
        ctx->c = c;
        ctx->arg = arg;
 
-       ctx->thread = kthread_create(bch2_data_thread, ctx, "[bcachefs]");
+       ctx->thread = kthread_create(bch2_data_thread, ctx,
+                                    "bch-data/%s", c->name);
        if (IS_ERR(ctx->thread)) {
                ret = PTR_ERR(ctx->thread);
                goto err;
@@ -377,7 +379,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c,
 {
        struct bch_ioctl_fs_usage *arg = NULL;
        struct bch_replicas_usage *dst_e, *dst_end;
-       struct bch_fs_usage *src;
+       struct bch_fs_usage_online *src;
        u32 replica_entries_bytes;
        unsigned i;
        int ret = 0;
@@ -403,7 +405,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c,
        arg->online_reserved    = src->online_reserved;
 
        for (i = 0; i < BCH_REPLICAS_MAX; i++)
-               arg->persistent_reserved[i] = src->persistent_reserved[i];
+               arg->persistent_reserved[i] = src->u.persistent_reserved[i];
 
        dst_e   = arg->replicas;
        dst_end = (void *) arg->replicas + replica_entries_bytes;
@@ -417,7 +419,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c,
                        break;
                }
 
-               dst_e->sectors          = src->replicas[i];
+               dst_e->sectors          = src->u.replicas[i];
                dst_e->r                = *src_e;
 
                /* recheck after setting nr_devs: */
@@ -475,11 +477,11 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c,
        arg.nr_buckets          = ca->mi.nbuckets - ca->mi.first_bucket;
        arg.available_buckets   = arg.nr_buckets - src.buckets_unavailable;
        arg.ec_buckets          = src.buckets_ec;
-       arg.ec_sectors          = src.sectors_ec;
+       arg.ec_sectors          = 0;
 
        for (i = 0; i < BCH_DATA_NR; i++) {
-               arg.buckets[i] = src.buckets[i];
-               arg.sectors[i] = src.sectors[i];
+               arg.buckets[i] = src.d[i].buckets;
+               arg.sectors[i] = src.d[i].sectors;
        }
 
        percpu_ref_put(&ca->ref);
@@ -563,6 +565,26 @@ static long bch2_ioctl_disk_resize(struct bch_fs *c,
        return ret;
 }
 
+static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
+                                  struct bch_ioctl_disk_resize_journal arg)
+{
+       struct bch_dev *ca;
+       int ret;
+
+       if ((arg.flags & ~BCH_BY_INDEX) ||
+           arg.pad)
+               return -EINVAL;
+
+       ca = bch2_device_lookup(c, arg.dev, arg.flags);
+       if (IS_ERR(ca))
+               return PTR_ERR(ca);
+
+       ret = bch2_set_nr_journal_buckets(c, ca, arg.nbuckets);
+
+       percpu_ref_put(&ca->ref);
+       return ret;
+}
+
 #define BCH_IOCTL(_name, _argtype)                                     \
 do {                                                                   \
        _argtype i;                                                     \
@@ -619,6 +641,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
                BCH_IOCTL(data, struct bch_ioctl_data);
        case BCH_IOCTL_DISK_RESIZE:
                BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize);
+       case BCH_IOCTL_DISK_RESIZE_JOURNAL:
+               BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal);
 
        default:
                return -ENOTTY;
index 24dee8039d57bd37f652e85273d0089abd9a5c3d..728b7ef1a1490f7bd2e38db93df4729d3eec9dd2 100644 (file)
@@ -77,11 +77,11 @@ static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type,
                                                       bool data)
 {
        switch (type) {
-       case BCH_CSUM_OPT_NONE:
+       case BCH_CSUM_OPT_none:
             return BCH_CSUM_NONE;
-       case BCH_CSUM_OPT_CRC32C:
+       case BCH_CSUM_OPT_crc32c:
             return data ? BCH_CSUM_CRC32C : BCH_CSUM_CRC32C_NONZERO;
-       case BCH_CSUM_OPT_CRC64:
+       case BCH_CSUM_OPT_crc64:
             return data ? BCH_CSUM_CRC64 : BCH_CSUM_CRC64_NONZERO;
        default:
             BUG();
index 1d1590de55e85b9a484882104057ecfc7a7a6478..4324cfe7eed0de48ef2a26d97b024d3bb5d712b5 100644 (file)
@@ -19,7 +19,7 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
 
        spin_lock(&clock->timer_lock);
 
-       if (time_after_eq((unsigned long) atomic_long_read(&clock->now),
+       if (time_after_eq((unsigned long) atomic64_read(&clock->now),
                          timer->expire)) {
                spin_unlock(&clock->timer_lock);
                timer->fn(timer);
@@ -146,7 +146,7 @@ static struct io_timer *get_expired_timer(struct io_clock *clock,
 void __bch2_increment_clock(struct io_clock *clock, unsigned sectors)
 {
        struct io_timer *timer;
-       unsigned long now = atomic_long_add_return(sectors, &clock->now);
+       unsigned long now = atomic64_add_return(sectors, &clock->now);
 
        while ((timer = get_expired_timer(clock, now)))
                timer->fn(timer);
@@ -158,7 +158,7 @@ void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock)
        unsigned i;
 
        spin_lock(&clock->timer_lock);
-       now = atomic_long_read(&clock->now);
+       now = atomic64_read(&clock->now);
 
        for (i = 0; i < clock->timers.used; i++)
                pr_buf(out, "%ps:\t%li\n",
@@ -175,7 +175,7 @@ void bch2_io_clock_exit(struct io_clock *clock)
 
 int bch2_io_clock_init(struct io_clock *clock)
 {
-       atomic_long_set(&clock->now, 0);
+       atomic64_set(&clock->now, 0);
        spin_lock_init(&clock->timer_lock);
 
        clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus();
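
clock->now is widened here to a 64-bit atomic: the IO clock is not wall time but a counter advanced by the number of sectors written, and timers fire once that counter passes their expiry. A toy, single-threaded version of the idea follows; there is no locking, unlike the spinlock-protected timer heap in the real code, and every name is invented.

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    struct io_timer {
        uint64_t expire;                    /* in sectors written, not wall time */
        void (*fn)(struct io_timer *);
        struct io_timer *next;
    };

    static _Atomic uint64_t io_clock_now;   /* total sectors written so far */
    static struct io_timer *timers;         /* unsorted singly-linked list */

    static void io_timer_add(struct io_timer *t)
    {
        if (atomic_load(&io_clock_now) >= t->expire) {
            t->fn(t);                       /* already expired: run immediately */
            return;
        }
        t->next = timers;
        timers = t;
    }

    /* Advance the clock by the sectors just written and run any timers
     * whose expiry has now been passed. */
    static void io_clock_advance(unsigned sectors)
    {
        uint64_t now = atomic_fetch_add(&io_clock_now, sectors) + sectors;
        struct io_timer **p = &timers;

        while (*p) {
            struct io_timer *t = *p;

            if (now >= t->expire) {
                *p = t->next;
                t->fn(t);
            } else {
                p = &t->next;
            }
        }
    }

    static void hello(struct io_timer *t) { (void) t; puts("timer fired"); }

    int main(void)
    {
        struct io_timer t = { .expire = 100, .fn = hello };

        io_timer_add(&t);
        io_clock_advance(64);               /* now = 64: nothing fires */
        io_clock_advance(64);               /* now = 128: passes 100, fires */
        return 0;
    }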
index 92c740a475656da2093528b802320ad73eabbe46..5fae0012d808f7a1b5f4e5334804eee50c31d577 100644 (file)
@@ -26,7 +26,7 @@ struct io_timer {
 typedef HEAP(struct io_timer *)        io_timer_heap;
 
 struct io_clock {
-       atomic_long_t           now;
+       atomic64_t              now;
        u16 __percpu            *pcpu_buf;
        unsigned                max_slop;
 
index b50d2b0d5fd33f3f37dd5c874f12e95709cf5749..f63651d291e53de8737c52254fe222bf07512ed3 100644 (file)
@@ -70,7 +70,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
 
        BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max);
 
-       if (!IS_ENABLED(CONFIG_HIGHMEM) &&
+       if (!PageHighMem(bio_iter_page(bio, start)) &&
            bio_phys_contig(bio, start))
                return (struct bbuf) {
                        .b = page_address(bio_iter_page(bio, start)) +
@@ -336,8 +336,19 @@ static int attempt_compress(struct bch_fs *c,
                ZSTD_CCtx *ctx = ZSTD_initCCtx(workspace,
                        ZSTD_CCtxWorkspaceBound(c->zstd_params.cParams));
 
+               /*
+                * ZSTD requires that when we decompress we pass in the exact
+                * compressed size - rounding it up to the nearest sector
+                * doesn't work, so we use the first 4 bytes of the buffer for
+                * that.
+                *
+                * Additionally, the ZSTD code seems to have a bug where it will
+                * write just past the end of the buffer - so subtract a fudge
+                * factor (7 bytes) from the dst buffer size to account for
+                * that.
+                */
                size_t len = ZSTD_compressCCtx(ctx,
-                               dst + 4,        dst_len - 4,
+                               dst + 4,        dst_len - 4 - 7,
                                src,            src_len,
                                c->zstd_params);
                if (ZSTD_isError(len))
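
The comment added above explains why the compressed stream is prefixed with its exact length: ZSTD decompression needs the true compressed size, and a sector-rounded value cannot provide it. Here is a standalone illustration of that length-prefix framing, using a plain native-endian u32; the real on-disk layout in bcachefs may differ (for instance in endianness).

    #include <stdint.h>
    #include <string.h>

    /* Frame a compressed payload with its exact byte length so the
     * decompressor never has to work from a rounded-up size. */
    static size_t frame_compressed(void *dst, size_t dst_len,
                                   const void *payload, uint32_t payload_len)
    {
        if (dst_len < sizeof(payload_len) + payload_len)
            return 0;

        memcpy(dst, &payload_len, sizeof(payload_len));
        memcpy((char *) dst + sizeof(payload_len), payload, payload_len);
        return sizeof(payload_len) + payload_len;
    }

    static uint32_t framed_payload_len(const void *src)
    {
        uint32_t len;

        memcpy(&len, src, sizeof(len));
        return len;
    }

    int main(void)
    {
        char buf[64];
        size_t used = frame_compressed(buf, sizeof(buf), "abc", 3);

        return !(used == 7 && framed_payload_len(buf) == 3);
    }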
index aa10591a3b1a8f3b84322080b989f00547282dc3..acf600387c9fe3378723289c9eb6c7bb8aa6e9af 100644 (file)
@@ -54,7 +54,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
        v->written      = 0;
        v->c.level      = b->c.level;
        v->c.btree_id   = b->c.btree_id;
-       bch2_btree_keys_init(v, &c->expensive_debug_checks);
+       bch2_btree_keys_init(v);
 
        if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
                                       NULL, &pick) <= 0)
@@ -79,7 +79,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
 
        memcpy(n_ondisk, n_sorted, btree_bytes(c));
 
-       if (bch2_btree_node_read_done(c, v, false))
+       if (bch2_btree_node_read_done(c, ca, v, false))
                goto out;
 
        n_sorted = c->verify_data->data;
@@ -222,7 +222,9 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
 
        bch2_trans_init(&trans, i->c, 0, 0);
 
-       iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH);
+       iter = bch2_trans_get_iter(&trans, i->id, i->from,
+                                  BTREE_ITER_PREFETCH|
+                                  BTREE_ITER_ALL_SNAPSHOTS);
        k = bch2_btree_iter_peek(iter);
 
        while (k.k && !(err = bkey_err(k))) {
@@ -242,6 +244,8 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
                if (!i->size)
                        break;
        }
+       bch2_trans_iter_put(&trans, iter);
+
        bch2_trans_exit(&trans);
 
        return err < 0 ? err : i->ret;
@@ -271,7 +275,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
        if (err)
                return err;
 
-       if (!i->size || !bkey_cmp(POS_MAX, i->from))
+       if (!i->size || !bpos_cmp(POS_MAX, i->from))
                return i->ret;
 
        bch2_trans_init(&trans, i->c, 0, 0);
@@ -287,13 +291,15 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
                 * can't easily correctly restart a btree node traversal across
                 * all nodes, meh
                 */
-               i->from = bkey_cmp(POS_MAX, b->key.k.p)
-                       ? bkey_successor(b->key.k.p)
+               i->from = bpos_cmp(POS_MAX, b->key.k.p)
+                       ? bpos_successor(b->key.k.p)
                        : b->key.k.p;
 
                if (!i->size)
                        break;
        }
+       bch2_trans_iter_put(&trans, iter);
+
        bch2_trans_exit(&trans);
 
        return err < 0 ? err : i->ret;
@@ -352,7 +358,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
                if (err)
                        break;
 
-               bch2_btree_iter_next(iter);
+               bch2_btree_iter_advance(iter);
                i->from = iter->pos;
 
                err = flush_buf(i);
index 56c2d1ab5f630de8ce1b2484f934bc90dceb12eb..7ac1615e9447db326533ac6db5c1b129f02481a2 100644 (file)
@@ -8,44 +8,15 @@ struct bio;
 struct btree;
 struct bch_fs;
 
-#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name;
-BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
-#define BCH_DEBUG_PARAM(name, description)                             \
-       static inline bool name(struct bch_fs *c)                       \
-       { return bch2_##name || c->name;        }
-BCH_DEBUG_PARAMS_ALWAYS()
-#undef BCH_DEBUG_PARAM
-
 #ifdef CONFIG_BCACHEFS_DEBUG
-
-#define BCH_DEBUG_PARAM(name, description)                             \
-       static inline bool name(struct bch_fs *c)                       \
-       { return bch2_##name || c->name;        }
-BCH_DEBUG_PARAMS_DEBUG()
-#undef BCH_DEBUG_PARAM
-
 void __bch2_btree_verify(struct bch_fs *, struct btree *);
-
-#define bypass_torture_test(d)         ((d)->bypass_torture_test)
-
-#else /* DEBUG */
-
-#define BCH_DEBUG_PARAM(name, description)                             \
-       static inline bool name(struct bch_fs *c) { return false; }
-BCH_DEBUG_PARAMS_DEBUG()
-#undef BCH_DEBUG_PARAM
-
+#else
 static inline void __bch2_btree_verify(struct bch_fs *c, struct btree *b) {}
-
-#define bypass_torture_test(d)         0
-
 #endif
 
 static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b)
 {
-       if (verify_btree_ondisk(c))
+       if (bch2_verify_btree_ondisk)
                __bch2_btree_verify(c, b);
 }
 
index f34bfda8ab0d6be5abdcae5c972442a3d6de8f8f..cf4ce2e7f29c1d51cad6c8c14f43822ea26a6079 100644 (file)
@@ -64,7 +64,7 @@ static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
 }
 
 const struct bch_hash_desc bch2_dirent_hash_desc = {
-       .btree_id       = BTREE_ID_DIRENTS,
+       .btree_id       = BTREE_ID_dirents,
        .key_type       = KEY_TYPE_dirent,
        .hash_key       = dirent_hash_key,
        .hash_bkey      = dirent_hash_bkey,
@@ -141,7 +141,7 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
 int bch2_dirent_create(struct btree_trans *trans,
                       u64 dir_inum, const struct bch_hash_info *hash_info,
                       u8 type, const struct qstr *name, u64 dst_inum,
-                      int flags)
+                      u64 *dir_offset, int flags)
 {
        struct bkey_i_dirent *dirent;
        int ret;
@@ -151,8 +151,11 @@ int bch2_dirent_create(struct btree_trans *trans,
        if (ret)
                return ret;
 
-       return bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
-                            dir_inum, &dirent->k_i, flags);
+       ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
+                           dir_inum, &dirent->k_i, flags);
+       *dir_offset = dirent->k.p.offset;
+
+       return ret;
 }
 
 static void dirent_copy_target(struct bkey_i_dirent *dst,
@@ -165,8 +168,8 @@ static void dirent_copy_target(struct bkey_i_dirent *dst,
 int bch2_dirent_rename(struct btree_trans *trans,
                       u64 src_dir, struct bch_hash_info *src_hash,
                       u64 dst_dir, struct bch_hash_info *dst_hash,
-                      const struct qstr *src_name, u64 *src_inum,
-                      const struct qstr *dst_name, u64 *dst_inum,
+                      const struct qstr *src_name, u64 *src_inum, u64 *src_offset,
+                      const struct qstr *dst_name, u64 *dst_inum, u64 *dst_offset,
                       enum bch_rename_mode mode)
 {
        struct btree_iter *src_iter = NULL, *dst_iter = NULL;
@@ -255,14 +258,14 @@ int bch2_dirent_rename(struct btree_trans *trans,
                                new_dst->k.p = src_iter->pos;
                                bch2_trans_update(trans, src_iter,
                                                  &new_dst->k_i, 0);
-                               goto out;
+                               goto out_set_offset;
                        } else {
                                /* If we're overwriting, we can't insert new_dst
                                 * at a different slot because it has to
                                 * overwrite old_dst - just make sure to use a
                                 * whiteout when deleting src:
                                 */
-                               new_src->k.type = KEY_TYPE_whiteout;
+                               new_src->k.type = KEY_TYPE_hash_whiteout;
                        }
                } else {
                        /* Check if we need a whiteout to delete src: */
@@ -272,12 +275,15 @@ int bch2_dirent_rename(struct btree_trans *trans,
                                goto out;
 
                        if (ret)
-                               new_src->k.type = KEY_TYPE_whiteout;
+                               new_src->k.type = KEY_TYPE_hash_whiteout;
                }
        }
 
        bch2_trans_update(trans, src_iter, &new_src->k_i, 0);
        bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0);
+out_set_offset:
+       *src_offset = new_src->k.p.offset;
+       *dst_offset = new_dst->k.p.offset;
 out:
        bch2_trans_iter_put(trans, src_iter);
        bch2_trans_iter_put(trans, dst_iter);
@@ -321,6 +327,7 @@ u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum,
 
        k = bch2_btree_iter_peek_slot(iter);
        inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum);
+       bch2_trans_iter_put(&trans, iter);
 out:
        bch2_trans_exit(&trans);
        return inum;
@@ -332,7 +339,7 @@ int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum)
        struct bkey_s_c k;
        int ret;
 
-       for_each_btree_key(trans, iter, BTREE_ID_DIRENTS,
+       for_each_btree_key(trans, iter, BTREE_ID_dirents,
                           POS(dir_inum, 0), 0, k, ret) {
                if (k.k->p.inode > dir_inum)
                        break;
@@ -357,7 +364,7 @@ int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx)
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS,
+       for_each_btree_key(&trans, iter, BTREE_ID_dirents,
                           POS(inum, ctx->pos), 0, k, ret) {
                if (k.k->p.inode > inum)
                        break;
@@ -379,6 +386,8 @@ int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx)
                        break;
                ctx->pos = dirent.k->p.offset + 1;
        }
+       bch2_trans_iter_put(&trans, iter);
+
        ret = bch2_trans_exit(&trans) ?: ret;
 
        return ret;
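
The dirent hunks above thread a new u64 out-parameter through bch2_dirent_create() and bch2_dirent_rename(), handing the offset (hash position) of the dirent key back to the caller. A minimal caller sketch under assumed context; only bch2_dirent_create() itself is from the patch, the surrounding names (trans, dir_inum, hash_info, name, dst_inum, flags) are illustrative stand-ins for the real call sites:

        u64 dir_offset = 0;
        int ret = bch2_dirent_create(trans, dir_inum, hash_info,
                                     DT_REG, name, dst_inum,
                                     &dir_offset, flags);
        if (ret)
                return ret;
        /* dir_offset now holds dirent->k.p.offset, the hash slot the new
         * dirent landed at, which the caller can record. */
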
diff --git a/libbcachefs/dirent.h b/libbcachefs/dirent.h
index 34769371dd13d1725dc96f2c6d01a5fb10d9ecd6..e1d8ce377d43755cd5584edf1afa05d0ba65495e 100644 (file)
@@ -31,7 +31,7 @@ static inline unsigned dirent_val_u64s(unsigned len)
 
 int bch2_dirent_create(struct btree_trans *, u64,
                       const struct bch_hash_info *, u8,
-                      const struct qstr *, u64, int);
+                      const struct qstr *, u64, u64 *, int);
 
 int bch2_dirent_delete_at(struct btree_trans *,
                          const struct bch_hash_info *,
@@ -46,8 +46,8 @@ enum bch_rename_mode {
 int bch2_dirent_rename(struct btree_trans *,
                       u64, struct bch_hash_info *,
                       u64, struct bch_hash_info *,
-                      const struct qstr *, u64 *,
-                      const struct qstr *, u64 *,
+                      const struct qstr *, u64 *, u64 *,
+                      const struct qstr *, u64 *, u64 *,
                       enum bch_rename_mode);
 
 struct btree_iter *
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index e4a4805ef218d4973d15933c34a30c8a1e3c45af..f712f685dd0e09122dd4b91a4a63ac2e9fd83cd2 100644 (file)
@@ -4,7 +4,7 @@
 
 #include "bcachefs.h"
 #include "alloc_foreground.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
 #include "bset.h"
 #include "btree_gc.h"
 #include "btree_update.h"
@@ -105,6 +105,9 @@ const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
        const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
 
+       if (!bkey_cmp(k.k->p, POS_MIN))
+               return "stripe at pos 0";
+
        if (k.k->p.inode)
                return "invalid stripe key";
 
@@ -138,44 +141,19 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
                       stripe_blockcount_get(s, i));
 }
 
-static int ptr_matches_stripe(struct bch_fs *c,
-                             struct bch_stripe *v,
-                             const struct bch_extent_ptr *ptr)
+/* returns blocknr in stripe that we matched: */
+static int bkey_matches_stripe(struct bch_stripe *s,
+                              struct bkey_s_c k)
 {
-       unsigned i;
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       const struct bch_extent_ptr *ptr;
+       unsigned i, nr_data = s->nr_blocks - s->nr_redundant;
 
-       for (i = 0; i < v->nr_blocks - v->nr_redundant; i++) {
-               const struct bch_extent_ptr *ptr2 = v->ptrs + i;
-
-               if (ptr->dev == ptr2->dev &&
-                   ptr->gen == ptr2->gen &&
-                   ptr->offset >= ptr2->offset &&
-                   ptr->offset <  ptr2->offset + le16_to_cpu(v->sectors))
-                       return i;
-       }
-
-       return -1;
-}
-
-static int extent_matches_stripe(struct bch_fs *c,
-                                struct bch_stripe *v,
-                                struct bkey_s_c k)
-{
-
-       switch (k.k->type) {
-       case KEY_TYPE_extent: {
-               struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
-               const struct bch_extent_ptr *ptr;
-               int idx;
-
-               extent_for_each_ptr(e, ptr) {
-                       idx = ptr_matches_stripe(c, v, ptr);
-                       if (idx >= 0)
-                               return idx;
-               }
-               break;
-       }
-       }
+       bkey_for_each_ptr(ptrs, ptr)
+               for (i = 0; i < nr_data; i++)
+                       if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr,
+                                                     le16_to_cpu(s->sectors)))
+                               return i;
 
        return -1;
 }
@@ -200,46 +178,95 @@ static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
        return false;
 }
 
+/* Stripe bufs: */
+
+static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
+{
+       unsigned i;
+
+       for (i = 0; i < buf->key.v.nr_blocks; i++) {
+               kvpfree(buf->data[i], buf->size << 9);
+               buf->data[i] = NULL;
+       }
+}
+
+static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
+                              unsigned offset, unsigned size)
+{
+       struct bch_stripe *v = &buf->key.v;
+       unsigned csum_granularity = 1U << v->csum_granularity_bits;
+       unsigned end = offset + size;
+       unsigned i;
+
+       BUG_ON(end > le16_to_cpu(v->sectors));
+
+       offset  = round_down(offset, csum_granularity);
+       end     = min_t(unsigned, le16_to_cpu(v->sectors),
+                       round_up(end, csum_granularity));
+
+       buf->offset     = offset;
+       buf->size       = end - offset;
+
+       memset(buf->valid, 0xFF, sizeof(buf->valid));
+
+       for (i = 0; i < buf->key.v.nr_blocks; i++) {
+               buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL);
+               if (!buf->data[i])
+                       goto err;
+       }
+
+       return 0;
+err:
+       ec_stripe_buf_exit(buf);
+       return -ENOMEM;
+}
+
 /* Checksumming: */
 
-static void ec_generate_checksums(struct ec_stripe_buf *buf)
+static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf,
+                                        unsigned block, unsigned offset)
 {
        struct bch_stripe *v = &buf->key.v;
        unsigned csum_granularity = 1 << v->csum_granularity_bits;
-       unsigned csums_per_device = stripe_csums_per_device(v);
-       unsigned csum_bytes = bch_crc_bytes[v->csum_type];
-       unsigned i, j;
+       unsigned end = buf->offset + buf->size;
+       unsigned len = min(csum_granularity, end - offset);
+
+       BUG_ON(offset >= end);
+       BUG_ON(offset <  buf->offset);
+       BUG_ON(offset & (csum_granularity - 1));
+       BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
+              (len & (csum_granularity - 1)));
+
+       return bch2_checksum(NULL, v->csum_type,
+                            null_nonce(),
+                            buf->data[block] + ((offset - buf->offset) << 9),
+                            len << 9);
+}
 
-       if (!csum_bytes)
+static void ec_generate_checksums(struct ec_stripe_buf *buf)
+{
+       struct bch_stripe *v = &buf->key.v;
+       unsigned i, j, csums_per_device = stripe_csums_per_device(v);
+
+       if (!v->csum_type)
                return;
 
        BUG_ON(buf->offset);
        BUG_ON(buf->size != le16_to_cpu(v->sectors));
 
-       for (i = 0; i < v->nr_blocks; i++) {
-               for (j = 0; j < csums_per_device; j++) {
-                       unsigned offset = j << v->csum_granularity_bits;
-                       unsigned len = min(csum_granularity, buf->size - offset);
-
-                       struct bch_csum csum =
-                               bch2_checksum(NULL, v->csum_type,
-                                             null_nonce(),
-                                             buf->data[i] + (offset << 9),
-                                             len << 9);
-
-                       memcpy(stripe_csum(v, i, j), &csum, csum_bytes);
-               }
-       }
+       for (i = 0; i < v->nr_blocks; i++)
+               for (j = 0; j < csums_per_device; j++)
+                       stripe_csum_set(v, i, j,
+                               ec_block_checksum(buf, i, j << v->csum_granularity_bits));
 }
 
 static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
 {
        struct bch_stripe *v = &buf->key.v;
        unsigned csum_granularity = 1 << v->csum_granularity_bits;
-       unsigned csum_bytes = bch_crc_bytes[v->csum_type];
        unsigned i;
 
-       if (!csum_bytes)
+       if (!v->csum_type)
                return;
 
        for (i = 0; i < v->nr_blocks; i++) {
@@ -252,21 +279,18 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
                while (offset < end) {
                        unsigned j = offset >> v->csum_granularity_bits;
                        unsigned len = min(csum_granularity, end - offset);
-                       struct bch_csum csum;
+                       struct bch_csum want = stripe_csum_get(v, i, j);
+                       struct bch_csum got = ec_block_checksum(buf, i, offset);
 
-                       BUG_ON(offset & (csum_granularity - 1));
-                       BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
-                              ((offset + len) & (csum_granularity - 1)));
+                       if (bch2_crc_cmp(want, got)) {
+                               char buf2[200];
 
-                       csum = bch2_checksum(NULL, v->csum_type,
-                                            null_nonce(),
-                                            buf->data[i] + ((offset - buf->offset) << 9),
-                                            len << 9);
+                               bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&buf->key.k_i));
 
-                       if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) {
-                               __bcache_io_error(c,
-                                       "checksum error while doing reconstruct read (%u:%u)",
-                                       i, j);
+                               bch_err_ratelimited(c,
+                                       "stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s",
+                                       (void *) _RET_IP_, i, j, v->csum_type,
+                                       want.lo, got.lo, buf2);
                                clear_bit(i, buf->valid);
                                break;
                        }
@@ -287,25 +311,21 @@ static void ec_generate_ec(struct ec_stripe_buf *buf)
        raid_gen(nr_data, v->nr_redundant, bytes, buf->data);
 }
 
-static unsigned __ec_nr_failed(struct ec_stripe_buf *buf, unsigned nr)
-{
-       return nr - bitmap_weight(buf->valid, nr);
-}
-
 static unsigned ec_nr_failed(struct ec_stripe_buf *buf)
 {
-       return __ec_nr_failed(buf, buf->key.v.nr_blocks);
+       return buf->key.v.nr_blocks -
+               bitmap_weight(buf->valid, buf->key.v.nr_blocks);
 }
 
 static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
 {
        struct bch_stripe *v = &buf->key.v;
-       unsigned i, failed[EC_STRIPE_MAX], nr_failed = 0;
+       unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0;
        unsigned nr_data = v->nr_blocks - v->nr_redundant;
        unsigned bytes = buf->size << 9;
 
        if (ec_nr_failed(buf) > v->nr_redundant) {
-               __bcache_io_error(c,
+               bch_err_ratelimited(c,
                        "error doing reconstruct read: unable to read enough blocks");
                return -1;
        }
@@ -323,14 +343,23 @@ static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
 static void ec_block_endio(struct bio *bio)
 {
        struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
+       struct bch_stripe *v = &ec_bio->buf->key.v;
+       struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx];
        struct bch_dev *ca = ec_bio->ca;
        struct closure *cl = bio->bi_private;
 
-       if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s: %s",
+       if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s error: %s",
                               bio_data_dir(bio) ? "write" : "read",
                               bch2_blk_status_to_str(bio->bi_status)))
                clear_bit(ec_bio->idx, ec_bio->buf->valid);
 
+       if (ptr_stale(ca, ptr)) {
+               bch_err_ratelimited(ca->fs,
+                                   "error %s stripe: stale pointer after io",
+                                   bio_data_dir(bio) == READ ? "reading from" : "writing to");
+               clear_bit(ec_bio->idx, ec_bio->buf->valid);
+       }
+
        bio_put(&ec_bio->bio);
        percpu_ref_put(&ca->io_ref);
        closure_put(cl);
@@ -347,6 +376,14 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
                ? BCH_DATA_user
                : BCH_DATA_parity;
 
+       if (ptr_stale(ca, ptr)) {
+               bch_err_ratelimited(c,
+                                   "error %s stripe: stale pointer",
+                                   rw == READ ? "reading from" : "writing to");
+               clear_bit(idx, buf->valid);
+               return;
+       }
+
        if (!bch2_dev_get_ioref(ca, rw)) {
                clear_bit(idx, buf->valid);
                return;
@@ -389,92 +426,82 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
        percpu_ref_put(&ca->io_ref);
 }
 
-/* recovery read path: */
-int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
+static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe)
 {
        struct btree_trans trans;
        struct btree_iter *iter;
+       struct bkey_s_c k;
+       int ret;
+
+       bch2_trans_init(&trans, c, 0, 0);
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS(0, idx), BTREE_ITER_SLOTS);
+       k = bch2_btree_iter_peek_slot(iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+       if (k.k->type != KEY_TYPE_stripe) {
+               ret = -ENOENT;
+               goto err;
+       }
+       bkey_reassemble(&stripe->key.k_i, k);
+err:
+       bch2_trans_exit(&trans);
+       return ret;
+}
+
+/* recovery read path: */
+int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
+{
        struct ec_stripe_buf *buf;
        struct closure cl;
-       struct bkey_s_c k;
        struct bch_stripe *v;
-       unsigned stripe_idx;
-       unsigned offset, end;
-       unsigned i, nr_data, csum_granularity;
-       int ret = 0, idx;
+       unsigned i, offset;
+       int ret = 0;
 
        closure_init_stack(&cl);
 
        BUG_ON(!rbio->pick.has_ec);
 
-       stripe_idx = rbio->pick.ec.idx;
-
        buf = kzalloc(sizeof(*buf), GFP_NOIO);
        if (!buf)
                return -ENOMEM;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_EC,
-                                  POS(0, stripe_idx),
-                                  BTREE_ITER_SLOTS);
-       k = bch2_btree_iter_peek_slot(iter);
-       if (bkey_err(k) || k.k->type != KEY_TYPE_stripe) {
-               __bcache_io_error(c,
-                       "error doing reconstruct read: stripe not found");
+       ret = get_stripe_key(c, rbio->pick.ec.idx, buf);
+       if (ret) {
+               bch_err_ratelimited(c,
+                       "error doing reconstruct read: error %i looking up stripe", ret);
                kfree(buf);
-               return bch2_trans_exit(&trans) ?: -EIO;
+               return -EIO;
        }
 
-       bkey_reassemble(&buf->key.k_i, k);
-       bch2_trans_exit(&trans);
-
        v = &buf->key.v;
 
-       nr_data = v->nr_blocks - v->nr_redundant;
-
-       idx = ptr_matches_stripe(c, v, &rbio->pick.ptr);
-       BUG_ON(idx < 0);
-
-       csum_granularity = 1U << v->csum_granularity_bits;
-
-       offset  = rbio->bio.bi_iter.bi_sector - v->ptrs[idx].offset;
-       end     = offset + bio_sectors(&rbio->bio);
-
-       BUG_ON(end > le16_to_cpu(v->sectors));
-
-       buf->offset     = round_down(offset, csum_granularity);
-       buf->size       = min_t(unsigned, le16_to_cpu(v->sectors),
-                               round_up(end, csum_granularity)) - buf->offset;
-
-       for (i = 0; i < v->nr_blocks; i++) {
-               buf->data[i] = kmalloc(buf->size << 9, GFP_NOIO);
-               if (!buf->data[i]) {
-                       ret = -ENOMEM;
-                       goto err;
-               }
+       if (!bch2_ptr_matches_stripe(v, rbio->pick)) {
+               bch_err_ratelimited(c,
+                       "error doing reconstruct read: pointer doesn't match stripe");
+               ret = -EIO;
+               goto err;
        }
 
-       memset(buf->valid, 0xFF, sizeof(buf->valid));
-
-       for (i = 0; i < v->nr_blocks; i++) {
-               struct bch_extent_ptr *ptr = v->ptrs + i;
-               struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+       offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset;
+       if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) {
+               bch_err_ratelimited(c,
+                       "error doing reconstruct read: read is bigger than stripe");
+               ret = -EIO;
+               goto err;
+       }
 
-               if (ptr_stale(ca, ptr)) {
-                       __bcache_io_error(c,
-                                         "error doing reconstruct read: stale pointer");
-                       clear_bit(i, buf->valid);
-                       continue;
-               }
+       ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio));
+       if (ret)
+               goto err;
 
+       for (i = 0; i < v->nr_blocks; i++)
                ec_block_io(c, buf, REQ_OP_READ, i, &cl);
-       }
 
        closure_sync(&cl);
 
        if (ec_nr_failed(buf) > v->nr_redundant) {
-               __bcache_io_error(c,
+               bch_err_ratelimited(c,
                        "error doing reconstruct read: unable to read enough blocks");
                ret = -EIO;
                goto err;
@@ -487,10 +514,9 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
                goto err;
 
        memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
-                     buf->data[idx] + ((offset - buf->offset) << 9));
+                     buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9));
 err:
-       for (i = 0; i < v->nr_blocks; i++)
-               kfree(buf->data[i]);
+       ec_stripe_buf_exit(buf);
        kfree(buf);
        return ret;
 }
@@ -643,8 +669,7 @@ void bch2_stripes_heap_update(struct bch_fs *c,
 
 static int ec_stripe_delete(struct bch_fs *c, size_t idx)
 {
-       //pr_info("deleting stripe %zu", idx);
-       return bch2_btree_delete_range(c, BTREE_ID_EC,
+       return bch2_btree_delete_range(c, BTREE_ID_stripes,
                                       POS(0, idx),
                                       POS(0, idx + 1),
                                       NULL);
@@ -675,24 +700,25 @@ static void ec_stripe_delete_work(struct work_struct *work)
 /* stripe creation: */
 
 static int ec_stripe_bkey_insert(struct bch_fs *c,
-                                struct ec_stripe_new *s,
-                                struct bkey_i_stripe *stripe)
+                                struct bkey_i_stripe *stripe,
+                                struct disk_reservation *res)
 {
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c k;
-       struct bpos start_pos = POS(0, c->ec_stripe_hint);
+       struct bpos min_pos = POS(0, 1);
+       struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
        int ret;
 
        bch2_trans_init(&trans, c, 0, 0);
 retry:
        bch2_trans_begin(&trans);
 
-       for_each_btree_key(&trans, iter, BTREE_ID_EC, start_pos,
+       for_each_btree_key(&trans, iter, BTREE_ID_stripes, start_pos,
                           BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
                if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) {
                        if (start_pos.offset) {
-                               start_pos = POS_MIN;
+                               start_pos = min_pos;
                                bch2_btree_iter_set_pos(iter, start_pos);
                                continue;
                        }
@@ -717,7 +743,7 @@ found_slot:
 
        bch2_trans_update(&trans, iter, &stripe->k_i, 0);
 
-       ret = bch2_trans_commit(&trans, &s->res, NULL,
+       ret = bch2_trans_commit(&trans, res, NULL,
                                BTREE_INSERT_NOFAIL);
 err:
        bch2_trans_iter_put(&trans, iter);
@@ -731,6 +757,46 @@ err:
        return ret;
 }
 
+static int ec_stripe_bkey_update(struct btree_trans *trans,
+                                struct bkey_i_stripe *new)
+{
+       struct btree_iter *iter;
+       struct bkey_s_c k;
+       const struct bch_stripe *existing;
+       unsigned i;
+       int ret;
+
+       iter = bch2_trans_get_iter(trans, BTREE_ID_stripes,
+                                  new->k.p, BTREE_ITER_INTENT);
+       k = bch2_btree_iter_peek_slot(iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       if (!k.k || k.k->type != KEY_TYPE_stripe) {
+               bch_err(trans->c, "error updating stripe: not found");
+               ret = -ENOENT;
+               goto err;
+       }
+
+       existing = bkey_s_c_to_stripe(k).v;
+
+       if (existing->nr_blocks != new->v.nr_blocks) {
+               bch_err(trans->c, "error updating stripe: nr_blocks does not match");
+               ret = -EINVAL;
+               goto err;
+       }
+
+       for (i = 0; i < new->v.nr_blocks; i++)
+               stripe_blockcount_set(&new->v, i,
+                       stripe_blockcount_get(existing, i));
+
+       bch2_trans_update(trans, iter, &new->k_i, 0);
+err:
+       bch2_trans_iter_put(trans, iter);
+       return ret;
+}
+
 static void extent_stripe_ptr_add(struct bkey_s_extent e,
                                  struct ec_stripe_buf *s,
                                  struct bch_extent_ptr *ptr,
@@ -745,6 +811,7 @@ static void extent_stripe_ptr_add(struct bkey_s_extent e,
        *dst = (struct bch_extent_stripe_ptr) {
                .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
                .block          = block,
+               .redundancy     = s->key.v.nr_redundant,
                .idx            = s->key.k.p.offset,
        };
 }
@@ -757,15 +824,15 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
        struct btree_iter *iter;
        struct bkey_s_c k;
        struct bkey_s_extent e;
-       struct bkey_on_stack sk;
-       int ret = 0, dev, idx;
+       struct bkey_buf sk;
+       int ret = 0, dev, block;
 
-       bkey_on_stack_init(&sk);
+       bch2_bkey_buf_init(&sk);
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
        /* XXX this doesn't support the reflink btree */
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
                                   bkey_start_pos(pos),
                                   BTREE_ITER_INTENT);
 
@@ -775,41 +842,41 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
                struct bch_extent_ptr *ptr, *ec_ptr = NULL;
 
                if (extent_has_stripe_ptr(k, s->key.k.p.offset)) {
-                       bch2_btree_iter_next(iter);
+                       bch2_btree_iter_advance(iter);
                        continue;
                }
 
-               idx = extent_matches_stripe(c, &s->key.v, k);
-               if (idx < 0) {
-                       bch2_btree_iter_next(iter);
+               block = bkey_matches_stripe(&s->key.v, k);
+               if (block < 0) {
+                       bch2_btree_iter_advance(iter);
                        continue;
                }
 
-               dev = s->key.v.ptrs[idx].dev;
+               dev = s->key.v.ptrs[block].dev;
 
-               bkey_on_stack_reassemble(&sk, c, k);
+               bch2_bkey_buf_reassemble(&sk, c, k);
                e = bkey_i_to_s_extent(sk.k);
 
                bch2_bkey_drop_ptrs(e.s, ptr, ptr->dev != dev);
                ec_ptr = (void *) bch2_bkey_has_device(e.s_c, dev);
                BUG_ON(!ec_ptr);
 
-               extent_stripe_ptr_add(e, s, ec_ptr, idx);
+               extent_stripe_ptr_add(e, s, ec_ptr, block);
 
                bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k));
                bch2_trans_update(&trans, iter, sk.k, 0);
 
                ret = bch2_trans_commit(&trans, NULL, NULL,
-                                       BTREE_INSERT_NOFAIL|
-                                       BTREE_INSERT_USE_RESERVE);
+                                       BTREE_INSERT_NOFAIL);
                if (ret == -EINTR)
                        ret = 0;
                if (ret)
                        break;
        }
+       bch2_trans_iter_put(&trans, iter);
 
        bch2_trans_exit(&trans);
-       bkey_on_stack_exit(&sk, c);
+       bch2_bkey_buf_exit(&sk, c);
 
        return ret;
 }
@@ -823,14 +890,13 @@ static void ec_stripe_create(struct ec_stripe_new *s)
        struct open_bucket *ob;
        struct bkey_i *k;
        struct stripe *m;
-       struct bch_stripe *v = &s->stripe.key.v;
+       struct bch_stripe *v = &s->new_stripe.key.v;
        unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
-       struct closure cl;
        int ret;
 
        BUG_ON(s->h->s == s);
 
-       closure_init_stack(&cl);
+       closure_sync(&s->iodone);
 
        if (s->err) {
                if (s->err != -EROFS)
@@ -838,73 +904,86 @@ static void ec_stripe_create(struct ec_stripe_new *s)
                goto err;
        }
 
+       if (s->have_existing_stripe) {
+               ec_validate_checksums(c, &s->existing_stripe);
+
+               if (ec_do_recov(c, &s->existing_stripe)) {
+                       bch_err(c, "error creating stripe: error reading existing stripe");
+                       goto err;
+               }
+
+               for (i = 0; i < nr_data; i++)
+                       if (stripe_blockcount_get(&s->existing_stripe.key.v, i))
+                               swap(s->new_stripe.data[i],
+                                    s->existing_stripe.data[i]);
+
+               ec_stripe_buf_exit(&s->existing_stripe);
+       }
+
        BUG_ON(!s->allocated);
 
        if (!percpu_ref_tryget(&c->writes))
                goto err;
 
-       BUG_ON(bitmap_weight(s->blocks_allocated,
-                            s->blocks.nr) != s->blocks.nr);
-
-       ec_generate_ec(&s->stripe);
+       ec_generate_ec(&s->new_stripe);
 
-       ec_generate_checksums(&s->stripe);
+       ec_generate_checksums(&s->new_stripe);
 
        /* write p/q: */
        for (i = nr_data; i < v->nr_blocks; i++)
-               ec_block_io(c, &s->stripe, REQ_OP_WRITE, i, &cl);
+               ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone);
+       closure_sync(&s->iodone);
 
-       closure_sync(&cl);
-
-       for (i = nr_data; i < v->nr_blocks; i++)
-               if (!test_bit(i, s->stripe.valid)) {
-                       bch_err(c, "error creating stripe: error writing redundancy buckets");
-                       goto err_put_writes;
-               }
+       if (ec_nr_failed(&s->new_stripe)) {
+               bch_err(c, "error creating stripe: error writing redundancy buckets");
+               goto err_put_writes;
+       }
 
-       ret = s->existing_stripe
-               ? bch2_btree_insert(c, BTREE_ID_EC, &s->stripe.key.k_i,
-                                   &s->res, NULL, BTREE_INSERT_NOFAIL)
-               : ec_stripe_bkey_insert(c, s, &s->stripe.key);
+       ret = s->have_existing_stripe
+               ? bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL,
+                               ec_stripe_bkey_update(&trans, &s->new_stripe.key))
+               : ec_stripe_bkey_insert(c, &s->new_stripe.key, &s->res);
        if (ret) {
                bch_err(c, "error creating stripe: error creating stripe key");
                goto err_put_writes;
        }
 
        for_each_keylist_key(&s->keys, k) {
-               ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k);
+               ret = ec_stripe_update_ptrs(c, &s->new_stripe, &k->k);
                if (ret) {
-                       bch_err(c, "error creating stripe: error updating pointers");
+                       bch_err(c, "error creating stripe: error %i updating pointers", ret);
                        break;
                }
        }
 
        spin_lock(&c->ec_stripes_heap_lock);
-       m = genradix_ptr(&c->stripes[0], s->stripe.key.k.p.offset);
-#if 0
-       pr_info("created a %s stripe %llu",
-               s->existing_stripe ? "existing" : "new",
-               s->stripe.key.k.p.offset);
-#endif
+       m = genradix_ptr(&c->stripes[0], s->new_stripe.key.k.p.offset);
+
        BUG_ON(m->on_heap);
-       bch2_stripes_heap_insert(c, m, s->stripe.key.k.p.offset);
+       bch2_stripes_heap_insert(c, m, s->new_stripe.key.k.p.offset);
        spin_unlock(&c->ec_stripes_heap_lock);
 err_put_writes:
        percpu_ref_put(&c->writes);
 err:
        bch2_disk_reservation_put(c, &s->res);
 
-       open_bucket_for_each(c, &s->blocks, ob, i) {
-               ob->ec = NULL;
-               __bch2_open_bucket_put(c, ob);
-       }
-
-       bch2_open_buckets_put(c, &s->parity);
+       for (i = 0; i < v->nr_blocks; i++)
+               if (s->blocks[i]) {
+                       ob = c->open_buckets + s->blocks[i];
+
+                       if (i < nr_data) {
+                               ob->ec = NULL;
+                               __bch2_open_bucket_put(c, ob);
+                       } else {
+                               bch2_open_bucket_put(c, ob);
+                       }
+               }
 
        bch2_keylist_free(&s->keys, s->inline_keys);
 
-       for (i = 0; i < s->stripe.key.v.nr_blocks; i++)
-               kvpfree(s->stripe.data[i], s->stripe.size << 9);
+       ec_stripe_buf_exit(&s->existing_stripe);
+       ec_stripe_buf_exit(&s->new_stripe);
+       closure_debug_destroy(&s->iodone);
        kfree(s);
 }
 
@@ -981,7 +1060,7 @@ void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
        ca      = bch_dev_bkey_exists(c, ob->ptr.dev);
        offset  = ca->mi.bucket_size - ob->sectors_free;
 
-       return ob->ec->stripe.data[ob->ec_idx] + (offset << 9);
+       return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
 }
 
 void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp,
@@ -993,8 +1072,6 @@ void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp,
        if (!ob)
                return;
 
-       //pr_info("adding backpointer at %llu:%llu", pos.inode, pos.offset);
-
        ec = ob->ec;
        mutex_lock(&ec->lock);
 
@@ -1088,7 +1165,6 @@ static void ec_stripe_key_init(struct bch_fs *c,
 static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
 {
        struct ec_stripe_new *s;
-       unsigned i;
 
        lockdep_assert_held(&h->lock);
 
@@ -1097,41 +1173,27 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
                return -ENOMEM;
 
        mutex_init(&s->lock);
+       closure_init(&s->iodone, NULL);
        atomic_set(&s->pin, 1);
        s->c            = c;
        s->h            = h;
        s->nr_data      = min_t(unsigned, h->nr_active_devs,
-                               EC_STRIPE_MAX) - h->redundancy;
+                               BCH_BKEY_PTRS_MAX) - h->redundancy;
        s->nr_parity    = h->redundancy;
 
        bch2_keylist_init(&s->keys, s->inline_keys);
 
-       s->stripe.offset        = 0;
-       s->stripe.size          = h->blocksize;
-       memset(s->stripe.valid, 0xFF, sizeof(s->stripe.valid));
-
-       ec_stripe_key_init(c, &s->stripe.key, s->nr_data,
+       ec_stripe_key_init(c, &s->new_stripe.key, s->nr_data,
                           s->nr_parity, h->blocksize);
 
-       for (i = 0; i < s->stripe.key.v.nr_blocks; i++) {
-               s->stripe.data[i] = kvpmalloc(s->stripe.size << 9, GFP_KERNEL);
-               if (!s->stripe.data[i])
-                       goto err;
-       }
-
        h->s = s;
-
        return 0;
-err:
-       for (i = 0; i < s->stripe.key.v.nr_blocks; i++)
-               kvpfree(s->stripe.data[i], s->stripe.size << 9);
-       kfree(s);
-       return -ENOMEM;
 }
 
 static struct ec_stripe_head *
 ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
-                        unsigned algo, unsigned redundancy)
+                        unsigned algo, unsigned redundancy,
+                        bool copygc)
 {
        struct ec_stripe_head *h;
        struct bch_dev *ca;
@@ -1147,6 +1209,7 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
        h->target       = target;
        h->algo         = algo;
        h->redundancy   = redundancy;
+       h->copygc       = copygc;
 
        rcu_read_lock();
        h->devs = target_rw_devs(c, BCH_DATA_user, target);
@@ -1171,16 +1234,17 @@ void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
        if (h->s &&
            h->s->allocated &&
            bitmap_weight(h->s->blocks_allocated,
-                         h->s->blocks.nr) == h->s->blocks.nr)
+                         h->s->nr_data) == h->s->nr_data)
                ec_stripe_set_pending(c, h);
 
        mutex_unlock(&h->lock);
 }
 
 struct ec_stripe_head *__bch2_ec_stripe_head_get(struct bch_fs *c,
-                                              unsigned target,
-                                              unsigned algo,
-                                              unsigned redundancy)
+                                                unsigned target,
+                                                unsigned algo,
+                                                unsigned redundancy,
+                                                bool copygc)
 {
        struct ec_stripe_head *h;
 
@@ -1191,76 +1255,98 @@ struct ec_stripe_head *__bch2_ec_stripe_head_get(struct bch_fs *c,
        list_for_each_entry(h, &c->ec_stripe_head_list, list)
                if (h->target           == target &&
                    h->algo             == algo &&
-                   h->redundancy       == redundancy) {
+                   h->redundancy       == redundancy &&
+                   h->copygc           == copygc) {
                        mutex_lock(&h->lock);
                        goto found;
                }
 
-       h = ec_new_stripe_head_alloc(c, target, algo, redundancy);
+       h = ec_new_stripe_head_alloc(c, target, algo, redundancy, copygc);
 found:
        mutex_unlock(&c->ec_stripe_head_lock);
        return h;
 }
 
-/*
- * XXX: use a higher watermark for allocating open buckets here:
- */
-static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h)
+static enum bucket_alloc_ret
+new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
+                        struct closure *cl)
 {
-       struct bch_devs_mask devs;
+       struct bch_devs_mask devs = h->devs;
        struct open_bucket *ob;
-       unsigned i, nr_have, nr_data =
-               min_t(unsigned, h->nr_active_devs,
-                     EC_STRIPE_MAX) - h->redundancy;
+       struct open_buckets buckets;
+       unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
        bool have_cache = true;
-       int ret = 0;
-
-       devs = h->devs;
-
-       for_each_set_bit(i, h->s->blocks_allocated, EC_STRIPE_MAX) {
-               __clear_bit(h->s->stripe.key.v.ptrs[i].dev, devs.d);
-               --nr_data;
+       enum bucket_alloc_ret ret = ALLOC_SUCCESS;
+
+       for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) {
+               if (test_bit(i, h->s->blocks_gotten)) {
+                       __clear_bit(h->s->new_stripe.key.v.ptrs[i].dev, devs.d);
+                       if (i < h->s->nr_data)
+                               nr_have_data++;
+                       else
+                               nr_have_parity++;
+               }
        }
 
-       BUG_ON(h->s->blocks.nr > nr_data);
-       BUG_ON(h->s->parity.nr > h->redundancy);
-
-       open_bucket_for_each(c, &h->s->parity, ob, i)
-               __clear_bit(ob->ptr.dev, devs.d);
-       open_bucket_for_each(c, &h->s->blocks, ob, i)
-               __clear_bit(ob->ptr.dev, devs.d);
+       BUG_ON(nr_have_data     > h->s->nr_data);
+       BUG_ON(nr_have_parity   > h->s->nr_parity);
 
        percpu_down_read(&c->mark_lock);
        rcu_read_lock();
 
-       if (h->s->parity.nr < h->redundancy) {
-               nr_have = h->s->parity.nr;
-
-               ret = bch2_bucket_alloc_set(c, &h->s->parity,
+       buckets.nr = 0;
+       if (nr_have_parity < h->s->nr_parity) {
+               ret = bch2_bucket_alloc_set(c, &buckets,
                                            &h->parity_stripe,
                                            &devs,
-                                           h->redundancy,
-                                           &nr_have,
+                                           h->s->nr_parity,
+                                           &nr_have_parity,
                                            &have_cache,
-                                           RESERVE_NONE,
+                                           h->copygc
+                                           ? RESERVE_MOVINGGC
+                                           : RESERVE_NONE,
                                            0,
-                                           NULL);
+                                           cl);
+
+               open_bucket_for_each(c, &buckets, ob, i) {
+                       j = find_next_zero_bit(h->s->blocks_gotten,
+                                              h->s->nr_data + h->s->nr_parity,
+                                              h->s->nr_data);
+                       BUG_ON(j >= h->s->nr_data + h->s->nr_parity);
+
+                       h->s->blocks[j] = buckets.v[i];
+                       h->s->new_stripe.key.v.ptrs[j] = ob->ptr;
+                       __set_bit(j, h->s->blocks_gotten);
+               }
+
                if (ret)
                        goto err;
        }
 
-       if (h->s->blocks.nr < nr_data) {
-               nr_have = h->s->blocks.nr;
-
-               ret = bch2_bucket_alloc_set(c, &h->s->blocks,
+       buckets.nr = 0;
+       if (nr_have_data < h->s->nr_data) {
+               ret = bch2_bucket_alloc_set(c, &buckets,
                                            &h->block_stripe,
                                            &devs,
-                                           nr_data,
-                                           &nr_have,
+                                           h->s->nr_data,
+                                           &nr_have_data,
                                            &have_cache,
-                                           RESERVE_NONE,
+                                           h->copygc
+                                           ? RESERVE_MOVINGGC
+                                           : RESERVE_NONE,
                                            0,
-                                           NULL);
+                                           cl);
+
+               open_bucket_for_each(c, &buckets, ob, i) {
+                       j = find_next_zero_bit(h->s->blocks_gotten,
+                                              h->s->nr_data, 0);
+                       BUG_ON(j >= h->s->nr_data);
+
+                       h->s->blocks[j] = buckets.v[i];
+                       h->s->new_stripe.key.v.ptrs[j] = ob->ptr;
+                       __set_bit(j, h->s->blocks_gotten);
+               }
+
                if (ret)
                        goto err;
        }
@@ -1272,53 +1358,101 @@ err:
 
 /* XXX: doesn't obey target: */
 static s64 get_existing_stripe(struct bch_fs *c,
-                              unsigned target,
-                              unsigned algo,
-                              unsigned redundancy)
+                              struct ec_stripe_head *head)
 {
        ec_stripes_heap *h = &c->ec_stripes_heap;
        struct stripe *m;
        size_t heap_idx;
        u64 stripe_idx;
+       s64 ret = -1;
 
        if (may_create_new_stripe(c))
                return -1;
 
        spin_lock(&c->ec_stripes_heap_lock);
        for (heap_idx = 0; heap_idx < h->used; heap_idx++) {
+               /* No blocks worth reusing, stripe will just be deleted: */
                if (!h->data[heap_idx].blocks_nonempty)
                        continue;
 
                stripe_idx = h->data[heap_idx].idx;
                m = genradix_ptr(&c->stripes[0], stripe_idx);
 
-               if (m->algorithm        == algo &&
-                   m->nr_redundant     == redundancy &&
+               if (m->algorithm        == head->algo &&
+                   m->nr_redundant     == head->redundancy &&
+                   m->sectors          == head->blocksize &&
                    m->blocks_nonempty  < m->nr_blocks - m->nr_redundant) {
                        bch2_stripes_heap_del(c, m, stripe_idx);
-                       spin_unlock(&c->ec_stripes_heap_lock);
-                       return stripe_idx;
+                       ret = stripe_idx;
+                       break;
                }
        }
-
        spin_unlock(&c->ec_stripes_heap_lock);
-       return -1;
+       return ret;
 }
 
-static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe)
+static int __bch2_ec_stripe_head_reuse(struct bch_fs *c,
+                                                  struct ec_stripe_head *h)
 {
-       struct btree_trans trans;
-       struct btree_iter *iter;
-       struct bkey_s_c k;
+       unsigned i;
+       s64 idx;
        int ret;
 
-       bch2_trans_init(&trans, c, 0, 0);
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, idx), BTREE_ITER_SLOTS);
-       k = bch2_btree_iter_peek_slot(iter);
-       ret = bkey_err(k);
-       if (!ret)
-               bkey_reassemble(&stripe->key.k_i, k);
-       bch2_trans_exit(&trans);
+       idx = get_existing_stripe(c, h);
+       if (idx < 0) {
+               bch_err(c, "failed to find an existing stripe");
+               return -ENOSPC;
+       }
+
+       h->s->have_existing_stripe = true;
+       ret = get_stripe_key(c, idx, &h->s->existing_stripe);
+       if (ret) {
+               bch2_fs_fatal_error(c, "error reading stripe key: %i", ret);
+               return ret;
+       }
+
+       if (ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize)) {
+               /*
+                * this is a problem: we have deleted from the
+                * stripes heap already
+                */
+               BUG();
+       }
+
+       BUG_ON(h->s->existing_stripe.size != h->blocksize);
+       BUG_ON(h->s->existing_stripe.size != h->s->existing_stripe.key.v.sectors);
+
+       for (i = 0; i < h->s->existing_stripe.key.v.nr_blocks; i++) {
+               if (stripe_blockcount_get(&h->s->existing_stripe.key.v, i)) {
+                       __set_bit(i, h->s->blocks_gotten);
+                       __set_bit(i, h->s->blocks_allocated);
+               }
+
+               ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone);
+       }
+
+       bkey_copy(&h->s->new_stripe.key.k_i,
+                       &h->s->existing_stripe.key.k_i);
+
+       return 0;
+}
+
+static int __bch2_ec_stripe_head_reserve(struct bch_fs *c,
+                                                       struct ec_stripe_head *h)
+{
+       int ret;
+
+       ret = bch2_disk_reservation_get(c, &h->s->res,
+                       h->blocksize,
+                       h->s->nr_parity, 0);
+
+       if (ret) {
+               /*
+                * This means we need to wait for copygc to
+                * empty out buckets from existing stripes:
+                */
+               bch_err(c, "failed to reserve stripe");
+       }
 
        return ret;
 }
@@ -1326,86 +1460,58 @@ static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *strip
 struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
                                               unsigned target,
                                               unsigned algo,
-                                              unsigned redundancy)
+                                              unsigned redundancy,
+                                              bool copygc,
+                                              struct closure *cl)
 {
-       struct closure cl;
        struct ec_stripe_head *h;
-       struct open_bucket *ob;
-       unsigned i, data_idx = 0;
-       s64 idx;
        int ret;
+       bool needs_stripe_new;
 
-       closure_init_stack(&cl);
-
-       h = __bch2_ec_stripe_head_get(c, target, algo, redundancy);
-       if (!h)
-               return NULL;
-
-       if (!h->s && ec_new_stripe_alloc(c, h)) {
-               bch2_ec_stripe_head_put(c, h);
+       h = __bch2_ec_stripe_head_get(c, target, algo, redundancy, copygc);
+       if (!h) {
+               bch_err(c, "no stripe head");
                return NULL;
        }
 
-       if (!h->s->allocated) {
-               if (!h->s->existing_stripe &&
-                   (idx = get_existing_stripe(c, target, algo, redundancy)) >= 0) {
-                       //pr_info("got existing stripe %llu", idx);
-
-                       h->s->existing_stripe = true;
-                       h->s->existing_stripe_idx = idx;
-                       if (get_stripe_key(c, idx, &h->s->stripe)) {
-                               /* btree error */
-                               BUG();
-                       }
-
-                       for (i = 0; i < h->s->stripe.key.v.nr_blocks; i++)
-                               if (stripe_blockcount_get(&h->s->stripe.key.v, i)) {
-                                       __set_bit(i, h->s->blocks_allocated);
-                                       ec_block_io(c, &h->s->stripe, READ, i, &cl);
-                               }
-               }
-
-               if (!h->s->existing_stripe &&
-                   !h->s->res.sectors) {
-                       ret = bch2_disk_reservation_get(c, &h->s->res,
-                                                       h->blocksize,
-                                                       h->s->nr_parity, 0);
-                       if (ret) {
-                               /* What should we do here? */
-                               bch_err(c, "unable to create new stripe: %i", ret);
-                               bch2_ec_stripe_head_put(c, h);
-                               h = NULL;
-                               goto out;
-
-                       }
-
-               }
-
-               if (new_stripe_alloc_buckets(c, h)) {
-                       bch2_ec_stripe_head_put(c, h);
-                       h = NULL;
-                       goto out;
+       needs_stripe_new = !h->s;
+       if (needs_stripe_new) {
+               if (ec_new_stripe_alloc(c, h)) {
+                       ret = -ENOMEM;
+                       bch_err(c, "failed to allocate new stripe");
+                       goto err;
                }
 
-               open_bucket_for_each(c, &h->s->blocks, ob, i) {
-                       data_idx = find_next_zero_bit(h->s->blocks_allocated,
-                                                     h->s->nr_data, data_idx);
-                       BUG_ON(data_idx >= h->s->nr_data);
+               if (ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize))
+                       BUG();
+       }
 
-                       h->s->stripe.key.v.ptrs[data_idx] = ob->ptr;
-                       h->s->data_block_idx[i] = data_idx;
-                       data_idx++;
-               }
+       /*
+        * Try to reserve a new stripe before reusing an
+        * existing stripe. This prevents unnecessary read
+        * amplification during write-oriented workloads.
+        */
+       ret = 0;
+       if (!h->s->allocated && !h->s->res.sectors && !h->s->have_existing_stripe)
+               ret = __bch2_ec_stripe_head_reserve(c, h);
+       if (ret && needs_stripe_new)
+               ret = __bch2_ec_stripe_head_reuse(c, h);
+       if (ret)
+               goto err;
 
-               open_bucket_for_each(c, &h->s->parity, ob, i)
-                       h->s->stripe.key.v.ptrs[h->s->nr_data + i] = ob->ptr;
+       if (!h->s->allocated) {
+               ret = new_stripe_alloc_buckets(c, h, cl);
+               if (ret)
+                       goto err;
 
-               //pr_info("new stripe, blocks_allocated %lx", h->s->blocks_allocated[0]);
                h->s->allocated = true;
        }
-out:
-       closure_sync(&cl);
+
        return h;
+
+err:
+       bch2_ec_stripe_head_put(c, h);
+       return ERR_PTR(-ret);
 }
 
 void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
@@ -1421,12 +1527,14 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
                if (!h->s)
                        goto unlock;
 
-               open_bucket_for_each(c, &h->s->blocks, ob, i)
-                       if (ob->ptr.dev == ca->dev_idx)
-                               goto found;
-               open_bucket_for_each(c, &h->s->parity, ob, i)
+               for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) {
+                       if (!h->s->blocks[i])
+                               continue;
+
+                       ob = c->open_buckets + h->s->blocks[i];
                        if (ob->ptr.dev == ca->dev_idx)
                                goto found;
+               }
                goto unlock;
 found:
                h->s->err = -EROFS;
@@ -1437,13 +1545,23 @@ unlock:
        mutex_unlock(&c->ec_stripe_head_lock);
 }
 
+void bch2_stripes_heap_start(struct bch_fs *c)
+{
+       struct genradix_iter iter;
+       struct stripe *m;
+
+       genradix_for_each(&c->stripes[0], iter, m)
+               if (m->alive)
+                       bch2_stripes_heap_insert(c, m, iter.pos);
+}
+
 static int __bch2_stripe_write_key(struct btree_trans *trans,
                                   struct btree_iter *iter,
                                   struct stripe *m,
                                   size_t idx,
                                   struct bkey_i_stripe *new_key)
 {
-       struct bch_fs *c = trans->c;
+       const struct bch_stripe *v;
        struct bkey_s_c k;
        unsigned i;
        int ret;
@@ -1458,16 +1576,17 @@ static int __bch2_stripe_write_key(struct btree_trans *trans,
        if (k.k->type != KEY_TYPE_stripe)
                return -EIO;
 
+       v = bkey_s_c_to_stripe(k).v;
+       for (i = 0; i < v->nr_blocks; i++)
+               if (m->block_sectors[i] != stripe_blockcount_get(v, i))
+                       goto write;
+       return 0;
+write:
        bkey_reassemble(&new_key->k_i, k);
 
-       spin_lock(&c->ec_stripes_heap_lock);
-
        for (i = 0; i < new_key->v.nr_blocks; i++)
                stripe_blockcount_set(&new_key->v, i,
                                      m->block_sectors[i]);
-       m->dirty = false;
-
-       spin_unlock(&c->ec_stripes_heap_lock);
 
        bch2_trans_update(trans, iter, &new_key->k_i, 0);
        return 0;
@@ -1487,11 +1606,11 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags)
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN,
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS_MIN,
                                   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 
        genradix_for_each(&c->stripes[0], giter, m) {
-               if (!m->dirty)
+               if (!m->alive)
                        continue;
 
                ret = __bch2_trans_do(&trans, NULL, NULL,
@@ -1516,18 +1635,11 @@ static int bch2_stripes_read_fn(struct bch_fs *c, enum btree_id id,
        int ret = 0;
 
        if (k.k->type == KEY_TYPE_stripe) {
-               struct stripe *m;
-
                ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?:
                        bch2_mark_key(c, k, 0, 0, NULL, 0,
                                      BTREE_TRIGGER_NOATOMIC);
                if (ret)
                        return ret;
-
-               spin_lock(&c->ec_stripes_heap_lock);
-               m = genradix_ptr(&c->stripes[0], k.k->p.offset);
-               bch2_stripes_heap_insert(c, m, k.k->p.offset);
-               spin_unlock(&c->ec_stripes_heap_lock);
        }
 
        return ret;
@@ -1535,7 +1647,7 @@ static int bch2_stripes_read_fn(struct bch_fs *c, enum btree_id id,
 
 int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys)
 {
-       int ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_EC,
+       int ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_stripes,
                                          NULL, bch2_stripes_read_fn);
        if (ret)
                bch_err(c, "error reading stripes: %i", ret);
@@ -1552,12 +1664,13 @@ int bch2_ec_mem_alloc(struct bch_fs *c, bool gc)
        int ret = 0;
 
        bch2_trans_init(&trans, c, 0, 0);
-
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, U64_MAX), 0);
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS(0, U64_MAX), 0);
 
        k = bch2_btree_iter_prev(iter);
        if (!IS_ERR_OR_NULL(k.k))
                idx = k.k->p.offset + 1;
+
+       bch2_trans_iter_put(&trans, iter);
        ret = bch2_trans_exit(&trans);
        if (ret)
                return ret;
@@ -1586,7 +1699,7 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
        size_t i;
 
        spin_lock(&c->ec_stripes_heap_lock);
-       for (i = 0; i < min(h->used, 20UL); i++) {
+       for (i = 0; i < min_t(size_t, h->used, 20); i++) {
                m = genradix_ptr(&c->stripes[0], h->data[i].idx);
 
                pr_buf(out, "%zu %u/%u+%u\n", h->data[i].idx,
@@ -1608,19 +1721,17 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
                       h->target, h->algo, h->redundancy);
 
                if (h->s)
-                       pr_buf(out, "\tpending: blocks %u allocated %u\n",
-                              h->s->blocks.nr,
+                       pr_buf(out, "\tpending: blocks %u+%u allocated %u\n",
+                              h->s->nr_data, h->s->nr_parity,
                               bitmap_weight(h->s->blocks_allocated,
-                                            h->s->blocks.nr));
+                                            h->s->nr_data));
        }
        mutex_unlock(&c->ec_stripe_head_lock);
 
        mutex_lock(&c->ec_stripe_new_lock);
        list_for_each_entry(s, &c->ec_stripe_new_list, list) {
-               pr_buf(out, "\tin flight: blocks %u allocated %u pin %u\n",
-                      s->blocks.nr,
-                      bitmap_weight(s->blocks_allocated,
-                                    s->blocks.nr),
+               pr_buf(out, "\tin flight: blocks %u+%u pin %u\n",
+                      s->nr_data, s->nr_parity,
                       atomic_read(&s->pin));
        }
        mutex_unlock(&c->ec_stripe_new_lock);
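
ec_stripe_buf_init() above only allocates the checksum-granularity-aligned window that covers the requested range. A worked sketch of that arithmetic with illustrative numbers, using the same kernel helpers as the patch:

        /* Assume csum_granularity_bits = 3 (8-sector checksum blocks), a
         * 128-sector stripe, and a request for offset 10, size 20: */
        unsigned csum_granularity = 1U << 3;                            /* 8  */
        unsigned offset = round_down(10, csum_granularity);             /* 8  */
        unsigned end    = min_t(unsigned, 128,
                                round_up(10 + 20, csum_granularity));   /* 32 */
        unsigned size   = end - offset;           /* 24 sectors are buffered */
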
diff --git a/libbcachefs/ec.h b/libbcachefs/ec.h
index 15f751fc2a35d32bea03efde0cf5cbbd332bb41a..744e51eaf327e66577b768479b384d25c07a88db 100644 (file)
@@ -60,9 +60,66 @@ static inline unsigned stripe_val_u64s(const struct bch_stripe *s)
 }
 
 static inline void *stripe_csum(struct bch_stripe *s,
-                               unsigned dev, unsigned csum_idx)
+                               unsigned block, unsigned csum_idx)
 {
-       return (void *) s + stripe_csum_offset(s, dev, csum_idx);
+       EBUG_ON(block >= s->nr_blocks);
+       EBUG_ON(csum_idx >= stripe_csums_per_device(s));
+
+       return (void *) s + stripe_csum_offset(s, block, csum_idx);
+}
+
+static inline struct bch_csum stripe_csum_get(struct bch_stripe *s,
+                                  unsigned block, unsigned csum_idx)
+{
+       struct bch_csum csum = { 0 };
+
+       memcpy(&csum, stripe_csum(s, block, csum_idx), bch_crc_bytes[s->csum_type]);
+       return csum;
+}
+
+static inline void stripe_csum_set(struct bch_stripe *s,
+                                  unsigned block, unsigned csum_idx,
+                                  struct bch_csum csum)
+{
+       memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]);
+}
+
+static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe_ptr,
+                                            const struct bch_extent_ptr *data_ptr,
+                                            unsigned sectors)
+{
+       return  data_ptr->dev    == stripe_ptr->dev &&
+               data_ptr->gen    == stripe_ptr->gen &&
+               data_ptr->offset >= stripe_ptr->offset &&
+               data_ptr->offset  < stripe_ptr->offset + sectors;
+}
+
+static inline bool bch2_ptr_matches_stripe(const struct bch_stripe *s,
+                                          struct extent_ptr_decoded p)
+{
+       unsigned nr_data = s->nr_blocks - s->nr_redundant;
+
+       BUG_ON(!p.has_ec);
+
+       if (p.ec.block >= nr_data)
+               return false;
+
+       return __bch2_ptr_matches_stripe(&s->ptrs[p.ec.block], &p.ptr,
+                                        le16_to_cpu(s->sectors));
+}
+
+static inline bool bch2_ptr_matches_stripe_m(const struct stripe *m,
+                                            struct extent_ptr_decoded p)
+{
+       unsigned nr_data = m->nr_blocks - m->nr_redundant;
+
+       BUG_ON(!p.has_ec);
+
+       if (p.ec.block >= nr_data)
+               return false;
+
+       return __bch2_ptr_matches_stripe(&m->ptrs[p.ec.block], &p.ptr,
+                                        m->sectors);
 }
 
 struct bch_read_bio;
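
The new __bch2_ptr_matches_stripe() above reduces "does this data pointer belong to this stripe block" to three comparisons: same device, same generation, and an offset inside the half-open range the stripe block covers; the two wrappers only differ in whether the stripe comes from a key (bch_stripe) or the in-memory copy (struct stripe). A self-contained model of the predicate, where struct ptr is an illustrative stand-in for the bch_extent_ptr fields involved:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct ptr { unsigned dev; unsigned gen; uint64_t offset; };

/* Same shape as __bch2_ptr_matches_stripe(): a data pointer belongs to a
 * stripe block iff device and generation match and its offset falls inside
 * [stripe_ptr->offset, stripe_ptr->offset + sectors). */
static bool ptr_matches_stripe(const struct ptr *stripe_ptr,
                               const struct ptr *data_ptr,
                               unsigned sectors)
{
        return data_ptr->dev    == stripe_ptr->dev &&
               data_ptr->gen    == stripe_ptr->gen &&
               data_ptr->offset >= stripe_ptr->offset &&
               data_ptr->offset <  stripe_ptr->offset + sectors;
}

int main(void)
{
        struct ptr stripe = { .dev = 2, .gen = 5, .offset = 1024 };
        struct ptr inside = { .dev = 2, .gen = 5, .offset = 1040 };
        struct ptr stale  = { .dev = 2, .gen = 4, .offset = 1040 };

        printf("inside: %d\n", ptr_matches_stripe(&stripe, &inside, 128)); /* 1 */
        printf("stale:  %d\n", ptr_matches_stripe(&stripe, &stale,  128)); /* 0 */
        return 0;
}
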
@@ -71,9 +128,9 @@ struct ec_stripe_buf {
        /* might not be buffering the entire stripe: */
        unsigned                offset;
        unsigned                size;
-       unsigned long           valid[BITS_TO_LONGS(EC_STRIPE_MAX)];
+       unsigned long           valid[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
 
-       void                    *data[EC_STRIPE_MAX];
+       void                    *data[BCH_BKEY_PTRS_MAX];
 
        union {
                struct bkey_i_stripe    key;
@@ -88,6 +145,7 @@ struct ec_stripe_new {
        struct ec_stripe_head   *h;
        struct mutex            lock;
        struct list_head        list;
+       struct closure          iodone;
 
        /* counts in flight writes, stripe is created when pin == 0 */
        atomic_t                pin;
@@ -98,20 +156,18 @@ struct ec_stripe_new {
        u8                      nr_parity;
        bool                    allocated;
        bool                    pending;
-       bool                    existing_stripe;
-       u64                     existing_stripe_idx;
-
-       unsigned long           blocks_allocated[BITS_TO_LONGS(EC_STRIPE_MAX)];
+       bool                    have_existing_stripe;
 
-       struct open_buckets     blocks;
-       u8                      data_block_idx[EC_STRIPE_MAX];
-       struct open_buckets     parity;
+       unsigned long           blocks_gotten[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
+       unsigned long           blocks_allocated[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
+       open_bucket_idx_t       blocks[BCH_BKEY_PTRS_MAX];
        struct disk_reservation res;
 
        struct keylist          keys;
        u64                     inline_keys[BKEY_U64s * 8];
 
-       struct ec_stripe_buf    stripe;
+       struct ec_stripe_buf    new_stripe;
+       struct ec_stripe_buf    existing_stripe;
 };
 
 struct ec_stripe_head {
@@ -121,6 +177,7 @@ struct ec_stripe_head {
        unsigned                target;
        unsigned                algo;
        unsigned                redundancy;
+       bool                    copygc;
 
        struct bch_devs_mask    devs;
        unsigned                nr_active_devs;
@@ -145,8 +202,8 @@ void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *);
 int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *);
 
 void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *);
-struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, unsigned,
-                                              unsigned, unsigned);
+struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *,
+                       unsigned, unsigned, unsigned, bool, struct closure *);
 
 void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t);
 void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t);
@@ -156,6 +213,8 @@ void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
 
 void bch2_ec_flush_new_stripes(struct bch_fs *);
 
+void bch2_stripes_heap_start(struct bch_fs *);
+
 struct journal_keys;
 int bch2_stripes_read(struct bch_fs *, struct journal_keys *);
 int bch2_stripes_write(struct bch_fs *, unsigned);
index e4d633fca5bf913a78a4d78168141dc1458fdaf1..3fc31222459a81fd92ec2fbdc1225d5bf9bedbfe 100644 (file)
@@ -4,11 +4,9 @@
 
 #include <linux/llist.h>
 
-#define EC_STRIPE_MAX  16
-
 struct bch_replicas_padded {
        struct bch_replicas_entry       e;
-       u8                              pad[EC_STRIPE_MAX];
+       u8                              pad[BCH_BKEY_PTRS_MAX];
 };
 
 struct stripe {
@@ -20,11 +18,11 @@ struct stripe {
        u8                      nr_blocks;
        u8                      nr_redundant;
 
-       unsigned                alive:1;
-       unsigned                dirty:1;
+       unsigned                alive:1; /* does a corresponding key exist in stripes btree? */
        unsigned                on_heap:1;
        u8                      blocks_nonempty;
-       u16                     block_sectors[EC_STRIPE_MAX];
+       u16                     block_sectors[BCH_BKEY_PTRS_MAX];
+       struct bch_extent_ptr   ptrs[BCH_BKEY_PTRS_MAX];
 
        struct bch_replicas_padded r;
 };
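
This hunk retires EC_STRIPE_MAX and sizes the per-stripe arrays by BCH_BKEY_PTRS_MAX instead, the same bound the ec_stripe_new bitmaps above now use. Those bitmaps are dimensioned with the kernel's BITS_TO_LONGS(), which rounds a bit count up to whole unsigned longs; a sketch of that arithmetic, where PTRS_MAX is a made-up placeholder rather than the real constant:

#include <limits.h>
#include <stdio.h>

#define BITS_PER_LONG_     (CHAR_BIT * sizeof(unsigned long))
/* Same arithmetic as the kernel's BITS_TO_LONGS(): round up to whole longs. */
#define BITS_TO_LONGS_(nr) (((nr) + BITS_PER_LONG_ - 1) / BITS_PER_LONG_)

/* PTRS_MAX is a placeholder value; the real bound is BCH_BKEY_PTRS_MAX. */
#define PTRS_MAX 32

struct stripe_bitmaps {
        unsigned long blocks_gotten[BITS_TO_LONGS_(PTRS_MAX)];
        unsigned long blocks_allocated[BITS_TO_LONGS_(PTRS_MAX)];
};

int main(void)
{
        printf("%d bits -> %zu longs, %zu bytes per bitmap\n",
               PTRS_MAX,
               (size_t)BITS_TO_LONGS_(PTRS_MAX),
               sizeof(((struct stripe_bitmaps *)0)->blocks_allocated));
        return 0;
}
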
index cd46706fb6f5a8020ad63a128b5c28fc910163b8..a8ee1db8aa3917851dfdd011e9d00e63bf8a84bd 100644 (file)
@@ -11,13 +11,13 @@ bool bch2_inconsistent_error(struct bch_fs *c)
        set_bit(BCH_FS_ERROR, &c->flags);
 
        switch (c->opts.errors) {
-       case BCH_ON_ERROR_CONTINUE:
+       case BCH_ON_ERROR_continue:
                return false;
-       case BCH_ON_ERROR_RO:
+       case BCH_ON_ERROR_ro:
                if (bch2_fs_emergency_read_only(c))
                        bch_err(c, "emergency read only");
                return true;
-       case BCH_ON_ERROR_PANIC:
+       case BCH_ON_ERROR_panic:
                panic(bch2_fmt(c, "panic after error"));
                return true;
        default:
@@ -38,10 +38,10 @@ void bch2_io_error_work(struct work_struct *work)
        bool dev;
 
        down_write(&c->state_lock);
-       dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_RO,
+       dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_ro,
                                    BCH_FORCE_IF_DEGRADED);
        if (dev
-           ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_RO,
+           ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
                                  BCH_FORCE_IF_DEGRADED)
            : bch2_fs_emergency_read_only(c))
                bch_err(ca,
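
The BCH_ON_ERROR_CONTINUE to BCH_ON_ERROR_continue rename here (like the BCH_MEMBER_STATE_* and BTREE_ID_* renames elsewhere in this patch) makes the enum constants spell exactly the strings users see in mount options and sysfs, which is the natural outcome when the enum and its name table are generated from one x-macro list. The sketch below shows that pattern in isolation; ERROR_ACTIONS() here is illustrative and not the actual macro in opts.h:

#include <stdio.h>

#define ERROR_ACTIONS()         \
        x(continue)             \
        x(ro)                   \
        x(panic)

/* Expand the list once as enum constants... */
#define x(n) BCH_ON_ERROR_##n,
enum error_action { ERROR_ACTIONS() };
#undef x

/* ...and once as the matching user-visible strings. */
#define x(n) #n,
static const char * const error_action_names[] = { ERROR_ACTIONS() NULL };
#undef x

int main(void)
{
        enum error_action a = BCH_ON_ERROR_ro;

        printf("errors=%s\n", error_action_names[a]);   /* prints errors=ro */
        return 0;
}
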
index 94b53312fbbda9b61e2f192f34f413ff88bcbeff..0e49fd728e440cb5be02bf1da3e399fa52e3e9f0 100644 (file)
@@ -181,12 +181,18 @@ void bch2_io_error(struct bch_dev *);
 /* Logs message and handles the error: */
 #define bch2_dev_io_error(ca, fmt, ...)                                        \
 do {                                                                   \
-       printk_ratelimited(KERN_ERR bch2_fmt((ca)->fs,                  \
-               "IO error on %s for " fmt),                             \
+       printk_ratelimited(KERN_ERR "bcachefs (%s): " fmt,              \
                (ca)->name, ##__VA_ARGS__);                             \
        bch2_io_error(ca);                                              \
 } while (0)
 
+#define bch2_dev_inum_io_error(ca, _inum, _offset, fmt, ...)           \
+do {                                                                   \
+       printk_ratelimited(KERN_ERR "bcachefs (%s inum %llu offset %llu): " fmt,\
+               (ca)->name, (_inum), (_offset), ##__VA_ARGS__);         \
+       bch2_io_error(ca);                                              \
+} while (0)
+
 #define bch2_dev_io_err_on(cond, ca, ...)                              \
 ({                                                                     \
        bool _ret = (cond);                                             \
@@ -196,16 +202,13 @@ do {                                                                      \
        _ret;                                                           \
 })
 
-/* kill? */
-
-#define __bcache_io_error(c, fmt, ...)                                 \
-       printk_ratelimited(KERN_ERR bch2_fmt(c,                         \
-                       "IO error: " fmt), ##__VA_ARGS__)
-
-#define bcache_io_error(c, bio, fmt, ...)                              \
-do {                                                                   \
-       __bcache_io_error(c, fmt, ##__VA_ARGS__);                       \
-       (bio)->bi_status = BLK_STS_IOERR;                                       \
-} while (0)
+#define bch2_dev_inum_io_err_on(cond, ca, _inum, _offset, ...)         \
+({                                                                     \
+       bool _ret = (cond);                                             \
+                                                                       \
+       if (_ret)                                                       \
+               bch2_dev_inum_io_error(ca, _inum, _offset, __VA_ARGS__);\
+       _ret;                                                           \
+})
 
 #endif /* _BCACHEFS_ERROR_H */
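
bch2_dev_inum_io_error() and bch2_dev_inum_io_err_on() follow the same shape as the existing helpers they sit next to: a GNU statement expression evaluates the condition once, logs a rate-limited message and bumps the device's error accounting if it fired, then yields the condition so the caller can branch on it inline. A stripped-down userspace model of that shape (GNU C; no rate limiting or error accounting here):

#include <stdbool.h>
#include <stdio.h>

/* Minimal model of the *_io_err_on() macros: evaluate once, log on failure,
 * hand the boolean back to the caller.  Statement expressions and
 * ##__VA_ARGS__ are GNU extensions, as in the kernel macros. */
#define io_err_on(cond, fmt, ...)                               \
({                                                              \
        bool _ret = (cond);                                     \
        if (_ret)                                               \
                fprintf(stderr, fmt "\n", ##__VA_ARGS__);       \
        _ret;                                                   \
})

int main(void)
{
        int status = -5;        /* pretend a read completed with an error */

        if (io_err_on(status < 0,
                      "bcachefs (sda1 inum %llu offset %llu): read error %d",
                      4096ULL, 8192ULL, status))
                return 1;
        return 0;
}
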
index fd011df3cb9943b2dd9c2ae0f3a80191263c5b41..bb4b2b4352e04c727b405f7ea78449cd5095d149 100644 (file)
@@ -1,6 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "bcachefs.h"
-#include "bkey_on_stack.h"
 #include "btree_update.h"
 #include "btree_update_interior.h"
 #include "buckets.h"
@@ -63,7 +62,7 @@ static int count_iters_for_insert(struct btree_trans *trans,
                struct bkey_s_c r_k;
 
                for_each_btree_key(trans, iter,
-                                  BTREE_ID_REFLINK, POS(0, idx + offset),
+                                  BTREE_ID_reflink, POS(0, idx + offset),
                                   BTREE_ITER_SLOTS, r_k, ret2) {
                        if (bkey_cmp(bkey_start_pos(r_k.k),
                                     POS(0, idx + sectors)) >= 0)
@@ -100,24 +99,12 @@ int bch2_extent_atomic_end(struct btree_iter *iter,
                           struct bpos *end)
 {
        struct btree_trans *trans = iter->trans;
-       struct btree *b;
-       struct btree_node_iter  node_iter;
-       struct bkey_packed      *_k;
-       unsigned                nr_iters = 0;
+       struct btree_iter *copy;
+       struct bkey_s_c k;
+       unsigned nr_iters = 0;
        int ret;
 
-       ret = bch2_btree_iter_traverse(iter);
-       if (ret)
-               return ret;
-
-       b = iter->l[0].b;
-       node_iter = iter->l[0].iter;
-
-       BUG_ON(bkey_cmp(b->data->min_key, POS_MIN) &&
-              bkey_cmp(bkey_start_pos(&insert->k),
-                       bkey_predecessor(b->data->min_key)) < 0);
-
-       *end = bpos_min(insert->k.p, b->key.k.p);
+       *end = insert->k.p;
 
        /* extent_update_to_keys(): */
        nr_iters += 1;
@@ -127,9 +114,9 @@ int bch2_extent_atomic_end(struct btree_iter *iter,
        if (ret < 0)
                return ret;
 
-       while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) {
-               struct bkey     unpacked;
-               struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked);
+       copy = bch2_trans_copy_iter(trans, iter);
+
+       for_each_btree_key_continue(copy, 0, k, ret) {
                unsigned offset = 0;
 
                if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0)
@@ -156,10 +143,9 @@ int bch2_extent_atomic_end(struct btree_iter *iter,
                                        &nr_iters, EXTENT_ITERS_MAX);
                if (ret)
                        break;
-
-               bch2_btree_node_iter_advance(&node_iter, b);
        }
 
+       bch2_trans_iter_put(trans, copy);
        return ret < 0 ? ret : 0;
 }
 
@@ -193,18 +179,13 @@ bch2_extent_can_insert(struct btree_trans *trans,
                       struct btree_iter *iter,
                       struct bkey_i *insert)
 {
-       struct btree_iter_level *l = &iter->l[0];
-       struct btree_node_iter node_iter = l->iter;
-       struct bkey_packed *_k;
        struct bkey_s_c k;
-       struct bkey unpacked;
-       int sectors;
-
-       _k = bch2_btree_node_iter_peek(&node_iter, l->b);
-       if (!_k)
-               return BTREE_INSERT_OK;
+       int ret, sectors;
 
-       k = bkey_disassemble(l->b, _k, &unpacked);
+       k = bch2_btree_iter_peek_slot(iter);
+       ret = bkey_err(k);
+       if (ret)
+               return ret;
 
        /* Check if we're splitting a compressed extent: */
 
index 88297b30f6221850ac15b46197c7de908520477a..b07d39555eb6e1fe92c1d5233ef3013aae359e14 100644 (file)
@@ -89,7 +89,7 @@ static inline bool ptr_better(struct bch_fs *c,
                return bch2_rand_range(l1 + l2) > l1;
        }
 
-       if (force_reconstruct_read(c))
+       if (bch2_force_reconstruct_read)
                return p1.idx > p2.idx;
 
        return p1.idx < p2.idx;
@@ -137,7 +137,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
                    !bch2_dev_is_readable(ca))
                        p.idx++;
 
-               if (force_reconstruct_read(c) &&
+               if (bch2_force_reconstruct_read &&
                    !p.idx && p.has_ec)
                        p.idx++;
 
@@ -158,56 +158,33 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
 
 const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
-       if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
+       if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX)
                return "value too big";
 
        return bch2_bkey_ptrs_invalid(c, k);
 }
 
-void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k)
+void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
+                           struct bkey_s_c k)
 {
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       const struct bch_extent_ptr *ptr;
-       const char *err;
-       char buf[160];
-       struct bucket_mark mark;
-       struct bch_dev *ca;
-
-       if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
-               return;
-
-       if (!percpu_down_read_trylock(&c->mark_lock))
-               return;
+       bch2_bkey_ptrs_to_text(out, c, k);
+}
 
-       bkey_for_each_ptr(ptrs, ptr) {
-               ca = bch_dev_bkey_exists(c, ptr->dev);
+const char *bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+       struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
 
-               mark = ptr_bucket_mark(ca, ptr);
+       if (bkey_val_bytes(k.k) <= sizeof(*bp.v))
+               return "value too small";
 
-               err = "stale";
-               if (gen_after(mark.gen, ptr->gen))
-                       goto err;
+       if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
+               return "value too big";
 
-               err = "inconsistent";
-               if (mark.data_type != BCH_DATA_btree ||
-                   mark.dirty_sectors < c->opts.btree_node_size)
-                       goto err;
-       }
-out:
-       percpu_up_read(&c->mark_lock);
-       return;
-err:
-       bch2_fs_inconsistent(c, "%s btree pointer %s: bucket %zi gen %i mark %08x",
-               err, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
-               PTR_BUCKET_NR(ca, ptr),
-               mark.gen, (unsigned) mark.v.counter);
-       goto out;
-}
+       if (c->sb.version < bcachefs_metadata_version_snapshot &&
+           bp.v->min_key.snapshot)
+               return "invalid min_key.snapshot";
 
-void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
-                           struct bkey_s_c k)
-{
-       bch2_bkey_ptrs_to_text(out, c, k);
+       return bch2_bkey_ptrs_invalid(c, k);
 }
 
 void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
@@ -215,9 +192,8 @@ void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
 {
        struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
 
-       pr_buf(out, "seq %llx sectors %u written %u min_key ",
+       pr_buf(out, "seq %llx written %u min_key ",
               le64_to_cpu(bp.v->seq),
-              le16_to_cpu(bp.v->sectors),
               le16_to_cpu(bp.v->sectors_written));
 
        bch2_bpos_to_text(out, bp.v->min_key);
@@ -237,8 +213,8 @@ void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version,
            btree_node_type_is_extents(btree_id) &&
            bkey_cmp(bp.v->min_key, POS_MIN))
                bp.v->min_key = write
-                       ? bkey_predecessor(bp.v->min_key)
-                       : bkey_successor(bp.v->min_key);
+                       ? bpos_nosnap_predecessor(bp.v->min_key)
+                       : bpos_nosnap_successor(bp.v->min_key);
 }
 
 /* KEY_TYPE_extent: */
@@ -248,49 +224,6 @@ const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k)
        return bch2_bkey_ptrs_invalid(c, k);
 }
 
-void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k)
-{
-       struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
-       const union bch_extent_entry *entry;
-       struct extent_ptr_decoded p;
-       char buf[160];
-
-       if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) ||
-           !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
-               return;
-
-       if (!percpu_down_read_trylock(&c->mark_lock))
-               return;
-
-       extent_for_each_ptr_decode(e, p, entry) {
-               struct bch_dev *ca      = bch_dev_bkey_exists(c, p.ptr.dev);
-               struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr);
-               unsigned stale          = gen_after(mark.gen, p.ptr.gen);
-               unsigned disk_sectors   = ptr_disk_sectors(p);
-               unsigned mark_sectors   = p.ptr.cached
-                       ? mark.cached_sectors
-                       : mark.dirty_sectors;
-
-               bch2_fs_inconsistent_on(stale && !p.ptr.cached, c,
-                       "stale dirty pointer (ptr gen %u bucket %u",
-                       p.ptr.gen, mark.gen);
-
-               bch2_fs_inconsistent_on(stale > 96, c,
-                       "key too stale: %i", stale);
-
-               bch2_fs_inconsistent_on(!stale &&
-                       (mark.data_type != BCH_DATA_user ||
-                        mark_sectors < disk_sectors), c,
-                       "extent pointer not marked: %s:\n"
-                       "type %u sectors %u < %u",
-                       (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf),
-                       mark.data_type,
-                       mark_sectors, disk_sectors);
-       }
-
-       percpu_up_read(&c->mark_lock);
-}
-
 void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c,
                         struct bkey_s_c k)
 {
@@ -665,7 +598,7 @@ bool bch2_bkey_is_incompressible(struct bkey_s_c k)
 }
 
 bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
-                               unsigned nr_replicas)
+                               unsigned nr_replicas, bool compressed)
 {
        struct btree_trans trans;
        struct btree_iter *iter;
@@ -678,21 +611,45 @@ bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos,
+       for_each_btree_key(&trans, iter, BTREE_ID_extents, pos,
                           BTREE_ITER_SLOTS, k, err) {
                if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
                        break;
 
-               if (nr_replicas > bch2_bkey_nr_ptrs_fully_allocated(k)) {
+               if (nr_replicas > bch2_bkey_replicas(c, k) ||
+                   (!compressed && bch2_bkey_sectors_compressed(k))) {
                        ret = false;
                        break;
                }
        }
+       bch2_trans_iter_put(&trans, iter);
+
        bch2_trans_exit(&trans);
 
        return ret;
 }
 
+unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
+{
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       const union bch_extent_entry *entry;
+       struct extent_ptr_decoded p = { 0 };
+       unsigned replicas = 0;
+
+       bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+               if (p.ptr.cached)
+                       continue;
+
+               if (p.has_ec)
+                       replicas += p.ec.redundancy;
+
+               replicas++;
+
+       }
+
+       return replicas;
+}
+
 static unsigned bch2_extent_ptr_durability(struct bch_fs *c,
                                           struct extent_ptr_decoded p)
 {
@@ -704,19 +661,12 @@ static unsigned bch2_extent_ptr_durability(struct bch_fs *c,
 
        ca = bch_dev_bkey_exists(c, p.ptr.dev);
 
-       if (ca->mi.state != BCH_MEMBER_STATE_FAILED)
+       if (ca->mi.state != BCH_MEMBER_STATE_failed)
                durability = max_t(unsigned, durability, ca->mi.durability);
 
-       if (p.has_ec) {
-               struct stripe *s =
-                       genradix_ptr(&c->stripes[0], p.ec.idx);
+       if (p.has_ec)
+               durability += p.ec.redundancy;
 
-               if (WARN_ON(!s))
-                       goto out;
-
-               durability += s->nr_redundant;
-       }
-out:
        return durability;
 }
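
The two hunks above exploit the fact that an erasure-coded pointer now records its stripe's redundancy directly (p.ec.redundancy): bch2_bkey_replicas() counts every non-cached pointer once plus that redundancy, and bch2_extent_ptr_durability() no longer has to look the stripe up in c->stripes[]. A self-contained model of the replica count:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative decoded pointer: just the fields the replica count needs. */
struct decoded_ptr {
        bool     cached;
        bool     has_ec;
        unsigned ec_redundancy;
};

/* Mirrors the shape of bch2_bkey_replicas(): cached pointers don't count,
 * every other pointer counts once, and an erasure-coded pointer adds its
 * stripe's redundancy on top. */
static unsigned key_replicas(const struct decoded_ptr *ptrs, unsigned nr)
{
        unsigned i, replicas = 0;

        for (i = 0; i < nr; i++) {
                if (ptrs[i].cached)
                        continue;
                if (ptrs[i].has_ec)
                        replicas += ptrs[i].ec_redundancy;
                replicas++;
        }
        return replicas;
}

int main(void)
{
        struct decoded_ptr ptrs[] = {
                { .cached = false, .has_ec = true,  .ec_redundancy = 2 },
                { .cached = false, .has_ec = false },
                { .cached = true,  .has_ec = false },   /* ignored */
        };

        printf("replicas = %u\n", key_replicas(ptrs, 3));   /* prints 4 */
        return 0;
}
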
 
@@ -764,6 +714,15 @@ void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k,
                }
 }
 
+void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry)
+{
+       union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
+       union bch_extent_entry *next = extent_entry_next(entry);
+
+       memmove_u64s(entry, next, (u64 *) end - (u64 *) next);
+       k->k.u64s -= extent_entry_u64s(entry);
+}
+
 void bch2_bkey_append_ptr(struct bkey_i *k,
                          struct bch_extent_ptr ptr)
 {
@@ -949,9 +908,9 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
 
        /* will only happen if all pointers were cached: */
        if (!bch2_bkey_nr_ptrs(k.s_c))
-               k.k->type = KEY_TYPE_discard;
+               k.k->type = KEY_TYPE_deleted;
 
-       return bkey_whiteout(k.k);
+       return bkey_deleted(k.k);
 }
 
 void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
@@ -1046,16 +1005,17 @@ static const char *extent_ptr_invalid(const struct bch_fs *c,
 const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       struct bch_devs_list devs;
        const union bch_extent_entry *entry;
        struct bch_extent_crc_unpacked crc;
        unsigned size_ondisk = k.k->size;
        const char *reason;
        unsigned nonce = UINT_MAX;
+       unsigned i;
 
-       if (k.k->type == KEY_TYPE_btree_ptr)
+       if (k.k->type == KEY_TYPE_btree_ptr ||
+           k.k->type == KEY_TYPE_btree_ptr_v2)
                size_ondisk = c->opts.btree_node_size;
-       if (k.k->type == KEY_TYPE_btree_ptr_v2)
-               size_ondisk = le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors);
 
        bkey_extent_entry_for_each(ptrs, entry) {
                if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
@@ -1101,6 +1061,12 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k)
                }
        }
 
+       devs = bch2_bkey_devs(k);
+       bubble_sort(devs.devs, devs.nr, u8_cmp);
+       for (i = 0; i + 1 < devs.nr; i++)
+               if (devs.devs[i] == devs.devs[i + 1])
+                       return "multiple ptrs to same device";
+
        return NULL;
 }
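
The added block in bch2_bkey_ptrs_invalid() rejects keys carrying two pointers to the same device: gather the device IDs, sort them, and any duplicate shows up as a pair of equal neighbours. Roughly the same logic in userspace (the kernel side uses bcachefs's bubble_sort() helper; qsort() stands in here):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static int u8_cmp(const void *a, const void *b)
{
        return (int)*(const uint8_t *)a - (int)*(const uint8_t *)b;
}

/* Same idea as the new check: sort the per-key device list, then any
 * duplicate device is two equal neighbours. */
static const char *check_dup_devs(uint8_t *devs, unsigned nr)
{
        unsigned i;

        qsort(devs, nr, sizeof(*devs), u8_cmp);
        for (i = 0; i + 1 < nr; i++)
                if (devs[i] == devs[i + 1])
                        return "multiple ptrs to same device";
        return NULL;
}

int main(void)
{
        uint8_t devs[] = { 3, 1, 3 };
        const char *err = check_dup_devs(devs, 3);

        printf("%s\n", err ? err : "ok");
        return 0;
}
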
 
@@ -1235,7 +1201,7 @@ int bch2_cut_back_s(struct bpos where, struct bkey_s k)
 
        len = where.offset - bkey_start_offset(k.k);
 
-       k.k->p = where;
+       k.k->p.offset = where.offset;
        k.k->size = len;
 
        if (!len) {
index 74c7bb8f9104e1603207054eab4d66526c5f7bcc..ccee43a2019da10ca572faf6c6cf4463bc7930e5 100644 (file)
@@ -368,10 +368,10 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
 /* KEY_TYPE_btree_ptr: */
 
 const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c);
-void bch2_btree_ptr_debugcheck(struct bch_fs *, struct bkey_s_c);
 void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *,
                            struct bkey_s_c);
 
+const char *bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c);
 void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *,
                            struct bkey_s_c);
 void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
@@ -379,14 +379,12 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
 
 #define bch2_bkey_ops_btree_ptr (struct bkey_ops) {            \
        .key_invalid    = bch2_btree_ptr_invalid,               \
-       .key_debugcheck = bch2_btree_ptr_debugcheck,            \
        .val_to_text    = bch2_btree_ptr_to_text,               \
        .swab           = bch2_ptr_swab,                        \
 }
 
 #define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) {         \
-       .key_invalid    = bch2_btree_ptr_invalid,               \
-       .key_debugcheck = bch2_btree_ptr_debugcheck,            \
+       .key_invalid    = bch2_btree_ptr_v2_invalid,            \
        .val_to_text    = bch2_btree_ptr_v2_to_text,            \
        .swab           = bch2_ptr_swab,                        \
        .compat         = bch2_btree_ptr_v2_compat,             \
@@ -395,14 +393,12 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
 /* KEY_TYPE_extent: */
 
 const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c);
-void bch2_extent_debugcheck(struct bch_fs *, struct bkey_s_c);
 void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 enum merge_result bch2_extent_merge(struct bch_fs *,
                                    struct bkey_s, struct bkey_s);
 
 #define bch2_bkey_ops_extent (struct bkey_ops) {               \
        .key_invalid    = bch2_extent_invalid,                  \
-       .key_debugcheck = bch2_extent_debugcheck,               \
        .val_to_text    = bch2_extent_to_text,                  \
        .swab           = bch2_ptr_swab,                        \
        .key_normalize  = bch2_extent_normalize,                \
@@ -538,12 +534,15 @@ unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
 unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c);
 bool bch2_bkey_is_incompressible(struct bkey_s_c);
 unsigned bch2_bkey_sectors_compressed(struct bkey_s_c);
-bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned);
+bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned, bool);
+
+unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c);
 unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
 
 void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s,
                                    unsigned, unsigned);
 
+void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *);
 void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr);
 void bch2_extent_ptr_decoded_append(struct bkey_i *,
                                    struct extent_ptr_decoded *);
@@ -583,6 +582,24 @@ void bch2_ptr_swab(struct bkey_s);
 
 /* Generic extent code: */
 
+enum bch_extent_overlap {
+       BCH_EXTENT_OVERLAP_ALL          = 0,
+       BCH_EXTENT_OVERLAP_BACK         = 1,
+       BCH_EXTENT_OVERLAP_FRONT        = 2,
+       BCH_EXTENT_OVERLAP_MIDDLE       = 3,
+};
+
+/* Returns how k overlaps with m */
+static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
+                                                         const struct bkey *m)
+{
+       int cmp1 = bkey_cmp(k->p, m->p) < 0;
+       int cmp2 = bkey_cmp(bkey_start_pos(k),
+                           bkey_start_pos(m)) > 0;
+
+       return (cmp1 << 1) + cmp2;
+}
+
 int bch2_cut_front_s(struct bpos, struct bkey_s);
 int bch2_cut_back_s(struct bpos, struct bkey_s);
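
bch2_extent_overlap(), added to this header, packs two comparisons into a two-bit index: bit 1 is "k ends before m ends", bit 0 is "k starts after m starts", and the four combinations are exactly ALL, BACK, FRONT and MIDDLE. A one-dimensional model with plain integer ranges:

#include <stdint.h>
#include <stdio.h>

/* 1-D model of bch2_extent_overlap(): how does extent k overlap extent m? */
enum overlap { OVERLAP_ALL, OVERLAP_BACK, OVERLAP_FRONT, OVERLAP_MIDDLE };

struct ext { uint64_t start, end; };    /* half-open [start, end) */

static enum overlap extent_overlap(struct ext k, struct ext m)
{
        int cmp1 = k.end < m.end;       /* k stops short of m's end */
        int cmp2 = k.start > m.start;   /* k begins after m's start */

        return (enum overlap)((cmp1 << 1) + cmp2);
}

int main(void)
{
        struct ext m = { 10, 20 };
        static const char *names[] = { "all", "back", "front", "middle" };

        printf("%s\n", names[extent_overlap((struct ext){  0, 30 }, m)]); /* all    */
        printf("%s\n", names[extent_overlap((struct ext){ 15, 30 }, m)]); /* back   */
        printf("%s\n", names[extent_overlap((struct ext){  0, 15 }, m)]); /* front  */
        printf("%s\n", names[extent_overlap((struct ext){ 12, 18 }, m)]); /* middle */
        return 0;
}
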
 
index 878419d409927c7d33902993d0f37a4b5357e362..281a6135e59977f7d057c8b7958a85ec87435dfd 100644 (file)
@@ -20,8 +20,10 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
 {
        struct bch_fs *c = trans->c;
        struct btree_iter *dir_iter = NULL;
+       struct btree_iter *inode_iter = NULL;
        struct bch_hash_info hash = bch2_hash_info_init(c, new_inode);
-       u64 now = bch2_current_time(trans->c);
+       u64 now = bch2_current_time(c);
+       u64 dir_offset = 0;
        int ret;
 
        dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT);
@@ -34,9 +36,8 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
        if (!name)
                new_inode->bi_flags |= BCH_INODE_UNLINKED;
 
-       ret = bch2_inode_create(trans, new_inode,
-                               BLOCKDEV_INODE_MAX, 0,
-                               &c->unused_inode_hint);
+       inode_iter = bch2_inode_create(trans, new_inode, U32_MAX);
+       ret = PTR_ERR_OR_ZERO(inode_iter);
        if (ret)
                goto err;
 
@@ -68,11 +69,24 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
                ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
                                         mode_to_type(new_inode->bi_mode),
                                         name, new_inode->bi_inum,
+                                        &dir_offset,
                                         BCH_HASH_SET_MUST_CREATE);
                if (ret)
                        goto err;
        }
+
+       if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
+               new_inode->bi_dir               = dir_u->bi_inum;
+               new_inode->bi_dir_offset        = dir_offset;
+       }
+
+       /* XXX use bch2_btree_iter_set_snapshot() */
+       inode_iter->snapshot = U32_MAX;
+       bch2_btree_iter_set_pos(inode_iter, SPOS(0, new_inode->bi_inum, U32_MAX));
+
+       ret = bch2_inode_write(trans, inode_iter, new_inode);
 err:
+       bch2_trans_iter_put(trans, inode_iter);
        bch2_trans_iter_put(trans, dir_iter);
        return ret;
 }
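
The create path above (and the link and rename paths in the hunks that follow) now gets the dirent's position back out of bch2_dirent_create() through the new dir_offset out-parameter and stores it in the inode as a backpointer, but only once the superblock version says inode backpointers exist on disk. A toy model of that version-gated update; the names, the version number and dirent_create() below are illustrative stand-ins, not the real API:

#include <stdint.h>
#include <stdio.h>

/* Illustrative version constant, not the real bcachefs_metadata_version_*. */
#define VERSION_INODE_BACKPOINTERS 14

struct inode_u { uint64_t bi_inum, bi_dir, bi_dir_offset; };

static uint64_t dirent_create(uint64_t dir_inum, const char *name)
{
        (void)name;
        /* pretend the hashed dirent landed at this offset in the directory */
        return dir_inum * 1000 + 42;
}

static void maybe_set_backpointer(unsigned sb_version, struct inode_u *inode,
                                  uint64_t dir_inum, uint64_t dir_offset)
{
        if (sb_version >= VERSION_INODE_BACKPOINTERS) {
                inode->bi_dir        = dir_inum;
                inode->bi_dir_offset = dir_offset;
        }
}

int main(void)
{
        struct inode_u inode = { .bi_inum = 4096 };
        uint64_t dir_offset = dirent_create(2, "hello.txt");

        maybe_set_backpointer(15, &inode, 2, dir_offset);
        printf("bi_dir=%llu bi_dir_offset=%llu\n",
               (unsigned long long)inode.bi_dir,
               (unsigned long long)inode.bi_dir_offset);
        return 0;
}
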
@@ -81,9 +95,11 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
                    u64 inum, struct bch_inode_unpacked *dir_u,
                    struct bch_inode_unpacked *inode_u, const struct qstr *name)
 {
+       struct bch_fs *c = trans->c;
        struct btree_iter *dir_iter = NULL, *inode_iter = NULL;
        struct bch_hash_info dir_hash;
-       u64 now = bch2_current_time(trans->c);
+       u64 now = bch2_current_time(c);
+       u64 dir_offset = 0;
        int ret;
 
        inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT);
@@ -94,6 +110,8 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
        inode_u->bi_ctime = now;
        bch2_inode_nlink_inc(inode_u);
 
+       inode_u->bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED;
+
        dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0);
        ret = PTR_ERR_OR_ZERO(dir_iter);
        if (ret)
@@ -101,12 +119,21 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
 
        dir_u->bi_mtime = dir_u->bi_ctime = now;
 
-       dir_hash = bch2_hash_info_init(trans->c, dir_u);
+       dir_hash = bch2_hash_info_init(c, dir_u);
 
-       ret =   bch2_dirent_create(trans, dir_inum, &dir_hash,
-                                 mode_to_type(inode_u->bi_mode),
-                                 name, inum, BCH_HASH_SET_MUST_CREATE) ?:
-               bch2_inode_write(trans, dir_iter, dir_u) ?:
+       ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
+                                mode_to_type(inode_u->bi_mode),
+                                name, inum, &dir_offset,
+                                BCH_HASH_SET_MUST_CREATE);
+       if (ret)
+               goto err;
+
+       if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
+               inode_u->bi_dir         = dir_inum;
+               inode_u->bi_dir_offset  = dir_offset;
+       }
+
+       ret =   bch2_inode_write(trans, dir_iter, dir_u) ?:
                bch2_inode_write(trans, inode_iter, inode_u);
 err:
        bch2_trans_iter_put(trans, dir_iter);
@@ -119,10 +146,11 @@ int bch2_unlink_trans(struct btree_trans *trans,
                      struct bch_inode_unpacked *inode_u,
                      const struct qstr *name)
 {
+       struct bch_fs *c = trans->c;
        struct btree_iter *dir_iter = NULL, *dirent_iter = NULL,
                          *inode_iter = NULL;
        struct bch_hash_info dir_hash;
-       u64 inum, now = bch2_current_time(trans->c);
+       u64 inum, now = bch2_current_time(c);
        struct bkey_s_c k;
        int ret;
 
@@ -131,7 +159,7 @@ int bch2_unlink_trans(struct btree_trans *trans,
        if (ret)
                goto err;
 
-       dir_hash = bch2_hash_info_init(trans->c, dir_u);
+       dir_hash = bch2_hash_info_init(c, dir_u);
 
        dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash,
                                                 name, BTREE_ITER_INTENT);
@@ -197,10 +225,12 @@ int bch2_rename_trans(struct btree_trans *trans,
                      const struct qstr *dst_name,
                      enum bch_rename_mode mode)
 {
+       struct bch_fs *c = trans->c;
        struct btree_iter *src_dir_iter = NULL, *dst_dir_iter = NULL;
        struct btree_iter *src_inode_iter = NULL, *dst_inode_iter = NULL;
        struct bch_hash_info src_hash, dst_hash;
-       u64 src_inode, dst_inode, now = bch2_current_time(trans->c);
+       u64 src_inode, src_offset, dst_inode, dst_offset;
+       u64 now = bch2_current_time(c);
        int ret;
 
        src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir,
@@ -209,7 +239,7 @@ int bch2_rename_trans(struct btree_trans *trans,
        if (ret)
                goto err;
 
-       src_hash = bch2_hash_info_init(trans->c, src_dir_u);
+       src_hash = bch2_hash_info_init(c, src_dir_u);
 
        if (dst_dir != src_dir) {
                dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir,
@@ -218,7 +248,7 @@ int bch2_rename_trans(struct btree_trans *trans,
                if (ret)
                        goto err;
 
-               dst_hash = bch2_hash_info_init(trans->c, dst_dir_u);
+               dst_hash = bch2_hash_info_init(c, dst_dir_u);
        } else {
                dst_dir_u = src_dir_u;
                dst_hash = src_hash;
@@ -227,8 +257,8 @@ int bch2_rename_trans(struct btree_trans *trans,
        ret = bch2_dirent_rename(trans,
                                 src_dir, &src_hash,
                                 dst_dir, &dst_hash,
-                                src_name, &src_inode,
-                                dst_name, &dst_inode,
+                                src_name, &src_inode, &src_offset,
+                                dst_name, &dst_inode, &dst_offset,
                                 mode);
        if (ret)
                goto err;
@@ -247,6 +277,16 @@ int bch2_rename_trans(struct btree_trans *trans,
                        goto err;
        }
 
+       if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
+               src_inode_u->bi_dir             = dst_dir_u->bi_inum;
+               src_inode_u->bi_dir_offset      = dst_offset;
+
+               if (mode == BCH_RENAME_EXCHANGE) {
+                       dst_inode_u->bi_dir             = src_dir_u->bi_inum;
+                       dst_inode_u->bi_dir_offset      = src_offset;
+               }
+       }
+
        if (mode == BCH_RENAME_OVERWRITE) {
                if (S_ISDIR(src_inode_u->bi_mode) !=
                    S_ISDIR(dst_inode_u->bi_mode)) {
index 3aed2ca4dcedbef12bb94da817886cf17443605d..1a94e7f7cd9615c8eb246c18748808ce2606eee8 100644 (file)
@@ -3,7 +3,7 @@
 
 #include "bcachefs.h"
 #include "alloc_foreground.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
 #include "btree_update.h"
 #include "buckets.h"
 #include "clock.h"
 #include <trace/events/bcachefs.h>
 #include <trace/events/writeback.h>
 
+static inline struct address_space *faults_disabled_mapping(void)
+{
+       return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL);
+}
+
+static inline void set_fdm_dropped_locks(void)
+{
+       current->faults_disabled_mapping =
+               (void *) (((unsigned long) current->faults_disabled_mapping)|1);
+}
+
+static inline bool fdm_dropped_locks(void)
+{
+       return ((unsigned long) current->faults_disabled_mapping) & 1;
+}
+
 struct quota_res {
        u64                             sectors;
 };
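
The three helpers added at the top of this file stash a flag in the low bit of current->faults_disabled_mapping: the pointer is aligned, so bit 0 is always free, faults_disabled_mapping() masks it off, set_fdm_dropped_locks() sets it, and fdm_dropped_locks() tests it, letting the page-fault handler tell the O_DIRECT write path that it had to drop and retake ei_pagecache_lock. A minimal userspace model of that pointer-tagging trick:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* One stashed pointer, with bit 0 borrowed as a "locks were dropped" flag. */
static void *stash;

static void *stashed_mapping(void)
{
        return (void *)((uintptr_t)stash & ~(uintptr_t)1);
}

static void set_dropped_locks(void)
{
        stash = (void *)((uintptr_t)stash | 1);
}

static int dropped_locks(void)
{
        return (uintptr_t)stash & 1;
}

int main(void)
{
        static long mapping;    /* aligned object standing in for an address_space */

        stash = &mapping;
        assert(stashed_mapping() == &mapping && !dropped_locks());

        set_dropped_locks();    /* fault handler had to drop locks */
        assert(stashed_mapping() == &mapping && dropped_locks());

        printf("pointer survives, flag rides in bit 0\n");
        return 0;
}
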
@@ -68,6 +84,7 @@ struct dio_read {
        struct closure                  cl;
        struct kiocb                    *req;
        long                            ret;
+       bool                            should_dirty;
        struct bch_read_bio             rbio;
 };
 
@@ -265,28 +282,13 @@ static inline struct bch_page_state *bch2_page_state(struct page *page)
 /* for newly allocated pages: */
 static void __bch2_page_state_release(struct page *page)
 {
-       struct bch_page_state *s = __bch2_page_state(page);
-
-       if (!s)
-               return;
-
-       ClearPagePrivate(page);
-       set_page_private(page, 0);
-       put_page(page);
-       kfree(s);
+       kfree(detach_page_private(page));
 }
 
 static void bch2_page_state_release(struct page *page)
 {
-       struct bch_page_state *s = bch2_page_state(page);
-
-       if (!s)
-               return;
-
-       ClearPagePrivate(page);
-       set_page_private(page, 0);
-       put_page(page);
-       kfree(s);
+       EBUG_ON(!PageLocked(page));
+       __bch2_page_state_release(page);
 }
 
 /* for newly allocated pages: */
@@ -300,13 +302,7 @@ static struct bch_page_state *__bch2_page_state_create(struct page *page,
                return NULL;
 
        spin_lock_init(&s->lock);
-       /*
-        * migrate_page_move_mapping() assumes that pages with private data
-        * have their count elevated by 1.
-        */
-       get_page(page);
-       set_page_private(page, (unsigned long) s);
-       SetPagePrivate(page);
+       attach_page_private(page, s);
        return s;
 }
 
@@ -514,10 +510,35 @@ static void bch2_set_page_dirty(struct bch_fs *c,
 vm_fault_t bch2_page_fault(struct vm_fault *vmf)
 {
        struct file *file = vmf->vma->vm_file;
+       struct address_space *mapping = file->f_mapping;
+       struct address_space *fdm = faults_disabled_mapping();
        struct bch_inode_info *inode = file_bch_inode(file);
        int ret;
 
+       if (fdm == mapping)
+               return VM_FAULT_SIGBUS;
+
+       /* Lock ordering: */
+       if (fdm > mapping) {
+               struct bch_inode_info *fdm_host = to_bch_ei(fdm->host);
+
+               if (bch2_pagecache_add_tryget(&inode->ei_pagecache_lock))
+                       goto got_lock;
+
+               bch2_pagecache_block_put(&fdm_host->ei_pagecache_lock);
+
+               bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+               bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+
+               bch2_pagecache_block_get(&fdm_host->ei_pagecache_lock);
+
+               /* Signal that lock has been dropped: */
+               set_fdm_dropped_locks();
+               return VM_FAULT_SIGBUS;
+       }
+
        bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+got_lock:
        ret = filemap_fault(vmf);
        bch2_pagecache_add_put(&inode->ei_pagecache_lock);
 
@@ -608,14 +629,8 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
        if (ret != MIGRATEPAGE_SUCCESS)
                return ret;
 
-       if (PagePrivate(page)) {
-               ClearPagePrivate(page);
-               get_page(newpage);
-               set_page_private(newpage, page_private(page));
-               set_page_private(page, 0);
-               put_page(page);
-               SetPagePrivate(newpage);
-       }
+       if (PagePrivate(page))
+               attach_page_private(newpage, detach_page_private(page));
 
        if (mode != MIGRATE_SYNC_NO_COPY)
                migrate_page_copy(newpage, page);
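
The page-state and migration hooks above replace the open-coded get_page()/set_page_private()/SetPagePrivate() dances with the kernel's attach_page_private()/detach_page_private() helpers, which bundle the extra page reference and the PagePrivate flag with the stored pointer. A userspace model of the bookkeeping those helpers perform (struct page here is a toy, not the kernel's):

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

struct page { int refcount; int has_private; void *private; };

static void attach_private(struct page *page, void *data)
{
        page->refcount++;               /* get_page()         */
        page->private = data;           /* set_page_private() */
        page->has_private = 1;          /* SetPagePrivate()   */
}

static void *detach_private(struct page *page)
{
        void *data = page->private;

        page->has_private = 0;
        page->private = NULL;
        page->refcount--;               /* put_page() */
        return data;
}

int main(void)
{
        struct page page = { .refcount = 1 };
        int *state = malloc(sizeof(*state));

        attach_private(&page, state);
        assert(page.refcount == 2);

        /* same shape as kfree(detach_page_private(page)) in the hunk above */
        free(detach_private(&page));
        assert(page.refcount == 1 && !page.has_private);
        printf("ok\n");
        return 0;
}
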
@@ -647,41 +662,33 @@ static void bch2_readpages_end_io(struct bio *bio)
        bio_put(bio);
 }
 
-static inline void page_state_init_for_read(struct page *page)
-{
-       SetPagePrivate(page);
-       page->private = 0;
-}
-
 struct readpages_iter {
        struct address_space    *mapping;
        struct page             **pages;
        unsigned                nr_pages;
-       unsigned                nr_added;
        unsigned                idx;
        pgoff_t                 offset;
 };
 
 static int readpages_iter_init(struct readpages_iter *iter,
-                              struct address_space *mapping,
-                              struct list_head *pages, unsigned nr_pages)
+                              struct readahead_control *ractl)
 {
+       unsigned i, nr_pages = readahead_count(ractl);
+
        memset(iter, 0, sizeof(*iter));
 
-       iter->mapping   = mapping;
-       iter->offset    = list_last_entry(pages, struct page, lru)->index;
+       iter->mapping   = ractl->mapping;
+       iter->offset    = readahead_index(ractl);
+       iter->nr_pages  = nr_pages;
 
        iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS);
        if (!iter->pages)
                return -ENOMEM;
 
-       while (!list_empty(pages)) {
-               struct page *page = list_last_entry(pages, struct page, lru);
-
-               __bch2_page_state_create(page, __GFP_NOFAIL);
-
-               iter->pages[iter->nr_pages++] = page;
-               list_del(&page->lru);
+       nr_pages = __readahead_batch(ractl, iter->pages, nr_pages);
+       for (i = 0; i < nr_pages; i++) {
+               __bch2_page_state_create(iter->pages[i], __GFP_NOFAIL);
+               put_page(iter->pages[i]);
        }
 
        return 0;
@@ -689,41 +696,9 @@ static int readpages_iter_init(struct readpages_iter *iter,
 
 static inline struct page *readpage_iter_next(struct readpages_iter *iter)
 {
-       struct page *page;
-       unsigned i;
-       int ret;
-
-       BUG_ON(iter->idx > iter->nr_added);
-       BUG_ON(iter->nr_added > iter->nr_pages);
-
-       if (iter->idx < iter->nr_added)
-               goto out;
-
-       while (1) {
-               if (iter->idx == iter->nr_pages)
-                       return NULL;
-
-               ret = add_to_page_cache_lru_vec(iter->mapping,
-                               iter->pages     + iter->nr_added,
-                               iter->nr_pages  - iter->nr_added,
-                               iter->offset    + iter->nr_added,
-                               GFP_NOFS);
-               if (ret > 0)
-                       break;
-
-               page = iter->pages[iter->nr_added];
-               iter->idx++;
-               iter->nr_added++;
-
-               __bch2_page_state_release(page);
-               put_page(page);
-       }
-
-       iter->nr_added += ret;
+       if (iter->idx >= iter->nr_pages)
+               return NULL;
 
-       for (i = iter->idx; i < iter->nr_added; i++)
-               put_page(iter->pages[i]);
-out:
        EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx);
 
        return iter->pages[iter->idx];
@@ -817,7 +792,7 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter,
                       struct readpages_iter *readpages_iter)
 {
        struct bch_fs *c = trans->c;
-       struct bkey_on_stack sk;
+       struct bkey_buf sk;
        int flags = BCH_READ_RETRY_IF_STALE|
                BCH_READ_MAY_PROMOTE;
        int ret = 0;
@@ -825,11 +800,12 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter,
        rbio->c = c;
        rbio->start_time = local_clock();
 
-       bkey_on_stack_init(&sk);
+       bch2_bkey_buf_init(&sk);
 retry:
        while (1) {
                struct bkey_s_c k;
                unsigned bytes, sectors, offset_into_extent;
+               enum btree_id data_btree = BTREE_ID_extents;
 
                bch2_btree_iter_set_pos(iter,
                                POS(inum, rbio->bio.bi_iter.bi_sector));
@@ -843,9 +819,9 @@ retry:
                        bkey_start_offset(k.k);
                sectors = k.k->size - offset_into_extent;
 
-               bkey_on_stack_reassemble(&sk, c, k);
+               bch2_bkey_buf_reassemble(&sk, c, k);
 
-               ret = bch2_read_indirect_extent(trans,
+               ret = bch2_read_indirect_extent(trans, &data_btree,
                                        &offset_into_extent, &sk);
                if (ret)
                        break;
@@ -869,7 +845,8 @@ retry:
                if (bkey_extent_is_allocation(k.k))
                        bch2_add_page_sectors(&rbio->bio, k);
 
-               bch2_read_extent(trans, rbio, k, offset_into_extent, flags);
+               bch2_read_extent(trans, rbio, iter->pos,
+                                data_btree, k, offset_into_extent, flags);
 
                if (flags & BCH_READ_LAST_FRAGMENT)
                        break;
@@ -882,17 +859,18 @@ retry:
                goto retry;
 
        if (ret) {
-               bcache_io_error(c, &rbio->bio, "btree IO error %i", ret);
+               bch_err_inum_ratelimited(c, inum,
+                               "read error %i from btree lookup", ret);
+               rbio->bio.bi_status = BLK_STS_IOERR;
                bio_endio(&rbio->bio);
        }
 
-       bkey_on_stack_exit(&sk, c);
+       bch2_bkey_buf_exit(&sk, c);
 }
 
-int bch2_readpages(struct file *file, struct address_space *mapping,
-                  struct list_head *pages, unsigned nr_pages)
+void bch2_readahead(struct readahead_control *ractl)
 {
-       struct bch_inode_info *inode = to_bch_ei(mapping->host);
+       struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
        struct btree_trans trans;
@@ -901,12 +879,11 @@ int bch2_readpages(struct file *file, struct address_space *mapping,
        struct readpages_iter readpages_iter;
        int ret;
 
-       ret = readpages_iter_init(&readpages_iter, mapping, pages, nr_pages);
+       ret = readpages_iter_init(&readpages_iter, ractl);
        BUG_ON(ret);
 
        bch2_trans_init(&trans, c, 0, 0);
-
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN,
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN,
                                   BTREE_ITER_SLOTS);
 
        bch2_pagecache_add_get(&inode->ei_pagecache_lock);
@@ -934,10 +911,9 @@ int bch2_readpages(struct file *file, struct address_space *mapping,
 
        bch2_pagecache_add_put(&inode->ei_pagecache_lock);
 
+       bch2_trans_iter_put(&trans, iter);
        bch2_trans_exit(&trans);
        kfree(readpages_iter.pages);
-
-       return 0;
 }
 
 static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
@@ -954,11 +930,12 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
        BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));
 
        bch2_trans_init(&trans, c, 0, 0);
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN,
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN,
                                   BTREE_ITER_SLOTS);
 
        bchfs_read(&trans, iter, rbio, inum, NULL);
 
+       bch2_trans_iter_put(&trans, iter);
        bch2_trans_exit(&trans);
 }
 
@@ -1042,6 +1019,8 @@ static void bch2_writepage_io_done(struct closure *cl)
        unsigned i;
 
        if (io->op.error) {
+               set_bit(EI_INODE_ERROR, &io->inode->ei_flags);
+
                bio_for_each_segment_all(bvec, bio, iter) {
                        struct bch_page_state *s;
 
@@ -1644,12 +1623,22 @@ again:
 
 /* O_DIRECT reads */
 
+static void bio_check_or_release(struct bio *bio, bool check_dirty)
+{
+       if (check_dirty) {
+               bio_check_pages_dirty(bio);
+       } else {
+               bio_release_pages(bio, false);
+               bio_put(bio);
+       }
+}
+
 static void bch2_dio_read_complete(struct closure *cl)
 {
        struct dio_read *dio = container_of(cl, struct dio_read, cl);
 
        dio->req->ki_complete(dio->req, dio->ret, 0);
-       bio_check_pages_dirty(&dio->rbio.bio);  /* transfers ownership */
+       bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
 }
 
 static void bch2_direct_IO_read_endio(struct bio *bio)
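
The new dio->should_dirty flag is decided once when the request is built (from iter_is_iovec(), in a later hunk of this file) and then consulted by every completion path through bio_check_or_release(): user pages get re-dirtied, kernel-internal pages (for example from the loopback driver) are only released, since dirtying them can deadlock on the page lock. A toy model of that single-decision shape; the enum and struct below are illustrative, not the kernel's iov_iter types:

#include <stdbool.h>
#include <stdio.h>

enum iter_type { ITER_IOVEC, ITER_BVEC };       /* stand-ins for iov_iter kinds */

struct dio_model {
        enum iter_type type;
        bool           should_dirty;
};

/* Completion path: one flag decides dirty-and-release vs. release-only. */
static void dio_read_complete(const struct dio_model *dio)
{
        if (dio->should_dirty)
                printf("user pages: mark dirty, then release\n");
        else
                printf("kernel pages (e.g. loopback): release only\n");
}

int main(void)
{
        struct dio_model user_read = { .type = ITER_IOVEC };
        struct dio_model loop_read = { .type = ITER_BVEC };

        /* decided once at submission, mirroring should_dirty = iter_is_iovec(iter) */
        user_read.should_dirty = user_read.type == ITER_IOVEC;
        loop_read.should_dirty = loop_read.type == ITER_IOVEC;

        dio_read_complete(&user_read);
        dio_read_complete(&loop_read);
        return 0;
}
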
@@ -1664,8 +1653,11 @@ static void bch2_direct_IO_read_endio(struct bio *bio)
 
 static void bch2_direct_IO_read_split_endio(struct bio *bio)
 {
+       struct dio_read *dio = bio->bi_private;
+       bool should_dirty = dio->should_dirty;
+
        bch2_direct_IO_read_endio(bio);
-       bio_check_pages_dirty(bio);     /* transfers ownership */
+       bio_check_or_release(bio, should_dirty);
 }
 
 static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
@@ -1719,6 +1711,12 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
 
        dio->req        = req;
        dio->ret        = ret;
+       /*
+        * This is one of the sketchier things I've encountered: we have to skip
+        * the dirtying of requests that are internal from the kernel (i.e. from
+        * loopback), because we'll deadlock on page_lock.
+        */
+       dio->should_dirty = iter_is_iovec(iter);
 
        goto start;
        while (iter->count) {
@@ -1740,7 +1738,9 @@ start:
                }
 
                offset += bio->bi_iter.bi_size;
-               bio_set_pages_dirty(bio);
+
+               if (dio->should_dirty)
+                       bio_set_pages_dirty(bio);
 
                if (iter->count)
                        closure_get(&dio->cl);
@@ -1754,7 +1754,7 @@ start:
                closure_sync(&dio->cl);
                closure_debug_destroy(&dio->cl);
                ret = dio->ret;
-               bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */
+               bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
                return ret;
        } else {
                return -EIOCBQUEUED;
@@ -1812,14 +1812,16 @@ static long bch2_dio_write_loop(struct dio_write *dio)
        struct bio *bio = &dio->op.wbio.bio;
        struct bvec_iter_all iter;
        struct bio_vec *bv;
-       unsigned unaligned;
-       bool sync = dio->sync;
+       unsigned unaligned, iter_count;
+       bool sync = dio->sync, dropped_locks;
        long ret;
 
        if (dio->loop)
                goto loop;
 
        while (1) {
+               iter_count = dio->iter.count;
+
                if (kthread)
                        kthread_use_mm(dio->mm);
                BUG_ON(current->faults_disabled_mapping);
@@ -1827,13 +1829,34 @@ static long bch2_dio_write_loop(struct dio_write *dio)
 
                ret = bio_iov_iter_get_pages(bio, &dio->iter);
 
+               dropped_locks = fdm_dropped_locks();
+
                current->faults_disabled_mapping = NULL;
                if (kthread)
                        kthread_unuse_mm(dio->mm);
 
+               /*
+                * If the fault handler returned an error but also signalled
+                * that it dropped & retook ei_pagecache_lock, we just need to
+                * re-shoot down the page cache and retry:
+                */
+               if (dropped_locks && ret)
+                       ret = 0;
+
                if (unlikely(ret < 0))
                        goto err;
 
+               if (unlikely(dropped_locks)) {
+                       ret = write_invalidate_inode_pages_range(mapping,
+                                       req->ki_pos,
+                                       req->ki_pos + iter_count - 1);
+                       if (unlikely(ret))
+                               goto err;
+
+                       if (!bio->bi_iter.bi_size)
+                               continue;
+               }
+
                unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1);
                bio->bi_iter.bi_size -= unaligned;
                iov_iter_revert(&dio->iter, unaligned);
@@ -1865,7 +1888,9 @@ static long bch2_dio_write_loop(struct dio_write *dio)
                                                dio->op.opts.data_replicas, 0);
                if (unlikely(ret) &&
                    !bch2_check_range_allocated(c, dio->op.pos,
-                               bio_sectors(bio), dio->op.opts.data_replicas))
+                               bio_sectors(bio),
+                               dio->op.opts.data_replicas,
+                               dio->op.opts.compression != 0))
                        goto err;
 
                task_io_account_write(bio->bi_iter.bi_size);
@@ -1908,7 +1933,13 @@ loop:
 
                bio_for_each_segment_all(bv, bio, iter)
                        put_page(bv->bv_page);
-               if (!dio->iter.count || dio->op.error)
+
+               if (dio->op.error) {
+                       set_bit(EI_INODE_ERROR, &inode->ei_flags);
+                       break;
+               }
+
+               if (!dio->iter.count)
                        break;
 
                bio_reset(bio);
@@ -2104,7 +2135,7 @@ static inline int range_has_data(struct bch_fs *c,
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, start, 0, k, ret) {
+       for_each_btree_key(&trans, iter, BTREE_ID_extents, start, 0, k, ret) {
                if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
                        break;
 
@@ -2113,6 +2144,7 @@ static inline int range_has_data(struct bch_fs *c,
                        break;
                }
        }
+       bch2_trans_iter_put(&trans, iter);
 
        return bch2_trans_exit(&trans) ?: ret;
 }
@@ -2282,6 +2314,7 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
        bch2_trans_init(&trans, c, 0, 0);
        iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, 0);
        ret = PTR_ERR_OR_ZERO(iter);
+       bch2_trans_iter_put(&trans, iter);
        bch2_trans_exit(&trans);
 
        if (ret)
@@ -2296,7 +2329,8 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
        if (ret)
                goto err;
 
-       BUG_ON(inode->v.i_size < inode_u.bi_size);
+       WARN_ON(!test_bit(EI_INODE_ERROR, &inode->ei_flags) &&
+               inode->v.i_size < inode_u.bi_size);
 
        if (iattr->ia_size > inode->v.i_size) {
                ret = bch2_extend(inode, &inode_u, iattr);
@@ -2409,19 +2443,16 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
 {
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct address_space *mapping = inode->v.i_mapping;
-       struct bkey_on_stack copy;
+       struct bkey_buf copy;
        struct btree_trans trans;
-       struct btree_iter *src, *dst;
+       struct btree_iter *src, *dst, *del;
        loff_t shift, new_size;
        u64 src_start;
-       int ret;
+       int ret = 0;
 
        if ((offset | len) & (block_bytes(c) - 1))
                return -EINVAL;
 
-       bkey_on_stack_init(&copy);
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256);
-
        /*
         * We need i_mutex to keep the page cache consistent with the extents
         * btree, and the btree consistent with i_size - we don't need outside
@@ -2477,15 +2508,15 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
                        goto err;
        }
 
-       src = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+       bch2_bkey_buf_init(&copy);
+       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256);
+       src = bch2_trans_get_iter(&trans, BTREE_ID_extents,
                        POS(inode->v.i_ino, src_start >> 9),
                        BTREE_ITER_INTENT);
-       BUG_ON(IS_ERR_OR_NULL(src));
-
        dst = bch2_trans_copy_iter(&trans, src);
-       BUG_ON(IS_ERR_OR_NULL(dst));
+       del = bch2_trans_copy_iter(&trans, src);
 
-       while (1) {
+       while (ret == 0 || ret == -EINTR) {
                struct disk_reservation disk_res =
                        bch2_disk_reservation_init(c, 0);
                struct bkey_i delete;
@@ -2499,18 +2530,16 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
                        ? bch2_btree_iter_peek_prev(src)
                        : bch2_btree_iter_peek(src);
                if ((ret = bkey_err(k)))
-                       goto bkey_err;
+                       continue;
 
                if (!k.k || k.k->p.inode != inode->v.i_ino)
                        break;
 
-               BUG_ON(bkey_cmp(src->pos, bkey_start_pos(k.k)));
-
                if (insert &&
                    bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0)
                        break;
 reassemble:
-               bkey_on_stack_reassemble(&copy, c, k);
+               bch2_bkey_buf_reassemble(&copy, c, k);
 
                if (insert &&
                    bkey_cmp(bkey_start_pos(k.k), move_pos) < 0)
@@ -2521,7 +2550,7 @@ reassemble:
 
                ret = bch2_extent_atomic_end(dst, copy.k, &atomic_end);
                if (ret)
-                       goto bkey_err;
+                       continue;
 
                if (bkey_cmp(atomic_end, copy.k->k.p)) {
                        if (insert) {
@@ -2537,6 +2566,7 @@ reassemble:
                delete.k.p = copy.k->k.p;
                delete.k.size = copy.k->k.size;
                delete.k.p.offset -= shift >> 9;
+               bch2_btree_iter_set_pos(del, bkey_start_pos(&delete.k));
 
                next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p;
 
@@ -2557,26 +2587,24 @@ reassemble:
                        BUG_ON(ret);
                }
 
-               bch2_btree_iter_set_pos(src, bkey_start_pos(&delete.k));
-
-               ret =   bch2_trans_update(&trans, src, &delete, trigger_flags) ?:
+               ret =   bch2_trans_update(&trans, del, &delete, trigger_flags) ?:
                        bch2_trans_update(&trans, dst, copy.k, trigger_flags) ?:
                        bch2_trans_commit(&trans, &disk_res,
                                          &inode->ei_journal_seq,
                                          BTREE_INSERT_NOFAIL);
                bch2_disk_reservation_put(c, &disk_res);
-bkey_err:
+
                if (!ret)
                        bch2_btree_iter_set_pos(src, next_pos);
-
-               if (ret == -EINTR)
-                       ret = 0;
-               if (ret)
-                       goto err;
-
-               bch2_trans_cond_resched(&trans);
        }
-       bch2_trans_unlock(&trans);
+       bch2_trans_iter_put(&trans, del);
+       bch2_trans_iter_put(&trans, dst);
+       bch2_trans_iter_put(&trans, src);
+       bch2_trans_exit(&trans);
+       bch2_bkey_buf_exit(&copy, c);
+
+       if (ret)
+               goto err;
 
        if (!insert) {
                i_size_write(&inode->v, new_size);
@@ -2586,8 +2614,6 @@ bkey_err:
                mutex_unlock(&inode->ei_update_lock);
        }
 err:
-       bch2_trans_exit(&trans);
-       bkey_on_stack_exit(&copy, c);
        bch2_pagecache_block_put(&inode->ei_pagecache_lock);
        inode_unlock(&inode->v);
        return ret;
@@ -2637,12 +2663,12 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode,
                truncate_pagecache_range(&inode->v, offset, end - 1);
        }
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
                        POS(inode->v.i_ino, block_start >> 9),
                        BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
        end_pos = POS(inode->v.i_ino, block_end >> 9);
 
-       while (bkey_cmp(iter->pos, end_pos) < 0) {
+       while (!ret && bkey_cmp(iter->pos, end_pos) < 0) {
                s64 i_sectors_delta = 0;
                struct disk_reservation disk_res = { 0 };
                struct quota_res quota_res = { 0 };
@@ -2706,9 +2732,11 @@ bkey_err:
                bch2_disk_reservation_put(c, &disk_res);
                if (ret == -EINTR)
                        ret = 0;
-               if (ret)
-                       goto err;
        }
+       bch2_trans_iter_put(&trans, iter);
+
+       if (ret)
+               goto err;
 
        /*
         * Do we need to extend the file?
@@ -2730,6 +2758,7 @@ bkey_err:
                        ret = PTR_ERR_OR_ZERO(inode_iter);
                } while (ret == -EINTR);
 
+               bch2_trans_iter_put(&trans, inode_iter);
                bch2_trans_unlock(&trans);
 
                if (ret)
@@ -2834,9 +2863,6 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
        u64 aligned_len;
        loff_t ret = 0;
 
-       if (!c->opts.reflink)
-               return -EOPNOTSUPP;
-
        if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY))
                return -EINVAL;
 
@@ -2970,7 +2996,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
+       for_each_btree_key(&trans, iter, BTREE_ID_extents,
                           POS(inode->v.i_ino, offset >> 9), 0, k, ret) {
                if (k.k->p.inode != inode->v.i_ino) {
                        break;
@@ -2980,6 +3006,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
                } else if (k.k->p.offset >> 9 > isize)
                        break;
        }
+       bch2_trans_iter_put(&trans, iter);
 
        ret = bch2_trans_exit(&trans) ?: ret;
        if (ret)
@@ -3017,8 +3044,8 @@ static loff_t page_hole_offset(struct address_space *mapping, loff_t offset)
        int pg_offset;
        loff_t ret = -1;
 
-       page = find_lock_entry(mapping, index);
-       if (!page || xa_is_value(page))
+       page = find_lock_page(mapping, index);
+       if (!page)
                return offset;
 
        pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1));
@@ -3065,7 +3092,7 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
+       for_each_btree_key(&trans, iter, BTREE_ID_extents,
                           POS(inode->v.i_ino, offset >> 9),
                           BTREE_ITER_SLOTS, k, ret) {
                if (k.k->p.inode != inode->v.i_ino) {
@@ -3083,6 +3110,7 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
                        offset = max(offset, bkey_start_offset(k.k) << 9);
                }
        }
+       bch2_trans_iter_put(&trans, iter);
 
        ret = bch2_trans_exit(&trans) ?: ret;
        if (ret)
index 7063556d289b7446716d674457b2dd0c32d9387f..2537a3d25ede1dd585f284353c8e2d58d1d21a78 100644
@@ -19,8 +19,7 @@ int bch2_writepage(struct page *, struct writeback_control *);
 int bch2_readpage(struct file *, struct page *);
 
 int bch2_writepages(struct address_space *, struct writeback_control *);
-int bch2_readpages(struct file *, struct address_space *,
-                  struct list_head *, unsigned);
+void bch2_readahead(struct readahead_control *);
 
 int bch2_write_begin(struct file *, struct address_space *, loff_t,
                     unsigned, unsigned, struct page **, void **);
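
The prototype change above tracks the VFS switch from ->readpages() to ->readahead() (the address_space_operations hunk later in this commit wires up .readahead = bch2_readahead). For orientation only -- this is a generic skeleton of the new interface, not bcachefs's implementation -- a ->readahead handler consumes locked, already-in-cache pages via readahead_page() and may stop early, since readahead is best-effort:

static void example_readahead(struct readahead_control *ractl)
{
	struct page *page;

	while ((page = readahead_page(ractl))) {
		/*
		 * A real implementation attaches the page to a bio and lets
		 * the I/O completion path unlock it; this skeleton just gives
		 * the page back unread (legal, since readahead is advisory).
		 */
		unlock_page(page);
		put_page(page);
	}
}
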
index 0873d2f0928cdcffb021969c34e82d1493000d37..eb871634eeae772883292972de80e453dd811c92 100644
@@ -183,6 +183,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c,
                                    struct bch_inode_info *src,
                                    const char __user *name)
 {
+       struct bch_hash_info hash = bch2_hash_info_init(c, &src->ei_inode);
        struct bch_inode_info *dst;
        struct inode *vinode = NULL;
        char *kname = NULL;
@@ -202,8 +203,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c,
        qstr.name       = kname;
 
        ret = -ENOENT;
-       inum = bch2_dirent_lookup(c, src->v.i_ino,
-                                 &src->ei_str_hash,
+       inum = bch2_dirent_lookup(c, src->v.i_ino, &hash,
                                  &qstr);
        if (!inum)
                goto err1;
index 1d66acaca33cf9d35a9ebdff026cbe25819f198e..8034d48c62bb0a9feb273ff4f5bbbf35e8795709 100644
@@ -3,7 +3,7 @@
 
 #include "bcachefs.h"
 #include "acl.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
 #include "btree_update.h"
 #include "buckets.h"
 #include "chardev.h"
@@ -42,6 +42,11 @@ static void journal_seq_copy(struct bch_fs *c,
                             struct bch_inode_info *dst,
                             u64 journal_seq)
 {
+       /*
+        * atomic64_cmpxchg has a fallback for archs that don't support it,
+        * cmpxchg does not:
+        */
+       atomic64_t *dst_seq = (void *) &dst->ei_journal_seq;
        u64 old, v = READ_ONCE(dst->ei_journal_seq);
 
        do {
@@ -49,7 +54,7 @@ static void journal_seq_copy(struct bch_fs *c,
 
                if (old >= journal_seq)
                        break;
-       } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old);
+       } while ((v = atomic64_cmpxchg(dst_seq, old, journal_seq)) != old);
 
        bch2_journal_set_has_inum(&c->journal, dst->v.i_ino, journal_seq);
 }
@@ -86,6 +91,11 @@ void bch2_pagecache_add_put(struct pagecache_lock *lock)
        __pagecache_lock_put(lock, 1);
 }
 
+bool bch2_pagecache_add_tryget(struct pagecache_lock *lock)
+{
+       return __pagecache_lock_tryget(lock, 1);
+}
+
 void bch2_pagecache_add_get(struct pagecache_lock *lock)
 {
        __pagecache_lock_get(lock, 1);
@@ -225,6 +235,13 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum)
        return &inode->v;
 }
 
+static int inum_test(struct inode *inode, void *p)
+{
+       unsigned long *ino = p;
+
+       return *ino == inode->i_ino;
+}
+
 static struct bch_inode_info *
 __bch2_create(struct bch_inode_info *dir, struct dentry *dentry,
              umode_t mode, dev_t rdev, bool tmpfile)
@@ -259,7 +276,8 @@ __bch2_create(struct bch_inode_info *dir, struct dentry *dentry,
        if (!tmpfile)
                mutex_lock(&dir->ei_update_lock);
 
-       bch2_trans_init(&trans, c, 8, 1024);
+       bch2_trans_init(&trans, c, 8,
+                       2048 + (!tmpfile ? dentry->d_name.len : 0));
 retry:
        bch2_trans_begin(&trans);
 
@@ -304,8 +322,12 @@ err_before_quota:
         * thread pulling the inode in and modifying it:
         */
 
-       old = to_bch_ei(insert_inode_locked2(&inode->v));
-       if (unlikely(old)) {
+       inode->v.i_state |= I_CREATING;
+       old = to_bch_ei(inode_insert5(&inode->v, inode->v.i_ino,
+                                     inum_test, NULL, &inode->v.i_ino));
+       BUG_ON(!old);
+
+       if (unlikely(old != inode)) {
                /*
                 * We raced, another process pulled the new inode into cache
                 * before us:
@@ -346,11 +368,11 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
 {
        struct bch_fs *c = vdir->i_sb->s_fs_info;
        struct bch_inode_info *dir = to_bch_ei(vdir);
+       struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
        struct inode *vinode = NULL;
        u64 inum;
 
-       inum = bch2_dirent_lookup(c, dir->v.i_ino,
-                                 &dir->ei_str_hash,
+       inum = bch2_dirent_lookup(c, dir->v.i_ino, &hash,
                                  &dentry->d_name);
 
        if (inum)
@@ -390,16 +412,12 @@ static int __bch2_link(struct bch_fs *c,
        mutex_lock(&inode->ei_update_lock);
        bch2_trans_init(&trans, c, 4, 1024);
 
-       do {
-               bch2_trans_begin(&trans);
-               ret   = bch2_link_trans(&trans,
+       ret = __bch2_trans_do(&trans, NULL, &inode->ei_journal_seq,
+                             BTREE_INSERT_NOUNLOCK,
+                       bch2_link_trans(&trans,
                                        dir->v.i_ino,
                                        inode->v.i_ino, &dir_u, &inode_u,
-                                       &dentry->d_name) ?:
-                       bch2_trans_commit(&trans, NULL,
-                                       &inode->ei_journal_seq,
-                                       BTREE_INSERT_NOUNLOCK);
-       } while (ret == -EINTR);
+                                       &dentry->d_name));
 
        if (likely(!ret)) {
                BUG_ON(inode_u.bi_inum != inode->v.i_ino);
@@ -446,17 +464,12 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
        bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
        bch2_trans_init(&trans, c, 4, 1024);
 
-       do {
-               bch2_trans_begin(&trans);
-
-               ret   = bch2_unlink_trans(&trans,
+       ret = __bch2_trans_do(&trans, NULL, &dir->ei_journal_seq,
+                             BTREE_INSERT_NOUNLOCK|
+                             BTREE_INSERT_NOFAIL,
+                       bch2_unlink_trans(&trans,
                                          dir->v.i_ino, &dir_u,
-                                         &inode_u, &dentry->d_name) ?:
-                       bch2_trans_commit(&trans, NULL,
-                                         &dir->ei_journal_seq,
-                                         BTREE_INSERT_NOUNLOCK|
-                                         BTREE_INSERT_NOFAIL);
-       } while (ret == -EINTR);
+                                         &inode_u, &dentry->d_name));
 
        if (likely(!ret)) {
                BUG_ON(inode_u.bi_inum != inode->v.i_ino);
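
The link and unlink hunks above (and the rename hunk just below) fold their open-coded commit-retry loops into __bch2_trans_do(). Judging purely from the code being removed, a call such as the one in __bch2_link() expands to roughly the following -- a sketch of the pattern, not the macro's actual definition:

do {
	bch2_trans_begin(&trans);

	ret =   bch2_link_trans(&trans,
				dir->v.i_ino,
				inode->v.i_ino, &dir_u, &inode_u,
				&dentry->d_name) ?:
		bch2_trans_commit(&trans, NULL,
				  &inode->ei_journal_seq,
				  BTREE_INSERT_NOUNLOCK);
} while (ret == -EINTR);	/* -EINTR means "transaction restart": retry */
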
@@ -570,21 +583,16 @@ static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry,
                        goto err;
        }
 
-retry:
-       bch2_trans_begin(&trans);
-       ret   = bch2_rename_trans(&trans,
-                                 src_dir->v.i_ino, &src_dir_u,
-                                 dst_dir->v.i_ino, &dst_dir_u,
-                                 &src_inode_u,
-                                 &dst_inode_u,
-                                 &src_dentry->d_name,
-                                 &dst_dentry->d_name,
-                                 mode) ?:
-               bch2_trans_commit(&trans, NULL,
-                                 &journal_seq,
-                                 BTREE_INSERT_NOUNLOCK);
-       if (ret == -EINTR)
-               goto retry;
+       ret = __bch2_trans_do(&trans, NULL, &journal_seq,
+                             BTREE_INSERT_NOUNLOCK,
+                       bch2_rename_trans(&trans,
+                                         src_dir->v.i_ino, &src_dir_u,
+                                         dst_dir->v.i_ino, &dst_dir_u,
+                                         &src_inode_u,
+                                         &dst_inode_u,
+                                         &src_dentry->d_name,
+                                         &dst_dentry->d_name,
+                                         mode));
        if (unlikely(ret))
                goto err;
 
@@ -706,7 +714,7 @@ retry:
        bch2_setattr_copy(inode, &inode_u, attr);
 
        if (attr->ia_valid & ATTR_MODE) {
-               ret = bch2_acl_chmod(&trans, inode, inode_u.bi_mode, &acl);
+               ret = bch2_acl_chmod(&trans, &inode_u, inode_u.bi_mode, &acl);
                if (ret)
                        goto btree_err;
        }
@@ -717,6 +725,8 @@ retry:
                                  BTREE_INSERT_NOUNLOCK|
                                  BTREE_INSERT_NOFAIL);
 btree_err:
+       bch2_trans_iter_put(&trans, inode_iter);
+
        if (ret == -EINTR)
                goto retry;
        if (unlikely(ret))
@@ -807,7 +817,7 @@ static int bch2_fill_extent(struct bch_fs *c,
                            struct fiemap_extent_info *info,
                            struct bkey_s_c k, unsigned flags)
 {
-       if (bkey_extent_is_data(k.k)) {
+       if (bkey_extent_is_direct_data(k.k)) {
                struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
                const union bch_extent_entry *entry;
                struct extent_ptr_decoded p;
@@ -838,6 +848,12 @@ static int bch2_fill_extent(struct bch_fs *c,
                }
 
                return 0;
+       } else if (bkey_extent_is_inline_data(k.k)) {
+               return fiemap_fill_next_extent(info,
+                                              bkey_start_offset(k.k) << 9,
+                                              0, k.k->size << 9,
+                                              flags|
+                                              FIEMAP_EXTENT_DATA_INLINE);
        } else if (k.k->type == KEY_TYPE_reservation) {
                return fiemap_fill_next_extent(info,
                                               bkey_start_offset(k.k) << 9,
@@ -858,7 +874,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c k;
-       struct bkey_on_stack cur, prev;
+       struct bkey_buf cur, prev;
        struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
        unsigned offset_into_extent, sectors;
        bool have_extent = false;
@@ -871,19 +887,21 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
        if (start + len < start)
                return -EINVAL;
 
-       bkey_on_stack_init(&cur);
-       bkey_on_stack_init(&prev);
+       bch2_bkey_buf_init(&cur);
+       bch2_bkey_buf_init(&prev);
        bch2_trans_init(&trans, c, 0, 0);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
                                   POS(ei->v.i_ino, start >> 9), 0);
 retry:
        while ((k = bch2_btree_iter_peek(iter)).k &&
               !(ret = bkey_err(k)) &&
               bkey_cmp(iter->pos, end) < 0) {
+               enum btree_id data_btree = BTREE_ID_extents;
+
                if (!bkey_extent_is_data(k.k) &&
                    k.k->type != KEY_TYPE_reservation) {
-                       bch2_btree_iter_next(iter);
+                       bch2_btree_iter_advance(iter);
                        continue;
                }
 
@@ -891,24 +909,22 @@ retry:
                        bkey_start_offset(k.k);
                sectors                 = k.k->size - offset_into_extent;
 
-               bkey_on_stack_realloc(&cur, c, k.k->u64s);
-               bkey_on_stack_realloc(&prev, c, k.k->u64s);
-               bkey_reassemble(cur.k, k);
+               bch2_bkey_buf_reassemble(&cur, c, k);
 
-               ret = bch2_read_indirect_extent(&trans,
+               ret = bch2_read_indirect_extent(&trans, &data_btree,
                                        &offset_into_extent, &cur);
                if (ret)
                        break;
 
                k = bkey_i_to_s_c(cur.k);
+               bch2_bkey_buf_realloc(&prev, c, k.k->u64s);
 
                sectors = min(sectors, k.k->size - offset_into_extent);
 
-               if (offset_into_extent)
-                       bch2_cut_front(POS(k.k->p.inode,
-                                          bkey_start_offset(k.k) +
-                                          offset_into_extent),
-                                      cur.k);
+               bch2_cut_front(POS(k.k->p.inode,
+                                  bkey_start_offset(k.k) +
+                                  offset_into_extent),
+                              cur.k);
                bch2_key_resize(&cur.k->k, sectors);
                cur.k->k.p = iter->pos;
                cur.k->k.p.offset += cur.k->k.size;
@@ -923,10 +939,8 @@ retry:
                bkey_copy(prev.k, cur.k);
                have_extent = true;
 
-               if (k.k->type == KEY_TYPE_reflink_v)
-                       bch2_btree_iter_set_pos(iter, k.k->p);
-               else
-                       bch2_btree_iter_next(iter);
+               bch2_btree_iter_set_pos(iter,
+                       POS(iter->pos.inode, iter->pos.offset + sectors));
        }
 
        if (ret == -EINTR)
@@ -936,9 +950,10 @@ retry:
                ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
                                       FIEMAP_EXTENT_LAST);
 
+       bch2_trans_iter_put(&trans, iter);
        ret = bch2_trans_exit(&trans) ?: ret;
-       bkey_on_stack_exit(&cur, c);
-       bkey_on_stack_exit(&prev, c);
+       bch2_bkey_buf_exit(&cur, c);
+       bch2_bkey_buf_exit(&prev, c);
        return ret < 0 ? ret : 0;
 }
 
@@ -983,10 +998,10 @@ static const struct file_operations bch_file_operations = {
        .open           = generic_file_open,
        .fsync          = bch2_fsync,
        .splice_read    = generic_file_splice_read,
-       /*
-        * Broken, on v5.3:
+#if 0
+       /* Busted: */
        .splice_write   = iter_file_splice_write,
-       */
+#endif
        .fallocate      = bch2_fallocate_dispatch,
        .unlocked_ioctl = bch2_fs_file_ioctl,
 #ifdef CONFIG_COMPAT
@@ -1062,7 +1077,7 @@ static const struct address_space_operations bch_address_space_operations = {
        .writepage      = bch2_writepage,
        .readpage       = bch2_readpage,
        .writepages     = bch2_writepages,
-       .readpages      = bch2_readpages,
+       .readahead      = bch2_readahead,
        .set_page_dirty = __set_page_dirty_nobuffers,
        .write_begin    = bch2_write_begin,
        .write_end      = bch2_write_end,
@@ -1127,9 +1142,9 @@ static void bch2_vfs_inode_init(struct bch_fs *c,
        inode->v.i_generation   = bi->bi_generation;
        inode->v.i_size         = bi->bi_size;
 
+       inode->ei_flags         = 0;
        inode->ei_journal_seq   = 0;
        inode->ei_quota_reserved = 0;
-       inode->ei_str_hash      = bch2_hash_info_init(c, bi);
        inode->ei_qid           = bch_qid(bi);
 
        inode->v.i_mapping->a_ops = &bch_address_space_operations;
@@ -1228,7 +1243,7 @@ static void bch2_evict_inode(struct inode *vinode)
                                KEY_TYPE_QUOTA_WARN);
                bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
                                KEY_TYPE_QUOTA_WARN);
-               bch2_inode_rm(c, inode->v.i_ino);
+               bch2_inode_rm(c, inode->v.i_ino, true);
        }
 }
 
@@ -1238,6 +1253,11 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
        struct bch_fs *c = sb->s_fs_info;
        struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
        unsigned shift = sb->s_blocksize_bits - 9;
+       /*
+        * this assumes inodes take up 64 bytes, which is a decent average
+        * number:
+        */
+       u64 avail_inodes = ((usage.capacity - usage.used) << 3);
        u64 fsid;
 
        buf->f_type     = BCACHEFS_STATFS_MAGIC;
@@ -1245,8 +1265,9 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
        buf->f_blocks   = usage.capacity >> shift;
        buf->f_bfree    = (usage.capacity - usage.used) >> shift;
        buf->f_bavail   = buf->f_bfree;
-       buf->f_files    = 0;
-       buf->f_ffree    = 0;
+
+       buf->f_files    = usage.nr_inodes + avail_inodes;
+       buf->f_ffree    = avail_inodes;
 
        fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
               le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
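
The statfs hunk above fills in f_files/f_ffree from an estimate instead of reporting zero. The arithmetic behind the << 3: usage.capacity and usage.used are counted in 512-byte sectors (the surrounding code converts to blocks with shift = s_blocksize_bits - 9), and the comment assumes 64 bytes per inode, so free bytes divided by the per-inode size is free_sectors * 512 / 64 = free_sectors * 8. A worked example, with sector counts invented for illustration:

	u64 capacity	= (1ULL << 40) >> 9;	/* 1 TiB   = 2147483648 sectors */
	u64 used	= (256ULL << 30) >> 9;	/* 256 GiB =  536870912 sectors */
	u64 free_sectors = capacity - used;	/* 1610612736 sectors */

	u64 avail_inodes = (free_sectors << 9) / 64;	/* 12884901888 inodes */

	/* 512 / 64 == 8, so the division above is exactly free_sectors << 3 */
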
index eda903a45325ea94929003db292cbaaefe31a8f0..2d82ed7dd740f8dab4d233365f90e2999cbbb41d 100644
@@ -26,12 +26,14 @@ static inline void pagecache_lock_init(struct pagecache_lock *lock)
 }
 
 void bch2_pagecache_add_put(struct pagecache_lock *);
+bool bch2_pagecache_add_tryget(struct pagecache_lock *);
 void bch2_pagecache_add_get(struct pagecache_lock *);
 void bch2_pagecache_block_put(struct pagecache_lock *);
 void bch2_pagecache_block_get(struct pagecache_lock *);
 
 struct bch_inode_info {
        struct inode            v;
+       unsigned long           ei_flags;
 
        struct mutex            ei_update_lock;
        u64                     ei_journal_seq;
@@ -43,12 +45,16 @@ struct bch_inode_info {
        struct mutex            ei_quota_lock;
        struct bch_qid          ei_qid;
 
-       struct bch_hash_info    ei_str_hash;
-
        /* copy of inode in btree: */
        struct bch_inode_unpacked ei_inode;
 };
 
+/*
+ * Set if we've gotten a btree error for this inode, and thus the vfs inode and
+ * btree inode may be inconsistent:
+ */
+#define EI_INODE_ERROR                 0
+
 #define to_bch_ei(_inode)                                      \
        container_of_or_null(_inode, struct bch_inode_info, v)
 
index 5a6df3d1973a9dedbfab51e1a5b667a7da3751df..acf128f06310ef06aab0d2efe123b3ca580e1fc6 100644
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "bcachefs.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
 #include "btree_update.h"
 #include "dirent.h"
 #include "error.h"
@@ -24,7 +24,7 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum)
        u64 sectors = 0;
        int ret;
 
-       for_each_btree_key(trans, iter, BTREE_ID_EXTENTS,
+       for_each_btree_key(trans, iter, BTREE_ID_extents,
                           POS(inum, 0), 0, k, ret) {
                if (k.k->p.inode != inum)
                        break;
@@ -58,7 +58,7 @@ static int __remove_dirent(struct btree_trans *trans,
        buf[name.len] = '\0';
        name.name = buf;
 
-       ret = bch2_inode_find_by_inum_trans(trans, dir_inum, &dir_inode);
+       ret = __bch2_inode_find_by_inum_trans(trans, dir_inum, &dir_inode, 0);
        if (ret && ret != -EINTR)
                bch_err(c, "remove_dirent: err %i looking up directory inode", ret);
        if (ret)
@@ -126,8 +126,8 @@ static int walk_inode(struct btree_trans *trans,
                      struct inode_walker *w, u64 inum)
 {
        if (inum != w->cur_inum) {
-               int ret = bch2_inode_find_by_inum_trans(trans, inum,
-                                                       &w->inode);
+               int ret = __bch2_inode_find_by_inum_trans(trans, inum,
+                                                         &w->inode, 0);
 
                if (ret && ret != -ENOENT)
                        return ret;
@@ -193,7 +193,7 @@ static int hash_redo_key(const struct bch_hash_desc desc,
        bch2_trans_update(trans, k_iter, &delete, 0);
 
        return bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode,
-                            tmp, BCH_HASH_SET_MUST_CREATE);
+                            tmp, 0);
 }
 
 static int fsck_hash_delete_at(struct btree_trans *trans,
@@ -230,7 +230,6 @@ static int hash_check_duplicates(struct btree_trans *trans,
                return 0;
 
        iter = bch2_trans_copy_iter(trans, h->chain);
-       BUG_ON(IS_ERR(iter));
 
        for_each_btree_key_continue(iter, 0, k2, ret) {
                if (bkey_cmp(k2.k->p, k.k->p) >= 0)
@@ -258,17 +257,15 @@ static void hash_set_chain_start(struct btree_trans *trans,
                        struct hash_check *h,
                        struct btree_iter *k_iter, struct bkey_s_c k)
 {
-       bool hole = (k.k->type != KEY_TYPE_whiteout &&
+       bool hole = (k.k->type != KEY_TYPE_hash_whiteout &&
                     k.k->type != desc.key_type);
 
        if (hole || k.k->p.offset > h->chain_end + 1)
                hash_stop_chain(trans, h);
 
        if (!hole) {
-               if (!h->chain) {
+               if (!h->chain)
                        h->chain = bch2_trans_copy_iter(trans, k_iter);
-                       BUG_ON(IS_ERR(h->chain));
-               }
 
                h->chain_end = k.k->p.offset;
        }
@@ -322,7 +319,7 @@ static int hash_check_key(struct btree_trans *trans,
                        bch_err(c, "hash_redo_key err %i", ret);
                        return ret;
                }
-               return 1;
+               return -EINTR;
        }
 
        ret = hash_check_duplicates(trans, desc, h, k_iter, k);
@@ -399,7 +396,7 @@ err_redo:
        if (fsck_err(c, "cannot fix dirent by removing trailing garbage %s (%zu)\n"
                     "hash table key at wrong offset: btree %u, offset %llu, "
                     "hashed to %llu chain starts at %llu\n%s",
-                    buf, strlen(buf), BTREE_ID_DIRENTS,
+                    buf, strlen(buf), BTREE_ID_dirents,
                     k->k->p.offset, hash, h->chain->pos.offset,
                     (bch2_bkey_val_to_text(&PBUF(buf), c,
                                            *k), buf))) {
@@ -416,18 +413,10 @@ err_redo:
        goto err;
 }
 
-static int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size)
-{
-       return bch2_btree_delete_range(c, BTREE_ID_EXTENTS,
-                       POS(inode_nr, round_up(new_size, block_bytes(c)) >> 9),
-                       POS(inode_nr + 1, 0), NULL);
-}
-
-static int bch2_fix_overlapping_extent(struct btree_trans *trans,
-                                      struct btree_iter *iter,
+static int fix_overlapping_extent(struct btree_trans *trans,
                                       struct bkey_s_c k, struct bpos cut_at)
 {
-       struct btree_iter *u_iter;
+       struct btree_iter *iter;
        struct bkey_i *u;
        int ret;
 
@@ -439,24 +428,24 @@ static int bch2_fix_overlapping_extent(struct btree_trans *trans,
        bkey_reassemble(u, k);
        bch2_cut_front(cut_at, u);
 
-       u_iter = bch2_trans_copy_iter(trans, iter);
-       ret = PTR_ERR_OR_ZERO(u_iter);
-       if (ret)
-               return ret;
 
        /*
-        * We don't want to go through the
-        * extent_handle_overwrites path:
+        * We don't want to go through the extent_handle_overwrites path:
+        *
+        * XXX: this is going to screw up disk accounting, extent triggers
+        * assume things about extent overwrites - we should be running the
+        * triggers manually here
         */
-       __bch2_btree_iter_set_pos(u_iter, u->k.p, false);
+       iter = bch2_trans_get_iter(trans, BTREE_ID_extents, u->k.p,
+                                  BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS);
 
-       /*
-        * XXX: this is going to leave disk space
-        * accounting slightly wrong
-        */
-       ret = bch2_trans_update(trans, u_iter, u, 0);
-       bch2_trans_iter_put(trans, u_iter);
-       return ret;
+       BUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
+       bch2_trans_update(trans, iter, u, BTREE_TRIGGER_NORUN);
+       bch2_trans_iter_put(trans, iter);
+
+       return bch2_trans_commit(trans, NULL, NULL,
+                                BTREE_INSERT_NOFAIL|
+                                BTREE_INSERT_LAZY_RW);
 }
 
 /*
@@ -470,21 +459,45 @@ static int check_extents(struct bch_fs *c)
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c k;
-       struct bkey_on_stack prev;
-       u64 i_sectors;
+       struct bkey_buf prev;
+       u64 i_sectors = 0;
        int ret = 0;
 
-       bkey_on_stack_init(&prev);
+       bch2_bkey_buf_init(&prev);
        prev.k->k = KEY(0, 0, 0);
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
        bch_verbose(c, "checking extents");
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
                                   POS(BCACHEFS_ROOT_INO, 0),
                                   BTREE_ITER_INTENT);
 retry:
-       for_each_btree_key_continue(iter, 0, k, ret) {
+       while ((k = bch2_btree_iter_peek(iter)).k &&
+              !(ret = bkey_err(k))) {
+               if (w.have_inode &&
+                   w.cur_inum != k.k->p.inode &&
+                   !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) &&
+                   fsck_err_on(w.inode.bi_sectors != i_sectors, c,
+                               "inode %llu has incorrect i_sectors: got %llu, should be %llu",
+                               w.inode.bi_inum,
+                               w.inode.bi_sectors, i_sectors)) {
+                       struct btree_iter *inode_iter =
+                               bch2_trans_get_iter(&trans, BTREE_ID_inodes,
+                                                   POS(0, w.cur_inum),
+                                                   BTREE_ITER_INTENT);
+
+                       w.inode.bi_sectors = i_sectors;
+
+                       ret = __bch2_trans_do(&trans, NULL, NULL,
+                                             BTREE_INSERT_NOFAIL|
+                                             BTREE_INSERT_LAZY_RW,
+                                             bch2_inode_write(&trans, inode_iter, &w.inode));
+                       bch2_trans_iter_put(&trans, inode_iter);
+                       if (ret)
+                               break;
+               }
+
                if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) {
                        char buf1[200];
                        char buf2[200];
@@ -492,86 +505,55 @@ retry:
                        bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k));
                        bch2_bkey_val_to_text(&PBUF(buf2), c, k);
 
-                       if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) {
-                               ret = __bch2_trans_do(&trans, NULL, NULL,
-                                                     BTREE_INSERT_NOFAIL|
-                                                     BTREE_INSERT_LAZY_RW,
-                                               bch2_fix_overlapping_extent(&trans,
-                                                               iter, k, prev.k->k.p));
-                               if (ret)
-                                       goto err;
-                       }
+                       if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2))
+                               return fix_overlapping_extent(&trans, k, prev.k->k.p) ?: -EINTR;
                }
-               bkey_on_stack_reassemble(&prev, c, k);
 
                ret = walk_inode(&trans, &w, k.k->p.inode);
                if (ret)
                        break;
 
+               if (w.first_this_inode)
+                       i_sectors = 0;
+
                if (fsck_err_on(!w.have_inode, c,
-                       "extent type %u for missing inode %llu",
-                       k.k->type, k.k->p.inode) ||
+                               "extent type %u for missing inode %llu",
+                               k.k->type, k.k->p.inode) ||
                    fsck_err_on(w.have_inode &&
-                       !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c,
-                       "extent type %u for non regular file, inode %llu mode %o",
-                       k.k->type, k.k->p.inode, w.inode.bi_mode)) {
-                       bch2_trans_unlock(&trans);
-
-                       ret = bch2_inode_truncate(c, k.k->p.inode, 0);
-                       if (ret)
-                               goto err;
-                       continue;
+                               !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c,
+                               "extent type %u for non regular file, inode %llu mode %o",
+                               k.k->type, k.k->p.inode, w.inode.bi_mode)) {
+                       bch2_fs_lazy_rw(c);
+                       return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents,
+                                                      POS(k.k->p.inode, 0),
+                                                      POS(k.k->p.inode, U64_MAX),
+                                                      NULL) ?: -EINTR;
                }
 
-               if (fsck_err_on(w.first_this_inode &&
-                       w.have_inode &&
-                       !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) &&
-                       w.inode.bi_sectors !=
-                       (i_sectors = bch2_count_inode_sectors(&trans, w.cur_inum)),
-                       c, "inode %llu has incorrect i_sectors: got %llu, should be %llu",
-                       w.inode.bi_inum,
-                       w.inode.bi_sectors, i_sectors)) {
-                       struct bkey_inode_buf p;
-
-                       w.inode.bi_sectors = i_sectors;
-
-                       bch2_trans_unlock(&trans);
-
-                       bch2_inode_pack(&p, &w.inode);
-
-                       ret = bch2_btree_insert(c, BTREE_ID_INODES,
-                                               &p.inode.k_i, NULL, NULL,
-                                               BTREE_INSERT_NOFAIL|
-                                               BTREE_INSERT_LAZY_RW);
-                       if (ret) {
-                               bch_err(c, "error in fsck: error %i updating inode", ret);
-                               goto err;
-                       }
-
-                       /* revalidate iterator: */
-                       k = bch2_btree_iter_peek(iter);
+               if (fsck_err_on(w.have_inode &&
+                               !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
+                               k.k->type != KEY_TYPE_reservation &&
+                               k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c,
+                               "extent type %u offset %llu past end of inode %llu, i_size %llu",
+                               k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) {
+                       bch2_fs_lazy_rw(c);
+                       return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents,
+                                       POS(k.k->p.inode, round_up(w.inode.bi_size, block_bytes(c))),
+                                       POS(k.k->p.inode, U64_MAX),
+                                       NULL) ?: -EINTR;
                }
 
-               if (fsck_err_on(w.have_inode &&
-                       !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
-                       k.k->type != KEY_TYPE_reservation &&
-                       k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c,
-                       "extent type %u offset %llu past end of inode %llu, i_size %llu",
-                       k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) {
-                       bch2_trans_unlock(&trans);
+               if (bkey_extent_is_allocation(k.k))
+                       i_sectors += k.k->size;
+               bch2_bkey_buf_reassemble(&prev, c, k);
 
-                       ret = bch2_inode_truncate(c, k.k->p.inode,
-                                                 w.inode.bi_size);
-                       if (ret)
-                               goto err;
-                       continue;
-               }
+               bch2_btree_iter_advance(iter);
        }
-err:
 fsck_err:
        if (ret == -EINTR)
                goto retry;
-       bkey_on_stack_exit(&prev, c);
+       bch2_trans_iter_put(&trans, iter);
+       bch2_bkey_buf_exit(&prev, c);
        return bch2_trans_exit(&trans) ?: ret;
 }
 
@@ -597,10 +579,11 @@ static int check_dirents(struct bch_fs *c)
 
        hash_check_init(&h);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS,
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_dirents,
                                   POS(BCACHEFS_ROOT_INO, 0), 0);
 retry:
-       for_each_btree_key_continue(iter, 0, k, ret) {
+       while ((k = bch2_btree_iter_peek(iter)).k &&
+              !(ret = bkey_err(k))) {
                struct bkey_s_c_dirent d;
                struct bch_inode_unpacked target;
                bool have_target;
@@ -675,7 +658,7 @@ retry:
                        continue;
                }
 
-               ret = bch2_inode_find_by_inum_trans(&trans, d_inum, &target);
+               ret = __bch2_inode_find_by_inum_trans(&trans, d_inum, &target, 0);
                if (ret && ret != -ENOENT)
                        break;
 
@@ -692,6 +675,39 @@ retry:
                        continue;
                }
 
+               if (!target.bi_nlink &&
+                   !(target.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) &&
+                   (target.bi_dir != k.k->p.inode ||
+                    target.bi_dir_offset != k.k->p.offset) &&
+                   (fsck_err_on(c->sb.version >= bcachefs_metadata_version_inode_backpointers, c,
+                                "inode %llu has wrong backpointer:\n"
+                                "got       %llu:%llu\n"
+                                "should be %llu:%llu",
+                                d_inum,
+                                target.bi_dir,
+                                target.bi_dir_offset,
+                                k.k->p.inode,
+                                k.k->p.offset) ||
+                    c->opts.version_upgrade)) {
+                       struct bkey_inode_buf p;
+
+                       target.bi_dir           = k.k->p.inode;
+                       target.bi_dir_offset    = k.k->p.offset;
+                       bch2_trans_unlock(&trans);
+
+                       bch2_inode_pack(c, &p, &target);
+
+                       ret = bch2_btree_insert(c, BTREE_ID_inodes,
+                                               &p.inode.k_i, NULL, NULL,
+                                               BTREE_INSERT_NOFAIL|
+                                               BTREE_INSERT_LAZY_RW);
+                       if (ret) {
+                               bch_err(c, "error in fsck: error %i updating inode", ret);
+                               goto err;
+                       }
+                       continue;
+               }
+
                if (fsck_err_on(have_target &&
                                d.v->d_type !=
                                mode_to_type(target.bi_mode), c,
@@ -719,6 +735,8 @@ retry:
                                goto err;
 
                }
+
+               bch2_btree_iter_advance(iter);
        }
 
        hash_stop_chain(&trans, &h);
@@ -727,6 +745,8 @@ fsck_err:
        if (ret == -EINTR)
                goto retry;
 
+       bch2_trans_iter_put(&trans, h.chain);
+       bch2_trans_iter_put(&trans, iter);
        return bch2_trans_exit(&trans) ?: ret;
 }
 
@@ -749,10 +769,11 @@ static int check_xattrs(struct bch_fs *c)
 
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS,
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs,
                                   POS(BCACHEFS_ROOT_INO, 0), 0);
 retry:
-       for_each_btree_key_continue(iter, 0, k, ret) {
+       while ((k = bch2_btree_iter_peek(iter)).k &&
+              !(ret = bkey_err(k))) {
                ret = walk_inode(&trans, &w, k.k->p.inode);
                if (ret)
                        break;
@@ -762,7 +783,7 @@ retry:
                                k.k->p.inode)) {
                        ret = bch2_btree_delete_at(&trans, iter, 0);
                        if (ret)
-                               goto err;
+                               break;
                        continue;
                }
 
@@ -772,12 +793,16 @@ retry:
                ret = hash_check_key(&trans, bch2_xattr_hash_desc,
                                     &h, iter, k);
                if (ret)
-                       goto fsck_err;
+                       break;
+
+               bch2_btree_iter_advance(iter);
        }
-err:
 fsck_err:
        if (ret == -EINTR)
                goto retry;
+
+       bch2_trans_iter_put(&trans, h.chain);
+       bch2_trans_iter_put(&trans, iter);
        return bch2_trans_exit(&trans) ?: ret;
 }
 
@@ -789,7 +814,9 @@ static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode)
 
        bch_verbose(c, "checking root directory");
 
-       ret = bch2_inode_find_by_inum(c, BCACHEFS_ROOT_INO, root_inode);
+       ret = bch2_trans_do(c, NULL, NULL, 0,
+               __bch2_inode_find_by_inum_trans(&trans, BCACHEFS_ROOT_INO,
+                                               root_inode, 0));
        if (ret && ret != -ENOENT)
                return ret;
 
@@ -808,9 +835,9 @@ create_root:
                        0, NULL);
        root_inode->bi_inum = BCACHEFS_ROOT_INO;
 
-       bch2_inode_pack(&packed, root_inode);
+       bch2_inode_pack(c, &packed, root_inode);
 
-       return bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i,
+       return bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i,
                                 NULL, NULL,
                                 BTREE_INSERT_NOFAIL|
                                 BTREE_INSERT_LAZY_RW);
@@ -836,7 +863,8 @@ static int check_lostfound(struct bch_fs *c,
                goto create_lostfound;
        }
 
-       ret = bch2_inode_find_by_inum(c, inum, lostfound_inode);
+       ret = bch2_trans_do(c, NULL, NULL, 0,
+               __bch2_inode_find_by_inum_trans(&trans, inum, lostfound_inode, 0));
        if (ret && ret != -ENOENT)
                return ret;
 
@@ -866,36 +894,22 @@ create_lostfound:
        return ret;
 }
 
-struct inode_bitmap {
-       unsigned long   *bits;
-       size_t          size;
-};
+typedef GENRADIX(unsigned long) inode_bitmap;
 
-static inline bool inode_bitmap_test(struct inode_bitmap *b, size_t nr)
+static inline bool inode_bitmap_test(inode_bitmap *b, size_t nr)
 {
-       return nr < b->size ? test_bit(nr, b->bits) : false;
+       unsigned long *w = genradix_ptr(b, nr / BITS_PER_LONG);
+       return w ? test_bit(nr & (BITS_PER_LONG - 1), w) : false;
 }
 
-static inline int inode_bitmap_set(struct inode_bitmap *b, size_t nr)
+static inline int inode_bitmap_set(inode_bitmap *b, size_t nr)
 {
-       if (nr >= b->size) {
-               size_t new_size = max_t(size_t, max_t(size_t,
-                                       PAGE_SIZE * 8,
-                                       b->size * 2),
-                                       nr + 1);
-               void *n;
-
-               new_size = roundup_pow_of_two(new_size);
-               n = krealloc(b->bits, new_size / 8, GFP_KERNEL|__GFP_ZERO);
-               if (!n) {
-                       return -ENOMEM;
-               }
+       unsigned long *w = genradix_ptr_alloc(b, nr / BITS_PER_LONG, GFP_KERNEL);
 
-               b->bits = n;
-               b->size = new_size;
-       }
+       if (!w)
+               return -ENOMEM;
 
-       __set_bit(nr, b->bits);
+       *w |= 1UL << (nr & (BITS_PER_LONG - 1));
        return 0;
 }
 
@@ -934,7 +948,7 @@ noinline_for_stack
 static int check_directory_structure(struct bch_fs *c,
                                     struct bch_inode_unpacked *lostfound_inode)
 {
-       struct inode_bitmap dirs_done = { NULL, 0 };
+       inode_bitmap dirs_done;
        struct pathbuf path = { 0, 0, NULL };
        struct pathbuf_entry *e;
        struct btree_trans trans;
@@ -951,6 +965,7 @@ static int check_directory_structure(struct bch_fs *c,
 
        /* DFS: */
 restart_dfs:
+       genradix_init(&dirs_done);
        had_unreachable = false;
 
        ret = inode_bitmap_set(&dirs_done, BCACHEFS_ROOT_INO);
@@ -970,7 +985,7 @@ next:
                if (e->offset == U64_MAX)
                        goto up;
 
-               for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS,
+               for_each_btree_key(&trans, iter, BTREE_ID_dirents,
                                   POS(e->inum, e->offset + 1), 0, k, ret) {
                        if (k.k->p.inode != e->inum)
                                break;
@@ -1023,7 +1038,7 @@ up:
                path.nr--;
        }
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS_MIN, 0);
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS_MIN, 0);
 retry:
        for_each_btree_key_continue(iter, 0, k, ret) {
                if (k.k->type != KEY_TYPE_inode)
@@ -1057,7 +1072,7 @@ retry:
 
        if (had_unreachable) {
                bch_info(c, "reattached unreachable directories, restarting pass to check for loops");
-               kfree(dirs_done.bits);
+               genradix_free(&dirs_done);
                kfree(path.entries);
                memset(&dirs_done, 0, sizeof(dirs_done));
                memset(&path, 0, sizeof(path));
@@ -1066,7 +1081,7 @@ retry:
 err:
 fsck_err:
        ret = bch2_trans_exit(&trans) ?: ret;
-       kfree(dirs_done.bits);
+       genradix_free(&dirs_done);
        kfree(path.entries);
        return ret;
 }
@@ -1087,6 +1102,11 @@ static void inc_link(struct bch_fs *c, nlink_table *links,
        if (inum < range_start || inum >= *range_end)
                return;
 
+       if (inum - range_start >= SIZE_MAX / sizeof(struct nlink)) {
+               *range_end = inum;
+               return;
+       }
+
        link = genradix_ptr_alloc(links, inum - range_start, GFP_KERNEL);
        if (!link) {
                bch_verbose(c, "allocation failed during fsck - will need another pass");
@@ -1115,7 +1135,7 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links,
 
        inc_link(c, links, range_start, range_end, BCACHEFS_ROOT_INO, false);
 
-       for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, 0, k, ret) {
+       for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN, 0, k, ret) {
                switch (k.k->type) {
                case KEY_TYPE_dirent:
                        d = bkey_s_c_to_dirent(k);
@@ -1133,6 +1153,8 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links,
 
                bch2_trans_cond_resched(&trans);
        }
+       bch2_trans_iter_put(&trans, iter);
+
        ret = bch2_trans_exit(&trans) ?: ret;
        if (ret)
                bch_err(c, "error in fsck: btree error %i while walking dirents", ret);
@@ -1267,7 +1289,7 @@ static int check_inode(struct btree_trans *trans,
 
                bch2_fs_lazy_rw(c);
 
-               ret = bch2_inode_rm(c, u.bi_inum);
+               ret = bch2_inode_rm(c, u.bi_inum, false);
                if (ret)
                        bch_err(c, "error in fsck: error %i while deleting inode", ret);
                return ret;
@@ -1285,8 +1307,10 @@ static int check_inode(struct btree_trans *trans,
                 * XXX: need to truncate partial blocks too here - or ideally
                 * just switch units to bytes and that issue goes away
                 */
-
-               ret = bch2_inode_truncate(c, u.bi_inum, u.bi_size);
+               ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
+                               POS(u.bi_inum, round_up(u.bi_size, block_bytes(c))),
+                               POS(u.bi_inum, U64_MAX),
+                               NULL);
                if (ret) {
                        bch_err(c, "error in fsck: error %i truncating inode", ret);
                        return ret;
@@ -1323,10 +1347,21 @@ static int check_inode(struct btree_trans *trans,
                do_update = true;
        }
 
+       if (!S_ISDIR(u.bi_mode) &&
+           u.bi_nlink &&
+           !(u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) &&
+           (fsck_err_on(c->sb.version >= bcachefs_metadata_version_inode_backpointers, c,
+                        "inode missing BCH_INODE_BACKPTR_UNTRUSTED flags") ||
+            c->opts.version_upgrade)) {
+               u.bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED;
+               do_update = true;
+       }
+
        if (do_update) {
                struct bkey_inode_buf p;
 
-               bch2_inode_pack(&p, &u);
+               bch2_inode_pack(c, &p, &u);
+               p.inode.k.p = iter->pos;
 
                ret = __bch2_trans_do(trans, NULL, NULL,
                                      BTREE_INSERT_NOFAIL|
@@ -1356,28 +1391,30 @@ static int bch2_gc_walk_inodes(struct bch_fs *c,
 
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES,
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes,
                                   POS(0, range_start), 0);
        nlinks_iter = genradix_iter_init(links, 0);
 
        while ((k = bch2_btree_iter_peek(iter)).k &&
-              !(ret2 = bkey_err(k))) {
+              !(ret2 = bkey_err(k)) &&
+              iter->pos.offset < range_end) {
 peek_nlinks:   link = genradix_iter_peek(&nlinks_iter, links);
 
                if (!link && (!k.k || iter->pos.offset >= range_end))
                        break;
 
                nlinks_pos = range_start + nlinks_iter.pos;
-               if (iter->pos.offset > nlinks_pos) {
+
+               if (link && nlinks_pos < iter->pos.offset) {
                        /* Should have been caught by dirents pass: */
-                       need_fsck_err_on(link && link->count, c,
+                       need_fsck_err_on(link->count, c,
                                "missing inode %llu (nlink %u)",
                                nlinks_pos, link->count);
                        genradix_iter_advance(&nlinks_iter, links);
                        goto peek_nlinks;
                }
 
-               if (iter->pos.offset < nlinks_pos || !link)
+               if (!link || nlinks_pos > iter->pos.offset)
                        link = &zero_links;
 
                if (k.k && k.k->type == KEY_TYPE_inode) {
@@ -1396,10 +1433,11 @@ peek_nlinks:    link = genradix_iter_peek(&nlinks_iter, links);
                if (nlinks_pos == iter->pos.offset)
                        genradix_iter_advance(&nlinks_iter, links);
 
-               bch2_btree_iter_next(iter);
+               bch2_btree_iter_advance(iter);
                bch2_trans_cond_resched(&trans);
        }
 fsck_err:
+       bch2_trans_iter_put(&trans, iter);
        bch2_trans_exit(&trans);
 
        if (ret2)
@@ -1480,7 +1518,7 @@ int bch2_fsck_walk_inodes_only(struct bch_fs *c)
 
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
-       for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, 0, k, ret) {
+       for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, 0, k, ret) {
                if (k.k->type != KEY_TYPE_inode)
                        continue;
 
@@ -1491,11 +1529,12 @@ int bch2_fsck_walk_inodes_only(struct bch_fs *c)
                     BCH_INODE_I_SECTORS_DIRTY|
                     BCH_INODE_UNLINKED)) {
                        ret = check_inode(&trans, NULL, iter, inode, NULL);
-                       BUG_ON(ret == -EINTR);
                        if (ret)
                                break;
                }
        }
+       bch2_trans_iter_put(&trans, iter);
+
        BUG_ON(ret == -EINTR);
 
        return bch2_trans_exit(&trans) ?: ret;
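
The fsck hunks above replace the hand-rolled, krealloc()-grown inode bitmap with a GENRADIX()-backed one: storage is allocated lazily one unsigned long (BITS_PER_LONG inodes) at a time, bits in chunks that were never allocated simply read back as false, and inode_bitmap_set() can only fail with -ENOMEM. A minimal usage sketch built from the helpers defined above (the real caller is check_directory_structure()'s DFS in this same file):

	inode_bitmap seen;

	genradix_init(&seen);

	if (inode_bitmap_set(&seen, 4096))	/* lazily allocates the chunk; -ENOMEM on failure */
		return -ENOMEM;

	BUG_ON(!inode_bitmap_test(&seen, 4096));
	BUG_ON(inode_bitmap_test(&seen, 4097));		/* same chunk, bit never set */
	BUG_ON(inode_bitmap_test(&seen, 1 << 20));	/* chunk never allocated: reads as false */

	genradix_free(&seen);
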
index 7d20f082ad45a48d67fded8629aaad613cddbdee..d4c328397156e9b102e9ce0424dcd6067be1581a 100644
@@ -1,12 +1,14 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "bcachefs.h"
+#include "btree_key_cache.h"
 #include "bkey_methods.h"
 #include "btree_update.h"
 #include "error.h"
 #include "extents.h"
 #include "inode.h"
 #include "str_hash.h"
+#include "varint.h"
 
 #include <linux/random.h>
 
@@ -88,22 +90,17 @@ static int inode_decode_field(const u8 *in, const u8 *end,
        return bytes;
 }
 
-void bch2_inode_pack(struct bkey_inode_buf *packed,
-                    const struct bch_inode_unpacked *inode)
+static noinline void bch2_inode_pack_v1(struct bkey_inode_buf *packed,
+                                       const struct bch_inode_unpacked *inode)
 {
-       u8 *out = packed->inode.v.fields;
+       struct bkey_i_inode *k = &packed->inode;
+       u8 *out = k->v.fields;
        u8 *end = (void *) &packed[1];
        u8 *last_nonzero_field = out;
        unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
        unsigned bytes;
 
-       bkey_inode_init(&packed->inode.k_i);
-       packed->inode.k.p.offset        = inode->bi_inum;
-       packed->inode.v.bi_hash_seed    = inode->bi_hash_seed;
-       packed->inode.v.bi_flags        = cpu_to_le32(inode->bi_flags);
-       packed->inode.v.bi_mode         = cpu_to_le16(inode->bi_mode);
-
-#define x(_name, _bits)                                        \
+#define x(_name, _bits)                                                        \
        out += inode_encode_field(out, end, 0, inode->_name);           \
        nr_fields++;                                                    \
                                                                        \
@@ -122,7 +119,69 @@ void bch2_inode_pack(struct bkey_inode_buf *packed,
        set_bkey_val_bytes(&packed->inode.k, bytes);
        memset_u64s_tail(&packed->inode.v, 0, bytes);
 
-       SET_INODE_NR_FIELDS(&packed->inode.v, nr_fields);
+       SET_INODE_NR_FIELDS(&k->v, nr_fields);
+}
+
+static void bch2_inode_pack_v2(struct bkey_inode_buf *packed,
+                              const struct bch_inode_unpacked *inode)
+{
+       struct bkey_i_inode *k = &packed->inode;
+       u8 *out = k->v.fields;
+       u8 *end = (void *) &packed[1];
+       u8 *last_nonzero_field = out;
+       unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
+       unsigned bytes;
+       int ret;
+
+#define x(_name, _bits)                                                        \
+       nr_fields++;                                                    \
+                                                                       \
+       if (inode->_name) {                                             \
+               ret = bch2_varint_encode(out, inode->_name);            \
+               out += ret;                                             \
+                                                                       \
+               if (_bits > 64)                                         \
+                       *out++ = 0;                                     \
+                                                                       \
+               last_nonzero_field = out;                               \
+               last_nonzero_fieldnr = nr_fields;                       \
+       } else {                                                        \
+               *out++ = 0;                                             \
+                                                                       \
+               if (_bits > 64)                                         \
+                       *out++ = 0;                                     \
+       }
+
+       BCH_INODE_FIELDS()
+#undef  x
+       BUG_ON(out > end);
+
+       out = last_nonzero_field;
+       nr_fields = last_nonzero_fieldnr;
+
+       bytes = out - (u8 *) &packed->inode.v;
+       set_bkey_val_bytes(&packed->inode.k, bytes);
+       memset_u64s_tail(&packed->inode.v, 0, bytes);
+
+       SET_INODE_NR_FIELDS(&k->v, nr_fields);
+}
+
+void bch2_inode_pack(struct bch_fs *c,
+                    struct bkey_inode_buf *packed,
+                    const struct bch_inode_unpacked *inode)
+{
+       bkey_inode_init(&packed->inode.k_i);
+       packed->inode.k.p.offset        = inode->bi_inum;
+       packed->inode.v.bi_hash_seed    = inode->bi_hash_seed;
+       packed->inode.v.bi_flags        = cpu_to_le32(inode->bi_flags);
+       packed->inode.v.bi_mode         = cpu_to_le16(inode->bi_mode);
+
+       if (c->sb.features & (1ULL << BCH_FEATURE_new_varint)) {
+               SET_INODE_NEW_VARINT(&packed->inode.v, true);
+               bch2_inode_pack_v2(packed, inode);
+       } else {
+               bch2_inode_pack_v1(packed, inode);
+       }
 
        if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
                struct bch_inode_unpacked unpacked;
@@ -134,26 +193,23 @@ void bch2_inode_pack(struct bkey_inode_buf *packed,
                BUG_ON(unpacked.bi_hash_seed    != inode->bi_hash_seed);
                BUG_ON(unpacked.bi_mode         != inode->bi_mode);
 
-#define x(_name, _bits)        BUG_ON(unpacked._name != inode->_name);
+#define x(_name, _bits)        if (unpacked._name != inode->_name)             \
+                       panic("unpacked %llu should be %llu",           \
+                             (u64) unpacked._name, (u64) inode->_name);
                BCH_INODE_FIELDS()
 #undef  x
        }
 }
 
-int bch2_inode_unpack(struct bkey_s_c_inode inode,
-                     struct bch_inode_unpacked *unpacked)
+static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
+                               struct bch_inode_unpacked *unpacked)
 {
        const u8 *in = inode.v->fields;
-       const u8 *end = (void *) inode.v + bkey_val_bytes(inode.k);
+       const u8 *end = bkey_val_end(inode);
        u64 field[2];
        unsigned fieldnr = 0, field_bits;
        int ret;
 
-       unpacked->bi_inum       = inode.k->p.offset;
-       unpacked->bi_hash_seed  = inode.v->bi_hash_seed;
-       unpacked->bi_flags      = le32_to_cpu(inode.v->bi_flags);
-       unpacked->bi_mode       = le16_to_cpu(inode.v->bi_mode);
-
 #define x(_name, _bits)                                        \
        if (fieldnr++ == INODE_NR_FIELDS(inode.v)) {                    \
                memset(&unpacked->_name, 0,                             \
@@ -176,6 +232,62 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode,
 #undef  x
 
        /* XXX: signal if there were more fields than expected? */
+       return 0;
+}
+
+static int bch2_inode_unpack_v2(struct bkey_s_c_inode inode,
+                               struct bch_inode_unpacked *unpacked)
+{
+       const u8 *in = inode.v->fields;
+       const u8 *end = bkey_val_end(inode);
+       unsigned fieldnr = 0;
+       int ret;
+       u64 v[2];
+
+#define x(_name, _bits)                                                        \
+       if (fieldnr < INODE_NR_FIELDS(inode.v)) {                       \
+               ret = bch2_varint_decode(in, end, &v[0]);               \
+               if (ret < 0)                                            \
+                       return ret;                                     \
+               in += ret;                                              \
+                                                                       \
+               if (_bits > 64) {                                       \
+                       ret = bch2_varint_decode(in, end, &v[1]);       \
+                       if (ret < 0)                                    \
+                               return ret;                             \
+                       in += ret;                                      \
+               } else {                                                \
+                       v[1] = 0;                                       \
+               }                                                       \
+       } else {                                                        \
+               v[0] = v[1] = 0;                                        \
+       }                                                               \
+                                                                       \
+       unpacked->_name = v[0];                                         \
+       if (v[1] || v[0] != unpacked->_name)                            \
+               return -1;                                              \
+       fieldnr++;
+
+       BCH_INODE_FIELDS()
+#undef  x
+
+       /* XXX: signal if there were more fields than expected? */
+       return 0;
+}
+
+int bch2_inode_unpack(struct bkey_s_c_inode inode,
+                     struct bch_inode_unpacked *unpacked)
+{
+       unpacked->bi_inum       = inode.k->p.offset;
+       unpacked->bi_hash_seed  = inode.v->bi_hash_seed;
+       unpacked->bi_flags      = le32_to_cpu(inode.v->bi_flags);
+       unpacked->bi_mode       = le16_to_cpu(inode.v->bi_mode);
+
+       if (INODE_NEW_VARINT(inode.v)) {
+               return bch2_inode_unpack_v2(inode, unpacked);
+       } else {
+               return bch2_inode_unpack_v1(inode, unpacked);
+       }
 
        return 0;
 }
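
bch2_inode_pack_v2()/bch2_inode_unpack_v2() above store each inode field through bch2_varint_encode()/bch2_varint_decode() from the new varint.c. The round-trip sketch below uses a plain LEB128 continuation-bit encoding purely for illustration; the actual on-disk varint format is whatever varint.c defines and may differ:

#include <stdint.h>
#include <stdio.h>

/* Encode v as a base-128 varint; returns bytes written (at most 10 for u64). */
static int varint_encode(uint8_t *out, uint64_t v)
{
        int bytes = 0;

        do {
                uint8_t byte = v & 0x7f;

                v >>= 7;
                if (v)
                        byte |= 0x80;   /* continuation: more bytes follow */
                out[bytes++] = byte;
        } while (v);

        return bytes;
}

/* Decode a varint from [in, end); returns bytes consumed, or -1 on truncation. */
static int varint_decode(const uint8_t *in, const uint8_t *end, uint64_t *v)
{
        unsigned shift = 0;
        int bytes = 0;

        *v = 0;
        while (in + bytes < end && shift < 64) {
                uint8_t byte = in[bytes++];

                *v |= (uint64_t) (byte & 0x7f) << shift;
                if (!(byte & 0x80))
                        return bytes;
                shift += 7;
        }
        return -1;                      /* truncated or overlong input */
}

int main(void)
{
        uint64_t fields[] = { 0, 1, 300, ~0ULL }, v;
        uint8_t buf[16];

        for (unsigned i = 0; i < sizeof(fields) / sizeof(fields[0]); i++) {
                int n = varint_encode(buf, fields[i]);
                int m = varint_decode(buf, buf + n, &v);

                printf("%llu -> %d bytes -> %llu (%s)\n",
                       (unsigned long long) fields[i], n,
                       (unsigned long long) v,
                       m == n && v == fields[i] ? "ok" : "FAIL");
        }
        return 0;
}

Note that in the v2 packer above a zero field is not varint-encoded at all: it is recorded as a single zero byte (two for fields wider than 64 bits), and last_nonzero_field/last_nonzero_fieldnr track where the trailing run of zero fields can be truncated.
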
@@ -188,12 +300,9 @@ struct btree_iter *bch2_inode_peek(struct btree_trans *trans,
        struct bkey_s_c k;
        int ret;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(0, inum),
-                                  BTREE_ITER_SLOTS|flags);
-       if (IS_ERR(iter))
-               return iter;
-
-       k = bch2_btree_iter_peek_slot(iter);
+       iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, inum),
+                                  BTREE_ITER_CACHED|flags);
+       k = bch2_btree_iter_peek_cached(iter);
        ret = bkey_err(k);
        if (ret)
                goto err;
@@ -222,7 +331,8 @@ int bch2_inode_write(struct btree_trans *trans,
        if (IS_ERR(inode_p))
                return PTR_ERR(inode_p);
 
-       bch2_inode_pack(inode_p, inode);
+       bch2_inode_pack(trans->c, inode_p, inode);
+       inode_p->inode.k.p.snapshot = iter->snapshot;
        bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
        return 0;
 }
@@ -271,6 +381,8 @@ void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c,
                return;
        }
 
+       pr_buf(out, "mode: %o ", unpacked.bi_mode);
+
 #define x(_name, _bits)                                                \
        pr_buf(out, #_name ": %llu ", (u64) unpacked._name);
        BCH_INODE_FIELDS()
@@ -358,72 +470,113 @@ static inline u32 bkey_generation(struct bkey_s_c k)
        }
 }
 
-int bch2_inode_create(struct btree_trans *trans,
-                     struct bch_inode_unpacked *inode_u,
-                     u64 min, u64 max, u64 *hint)
+struct btree_iter *bch2_inode_create(struct btree_trans *trans,
+                                    struct bch_inode_unpacked *inode_u,
+                                    u32 snapshot)
 {
-       struct bkey_inode_buf *inode_p;
+       struct bch_fs *c = trans->c;
        struct btree_iter *iter = NULL;
        struct bkey_s_c k;
-       u64 start;
+       u64 min, max, start, pos, *hint;
        int ret;
 
-       if (!max)
-               max = ULLONG_MAX;
+       u64 cpu = raw_smp_processor_id();
+       unsigned bits = (c->opts.inodes_32bit
+               ? 31 : 63) - c->inode_shard_bits;
+
+       min = (cpu << bits);
+       max = (cpu << bits) | ~(ULLONG_MAX << bits);
 
-       if (trans->c->opts.inodes_32bit)
-               max = min_t(u64, max, U32_MAX);
+       min = max_t(u64, min, BLOCKDEV_INODE_MAX);
+       hint = c->unused_inode_hints + cpu;
 
        start = READ_ONCE(*hint);
 
        if (start >= max || start < min)
                start = min;
 
-       inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
-       if (IS_ERR(inode_p))
-               return PTR_ERR(inode_p);
+       pos = start;
+       iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, pos),
+                                  BTREE_ITER_ALL_SNAPSHOTS|
+                                  BTREE_ITER_INTENT);
 again:
-       for_each_btree_key(trans, iter, BTREE_ID_INODES, POS(0, start),
-                          BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
-               if (bkey_cmp(iter->pos, POS(0, max)) > 0)
-                       break;
+       while ((k = bch2_btree_iter_peek(iter)).k &&
+              !(ret = bkey_err(k)) &&
+              bkey_cmp(k.k->p, POS(0, max)) < 0) {
+               while (pos < iter->pos.offset) {
+                       if (!bch2_btree_key_cache_find(c, BTREE_ID_inodes, POS(0, pos)))
+                               goto found_slot;
+
+                       pos++;
+               }
 
-               if (k.k->type != KEY_TYPE_inode)
-                       goto found_slot;
+               if (k.k->p.snapshot == snapshot &&
+                   k.k->type != KEY_TYPE_inode &&
+                   !bch2_btree_key_cache_find(c, BTREE_ID_inodes, SPOS(0, pos, snapshot))) {
+                       bch2_btree_iter_next(iter);
+                       continue;
+               }
+
+               /*
+                * We don't need to iterate over keys in every snapshot once
+                * we've found just one:
+                */
+               pos = iter->pos.offset + 1;
+               bch2_btree_iter_set_pos(iter, POS(0, pos));
        }
 
-       bch2_trans_iter_put(trans, iter);
+       while (!ret && pos < max) {
+               if (!bch2_btree_key_cache_find(c, BTREE_ID_inodes, POS(0, pos)))
+                       goto found_slot;
 
-       if (ret)
-               return ret;
+               pos++;
+       }
 
-       if (start != min) {
-               /* Retry from start */
-               start = min;
-               goto again;
+       if (!ret && start == min)
+               ret = -ENOSPC;
+
+       if (ret) {
+               bch2_trans_iter_put(trans, iter);
+               return ERR_PTR(ret);
        }
 
-       return -ENOSPC;
+       /* Retry from start */
+       pos = start = min;
+       bch2_btree_iter_set_pos(iter, POS(0, pos));
+       goto again;
 found_slot:
+       bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot));
+       k = bch2_btree_iter_peek_slot(iter);
+       ret = bkey_err(k);
+       if (ret) {
+               bch2_trans_iter_put(trans, iter);
+               return ERR_PTR(ret);
+       }
+
+       /* We may have raced while the iterator wasn't pointing at pos: */
+       if (k.k->type == KEY_TYPE_inode ||
+           bch2_btree_key_cache_find(c, BTREE_ID_inodes, k.k->p))
+               goto again;
+
        *hint                   = k.k->p.offset;
        inode_u->bi_inum        = k.k->p.offset;
        inode_u->bi_generation  = bkey_generation(k);
-
-       bch2_inode_pack(inode_p, inode_u);
-       bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
-       bch2_trans_iter_put(trans, iter);
-       return 0;
+       return iter;
 }
 
-int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
+int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter *iter = NULL;
        struct bkey_i_inode_generation delete;
        struct bpos start = POS(inode_nr, 0);
        struct bpos end = POS(inode_nr + 1, 0);
+       struct bch_inode_unpacked inode_u;
+       struct bkey_s_c k;
        int ret;
 
+       bch2_trans_init(&trans, c, 0, 0);
+
        /*
         * If this was a directory, there shouldn't be any real dirents left -
         * but there could be whiteouts (from hash collisions) that we should
@@ -432,79 +585,71 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
         * XXX: the dirent code could ideally delete whiteouts when they're no
         * longer needed
         */
-       ret   = bch2_btree_delete_range(c, BTREE_ID_EXTENTS,
-                                       start, end, NULL) ?:
-               bch2_btree_delete_range(c, BTREE_ID_XATTRS,
-                                       start, end, NULL) ?:
-               bch2_btree_delete_range(c, BTREE_ID_DIRENTS,
-                                       start, end, NULL);
+       ret   = bch2_btree_delete_range_trans(&trans, BTREE_ID_extents,
+                                             start, end, NULL) ?:
+               bch2_btree_delete_range_trans(&trans, BTREE_ID_xattrs,
+                                             start, end, NULL) ?:
+               bch2_btree_delete_range_trans(&trans, BTREE_ID_dirents,
+                                             start, end, NULL);
        if (ret)
-               return ret;
-
-       bch2_trans_init(&trans, c, 0, 0);
-
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr),
-                                  BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-       do {
-               struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
-               u32 bi_generation = 0;
-
-               ret = bkey_err(k);
-               if (ret)
-                       break;
+               goto err;
+retry:
+       bch2_trans_begin(&trans);
+
+       if (cached) {
+               iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS(0, inode_nr),
+                                          BTREE_ITER_CACHED|BTREE_ITER_INTENT);
+               k = bch2_btree_iter_peek_cached(iter);
+       } else {
+               iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS(0, inode_nr),
+                                          BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+               k = bch2_btree_iter_peek_slot(iter);
+       }
 
-               bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c,
-                                       "inode %llu not found when deleting",
-                                       inode_nr);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
 
-               switch (k.k->type) {
-               case KEY_TYPE_inode: {
-                       struct bch_inode_unpacked inode_u;
+       if (k.k->type != KEY_TYPE_inode) {
+               bch2_fs_inconsistent(trans.c,
+                                    "inode %llu not found when deleting",
+                                    inode_nr);
+               ret = -EIO;
+               goto err;
+       }
 
-                       if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u))
-                               bi_generation = inode_u.bi_generation + 1;
-                       break;
-               }
-               case KEY_TYPE_inode_generation: {
-                       struct bkey_s_c_inode_generation g =
-                               bkey_s_c_to_inode_generation(k);
-                       bi_generation = le32_to_cpu(g.v->bi_generation);
-                       break;
-               }
-               }
+       bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u);
 
-               if (!bi_generation) {
-                       bkey_init(&delete.k);
-                       delete.k.p.offset = inode_nr;
-               } else {
-                       bkey_inode_generation_init(&delete.k_i);
-                       delete.k.p.offset = inode_nr;
-                       delete.v.bi_generation = cpu_to_le32(bi_generation);
-               }
+       bkey_inode_generation_init(&delete.k_i);
+       delete.k.p = iter->pos;
+       delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
 
-               bch2_trans_update(&trans, iter, &delete.k_i, 0);
+       bch2_trans_update(&trans, iter, &delete.k_i, 0);
 
-               ret = bch2_trans_commit(&trans, NULL, NULL,
-                                       BTREE_INSERT_NOFAIL);
-       } while (ret == -EINTR);
+       ret = bch2_trans_commit(&trans, NULL, NULL,
+                               BTREE_INSERT_NOFAIL);
+err:
+       bch2_trans_iter_put(&trans, iter);
+       if (ret == -EINTR)
+               goto retry;
 
        bch2_trans_exit(&trans);
        return ret;
 }
 
-int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr,
-                                 struct bch_inode_unpacked *inode)
+int __bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr,
+                                   struct bch_inode_unpacked *inode,
+                                   unsigned flags)
 {
        struct btree_iter *iter;
        struct bkey_s_c k;
        int ret;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_INODES,
-                       POS(0, inode_nr), BTREE_ITER_SLOTS);
-       if (IS_ERR(iter))
-               return PTR_ERR(iter);
-
-       k = bch2_btree_iter_peek_slot(iter);
+       iter = bch2_trans_get_iter(trans, BTREE_ID_inodes,
+                       POS(0, inode_nr), flags);
+       k = (flags & BTREE_ITER_TYPE) == BTREE_ITER_CACHED
+               ? bch2_btree_iter_peek_cached(iter)
+               : bch2_btree_iter_peek_slot(iter);
        ret = bkey_err(k);
        if (ret)
                goto err;
@@ -517,38 +662,17 @@ err:
        return ret;
 }
 
+int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr,
+                                 struct bch_inode_unpacked *inode)
+{
+       return __bch2_inode_find_by_inum_trans(trans, inode_nr,
+                                              inode, BTREE_ITER_CACHED);
+
+}
+
 int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr,
                            struct bch_inode_unpacked *inode)
 {
        return bch2_trans_do(c, NULL, NULL, 0,
                bch2_inode_find_by_inum_trans(&trans, inode_nr, inode));
 }
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_inode_pack_test(void)
-{
-       struct bch_inode_unpacked *u, test_inodes[] = {
-               {
-                       .bi_atime       = U64_MAX,
-                       .bi_ctime       = U64_MAX,
-                       .bi_mtime       = U64_MAX,
-                       .bi_otime       = U64_MAX,
-                       .bi_size        = U64_MAX,
-                       .bi_sectors     = U64_MAX,
-                       .bi_uid         = U32_MAX,
-                       .bi_gid         = U32_MAX,
-                       .bi_nlink       = U32_MAX,
-                       .bi_generation  = U32_MAX,
-                       .bi_dev         = U32_MAX,
-               },
-       };
-
-       for (u = test_inodes;
-            u < test_inodes + ARRAY_SIZE(test_inodes);
-            u++) {
-               struct bkey_inode_buf p;
-
-               bch2_inode_pack(&p, u);
-       }
-}
-#endif
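
bch2_inode_create() above now carves the inode-number space into per-CPU shards: the CPU index is shifted into the high bits, the low bits filled with ones to form the shard's upper bound, and the lower bound clamped to BLOCKDEV_INODE_MAX. A standalone sketch of that arithmetic follows; the shard-bit count and the BLOCKDEV_INODE_MAX value used here are illustrative assumptions, not values read from a real superblock:

#include <stdint.h>
#include <stdio.h>

#define ILLUSTRATIVE_SHARD_BITS         3       /* 2^3 = 8 shards, assumed for the demo */
#define ILLUSTRATIVE_BLOCKDEV_INODE_MAX 4096ULL /* placeholder lower clamp */

/* Reproduce the min/max computation from bch2_inode_create() for one CPU. */
static void shard_range(uint64_t cpu, int inodes_32bit,
                        uint64_t *min, uint64_t *max)
{
        unsigned bits = (inodes_32bit ? 31 : 63) - ILLUSTRATIVE_SHARD_BITS;

        *min = cpu << bits;
        *max = (cpu << bits) | ~(~0ULL << bits);

        if (*min < ILLUSTRATIVE_BLOCKDEV_INODE_MAX)
                *min = ILLUSTRATIVE_BLOCKDEV_INODE_MAX;
}

int main(void)
{
        for (uint64_t cpu = 0; cpu < 3; cpu++) {
                uint64_t min, max;

                shard_range(cpu, 1, &min, &max);
                printf("cpu %llu: inodes [%llu, %llu]\n",
                       (unsigned long long) cpu,
                       (unsigned long long) min,
                       (unsigned long long) max);
        }
        return 0;
}

With the allocation hint also made per CPU (c->unused_inode_hints + cpu), concurrent creates on different CPUs draw from disjoint ranges instead of contending on a single cursor.
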
diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h
index bb759a46dc415a57a4f76119036cb245fd2c645b..23c322d9a85b0e64a0fea7717a0a5618b2d47a98 100644
--- a/libbcachefs/inode.h
+++ b/libbcachefs/inode.h
@@ -24,6 +24,14 @@ void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *,
        .val_to_text    = bch2_inode_generation_to_text,        \
 }
 
+#if 0
+typedef struct {
+       u64                     lo;
+       u32                     hi;
+} __packed __aligned(4) u96;
+#endif
+typedef u64 u96;
+
 struct bch_inode_unpacked {
        u64                     bi_inum;
        __le64                  bi_hash_seed;
@@ -43,7 +51,8 @@ struct bkey_inode_buf {
 #undef  x
 } __attribute__((packed, aligned(8)));
 
-void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *);
+void bch2_inode_pack(struct bch_fs *, struct bkey_inode_buf *,
+                    const struct bch_inode_unpacked *);
 int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *);
 
 struct btree_iter *bch2_inode_peek(struct btree_trans *,
@@ -60,12 +69,13 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
                     uid_t, gid_t, umode_t, dev_t,
                     struct bch_inode_unpacked *);
 
-int bch2_inode_create(struct btree_trans *,
-                     struct bch_inode_unpacked *,
-                     u64, u64, u64 *);
+struct btree_iter *bch2_inode_create(struct btree_trans *,
+                                    struct bch_inode_unpacked *, u32);
 
-int bch2_inode_rm(struct bch_fs *, u64);
+int bch2_inode_rm(struct bch_fs *, u64, bool);
 
+int __bch2_inode_find_by_inum_trans(struct btree_trans *, u64,
+                                   struct bch_inode_unpacked *, unsigned);
 int bch2_inode_find_by_inum_trans(struct btree_trans *, u64,
                                  struct bch_inode_unpacked *);
 int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *);
@@ -168,10 +178,4 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi,
        }
 }
 
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_inode_pack_test(void);
-#else
-static inline void bch2_inode_pack_test(void) {}
-#endif
-
 #endif /* _BCACHEFS_INODE_H */
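
The pack/unpack and to_text code above leans on the x-macro pattern: BCH_INODE_FIELDS() expands a caller-supplied x(_name, _bits) once per inode field. A minimal self-contained sketch of the technique with a made-up three-field list (the real field list lives elsewhere in the tree and is much longer):

#include <stdint.h>
#include <stdio.h>

/* Illustrative field list; _bits is carried along but unused in this sketch. */
#define DEMO_INODE_FIELDS()             \
        x(bi_atime,     64)             \
        x(bi_size,      64)             \
        x(bi_uid,       32)

struct demo_inode {
#define x(_name, _bits) uint64_t _name;
        DEMO_INODE_FIELDS()
#undef x
};

static void demo_inode_print(const struct demo_inode *u)
{
#define x(_name, _bits) printf(#_name ": %llu\n", (unsigned long long) u->_name);
        DEMO_INODE_FIELDS()
#undef x
}

int main(void)
{
        struct demo_inode u = { .bi_atime = 1, .bi_size = 4096, .bi_uid = 1000 };

        demo_inode_print(&u);
        return 0;
}

Each expansion site defines x() for its own purpose (declare a field, encode it, compare it, print it) and immediately #undefs it, which is exactly the shape of the pack, unpack and to_text hunks above.
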
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 8add8ccd129dade398cffe98d053cb983e97e0fd..36b10cb7ae6285ae3b461cdf1f89024a4303138d 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -9,7 +9,7 @@
 #include "bcachefs.h"
 #include "alloc_background.h"
 #include "alloc_foreground.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
 #include "bset.h"
 #include "btree_update.h"
 #include "buckets.h"
@@ -171,7 +171,7 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
 
        while (size) {
                struct page *page = __bio_alloc_page_pool(c, &using_mempool);
-               unsigned len = min(PAGE_SIZE, size);
+               unsigned len = min_t(size_t, PAGE_SIZE, size);
 
                BUG_ON(!bio_add_page(bio, page, len, 0));
                size -= len;
@@ -183,39 +183,47 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
 
 /* Extent update path: */
 
-static int sum_sector_overwrites(struct btree_trans *trans,
-                                struct btree_iter *extent_iter,
-                                struct bkey_i *new,
-                                bool may_allocate,
-                                bool *maybe_extending,
-                                s64 *delta)
+int bch2_sum_sector_overwrites(struct btree_trans *trans,
+                              struct btree_iter *extent_iter,
+                              struct bkey_i *new,
+                              bool *maybe_extending,
+                              bool *should_check_enospc,
+                              s64 *i_sectors_delta,
+                              s64 *disk_sectors_delta)
 {
+       struct bch_fs *c = trans->c;
        struct btree_iter *iter;
        struct bkey_s_c old;
+       unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new));
+       bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new));
        int ret = 0;
 
-       *maybe_extending = true;
-       *delta = 0;
+       *maybe_extending        = true;
+       *should_check_enospc    = false;
+       *i_sectors_delta        = 0;
+       *disk_sectors_delta     = 0;
 
        iter = bch2_trans_copy_iter(trans, extent_iter);
-       if (IS_ERR(iter))
-               return PTR_ERR(iter);
 
        for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) {
-               if (!may_allocate &&
-                   bch2_bkey_nr_ptrs_fully_allocated(old) <
-                   bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new))) {
-                       ret = -ENOSPC;
-                       break;
-               }
+               s64 sectors = min(new->k.p.offset, old.k->p.offset) -
+                       max(bkey_start_offset(&new->k),
+                           bkey_start_offset(old.k));
 
-               *delta += (min(new->k.p.offset,
-                             old.k->p.offset) -
-                         max(bkey_start_offset(&new->k),
-                             bkey_start_offset(old.k))) *
+               *i_sectors_delta += sectors *
                        (bkey_extent_is_allocation(&new->k) -
                         bkey_extent_is_allocation(old.k));
 
+               *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new));
+               *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot
+                       ? sectors * bch2_bkey_nr_ptrs_fully_allocated(old)
+                       : 0;
+
+               if (!*should_check_enospc &&
+                   (new_replicas > bch2_bkey_replicas(c, old) ||
+                    (!new_compressed && bch2_bkey_sectors_compressed(old))))
+                       *should_check_enospc = true;
+
                if (bkey_cmp(old.k->p, new->k.p) >= 0) {
                        /*
                         * Check if there's already data above where we're
@@ -249,29 +257,41 @@ int bch2_extent_update(struct btree_trans *trans,
                       struct disk_reservation *disk_res,
                       u64 *journal_seq,
                       u64 new_i_size,
-                      s64 *i_sectors_delta)
+                      s64 *i_sectors_delta_total)
 {
        /* this must live until after bch2_trans_commit(): */
        struct bkey_inode_buf inode_p;
-       bool extending = false;
-       s64 delta = 0;
+       bool extending = false, should_check_enospc;
+       s64 i_sectors_delta = 0, disk_sectors_delta = 0;
        int ret;
 
        ret = bch2_extent_trim_atomic(k, iter);
        if (ret)
                return ret;
 
-       ret = sum_sector_overwrites(trans, iter, k,
-                       disk_res && disk_res->sectors != 0,
-                       &extending, &delta);
+       ret = bch2_sum_sector_overwrites(trans, iter, k,
+                       &extending,
+                       &should_check_enospc,
+                       &i_sectors_delta,
+                       &disk_sectors_delta);
        if (ret)
                return ret;
 
+       if (disk_res &&
+           disk_sectors_delta > (s64) disk_res->sectors) {
+               ret = bch2_disk_reservation_add(trans->c, disk_res,
+                                       disk_sectors_delta - disk_res->sectors,
+                                       !should_check_enospc
+                                       ? BCH_DISK_RESERVATION_NOFAIL : 0);
+               if (ret)
+                       return ret;
+       }
+
        new_i_size = extending
                ? min(k->k.p.offset << 9, new_i_size)
                : 0;
 
-       if (delta || new_i_size) {
+       if (i_sectors_delta || new_i_size) {
                struct btree_iter *inode_iter;
                struct bch_inode_unpacked inode_u;
 
@@ -298,10 +318,13 @@ int bch2_extent_update(struct btree_trans *trans,
                else
                        new_i_size = 0;
 
-               inode_u.bi_sectors += delta;
+               inode_u.bi_sectors += i_sectors_delta;
+
+               if (i_sectors_delta || new_i_size) {
+                       bch2_inode_pack(trans->c, &inode_p, &inode_u);
+
+                       inode_p.inode.k.p.snapshot = iter->snapshot;
 
-               if (delta || new_i_size) {
-                       bch2_inode_pack(&inode_p, &inode_u);
                        bch2_trans_update(trans, inode_iter,
                                          &inode_p.inode.k_i, 0);
                }
@@ -313,12 +336,13 @@ int bch2_extent_update(struct btree_trans *trans,
 
        ret = bch2_trans_commit(trans, disk_res, journal_seq,
                                BTREE_INSERT_NOCHECK_RW|
-                               BTREE_INSERT_NOFAIL|
-                               BTREE_INSERT_USE_RESERVE);
-       if (!ret && i_sectors_delta)
-               *i_sectors_delta += delta;
+                               BTREE_INSERT_NOFAIL);
+       if (ret)
+               return ret;
 
-       return ret;
+       if (i_sectors_delta_total)
+               *i_sectors_delta_total += i_sectors_delta;
+       return 0;
 }
 
 int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
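
bch2_sum_sector_overwrites() above computes, for each key the new extent overwrites, sectors = min(ends) - max(starts), and scales that overlap into i_sectors_delta and disk_sectors_delta. A small standalone illustration of the overlap arithmetic with made-up sector numbers:

#include <stdint.h>
#include <stdio.h>

/*
 * An extent key's position is its end sector and bkey_start_offset() its
 * start, so the overlap of [new_start, new_end) with [old_start, old_end)
 * is min(ends) - max(starts).
 */
static int64_t overlap_sectors(uint64_t new_start, uint64_t new_end,
                               uint64_t old_start, uint64_t old_end)
{
        uint64_t start = old_start > new_start ? old_start : new_start;
        uint64_t end   = old_end   < new_end   ? old_end   : new_end;

        return (int64_t) (end - start); /* callers only pass overlapping pairs */
}

int main(void)
{
        /* new extent covers sectors [100, 200) */
        printf("%lld\n", (long long) overlap_sectors(100, 200, 150, 300)); /* 50  */
        printf("%lld\n", (long long) overlap_sectors(100, 200,  50, 120)); /* 20  */
        printf("%lld\n", (long long) overlap_sectors(100, 200, 100, 200)); /* 100 */
        return 0;
}

In the hunk above that overlap is then multiplied by (bkey_extent_is_allocation(new) - bkey_extent_is_allocation(old)) for i_sectors_delta, and by the allocated-pointer counts of the new and old keys for disk_sectors_delta.
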
@@ -378,12 +402,14 @@ int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end,
        int ret = 0;
 
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
                                   POS(inum, start),
                                   BTREE_ITER_INTENT);
 
        ret = bch2_fpunch_at(&trans, iter, POS(inum, end),
                             journal_seq, i_sectors_delta);
+
+       bch2_trans_iter_put(&trans, iter);
        bch2_trans_exit(&trans);
 
        if (ret == -EINTR)
@@ -395,17 +421,17 @@ int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end,
 int bch2_write_index_default(struct bch_write_op *op)
 {
        struct bch_fs *c = op->c;
-       struct bkey_on_stack sk;
+       struct bkey_buf sk;
        struct keylist *keys = &op->insert_keys;
        struct bkey_i *k = bch2_keylist_front(keys);
        struct btree_trans trans;
        struct btree_iter *iter;
        int ret;
 
-       bkey_on_stack_init(&sk);
+       bch2_bkey_buf_init(&sk);
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
                                   bkey_start_pos(&k->k),
                                   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 
@@ -414,7 +440,9 @@ int bch2_write_index_default(struct bch_write_op *op)
 
                k = bch2_keylist_front(keys);
 
-               bkey_on_stack_realloc(&sk, c, k->k.u64s);
+               k->k.p.snapshot = iter->snapshot;
+
+               bch2_bkey_buf_realloc(&sk, c, k->k.u64s);
                bkey_copy(sk.k, k);
                bch2_cut_front(iter->pos, sk.k);
 
@@ -430,8 +458,9 @@ int bch2_write_index_default(struct bch_write_op *op)
                        bch2_keylist_pop_front(keys);
        } while (!bch2_keylist_empty(keys));
 
+       bch2_trans_iter_put(&trans, iter);
        bch2_trans_exit(&trans);
-       bkey_on_stack_exit(&sk, c);
+       bch2_bkey_buf_exit(&sk, c);
 
        return ret;
 }
@@ -479,9 +508,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
                n->submit_time          = local_clock();
                n->bio.bi_iter.bi_sector = ptr->offset;
 
-               if (!journal_flushes_device(ca))
-                       n->bio.bi_opf |= REQ_FUA;
-
                if (likely(n->have_ioref)) {
                        this_cpu_add(ca->io_done->sectors[WRITE][type],
                                     bio_sectors(&n->bio));
@@ -578,7 +604,8 @@ static void __bch2_write_index(struct bch_write_op *op)
                op->written += sectors_start - keylist_sectors(keys);
 
                if (ret) {
-                       __bcache_io_error(c, "btree IO error %i", ret);
+                       bch_err_inum_ratelimited(c, op->pos.inode,
+                               "write error %i from btree update", ret);
                        op->error = ret;
                }
        }
@@ -623,7 +650,10 @@ static void bch2_write_endio(struct bio *bio)
        struct bch_fs *c                = wbio->c;
        struct bch_dev *ca              = bch_dev_bkey_exists(c, wbio->dev);
 
-       if (bch2_dev_io_err_on(bio->bi_status, ca, "data write: %s",
+       if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
+                                   op->pos.inode,
+                                   op->pos.offset - bio_sectors(bio), /* XXX definitely wrong */
+                                   "data write error: %s",
                               bch2_blk_status_to_str(bio->bi_status)))
                set_bit(wbio->dev, op->failed.d);
 
@@ -1281,15 +1311,14 @@ void bch2_write(struct closure *cl)
        wbio_init(bio)->put_bio = false;
 
        if (bio_sectors(bio) & (c->opts.block_size - 1)) {
-               __bcache_io_error(c, "misaligned write");
+               bch_err_inum_ratelimited(c, op->pos.inode,
+                                        "misaligned write");
                op->error = -EIO;
                goto err;
        }
 
        if (c->opts.nochanges ||
            !percpu_ref_tryget(&c->writes)) {
-               if (!(op->flags & BCH_WRITE_FROM_INTERNAL))
-                       __bcache_io_error(c, "read only");
                op->error = -EROFS;
                goto err;
        }
@@ -1518,8 +1547,8 @@ static struct promote_op *promote_alloc(struct bch_fs *c,
 
        promote = __promote_alloc(c,
                                  k.k->type == KEY_TYPE_reflink_v
-                                 ? BTREE_ID_REFLINK
-                                 : BTREE_ID_EXTENTS,
+                                 ? BTREE_ID_reflink
+                                 : BTREE_ID_extents,
                                  k, pos, pick, opts, sectors, rbio);
        if (!promote)
                return NULL;
@@ -1605,18 +1634,18 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio
 {
        struct btree_trans trans;
        struct btree_iter *iter;
-       struct bkey_on_stack sk;
+       struct bkey_buf sk;
        struct bkey_s_c k;
        int ret;
 
        flags &= ~BCH_READ_LAST_FRAGMENT;
        flags |= BCH_READ_MUST_CLONE;
 
-       bkey_on_stack_init(&sk);
+       bch2_bkey_buf_init(&sk);
        bch2_trans_init(&trans, c, 0, 0);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
-                                  rbio->pos, BTREE_ITER_SLOTS);
+       iter = bch2_trans_get_iter(&trans, rbio->data_btree,
+                                  rbio->read_pos, BTREE_ITER_SLOTS);
 retry:
        rbio->bio.bi_status = 0;
 
@@ -1624,109 +1653,38 @@ retry:
        if (bkey_err(k))
                goto err;
 
-       bkey_on_stack_reassemble(&sk, c, k);
+       bch2_bkey_buf_reassemble(&sk, c, k);
        k = bkey_i_to_s_c(sk.k);
        bch2_trans_unlock(&trans);
 
        if (!bch2_bkey_matches_ptr(c, k,
                                   rbio->pick.ptr,
-                                  rbio->pos.offset -
+                                  rbio->data_pos.offset -
                                   rbio->pick.crc.offset)) {
                /* extent we wanted to read no longer exists: */
                rbio->hole = true;
                goto out;
        }
 
-       ret = __bch2_read_extent(&trans, rbio, bvec_iter, k, 0, failed, flags);
+       ret = __bch2_read_extent(&trans, rbio, bvec_iter,
+                                rbio->read_pos,
+                                rbio->data_btree,
+                                k, 0, failed, flags);
        if (ret == READ_RETRY)
                goto retry;
        if (ret)
                goto err;
 out:
        bch2_rbio_done(rbio);
+       bch2_trans_iter_put(&trans, iter);
        bch2_trans_exit(&trans);
-       bkey_on_stack_exit(&sk, c);
+       bch2_bkey_buf_exit(&sk, c);
        return;
 err:
        rbio->bio.bi_status = BLK_STS_IOERR;
        goto out;
 }
 
-static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio,
-                           struct bvec_iter bvec_iter, u64 inode,
-                           struct bch_io_failures *failed, unsigned flags)
-{
-       struct btree_trans trans;
-       struct btree_iter *iter;
-       struct bkey_on_stack sk;
-       struct bkey_s_c k;
-       int ret;
-
-       flags &= ~BCH_READ_LAST_FRAGMENT;
-       flags |= BCH_READ_MUST_CLONE;
-
-       bkey_on_stack_init(&sk);
-       bch2_trans_init(&trans, c, 0, 0);
-retry:
-       bch2_trans_begin(&trans);
-
-       for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
-                          POS(inode, bvec_iter.bi_sector),
-                          BTREE_ITER_SLOTS, k, ret) {
-               unsigned bytes, sectors, offset_into_extent;
-
-               bkey_on_stack_reassemble(&sk, c, k);
-
-               offset_into_extent = iter->pos.offset -
-                       bkey_start_offset(k.k);
-               sectors = k.k->size - offset_into_extent;
-
-               ret = bch2_read_indirect_extent(&trans,
-                                       &offset_into_extent, &sk);
-               if (ret)
-                       break;
-
-               k = bkey_i_to_s_c(sk.k);
-
-               sectors = min(sectors, k.k->size - offset_into_extent);
-
-               bch2_trans_unlock(&trans);
-
-               bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
-               swap(bvec_iter.bi_size, bytes);
-
-               ret = __bch2_read_extent(&trans, rbio, bvec_iter, k,
-                               offset_into_extent, failed, flags);
-               switch (ret) {
-               case READ_RETRY:
-                       goto retry;
-               case READ_ERR:
-                       goto err;
-               };
-
-               if (bytes == bvec_iter.bi_size)
-                       goto out;
-
-               swap(bvec_iter.bi_size, bytes);
-               bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
-       }
-
-       if (ret == -EINTR)
-               goto retry;
-       /*
-        * If we get here, it better have been because there was an error
-        * reading a btree node
-        */
-       BUG_ON(!ret);
-       __bcache_io_error(c, "btree IO error: %i", ret);
-err:
-       rbio->bio.bi_status = BLK_STS_IOERR;
-out:
-       bch2_trans_exit(&trans);
-       bkey_on_stack_exit(&sk, c);
-       bch2_rbio_done(rbio);
-}
-
 static void bch2_rbio_retry(struct work_struct *work)
 {
        struct bch_read_bio *rbio =
@@ -1734,7 +1692,7 @@ static void bch2_rbio_retry(struct work_struct *work)
        struct bch_fs *c        = rbio->c;
        struct bvec_iter iter   = rbio->bvec_iter;
        unsigned flags          = rbio->flags;
-       u64 inode               = rbio->pos.inode;
+       u64 inode               = rbio->read_pos.inode;
        struct bch_io_failures failed = { .nr = 0 };
 
        trace_read_retry(&rbio->bio);
@@ -1749,10 +1707,14 @@ static void bch2_rbio_retry(struct work_struct *work)
        flags |= BCH_READ_IN_RETRY;
        flags &= ~BCH_READ_MAY_PROMOTE;
 
-       if (flags & BCH_READ_NODECODE)
+       if (flags & BCH_READ_NODECODE) {
                bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags);
-       else
-               bch2_read_retry(c, rbio, iter, inode, &failed, flags);
+       } else {
+               flags &= ~BCH_READ_LAST_FRAGMENT;
+               flags |= BCH_READ_MUST_CLONE;
+
+               __bch2_read(c, rbio, iter, inode, &failed, flags);
+       }
 }
 
 static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
@@ -1778,7 +1740,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
                                   struct bch_read_bio *rbio)
 {
        struct bch_fs *c = rbio->c;
-       u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset;
+       u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
        struct bch_extent_crc_unpacked new_crc;
        struct btree_iter *iter = NULL;
        struct bkey_i *new;
@@ -1788,26 +1750,12 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
        if (crc_is_compressed(rbio->pick.crc))
                return 0;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_EXTENTS, rbio->pos,
+       iter = bch2_trans_get_iter(trans, rbio->data_btree, rbio->data_pos,
                                   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-       if ((ret = PTR_ERR_OR_ZERO(iter)))
-               goto out;
-
        k = bch2_btree_iter_peek_slot(iter);
        if ((ret = bkey_err(k)))
                goto out;
 
-       /*
-        * going to be temporarily appending another checksum entry:
-        */
-       new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
-                                BKEY_EXTENT_U64s_MAX * 8);
-       if ((ret = PTR_ERR_OR_ZERO(new)))
-               goto out;
-
-       bkey_reassemble(new, k);
-       k = bkey_i_to_s_c(new);
-
        if (bversion_cmp(k.k->version, rbio->version) ||
            !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
                goto out;
@@ -1826,6 +1774,16 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
                goto out;
        }
 
+       /*
+        * going to be temporarily appending another checksum entry:
+        */
+       new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
+                                sizeof(struct bch_extent_crc128));
+       if ((ret = PTR_ERR_OR_ZERO(new)))
+               goto out;
+
+       bkey_reassemble(new, k);
+
        if (!bch2_bkey_narrow_crcs(new, new_crc))
                goto out;
 
@@ -1925,17 +1883,15 @@ csum_err:
                return;
        }
 
-       bch2_dev_io_error(ca,
-               "data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)",
-               rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
+       bch2_dev_inum_io_error(ca, rbio->read_pos.inode, (u64) rbio->bvec_iter.bi_sector,
+               "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %u)",
                rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
                csum.hi, csum.lo, crc.csum_type);
        bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
        return;
 decompression_err:
-       __bcache_io_error(c, "decompression error, inode %llu offset %llu",
-                         rbio->pos.inode,
-                         (u64) rbio->bvec_iter.bi_sector);
+       bch_err_inum_ratelimited(c, rbio->read_pos.inode,
+                                "decompression error");
        bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
        return;
 }
@@ -1957,7 +1913,10 @@ static void bch2_read_endio(struct bio *bio)
        if (!rbio->split)
                rbio->bio.bi_end_io = rbio->end_io;
 
-       if (bch2_dev_io_err_on(bio->bi_status, ca, "data read; %s",
+       if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
+                                   rbio->read_pos.inode,
+                                   rbio->read_pos.offset,
+                                   "data read error: %s",
                               bch2_blk_status_to_str(bio->bi_status))) {
                bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
                return;
@@ -1987,7 +1946,7 @@ static void bch2_read_endio(struct bio *bio)
 
 int __bch2_read_indirect_extent(struct btree_trans *trans,
                                unsigned *offset_into_extent,
-                               struct bkey_on_stack *orig_k)
+                               struct bkey_buf *orig_k)
 {
        struct btree_iter *iter;
        struct bkey_s_c k;
@@ -1997,13 +1956,9 @@ int __bch2_read_indirect_extent(struct btree_trans *trans,
        reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
                *offset_into_extent;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_REFLINK,
+       iter = bch2_trans_get_iter(trans, BTREE_ID_reflink,
                                   POS(0, reflink_offset),
                                   BTREE_ITER_SLOTS);
-       ret = PTR_ERR_OR_ZERO(iter);
-       if (ret)
-               return ret;
-
        k = bch2_btree_iter_peek_slot(iter);
        ret = bkey_err(k);
        if (ret)
@@ -2011,21 +1966,22 @@ int __bch2_read_indirect_extent(struct btree_trans *trans,
 
        if (k.k->type != KEY_TYPE_reflink_v &&
            k.k->type != KEY_TYPE_indirect_inline_data) {
-               __bcache_io_error(trans->c,
+               bch_err_inum_ratelimited(trans->c, orig_k->k->k.p.inode,
                                "pointer to nonexistent indirect extent");
                ret = -EIO;
                goto err;
        }
 
        *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k);
-       bkey_on_stack_reassemble(orig_k, trans->c, k);
+       bch2_bkey_buf_reassemble(orig_k, trans->c, k);
 err:
        bch2_trans_iter_put(trans, iter);
        return ret;
 }
 
 int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
-                      struct bvec_iter iter, struct bkey_s_c k,
+                      struct bvec_iter iter, struct bpos read_pos,
+                      enum btree_id data_btree, struct bkey_s_c k,
                       unsigned offset_into_extent,
                       struct bch_io_failures *failed, unsigned flags)
 {
@@ -2035,7 +1991,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
        struct bch_dev *ca;
        struct promote_op *promote = NULL;
        bool bounce = false, read_full = false, narrow_crcs = false;
-       struct bpos pos = bkey_start_pos(k.k);
+       struct bpos data_pos = bkey_start_pos(k.k);
        int pick_ret;
 
        if (bkey_extent_is_inline_data(k.k)) {
@@ -2057,7 +2013,8 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
                goto hole;
 
        if (pick_ret < 0) {
-               __bcache_io_error(c, "no device to read from");
+               bch_err_inum_ratelimited(c, k.k->p.inode,
+                                        "no device to read from");
                goto err;
        }
 
@@ -2110,7 +2067,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
                         pick.crc.offset ||
                         offset_into_extent));
 
-               pos.offset += offset_into_extent;
+               data_pos.offset += offset_into_extent;
                pick.ptr.offset += pick.crc.offset +
                        offset_into_extent;
                offset_into_extent              = 0;
@@ -2182,7 +2139,9 @@ get_bio:
        /* XXX: only initialize this if needed */
        rbio->devs_have         = bch2_bkey_devs(k);
        rbio->pick              = pick;
-       rbio->pos               = pos;
+       rbio->read_pos          = read_pos;
+       rbio->data_btree        = data_btree;
+       rbio->data_pos          = data_pos;
        rbio->version           = k.k->version;
        rbio->promote           = promote;
        INIT_WORK(&rbio->work, NULL);
@@ -2196,7 +2155,11 @@ get_bio:
 
        bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
 
-       if (pick.ptr.cached)
+       /*
+        * If it's being moved internally, we don't want to flag it as a cache
+        * hit:
+        */
+       if (pick.ptr.cached && !(flags & BCH_READ_NODECODE))
                bch2_bucket_io_time_reset(trans, pick.ptr.dev,
                        PTR_BUCKET_NR(ca, &pick.ptr), READ);
 
@@ -2207,7 +2170,8 @@ get_bio:
 
        if (!rbio->pick.idx) {
                if (!rbio->have_ioref) {
-                       __bcache_io_error(c, "no device to read from");
+                       bch_err_inum_ratelimited(c, k.k->p.inode,
+                                                "no device to read from");
                        bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
                        goto out;
                }
@@ -2247,6 +2211,9 @@ out:
                        ret = READ_RETRY;
                }
 
+               if (!ret)
+                       goto out_read_done;
+
                return ret;
        }
 
@@ -2273,53 +2240,48 @@ out_read_done:
        return 0;
 }
 
-void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
+void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
+                struct bvec_iter bvec_iter, u64 inode,
+                struct bch_io_failures *failed, unsigned flags)
 {
        struct btree_trans trans;
        struct btree_iter *iter;
-       struct bkey_on_stack sk;
+       struct bkey_buf sk;
        struct bkey_s_c k;
-       unsigned flags = BCH_READ_RETRY_IF_STALE|
-               BCH_READ_MAY_PROMOTE|
-               BCH_READ_USER_MAPPED;
        int ret;
 
-       BUG_ON(rbio->_state);
        BUG_ON(flags & BCH_READ_NODECODE);
-       BUG_ON(flags & BCH_READ_IN_RETRY);
 
-       rbio->c = c;
-       rbio->start_time = local_clock();
-
-       bkey_on_stack_init(&sk);
+       bch2_bkey_buf_init(&sk);
        bch2_trans_init(&trans, c, 0, 0);
 retry:
        bch2_trans_begin(&trans);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
-                                  POS(inode, rbio->bio.bi_iter.bi_sector),
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
+                                  POS(inode, bvec_iter.bi_sector),
                                   BTREE_ITER_SLOTS);
        while (1) {
                unsigned bytes, sectors, offset_into_extent;
+               enum btree_id data_btree = BTREE_ID_extents;
 
                bch2_btree_iter_set_pos(iter,
-                               POS(inode, rbio->bio.bi_iter.bi_sector));
+                               POS(inode, bvec_iter.bi_sector));
 
                k = bch2_btree_iter_peek_slot(iter);
                ret = bkey_err(k);
                if (ret)
-                       goto err;
+                       break;
 
                offset_into_extent = iter->pos.offset -
                        bkey_start_offset(k.k);
                sectors = k.k->size - offset_into_extent;
 
-               bkey_on_stack_reassemble(&sk, c, k);
+               bch2_bkey_buf_reassemble(&sk, c, k);
 
-               ret = bch2_read_indirect_extent(&trans,
+               ret = bch2_read_indirect_extent(&trans, &data_btree,
                                        &offset_into_extent, &sk);
                if (ret)
-                       goto err;
+                       break;
 
                k = bkey_i_to_s_c(sk.k);
 
@@ -2335,31 +2297,37 @@ retry:
                 */
                bch2_trans_unlock(&trans);
 
-               bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
-               swap(rbio->bio.bi_iter.bi_size, bytes);
+               bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
+               swap(bvec_iter.bi_size, bytes);
 
-               if (rbio->bio.bi_iter.bi_size == bytes)
+               if (bvec_iter.bi_size == bytes)
                        flags |= BCH_READ_LAST_FRAGMENT;
 
-               bch2_read_extent(&trans, rbio, k, offset_into_extent, flags);
+               ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter->pos,
+                                        data_btree, k,
+                                        offset_into_extent, failed, flags);
+               if (ret)
+                       break;
 
                if (flags & BCH_READ_LAST_FRAGMENT)
                        break;
 
-               swap(rbio->bio.bi_iter.bi_size, bytes);
-               bio_advance(&rbio->bio, bytes);
+               swap(bvec_iter.bi_size, bytes);
+               bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
        }
-out:
-       bch2_trans_exit(&trans);
-       bkey_on_stack_exit(&sk, c);
-       return;
-err:
-       if (ret == -EINTR)
+       bch2_trans_iter_put(&trans, iter);
+
+       if (ret == -EINTR || ret == READ_RETRY || ret == READ_RETRY_AVOID)
                goto retry;
 
-       bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret);
-       bch2_rbio_done(rbio);
-       goto out;
+       if (ret) {
+               bch_err_inum_ratelimited(c, inode,
+                                        "read error %i from btree lookup", ret);
+               rbio->bio.bi_status = BLK_STS_IOERR;
+               bch2_rbio_done(rbio);
+       }
+       bch2_trans_exit(&trans);
+       bch2_bkey_buf_exit(&sk, c);
 }
 
 void bch2_fs_io_exit(struct bch_fs *c)
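
The read-path changes above split struct bch_read_bio's single pos into read_pos (where the read was issued) and data_pos/data_btree (where the data actually lives), which differ for reflinked data: __bch2_read_indirect_extent() redirects the lookup to the reflink btree at reflink_p.idx + offset_into_extent. A sketch of that mapping with illustrative stand-in types (demo_pos, resolve_data_pos and the DEMO_* names are not bcachefs identifiers):

#include <stdint.h>
#include <stdio.h>

enum demo_btree { DEMO_BTREE_extents, DEMO_BTREE_reflink };

struct demo_pos { uint64_t inode, offset; };

/* For a plain extent, data_pos == read_pos; for a reflink pointer the data
 * is looked up in the reflink btree at idx + offset_into_extent. */
static void resolve_data_pos(uint64_t inode, uint64_t sector,
                             int is_reflink_p, uint64_t reflink_idx,
                             uint64_t offset_into_extent,
                             enum demo_btree *data_btree,
                             struct demo_pos *data_pos)
{
        if (!is_reflink_p) {
                *data_btree = DEMO_BTREE_extents;
                *data_pos = (struct demo_pos) { inode, sector };
        } else {
                *data_btree = DEMO_BTREE_reflink;
                *data_pos = (struct demo_pos) { 0, reflink_idx + offset_into_extent };
        }
}

int main(void)
{
        enum demo_btree btree;
        struct demo_pos pos;

        /* read at inode 42, sector 1000, through a reflink_p with idx 5000,
         * 16 sectors into the logical extent */
        resolve_data_pos(42, 1000, 1, 5000, 16, &btree, &pos);
        printf("btree=%d data_pos=%llu:%llu\n", btree,
               (unsigned long long) pos.inode,
               (unsigned long long) pos.offset);
        return 0;
}

Keeping both positions on the rbio lets retry and error reporting use the user-visible read_pos while CRC narrowing and re-reads use data_btree/data_pos.
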
diff --git a/libbcachefs/io.h b/libbcachefs/io.h
index e6aac594f3e6a8e0267c0cad3aabde00059b0fae..2ac03c049c9236279cd5430a792058e295c84783 100644
--- a/libbcachefs/io.h
+++ b/libbcachefs/io.h
@@ -3,7 +3,7 @@
 #define _BCACHEFS_IO_H
 
 #include "checksum.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
 #include "io_types.h"
 
 #define to_wbio(_bio)                  \
@@ -60,6 +60,8 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
                : op->c->wq;
 }
 
+int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *,
+                              struct bkey_i *, bool *, bool *, s64 *, s64 *);
 int bch2_extent_update(struct btree_trans *, struct btree_iter *,
                       struct bkey_i *, struct disk_reservation *,
                       u64 *, u64, s64 *);
@@ -112,15 +114,18 @@ struct cache_promote_op;
 struct extent_ptr_decoded;
 
 int __bch2_read_indirect_extent(struct btree_trans *, unsigned *,
-                               struct bkey_on_stack *);
+                               struct bkey_buf *);
 
 static inline int bch2_read_indirect_extent(struct btree_trans *trans,
+                                           enum btree_id *data_btree,
                                            unsigned *offset_into_extent,
-                                           struct bkey_on_stack *k)
+                                           struct bkey_buf *k)
 {
-       return k->k->k.type == KEY_TYPE_reflink_p
-               ? __bch2_read_indirect_extent(trans, offset_into_extent, k)
-               : 0;
+       if (k->k->k.type != KEY_TYPE_reflink_p)
+               return 0;
+
+       *data_btree = BTREE_ID_reflink;
+       return __bch2_read_indirect_extent(trans, offset_into_extent, k);
 }
 
 enum bch_read_flags {
@@ -137,20 +142,37 @@ enum bch_read_flags {
 };
 
 int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *,
-                      struct bvec_iter, struct bkey_s_c, unsigned,
+                      struct bvec_iter, struct bpos, enum btree_id,
+                      struct bkey_s_c, unsigned,
                       struct bch_io_failures *, unsigned);
 
 static inline void bch2_read_extent(struct btree_trans *trans,
-                                   struct bch_read_bio *rbio,
-                                   struct bkey_s_c k,
-                                   unsigned offset_into_extent,
-                                   unsigned flags)
+                       struct bch_read_bio *rbio, struct bpos read_pos,
+                       enum btree_id data_btree, struct bkey_s_c k,
+                       unsigned offset_into_extent, unsigned flags)
 {
-       __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, k,
-                          offset_into_extent, NULL, flags);
+       __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos,
+                          data_btree, k, offset_into_extent, NULL, flags);
 }
 
-void bch2_read(struct bch_fs *, struct bch_read_bio *, u64);
+void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
+                u64, struct bch_io_failures *, unsigned flags);
+
+static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
+                            u64 inode)
+{
+       struct bch_io_failures failed = { .nr = 0 };
+
+       BUG_ON(rbio->_state);
+
+       rbio->c = c;
+       rbio->start_time = local_clock();
+
+       __bch2_read(c, rbio, rbio->bio.bi_iter, inode, &failed,
+                   BCH_READ_RETRY_IF_STALE|
+                   BCH_READ_MAY_PROMOTE|
+                   BCH_READ_USER_MAPPED);
+}
 
 static inline struct bch_read_bio *rbio_init(struct bio *bio,
                                             struct bch_io_opts opts)
diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h
index b23727d212b9d9353a0f9bc3531c9508d0c29351..e7aca7c9823aff2a5c6cfe8a21820c41d3d627fd 100644
--- a/libbcachefs/io_types.h
+++ b/libbcachefs/io_types.h
@@ -58,8 +58,18 @@ struct bch_read_bio {
        struct bch_devs_list    devs_have;
 
        struct extent_ptr_decoded pick;
-       /* start pos of data we read (may not be pos of data we want) */
-       struct bpos             pos;
+
+       /*
+        * pos we read from - different from data_pos for indirect extents:
+        */
+       struct bpos             read_pos;
+
+       /*
+        * start pos of data we read (may not be pos of data we want) - for
+        * promote, narrow extents paths:
+        */
+       enum btree_id           data_btree;
+       struct bpos             data_pos;
        struct bversion         version;
 
        struct promote_op       *promote;
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index b8b719902c637ffd739d7b43f1432beb8820f327..b901be5ba9c0878402e208c3df256fee103c87ae 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -9,7 +9,9 @@
 #include "alloc_foreground.h"
 #include "bkey_methods.h"
 #include "btree_gc.h"
+#include "btree_update.h"
 #include "buckets.h"
+#include "error.h"
 #include "journal.h"
 #include "journal_io.h"
 #include "journal_reclaim.h"
 
 #include <trace/events/bcachefs.h>
 
-static inline struct journal_buf *journal_seq_to_buf(struct journal *, u64);
+static u64 last_unwritten_seq(struct journal *j)
+{
+       union journal_res_state s = READ_ONCE(j->reservations);
+
+       lockdep_assert_held(&j->lock);
+
+       return journal_cur_seq(j) - ((s.idx - s.unwritten_idx) & JOURNAL_BUF_MASK);
+}
+
+static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
+{
+       return seq >= last_unwritten_seq(j);
+}
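
The two helpers above encode the new multi-buffer journal pipeline: with four journal_bufs (the JOURNAL_BUF_MASK arithmetic plus the buf0..buf3 refcounts later in this patch imply a 4-entry ring), the oldest unwritten sequence number is the current one minus however far idx has advanced past unwritten_idx. A standalone check of that arithmetic, with JOURNAL_BUF_MASK assumed to be 3:

    #include <assert.h>
    #include <stdint.h>

    #define JOURNAL_BUF_MASK 3  /* assumed: 4-entry ring, mask = nr - 1 */

    /* mirrors last_unwritten_seq(): how far idx has advanced past unwritten_idx */
    static uint64_t last_unwritten(uint64_t cur_seq, unsigned idx, unsigned unwritten_idx)
    {
            return cur_seq - ((idx - unwritten_idx) & JOURNAL_BUF_MASK);
    }

    int main(void)
    {
            /* current entry is seq 10 in slot 2; slots 0 and 1 still unwritten */
            assert(last_unwritten(10, 2, 0) == 8);
            /* everything written: idx == unwritten_idx */
            assert(last_unwritten(10, 2, 2) == 10);
            return 0;
    }
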
 
 static bool __journal_entry_is_open(union journal_res_state state)
 {
@@ -30,27 +44,50 @@ static bool journal_entry_is_open(struct journal *j)
        return __journal_entry_is_open(j->reservations);
 }
 
-static void journal_pin_new_entry(struct journal *j, int count)
+static inline struct journal_buf *
+journal_seq_to_buf(struct journal *j, u64 seq)
 {
-       struct journal_entry_pin_list *p;
+       struct journal_buf *buf = NULL;
 
-       /*
-        * The fifo_push() needs to happen at the same time as j->seq is
-        * incremented for journal_last_seq() to be calculated correctly
-        */
-       atomic64_inc(&j->seq);
-       p = fifo_push_ref(&j->pin);
+       EBUG_ON(seq > journal_cur_seq(j));
+       EBUG_ON(seq == journal_cur_seq(j) &&
+               j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL);
+
+       if (journal_seq_unwritten(j, seq)) {
+               buf = j->buf + (seq & JOURNAL_BUF_MASK);
+               EBUG_ON(le64_to_cpu(buf->data->seq) != seq);
+       }
+       return buf;
+}
 
+static void journal_pin_list_init(struct journal_entry_pin_list *p, int count)
+{
        INIT_LIST_HEAD(&p->list);
+       INIT_LIST_HEAD(&p->key_cache_list);
        INIT_LIST_HEAD(&p->flushed);
        atomic_set(&p->count, count);
        p->devs.nr = 0;
 }
 
+static void journal_pin_new_entry(struct journal *j)
+{
+       /*
+        * The fifo_push() needs to happen at the same time as j->seq is
+        * incremented for journal_last_seq() to be calculated correctly
+        */
+       atomic64_inc(&j->seq);
+       journal_pin_list_init(fifo_push_ref(&j->pin), 1);
+}
+
 static void bch2_journal_buf_init(struct journal *j)
 {
        struct journal_buf *buf = journal_cur_buf(j);
 
+       bkey_extent_init(&buf->key);
+       buf->noflush    = false;
+       buf->must_flush = false;
+       buf->separate_flush = false;
+
        memset(buf->has_inode, 0, sizeof(buf->has_inode));
 
        memset(buf->data, 0, sizeof(*buf->data));
@@ -72,26 +109,23 @@ void bch2_journal_halt(struct journal *j)
        } while ((v = atomic64_cmpxchg(&j->reservations.counter,
                                       old.v, new.v)) != old.v);
 
+       j->err_seq = journal_cur_seq(j);
        journal_wake(j);
        closure_wake_up(&journal_cur_buf(j)->wait);
 }
 
 /* journal entry close/open: */
 
-void __bch2_journal_buf_put(struct journal *j, bool need_write_just_set)
+void __bch2_journal_buf_put(struct journal *j)
 {
-       if (!need_write_just_set &&
-           test_bit(JOURNAL_NEED_WRITE, &j->flags))
-               bch2_time_stats_update(j->delay_time,
-                                      j->need_write_time);
-
-       clear_bit(JOURNAL_NEED_WRITE, &j->flags);
-
        closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL);
 }
 
 /*
  * Returns true if journal entry is now closed:
+ *
+ * We don't close a journal_buf until the next journal_buf is finished writing,
+ * and can be opened again - this also initializes the next journal_buf:
  */
 static bool __journal_entry_close(struct journal *j)
 {
@@ -99,7 +133,6 @@ static bool __journal_entry_close(struct journal *j)
        struct journal_buf *buf = journal_cur_buf(j);
        union journal_res_state old, new;
        u64 v = atomic64_read(&j->reservations.counter);
-       bool set_need_write = false;
        unsigned sectors;
 
        lockdep_assert_held(&j->lock);
@@ -118,20 +151,19 @@ static bool __journal_entry_close(struct journal *j)
                if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) {
                        set_bit(JOURNAL_NEED_WRITE, &j->flags);
                        j->need_write_time = local_clock();
-                       set_need_write = true;
                }
 
-               if (new.prev_buf_unwritten)
-                       return false;
-
                new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL;
                new.idx++;
-               new.prev_buf_unwritten = 1;
+
+               if (new.idx == new.unwritten_idx)
+                       return false;
 
                BUG_ON(journal_state_count(new, new.idx));
        } while ((v = atomic64_cmpxchg(&j->reservations.counter,
                                       old.v, new.v)) != old.v);
 
+       /* Close out old buffer: */
        buf->data->u64s         = cpu_to_le32(old.cur_entry_offset);
 
        sectors = vstruct_blocks_plus(buf->data, c->block_bits,
@@ -139,8 +171,6 @@ static bool __journal_entry_close(struct journal *j)
        BUG_ON(sectors > buf->sectors);
        buf->sectors = sectors;
 
-       bkey_extent_init(&buf->key);
-
        /*
         * We have to set last_seq here, _before_ opening a new journal entry:
         *
@@ -162,29 +192,45 @@ static bool __journal_entry_close(struct journal *j)
         */
        buf->data->last_seq     = cpu_to_le64(journal_last_seq(j));
 
-       if (journal_entry_empty(buf->data))
-               clear_bit(JOURNAL_NOT_EMPTY, &j->flags);
-       else
-               set_bit(JOURNAL_NOT_EMPTY, &j->flags);
+       __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq));
 
-       journal_pin_new_entry(j, 1);
+       /* Initialize new buffer: */
+       journal_pin_new_entry(j);
 
        bch2_journal_buf_init(j);
 
        cancel_delayed_work(&j->write_work);
+       clear_bit(JOURNAL_NEED_WRITE, &j->flags);
 
        bch2_journal_space_available(j);
 
-       bch2_journal_buf_put(j, old.idx, set_need_write);
+       bch2_journal_buf_put(j, old.idx);
        return true;
 }
 
+static bool journal_entry_want_write(struct journal *j)
+{
+       union journal_res_state s = READ_ONCE(j->reservations);
+       bool ret = false;
+
+       /*
+        * Don't close it yet if we already have a write in flight, but do set
+        * NEED_WRITE:
+        */
+       if (s.idx != s.unwritten_idx)
+               set_bit(JOURNAL_NEED_WRITE, &j->flags);
+       else
+               ret = __journal_entry_close(j);
+
+       return ret;
+}
+
 static bool journal_entry_close(struct journal *j)
 {
        bool ret;
 
        spin_lock(&j->lock);
-       ret = __journal_entry_close(j);
+       ret = journal_entry_want_write(j);
        spin_unlock(&j->lock);
 
        return ret;
@@ -202,16 +248,19 @@ static bool journal_entry_close(struct journal *j)
  */
 static int journal_entry_open(struct journal *j)
 {
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct journal_buf *buf = journal_cur_buf(j);
        union journal_res_state old, new;
        int u64s;
        u64 v;
 
+       BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
+
        lockdep_assert_held(&j->lock);
        BUG_ON(journal_entry_is_open(j));
 
        if (j->blocked)
-               return -EAGAIN;
+               return cur_entry_blocked;
 
        if (j->cur_entry_error)
                return j->cur_entry_error;
@@ -227,7 +276,7 @@ static int journal_entry_open(struct journal *j)
        u64s  = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);
 
        if (u64s <= le32_to_cpu(buf->data->u64s))
-               return -ENOSPC;
+               return cur_entry_journal_full;
 
        /*
         * Must be set before marking the journal entry as open:
@@ -239,7 +288,7 @@ static int journal_entry_open(struct journal *j)
                old.v = new.v = v;
 
                if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
-                       return -EROFS;
+                       return cur_entry_insufficient_devices;
 
                /* Handle any already added entries */
                new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
@@ -263,8 +312,8 @@ static int journal_entry_open(struct journal *j)
 
 static bool journal_quiesced(struct journal *j)
 {
-       union journal_res_state state = READ_ONCE(j->reservations);
-       bool ret = !state.prev_buf_unwritten && !__journal_entry_is_open(state);
+       union journal_res_state s = READ_ONCE(j->reservations);
+       bool ret = s.idx == s.unwritten_idx && !__journal_entry_is_open(s);
 
        if (!ret)
                journal_entry_close(j);
@@ -291,17 +340,29 @@ static void journal_write_work(struct work_struct *work)
 u64 bch2_inode_journal_seq(struct journal *j, u64 inode)
 {
        size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8));
-       u64 seq = 0;
+       union journal_res_state s;
+       unsigned i;
+       u64 seq;
 
-       if (!test_bit(h, j->buf[0].has_inode) &&
-           !test_bit(h, j->buf[1].has_inode))
-               return 0;
 
        spin_lock(&j->lock);
-       if (test_bit(h, journal_cur_buf(j)->has_inode))
-               seq = journal_cur_seq(j);
-       else if (test_bit(h, journal_prev_buf(j)->has_inode))
-               seq = journal_cur_seq(j) - 1;
+       seq = journal_cur_seq(j);
+       s = READ_ONCE(j->reservations);
+       i = s.idx;
+
+       while (1) {
+               if (test_bit(h, j->buf[i].has_inode))
+                       goto out;
+
+               if (i == s.unwritten_idx)
+                       break;
+
+               i = (i - 1) & JOURNAL_BUF_MASK;
+               seq--;
+       }
+
+       seq = 0;
+out:
        spin_unlock(&j->lock);
 
        return seq;
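
bch2_inode_journal_seq() now walks the buffer ring backwards from the current index, stopping once it has looked at the oldest unwritten buffer, and returns the newest sequence number whose buffer has the inode's hash bit set (or 0 if none do). A toy model of that walk, with the has_inode bitmap reduced to a plain array and a 4-entry ring assumed:

    #include <stdint.h>
    #include <stdio.h>

    #define BUF_MASK 3  /* assumed 4-entry ring */

    /* toy model of the has_inode walk above: newest buffer holding the inode wins */
    static uint64_t newest_seq_with_inode(const int has_inode[4], uint64_t cur_seq,
                                          unsigned idx, unsigned unwritten_idx)
    {
            uint64_t seq = cur_seq;
            unsigned i = idx;

            while (1) {
                    if (has_inode[i])
                            return seq;
                    if (i == unwritten_idx)
                            return 0;
                    i = (i - 1) & BUF_MASK;
                    seq--;
            }
    }

    int main(void)
    {
            int has_inode[4] = { 1, 0, 0, 0 };

            /* current entry seq 10 in slot 2; slot 0 (seq 8) has the inode bit set */
            printf("%llu\n", (unsigned long long)
                   newest_seq_with_inode(has_inode, 10, 2, 0)); /* prints 8 */
            return 0;
    }
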
@@ -352,7 +413,7 @@ retry:
                 * Don't want to close current journal entry, just need to
                 * invoke reclaim:
                 */
-               ret = -ENOSPC;
+               ret = cur_entry_journal_full;
                goto unlock;
        }
 
@@ -375,14 +436,16 @@ retry:
                 * there's still a previous one in flight:
                 */
                trace_journal_entry_full(c);
-               ret = -EAGAIN;
+               ret = cur_entry_blocked;
        } else {
                ret = journal_entry_open(j);
        }
 unlock:
-       if ((ret == -EAGAIN || ret == -ENOSPC) &&
-           !j->res_get_blocked_start)
+       if ((ret && ret != cur_entry_insufficient_devices) &&
+           !j->res_get_blocked_start) {
                j->res_get_blocked_start = local_clock() ?: 1;
+               trace_journal_full(c);
+       }
 
        can_discard = j->can_discard;
        spin_unlock(&j->lock);
@@ -390,32 +453,46 @@ unlock:
        if (!ret)
                goto retry;
 
-       if (ret == -ENOSPC) {
-               WARN_ONCE(!can_discard && (flags & JOURNAL_RES_GET_RESERVED),
-                         "JOURNAL_RES_GET_RESERVED set but journal full");
-
-               /*
-                * Journal is full - can't rely on reclaim from work item due to
-                * freezing:
-                */
-               trace_journal_full(c);
+       if ((ret == cur_entry_journal_full ||
+            ret == cur_entry_journal_pin_full) &&
+           !can_discard &&
+           j->reservations.idx == j->reservations.unwritten_idx &&
+           (flags & JOURNAL_RES_GET_RESERVED)) {
+               char *journal_debug_buf = kmalloc(4096, GFP_ATOMIC);
+
+               bch_err(c, "Journal stuck!");
+               if (journal_debug_buf) {
+                       bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j);
+                       bch_err(c, "%s", journal_debug_buf);
+
+                       bch2_journal_pins_to_text(&_PBUF(journal_debug_buf, 4096), j);
+                       bch_err(c, "Journal pins:\n%s", journal_debug_buf);
+                       kfree(journal_debug_buf);
+               }
 
-               if (!(flags & JOURNAL_RES_GET_NONBLOCK)) {
-                       if (can_discard) {
-                               bch2_journal_do_discards(j);
-                               goto retry;
-                       }
+               bch2_fatal_error(c);
+               dump_stack();
+       }
 
-                       if (mutex_trylock(&j->reclaim_lock)) {
-                               bch2_journal_reclaim(j);
-                               mutex_unlock(&j->reclaim_lock);
-                       }
+       /*
+        * Journal is full - can't rely on reclaim from work item due to
+        * freezing:
+        */
+       if ((ret == cur_entry_journal_full ||
+            ret == cur_entry_journal_pin_full) &&
+           !(flags & JOURNAL_RES_GET_NONBLOCK)) {
+               if (can_discard) {
+                       bch2_journal_do_discards(j);
+                       goto retry;
                }
 
-               ret = -EAGAIN;
+               if (mutex_trylock(&j->reclaim_lock)) {
+                       bch2_journal_reclaim(j);
+                       mutex_unlock(&j->reclaim_lock);
+               }
        }
 
-       return ret;
+       return ret == cur_entry_insufficient_devices ? -EROFS : -EAGAIN;
 }
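
Internally, the open/get paths now report why a journal entry couldn't be opened via cur_entry_* codes (blocked, journal full, pin full, insufficient devices), and only this tail of the slow path folds them back into an errno for callers. A hedged sketch of that boundary; the enum below is an assumption based on the names used in these hunks, not the actual definition (which lives in journal_types.h, outside this diff):

    #include <errno.h>

    /* assumed shape of the internal status codes: */
    enum journal_open_err {
            cur_entry_ok,
            cur_entry_blocked,
            cur_entry_journal_full,
            cur_entry_journal_pin_full,
            cur_entry_insufficient_devices,
    };

    /* mirrors the translation at the end of the slow path above: */
    int journal_err_to_errno(enum journal_open_err err)
    {
            if (err == cur_entry_ok)
                    return 0;
            return err == cur_entry_insufficient_devices ? -EROFS : -EAGAIN;
    }
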
 
 /*
@@ -446,10 +523,12 @@ static bool journal_preres_available(struct journal *j,
                                     unsigned new_u64s,
                                     unsigned flags)
 {
-       bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags);
+       bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags, true);
 
-       if (!ret)
-               bch2_journal_reclaim_work(&j->reclaim_work.work);
+       if (!ret && mutex_trylock(&j->reclaim_lock)) {
+               bch2_journal_reclaim(j);
+               mutex_unlock(&j->reclaim_lock);
+       }
 
        return ret;
 }
@@ -503,168 +582,82 @@ out:
 
 /* journal flushing: */
 
-u64 bch2_journal_last_unwritten_seq(struct journal *j)
-{
-       u64 seq;
-
-       spin_lock(&j->lock);
-       seq = journal_cur_seq(j);
-       if (j->reservations.prev_buf_unwritten)
-               seq--;
-       spin_unlock(&j->lock);
-
-       return seq;
-}
-
 /**
- * bch2_journal_open_seq_async - try to open a new journal entry if @seq isn't
- * open yet, or wait if we cannot
+ * bch2_journal_flush_seq_async - wait for a journal entry to be written
  *
- * used by the btree interior update machinery, when it needs to write a new
- * btree root - every journal entry contains the roots of all the btrees, so it
- * doesn't need to bother with getting a journal reservation
+ * Like the now-removed bch2_journal_wait_on_seq(), except that it triggers a
+ * write immediately if necessary
  */
-int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl)
+int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
+                                struct closure *parent)
 {
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       int ret;
+       struct journal_buf *buf;
+       int ret = 0;
 
-       spin_lock(&j->lock);
+       if (seq <= j->flushed_seq_ondisk)
+               return 1;
 
-       /*
-        * Can't try to open more than one sequence number ahead:
-        */
-       BUG_ON(journal_cur_seq(j) < seq && !journal_entry_is_open(j));
-
-       if (journal_cur_seq(j) > seq ||
-           journal_entry_is_open(j)) {
-               spin_unlock(&j->lock);
-               return 0;
-       }
+       spin_lock(&j->lock);
 
-       if (journal_cur_seq(j) < seq &&
-           !__journal_entry_close(j)) {
-               /* haven't finished writing out the previous one: */
-               trace_journal_entry_full(c);
-               ret = -EAGAIN;
-       } else {
-               BUG_ON(journal_cur_seq(j) != seq);
+       BUG_ON(seq > journal_cur_seq(j));
 
-               ret = journal_entry_open(j);
+       /* Recheck under lock: */
+       if (j->err_seq && seq >= j->err_seq) {
+               ret = -EIO;
+               goto out;
        }
 
-       if ((ret == -EAGAIN || ret == -ENOSPC) &&
-           !j->res_get_blocked_start)
-               j->res_get_blocked_start = local_clock() ?: 1;
-
-       if (ret == -EAGAIN || ret == -ENOSPC)
-               closure_wait(&j->async_wait, cl);
-
-       spin_unlock(&j->lock);
-
-       if (ret == -ENOSPC) {
-               trace_journal_full(c);
-               bch2_journal_reclaim_work(&j->reclaim_work.work);
-               ret = -EAGAIN;
+       if (seq <= j->flushed_seq_ondisk) {
+               ret = 1;
+               goto out;
        }
 
-       return ret;
-}
-
-static int journal_seq_error(struct journal *j, u64 seq)
-{
-       union journal_res_state state = READ_ONCE(j->reservations);
+       /* if seq was written, but not flushed - flush a newer one instead */
+       seq = max(seq, last_unwritten_seq(j));
 
-       if (seq == journal_cur_seq(j))
-               return bch2_journal_error(j);
-
-       if (seq + 1 == journal_cur_seq(j) &&
-           !state.prev_buf_unwritten &&
-           seq > j->seq_ondisk)
-               return -EIO;
-
-       return 0;
-}
-
-static inline struct journal_buf *
-journal_seq_to_buf(struct journal *j, u64 seq)
-{
-       /* seq should be for a journal entry that has been opened: */
-       BUG_ON(seq > journal_cur_seq(j));
-       BUG_ON(seq == journal_cur_seq(j) &&
-              j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL);
+recheck_need_open:
+       if (seq == journal_cur_seq(j) && !journal_entry_is_open(j)) {
+               struct journal_res res = { 0 };
 
-       if (seq == journal_cur_seq(j))
-               return journal_cur_buf(j);
-       if (seq + 1 == journal_cur_seq(j) &&
-           j->reservations.prev_buf_unwritten)
-               return journal_prev_buf(j);
-       return NULL;
-}
+               spin_unlock(&j->lock);
 
-/**
- * bch2_journal_wait_on_seq - wait for a journal entry to be written
- *
- * does _not_ cause @seq to be written immediately - if there is no other
- * activity to cause the relevant journal entry to be filled up or flushed it
- * can wait for an arbitrary amount of time (up to @j->write_delay_ms, which is
- * configurable).
- */
-void bch2_journal_wait_on_seq(struct journal *j, u64 seq,
-                             struct closure *parent)
-{
-       struct journal_buf *buf;
+               ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
+               if (ret)
+                       return ret;
 
-       spin_lock(&j->lock);
+               seq = res.seq;
+               buf = j->buf + (seq & JOURNAL_BUF_MASK);
+               buf->must_flush = true;
+               set_bit(JOURNAL_NEED_WRITE, &j->flags);
 
-       if ((buf = journal_seq_to_buf(j, seq))) {
-               if (!closure_wait(&buf->wait, parent))
+               if (parent && !closure_wait(&buf->wait, parent))
                        BUG();
 
-               if (seq == journal_cur_seq(j)) {
-                       smp_mb();
-                       if (bch2_journal_error(j))
-                               closure_wake_up(&buf->wait);
-               }
-       }
+               bch2_journal_res_put(j, &res);
 
-       spin_unlock(&j->lock);
-}
-
-/**
- * bch2_journal_flush_seq_async - wait for a journal entry to be written
- *
- * like bch2_journal_wait_on_seq, except that it triggers a write immediately if
- * necessary
- */
-void bch2_journal_flush_seq_async(struct journal *j, u64 seq,
-                                 struct closure *parent)
-{
-       struct journal_buf *buf;
-
-       spin_lock(&j->lock);
-
-       if (parent &&
-           (buf = journal_seq_to_buf(j, seq)))
-               if (!closure_wait(&buf->wait, parent))
-                       BUG();
-
-       if (seq == journal_cur_seq(j))
-               __journal_entry_close(j);
-       spin_unlock(&j->lock);
-}
+               spin_lock(&j->lock);
+               goto want_write;
+       }
 
-static int journal_seq_flushed(struct journal *j, u64 seq)
-{
-       int ret;
+       /*
+        * if write was kicked off without a flush, flush the next sequence
+        * number instead
+        */
+       buf = journal_seq_to_buf(j, seq);
+       if (buf->noflush) {
+               seq++;
+               goto recheck_need_open;
+       }
 
-       spin_lock(&j->lock);
-       ret = seq <= j->seq_ondisk ? 1 : journal_seq_error(j, seq);
+       buf->must_flush = true;
 
+       if (parent && !closure_wait(&buf->wait, parent))
+               BUG();
+want_write:
        if (seq == journal_cur_seq(j))
-               __journal_entry_close(j);
+               journal_entry_want_write(j);
+out:
        spin_unlock(&j->lock);
-
        return ret;
 }
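
Note the changed contract: bch2_journal_flush_seq_async() now returns 1 if @seq is already flushed on disk, a negative error if the journal went read-only past err_seq, and 0 after arming a wait on the buffer's closure - which is what lets the synchronous wrapper below drive it from wait_event_interruptible(). A minimal asynchronous-caller sketch, assuming the stack-closure helpers used elsewhere in this tree:

    struct closure cl;
    int ret;

    closure_init_stack(&cl);

    ret = bch2_journal_flush_seq_async(j, seq, &cl);
    if (ret < 0)
            return ret;             /* journal error; nothing to wait for */

    closure_sync(&cl);              /* returns immediately if ret == 1 */
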
 
@@ -673,28 +666,14 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq)
        u64 start_time = local_clock();
        int ret, ret2;
 
-       ret = wait_event_killable(j->wait, (ret2 = journal_seq_flushed(j, seq)));
+       ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)));
 
-       bch2_time_stats_update(j->flush_seq_time, start_time);
+       if (!ret)
+               bch2_time_stats_update(j->flush_seq_time, start_time);
 
        return ret ?: ret2 < 0 ? ret2 : 0;
 }
 
-/**
- * bch2_journal_meta_async - force a journal entry to be written
- */
-void bch2_journal_meta_async(struct journal *j, struct closure *parent)
-{
-       struct journal_res res;
-
-       memset(&res, 0, sizeof(res));
-
-       bch2_journal_res_get(j, &res, jset_u64s(0), 0);
-       bch2_journal_res_put(j, &res);
-
-       bch2_journal_flush_seq_async(j, res.seq, parent);
-}
-
 int bch2_journal_meta(struct journal *j)
 {
        struct journal_res res;
@@ -790,16 +769,19 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
        if (nr <= ja->nr)
                return 0;
 
-       ret = -ENOMEM;
        new_buckets     = kzalloc(nr * sizeof(u64), GFP_KERNEL);
        new_bucket_seq  = kzalloc(nr * sizeof(u64), GFP_KERNEL);
-       if (!new_buckets || !new_bucket_seq)
+       if (!new_buckets || !new_bucket_seq) {
+               ret = -ENOMEM;
                goto err;
+       }
 
        journal_buckets = bch2_sb_resize_journal(&ca->disk_sb,
-                                                nr + sizeof(*journal_buckets) / sizeof(u64));
-       if (!journal_buckets)
+                                       nr + sizeof(*journal_buckets) / sizeof(u64));
+       if (!journal_buckets) {
+               ret = -ENOSPC;
                goto err;
+       }
 
        /*
         * We may be called from the device add path, before the new device has
@@ -828,8 +810,10 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
                                goto err;
                        }
                } else {
-                       ob = bch2_bucket_alloc(c, ca, RESERVE_ALLOC,
+                       rcu_read_lock();
+                       ob = bch2_bucket_alloc(c, ca, RESERVE_NONE,
                                               false, cl);
+                       rcu_read_unlock();
                        if (IS_ERR(ob)) {
                                ret = cl ? -EAGAIN : -ENOSPC;
                                goto err;
@@ -843,6 +827,12 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
                        spin_lock(&c->journal.lock);
                }
 
+               /*
+                * XXX
+                * For resize at runtime, we should be writing the new
+                * superblock before inserting into the journal array
+                */
+
                pos = ja->nr ? (ja->cur_idx + 1) % ja->nr : 0;
                __array_insert_item(ja->buckets,                ja->nr, pos);
                __array_insert_item(ja->bucket_seq,             ja->nr, pos);
@@ -862,22 +852,32 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
                if (pos <= ja->cur_idx)
                        ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
 
-               bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_journal,
-                                         ca->mi.bucket_size,
-                                         gc_phase(GC_PHASE_SB),
-                                         0);
+               if (!c || new_fs)
+                       bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_journal,
+                                                 ca->mi.bucket_size,
+                                                 gc_phase(GC_PHASE_SB),
+                                                 0);
 
                if (c) {
                        spin_unlock(&c->journal.lock);
                        percpu_up_read(&c->mark_lock);
                }
 
+               if (c && !new_fs)
+                       ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
+                               bch2_trans_mark_metadata_bucket(&trans, NULL, ca,
+                                               bucket, BCH_DATA_journal,
+                                               ca->mi.bucket_size));
+
                if (!new_fs)
                        bch2_open_bucket_put(c, ob);
-       }
 
-       ret = 0;
+               if (ret)
+                       goto err;
+       }
 err:
+       bch2_sb_resize_journal(&ca->disk_sb,
+               ja->nr + sizeof(*journal_buckets) / sizeof(u64));
        kfree(new_bucket_seq);
        kfree(new_buckets);
 
@@ -938,14 +938,17 @@ int bch2_dev_journal_alloc(struct bch_dev *ca)
        if (dynamic_fault("bcachefs:add:journal_alloc"))
                return -ENOMEM;
 
+       /* 1/128th of the device by default: */
+       nr = ca->mi.nbuckets >> 7;
+
        /*
-        * clamp journal size to 1024 buckets or 512MB (in sectors), whichever
+        * clamp journal size to 8192 buckets or 8GB (in sectors), whichever
         * is smaller:
         */
-       nr = clamp_t(unsigned, ca->mi.nbuckets >> 8,
+       nr = clamp_t(unsigned, nr,
                     BCH_JOURNAL_BUCKETS_MIN,
-                    min(1 << 10,
-                        (1 << 20) / ca->mi.bucket_size));
+                    min(1 << 13,
+                        (1 << 24) / ca->mi.bucket_size));
 
        return __bch2_set_nr_journal_buckets(ca, nr, true, NULL);
 }
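
Journal sizing now defaults to 1/128th of the device, and the clamp ceiling grows from 1024 buckets / 512 MB to 8192 buckets / 8 GB (1 << 24 sectors). A standalone sanity check of the arithmetic for a hypothetical 1 TiB device with 512 KiB buckets; BCH_JOURNAL_BUCKETS_MIN is assumed to be 8 here, the real constant lives outside this diff:

    #include <stdio.h>

    #define BCH_JOURNAL_BUCKETS_MIN 8   /* assumed floor; real constant is elsewhere */

    static unsigned clamp_u(unsigned v, unsigned lo, unsigned hi)
    {
            return v < lo ? lo : v > hi ? hi : v;
    }

    int main(void)
    {
            unsigned long long dev_bytes   = 1ULL << 40;    /* hypothetical 1 TiB device */
            unsigned           bucket_size = 1024;          /* 512 KiB, in 512-byte sectors */
            unsigned long long nbuckets    = dev_bytes / 512 / bucket_size;
            unsigned           nr          = nbuckets >> 7; /* 1/128th of the device */
            unsigned           hi          = (1 << 13) < (1 << 24) / bucket_size
                                           ? (1 << 13) : (1 << 24) / bucket_size;

            nr = clamp_u(nr, BCH_JOURNAL_BUCKETS_MIN, hi);
            printf("journal buckets: %u (%llu MiB)\n", nr,
                   (unsigned long long) nr * bucket_size * 512 >> 20);
            /* prints: journal buckets: 8192 (4096 MiB) with these assumed numbers */
            return 0;
    }
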
@@ -955,15 +958,18 @@ int bch2_dev_journal_alloc(struct bch_dev *ca)
 static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
 {
        union journal_res_state state;
-       struct journal_buf *w;
-       bool ret;
+       bool ret = false;
+       unsigned i;
 
        spin_lock(&j->lock);
        state = READ_ONCE(j->reservations);
-       w = j->buf + !state.idx;
+       i = state.idx;
 
-       ret = state.prev_buf_unwritten &&
-               bch2_bkey_has_device(bkey_i_to_s_c(&w->key), dev_idx);
+       while (i != state.unwritten_idx) {
+               i = (i - 1) & JOURNAL_BUF_MASK;
+               if (bch2_bkey_has_device(bkey_i_to_s_c(&j->buf[i].key), dev_idx))
+                       ret = true;
+       }
        spin_unlock(&j->lock);
 
        return ret;
@@ -980,17 +986,21 @@ void bch2_fs_journal_stop(struct journal *j)
 
        wait_event(j->wait, journal_entry_close(j));
 
-       /* do we need to write another journal entry? */
-       if (test_bit(JOURNAL_NOT_EMPTY, &j->flags))
-               bch2_journal_meta(j);
+       /*
+        * Always write a new journal entry, to make sure the clock hands are up
+        * to date (and match the superblock)
+        */
+       bch2_journal_meta(j);
 
        journal_quiesce(j);
 
        BUG_ON(!bch2_journal_error(j) &&
-              test_bit(JOURNAL_NOT_EMPTY, &j->flags));
+              test_bit(JOURNAL_REPLAY_DONE, &j->flags) &&
+              (journal_entry_is_open(j) ||
+               j->last_empty_seq + 1 != journal_cur_seq(j)));
 
        cancel_delayed_work_sync(&j->write_work);
-       cancel_delayed_work_sync(&j->reclaim_work);
+       bch2_journal_reclaim_stop(j);
 }
 
 int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
@@ -1023,28 +1033,34 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
        j->pin.back             = cur_seq;
        atomic64_set(&j->seq, cur_seq - 1);
 
-       fifo_for_each_entry_ptr(p, &j->pin, seq) {
-               INIT_LIST_HEAD(&p->list);
-               INIT_LIST_HEAD(&p->flushed);
-               atomic_set(&p->count, 1);
-               p->devs.nr = 0;
-       }
+       fifo_for_each_entry_ptr(p, &j->pin, seq)
+               journal_pin_list_init(p, 1);
 
        list_for_each_entry(i, journal_entries, list) {
+               unsigned ptr;
+
                seq = le64_to_cpu(i->j.seq);
                BUG_ON(seq >= cur_seq);
 
                if (seq < last_seq)
                        continue;
 
-               journal_seq_pin(j, seq)->devs = i->devs;
+               p = journal_seq_pin(j, seq);
+
+               p->devs.nr = 0;
+               for (ptr = 0; ptr < i->nr_ptrs; ptr++)
+                       bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev);
        }
 
        spin_lock(&j->lock);
 
        set_bit(JOURNAL_STARTED, &j->flags);
+       j->last_flush_write = jiffies;
+
+       journal_pin_new_entry(j);
+
+       j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j);
 
-       journal_pin_new_entry(j, 1);
        bch2_journal_buf_init(j);
 
        c->last_bucket_seq_cleanup = journal_cur_seq(j);
@@ -1098,8 +1114,10 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
 
 void bch2_fs_journal_exit(struct journal *j)
 {
-       kvpfree(j->buf[1].data, j->buf[1].buf_size);
-       kvpfree(j->buf[0].data, j->buf[0].buf_size);
+       unsigned i;
+
+       for (i = 0; i < ARRAY_SIZE(j->buf); i++)
+               kvpfree(j->buf[i].data, j->buf[i].buf_size);
        free_fifo(&j->pin);
 }
 
@@ -1107,6 +1125,7 @@ int bch2_fs_journal_init(struct journal *j)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        static struct lock_class_key res_key;
+       unsigned i;
        int ret = 0;
 
        pr_verbose_init(c->opts, "");
@@ -1115,33 +1134,34 @@ int bch2_fs_journal_init(struct journal *j)
        spin_lock_init(&j->err_lock);
        init_waitqueue_head(&j->wait);
        INIT_DELAYED_WORK(&j->write_work, journal_write_work);
-       INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work);
+       init_waitqueue_head(&j->reclaim_wait);
        init_waitqueue_head(&j->pin_flush_wait);
        mutex_init(&j->reclaim_lock);
        mutex_init(&j->discard_lock);
 
        lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
 
-       j->buf[0].buf_size      = JOURNAL_ENTRY_SIZE_MIN;
-       j->buf[1].buf_size      = JOURNAL_ENTRY_SIZE_MIN;
        j->write_delay_ms       = 1000;
        j->reclaim_delay_ms     = 100;
 
-       /* Btree roots: */
-       j->entry_u64s_reserved +=
-               BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX);
-
        atomic64_set(&j->reservations.counter,
                ((union journal_res_state)
                 { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
 
-       if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
-           !(j->buf[0].data = kvpmalloc(j->buf[0].buf_size, GFP_KERNEL)) ||
-           !(j->buf[1].data = kvpmalloc(j->buf[1].buf_size, GFP_KERNEL))) {
+       if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) {
                ret = -ENOMEM;
                goto out;
        }
 
+       for (i = 0; i < ARRAY_SIZE(j->buf); i++) {
+               j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN;
+               j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL);
+               if (!j->buf[i].data) {
+                       ret = -ENOMEM;
+                       goto out;
+               }
+       }
+
        j->pin.front = j->pin.back = 1;
 out:
        pr_verbose_init(c->opts, "ret %i", ret);
@@ -1150,15 +1170,14 @@ out:
 
 /* debug: */
 
-void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
+void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        union journal_res_state s;
        struct bch_dev *ca;
-       unsigned iter;
+       unsigned i;
 
        rcu_read_lock();
-       spin_lock(&j->lock);
        s = READ_ONCE(j->reservations);
 
        pr_buf(out,
@@ -1166,16 +1185,30 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
               "seq:\t\t\t%llu\n"
               "last_seq:\t\t%llu\n"
               "last_seq_ondisk:\t%llu\n"
+              "flushed_seq_ondisk:\t%llu\n"
               "prereserved:\t\t%u/%u\n"
+              "each entry reserved:\t%u\n"
+              "nr flush writes:\t%llu\n"
+              "nr noflush writes:\t%llu\n"
+              "nr direct reclaim:\t%llu\n"
+              "nr background reclaim:\t%llu\n"
               "current entry sectors:\t%u\n"
+              "current entry error:\t%u\n"
               "current entry:\t\t",
               fifo_used(&j->pin),
               journal_cur_seq(j),
               journal_last_seq(j),
               j->last_seq_ondisk,
+              j->flushed_seq_ondisk,
               j->prereserved.reserved,
               j->prereserved.remaining,
-              j->cur_entry_sectors);
+              j->entry_u64s_reserved,
+              j->nr_flush_writes,
+              j->nr_noflush_writes,
+              j->nr_direct_reclaim,
+              j->nr_background_reclaim,
+              j->cur_entry_sectors,
+              j->cur_entry_error);
 
        switch (s.cur_entry_offset) {
        case JOURNAL_ENTRY_ERROR_VAL:
@@ -1192,16 +1225,16 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
        }
 
        pr_buf(out,
-              "current entry refs:\t%u\n"
-              "prev entry unwritten:\t",
-              journal_state_count(s, s.idx));
-
-       if (s.prev_buf_unwritten)
-               pr_buf(out, "yes, ref %u sectors %u\n",
-                      journal_state_count(s, !s.idx),
-                      journal_prev_buf(j)->sectors);
-       else
-               pr_buf(out, "no\n");
+              "current entry:\t\tidx %u refcount %u\n",
+              s.idx, journal_state_count(s, s.idx));
+
+       i = s.idx;
+       while (i != s.unwritten_idx) {
+               i = (i - 1) & JOURNAL_BUF_MASK;
+
+               pr_buf(out, "unwritten entry:\tidx %u refcount %u sectors %u\n",
+                      i, journal_state_count(s, i), j->buf[i].sectors);
+       }
 
        pr_buf(out,
               "need write:\t\t%i\n"
@@ -1209,22 +1242,40 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
               test_bit(JOURNAL_NEED_WRITE,     &j->flags),
               test_bit(JOURNAL_REPLAY_DONE,    &j->flags));
 
-       for_each_member_device_rcu(ca, c, iter,
+       pr_buf(out, "space:\n");
+       pr_buf(out, "\tdiscarded\t%u:%u\n",
+              j->space[journal_space_discarded].next_entry,
+              j->space[journal_space_discarded].total);
+       pr_buf(out, "\tclean ondisk\t%u:%u\n",
+              j->space[journal_space_clean_ondisk].next_entry,
+              j->space[journal_space_clean_ondisk].total);
+       pr_buf(out, "\tclean\t\t%u:%u\n",
+              j->space[journal_space_clean].next_entry,
+              j->space[journal_space_clean].total);
+       pr_buf(out, "\ttotal\t\t%u:%u\n",
+              j->space[journal_space_total].next_entry,
+              j->space[journal_space_total].total);
+
+       for_each_member_device_rcu(ca, c, i,
                                   &c->rw_devs[BCH_DATA_journal]) {
                struct journal_device *ja = &ca->journal;
 
+               if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d))
+                       continue;
+
                if (!ja->nr)
                        continue;
 
                pr_buf(out,
                       "dev %u:\n"
                       "\tnr\t\t%u\n"
+                      "\tbucket size\t%u\n"
                       "\tavailable\t%u:%u\n"
-                      "\tdiscard_idx\t\t%u\n"
-                      "\tdirty_idx_ondisk\t%u (seq %llu)\n"
-                      "\tdirty_idx\t\t%u (seq %llu)\n"
+                      "\tdiscard_idx\t%u\n"
+                      "\tdirty_ondisk\t%u (seq %llu)\n"
+                      "\tdirty_idx\t%u (seq %llu)\n"
                       "\tcur_idx\t\t%u (seq %llu)\n",
-                      iter, ja->nr,
+                      i, ja->nr, ca->mi.bucket_size,
                       bch2_journal_dev_buckets_available(j, ja, journal_space_discarded),
                       ja->sectors_free,
                       ja->discard_idx,
@@ -1233,10 +1284,16 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
                       ja->cur_idx,             ja->bucket_seq[ja->cur_idx]);
        }
 
-       spin_unlock(&j->lock);
        rcu_read_unlock();
 }
 
+void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
+{
+       spin_lock(&j->lock);
+       __bch2_journal_debug_to_text(out, j);
+       spin_unlock(&j->lock);
+}
+
 void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
 {
        struct journal_entry_pin_list *pin_list;
index f60bc964ee1f4cb99527b0c7eff6086fc63c70ee..cc497125889f2cbea8f3b754d3c72fd4c1c33aeb 100644 (file)
@@ -127,11 +127,6 @@ static inline struct journal_buf *journal_cur_buf(struct journal *j)
        return j->buf + j->reservations.idx;
 }
 
-static inline struct journal_buf *journal_prev_buf(struct journal *j)
-{
-       return j->buf + !j->reservations.idx;
-}
-
 /* Sequence number of oldest dirty journal entry */
 
 static inline u64 journal_last_seq(struct journal *j)
@@ -141,7 +136,7 @@ static inline u64 journal_last_seq(struct journal *j)
 
 static inline u64 journal_cur_seq(struct journal *j)
 {
-       BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
+       EBUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
 
        return j->pin.back - 1;
 }
@@ -151,13 +146,21 @@ void bch2_journal_set_has_inum(struct journal *, u64, u64);
 
 static inline int journal_state_count(union journal_res_state s, int idx)
 {
-       return idx == 0 ? s.buf0_count : s.buf1_count;
+       switch (idx) {
+       case 0: return s.buf0_count;
+       case 1: return s.buf1_count;
+       case 2: return s.buf2_count;
+       case 3: return s.buf3_count;
+       }
+       BUG();
 }
 
 static inline void journal_state_inc(union journal_res_state *s)
 {
        s->buf0_count += s->idx == 0;
        s->buf1_count += s->idx == 1;
+       s->buf2_count += s->idx == 2;
+       s->buf3_count += s->idx == 3;
 }
 
 static inline void bch2_journal_set_has_inode(struct journal *j,
@@ -210,11 +213,13 @@ static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type
                                          enum btree_id id, unsigned level,
                                          const void *data, unsigned u64s)
 {
-       memset(entry, 0, sizeof(*entry));
        entry->u64s     = cpu_to_le16(u64s);
-       entry->type     = type;
        entry->btree_id = id;
        entry->level    = level;
+       entry->type     = type;
+       entry->pad[0]   = 0;
+       entry->pad[1]   = 0;
+       entry->pad[2]   = 0;
        memcpy_u64s_small(entry->_data, data, u64s);
 
        return jset_u64s(u64s);
@@ -255,21 +260,24 @@ static inline bool journal_entry_empty(struct jset *j)
        return true;
 }
 
-void __bch2_journal_buf_put(struct journal *, bool);
+void __bch2_journal_buf_put(struct journal *);
 
-static inline void bch2_journal_buf_put(struct journal *j, unsigned idx,
-                                      bool need_write_just_set)
+static inline void bch2_journal_buf_put(struct journal *j, unsigned idx)
 {
        union journal_res_state s;
 
        s.v = atomic64_sub_return(((union journal_res_state) {
                                    .buf0_count = idx == 0,
                                    .buf1_count = idx == 1,
+                                   .buf2_count = idx == 2,
+                                   .buf3_count = idx == 3,
                                    }).v, &j->reservations.counter);
-       if (!journal_state_count(s, idx)) {
-               EBUG_ON(s.idx == idx || !s.prev_buf_unwritten);
-               __bch2_journal_buf_put(j, need_write_just_set);
-       }
+
+       EBUG_ON(((s.idx - idx) & 3) >
+               ((s.idx - s.unwritten_idx) & 3));
+
+       if (!journal_state_count(s, idx) && idx == s.unwritten_idx)
+               __bch2_journal_buf_put(j);
 }
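
With up to four buffers in flight, dropping a reference no longer unconditionally kicks off the write: it is started only when the buffer whose last reference was dropped is also the oldest unwritten one, so journal writes still complete strictly in order. A toy model of that decision and of the EBUG_ON window check above, with a 4-entry ring assumed:

    #include <assert.h>

    #define MASK 3  /* assumed 4-entry ring */

    /*
     * Mirrors bch2_journal_buf_put(): the buffer being released must lie within
     * the current unwritten window, and the write is only kicked for the oldest
     * unwritten buffer once its last reference is gone.
     */
    static int put_kicks_write(unsigned refcount_after, unsigned idx,
                               unsigned cur_idx, unsigned unwritten_idx)
    {
            assert(((cur_idx - idx) & MASK) <= ((cur_idx - unwritten_idx) & MASK));
            return refcount_after == 0 && idx == unwritten_idx;
    }

    int main(void)
    {
            /* current entry in slot 2, slots 0 and 1 unwritten */
            assert(!put_kicks_write(0, 1, 2, 0)); /* slot 1 done, but 0 still pending */
            assert( put_kicks_write(0, 0, 2, 0)); /* oldest unwritten -> start write  */
            return 0;
    }
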
 
 /*
@@ -289,7 +297,7 @@ static inline void bch2_journal_res_put(struct journal *j,
                                       BCH_JSET_ENTRY_btree_keys,
                                       0, 0, NULL, 0);
 
-       bch2_journal_buf_put(j, res->idx, false);
+       bch2_journal_buf_put(j, res->idx);
 
        res->ref = 0;
 }
@@ -300,7 +308,6 @@ int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
 #define JOURNAL_RES_GET_NONBLOCK       (1 << 0)
 #define JOURNAL_RES_GET_CHECK          (1 << 1)
 #define JOURNAL_RES_GET_RESERVED       (1 << 2)
-#define JOURNAL_RES_GET_RECLAIM                (1 << 3)
 
 static inline int journal_res_get_fast(struct journal *j,
                                       struct journal_res *res,
@@ -325,11 +332,18 @@ static inline int journal_res_get_fast(struct journal *j,
                    !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags))
                        return 0;
 
-               if (flags & JOURNAL_RES_GET_CHECK)
-                       return 1;
-
                new.cur_entry_offset += res->u64s;
                journal_state_inc(&new);
+
+               /*
+                * If the refcount would overflow, we have to wait:
+                * XXX - tracepoint this:
+                */
+               if (!journal_state_count(new, new.idx))
+                       return 0;
+
+               if (flags & JOURNAL_RES_GET_CHECK)
+                       return 1;
        } while ((v = atomic64_cmpxchg(&j->reservations.counter,
                                       old.v, new.v)) != old.v);
 
@@ -371,7 +385,7 @@ out:
 static inline bool journal_check_may_get_unreserved(struct journal *j)
 {
        union journal_preres_state s = READ_ONCE(j->prereserved);
-       bool ret = s.reserved <= s.remaining &&
+       bool ret = s.reserved < s.remaining &&
                fifo_free(&j->pin) > 8;
 
        lockdep_assert_held(&j->lock);
@@ -397,7 +411,12 @@ static inline void bch2_journal_preres_put(struct journal *j,
 
        s.v = atomic64_sub_return(s.v, &j->prereserved.counter);
        res->u64s = 0;
-       closure_wake_up(&j->preres_wait);
+
+       if (unlikely(s.waiting)) {
+               clear_bit(ilog2((((union journal_preres_state) { .waiting = 1 }).v)),
+                         (unsigned long *) &j->prereserved.v);
+               closure_wake_up(&j->preres_wait);
+       }
 
        if (s.reserved <= s.remaining &&
            !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) {
@@ -413,32 +432,32 @@ int __bch2_journal_preres_get(struct journal *,
 static inline int bch2_journal_preres_get_fast(struct journal *j,
                                               struct journal_preres *res,
                                               unsigned new_u64s,
-                                              unsigned flags)
+                                              unsigned flags,
+                                              bool set_waiting)
 {
        int d = new_u64s - res->u64s;
        union journal_preres_state old, new;
        u64 v = atomic64_read(&j->prereserved.counter);
+       int ret;
 
        do {
                old.v = new.v = v;
-
-               new.reserved += d;
-
-               /*
-                * If we're being called from the journal reclaim path, we have
-                * to unconditionally give out the pre-reservation, there's
-                * nothing else sensible we can do - otherwise we'd recurse back
-                * into the reclaim path and deadlock:
-                */
-
-               if (!(flags & JOURNAL_RES_GET_RECLAIM) &&
-                   new.reserved > new.remaining)
+               ret = 0;
+
+               if ((flags & JOURNAL_RES_GET_RESERVED) ||
+                   new.reserved + d < new.remaining) {
+                       new.reserved += d;
+                       ret = 1;
+               } else if (set_waiting && !new.waiting)
+                       new.waiting = true;
+               else
                        return 0;
        } while ((v = atomic64_cmpxchg(&j->prereserved.counter,
                                       old.v, new.v)) != old.v);
 
-       res->u64s += d;
-       return 1;
+       if (ret)
+               res->u64s += d;
+       return ret;
 }
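
The pre-reservation fast path now either grants the request outright (when JOURNAL_RES_GET_RESERVED is set, or when reserved + d stays below remaining) or, if the caller allows it, sets a single "waiting" flag in the packed state word; bch2_journal_preres_put() clears exactly that bit and wakes waiters. A toy model of the set/clear trick, with the bitfield layout assumed (the real union journal_preres_state is defined elsewhere):

    #include <assert.h>
    #include <stdint.h>

    /* assumed layout, for illustration only */
    union toy_preres_state {
            uint64_t v;
            struct { uint64_t waiting:1, reserved:31, remaining:32; };
    };

    int main(void)
    {
            union toy_preres_state s = { .v = 0 };
            unsigned bit;

            s.waiting = 1;  /* fast path: couldn't grant, flag a waiter */

            /*
             * Put path: derive the bit index from a union literal and clear it
             * (__builtin_ctzll stands in for ilog2() here):
             */
            bit = __builtin_ctzll(((union toy_preres_state) { .waiting = 1 }).v);
            s.v &= ~(1ULL << bit);

            assert(!s.waiting);
            return 0;
    }
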
 
 static inline int bch2_journal_preres_get(struct journal *j,
@@ -449,7 +468,7 @@ static inline int bch2_journal_preres_get(struct journal *j,
        if (new_u64s <= res->u64s)
                return 0;
 
-       if (bch2_journal_preres_get_fast(j, res, new_u64s, flags))
+       if (bch2_journal_preres_get_fast(j, res, new_u64s, flags, false))
                return 0;
 
        if (flags & JOURNAL_RES_GET_NONBLOCK)
@@ -464,13 +483,8 @@ void bch2_journal_entry_res_resize(struct journal *,
                                   struct journal_entry_res *,
                                   unsigned);
 
-u64 bch2_journal_last_unwritten_seq(struct journal *);
-int bch2_journal_open_seq_async(struct journal *, u64, struct closure *);
-
-void bch2_journal_wait_on_seq(struct journal *, u64, struct closure *);
-void bch2_journal_flush_seq_async(struct journal *, u64, struct closure *);
+int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *);
 void bch2_journal_flush_async(struct journal *, struct closure *);
-void bch2_journal_meta_async(struct journal *, struct closure *);
 
 int bch2_journal_flush_seq(struct journal *, u64);
 int bch2_journal_flush(struct journal *);
@@ -486,11 +500,6 @@ static inline int bch2_journal_error(struct journal *j)
 
 struct bch_dev;
 
-static inline bool journal_flushes_device(struct bch_dev *ca)
-{
-       return true;
-}
-
 static inline void bch2_journal_set_replay_done(struct journal *j)
 {
        BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
@@ -500,6 +509,7 @@ static inline void bch2_journal_set_replay_done(struct journal *j)
 void bch2_journal_unblock(struct journal *);
 void bch2_journal_block(struct journal *);
 
+void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
 void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
 void bch2_journal_pins_to_text(struct printbuf *, struct journal *);
 
index bd0e6b371701b93dc56c43a2ad9835367b87b2bd..c7fa03cfbde6c316095c66a0d896afdd8319b6c2 100644 (file)
@@ -5,15 +5,33 @@
 #include "btree_update_interior.h"
 #include "buckets.h"
 #include "checksum.h"
+#include "disk_groups.h"
 #include "error.h"
 #include "io.h"
 #include "journal.h"
 #include "journal_io.h"
 #include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
 #include "replicas.h"
 
 #include <trace/events/bcachefs.h>
 
+static void __journal_replay_free(struct journal_replay *i)
+{
+       list_del(&i->list);
+       kvpfree(i, offsetof(struct journal_replay, j) +
+               vstruct_bytes(&i->j));
+
+}
+
+static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
+{
+       i->ignore = true;
+
+       if (!c->opts.read_entire_journal)
+               __journal_replay_free(i);
+}
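
Because journal entries can now be written without a flush, older entries can't always be discarded while reading the journal: when read_entire_journal is set they are kept but flagged so replay skips them, and each entry now remembers every device pointer it was found at rather than a simple device list. The fields these hunks rely on are assumed to look roughly like this (the real struct journal_replay lives in journal_io.h, outside this diff):

    struct journal_replay {
            struct list_head        list;
            struct bch_extent_ptr   ptrs[BCH_REPLICAS_MAX]; /* where this entry was found */
            unsigned                nr_ptrs;

            bool                    bad;    /* had a checksum error */
            bool                    ignore; /* dropped from replay, kept in the list */
            /* must be last: variable size */
            struct jset             j;
    };
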
+
 struct journal_list {
        struct closure          cl;
        struct mutex            lock;
@@ -29,35 +47,37 @@ struct journal_list {
  * be replayed:
  */
 static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
+                            struct bch_extent_ptr entry_ptr,
                             struct journal_list *jlist, struct jset *j,
                             bool bad)
 {
-       struct journal_replay *i, *pos;
-       struct bch_devs_list devs = { .nr = 0 };
+       struct journal_replay *i, *pos, *dup = NULL;
+       struct bch_extent_ptr *ptr;
        struct list_head *where;
        size_t bytes = vstruct_bytes(j);
-       __le64 last_seq;
-       int ret;
-
-       last_seq = !list_empty(jlist->head)
-               ? list_last_entry(jlist->head, struct journal_replay,
-                                 list)->j.last_seq
-               : 0;
+       u64 last_seq = 0;
+       int ret = JOURNAL_ENTRY_ADD_OK;
 
-       if (!c->opts.read_entire_journal) {
-               /* Is this entry older than the range we need? */
-               if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) {
-                       ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
-                       goto out;
+       list_for_each_entry_reverse(i, jlist->head, list) {
+               if (!JSET_NO_FLUSH(&i->j)) {
+                       last_seq = le64_to_cpu(i->j.last_seq);
+                       break;
                }
+       }
 
-               /* Drop entries we don't need anymore */
+       /* Is this entry older than the range we need? */
+       if (!c->opts.read_entire_journal &&
+           le64_to_cpu(j->seq) < last_seq) {
+               ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
+               goto out;
+       }
+
+       /* Drop entries we don't need anymore */
+       if (!JSET_NO_FLUSH(j)) {
                list_for_each_entry_safe(i, pos, jlist->head, list) {
                        if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
                                break;
-                       list_del(&i->list);
-                       kvpfree(i, offsetof(struct journal_replay, j) +
-                               vstruct_bytes(&i->j));
+                       journal_replay_free(c, i);
                }
        }
 
@@ -70,30 +90,31 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
 
        where = jlist->head;
 add:
-       i = where->next != jlist->head
+       dup = where->next != jlist->head
                ? container_of(where->next, struct journal_replay, list)
                : NULL;
 
+       if (dup && le64_to_cpu(j->seq) != le64_to_cpu(dup->j.seq))
+               dup = NULL;
+
        /*
         * Duplicate journal entries? If so we want the one that didn't have a
         * checksum error:
         */
-       if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
-               if (i->bad) {
-                       devs = i->devs;
-                       list_del(&i->list);
-                       kvpfree(i, offsetof(struct journal_replay, j) +
-                               vstruct_bytes(&i->j));
+       if (dup) {
+               if (dup->bad) {
+                       /* we'll replace @dup: */
                } else if (bad) {
+                       i = dup;
                        goto found;
                } else {
-                       fsck_err_on(bytes != vstruct_bytes(&i->j) ||
-                                   memcmp(j, &i->j, bytes), c,
+                       fsck_err_on(bytes != vstruct_bytes(&dup->j) ||
+                                   memcmp(j, &dup->j, bytes), c,
                                    "found duplicate but non identical journal entries (seq %llu)",
                                    le64_to_cpu(j->seq));
+                       i = dup;
                        goto found;
                }
-
        }
 
        i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
@@ -102,16 +123,34 @@ add:
                goto out;
        }
 
-       list_add(&i->list, where);
-       i->devs = devs;
-       i->bad  = bad;
+       i->nr_ptrs       = 0;
+       i->bad          = bad;
+       i->ignore       = false;
        memcpy(&i->j, j, bytes);
+
+       if (dup) {
+               i->nr_ptrs = dup->nr_ptrs;
+               memcpy(i->ptrs, dup->ptrs, sizeof(dup->ptrs));
+               __journal_replay_free(dup);
+       }
+
+       list_add(&i->list, where);
 found:
-       if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx))
-               bch2_dev_list_add_dev(&i->devs, ca->dev_idx);
-       else
-               fsck_err_on(1, c, "duplicate journal entries on same device");
-       ret = JOURNAL_ENTRY_ADD_OK;
+       for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) {
+               if (ptr->dev == ca->dev_idx) {
+                       bch_err(c, "duplicate journal entry %llu on same device",
+                               le64_to_cpu(i->j.seq));
+                       goto out;
+               }
+       }
+
+       if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) {
+               bch_err(c, "found too many copies of journal entry %llu",
+                       le64_to_cpu(i->j.seq));
+               goto out;
+       }
+
+       i->ptrs[i->nr_ptrs++] = entry_ptr;
 out:
 fsck_err:
        return ret;
@@ -161,46 +200,54 @@ static void journal_entry_null_range(void *start, void *end)
 #define journal_entry_err_on(cond, c, msg, ...)                                \
        ((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false)
 
-static int journal_validate_key(struct bch_fs *c, struct jset *jset,
+#define FSCK_DELETED_KEY       5
+
+static int journal_validate_key(struct bch_fs *c, const char *where,
                                struct jset_entry *entry,
                                unsigned level, enum btree_id btree_id,
-                               struct bkey_i *k,
-                               const char *type, int write)
+                               struct bkey_i *k, const char *type,
+                               unsigned version, int big_endian, int write)
 {
        void *next = vstruct_next(entry);
        const char *invalid;
-       unsigned version = le32_to_cpu(jset->version);
        int ret = 0;
 
        if (journal_entry_err_on(!k->k.u64s, c,
-                       "invalid %s in journal: k->u64s 0", type)) {
+                       "invalid %s in %s entry offset %zi/%u: k->u64s 0",
+                       type, where,
+                       (u64 *) k - entry->_data,
+                       le16_to_cpu(entry->u64s))) {
                entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
                journal_entry_null_range(vstruct_next(entry), next);
-               return 0;
+               return FSCK_DELETED_KEY;
        }
 
        if (journal_entry_err_on((void *) bkey_next(k) >
                                (void *) vstruct_next(entry), c,
-                       "invalid %s in journal: extends past end of journal entry",
-                       type)) {
+                       "invalid %s in %s entry offset %zi/%u: extends past end of journal entry",
+                       type, where,
+                       (u64 *) k - entry->_data,
+                       le16_to_cpu(entry->u64s))) {
                entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
                journal_entry_null_range(vstruct_next(entry), next);
-               return 0;
+               return FSCK_DELETED_KEY;
        }
 
        if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c,
-                       "invalid %s in journal: bad format %u",
-                       type, k->k.format)) {
-               le16_add_cpu(&entry->u64s, -k->k.u64s);
+                       "invalid %s in %s entry offset %zi/%u: bad format %u",
+                       type, where,
+                       (u64 *) k - entry->_data,
+                       le16_to_cpu(entry->u64s),
+                       k->k.format)) {
+               le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
                memmove(k, bkey_next(k), next - (void *) bkey_next(k));
                journal_entry_null_range(vstruct_next(entry), next);
-               return 0;
+               return FSCK_DELETED_KEY;
        }
 
        if (!write)
-               bch2_bkey_compat(level, btree_id, version,
-                           JSET_BIG_ENDIAN(jset), write,
-                           NULL, bkey_to_packed(k));
+               bch2_bkey_compat(level, btree_id, version, big_endian,
+                                write, NULL, bkey_to_packed(k));
 
        invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k),
                                    __btree_node_type(level, btree_id));
@@ -208,46 +255,50 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset,
                char buf[160];
 
                bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k));
-               mustfix_fsck_err(c, "invalid %s in journal: %s\n%s",
-                                type, invalid, buf);
+               mustfix_fsck_err(c, "invalid %s in %s entry offset %zi/%u: %s\n%s",
+                                type, where,
+                                (u64 *) k - entry->_data,
+                                le16_to_cpu(entry->u64s),
+                                invalid, buf);
 
-               le16_add_cpu(&entry->u64s, -k->k.u64s);
+               le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
                memmove(k, bkey_next(k), next - (void *) bkey_next(k));
                journal_entry_null_range(vstruct_next(entry), next);
-               return 0;
+               return FSCK_DELETED_KEY;
        }
 
        if (write)
-               bch2_bkey_compat(level, btree_id, version,
-                           JSET_BIG_ENDIAN(jset), write,
-                           NULL, bkey_to_packed(k));
+               bch2_bkey_compat(level, btree_id, version, big_endian,
+                                write, NULL, bkey_to_packed(k));
 fsck_err:
        return ret;
 }
 
 static int journal_entry_validate_btree_keys(struct bch_fs *c,
-                                            struct jset *jset,
+                                            const char *where,
                                             struct jset_entry *entry,
-                                            int write)
+                                            unsigned version, int big_endian, int write)
 {
-       struct bkey_i *k;
+       struct bkey_i *k = entry->start;
 
-       vstruct_for_each(entry, k) {
-               int ret = journal_validate_key(c, jset, entry,
+       while (k != vstruct_last(entry)) {
+               int ret = journal_validate_key(c, where, entry,
                                               entry->level,
                                               entry->btree_id,
-                                              k, "key", write);
-               if (ret)
-                       return ret;
+                                              k, "key", version, big_endian, write);
+               if (ret == FSCK_DELETED_KEY)
+                       continue;
+
+               k = bkey_next(k);
        }
 
        return 0;
 }
 
 static int journal_entry_validate_btree_root(struct bch_fs *c,
-                                            struct jset *jset,
+                                            const char *where,
                                             struct jset_entry *entry,
-                                            int write)
+                                            unsigned version, int big_endian, int write)
 {
        struct bkey_i *k = entry->start;
        int ret = 0;
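The reworked loop in journal_entry_validate_btree_keys() above only advances the cursor when journal_validate_key() did not return FSCK_DELETED_KEY: deleting a key shifts the following keys down over it, so the next key is already under the cursor. A minimal standalone sketch of that delete-in-place iteration pattern, with invented names (struct rec, rec_next, validate_rec); this is not bcachefs code:

    #include <stdio.h>
    #include <string.h>

    /* toy variable length record: 'len' payload bytes follow the header */
    struct rec {
            unsigned        len;
            int             bad;
    };

    #define REC_DELETED     1

    static struct rec *rec_next(struct rec *r)
    {
            return (struct rec *) ((char *) (r + 1) + r->len);
    }

    /* delete a bad record in place by shifting later records down over it */
    static int validate_rec(struct rec *r, char **end)
    {
            size_t gone = sizeof(*r) + r->len;

            if (!r->bad)
                    return 0;

            memmove(r, rec_next(r), *end - (char *) rec_next(r));
            *end -= gone;
            return REC_DELETED;
    }

    int main(void)
    {
            union { struct rec align; char bytes[4096]; } u;
            char *end = u.bytes;
            struct rec *r;
            int i;

            /* three zero-payload records; the middle one is bad */
            for (i = 0; i < 3; i++) {
                    r = (struct rec *) end;
                    *r = (struct rec) { .len = 0, .bad = i == 1 };
                    end += sizeof(*r);
            }

            /* same shape as the loop above: only advance when nothing was deleted */
            r = &u.align;
            while ((char *) r != end) {
                    if (validate_rec(r, &end) == REC_DELETED)
                            continue;
                    r = rec_next(r);
            }

            printf("%d records survive\n", (int) ((end - u.bytes) / sizeof(struct rec)));
            return 0;
    }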
@@ -266,25 +317,25 @@ static int journal_entry_validate_btree_root(struct bch_fs *c,
                return 0;
        }
 
-       return journal_validate_key(c, jset, entry, 1, entry->btree_id, k,
-                                   "btree root", write);
+       return journal_validate_key(c, where, entry, 1, entry->btree_id, k,
+                                   "btree root", version, big_endian, write);
 fsck_err:
        return ret;
 }
 
 static int journal_entry_validate_prio_ptrs(struct bch_fs *c,
-                                           struct jset *jset,
+                                           const char *where,
                                            struct jset_entry *entry,
-                                           int write)
+                                           unsigned version, int big_endian, int write)
 {
        /* obsolete, don't care: */
        return 0;
 }
 
 static int journal_entry_validate_blacklist(struct bch_fs *c,
-                                           struct jset *jset,
+                                           const char *where,
                                            struct jset_entry *entry,
-                                           int write)
+                                           unsigned version, int big_endian, int write)
 {
        int ret = 0;
 
@@ -297,9 +348,9 @@ fsck_err:
 }
 
 static int journal_entry_validate_blacklist_v2(struct bch_fs *c,
-                                              struct jset *jset,
+                                              const char *where,
                                               struct jset_entry *entry,
-                                              int write)
+                                              unsigned version, int big_endian, int write)
 {
        struct jset_entry_blacklist_v2 *bl_entry;
        int ret = 0;
@@ -323,9 +374,9 @@ fsck_err:
 }
 
 static int journal_entry_validate_usage(struct bch_fs *c,
-                                       struct jset *jset,
+                                       const char *where,
                                        struct jset_entry *entry,
-                                       int write)
+                                       unsigned version, int big_endian, int write)
 {
        struct jset_entry_usage *u =
                container_of(entry, struct jset_entry_usage, entry);
@@ -344,9 +395,9 @@ fsck_err:
 }
 
 static int journal_entry_validate_data_usage(struct bch_fs *c,
-                                       struct jset *jset,
+                                       const char *where,
                                        struct jset_entry *entry,
-                                       int write)
+                                       unsigned version, int big_endian, int write)
 {
        struct jset_entry_data_usage *u =
                container_of(entry, struct jset_entry_data_usage, entry);
@@ -365,9 +416,72 @@ fsck_err:
        return ret;
 }
 
+static int journal_entry_validate_clock(struct bch_fs *c,
+                                       const char *where,
+                                       struct jset_entry *entry,
+                                       unsigned version, int big_endian, int write)
+{
+       struct jset_entry_clock *clock =
+               container_of(entry, struct jset_entry_clock, entry);
+       unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+       int ret = 0;
+
+       if (journal_entry_err_on(bytes != sizeof(*clock),
+                                c, "invalid journal entry clock: bad size")) {
+               journal_entry_null_range(entry, vstruct_next(entry));
+               return ret;
+       }
+
+       if (journal_entry_err_on(clock->rw > 1,
+                                c, "invalid journal entry clock: bad rw")) {
+               journal_entry_null_range(entry, vstruct_next(entry));
+               return ret;
+       }
+
+fsck_err:
+       return ret;
+}
+
+static int journal_entry_validate_dev_usage(struct bch_fs *c,
+                                           const char *where,
+                                           struct jset_entry *entry,
+                                           unsigned version, int big_endian, int write)
+{
+       struct jset_entry_dev_usage *u =
+               container_of(entry, struct jset_entry_dev_usage, entry);
+       unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+       unsigned expected = sizeof(*u) + sizeof(u->d[0]) * 7; /* Current value of BCH_DATA_NR */
+       unsigned dev;
+       int ret = 0;
+
+       if (journal_entry_err_on(bytes < expected,
+                                c, "invalid journal entry dev usage: bad size (%u < %u)",
+                                bytes, expected)) {
+               journal_entry_null_range(entry, vstruct_next(entry));
+               return ret;
+       }
+
+       dev = le32_to_cpu(u->dev);
+
+       if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
+                                c, "invalid journal entry dev usage: bad dev")) {
+               journal_entry_null_range(entry, vstruct_next(entry));
+               return ret;
+       }
+
+       if (journal_entry_err_on(u->pad,
+                                c, "invalid journal entry dev usage: bad pad")) {
+               journal_entry_null_range(entry, vstruct_next(entry));
+               return ret;
+       }
+
+fsck_err:
+       return ret;
+}
+
 struct jset_entry_ops {
-       int (*validate)(struct bch_fs *, struct jset *,
-                       struct jset_entry *, int);
+       int (*validate)(struct bch_fs *, const char *,
+                       struct jset_entry *, unsigned, int, int);
 };
 
 static const struct jset_entry_ops bch2_jset_entry_ops[] = {
@@ -379,22 +493,29 @@ static const struct jset_entry_ops bch2_jset_entry_ops[] = {
 #undef x
 };
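The bch2_jset_entry_ops[] table just above (note the #undef x) is generated with an x-macro, so the list of entry types and their validate callbacks cannot drift apart when the validate signature changes, as it does in this patch. A standalone sketch of that pattern with invented names (ENTRY_TYPES, entry_ops, validate_*); the real macro lives in the bcachefs headers, not here:

    #include <stdio.h>

    #define ENTRY_TYPES()                           \
            x(btree_keys,   0)                      \
            x(btree_root,   1)                      \
            x(clock,        2)

    enum entry_type {
    #define x(f, nr)        ENTRY_##f = nr,
            ENTRY_TYPES()
    #undef x
            ENTRY_NR
    };

    struct entry_ops {
            const char      *name;
            int             (*validate)(void);
    };

    /* one trivial validator per entry type */
    #define x(f, nr)        static int validate_##f(void) { return 0; }
    ENTRY_TYPES()
    #undef x

    /* the table itself: indexed by entry type, built from the same list */
    static const struct entry_ops entry_ops[] = {
    #define x(f, nr)        [ENTRY_##f] = { .name = #f, .validate = validate_##f },
            ENTRY_TYPES()
    #undef x
    };

    int main(void)
    {
            int t;

            for (t = 0; t < ENTRY_NR; t++)
                    printf("%-12s -> %d\n", entry_ops[t].name, entry_ops[t].validate());
            return 0;
    }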
 
-static int journal_entry_validate(struct bch_fs *c, struct jset *jset,
-                                 struct jset_entry *entry, int write)
+int bch2_journal_entry_validate(struct bch_fs *c, const char *where,
+                               struct jset_entry *entry,
+                               unsigned version, int big_endian, int write)
 {
        return entry->type < BCH_JSET_ENTRY_NR
-               ? bch2_jset_entry_ops[entry->type].validate(c, jset,
-                                                           entry, write)
+               ? bch2_jset_entry_ops[entry->type].validate(c, where, entry,
+                               version, big_endian, write)
                : 0;
 }
 
 static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
                                 int write)
 {
+       char buf[100];
        struct jset_entry *entry;
        int ret = 0;
 
        vstruct_for_each(jset, entry) {
+               scnprintf(buf, sizeof(buf), "jset %llu entry offset %zi/%u",
+                         le64_to_cpu(jset->seq),
+                         (u64 *) entry - jset->_data,
+                         le32_to_cpu(jset->u64s));
+
                if (journal_entry_err_on(vstruct_next(entry) >
                                         vstruct_last(jset), c,
                                "journal entry extends past end of jset")) {
@@ -402,7 +523,9 @@ static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
                        break;
                }
 
-               ret = journal_entry_validate(c, jset, entry, write);
+               ret = bch2_journal_entry_validate(c, buf, entry,
+                                       le32_to_cpu(jset->version),
+                                       JSET_BIG_ENDIAN(jset), write);
                if (ret)
                        break;
        }
@@ -430,52 +553,70 @@ static int jset_validate(struct bch_fs *c,
                                  version < bcachefs_metadata_version_min) ||
                                 version >= bcachefs_metadata_version_max, c,
                        "%s sector %llu seq %llu: unknown journal entry version %u",
-                       ca->name, sector, le64_to_cpu(jset->seq),
+                       ca ? ca->name : c->name,
+                       sector, le64_to_cpu(jset->seq),
                        version)) {
-               /* XXX: note we might have missing journal entries */
-               return JOURNAL_ENTRY_BAD;
+               /* don't try to continue: */
+               return EINVAL;
        }
 
+       if (bytes > (sectors_read << 9) &&
+           sectors_read < bucket_sectors_left)
+               return JOURNAL_ENTRY_REREAD;
+
        if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c,
                        "%s sector %llu seq %llu: journal entry too big (%zu bytes)",
-                       ca->name, sector, le64_to_cpu(jset->seq), bytes)) {
-               /* XXX: note we might have missing journal entries */
-               return JOURNAL_ENTRY_BAD;
+                       ca ? ca->name : c->name,
+                       sector, le64_to_cpu(jset->seq), bytes)) {
+               ret = JOURNAL_ENTRY_BAD;
+               le32_add_cpu(&jset->u64s,
+                            -((bytes - (bucket_sectors_left << 9)) / 8));
        }
 
-       if (bytes > sectors_read << 9)
-               return JOURNAL_ENTRY_REREAD;
-
-       if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c,
+       if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c,
                        "%s sector %llu seq %llu: journal entry with unknown csum type %llu",
-                       ca->name, sector, le64_to_cpu(jset->seq),
-                       JSET_CSUM_TYPE(jset)))
-               return JOURNAL_ENTRY_BAD;
+                       ca ? ca->name : c->name,
+                       sector, le64_to_cpu(jset->seq),
+                       JSET_CSUM_TYPE(jset))) {
+               ret = JOURNAL_ENTRY_BAD;
+               goto csum_done;
+       }
+
+       if (write)
+               goto csum_done;
 
        csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset);
        if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c,
                                 "%s sector %llu seq %llu: journal checksum bad",
-                                ca->name, sector, le64_to_cpu(jset->seq))) {
-               /* XXX: retry IO, when we start retrying checksum errors */
-               /* XXX: note we might have missing journal entries */
-               return JOURNAL_ENTRY_BAD;
-       }
+                                ca ? ca->name : c->name,
+                                sector, le64_to_cpu(jset->seq)))
+               ret = JOURNAL_ENTRY_BAD;
 
        bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
                     jset->encrypted_start,
                     vstruct_end(jset) - (void *) jset->encrypted_start);
-
-       if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c,
-                                "invalid journal entry: last_seq > seq")) {
+csum_done:
+       /* last_seq is ignored when JSET_NO_FLUSH is true */
+       if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
+                                le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c,
+                                "invalid journal entry: last_seq > seq (%llu > %llu)",
+                                le64_to_cpu(jset->last_seq),
+                                le64_to_cpu(jset->seq))) {
                jset->last_seq = jset->seq;
                return JOURNAL_ENTRY_BAD;
        }
-
-       return 0;
 fsck_err:
        return ret;
 }
 
+static int jset_validate_for_write(struct bch_fs *c, struct jset *jset)
+{
+       unsigned sectors = vstruct_sectors(jset, c->block_bits);
+
+       return jset_validate(c, NULL, jset, 0, sectors, sectors, WRITE) ?:
+               jset_validate_entries(c, jset, WRITE);
+}
+
 struct journal_read_buf {
        void            *data;
        size_t          size;
@@ -536,10 +677,17 @@ reread:
                        bio_put(bio);
 
                        if (bch2_dev_io_err_on(ret, ca,
-                                              "journal read from sector %llu",
+                                              "journal read error: sector %llu",
                                               offset) ||
-                           bch2_meta_read_fault("journal"))
-                               return -EIO;
+                           bch2_meta_read_fault("journal")) {
+                               /*
+                                * We don't error out of the recovery process
+                                * here, since the relevant journal entry may be
+                                * found on a different device, and missing or
+                                * no journal entries will be handled later
+                                */
+                               return 0;
+                       }
 
                        j = buf->data;
                }
@@ -589,7 +737,10 @@ reread:
                ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
 
                mutex_lock(&jlist->lock);
-               ret = journal_entry_add(c, ca, jlist, j, ret != 0);
+               ret = journal_entry_add(c, ca, (struct bch_extent_ptr) {
+                                       .dev = ca->dev_idx,
+                                       .offset = offset,
+                                       }, jlist, j, ret != 0);
                mutex_unlock(&jlist->lock);
 
                switch (ret) {
@@ -677,14 +828,35 @@ err:
        goto out;
 }
 
-int bch2_journal_read(struct bch_fs *c, struct list_head *list)
+static void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
+                                     struct journal_replay *j)
+{
+       unsigned i;
+
+       for (i = 0; i < j->nr_ptrs; i++) {
+               struct bch_dev *ca = c->devs[j->ptrs[i].dev];
+               u64 offset;
+
+               div64_u64_rem(j->ptrs[i].offset, ca->mi.bucket_size, &offset);
+
+               if (i)
+                       pr_buf(out, " ");
+               pr_buf(out, "%u:%llu (offset %llu)",
+                      j->ptrs[i].dev,
+                      (u64) j->ptrs[i].offset, offset);
+       }
+}
+
+int bch2_journal_read(struct bch_fs *c, struct list_head *list,
+                     u64 *blacklist_seq, u64 *start_seq)
 {
        struct journal_list jlist;
-       struct journal_replay *i;
+       struct journal_replay *i, *t;
        struct bch_dev *ca;
        unsigned iter;
        size_t keys = 0, entries = 0;
        bool degraded = false;
+       u64 seq, last_seq = 0;
        int ret = 0;
 
        closure_init_stack(&jlist.cl);
@@ -697,8 +869,8 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
                    !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
                        continue;
 
-               if ((ca->mi.state == BCH_MEMBER_STATE_RW ||
-                    ca->mi.state == BCH_MEMBER_STATE_RO) &&
+               if ((ca->mi.state == BCH_MEMBER_STATE_rw ||
+                    ca->mi.state == BCH_MEMBER_STATE_ro) &&
                    percpu_ref_tryget(&ca->io_ref))
                        closure_call(&ca->journal.read,
                                     bch2_journal_read_device,
@@ -713,23 +885,129 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
        if (jlist.ret)
                return jlist.ret;
 
+       if (list_empty(list)) {
+               bch_info(c, "journal read done, but no entries found");
+               return 0;
+       }
+
+       i = list_last_entry(list, struct journal_replay, list);
+       *start_seq = le64_to_cpu(i->j.seq) + 1;
+
+       /*
+        * Find the most recent flush entry, and ignore newer non-flush entries -
+        * those entries will be blacklisted:
+        */
+       list_for_each_entry_safe_reverse(i, t, list, list) {
+               if (i->ignore)
+                       continue;
+
+               if (!JSET_NO_FLUSH(&i->j)) {
+                       last_seq        = le64_to_cpu(i->j.last_seq);
+                       *blacklist_seq  = le64_to_cpu(i->j.seq) + 1;
+                       break;
+               }
+
+               journal_replay_free(c, i);
+       }
+
+       if (!last_seq) {
+               fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
+               return -1;
+       }
+
+       /* Drop blacklisted entries and entries older than last_seq: */
+       list_for_each_entry_safe(i, t, list, list) {
+               if (i->ignore)
+                       continue;
+
+               seq = le64_to_cpu(i->j.seq);
+               if (seq < last_seq) {
+                       journal_replay_free(c, i);
+                       continue;
+               }
+
+               if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
+                       fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
+                                   "found blacklisted journal entry %llu", seq);
+
+                       journal_replay_free(c, i);
+               }
+       }
+
+       /* Check for missing entries: */
+       seq = last_seq;
+       list_for_each_entry(i, list, list) {
+               if (i->ignore)
+                       continue;
+
+               BUG_ON(seq > le64_to_cpu(i->j.seq));
+
+               while (seq < le64_to_cpu(i->j.seq)) {
+                       u64 missing_start, missing_end;
+                       char buf1[200], buf2[200];
+
+                       while (seq < le64_to_cpu(i->j.seq) &&
+                              bch2_journal_seq_is_blacklisted(c, seq, false))
+                               seq++;
+
+                       if (seq == le64_to_cpu(i->j.seq))
+                               break;
+
+                       missing_start = seq;
+
+                       while (seq < le64_to_cpu(i->j.seq) &&
+                              !bch2_journal_seq_is_blacklisted(c, seq, false))
+                               seq++;
+
+                       if (i->list.prev != list) {
+                               struct printbuf out = PBUF(buf1);
+                               struct journal_replay *p = list_prev_entry(i, list);
+
+                               bch2_journal_ptrs_to_text(&out, c, p);
+                               pr_buf(&out, " size %llu", vstruct_sectors(&p->j, c->block_bits));
+                       } else
+                               sprintf(buf1, "(none)");
+                       bch2_journal_ptrs_to_text(&PBUF(buf2), c, i);
+
+                       missing_end = seq - 1;
+                       fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
+                                "  prev at %s\n"
+                                "  next at %s",
+                                missing_start, missing_end,
+                                last_seq, *blacklist_seq - 1,
+                                buf1, buf2);
+               }
+
+               seq++;
+       }
+
        list_for_each_entry(i, list, list) {
                struct jset_entry *entry;
                struct bkey_i *k, *_n;
-               struct bch_replicas_padded replicas;
+               struct bch_replicas_padded replicas = {
+                       .e.data_type = BCH_DATA_journal,
+                       .e.nr_required = 1,
+               };
+               unsigned ptr;
                char buf[80];
 
+               if (i->ignore)
+                       continue;
+
                ret = jset_validate_entries(c, &i->j, READ);
                if (ret)
                        goto fsck_err;
 
+               for (ptr = 0; ptr < i->nr_ptrs; ptr++)
+                       replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev;
+
+               bch2_replicas_entry_sort(&replicas.e);
+
                /*
                 * If we're mounting in degraded mode - if we didn't read all
                 * the devices - this is wrong:
                 */
 
-               bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, i->devs);
-
                if (!degraded &&
                    (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
                     fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c,
@@ -746,12 +1024,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
                entries++;
        }
 
-       if (!list_empty(list)) {
-               i = list_last_entry(list, struct journal_replay, list);
+       bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
+                keys, entries, *start_seq);
 
-               bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
-                        keys, entries, le64_to_cpu(i->j.seq));
-       }
+       if (*start_seq != *blacklist_seq)
+               bch_info(c, "dropped unflushed entries %llu-%llu",
+                        *blacklist_seq, *start_seq - 1);
 fsck_err:
        return ret;
 }
@@ -785,7 +1063,7 @@ static void __journal_write_alloc(struct journal *j,
                 * it:
                 */
                if (!ca->mi.durability ||
-                   ca->mi.state != BCH_MEMBER_STATE_RW ||
+                   ca->mi.state != BCH_MEMBER_STATE_rw ||
                    !ja->nr ||
                    bch2_bkey_has_device(bkey_i_to_s_c(&w->key),
                                         ca->dev_idx) ||
@@ -820,16 +1098,20 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
                               unsigned sectors)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       struct bch_devs_mask devs;
        struct journal_device *ja;
        struct bch_dev *ca;
        struct dev_alloc_list devs_sorted;
+       unsigned target = c->opts.metadata_target ?:
+               c->opts.foreground_target;
        unsigned i, replicas = 0, replicas_want =
                READ_ONCE(c->opts.metadata_replicas);
 
        rcu_read_lock();
+retry:
+       devs = target_rw_devs(c, BCH_DATA_journal, target);
 
-       devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe,
-                                         &c->rw_devs[BCH_DATA_journal]);
+       devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);
 
        __journal_write_alloc(j, w, &devs_sorted,
                              sectors, &replicas, replicas_want);
@@ -861,9 +1143,17 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
 
        __journal_write_alloc(j, w, &devs_sorted,
                              sectors, &replicas, replicas_want);
+
+       if (replicas < replicas_want && target) {
+               /* Retry from all devices: */
+               target = 0;
+               goto retry;
+       }
 done:
        rcu_read_unlock();
 
+       BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);
+
        return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
 }
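journal_write_alloc() above first tries to place the write on the configured metadata/foreground target and only falls back to all rw devices when that does not yield enough replicas. A standalone sketch of that retry-with-fallback shape, with invented names (alloc_from, alloc_replicas) and made-up device counts; the real code accumulates replicas across both passes rather than starting over:

    #include <stdio.h>

    /* pretend the restricted target only has one usable device */
    static unsigned alloc_from(unsigned target, unsigned want)
    {
            unsigned avail = target ? 1 : 8;

            return want < avail ? want : avail;
    }

    static unsigned alloc_replicas(unsigned target, unsigned want)
    {
            unsigned got;
    retry:
            got = alloc_from(target, want);

            if (got < want && target) {
                    /* preferred target exhausted: retry from all devices */
                    target = 0;
                    goto retry;
            }
            return got;
    }

    int main(void)
    {
            printf("allocated %u replicas\n", alloc_replicas(3, 2));
            return 0;
    }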
 
@@ -924,41 +1214,61 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
                return;
 
        memcpy(new_buf, buf->data, buf->buf_size);
-       kvpfree(buf->data, buf->buf_size);
-       buf->data       = new_buf;
-       buf->buf_size   = new_size;
+
+       spin_lock(&j->lock);
+       swap(buf->data,         new_buf);
+       swap(buf->buf_size,     new_size);
+       spin_unlock(&j->lock);
+
+       kvpfree(new_buf, new_size);
+}
+
+static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
+{
+       return j->buf + j->reservations.unwritten_idx;
 }
 
 static void journal_write_done(struct closure *cl)
 {
        struct journal *j = container_of(cl, struct journal, io);
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       struct journal_buf *w = journal_prev_buf(j);
+       struct journal_buf *w = journal_last_unwritten_buf(j);
        struct bch_devs_list devs =
                bch2_bkey_devs(bkey_i_to_s_c(&w->key));
        struct bch_replicas_padded replicas;
-       u64 seq = le64_to_cpu(w->data->seq);
-       u64 last_seq = le64_to_cpu(w->data->last_seq);
+       union journal_res_state old, new;
+       u64 v, seq, last_seq;
+       int err = 0;
 
        bch2_time_stats_update(j->write_time, j->write_start_time);
 
        if (!devs.nr) {
                bch_err(c, "unable to write journal to sufficient devices");
-               goto err;
+               err = -EIO;
+       } else {
+               bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs);
+               if (bch2_mark_replicas(c, &replicas.e))
+                       err = -EIO;
        }
 
-       bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs);
-
-       if (bch2_mark_replicas(c, &replicas.e))
-               goto err;
+       if (err)
+               bch2_fatal_error(c);
 
        spin_lock(&j->lock);
+       seq = le64_to_cpu(w->data->seq);
+       last_seq = le64_to_cpu(w->data->last_seq);
+
        if (seq >= j->pin.front)
                journal_seq_pin(j, seq)->devs = devs;
 
        j->seq_ondisk           = seq;
-       j->last_seq_ondisk      = last_seq;
-       bch2_journal_space_available(j);
+       if (err && (!j->err_seq || seq < j->err_seq))
+               j->err_seq      = seq;
+
+       if (!JSET_NO_FLUSH(w->data)) {
+               j->flushed_seq_ondisk = seq;
+               j->last_seq_ondisk = last_seq;
+       }
 
        /*
         * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
@@ -967,14 +1277,21 @@ static void journal_write_done(struct closure *cl)
         * Must come before signaling write completion, for
         * bch2_fs_journal_stop():
         */
-       mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0);
-out:
+       journal_reclaim_kick(&c->journal);
+
        /* also must come before signalling write completion: */
        closure_debug_destroy(cl);
 
-       BUG_ON(!j->reservations.prev_buf_unwritten);
-       atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v,
-                    &j->reservations.counter);
+       v = atomic64_read(&j->reservations.counter);
+       do {
+               old.v = new.v = v;
+               BUG_ON(new.idx == new.unwritten_idx);
+
+               new.unwritten_idx++;
+       } while ((v = atomic64_cmpxchg(&j->reservations.counter,
+                                      old.v, new.v)) != old.v);
+
+       bch2_journal_space_available(j);
 
        closure_wake_up(&w->wait);
        journal_wake(j);
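The cmpxchg loop above retires the oldest unwritten journal buffer without holding a lock, by packing the buffer indices into a single atomically updated 64-bit word. A standalone sketch of that pattern with an invented field layout (bit-fields on a u64 are compiler-specific, which is fine for a sketch); this is not the bcachefs union journal_res_state:

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* several small fields packed into one atomically updated 64 bit word */
    union res_state {
            uint64_t v;
            struct {
                    uint64_t idx            : 2;    /* buffer currently being filled */
                    uint64_t unwritten_idx  : 2;    /* oldest buffer not yet on disk */
                    uint64_t rest           : 60;
            };
    };

    static _Atomic uint64_t counter;

    static void mark_oldest_buf_written(void)
    {
            union res_state new;
            uint64_t v = atomic_load(&counter);

            do {
                    new.v = v;
                    if (new.idx == new.unwritten_idx)       /* nothing unwritten */
                            abort();
                    new.unwritten_idx = (new.unwritten_idx + 1) & 3;
            } while (!atomic_compare_exchange_weak(&counter, &v, new.v));
    }

    int main(void)
    {
            union res_state s = { 0 };

            s.idx = 1;              /* one buffer open for filling, none written yet */
            atomic_store(&counter, s.v);

            mark_oldest_buf_written();

            s.v = atomic_load(&counter);
            printf("unwritten_idx is now %u\n", (unsigned) s.unwritten_idx);
            return 0;
    }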
@@ -982,11 +1299,10 @@ out:
        if (test_bit(JOURNAL_NEED_WRITE, &j->flags))
                mod_delayed_work(system_freezable_wq, &j->write_work, 0);
        spin_unlock(&j->lock);
-       return;
-err:
-       bch2_fatal_error(c);
-       spin_lock(&j->lock);
-       goto out;
+
+       if (new.unwritten_idx != new.idx &&
+           !journal_state_count(new, new.unwritten_idx))
+               closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL);
 }
 
 static void journal_write_endio(struct bio *bio)
@@ -994,10 +1310,10 @@ static void journal_write_endio(struct bio *bio)
        struct bch_dev *ca = bio->bi_private;
        struct journal *j = &ca->fs->journal;
 
-       if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write: %s",
+       if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write error: %s",
                               bch2_blk_status_to_str(bio->bi_status)) ||
            bch2_meta_write_fault("journal")) {
-               struct journal_buf *w = journal_prev_buf(j);
+               struct journal_buf *w = journal_last_unwritten_buf(j);
                unsigned long flags;
 
                spin_lock_irqsave(&j->err_lock, flags);
@@ -1009,27 +1325,93 @@ static void journal_write_endio(struct bio *bio)
        percpu_ref_put(&ca->io_ref);
 }
 
+static void do_journal_write(struct closure *cl)
+{
+       struct journal *j = container_of(cl, struct journal, io);
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       struct bch_dev *ca;
+       struct journal_buf *w = journal_last_unwritten_buf(j);
+       struct bch_extent_ptr *ptr;
+       struct bio *bio;
+       unsigned sectors = vstruct_sectors(w->data, c->block_bits);
+
+       extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
+               ca = bch_dev_bkey_exists(c, ptr->dev);
+               if (!percpu_ref_tryget(&ca->io_ref)) {
+                       /* XXX: fix this */
+                       bch_err(c, "missing device for journal write\n");
+                       continue;
+               }
+
+               this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
+                            sectors);
+
+               bio = ca->journal.bio;
+               bio_reset(bio);
+               bio_set_dev(bio, ca->disk_sb.bdev);
+               bio->bi_iter.bi_sector  = ptr->offset;
+               bio->bi_end_io          = journal_write_endio;
+               bio->bi_private         = ca;
+               bio->bi_opf             = REQ_OP_WRITE|REQ_SYNC|REQ_META;
+
+               BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
+               ca->prev_journal_sector = bio->bi_iter.bi_sector;
+
+               if (!JSET_NO_FLUSH(w->data))
+                       bio->bi_opf    |= REQ_FUA;
+               if (!JSET_NO_FLUSH(w->data) && !w->separate_flush)
+                       bio->bi_opf    |= REQ_PREFLUSH;
+
+               bch2_bio_map(bio, w->data, sectors << 9);
+
+               trace_journal_write(bio);
+               closure_bio_submit(bio, cl);
+
+               ca->journal.bucket_seq[ca->journal.cur_idx] =
+                       le64_to_cpu(w->data->seq);
+       }
+
+       continue_at(cl, journal_write_done, system_highpri_wq);
+       return;
+}
+
 void bch2_journal_write(struct closure *cl)
 {
        struct journal *j = container_of(cl, struct journal, io);
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
-       struct journal_buf *w = journal_prev_buf(j);
+       struct journal_buf *w = journal_last_unwritten_buf(j);
        struct jset_entry *start, *end;
        struct jset *jset;
        struct bio *bio;
-       struct bch_extent_ptr *ptr;
+       char *journal_debug_buf = NULL;
        bool validate_before_checksum = false;
-       unsigned i, sectors, bytes, u64s;
+       unsigned i, sectors, bytes, u64s, nr_rw_members = 0;
        int ret;
 
-       bch2_journal_pin_put(j, le64_to_cpu(w->data->seq));
+       BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
 
        journal_buf_realloc(j, w);
        jset = w->data;
 
        j->write_start_time = local_clock();
 
+       spin_lock(&j->lock);
+       if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) &&
+           !w->must_flush &&
+           (jiffies - j->last_flush_write) < msecs_to_jiffies(j->write_delay_ms) &&
+           test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) {
+               w->noflush = true;
+               SET_JSET_NO_FLUSH(jset, true);
+               jset->last_seq = 0;
+
+               j->nr_noflush_writes++;
+       } else {
+               j->last_flush_write = jiffies;
+               j->nr_flush_writes++;
+       }
+       spin_unlock(&j->lock);
+
        /*
         * New btree roots are set by journalling them; when the journal entry
         * gets written we have to propagate them to c->btree_roots
@@ -1046,8 +1428,8 @@ void bch2_journal_write(struct closure *cl)
 
        end     = bch2_btree_roots_to_journal_entries(c, jset->start, end);
 
-       end     = bch2_journal_super_entries_add_common(c, end,
-                                               le64_to_cpu(jset->seq));
+       bch2_journal_super_entries_add_common(c, &end,
+                               le64_to_cpu(jset->seq));
        u64s    = (u64 *) end - (u64 *) start;
        BUG_ON(u64s > j->entry_u64s_reserved);
 
@@ -1056,10 +1438,7 @@ void bch2_journal_write(struct closure *cl)
 
        journal_write_compact(jset);
 
-       jset->read_clock        = cpu_to_le16(c->bucket_clock[READ].hand);
-       jset->write_clock       = cpu_to_le16(c->bucket_clock[WRITE].hand);
        jset->magic             = cpu_to_le64(jset_magic(c));
-
        jset->version           = c->sb.version < bcachefs_metadata_version_new_versioning
                ? cpu_to_le32(BCH_JSET_VERSION_OLD)
                : cpu_to_le32(c->sb.version);
@@ -1067,14 +1446,17 @@ void bch2_journal_write(struct closure *cl)
        SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
        SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
 
+       if (journal_entry_empty(jset))
+               j->last_empty_seq = le64_to_cpu(jset->seq);
+
        if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
                validate_before_checksum = true;
 
-       if (le32_to_cpu(jset->version) < bcachefs_metadata_version_max)
+       if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current)
                validate_before_checksum = true;
 
        if (validate_before_checksum &&
-           jset_validate_entries(c, jset, WRITE))
+           jset_validate_for_write(c, jset))
                goto err;
 
        bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
@@ -1085,7 +1467,7 @@ void bch2_journal_write(struct closure *cl)
                                  journal_nonce(jset), jset);
 
        if (!validate_before_checksum &&
-           jset_validate_entries(c, jset, WRITE))
+           jset_validate_for_write(c, jset))
                goto err;
 
        sectors = vstruct_sectors(jset, c->block_bits);
@@ -1104,6 +1486,12 @@ retry_alloc:
                goto retry_alloc;
        }
 
+       if (ret) {
+               journal_debug_buf = kmalloc(4096, GFP_ATOMIC);
+               if (journal_debug_buf)
+                       __bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j);
+       }
+
        /*
         * write is allocated, no longer need to account for it in
         * bch2_journal_space_available():
@@ -1118,7 +1506,9 @@ retry_alloc:
        spin_unlock(&j->lock);
 
        if (ret) {
-               bch_err(c, "Unable to allocate journal write");
+               bch_err(c, "Unable to allocate journal write:\n%s",
+                       journal_debug_buf);
+               kfree(journal_debug_buf);
                bch2_fatal_error(c);
                continue_at(cl, journal_write_done, system_highpri_wq);
                return;
@@ -1131,36 +1521,14 @@ retry_alloc:
        if (c->opts.nochanges)
                goto no_io;
 
-       extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
-               ca = bch_dev_bkey_exists(c, ptr->dev);
-               if (!percpu_ref_tryget(&ca->io_ref)) {
-                       /* XXX: fix this */
-                       bch_err(c, "missing device for journal write\n");
-                       continue;
-               }
-
-               this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
-                            sectors);
-
-               bio = ca->journal.bio;
-               bio_reset(bio);
-               bio_set_dev(bio, ca->disk_sb.bdev);
-               bio->bi_iter.bi_sector  = ptr->offset;
-               bio->bi_end_io          = journal_write_endio;
-               bio->bi_private         = ca;
-               bio_set_op_attrs(bio, REQ_OP_WRITE,
-                                REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
-               bch2_bio_map(bio, jset, sectors << 9);
-
-               trace_journal_write(bio);
-               closure_bio_submit(bio, cl);
+       for_each_rw_member(ca, c, i)
+               nr_rw_members++;
 
-               ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq);
-       }
+       if (nr_rw_members > 1)
+               w->separate_flush = true;
 
-       for_each_rw_member(ca, c, i)
-               if (journal_flushes_device(ca) &&
-                   !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
+       if (!JSET_NO_FLUSH(jset) && w->separate_flush) {
+               for_each_rw_member(ca, c, i) {
                        percpu_ref_get(&ca->io_ref);
 
                        bio = ca->journal.bio;
@@ -1171,7 +1539,12 @@ retry_alloc:
                        bio->bi_private         = ca;
                        closure_bio_submit(bio, cl);
                }
+       }
+
+       bch2_bucket_seq_cleanup(c);
 
+       continue_at(cl, do_journal_write, system_highpri_wq);
+       return;
 no_io:
        bch2_bucket_seq_cleanup(c);
 
index 6958ee0f8cf23da1ab5a9c0588fedb3d8679678c..f34281a28f12bc64f06dc62383c16af1f3389129 100644 (file)
@@ -8,9 +8,12 @@
  */
 struct journal_replay {
        struct list_head        list;
-       struct bch_devs_list    devs;
+       struct bch_extent_ptr   ptrs[BCH_REPLICAS_MAX];
+       unsigned                nr_ptrs;
+
        /* checksum error, but we may want to try using it anyways: */
        bool                    bad;
+       bool                    ignore;
        /* must be last: */
        struct jset             j;
 };
@@ -37,7 +40,10 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
        for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys)        \
                vstruct_for_each_safe(entry, k, _n)
 
-int bch2_journal_read(struct bch_fs *, struct list_head *);
+int bch2_journal_entry_validate(struct bch_fs *, const char *, struct jset_entry *,
+                               unsigned, int, int);
+
+int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *);
 
 void bch2_journal_write(struct closure *);
 
index 57591983eebd420c604453e9f4db8d130c46aae3..7be6c65c1abea43c0f79f3e23c24a11b1c5d497b 100644 (file)
@@ -1,12 +1,18 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "bcachefs.h"
+#include "btree_key_cache.h"
+#include "error.h"
 #include "journal.h"
 #include "journal_io.h"
 #include "journal_reclaim.h"
 #include "replicas.h"
 #include "super.h"
 
+#include <linux/kthread.h>
+#include <linux/sched/mm.h>
+#include <trace/events/bcachefs.h>
+
 /* Free space calculations: */
 
 static unsigned journal_space_from(struct journal_device *ja,
@@ -53,82 +59,108 @@ static void journal_set_remaining(struct journal *j, unsigned u64s_remaining)
                                       old.v, new.v)) != old.v);
 }
 
-static struct journal_space {
-       unsigned        next_entry;
-       unsigned        remaining;
-} __journal_space_available(struct journal *j, unsigned nr_devs_want,
-                           enum journal_space_from from)
+static inline unsigned get_unwritten_sectors(struct journal *j, unsigned *idx)
 {
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       struct bch_dev *ca;
-       unsigned sectors_next_entry     = UINT_MAX;
-       unsigned sectors_total          = UINT_MAX;
-       unsigned i, nr_devs = 0;
-       unsigned unwritten_sectors = j->reservations.prev_buf_unwritten
-               ? journal_prev_buf(j)->sectors
-               : 0;
+       unsigned sectors = 0;
 
-       rcu_read_lock();
-       for_each_member_device_rcu(ca, c, i,
-                                  &c->rw_devs[BCH_DATA_journal]) {
-               struct journal_device *ja = &ca->journal;
-               unsigned buckets_this_device, sectors_this_device;
+       while (!sectors && *idx != j->reservations.idx) {
+               sectors = j->buf[*idx].sectors;
 
-               if (!ja->nr)
-                       continue;
+               *idx = (*idx + 1) & JOURNAL_BUF_MASK;
+       }
 
-               buckets_this_device = bch2_journal_dev_buckets_available(j, ja, from);
-               sectors_this_device = ja->sectors_free;
+       return sectors;
+}
 
-               /*
-                * We that we don't allocate the space for a journal entry
-                * until we write it out - thus, account for it here:
-                */
-               if (unwritten_sectors >= sectors_this_device) {
-                       if (!buckets_this_device)
-                               continue;
+static struct journal_space
+journal_dev_space_available(struct journal *j, struct bch_dev *ca,
+                           enum journal_space_from from)
+{
+       struct journal_device *ja = &ca->journal;
+       unsigned sectors, buckets, unwritten, idx = j->reservations.unwritten_idx;
 
-                       buckets_this_device--;
-                       sectors_this_device = ca->mi.bucket_size;
-               }
+       if (from == journal_space_total)
+               return (struct journal_space) {
+                       .next_entry     = ca->mi.bucket_size,
+                       .total          = ca->mi.bucket_size * ja->nr,
+               };
 
-               sectors_this_device -= unwritten_sectors;
+       buckets = bch2_journal_dev_buckets_available(j, ja, from);
+       sectors = ja->sectors_free;
 
-               if (sectors_this_device < ca->mi.bucket_size &&
-                   buckets_this_device) {
-                       buckets_this_device--;
-                       sectors_this_device = ca->mi.bucket_size;
+       /*
+        * Note that we don't allocate the space for a journal entry
+        * until we write it out - thus, account for it here:
+        */
+       while ((unwritten = get_unwritten_sectors(j, &idx))) {
+               if (unwritten >= sectors) {
+                       if (!buckets) {
+                               sectors = 0;
+                               break;
+                       }
+
+                       buckets--;
+                       sectors = ca->mi.bucket_size;
                }
 
-               if (!sectors_this_device)
+               sectors -= unwritten;
+       }
+
+       if (sectors < ca->mi.bucket_size && buckets) {
+               buckets--;
+               sectors = ca->mi.bucket_size;
+       }
+
+       return (struct journal_space) {
+               .next_entry     = sectors,
+               .total          = sectors + buckets * ca->mi.bucket_size,
+       };
+}
+
+static struct journal_space __journal_space_available(struct journal *j, unsigned nr_devs_want,
+                           enum journal_space_from from)
+{
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       struct bch_dev *ca;
+       unsigned i, pos, nr_devs = 0;
+       struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX];
+
+       BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space));
+
+       rcu_read_lock();
+       for_each_member_device_rcu(ca, c, i,
+                                  &c->rw_devs[BCH_DATA_journal]) {
+               if (!ca->journal.nr)
                        continue;
 
-               sectors_next_entry = min(sectors_next_entry,
-                                        sectors_this_device);
+               space = journal_dev_space_available(j, ca, from);
+               if (!space.next_entry)
+                       continue;
 
-               sectors_total = min(sectors_total,
-                       buckets_this_device * ca->mi.bucket_size +
-                       sectors_this_device);
+               for (pos = 0; pos < nr_devs; pos++)
+                       if (space.total > dev_space[pos].total)
+                               break;
 
-               nr_devs++;
+               array_insert_item(dev_space, nr_devs, pos, space);
        }
        rcu_read_unlock();
 
        if (nr_devs < nr_devs_want)
                return (struct journal_space) { 0, 0 };
 
-       return (struct journal_space) {
-               .next_entry     = sectors_next_entry,
-               .remaining      = max_t(int, 0, sectors_total - sectors_next_entry),
-       };
+       /*
+        * We sorted largest to smallest, and we want the smallest out of the
+        * @nr_devs_want largest devices:
+        */
+       return dev_space[nr_devs_want - 1];
 }
 
 void bch2_journal_space_available(struct journal *j)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
-       struct journal_space discarded, clean_ondisk, clean;
-       unsigned overhead, u64s_remaining = 0;
+       unsigned clean, clean_ondisk, total;
+       s64 u64s_remaining = 0;
        unsigned max_entry_size  = min(j->buf[0].buf_size >> 9,
                                       j->buf[1].buf_size >> 9);
        unsigned i, nr_online = 0, nr_devs_want;
@@ -164,31 +196,53 @@ void bch2_journal_space_available(struct journal *j)
        j->can_discard = can_discard;
 
        if (nr_online < c->opts.metadata_replicas_required) {
-               ret = -EROFS;
-               goto out;
-       }
-
-       if (!fifo_free(&j->pin)) {
-               ret = -ENOSPC;
+               ret = cur_entry_insufficient_devices;
                goto out;
        }
 
        nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas);
 
-       discarded       = __journal_space_available(j, nr_devs_want, journal_space_discarded);
-       clean_ondisk    = __journal_space_available(j, nr_devs_want, journal_space_clean_ondisk);
-       clean           = __journal_space_available(j, nr_devs_want, journal_space_clean);
+       for (i = 0; i < journal_space_nr; i++)
+               j->space[i] = __journal_space_available(j, nr_devs_want, i);
+
+       clean_ondisk    = j->space[journal_space_clean_ondisk].total;
+       clean           = j->space[journal_space_clean].total;
+       total           = j->space[journal_space_total].total;
 
-       if (!discarded.next_entry)
-               ret = -ENOSPC;
+       if (!clean_ondisk &&
+           j->reservations.idx ==
+           j->reservations.unwritten_idx) {
+               char *buf = kmalloc(4096, GFP_ATOMIC);
 
-       overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) *
-               journal_entry_overhead(j);
-       u64s_remaining = clean.remaining << 6;
-       u64s_remaining = max_t(int, 0, u64s_remaining - overhead);
+               bch_err(c, "journal stuck");
+               if (buf) {
+                       __bch2_journal_debug_to_text(&_PBUF(buf, 4096), j);
+                       pr_err("\n%s", buf);
+                       kfree(buf);
+               }
+
+               bch2_fatal_error(c);
+               ret = cur_entry_journal_stuck;
+       } else if (!j->space[journal_space_discarded].next_entry)
+               ret = cur_entry_journal_full;
+       else if (!fifo_free(&j->pin))
+               ret = cur_entry_journal_pin_full;
+
+       if ((j->space[journal_space_clean_ondisk].next_entry <
+            j->space[journal_space_clean_ondisk].total) &&
+           (clean - clean_ondisk <= total / 8) &&
+           (clean_ondisk * 2 > clean))
+               set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
+       else
+               clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
+
+       u64s_remaining  = (u64) clean << 6;
+       u64s_remaining -= (u64) total << 3;
+       u64s_remaining = max(0LL, u64s_remaining);
        u64s_remaining /= 4;
+       u64s_remaining = min_t(u64, u64s_remaining, U32_MAX);
 out:
-       j->cur_entry_sectors    = !ret ? discarded.next_entry : 0;
+       j->cur_entry_sectors    = !ret ? j->space[journal_space_discarded].next_entry : 0;
        j->cur_entry_error      = ret;
        journal_set_remaining(j, u64s_remaining);
        journal_check_may_get_unreserved(j);
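A worked sketch of the u64s_remaining arithmetic above, since the shifts are easy to misread: sectors are 512 bytes, so one sector holds 64 u64s; the code keeps back roughly 1/8 of the total journal space (total << 3 out of total << 6 u64s) and then advertises only a quarter of what is left (the real code additionally clamps to U32_MAX). The function name and numbers below are invented:

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t remaining_u64s(uint64_t clean_sectors, uint64_t total_sectors)
    {
            int64_t u64s = ((int64_t) clean_sectors << 6)   /* clean space, in u64s   */
                         - ((int64_t) total_sectors << 3);  /* minus 1/8 of the total */

            if (u64s < 0)
                    u64s = 0;

            return (uint64_t) u64s / 4;                     /* advertise only a quarter */
    }

    int main(void)
    {
            /* e.g. a 1 GiB journal (2097152 sectors) with half of it clean */
            printf("%llu u64s usable\n",
                   (unsigned long long) remaining_u64s(1048576, 2097152));
            return 0;
    }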
@@ -263,6 +317,7 @@ static void bch2_journal_reclaim_fast(struct journal *j)
        while (!fifo_empty(&j->pin) &&
               !atomic_read(&fifo_peek_front(&j->pin).count)) {
                BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
+               BUG_ON(!list_empty(&fifo_peek_front(&j->pin).flushed));
                BUG_ON(!fifo_pop(&j->pin, temp));
                popped = true;
        }
@@ -271,6 +326,14 @@ static void bch2_journal_reclaim_fast(struct journal *j)
                bch2_journal_space_available(j);
 }
 
+void __bch2_journal_pin_put(struct journal *j, u64 seq)
+{
+       struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
+
+       if (atomic_dec_and_test(&pin_list->count))
+               bch2_journal_reclaim_fast(j);
+}
+
 void bch2_journal_pin_put(struct journal *j, u64 seq)
 {
        struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
@@ -290,6 +353,9 @@ static inline void __journal_pin_drop(struct journal *j,
        if (!journal_pin_active(pin))
                return;
 
+       if (j->flush_in_progress == pin)
+               j->flush_in_progress_dropped = true;
+
        pin_list = journal_seq_pin(j, pin->seq);
        pin->seq = 0;
        list_del_init(&pin->list);
@@ -314,60 +380,39 @@ void bch2_journal_pin_drop(struct journal *j,
        spin_unlock(&j->lock);
 }
 
-static void bch2_journal_pin_add_locked(struct journal *j, u64 seq,
-                           struct journal_entry_pin *pin,
-                           journal_pin_flush_fn flush_fn)
+void bch2_journal_pin_set(struct journal *j, u64 seq,
+                         struct journal_entry_pin *pin,
+                         journal_pin_flush_fn flush_fn)
 {
-       struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
-
-       __journal_pin_drop(j, pin);
-
-       BUG_ON(!atomic_read(&pin_list->count) && seq == journal_last_seq(j));
-
-       atomic_inc(&pin_list->count);
-       pin->seq        = seq;
-       pin->flush      = flush_fn;
-
-       list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed);
-}
+       struct journal_entry_pin_list *pin_list;
 
-void __bch2_journal_pin_add(struct journal *j, u64 seq,
-                           struct journal_entry_pin *pin,
-                           journal_pin_flush_fn flush_fn)
-{
        spin_lock(&j->lock);
-       bch2_journal_pin_add_locked(j, seq, pin, flush_fn);
-       spin_unlock(&j->lock);
 
-       /*
-        * If the journal is currently full,  we might want to call flush_fn
-        * immediately:
-        */
-       journal_wake(j);
-}
-
-void bch2_journal_pin_update(struct journal *j, u64 seq,
-                            struct journal_entry_pin *pin,
-                            journal_pin_flush_fn flush_fn)
-{
-       if (journal_pin_active(pin) && pin->seq < seq)
+       if (seq < journal_last_seq(j)) {
+               /*
+                * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on
+                * the src pin - with the pin dropped, the entry to pin might no
+                * longer exist, but that means there's no longer anything to
+                * copy and we can bail out here:
+                */
+               spin_unlock(&j->lock);
                return;
+       }
 
-       spin_lock(&j->lock);
+       pin_list = journal_seq_pin(j, seq);
 
-       if (pin->seq != seq) {
-               bch2_journal_pin_add_locked(j, seq, pin, flush_fn);
-       } else {
-               struct journal_entry_pin_list *pin_list =
-                       journal_seq_pin(j, seq);
+       __journal_pin_drop(j, pin);
 
-               /*
-                * If the pin is already pinning the right sequence number, it
-                * still might've already been flushed:
-                */
-               list_move(&pin->list, &pin_list->list);
-       }
+       atomic_inc(&pin_list->count);
+       pin->seq        = seq;
+       pin->flush      = flush_fn;
 
+       if (flush_fn == bch2_btree_key_cache_journal_flush)
+               list_add(&pin->list, &pin_list->key_cache_list);
+       else if (flush_fn)
+               list_add(&pin->list, &pin_list->list);
+       else
+               list_add(&pin->list, &pin_list->flushed);
        spin_unlock(&j->lock);
 
        /*
@@ -377,20 +422,6 @@ void bch2_journal_pin_update(struct journal *j, u64 seq,
        journal_wake(j);
 }
 
-void bch2_journal_pin_copy(struct journal *j,
-                          struct journal_entry_pin *dst,
-                          struct journal_entry_pin *src,
-                          journal_pin_flush_fn flush_fn)
-{
-       spin_lock(&j->lock);
-
-       if (journal_pin_active(src) &&
-           (!journal_pin_active(dst) || src->seq < dst->seq))
-               bch2_journal_pin_add_locked(j, src->seq, dst, flush_fn);
-
-       spin_unlock(&j->lock);
-}
-
 /**
  * bch2_journal_pin_flush: ensure journal pin callback is no longer running
  */
@@ -411,88 +442,106 @@ void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
  */
 
 static struct journal_entry_pin *
-journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq)
+journal_get_next_pin(struct journal *j,
+                    bool get_any,
+                    bool get_key_cache,
+                    u64 max_seq, u64 *seq)
 {
        struct journal_entry_pin_list *pin_list;
        struct journal_entry_pin *ret = NULL;
 
-       if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags))
-               return NULL;
-
-       spin_lock(&j->lock);
-
-       fifo_for_each_entry_ptr(pin_list, &j->pin, *seq)
-               if (*seq > max_seq ||
-                   (ret = list_first_entry_or_null(&pin_list->list,
-                               struct journal_entry_pin, list)))
+       fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) {
+               if (*seq > max_seq && !get_any && !get_key_cache)
                        break;
 
-       if (ret) {
-               list_move(&ret->list, &pin_list->flushed);
-               BUG_ON(j->flush_in_progress);
-               j->flush_in_progress = ret;
-               j->last_flushed = jiffies;
-       }
+               if (*seq <= max_seq || get_any) {
+                       ret = list_first_entry_or_null(&pin_list->list,
+                               struct journal_entry_pin, list);
+                       if (ret)
+                               return ret;
+               }
 
-       spin_unlock(&j->lock);
+               if (*seq <= max_seq || get_any || get_key_cache) {
+                       ret = list_first_entry_or_null(&pin_list->key_cache_list,
+                               struct journal_entry_pin, list);
+                       if (ret)
+                               return ret;
+               }
+       }
 
-       return ret;
+       return NULL;
 }
 
 /* returns true if we did work */
-static bool journal_flush_pins(struct journal *j, u64 seq_to_flush,
-                              unsigned min_nr)
+static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush,
+                                unsigned min_any,
+                                unsigned min_key_cache)
 {
        struct journal_entry_pin *pin;
-       bool ret = false;
+       size_t nr_flushed = 0;
+       journal_pin_flush_fn flush_fn;
        u64 seq;
+       int err;
+
+       if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags))
+               return 0;
 
        lockdep_assert_held(&j->reclaim_lock);
 
-       while ((pin = journal_get_next_pin(j, min_nr
-                               ? U64_MAX : seq_to_flush, &seq))) {
-               if (min_nr)
-                       min_nr--;
+       while (1) {
+               cond_resched();
+
+               j->last_flushed = jiffies;
+
+               spin_lock(&j->lock);
+               pin = journal_get_next_pin(j,
+                                          min_any != 0,
+                                          min_key_cache != 0,
+                                          seq_to_flush, &seq);
+               if (pin) {
+                       BUG_ON(j->flush_in_progress);
+                       j->flush_in_progress = pin;
+                       j->flush_in_progress_dropped = false;
+                       flush_fn = pin->flush;
+               }
+               spin_unlock(&j->lock);
+
+               if (!pin)
+                       break;
 
-               pin->flush(j, pin, seq);
+               if (min_key_cache && pin->flush == bch2_btree_key_cache_journal_flush)
+                       min_key_cache--;
 
-               BUG_ON(j->flush_in_progress != pin);
+               if (min_any)
+                       min_any--;
+
+               err = flush_fn(j, pin, seq);
+
+               spin_lock(&j->lock);
+               /* Pin might have been dropped or rearmed: */
+               if (likely(!err && !j->flush_in_progress_dropped))
+                       list_move(&pin->list, &journal_seq_pin(j, seq)->flushed);
                j->flush_in_progress = NULL;
+               j->flush_in_progress_dropped = false;
+               spin_unlock(&j->lock);
+
                wake_up(&j->pin_flush_wait);
-               ret = true;
+
+               if (err)
+                       break;
+
+               nr_flushed++;
        }
 
-       return ret;
+       return nr_flushed;
 }
 
-/**
- * bch2_journal_reclaim - free up journal buckets
- *
- * Background journal reclaim writes out btree nodes. It should be run
- * early enough so that we never completely run out of journal buckets.
- *
- * High watermarks for triggering background reclaim:
- * - FIFO has fewer than 512 entries left
- * - fewer than 25% journal buckets free
- *
- * Background reclaim runs until low watermarks are reached:
- * - FIFO has more than 1024 entries left
- * - more than 50% journal buckets free
- *
- * As long as a reclaim can complete in the time it takes to fill up
- * 512 journal entries or 25% of all journal buckets, then
- * journal_next_bucket() should not stall.
- */
-void bch2_journal_reclaim(struct journal *j)
+static u64 journal_seq_to_flush(struct journal *j)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
-       unsigned iter, min_nr = 0;
        u64 seq_to_flush = 0;
-
-       lockdep_assert_held(&j->reclaim_lock);
-
-       bch2_journal_do_discards(j);
+       unsigned iter;
 
        spin_lock(&j->lock);
 
@@ -524,34 +573,174 @@ void bch2_journal_reclaim(struct journal *j)
                             (j->pin.size >> 1));
        spin_unlock(&j->lock);
 
+       return seq_to_flush;
+}
+
+/**
+ * bch2_journal_reclaim - free up journal buckets
+ *
+ * Background journal reclaim writes out btree nodes. It should be run
+ * early enough so that we never completely run out of journal buckets.
+ *
+ * High watermarks for triggering background reclaim:
+ * - FIFO has fewer than 512 entries left
+ * - fewer than 25% journal buckets free
+ *
+ * Background reclaim runs until low watermarks are reached:
+ * - FIFO has more than 1024 entries left
+ * - more than 50% journal buckets free
+ *
+ * As long as a reclaim can complete in the time it takes to fill up
+ * 512 journal entries or 25% of all journal buckets, then
+ * journal_next_bucket() should not stall.
+ */
+static int __bch2_journal_reclaim(struct journal *j, bool direct)
+{
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       bool kthread = (current->flags & PF_KTHREAD) != 0;
+       u64 seq_to_flush;
+       size_t min_nr, nr_flushed;
+       unsigned flags;
+       int ret = 0;
+
        /*
-        * If it's been longer than j->reclaim_delay_ms since we last flushed,
-        * make sure to flush at least one journal pin:
+        * We can't invoke memory reclaim while holding the reclaim_lock -
+        * journal reclaim is required to make progress for memory reclaim
+        * (cleaning the caches), so we can't get stuck in memory reclaim while
+        * we're holding the reclaim lock:
         */
-       if (time_after(jiffies, j->last_flushed +
-                      msecs_to_jiffies(j->reclaim_delay_ms)))
-               min_nr = 1;
+       lockdep_assert_held(&j->reclaim_lock);
+       flags = memalloc_noreclaim_save();
+
+       do {
+               if (kthread && kthread_should_stop())
+                       break;
+
+               if (bch2_journal_error(j)) {
+                       ret = -EIO;
+                       break;
+               }
+
+               bch2_journal_do_discards(j);
+
+               seq_to_flush = journal_seq_to_flush(j);
+               min_nr = 0;
+
+               /*
+                * If it's been longer than j->reclaim_delay_ms since we last flushed,
+                * make sure to flush at least one journal pin:
+                */
+               if (time_after(jiffies, j->last_flushed +
+                              msecs_to_jiffies(j->reclaim_delay_ms)))
+                       min_nr = 1;
+
+               if (j->prereserved.reserved * 2 > j->prereserved.remaining)
+                       min_nr = 1;
+
+               if (fifo_free(&j->pin) <= 32)
+                       min_nr = 1;
+
+               trace_journal_reclaim_start(c,
+                               min_nr,
+                               j->prereserved.reserved,
+                               j->prereserved.remaining,
+                               atomic_read(&c->btree_cache.dirty),
+                               c->btree_cache.used,
+                               atomic_long_read(&c->btree_key_cache.nr_dirty),
+                               atomic_long_read(&c->btree_key_cache.nr_keys));
+
+               nr_flushed = journal_flush_pins(j, seq_to_flush,
+                                       min_nr,
+                                       min(bch2_nr_btree_keys_need_flush(c), 128UL));
+
+               if (direct)
+                       j->nr_direct_reclaim += nr_flushed;
+               else
+                       j->nr_background_reclaim += nr_flushed;
+               trace_journal_reclaim_finish(c, nr_flushed);
+
+               if (nr_flushed)
+                       wake_up(&j->reclaim_wait);
+       } while (min_nr && nr_flushed && !direct);
+
+       memalloc_noreclaim_restore(flags);
 
-       if (j->prereserved.reserved * 2 > j->prereserved.remaining) {
-               seq_to_flush = max(seq_to_flush, journal_last_seq(j));
-               min_nr = 1;
+       return ret;
+}
+
+int bch2_journal_reclaim(struct journal *j)
+{
+       return __bch2_journal_reclaim(j, true);
+}
+
+static int bch2_journal_reclaim_thread(void *arg)
+{
+       struct journal *j = arg;
+       unsigned long next;
+       int ret = 0;
+
+       set_freezable();
+
+       kthread_wait_freezable(test_bit(JOURNAL_RECLAIM_STARTED, &j->flags));
+
+       while (!ret && !kthread_should_stop()) {
+               j->reclaim_kicked = false;
+
+               mutex_lock(&j->reclaim_lock);
+               ret = __bch2_journal_reclaim(j, false);
+               mutex_unlock(&j->reclaim_lock);
+
+               next = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms);
+
+               while (1) {
+                       set_current_state(TASK_INTERRUPTIBLE);
+                       if (kthread_should_stop())
+                               break;
+                       if (j->reclaim_kicked)
+                               break;
+                       if (time_after_eq(jiffies, next))
+                               break;
+                       schedule_timeout(next - jiffies);
+                       try_to_freeze();
+
+               }
+               __set_current_state(TASK_RUNNING);
        }
 
-       journal_flush_pins(j, seq_to_flush, min_nr);
+       return 0;
+}
 
-       if (!bch2_journal_error(j))
-               queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work,
-                                  msecs_to_jiffies(j->reclaim_delay_ms));
+void bch2_journal_reclaim_stop(struct journal *j)
+{
+       struct task_struct *p = j->reclaim_thread;
+
+       j->reclaim_thread = NULL;
+
+       if (p) {
+               kthread_stop(p);
+               put_task_struct(p);
+       }
 }
 
-void bch2_journal_reclaim_work(struct work_struct *work)
+int bch2_journal_reclaim_start(struct journal *j)
 {
-       struct journal *j = container_of(to_delayed_work(work),
-                               struct journal, reclaim_work);
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       struct task_struct *p;
 
-       mutex_lock(&j->reclaim_lock);
-       bch2_journal_reclaim(j);
-       mutex_unlock(&j->reclaim_lock);
+       if (j->reclaim_thread)
+               return 0;
+
+       p = kthread_create(bch2_journal_reclaim_thread, j,
+                          "bch-reclaim/%s", c->name);
+       if (IS_ERR(p)) {
+               bch_err(c, "error creating journal reclaim thread: %li", PTR_ERR(p));
+               return PTR_ERR(p);
+       }
+
+       get_task_struct(p);
+       j->reclaim_thread = p;
+       wake_up_process(p);
+       return 0;
 }
 
 static int journal_flush_done(struct journal *j, u64 seq_to_flush,
@@ -565,7 +754,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
 
        mutex_lock(&j->reclaim_lock);
 
-       *did_work = journal_flush_pins(j, seq_to_flush, 0);
+       *did_work = journal_flush_pins(j, seq_to_flush, 0, 0) != 0;
 
        spin_lock(&j->lock);
        /*
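A standalone sketch of the watermark policy described in the bch2_journal_reclaim() comment above, for illustration only and not part of the patch: struct journal_state and both helper functions are hypothetical stand-ins for the real struct journal fields, and combining the two low-watermark conditions with AND is one plausible reading of that comment.

/*
 * Hypothetical mirror of the reclaim watermarks; none of these names
 * exist in bcachefs.
 */
#include <stdbool.h>
#include <stdio.h>

struct journal_state {
	unsigned	fifo_entries_free;	/* free slots in the journal pin FIFO */
	unsigned	buckets_free;		/* free journal buckets */
	unsigned	buckets_total;
};

/* High watermark: background reclaim should be running */
static bool reclaim_should_run(const struct journal_state *j)
{
	return j->fifo_entries_free < 512 ||
	       j->buckets_free * 4 < j->buckets_total;		/* < 25% free */
}

/* Low watermark: reclaim may stop again */
static bool reclaim_may_stop(const struct journal_state *j)
{
	return j->fifo_entries_free > 1024 &&
	       j->buckets_free * 2 > j->buckets_total;		/* > 50% free */
}

int main(void)
{
	struct journal_state j = {
		.fifo_entries_free	= 300,
		.buckets_free		= 20,
		.buckets_total		= 100,
	};

	printf("should run: %d, may stop: %d\n",
	       reclaim_should_run(&j), reclaim_may_stop(&j));
	return 0;
}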
index 8128907a7623cb223718c55ed2e63c380b0ce796..adf1f5c981cdfa80e227516d7c203306a543eba3 100644 (file)
@@ -4,11 +4,16 @@
 
 #define JOURNAL_PIN    (32 * 1024)
 
-enum journal_space_from {
-       journal_space_discarded,
-       journal_space_clean_ondisk,
-       journal_space_clean,
-};
+static inline void journal_reclaim_kick(struct journal *j)
+{
+       struct task_struct *p = READ_ONCE(j->reclaim_thread);
+
+       if (p && !j->reclaim_kicked) {
+               j->reclaim_kicked = true;
+               if (p)
+                       wake_up_process(p);
+       }
+}
 
 unsigned bch2_journal_dev_buckets_available(struct journal *,
                                            struct journal_device *,
@@ -28,34 +33,48 @@ journal_seq_pin(struct journal *j, u64 seq)
        return &j->pin.data[seq & j->pin.mask];
 }
 
+void __bch2_journal_pin_put(struct journal *, u64);
 void bch2_journal_pin_put(struct journal *, u64);
 void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
 
-void __bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *,
-                           journal_pin_flush_fn);
+void bch2_journal_pin_set(struct journal *, u64, struct journal_entry_pin *,
+                         journal_pin_flush_fn);
 
 static inline void bch2_journal_pin_add(struct journal *j, u64 seq,
                                        struct journal_entry_pin *pin,
                                        journal_pin_flush_fn flush_fn)
 {
        if (unlikely(!journal_pin_active(pin) || pin->seq > seq))
-               __bch2_journal_pin_add(j, seq, pin, flush_fn);
+               bch2_journal_pin_set(j, seq, pin, flush_fn);
 }
 
-void bch2_journal_pin_update(struct journal *, u64,
-                            struct journal_entry_pin *,
-                            journal_pin_flush_fn);
+static inline void bch2_journal_pin_copy(struct journal *j,
+                                        struct journal_entry_pin *dst,
+                                        struct journal_entry_pin *src,
+                                        journal_pin_flush_fn flush_fn)
+{
+       /* Guard against racing with journal_pin_drop(src): */
+       u64 seq = READ_ONCE(src->seq);
 
-void bch2_journal_pin_copy(struct journal *,
-                          struct journal_entry_pin *,
-                          struct journal_entry_pin *,
-                          journal_pin_flush_fn);
+       if (seq)
+               bch2_journal_pin_add(j, seq, dst, flush_fn);
+}
+
+static inline void bch2_journal_pin_update(struct journal *j, u64 seq,
+                                          struct journal_entry_pin *pin,
+                                          journal_pin_flush_fn flush_fn)
+{
+       if (unlikely(!journal_pin_active(pin) || pin->seq < seq))
+               bch2_journal_pin_set(j, seq, pin, flush_fn);
+}
 
 void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *);
 
 void bch2_journal_do_discards(struct journal *);
-void bch2_journal_reclaim(struct journal *);
-void bch2_journal_reclaim_work(struct work_struct *);
+int bch2_journal_reclaim(struct journal *);
+
+void bch2_journal_reclaim_stop(struct journal *);
+int bch2_journal_reclaim_start(struct journal *);
 
 bool bch2_journal_flush_pins(struct journal *, u64);
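The journal_reclaim_kick() helper added above is the classic "kick a worker" handshake: set a flag if it is not already set, wake the thread, and let the thread clear the flag at the top of its loop (see bch2_journal_reclaim_thread() earlier in this diff). Below is a minimal userspace analogue, not part of the patch and with all names hypothetical; a pthread condition variable stands in for wake_up_process(), it runs a single pass, and it builds with -pthread.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static bool kicked;

static void reclaim_kick(void)
{
	pthread_mutex_lock(&lock);
	if (!kicked) {			/* cheap no-op if a kick is already pending */
		kicked = true;
		pthread_cond_signal(&cond);
	}
	pthread_mutex_unlock(&lock);
}

static void *reclaim_thread(void *arg)
{
	(void) arg;

	pthread_mutex_lock(&lock);
	while (!kicked)
		pthread_cond_wait(&cond, &lock);
	kicked = false;			/* consume the kick, then do one pass */
	pthread_mutex_unlock(&lock);

	printf("reclaim pass\n");
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, reclaim_thread, NULL);
	reclaim_kick();
	pthread_join(t, NULL);
	return 0;
}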
 
index d0f1bbf8f6a7984ff5f96d997235b49d484d2eee..e1b63f3879f44e50cc2fdd92ca3de8db03a3c7fa 100644 (file)
@@ -118,7 +118,7 @@ out_write_sb:
 out:
        mutex_unlock(&c->sb_lock);
 
-       return ret;
+       return ret ?: bch2_blacklist_table_initialize(c);
 }
 
 static int journal_seq_blacklist_table_cmp(const void *_l,
@@ -164,8 +164,6 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
        struct journal_seq_blacklist_table *t;
        unsigned i, nr = blacklist_nr_entries(bl);
 
-       BUG_ON(c->journal_seq_blacklist_table);
-
        if (!bl)
                return 0;
 
@@ -187,6 +185,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
                        journal_seq_blacklist_table_cmp,
                        NULL);
 
+       kfree(c->journal_seq_blacklist_table);
        c->journal_seq_blacklist_table = t;
        return 0;
 }
index 154b51b891d33f1f6f89888e7afeef387392b75a..c24bc4aa9af2473ba49154efb63d5b5876c98da1 100644 (file)
@@ -9,16 +9,18 @@
 #include "super_types.h"
 #include "fifo.h"
 
-struct journal_res;
+#define JOURNAL_BUF_BITS       2
+#define JOURNAL_BUF_NR         (1U << JOURNAL_BUF_BITS)
+#define JOURNAL_BUF_MASK       (JOURNAL_BUF_NR - 1)
 
 /*
- * We put two of these in struct journal; we used them for writes to the
- * journal that are being staged or in flight.
+ * We put JOURNAL_BUF_NR of these in struct journal; we use them for writes to
+ * the journal that are being staged or in flight.
  */
 struct journal_buf {
        struct jset             *data;
 
-       BKEY_PADDED(key);
+       __BKEY_PADDED(key, BCH_REPLICAS_MAX);
 
        struct closure_waitlist wait;
 
@@ -27,6 +29,9 @@ struct journal_buf {
        unsigned                disk_sectors;   /* maximum size entry could have been, if
                                                   buf_size was bigger */
        unsigned                u64s_reserved;
+       bool                    noflush;        /* write has already been kicked off, and was noflush */
+       bool                    must_flush;     /* something wants a flush */
+       bool                    separate_flush;
        /* bloom filter: */
        unsigned long           has_inode[1024 / sizeof(unsigned long)];
 };
@@ -38,6 +43,7 @@ struct journal_buf {
 
 struct journal_entry_pin_list {
        struct list_head                list;
+       struct list_head                key_cache_list;
        struct list_head                flushed;
        atomic_t                        count;
        struct bch_devs_list            devs;
@@ -45,7 +51,7 @@ struct journal_entry_pin_list {
 
 struct journal;
 struct journal_entry_pin;
-typedef void (*journal_pin_flush_fn)(struct journal *j,
+typedef int (*journal_pin_flush_fn)(struct journal *j,
                                struct journal_entry_pin *, u64);
 
 struct journal_entry_pin {
@@ -81,10 +87,12 @@ union journal_res_state {
 
        struct {
                u64             cur_entry_offset:20,
-                               idx:1,
-                               prev_buf_unwritten:1,
-                               buf0_count:21,
-                               buf1_count:21;
+                               idx:2,
+                               unwritten_idx:2,
+                               buf0_count:10,
+                               buf1_count:10,
+                               buf2_count:10,
+                               buf3_count:10;
        };
 };
 
@@ -98,8 +106,9 @@ union journal_preres_state {
        };
 
        struct {
-               u32             reserved;
-               u32             remaining;
+               u64             waiting:1,
+                               reserved:31,
+                               remaining:32;
        };
 };
 
@@ -116,6 +125,20 @@ union journal_preres_state {
 #define JOURNAL_ENTRY_CLOSED_VAL       (JOURNAL_ENTRY_OFFSET_MAX - 1)
 #define JOURNAL_ENTRY_ERROR_VAL                (JOURNAL_ENTRY_OFFSET_MAX)
 
+struct journal_space {
+       /* Units of 512-byte sectors: */
+       unsigned        next_entry; /* How big the next journal entry can be */
+       unsigned        total;
+};
+
+enum journal_space_from {
+       journal_space_discarded,
+       journal_space_clean_ondisk,
+       journal_space_clean,
+       journal_space_total,
+       journal_space_nr,
+};
+
 /*
  * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP,
  * either because something's waiting on the write to complete or because it's
@@ -127,8 +150,8 @@ enum {
        JOURNAL_STARTED,
        JOURNAL_RECLAIM_STARTED,
        JOURNAL_NEED_WRITE,
-       JOURNAL_NOT_EMPTY,
        JOURNAL_MAY_GET_UNRESERVED,
+       JOURNAL_MAY_SKIP_FLUSH,
 };
 
 /* Embedded in struct bch_fs */
@@ -147,7 +170,14 @@ struct journal {
         * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if
         * insufficient devices:
         */
-       int                     cur_entry_error;
+       enum {
+               cur_entry_ok,
+               cur_entry_blocked,
+               cur_entry_journal_full,
+               cur_entry_journal_pin_full,
+               cur_entry_journal_stuck,
+               cur_entry_insufficient_devices,
+       }                       cur_entry_error;
 
        union journal_preres_state prereserved;
 
@@ -160,7 +190,7 @@ struct journal {
         * Two journal entries -- one is currently open for new entries, the
         * other is possibly being written out.
         */
-       struct journal_buf      buf[2];
+       struct journal_buf      buf[JOURNAL_BUF_NR];
 
        spinlock_t              lock;
 
@@ -180,7 +210,10 @@ struct journal {
 
        /* seq, last_seq from the most recent journal entry successfully written */
        u64                     seq_ondisk;
+       u64                     flushed_seq_ondisk;
        u64                     last_seq_ondisk;
+       u64                     err_seq;
+       u64                     last_empty_seq;
 
        /*
         * FIFO of journal entries whose btree updates have not yet been
@@ -203,16 +236,24 @@ struct journal {
                struct journal_entry_pin_list *data;
        }                       pin;
 
+       struct journal_space    space[journal_space_nr];
+
        u64                     replay_journal_seq;
        u64                     replay_journal_seq_end;
 
        struct write_point      wp;
        spinlock_t              err_lock;
 
-       struct delayed_work     reclaim_work;
        struct mutex            reclaim_lock;
+       wait_queue_head_t       reclaim_wait;
+       struct task_struct      *reclaim_thread;
+       bool                    reclaim_kicked;
+       u64                     nr_direct_reclaim;
+       u64                     nr_background_reclaim;
+
        unsigned long           last_flushed;
        struct journal_entry_pin *flush_in_progress;
+       bool                    flush_in_progress_dropped;
        wait_queue_head_t       pin_flush_wait;
 
        /* protects advancing ja->discard_idx: */
@@ -221,11 +262,15 @@ struct journal {
 
        unsigned                write_delay_ms;
        unsigned                reclaim_delay_ms;
+       unsigned long           last_flush_write;
 
        u64                     res_get_blocked_start;
        u64                     need_write_time;
        u64                     write_start_time;
 
+       u64                     nr_flush_writes;
+       u64                     nr_noflush_writes;
+
        struct time_stats       *write_time;
        struct time_stats       *delay_time;
        struct time_stats       *blocked_time;
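The widened union journal_res_state above still needs to pack into a single 64-bit word, since the journal updates it with 64-bit atomics: 20 + 2 + 2 + 4 * 10 = 64 bits. A quick standalone check follows; it is not part of the patch, the union is a stand-in, and 64-bit bit-fields rely on a compiler extension that GCC and clang (and the kernel) accept.

#include <assert.h>
#include <inttypes.h>
#include <stdio.h>

union res_state_sketch {
	uint64_t	v;
	struct {
		uint64_t	cur_entry_offset:20,
				idx:2,
				unwritten_idx:2,
				buf0_count:10,
				buf1_count:10,
				buf2_count:10,
				buf3_count:10;
	};
};

int main(void)
{
	union res_state_sketch s = { .v = 0 };

	/* 20 + 2 + 2 + 4 * 10 bits must still fit in one atomic word: */
	static_assert(sizeof(union res_state_sketch) == sizeof(uint64_t),
		      "res_state must stay a single u64");

	s.idx		= 3;	/* four buffers need two bits of index */
	s.buf3_count	= 1023;	/* ten-bit per-buffer refcount */
	printf("packed: 0x%016" PRIx64 "\n", s.v);
	return 0;
}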
index 96c8690adc5bf51cfaecc8e8628a45109a07d7a1..ef69a19f494a1583d609a04632da248f599515e4 100644 (file)
@@ -4,7 +4,7 @@
  */
 
 #include "bcachefs.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
 #include "btree_update.h"
 #include "btree_update_interior.h"
 #include "buckets.h"
@@ -41,10 +41,10 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c k;
-       struct bkey_on_stack sk;
+       struct bkey_buf sk;
        int ret = 0;
 
-       bkey_on_stack_init(&sk);
+       bch2_bkey_buf_init(&sk);
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
        iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN,
@@ -53,11 +53,11 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
        while ((k = bch2_btree_iter_peek(iter)).k &&
               !(ret = bkey_err(k))) {
                if (!bch2_bkey_has_device(k, dev_idx)) {
-                       bch2_btree_iter_next(iter);
+                       bch2_btree_iter_advance(iter);
                        continue;
                }
 
-               bkey_on_stack_reassemble(&sk, c, k);
+               bch2_bkey_buf_reassemble(&sk, c, k);
 
                ret = drop_dev_ptrs(c, bkey_i_to_s(sk.k),
                                    dev_idx, flags, false);
@@ -88,9 +88,10 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
                if (ret)
                        break;
        }
+       bch2_trans_iter_put(&trans, iter);
 
        ret = bch2_trans_exit(&trans) ?: ret;
-       bkey_on_stack_exit(&sk, c);
+       bch2_bkey_buf_exit(&sk, c);
 
        BUG_ON(ret == -EINTR);
 
@@ -99,8 +100,8 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
 
 static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 {
-       return  __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_EXTENTS) ?:
-               __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_REFLINK);
+       return  __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_extents) ?:
+               __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_reflink);
 }
 
 static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
@@ -109,6 +110,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
        struct btree_iter *iter;
        struct closure cl;
        struct btree *b;
+       struct bkey_buf k;
        unsigned id;
        int ret;
 
@@ -116,38 +118,42 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
        if (flags & BCH_FORCE_IF_METADATA_LOST)
                return -EINVAL;
 
+       bch2_bkey_buf_init(&k);
        bch2_trans_init(&trans, c, 0, 0);
        closure_init_stack(&cl);
 
        for (id = 0; id < BTREE_ID_NR; id++) {
                for_each_btree_node(&trans, iter, id, POS_MIN,
                                    BTREE_ITER_PREFETCH, b) {
-                       __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
 retry:
                        if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key),
                                                  dev_idx))
                                continue;
 
-                       bkey_copy(&tmp.k, &b->key);
+                       bch2_bkey_buf_copy(&k, c, &b->key);
 
-                       ret = drop_dev_ptrs(c, bkey_i_to_s(&tmp.k),
+                       ret = drop_dev_ptrs(c, bkey_i_to_s(k.k),
                                            dev_idx, flags, true);
                        if (ret) {
                                bch_err(c, "Cannot drop device without losing data");
-                               goto err;
+                               break;
                        }
 
-                       ret = bch2_btree_node_update_key(c, iter, b, &tmp.k);
+                       ret = bch2_btree_node_update_key(c, iter, b, k.k);
                        if (ret == -EINTR) {
                                b = bch2_btree_iter_peek_node(iter);
+                               ret = 0;
                                goto retry;
                        }
                        if (ret) {
                                bch_err(c, "Error updating btree node key: %i", ret);
-                               goto err;
+                               break;
                        }
                }
                bch2_trans_iter_free(&trans, iter);
+
+               if (ret)
+                       goto err;
        }
 
        /* flush relevant btree updates */
@@ -157,6 +163,7 @@ retry:
        ret = 0;
 err:
        ret = bch2_trans_exit(&trans) ?: ret;
+       bch2_bkey_buf_exit(&k, c);
 
        BUG_ON(ret == -EINTR);
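In the bch2_migrate_index_update() hunk above, the old ad-hoc reservation for compressed extents is replaced by bch2_sum_sector_overwrites() plus a top-up of the existing reservation when disk_sectors_delta exceeds what op->res already covers. A standalone sketch of just that top-up arithmetic follows; it is not part of the patch, and struct reservation and reservation_add() are hypothetical.

#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct reservation {
	uint64_t	sectors;	/* sectors already reserved for this write */
};

/* Toy allocator: always succeeds and just records the grant. */
static int reservation_add(struct reservation *res, uint64_t sectors, bool nofail)
{
	(void) nofail;			/* a real allocator could fail unless nofail is set */
	res->sectors += sectors;
	return 0;
}

int main(void)
{
	struct reservation res = { .sectors = 8 };
	int64_t disk_sectors_delta = 24;	/* the overwrite needs 24 sectors on disk */

	/* Only top up by what the existing reservation does not already cover: */
	if (disk_sectors_delta > (int64_t) res.sectors)
		reservation_add(&res, disk_sectors_delta - res.sectors, true);

	printf("reserved: %" PRIu64 " sectors\n", res.sectors);
	return 0;
}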
 
index 6633d21f604ab00fc476b8530be18ebc5ba13606..5b108490d7c40f32ad1300932bc40f0869d485f7 100644 (file)
@@ -2,7 +2,7 @@
 
 #include "bcachefs.h"
 #include "alloc_foreground.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
 #include "btree_gc.h"
 #include "btree_update.h"
 #include "btree_update_interior.h"
@@ -61,8 +61,13 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
        struct migrate_write *m =
                container_of(op, struct migrate_write, op);
        struct keylist *keys = &op->insert_keys;
+       struct bkey_buf _new, _insert;
        int ret = 0;
 
+       bch2_bkey_buf_init(&_new);
+       bch2_bkey_buf_init(&_insert);
+       bch2_bkey_buf_realloc(&_insert, c, U8_MAX);
+
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
        iter = bch2_trans_get_iter(&trans, m->btree_id,
@@ -73,21 +78,18 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
                struct bkey_s_c k;
                struct bkey_i *insert;
                struct bkey_i_extent *new;
-               BKEY_PADDED(k) _new, _insert;
                const union bch_extent_entry *entry;
                struct extent_ptr_decoded p;
                bool did_work = false;
-               int nr;
+               bool extending = false, should_check_enospc;
+               s64 i_sectors_delta = 0, disk_sectors_delta = 0;
 
                bch2_trans_reset(&trans, 0);
 
                k = bch2_btree_iter_peek_slot(iter);
                ret = bkey_err(k);
-               if (ret) {
-                       if (ret == -EINTR)
-                               continue;
-                       break;
-               }
+               if (ret)
+                       goto err;
 
                new = bkey_i_to_extent(bch2_keylist_front(keys));
 
@@ -95,11 +97,11 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
                    !bch2_bkey_matches_ptr(c, k, m->ptr, m->offset))
                        goto nomatch;
 
-               bkey_reassemble(&_insert.k, k);
-               insert = &_insert.k;
+               bkey_reassemble(_insert.k, k);
+               insert = _insert.k;
 
-               bkey_copy(&_new.k, bch2_keylist_front(keys));
-               new = bkey_i_to_extent(&_new.k);
+               bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys));
+               new = bkey_i_to_extent(_new.k);
                bch2_cut_front(iter->pos, &new->k_i);
 
                bch2_cut_front(iter->pos,       insert);
@@ -144,23 +146,21 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
                                               op->opts.background_target,
                                               op->opts.data_replicas);
 
-               /*
-                * If we're not fully overwriting @k, and it's compressed, we
-                * need a reservation for all the pointers in @insert
-                */
-               nr = bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(insert)) -
-                        m->nr_ptrs_reserved;
+               ret = bch2_sum_sector_overwrites(&trans, iter, insert,
+                                                &extending,
+                                                &should_check_enospc,
+                                                &i_sectors_delta,
+                                                &disk_sectors_delta);
+               if (ret)
+                       goto err;
 
-               if (insert->k.size < k.k->size &&
-                   bch2_bkey_sectors_compressed(k) &&
-                   nr > 0) {
+               if (disk_sectors_delta > (s64) op->res.sectors) {
                        ret = bch2_disk_reservation_add(c, &op->res,
-                                       keylist_sectors(keys) * nr, 0);
+                                               disk_sectors_delta - op->res.sectors,
+                                               !should_check_enospc
+                                               ? BCH_DISK_RESERVATION_NOFAIL : 0);
                        if (ret)
                                goto out;
-
-                       m->nr_ptrs_reserved += nr;
-                       goto next;
                }
 
                bch2_trans_update(&trans, iter, insert, 0);
@@ -168,8 +168,8 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
                ret = bch2_trans_commit(&trans, &op->res,
                                op_journal_seq(op),
                                BTREE_INSERT_NOFAIL|
-                               BTREE_INSERT_USE_RESERVE|
                                m->data_opts.btree_insert_flags);
+err:
                if (!ret)
                        atomic_long_inc(&c->extent_migrate_done);
                if (ret == -EINTR)
@@ -196,7 +196,10 @@ nomatch:
                goto next;
        }
 out:
+       bch2_trans_iter_put(&trans, iter);
        bch2_trans_exit(&trans);
+       bch2_bkey_buf_exit(&_insert, c);
+       bch2_bkey_buf_exit(&_new, c);
        BUG_ON(ret == -EINTR);
        return ret;
 }
@@ -207,9 +210,9 @@ void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio)
        BUG_ON(!m->op.wbio.bio.bi_vcnt);
 
        m->ptr          = rbio->pick.ptr;
-       m->offset       = rbio->pos.offset - rbio->pick.crc.offset;
+       m->offset       = rbio->data_pos.offset - rbio->pick.crc.offset;
        m->op.devs_have = rbio->devs_have;
-       m->op.pos       = rbio->pos;
+       m->op.pos       = rbio->data_pos;
        m->op.version   = rbio->version;
        m->op.crc       = rbio->pick.crc;
        m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
@@ -491,7 +494,9 @@ static int bch2_move_extent(struct btree_trans *trans,
         * ctxt when doing wakeup
         */
        closure_get(&ctxt->cl);
-       bch2_read_extent(trans, &io->rbio, k, 0,
+       bch2_read_extent(trans, &io->rbio,
+                        bkey_start_pos(k.k),
+                        btree_id, k, 0,
                         BCH_READ_NODECODE|
                         BCH_READ_LAST_FRAGMENT);
        return 0;
@@ -504,6 +509,32 @@ err:
        return ret;
 }
 
+static int lookup_inode(struct btree_trans *trans, struct bpos pos,
+                       struct bch_inode_unpacked *inode)
+{
+       struct btree_iter *iter;
+       struct bkey_s_c k;
+       int ret;
+
+       iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, pos,
+                                  BTREE_ITER_ALL_SNAPSHOTS);
+       k = bch2_btree_iter_peek(iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       ret = k.k->type == KEY_TYPE_inode ? 0 : -EIO;
+       if (ret)
+               goto err;
+
+       ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode);
+       if (ret)
+               goto err;
+err:
+       bch2_trans_iter_put(trans, iter);
+       return ret;
+}
+
 static int __bch2_move_data(struct bch_fs *c,
                struct moving_context *ctxt,
                struct bch_ratelimit *rate,
@@ -516,7 +547,7 @@ static int __bch2_move_data(struct bch_fs *c,
 {
        bool kthread = (current->flags & PF_KTHREAD) != 0;
        struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
-       struct bkey_on_stack sk;
+       struct bkey_buf sk;
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c k;
@@ -525,12 +556,12 @@ static int __bch2_move_data(struct bch_fs *c,
        u64 delay, cur_inum = U64_MAX;
        int ret = 0, ret2;
 
-       bkey_on_stack_init(&sk);
+       bch2_bkey_buf_init(&sk);
        bch2_trans_init(&trans, c, 0, 0);
 
        stats->data_type = BCH_DATA_user;
        stats->btree_id = btree_id;
-       stats->pos      = POS_MIN;
+       stats->pos      = start;
 
        iter = bch2_trans_get_iter(&trans, btree_id, start,
                                   BTREE_ITER_PREFETCH);
@@ -561,7 +592,7 @@ static int __bch2_move_data(struct bch_fs *c,
                                try_to_freeze();
                        }
                } while (delay);
-peek:
+
                k = bch2_btree_iter_peek(iter);
 
                stats->pos = iter->pos;
@@ -577,18 +608,22 @@ peek:
                if (!bkey_extent_is_direct_data(k.k))
                        goto next_nondata;
 
-               if (btree_id == BTREE_ID_EXTENTS &&
+               if (btree_id == BTREE_ID_extents &&
                    cur_inum != k.k->p.inode) {
                        struct bch_inode_unpacked inode;
 
-                       /* don't hold btree locks while looking up inode: */
-                       bch2_trans_unlock(&trans);
-
                        io_opts = bch2_opts_to_inode_opts(c->opts);
-                       if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode))
+
+                       ret = lookup_inode(&trans,
+                                       SPOS(0, k.k->p.inode, k.k->p.snapshot),
+                                       &inode);
+                       if (ret == -EINTR)
+                               continue;
+
+                       if (!ret)
                                bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode));
+
                        cur_inum = k.k->p.inode;
-                       goto peek;
                }
 
                switch ((data_cmd = pred(c, arg, k, &io_opts, &data_opts))) {
@@ -605,13 +640,19 @@ peek:
                }
 
                /* unlock before doing IO: */
-               bkey_on_stack_reassemble(&sk, c, k);
+               bch2_bkey_buf_reassemble(&sk, c, k);
                k = bkey_i_to_s_c(sk.k);
                bch2_trans_unlock(&trans);
 
                ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k,
                                        data_cmd, data_opts);
                if (ret2) {
+                       if (ret2 == -EINTR) {
+                               bch2_trans_reset(&trans, 0);
+                               bch2_trans_cond_resched(&trans);
+                               continue;
+                       }
+
                        if (ret2 == -ENOMEM) {
                                /* memory allocation failure, wait for some IO to finish */
                                bch2_move_ctxt_wait_for_io(ctxt);
@@ -628,25 +669,28 @@ next:
                atomic64_add(k.k->size * bch2_bkey_nr_ptrs_allocated(k),
                             &stats->sectors_seen);
 next_nondata:
-               bch2_btree_iter_next(iter);
+               bch2_btree_iter_advance(iter);
                bch2_trans_cond_resched(&trans);
        }
 out:
+
+       bch2_trans_iter_put(&trans, iter);
        ret = bch2_trans_exit(&trans) ?: ret;
-       bkey_on_stack_exit(&sk, c);
+       bch2_bkey_buf_exit(&sk, c);
 
        return ret;
 }
 
 int bch2_move_data(struct bch_fs *c,
+                  enum btree_id start_btree_id, struct bpos start_pos,
+                  enum btree_id end_btree_id,   struct bpos end_pos,
                   struct bch_ratelimit *rate,
                   struct write_point_specifier wp,
-                  struct bpos start,
-                  struct bpos end,
                   move_pred_fn pred, void *arg,
                   struct bch_move_stats *stats)
 {
        struct moving_context ctxt = { .stats = stats };
+       enum btree_id id;
        int ret;
 
        closure_init_stack(&ctxt.cl);
@@ -655,10 +699,23 @@ int bch2_move_data(struct bch_fs *c,
 
        stats->data_type = BCH_DATA_user;
 
-       ret =   __bch2_move_data(c, &ctxt, rate, wp, start, end,
-                                pred, arg, stats, BTREE_ID_EXTENTS) ?:
-               __bch2_move_data(c, &ctxt, rate, wp, start, end,
-                                pred, arg, stats, BTREE_ID_REFLINK);
+       for (id = start_btree_id;
+            id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1);
+            id++) {
+               stats->btree_id = id;
+
+               if (id != BTREE_ID_extents &&
+                   id != BTREE_ID_reflink)
+                       continue;
+
+               ret = __bch2_move_data(c, &ctxt, rate, wp,
+                                      id == start_btree_id ? start_pos : POS_MIN,
+                                      id == end_btree_id   ? end_pos   : POS_MAX,
+                                      pred, arg, stats, id);
+               if (ret)
+                       break;
+       }
+
 
        move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads));
        closure_sync(&ctxt.cl);
@@ -672,16 +729,22 @@ int bch2_move_data(struct bch_fs *c,
        return ret;
 }
 
+typedef enum data_cmd (*move_btree_pred)(struct bch_fs *, void *,
+                                        struct btree *, struct bch_io_opts *,
+                                        struct data_opts *);
+
 static int bch2_move_btree(struct bch_fs *c,
-                          move_pred_fn pred,
-                          void *arg,
+                          enum btree_id start_btree_id, struct bpos start_pos,
+                          enum btree_id end_btree_id,   struct bpos end_pos,
+                          move_btree_pred pred, void *arg,
                           struct bch_move_stats *stats)
 {
+       bool kthread = (current->flags & PF_KTHREAD) != 0;
        struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
        struct btree_trans trans;
        struct btree_iter *iter;
        struct btree *b;
-       unsigned id;
+       enum btree_id id;
        struct data_opts data_opts;
        enum data_cmd cmd;
        int ret = 0;
@@ -690,16 +753,24 @@ static int bch2_move_btree(struct bch_fs *c,
 
        stats->data_type = BCH_DATA_btree;
 
-       for (id = 0; id < BTREE_ID_NR; id++) {
+       for (id = start_btree_id;
+            id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1);
+            id++) {
                stats->btree_id = id;
 
-               for_each_btree_node(&trans, iter, id, POS_MIN,
+               for_each_btree_node(&trans, iter, id,
+                                   id == start_btree_id ? start_pos : POS_MIN,
                                    BTREE_ITER_PREFETCH, b) {
+                       if (kthread && kthread_should_stop())
+                               goto out;
+
+                       if ((cmp_int(id, end_btree_id) ?:
+                            bkey_cmp(b->key.k.p, end_pos)) > 0)
+                               break;
+
                        stats->pos = iter->pos;
 
-                       switch ((cmd = pred(c, arg,
-                                           bkey_i_to_s_c(&b->key),
-                                           &io_opts, &data_opts))) {
+                       switch ((cmd = pred(c, arg, b, &io_opts, &data_opts))) {
                        case DATA_SKIP:
                                goto next;
                        case DATA_SCRUB:
@@ -719,9 +790,12 @@ next:
 
                ret = bch2_trans_iter_free(&trans, iter) ?: ret;
        }
-
+out:
        bch2_trans_exit(&trans);
 
+       if (ret)
+               bch_err(c, "error %i in bch2_move_btree", ret);
+
        return ret;
 }
 
@@ -778,6 +852,83 @@ static enum data_cmd migrate_pred(struct bch_fs *c, void *arg,
        return DATA_REWRITE;
 }
 
+static enum data_cmd rereplicate_btree_pred(struct bch_fs *c, void *arg,
+                                           struct btree *b,
+                                           struct bch_io_opts *io_opts,
+                                           struct data_opts *data_opts)
+{
+       return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
+}
+
+static enum data_cmd migrate_btree_pred(struct bch_fs *c, void *arg,
+                                       struct btree *b,
+                                       struct bch_io_opts *io_opts,
+                                       struct data_opts *data_opts)
+{
+       return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
+}
+
+static bool bformat_needs_redo(struct bkey_format *f)
+{
+       unsigned i;
+
+       for (i = 0; i < f->nr_fields; i++) {
+               unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+               u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
+               u64 field_offset = le64_to_cpu(f->field_offset[i]);
+
+               if (f->bits_per_field[i] > unpacked_bits)
+                       return true;
+
+               if ((f->bits_per_field[i] == unpacked_bits) && field_offset)
+                       return true;
+
+               if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) &
+                    unpacked_mask) <
+                   field_offset)
+                       return true;
+       }
+
+       return false;
+}
+
+static enum data_cmd rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
+                                           struct btree *b,
+                                           struct bch_io_opts *io_opts,
+                                           struct data_opts *data_opts)
+{
+       if (b->version_ondisk != c->sb.version ||
+           btree_node_need_rewrite(b) ||
+           bformat_needs_redo(&b->format)) {
+               data_opts->target               = 0;
+               data_opts->nr_replicas          = 1;
+               data_opts->btree_insert_flags   = 0;
+               return DATA_REWRITE;
+       }
+
+       return DATA_SKIP;
+}
+
+int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
+{
+       int ret;
+
+       ret = bch2_move_btree(c,
+                             0,                POS_MIN,
+                             BTREE_ID_NR,      POS_MAX,
+                             rewrite_old_nodes_pred, c, stats);
+       if (!ret) {
+               mutex_lock(&c->sb_lock);
+               c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_extents_above_btree_updates_done;
+               c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_bformat_overflow_done;
+               c->disk_sb.sb->version_min = c->disk_sb.sb->version;
+               bch2_write_super(c);
+               mutex_unlock(&c->sb_lock);
+       }
+
+       return ret;
+}
+
 int bch2_data_job(struct bch_fs *c,
                  struct bch_move_stats *stats,
                  struct bch_ioctl_data op)
@@ -789,17 +940,20 @@ int bch2_data_job(struct bch_fs *c,
                stats->data_type = BCH_DATA_journal;
                ret = bch2_journal_flush_device_pins(&c->journal, -1);
 
-               ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret;
+               ret = bch2_move_btree(c,
+                                     op.start_btree,   op.start_pos,
+                                     op.end_btree,     op.end_pos,
+                                     rereplicate_btree_pred, c, stats) ?: ret;
 
                closure_wait_event(&c->btree_interior_update_wait,
                                   !bch2_btree_interior_updates_nr_pending(c));
 
                ret = bch2_replicas_gc2(c) ?: ret;
 
-               ret = bch2_move_data(c, NULL,
-                                    writepoint_hashed((unsigned long) current),
-                                    op.start,
-                                    op.end,
+               ret = bch2_move_data(c,
+                                    op.start_btree,    op.start_pos,
+                                    op.end_btree,      op.end_pos,
+                                    NULL, writepoint_hashed((unsigned long) current),
                                     rereplicate_pred, c, stats) ?: ret;
                ret = bch2_replicas_gc2(c) ?: ret;
                break;
@@ -810,16 +964,22 @@ int bch2_data_job(struct bch_fs *c,
                stats->data_type = BCH_DATA_journal;
                ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
 
-               ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret;
+               ret = bch2_move_btree(c,
+                                     op.start_btree,   op.start_pos,
+                                     op.end_btree,     op.end_pos,
+                                     migrate_btree_pred, &op, stats) ?: ret;
                ret = bch2_replicas_gc2(c) ?: ret;
 
-               ret = bch2_move_data(c, NULL,
-                                    writepoint_hashed((unsigned long) current),
-                                    op.start,
-                                    op.end,
+               ret = bch2_move_data(c,
+                                    op.start_btree,    op.start_pos,
+                                    op.end_btree,      op.end_pos,
+                                    NULL, writepoint_hashed((unsigned long) current),
                                     migrate_pred, &op, stats) ?: ret;
                ret = bch2_replicas_gc2(c) ?: ret;
                break;
+       case BCH_DATA_OP_REWRITE_OLD_NODES:
+               ret = bch2_scan_old_btree_nodes(c, stats);
+               break;
        default:
                ret = -EINVAL;
        }
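bformat_needs_redo() above decides whether a node's key format could overflow when unpacked; the mask it builds, ~((~0ULL << 1) << (bits - 1)), uses two shifts, which keeps the bits == 64 case defined (a single shift by 64 would be undefined behaviour in C). A standalone check of that expression follows; it is not part of the patch, and unpacked_mask() is a made-up helper name.

#include <stdio.h>

/* Same mask expression as in bformat_needs_redo(), for a field of 'bits' unpacked bits: */
static unsigned long long unpacked_mask(unsigned bits)
{
	return ~((~0ULL << 1) << (bits - 1));
}

int main(void)
{
	printf("32 bits: %016llx\n", unpacked_mask(32));	/* 00000000ffffffff */
	printf("64 bits: %016llx\n", unpacked_mask(64));	/* ffffffffffffffff */
	return 0;
}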
index b04bc669226de68582c59bdd8cb13ac035b219b2..5076153689d18bd3a55049eff957df4f376a2a19 100644 (file)
@@ -52,9 +52,13 @@ typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *,
                                struct bkey_s_c,
                                struct bch_io_opts *, struct data_opts *);
 
-int bch2_move_data(struct bch_fs *, struct bch_ratelimit *,
+int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
+
+int bch2_move_data(struct bch_fs *,
+                  enum btree_id, struct bpos,
+                  enum btree_id, struct bpos,
+                  struct bch_ratelimit *,
                   struct write_point_specifier,
-                  struct bpos, struct bpos,
                   move_pred_fn, void *,
                   struct bch_move_stats *);
 
index ddfda1ef8a799a369006fb53f3836aabbaa059e3..03668e481f7ae8eaa303804c83cc845c18ba2d4b 100644 (file)
@@ -61,7 +61,7 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
        copygc_heap *h = &c->copygc_heap;
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const union bch_extent_entry *entry;
-       struct extent_ptr_decoded p;
+       struct extent_ptr_decoded p = { 0 };
 
        bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
                struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
@@ -92,11 +92,8 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
                        data_opts->btree_insert_flags   = BTREE_INSERT_USE_RESERVE;
                        data_opts->rewrite_dev          = p.ptr.dev;
 
-                       if (p.has_ec) {
-                               struct stripe *m = genradix_ptr(&c->stripes[0], p.ec.idx);
-
-                               data_opts->nr_replicas += m->nr_redundant;
-                       }
+                       if (p.has_ec)
+                               data_opts->nr_replicas += p.ec.redundancy;
 
                        return DATA_REWRITE;
                }
@@ -179,12 +176,12 @@ static int bch2_copygc(struct bch_fs *c)
                            bucket_sectors_used(m) >= ca->mi.bucket_size)
                                continue;
 
-                       WARN_ON(m.stripe && !g->ec_redundancy);
+                       WARN_ON(m.stripe && !g->stripe_redundancy);
 
                        e = (struct copygc_heap_entry) {
                                .dev            = dev_idx,
                                .gen            = m.gen,
-                               .replicas       = 1 + g->ec_redundancy,
+                               .replicas       = 1 + g->stripe_redundancy,
                                .fragmentation  = bucket_sectors_used(m) * (1U << 15)
                                        / ca->mi.bucket_size,
                                .sectors        = bucket_sectors_used(m),
@@ -200,6 +197,11 @@ static int bch2_copygc(struct bch_fs *c)
                return -1;
        }
 
+       /*
+        * Our btree node allocations also come out of RESERVE_MOVINGGC:
+        */
+       sectors_to_move = (sectors_to_move * 3) / 4;
+
        for (i = h->data; i < h->data + h->used; i++)
                sectors_to_move += i->sectors * i->replicas;
 
@@ -217,9 +219,11 @@ static int bch2_copygc(struct bch_fs *c)
                        sizeof(h->data[0]),
                        bucket_offset_cmp, NULL);
 
-       ret = bch2_move_data(c, &c->copygc_pd.rate,
+       ret = bch2_move_data(c,
+                            0,                 POS_MIN,
+                            BTREE_ID_NR,       POS_MAX,
+                            &c->copygc_pd.rate,
                             writepoint_ptr(&c->copygc_write_point),
-                            POS_MIN, POS_MAX,
                             copygc_pred, NULL,
                             &move_stats);
 
@@ -286,7 +290,7 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
 
                fragmented_allowed += ((__dev_buckets_available(ca, usage) *
                                        ca->mi.bucket_size) >> 1);
-               fragmented += usage.sectors_fragmented;
+               fragmented += usage.d[BCH_DATA_user].fragmented;
        }
 
        return max_t(s64, 0, fragmented_allowed - fragmented);
@@ -296,7 +300,7 @@ static int bch2_copygc_thread(void *arg)
 {
        struct bch_fs *c = arg;
        struct io_clock *clock = &c->io_clock[WRITE];
-       unsigned long last, wait;
+       u64 last, wait;
 
        set_freezable();
 
@@ -304,7 +308,7 @@ static int bch2_copygc_thread(void *arg)
                if (kthread_wait_freezable(c->copy_gc_enabled))
                        break;
 
-               last = atomic_long_read(&clock->now);
+               last = atomic64_read(&clock->now);
                wait = bch2_copygc_wait_amount(c);
 
                if (wait > clock->max_slop) {
@@ -345,9 +349,11 @@ int bch2_copygc_start(struct bch_fs *c)
        if (bch2_fs_init_fault("copygc_start"))
                return -ENOMEM;
 
-       t = kthread_create(bch2_copygc_thread, c, "bch_copygc");
-       if (IS_ERR(t))
+       t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name);
+       if (IS_ERR(t)) {
+               bch_err(c, "error creating copygc thread: %li", PTR_ERR(t));
                return PTR_ERR(t);
+       }
 
        get_task_struct(t);
 
index 97a36ac0beea120f0bf5d2ad95131ce028875ab7..0cfbb56a57c103211c2761603747f65f17fb41a2 100644 (file)
@@ -9,72 +9,59 @@
 #include "super-io.h"
 #include "util.h"
 
+#define x(t, n) #t,
+
 const char * const bch2_error_actions[] = {
-       "continue",
-       "remount-ro",
-       "panic",
+       BCH_ERROR_ACTIONS()
        NULL
 };
 
 const char * const bch2_sb_features[] = {
-#define x(f, n) #f,
        BCH_SB_FEATURES()
-#undef x
+       NULL
+};
+
+const char * const bch2_sb_compat[] = {
+       BCH_SB_COMPAT()
+       NULL
+};
+
+const char * const bch2_btree_ids[] = {
+       BCH_BTREE_IDS()
        NULL
 };
 
 const char * const bch2_csum_opts[] = {
-       "none",
-       "crc32c",
-       "crc64",
+       BCH_CSUM_OPTS()
        NULL
 };
 
 const char * const bch2_compression_opts[] = {
-#define x(t, n) #t,
        BCH_COMPRESSION_OPTS()
-#undef x
        NULL
 };
 
 const char * const bch2_str_hash_types[] = {
-       "crc32c",
-       "crc64",
-       "siphash",
+       BCH_STR_HASH_OPTS()
        NULL
 };
 
 const char * const bch2_data_types[] = {
-#define x(t, n) #t,
        BCH_DATA_TYPES()
-#undef x
        NULL
 };
 
 const char * const bch2_cache_replacement_policies[] = {
-       "lru",
-       "fifo",
-       "random",
+       BCH_CACHE_REPLACEMENT_POLICIES()
        NULL
 };
 
-/* Default is -1; we skip past it for struct cached_dev's cache mode */
-const char * const bch2_cache_modes[] = {
-       "default",
-       "writethrough",
-       "writeback",
-       "writearound",
-       "none",
+const char * const bch2_member_states[] = {
+       BCH_MEMBER_STATES()
        NULL
 };
 
-const char * const bch2_dev_state[] = {
-       "readwrite",
-       "readonly",
-       "failed",
-       "spare",
-       NULL
-};
+#undef x
 
 void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
 {
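The opts.c hunk above drops the hand-written string tables and generates them from the existing BCH_*() x-macro lists, sharing a single "#define x(t, n) #t," for all of them so the tables cannot drift from the corresponding enums. A minimal standalone version of the pattern follows; it is not part of the patch, EXAMPLE_STATES() and example_states[] are made up, and their entries merely echo the old bch2_dev_state strings.

#include <stdio.h>

#define EXAMPLE_STATES()	\
	x(readwrite,	0)	\
	x(readonly,	1)	\
	x(failed,	2)	\
	x(spare,	3)

#define x(t, n) #t,
static const char * const example_states[] = {
	EXAMPLE_STATES()
	NULL
};
#undef x

int main(void)
{
	const char * const *s;

	for (s = example_states; *s; s++)
		printf("%s\n", *s);
	return 0;
}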
index 710a7ee6703922e01af1cb8a5a4bb649c48631d7..001e865c555560b1ceff0543915f50304fac7574 100644 (file)
 
 extern const char * const bch2_error_actions[];
 extern const char * const bch2_sb_features[];
+extern const char * const bch2_sb_compat[];
+extern const char * const bch2_btree_ids[];
 extern const char * const bch2_csum_opts[];
 extern const char * const bch2_compression_opts[];
 extern const char * const bch2_str_hash_types[];
 extern const char * const bch2_data_types[];
 extern const char * const bch2_cache_replacement_policies[];
-extern const char * const bch2_cache_modes[];
-extern const char * const bch2_dev_state[];
+extern const char * const bch2_member_states[];
 
 /*
  * Mount options; we also store defaults in the superblock.
@@ -89,7 +90,7 @@ enum opt_type {
        x(errors,                       u8,                             \
          OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                             \
          OPT_STR(bch2_error_actions),                                  \
-         BCH_SB_ERROR_ACTION,          BCH_ON_ERROR_RO,                \
+         BCH_SB_ERROR_ACTION,          BCH_ON_ERROR_ro,                \
          NULL,         "Action to take on filesystem error")           \
        x(metadata_replicas,            u8,                             \
          OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                             \
@@ -114,12 +115,12 @@ enum opt_type {
        x(metadata_checksum,            u8,                             \
          OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                             \
          OPT_STR(bch2_csum_opts),                                      \
-         BCH_SB_META_CSUM_TYPE,        BCH_CSUM_OPT_CRC32C,            \
+         BCH_SB_META_CSUM_TYPE,        BCH_CSUM_OPT_crc32c,            \
          NULL,         NULL)                                           \
        x(data_checksum,                u8,                             \
          OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE,                   \
          OPT_STR(bch2_csum_opts),                                      \
-         BCH_SB_DATA_CSUM_TYPE,        BCH_CSUM_OPT_CRC32C,            \
+         BCH_SB_DATA_CSUM_TYPE,        BCH_CSUM_OPT_crc32c,            \
          NULL,         NULL)                                           \
        x(compression,                  u8,                             \
          OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE,                   \
@@ -134,8 +135,13 @@ enum opt_type {
        x(str_hash,                     u8,                             \
          OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                             \
          OPT_STR(bch2_str_hash_types),                                 \
-         BCH_SB_STR_HASH_TYPE,         BCH_STR_HASH_OPT_SIPHASH,       \
+         BCH_SB_STR_HASH_TYPE,         BCH_STR_HASH_OPT_siphash,       \
          NULL,         "Hash function for directory entries and xattrs")\
+       x(metadata_target,              u16,                            \
+         OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE,                   \
+         OPT_FN(bch2_opt_target),                                      \
+         BCH_SB_METADATA_TARGET,       0,                              \
+         "(target)",   "Device or disk group for metadata writes")     \
        x(foreground_target,            u16,                            \
          OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE,                   \
          OPT_FN(bch2_opt_target),                                      \
@@ -207,16 +213,16 @@ enum opt_type {
          OPT_BOOL(),                                                   \
          BCH_SB_PRJQUOTA,              false,                          \
          NULL,         "Enable project quotas")                        \
-       x(reflink,                      u8,                             \
-         OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                             \
-         OPT_BOOL(),                                                   \
-         BCH_SB_REFLINK,               true,                           \
-         NULL,         "Enable reflink support")                       \
        x(degraded,                     u8,                             \
          OPT_MOUNT,                                                    \
          OPT_BOOL(),                                                   \
          NO_SB_OPT,                    false,                          \
          NULL,         "Allow mounting in degraded mode")              \
+       x(very_degraded,                u8,                             \
+         OPT_MOUNT,                                                    \
+         OPT_BOOL(),                                                   \
+         NO_SB_OPT,                    false,                          \
+         NULL,         "Allow mounting when data will be missing")     \
        x(discard,                      u8,                             \
          OPT_MOUNT|OPT_DEVICE,                                         \
          OPT_BOOL(),                                                   \
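
The opts.h hunk extends the same idea to the mount-option table: every x() row carries the option's type, flags, backing superblock field, default and help text, and the enum constants it references (BCH_ON_ERROR_ro, BCH_CSUM_OPT_crc32c, BCH_STR_HASH_OPT_siphash, ...) were lower-cased so the same lists can also emit user-visible strings. A rough stand-alone sketch of such an option table; the fields and option names below are invented for illustration:

#include <stdio.h>

#define OPTS()                                                          \
        x(metadata_replicas,    "number of metadata replicas",  1)     \
        x(data_replicas,        "number of data replicas",      1)     \
        x(discard,              "enable discard/TRIM",          0)

/* Option identifiers... */
enum opt_id {
#define x(name, help, def) Opt_##name,
        OPTS()
#undef x
        Opt_nr,
};

/* ...and a parallel metadata table, kept in sync automatically. */
static const struct {
        const char      *name;
        const char      *help;
        int             def;
} opt_table[] = {
#define x(_name, _help, _def) { .name = #_name, .help = _help, .def = _def },
        OPTS()
#undef x
};

int main(void)
{
        for (int i = 0; i < Opt_nr; i++)
                printf("%-20s default=%d  %s\n",
                       opt_table[i].name, opt_table[i].def, opt_table[i].help);
        return 0;
}
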
index d3032a46e7f31d226144215f956d78b62860dc79..8e272519ce0e38a036b147a2cc60448c56eca0ec 100644
@@ -363,7 +363,7 @@ static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type)
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       for_each_btree_key(&trans, iter, BTREE_ID_QUOTAS, POS(type, 0),
+       for_each_btree_key(&trans, iter, BTREE_ID_quotas, POS(type, 0),
                           BTREE_ITER_PREFETCH, k, ret) {
                if (k.k->p.inode != type)
                        break;
@@ -435,7 +435,7 @@ int bch2_fs_quota_read(struct bch_fs *c)
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN,
+       for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
                           BTREE_ITER_PREFETCH, k, ret) {
                switch (k.k->type) {
                case KEY_TYPE_inode:
@@ -526,7 +526,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags)
                if (c->opts.usrquota)
                        return -EINVAL;
 
-               ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS,
+               ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
                                              POS(QTYP_USR, 0),
                                              POS(QTYP_USR + 1, 0),
                                              NULL);
@@ -538,7 +538,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags)
                if (c->opts.grpquota)
                        return -EINVAL;
 
-               ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS,
+               ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
                                              POS(QTYP_GRP, 0),
                                              POS(QTYP_GRP + 1, 0),
                                              NULL);
@@ -550,7 +550,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags)
                if (c->opts.prjquota)
                        return -EINVAL;
 
-               ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS,
+               ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
                                              POS(QTYP_PRJ, 0),
                                              POS(QTYP_PRJ + 1, 0),
                                              NULL);
@@ -718,7 +718,7 @@ static int bch2_set_quota_trans(struct btree_trans *trans,
        struct bkey_s_c k;
        int ret;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_QUOTAS, new_quota->k.p,
+       iter = bch2_trans_get_iter(trans, BTREE_ID_quotas, new_quota->k.p,
                                   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
        k = bch2_btree_iter_peek_slot(iter);
 
@@ -746,7 +746,6 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid,
                          struct qc_dqblk *qdq)
 {
        struct bch_fs *c = sb->s_fs_info;
-       struct btree_trans trans;
        struct bkey_i_quota new_quota;
        int ret;
 
@@ -756,14 +755,10 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid,
        bkey_quota_init(&new_quota.k_i);
        new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid));
 
-       bch2_trans_init(&trans, c, 0, 0);
-
        ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOUNLOCK,
                            bch2_set_quota_trans(&trans, &new_quota, qdq)) ?:
                __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i));
 
-       bch2_trans_exit(&trans);
-
        return ret;
 }
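
In the bch2_set_quota() hunk above, the explicit bch2_trans_init()/bch2_trans_exit() pair disappears because bch2_trans_do() declares the transaction itself, retries the body on transaction restart and tears it down afterwards. A stripped-down model of that wrapper shape, using a toy transaction type and -EINTR as the restart signal; this is a sketch of the idea, not the real bcachefs API:

#include <errno.h>
#include <stdio.h>

struct toy_trans { int restarts; };

static void trans_init(struct toy_trans *t) { t->restarts = 0; }
static void trans_exit(struct toy_trans *t) { (void) t; }

/*
 * Run @_body with a freshly initialized transaction, retrying as long as the
 * body reports a restart (-EINTR here), then clean up - so callers no longer
 * open-code init/exit around every update.
 */
#define trans_do(_body)                                                 \
({                                                                      \
        struct toy_trans trans;                                         \
        int _ret;                                                       \
                                                                        \
        trans_init(&trans);                                             \
        do {                                                            \
                _ret = (_body);                                         \
        } while (_ret == -EINTR);                                       \
        trans_exit(&trans);                                             \
        _ret;                                                           \
})

static int set_quota_body(struct toy_trans *trans)
{
        /* Pretend the first attempt hit a lock restart: */
        return trans->restarts++ ? 0 : -EINTR;
}

int main(void)
{
        int ret = trans_do(set_quota_body(&trans));

        printf("ret = %d\n", ret);      /* 0 after one retry */
        return ret;
}
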
 
index 44d2651be9700590177844f29e14b0938efc8235..a0dbf41d1d3763c432c0c0ba90f9ecc498623f74 100644
@@ -169,12 +169,12 @@ static int bch2_rebalance_thread(void *arg)
        unsigned long start, prev_start;
        unsigned long prev_run_time, prev_run_cputime;
        unsigned long cputime, prev_cputime;
-       unsigned long io_start;
+       u64 io_start;
        long throttle;
 
        set_freezable();
 
-       io_start        = atomic_long_read(&clock->now);
+       io_start        = atomic64_read(&clock->now);
        p               = rebalance_work(c);
        prev_start      = jiffies;
        prev_cputime    = curr_cputime();
@@ -210,7 +210,7 @@ static int bch2_rebalance_thread(void *arg)
                                        (20 - w.dev_most_full_percent),
                                        50);
 
-                       if (atomic_long_read(&clock->now) + clock->max_slop <
+                       if (atomic64_read(&clock->now) + clock->max_slop <
                            r->throttled_until_iotime) {
                                r->throttled_until_cputime = start + throttle;
                                r->state = REBALANCE_THROTTLED;
@@ -229,7 +229,7 @@ static int bch2_rebalance_thread(void *arg)
                              max(p.dev_most_full_percent, 1U) /
                              max(w.dev_most_full_percent, 1U));
 
-               io_start        = atomic_long_read(&clock->now);
+               io_start        = atomic64_read(&clock->now);
                p               = w;
                prev_start      = start;
                prev_cputime    = cputime;
@@ -239,10 +239,11 @@ static int bch2_rebalance_thread(void *arg)
                rebalance_work_reset(c);
 
                bch2_move_data(c,
+                              0,               POS_MIN,
+                              BTREE_ID_NR,     POS_MAX,
                               /* ratelimiting disabled for now */
                               NULL, /*  &r->pd.rate, */
                               writepoint_ptr(&c->rebalance_write_point),
-                              POS_MIN, POS_MAX,
                               rebalance_pred, NULL,
                               &r->move_stats);
        }
@@ -274,16 +275,16 @@ void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c)
        case REBALANCE_THROTTLED:
                bch2_hprint(&PBUF(h1),
                            (r->throttled_until_iotime -
-                            atomic_long_read(&c->io_clock[WRITE].now)) << 9);
+                            atomic64_read(&c->io_clock[WRITE].now)) << 9);
                pr_buf(out, "throttled for %lu sec or %s io\n",
                       (r->throttled_until_cputime - jiffies) / HZ,
                       h1);
                break;
        case REBALANCE_RUNNING:
-               pr_buf(out, "running\n");
-               pr_buf(out, "pos %llu:%llu\n",
-                      r->move_stats.pos.inode,
-                      r->move_stats.pos.offset);
+               pr_buf(out, "running\n"
+                      "pos ");
+               bch2_bpos_to_text(out, r->move_stats.pos);
+               pr_buf(out, "\n");
                break;
        }
 }
@@ -311,12 +312,17 @@ int bch2_rebalance_start(struct bch_fs *c)
 {
        struct task_struct *p;
 
+       if (c->rebalance.thread)
+               return 0;
+
        if (c->opts.nochanges)
                return 0;
 
-       p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance");
-       if (IS_ERR(p))
+       p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name);
+       if (IS_ERR(p)) {
+               bch_err(c, "error creating rebalance thread: %li", PTR_ERR(p));
                return PTR_ERR(p);
+       }
 
        get_task_struct(p);
        rcu_assign_pointer(c->rebalance.thread, p);
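
The rebalance hunks switch the IO clock and the throttle deadline from unsigned long/atomic_long_t to u64/atomic64_t, so the sector-granular clock cannot wrap on 32-bit builds, and the throttle test compares now + max_slop against the 64-bit deadline. A tiny stand-alone model of that check using C11 atomics; the field names are illustrative:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct io_clock {
        _Atomic uint64_t        now;            /* advances as sectors are written */
        uint64_t                max_slop;
};

/*
 * Still throttled as long as the IO clock, even with some slop added, has not
 * reached the deadline computed when the rebalance pass was scheduled.
 */
static bool still_throttled(struct io_clock *clock, uint64_t throttled_until_iotime)
{
        return atomic_load(&clock->now) + clock->max_slop < throttled_until_iotime;
}

int main(void)
{
        struct io_clock clock = { .max_slop = 1 << 9 };

        atomic_store(&clock.now, 1000);

        printf("%d\n", still_throttled(&clock, 2000)); /* 1: keep waiting  */
        printf("%d\n", still_throttled(&clock, 1100)); /* 0: run rebalance */
        return 0;
}
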
index 192c6be20cedd841311518fbee9028f07f09b23b..2f62a643c39fbb0c08f024fbf58a7f3325755875 100644
@@ -17,7 +17,7 @@ struct bch_fs_rebalance {
        atomic64_t              work_unknown_dev;
 
        enum rebalance_state    state;
-       unsigned long           throttled_until_iotime;
+       u64                     throttled_until_iotime;
        unsigned long           throttled_until_cputime;
        struct bch_move_stats   move_stats;
 
index 32fed6b81a526a6f83549bc05cf56ab5907fb7a9..86593e92edd000a887e93016bffdf73038baa59b 100644
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "bcachefs.h"
+#include "bkey_buf.h"
 #include "alloc_background.h"
 #include "btree_gc.h"
 #include "btree_update.h"
@@ -15,6 +16,7 @@
 #include "journal_io.h"
 #include "journal_reclaim.h"
 #include "journal_seq_blacklist.h"
+#include "move.h"
 #include "quota.h"
 #include "recovery.h"
 #include "replicas.h"
@@ -31,7 +33,7 @@ static void drop_alloc_keys(struct journal_keys *keys)
        size_t src, dst;
 
        for (src = 0, dst = 0; src < keys->nr; src++)
-               if (keys->d[src].btree_id != BTREE_ID_ALLOC)
+               if (keys->d[src].btree_id != BTREE_ID_alloc)
                        keys->d[dst++] = keys->d[src];
 
        keys->nr = dst;
@@ -39,78 +41,174 @@ static void drop_alloc_keys(struct journal_keys *keys)
 
 /* iterate over keys read from the journal: */
 
-static struct journal_key *journal_key_search(struct journal_keys *journal_keys,
-                                             enum btree_id id, unsigned level,
-                                             struct bpos pos)
+static int __journal_key_cmp(enum btree_id     l_btree_id,
+                            unsigned           l_level,
+                            struct bpos        l_pos,
+                            struct journal_key *r)
+{
+       return (cmp_int(l_btree_id,     r->btree_id) ?:
+               cmp_int(l_level,        r->level) ?:
+               bpos_cmp(l_pos, r->k->k.p));
+}
+
+static int journal_key_cmp(struct journal_key *l, struct journal_key *r)
+{
+       return (cmp_int(l->btree_id,    r->btree_id) ?:
+               cmp_int(l->level,       r->level) ?:
+               bpos_cmp(l->k->k.p,     r->k->k.p));
+}
+
+static size_t journal_key_search(struct journal_keys *journal_keys,
+                                enum btree_id id, unsigned level,
+                                struct bpos pos)
 {
        size_t l = 0, r = journal_keys->nr, m;
 
        while (l < r) {
                m = l + ((r - l) >> 1);
-               if ((cmp_int(id,        journal_keys->d[m].btree_id) ?:
-                    cmp_int(level,     journal_keys->d[m].level) ?:
-                    bkey_cmp(pos,      journal_keys->d[m].k->k.p)) > 0)
+               if (__journal_key_cmp(id, level, pos, &journal_keys->d[m]) > 0)
                        l = m + 1;
                else
                        r = m;
        }
 
        BUG_ON(l < journal_keys->nr &&
-              (cmp_int(id,     journal_keys->d[l].btree_id) ?:
-               cmp_int(level,  journal_keys->d[l].level) ?:
-               bkey_cmp(pos,   journal_keys->d[l].k->k.p)) > 0);
+              __journal_key_cmp(id, level, pos, &journal_keys->d[l]) > 0);
 
        BUG_ON(l &&
-              (cmp_int(id,     journal_keys->d[l - 1].btree_id) ?:
-               cmp_int(level,  journal_keys->d[l - 1].level) ?:
-               bkey_cmp(pos,   journal_keys->d[l - 1].k->k.p)) <= 0);
+              __journal_key_cmp(id, level, pos, &journal_keys->d[l - 1]) <= 0);
+
+       return l;
+}
+
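
journal_key_search() above is a lower-bound binary search over keys sorted by (btree_id, level, pos); it now returns an index rather than a pointer so the caller can also use it as an insertion point. The same search over a simplified key type, with cmp_int() reimplemented locally:

#include <stddef.h>
#include <stdio.h>

struct key { int btree_id, level, pos; };

#define cmp_int(l, r) (((l) > (r)) - ((l) < (r)))

static int key_cmp(int id, int level, int pos, const struct key *r)
{
        return  cmp_int(id,    r->btree_id) ?:
                cmp_int(level, r->level) ?:
                cmp_int(pos,   r->pos);
}

/* Return index of the first element >= (id, level, pos), or nr if none. */
static size_t key_search(const struct key *d, size_t nr,
                         int id, int level, int pos)
{
        size_t l = 0, r = nr, m;

        while (l < r) {
                m = l + ((r - l) >> 1);
                if (key_cmp(id, level, pos, &d[m]) > 0)
                        l = m + 1;
                else
                        r = m;
        }
        return l;
}

int main(void)
{
        struct key d[] = {
                { 0, 0, 1 }, { 0, 0, 5 }, { 1, 0, 2 }, { 1, 1, 7 },
        };

        printf("%zu\n", key_search(d, 4, 0, 0, 5)); /* 1: exact match    */
        printf("%zu\n", key_search(d, 4, 1, 0, 3)); /* 3: insertion slot */
        printf("%zu\n", key_search(d, 4, 2, 0, 0)); /* 4: past the end   */
        return 0;
}
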
+static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsigned idx)
+{
+       struct bkey_i *n = iter->keys->d[idx].k;
+       struct btree_and_journal_iter *biter =
+               container_of(iter, struct btree_and_journal_iter, journal);
+
+       if (iter->idx > idx ||
+           (iter->idx == idx &&
+            biter->last &&
+            bpos_cmp(n->k.p, biter->unpacked.p) <= 0))
+               iter->idx++;
+}
+
+int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
+                           unsigned level, struct bkey_i *k)
+{
+       struct journal_key n = {
+               .btree_id       = id,
+               .level          = level,
+               .k              = k,
+               .allocated      = true
+       };
+       struct journal_keys *keys = &c->journal_keys;
+       struct journal_iter *iter;
+       unsigned idx = journal_key_search(keys, id, level, k->k.p);
+
+       if (idx < keys->nr &&
+           journal_key_cmp(&n, &keys->d[idx]) == 0) {
+               if (keys->d[idx].allocated)
+                       kfree(keys->d[idx].k);
+               keys->d[idx] = n;
+               return 0;
+       }
+
+       if (keys->nr == keys->size) {
+               struct journal_keys new_keys = {
+                       .nr                     = keys->nr,
+                       .size                   = keys->size * 2,
+                       .journal_seq_base       = keys->journal_seq_base,
+               };
+
+               new_keys.d = kvmalloc(sizeof(new_keys.d[0]) * new_keys.size, GFP_KERNEL);
+               if (!new_keys.d) {
+                       bch_err(c, "%s: error allocating new key array (size %zu)",
+                               __func__, new_keys.size);
+                       return -ENOMEM;
+               }
+
+               memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
+               kvfree(keys->d);
+               *keys = new_keys;
+       }
+
+       array_insert_item(keys->d, keys->nr, idx, n);
 
-       return l < journal_keys->nr ? journal_keys->d + l : NULL;
+       list_for_each_entry(iter, &c->journal_iters, list)
+               journal_iter_fix(c, iter, idx);
+
+       return 0;
+}
+
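
bch2_journal_key_insert() either overwrites the entry already at the computed index or shifts the tail up by one slot and, when the array is full, reallocates it at twice the size. A generic sketch of that insert-into-sorted-array pattern, with realloc() standing in for the kvmalloc/memcpy/kvfree dance and plain ints standing in for journal keys:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct keys {
        int     *d;
        size_t  nr, size;
};

/* Insert @v at sorted position @idx, growing the array geometrically. */
static int keys_insert(struct keys *k, size_t idx, int v)
{
        if (k->nr == k->size) {
                size_t new_size = k->size ? k->size * 2 : 8;
                int *n = realloc(k->d, new_size * sizeof(*n));

                if (!n)
                        return -1;
                k->d    = n;
                k->size = new_size;
        }

        /* array_insert_item(): shift the tail up, then drop the new entry in */
        memmove(&k->d[idx + 1], &k->d[idx], (k->nr - idx) * sizeof(*k->d));
        k->d[idx] = v;
        k->nr++;
        return 0;
}

int main(void)
{
        struct keys k = { NULL, 0, 0 };
        int vals[] = { 5, 1, 3 };

        for (size_t i = 0; i < 3; i++) {
                size_t idx = 0;

                while (idx < k.nr && k.d[idx] < vals[i])  /* find sorted slot */
                        idx++;
                keys_insert(&k, idx, vals[i]);
        }

        for (size_t i = 0; i < k.nr; i++)
                printf("%d ", k.d[i]);          /* 1 3 5 */
        printf("\n");
        free(k.d);
        return 0;
}

bch2_journal_key_delete() below builds on the same insert: deletion is just inserting an empty whiteout key that later overrides whatever the btree holds at that position.
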
+int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
+                           unsigned level, struct bpos pos)
+{
+       struct bkey_i *whiteout =
+               kmalloc(sizeof(struct bkey), GFP_KERNEL);
+       int ret;
+
+       if (!whiteout) {
+               bch_err(c, "%s: error allocating new key", __func__);
+               return -ENOMEM;
+       }
+
+       bkey_init(&whiteout->k);
+       whiteout->k.p = pos;
+
+       ret = bch2_journal_key_insert(c, id, level, whiteout);
+       if (ret)
+               kfree(whiteout);
+       return ret;
 }
 
 static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter)
 {
-       if (iter->k &&
-           iter->k < iter->keys->d + iter->keys->nr &&
-           iter->k->btree_id   == iter->btree_id &&
-           iter->k->level      == iter->level)
-               return iter->k->k;
+       struct journal_key *k = iter->idx - iter->keys->nr
+               ? iter->keys->d + iter->idx : NULL;
+
+       if (k &&
+           k->btree_id == iter->btree_id &&
+           k->level    == iter->level)
+               return k->k;
 
-       iter->k = NULL;
+       iter->idx = iter->keys->nr;
        return NULL;
 }
 
 static void bch2_journal_iter_advance(struct journal_iter *iter)
 {
-       if (iter->k)
-               iter->k++;
+       if (iter->idx < iter->keys->nr)
+               iter->idx++;
 }
 
-static void bch2_journal_iter_init(struct journal_iter *iter,
-                                  struct journal_keys *journal_keys,
+static void bch2_journal_iter_exit(struct journal_iter *iter)
+{
+       list_del(&iter->list);
+}
+
+static void bch2_journal_iter_init(struct bch_fs *c,
+                                  struct journal_iter *iter,
                                   enum btree_id id, unsigned level,
                                   struct bpos pos)
 {
        iter->btree_id  = id;
        iter->level     = level;
-       iter->keys      = journal_keys;
-       iter->k         = journal_key_search(journal_keys, id, level, pos);
+       iter->keys      = &c->journal_keys;
+       iter->idx       = journal_key_search(&c->journal_keys, id, level, pos);
+       list_add(&iter->list, &c->journal_iters);
 }
 
 static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
 {
-       return iter->btree
-               ? bch2_btree_iter_peek(iter->btree)
-               : bch2_btree_node_iter_peek_unpack(&iter->node_iter,
-                                                  iter->b, &iter->unpacked);
+       return bch2_btree_node_iter_peek_unpack(&iter->node_iter,
+                                               iter->b, &iter->unpacked);
 }
 
 static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
 {
-       if (iter->btree)
-               bch2_btree_iter_next(iter->btree);
-       else
-               bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
+       bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
 }
 
 void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
@@ -140,7 +238,7 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *
                        bkey_i_to_s_c(bch2_journal_iter_peek(&iter->journal));
 
                if (btree_k.k && journal_k.k) {
-                       int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p);
+                       int cmp = bpos_cmp(btree_k.k->p, journal_k.k->p);
 
                        if (!cmp)
                                bch2_journal_iter_advance_btree(iter);
@@ -158,8 +256,8 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *
                ret = iter->last == journal ? journal_k : btree_k;
 
                if (iter->b &&
-                   bkey_cmp(ret.k->p, iter->b->data->max_key) > 0) {
-                       iter->journal.k = NULL;
+                   bpos_cmp(ret.k->p, iter->b->data->max_key) > 0) {
+                       iter->journal.idx = iter->journal.keys->nr;
                        iter->last = none;
                        return bkey_s_c_null;
                }
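
bch2_btree_and_journal_iter_peek() merges two sorted key streams: whichever of the btree key and the journal key compares smaller (now via bpos_cmp()) is returned, and on a tie the journal key wins while the btree side is advanced past the overridden key. The same merge rule over two sorted int arrays, as a self-contained sketch:

#include <stdio.h>

/*
 * Merge @btree and @journal (both sorted ascending); on equal keys, emit the
 * journal copy and skip the btree copy, i.e. the journal overrides the btree.
 */
static void merge(const int *btree, int nb, const int *journal, int nj)
{
        int b = 0, j = 0;

        while (b < nb || j < nj) {
                if (j >= nj) {
                        printf("btree   %d\n", btree[b++]);
                } else if (b >= nb) {
                        printf("journal %d\n", journal[j++]);
                } else if (btree[b] < journal[j]) {
                        printf("btree   %d\n", btree[b++]);
                } else {
                        if (btree[b] == journal[j])
                                b++;                    /* overridden */
                        printf("journal %d\n", journal[j++]);
                }
        }
}

int main(void)
{
        int btree[]   = { 1, 3, 5 };
        int journal[] = { 3, 4 };

        merge(btree, 3, journal, 2);    /* 1(b) 3(j) 4(j) 5(b) */
        return 0;
}
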
@@ -180,31 +278,50 @@ struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *
        return bch2_btree_and_journal_iter_peek(iter);
 }
 
-void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *iter,
-                                     struct btree_trans *trans,
-                                     struct journal_keys *journal_keys,
-                                     enum btree_id id, struct bpos pos)
+void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
 {
-       memset(iter, 0, sizeof(*iter));
-
-       iter->btree = bch2_trans_get_iter(trans, id, pos, 0);
-       bch2_journal_iter_init(&iter->journal, journal_keys, id, 0, pos);
+       bch2_journal_iter_exit(&iter->journal);
 }
 
 void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
-                                               struct journal_keys *journal_keys,
+                                               struct bch_fs *c,
                                                struct btree *b)
 {
        memset(iter, 0, sizeof(*iter));
 
        iter->b = b;
        bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b);
-       bch2_journal_iter_init(&iter->journal, journal_keys,
+       bch2_journal_iter_init(c, &iter->journal,
                               b->c.btree_id, b->c.level, b->data->min_key);
 }
 
 /* Walk btree, overlaying keys from the journal: */
 
+static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b,
+                                          struct btree_and_journal_iter iter)
+{
+       unsigned i = 0, nr = b->c.level > 1 ? 2 : 16;
+       struct bkey_s_c k;
+       struct bkey_buf tmp;
+
+       BUG_ON(!b->c.level);
+
+       bch2_bkey_buf_init(&tmp);
+
+       while (i < nr &&
+              (k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+               bch2_bkey_buf_reassemble(&tmp, c, k);
+
+               bch2_btree_node_prefetch(c, NULL, tmp.k,
+                                       b->c.btree_id, b->c.level - 1);
+
+               bch2_btree_and_journal_iter_advance(&iter);
+               i++;
+       }
+
+       bch2_bkey_buf_exit(&tmp, c);
+}
+
 static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b,
                                struct journal_keys *journal_keys,
                                enum btree_id btree_id,
@@ -213,9 +330,12 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b
 {
        struct btree_and_journal_iter iter;
        struct bkey_s_c k;
+       struct bkey_buf tmp;
+       struct btree *child;
        int ret = 0;
 
-       bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b);
+       bch2_bkey_buf_init(&tmp);
+       bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
 
        while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
                ret = key_fn(c, btree_id, b->c.level, k);
@@ -223,34 +343,34 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b
                        break;
 
                if (b->c.level) {
-                       struct btree *child;
-                       BKEY_PADDED(k) tmp;
-
-                       bkey_reassemble(&tmp.k, k);
-                       k = bkey_i_to_s_c(&tmp.k);
+                       bch2_bkey_buf_reassemble(&tmp, c, k);
 
                        bch2_btree_and_journal_iter_advance(&iter);
 
-                       if (b->c.level > 0) {
-                               child = bch2_btree_node_get_noiter(c, &tmp.k,
-                                                       b->c.btree_id, b->c.level - 1);
-                               ret = PTR_ERR_OR_ZERO(child);
-                               if (ret)
-                                       break;
+                       child = bch2_btree_node_get_noiter(c, tmp.k,
+                                               b->c.btree_id, b->c.level - 1,
+                                               false);
 
-                               ret   = (node_fn ? node_fn(c, b) : 0) ?:
-                                       bch2_btree_and_journal_walk_recurse(c, child,
-                                               journal_keys, btree_id, node_fn, key_fn);
-                               six_unlock_read(&child->c.lock);
+                       ret = PTR_ERR_OR_ZERO(child);
+                       if (ret)
+                               break;
 
-                               if (ret)
-                                       break;
-                       }
+                       btree_and_journal_iter_prefetch(c, b, iter);
+
+                       ret   = (node_fn ? node_fn(c, b) : 0) ?:
+                               bch2_btree_and_journal_walk_recurse(c, child,
+                                       journal_keys, btree_id, node_fn, key_fn);
+                       six_unlock_read(&child->c.lock);
+
+                       if (ret)
+                               break;
                } else {
                        bch2_btree_and_journal_iter_advance(&iter);
                }
        }
 
+       bch2_btree_and_journal_iter_exit(&iter);
+       bch2_bkey_buf_exit(&tmp, c);
        return ret;
 }
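
bch2_btree_and_journal_walk_recurse() above now copies each child pointer into a bkey_buf, issues a batch of prefetches for the next few children (btree_and_journal_iter_prefetch()) and only then recurses. A toy depth-first walk showing the prefetch-before-descend shape; the node type and prefetch hook are invented for illustration:

#include <stdio.h>

struct node {
        int             key;
        struct node     *child[4];
        int             nr_children;
};

/* Pretend to start an asynchronous read of a child node before we need it. */
static void prefetch_node(const struct node *n)
{
        printf("prefetch %d\n", n->key);
}

/*
 * Depth-first walk: before recursing into the first child, kick off reads for
 * the next few siblings so they are (hopefully) cached when we reach them.
 */
static void walk(const struct node *n, int prefetch_nr)
{
        printf("visit %d\n", n->key);

        for (int i = 0; i < n->nr_children && i < prefetch_nr; i++)
                prefetch_node(n->child[i]);

        for (int i = 0; i < n->nr_children; i++)
                walk(n->child[i], prefetch_nr);
}

int main(void)
{
        struct node a = { .key = 2 }, b = { .key = 3 };
        struct node root = { .key = 1, .child = { &a, &b }, .nr_children = 2 };

        walk(&root, 16);
        return 0;
}
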
 
@@ -299,13 +419,19 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
 
        return  cmp_int(l->btree_id,    r->btree_id) ?:
                cmp_int(l->level,       r->level) ?:
-               bkey_cmp(l->k->k.p, r->k->k.p) ?:
+               bpos_cmp(l->k->k.p, r->k->k.p) ?:
                cmp_int(l->journal_seq, r->journal_seq) ?:
                cmp_int(l->journal_offset, r->journal_offset);
 }
 
 void bch2_journal_keys_free(struct journal_keys *keys)
 {
+       struct journal_key *i;
+
+       for (i = keys->d; i < keys->d + keys->nr; i++)
+               if (i->allocated)
+                       kfree(i->k);
+
        kvfree(keys->d);
        keys->d = NULL;
        keys->nr = 0;
@@ -313,7 +439,7 @@ void bch2_journal_keys_free(struct journal_keys *keys)
 
 static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
 {
-       struct journal_replay *p;
+       struct journal_replay *i;
        struct jset_entry *entry;
        struct bkey_i *k, *_n;
        struct journal_keys keys = { NULL };
@@ -323,35 +449,37 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
        if (list_empty(journal_entries))
                return keys;
 
-       keys.journal_seq_base =
-               le64_to_cpu(list_last_entry(journal_entries,
-                               struct journal_replay, list)->j.last_seq);
-
-       list_for_each_entry(p, journal_entries, list) {
-               if (le64_to_cpu(p->j.seq) < keys.journal_seq_base)
+       list_for_each_entry(i, journal_entries, list) {
+               if (i->ignore)
                        continue;
 
-               for_each_jset_key(k, _n, entry, &p->j)
+               if (!keys.journal_seq_base)
+                       keys.journal_seq_base = le64_to_cpu(i->j.seq);
+
+               for_each_jset_key(k, _n, entry, &i->j)
                        nr_keys++;
        }
 
+       keys.size = roundup_pow_of_two(nr_keys);
 
-       keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL);
+       keys.d = kvmalloc(sizeof(keys.d[0]) * keys.size, GFP_KERNEL);
        if (!keys.d)
                goto err;
 
-       list_for_each_entry(p, journal_entries, list) {
-               if (le64_to_cpu(p->j.seq) < keys.journal_seq_base)
+       list_for_each_entry(i, journal_entries, list) {
+               if (i->ignore)
                        continue;
 
-               for_each_jset_key(k, _n, entry, &p->j)
+               BUG_ON(le64_to_cpu(i->j.seq) - keys.journal_seq_base > U32_MAX);
+
+               for_each_jset_key(k, _n, entry, &i->j)
                        keys.d[keys.nr++] = (struct journal_key) {
                                .btree_id       = entry->btree_id,
                                .level          = entry->level,
                                .k              = k,
-                               .journal_seq    = le64_to_cpu(p->j.seq) -
+                               .journal_seq    = le64_to_cpu(i->j.seq) -
                                        keys.journal_seq_base,
-                               .journal_offset = k->_data - p->j._data,
+                               .journal_offset = k->_data - i->j._data,
                        };
        }
 
@@ -362,7 +490,7 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
                while (src + 1 < keys.d + keys.nr &&
                       src[0].btree_id  == src[1].btree_id &&
                       src[0].level     == src[1].level &&
-                      !bkey_cmp(src[0].k->k.p, src[1].k->k.p))
+                      !bpos_cmp(src[0].k->k.p, src[1].k->k.p))
                        src++;
 
                *dst++ = *src++;
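
journal_keys_sort() flattens every non-ignored journal entry's keys into one array, sorts by (btree_id, level, pos, journal_seq, journal_offset) and then keeps only the last entry of each run of equal positions, i.e. the newest version of each key. The same sort-then-dedup over a simplified key type with qsort(), as an illustrative sketch:

#include <stdio.h>
#include <stdlib.h>

struct jkey { int pos, seq; };

static int jkey_cmp(const void *_l, const void *_r)
{
        const struct jkey *l = _l, *r = _r;

        return l->pos != r->pos ? (l->pos > r->pos) - (l->pos < r->pos)
                                : (l->seq > r->seq) - (l->seq < r->seq);
}

int main(void)
{
        struct jkey d[] = {
                { 10, 1 }, { 7, 3 }, { 10, 4 }, { 7, 2 },
        };
        size_t nr = 4;
        struct jkey *src = d, *dst = d;

        qsort(d, nr, sizeof(d[0]), jkey_cmp);

        /* Keep only the last (highest seq) copy of each position: */
        while (src < d + nr) {
                while (src + 1 < d + nr && src[0].pos == src[1].pos)
                        src++;
                *dst++ = *src++;
        }
        nr = dst - d;

        for (size_t i = 0; i < nr; i++)
                printf("pos %d seq %d\n", d[i].pos, d[i].seq);
        /* pos 7 seq 3, pos 10 seq 4 */
        return 0;
}
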
@@ -384,111 +512,6 @@ static void replay_now_at(struct journal *j, u64 seq)
                bch2_journal_pin_put(j, j->replay_journal_seq++);
 }
 
-static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id,
-                                 struct bkey_i *k)
-{
-       struct btree_trans trans;
-       struct btree_iter *iter, *split_iter;
-       /*
-        * We might cause compressed extents to be split, so we need to pass in
-        * a disk_reservation:
-        */
-       struct disk_reservation disk_res =
-               bch2_disk_reservation_init(c, 0);
-       struct bkey_i *split;
-       struct bpos atomic_end;
-       /*
-        * Some extents aren't equivalent - w.r.t. what the triggers do
-        * - if they're split:
-        */
-       bool remark_if_split = bch2_bkey_sectors_compressed(bkey_i_to_s_c(k)) ||
-               k->k.type == KEY_TYPE_reflink_p;
-       bool remark = false;
-       int ret;
-
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-retry:
-       bch2_trans_begin(&trans);
-
-       iter = bch2_trans_get_iter(&trans, btree_id,
-                                  bkey_start_pos(&k->k),
-                                  BTREE_ITER_INTENT);
-
-       do {
-               ret = bch2_btree_iter_traverse(iter);
-               if (ret)
-                       goto err;
-
-               atomic_end = bpos_min(k->k.p, iter->l[0].b->key.k.p);
-
-               split = bch2_trans_kmalloc(&trans, bkey_bytes(&k->k));
-               ret = PTR_ERR_OR_ZERO(split);
-               if (ret)
-                       goto err;
-
-               if (!remark &&
-                   remark_if_split &&
-                   bkey_cmp(atomic_end, k->k.p) < 0) {
-                       ret = bch2_disk_reservation_add(c, &disk_res,
-                                       k->k.size *
-                                       bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(k)),
-                                       BCH_DISK_RESERVATION_NOFAIL);
-                       BUG_ON(ret);
-
-                       remark = true;
-               }
-
-               bkey_copy(split, k);
-               bch2_cut_front(iter->pos, split);
-               bch2_cut_back(atomic_end, split);
-
-               split_iter = bch2_trans_copy_iter(&trans, iter);
-               ret = PTR_ERR_OR_ZERO(split_iter);
-               if (ret)
-                       goto err;
-
-               /*
-                * It's important that we don't go through the
-                * extent_handle_overwrites() and extent_update_to_keys() path
-                * here: journal replay is supposed to treat extents like
-                * regular keys
-                */
-               __bch2_btree_iter_set_pos(split_iter, split->k.p, false);
-               bch2_trans_update(&trans, split_iter, split,
-                                 BTREE_TRIGGER_NORUN);
-
-               bch2_btree_iter_set_pos(iter, split->k.p);
-
-               if (remark) {
-                       ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(split),
-                                                 0, split->k.size,
-                                                 BTREE_TRIGGER_INSERT);
-                       if (ret)
-                               goto err;
-               }
-       } while (bkey_cmp(iter->pos, k->k.p) < 0);
-
-       if (remark) {
-               ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(k),
-                                         0, -((s64) k->k.size),
-                                         BTREE_TRIGGER_OVERWRITE);
-               if (ret)
-                       goto err;
-       }
-
-       ret = bch2_trans_commit(&trans, &disk_res, NULL,
-                               BTREE_INSERT_NOFAIL|
-                               BTREE_INSERT_LAZY_RW|
-                               BTREE_INSERT_JOURNAL_REPLAY);
-err:
-       if (ret == -EINTR)
-               goto retry;
-
-       bch2_disk_reservation_put(c, &disk_res);
-
-       return bch2_trans_exit(&trans) ?: ret;
-}
-
 static int __bch2_journal_replay_key(struct btree_trans *trans,
                                     enum btree_id id, unsigned level,
                                     struct bkey_i *k)
@@ -499,8 +522,6 @@ static int __bch2_journal_replay_key(struct btree_trans *trans,
        iter = bch2_trans_get_node_iter(trans, id, k->k.p,
                                        BTREE_MAX_DEPTH, level,
                                        BTREE_ITER_INTENT);
-       if (IS_ERR(iter))
-               return PTR_ERR(iter);
 
        /*
         * iter->flags & BTREE_ITER_IS_EXTENTS triggers the update path to run
@@ -508,7 +529,7 @@ static int __bch2_journal_replay_key(struct btree_trans *trans,
         * want that here, journal replay is supposed to treat extents like
         * regular keys:
         */
-       __bch2_btree_iter_set_pos(iter, k->k.p, false);
+       BUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
 
        ret   = bch2_btree_iter_traverse(iter) ?:
                bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN);
@@ -516,14 +537,16 @@ static int __bch2_journal_replay_key(struct btree_trans *trans,
        return ret;
 }
 
-static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id id,
-                                  unsigned level, struct bkey_i *k)
+static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k)
 {
-       return bch2_trans_do(c, NULL, NULL,
-                            BTREE_INSERT_NOFAIL|
-                            BTREE_INSERT_LAZY_RW|
-                            BTREE_INSERT_JOURNAL_REPLAY,
-                            __bch2_journal_replay_key(&trans, id, level, k));
+       unsigned commit_flags = BTREE_INSERT_NOFAIL|
+               BTREE_INSERT_LAZY_RW;
+
+       if (!k->allocated)
+               commit_flags |= BTREE_INSERT_JOURNAL_REPLAY;
+
+       return bch2_trans_do(c, NULL, NULL, commit_flags,
+                            __bch2_journal_replay_key(&trans, k->btree_id, k->level, k->k));
 }
 
 static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k)
@@ -531,12 +554,11 @@ static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k)
        struct btree_iter *iter;
        int ret;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, k->k.p,
+       iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, k->k.p,
                                   BTREE_ITER_CACHED|
                                   BTREE_ITER_CACHED_NOFILL|
                                   BTREE_ITER_INTENT);
-       ret =   PTR_ERR_OR_ZERO(iter) ?:
-               bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN);
+       ret = bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN);
        bch2_trans_iter_put(trans, iter);
        return ret;
 }
@@ -559,7 +581,7 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
        return  cmp_int(r->level,       l->level) ?:
                cmp_int(l->journal_seq, r->journal_seq) ?:
                cmp_int(l->btree_id,    r->btree_id) ?:
-               bkey_cmp(l->k->k.p,     r->k->k.p);
+               bpos_cmp(l->k->k.p,     r->k->k.p);
 }
 
 static int bch2_journal_replay(struct bch_fs *c,
@@ -584,7 +606,7 @@ static int bch2_journal_replay(struct bch_fs *c,
        for_each_journal_key(keys, i) {
                cond_resched();
 
-               if (!i->level && i->btree_id == BTREE_ID_ALLOC) {
+               if (!i->level && i->btree_id == BTREE_ID_alloc) {
                        j->replay_journal_seq = keys.journal_seq_base + i->journal_seq;
                        ret = bch2_alloc_replay_key(c, i->k);
                        if (ret)
@@ -600,7 +622,7 @@ static int bch2_journal_replay(struct bch_fs *c,
 
                if (i->level) {
                        j->replay_journal_seq = keys.journal_seq_base + i->journal_seq;
-                       ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k);
+                       ret = bch2_journal_replay_key(c, i);
                        if (ret)
                                goto err;
                }
@@ -613,6 +635,7 @@ static int bch2_journal_replay(struct bch_fs *c,
         */
        set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags);
        set_bit(JOURNAL_RECLAIM_STARTED, &j->flags);
+       journal_reclaim_kick(j);
 
        j->replay_journal_seq = seq;
 
@@ -622,14 +645,12 @@ static int bch2_journal_replay(struct bch_fs *c,
        for_each_journal_key(keys, i) {
                cond_resched();
 
-               if (i->level || i->btree_id == BTREE_ID_ALLOC)
+               if (i->level || i->btree_id == BTREE_ID_alloc)
                        continue;
 
                replay_now_at(j, keys.journal_seq_base + i->journal_seq);
 
-               ret = i->k->k.size
-                       ? bch2_extent_replay_key(c, i->btree_id, i->k)
-                       : bch2_journal_replay_key(c, i->btree_id, i->level, i->k);
+               ret = bch2_journal_replay_key(c, i);
                if (ret)
                        goto err;
        }
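
bch2_journal_replay() replays in passes: alloc-btree leaf keys first (so bucket state is correct), then interior-node keys sorted with the deepest levels first, then the remaining leaf keys, which with bch2_extent_replay_key() gone now all go through bch2_journal_replay_key(). A compact model of that pass structure; the btree ids and key layout are simplified stand-ins:

#include <stdio.h>
#include <stdlib.h>

enum { BTREE_ALLOC, BTREE_EXTENTS, BTREE_INODES };

struct jkey { int btree_id, level, seq; };

/* Highest level first, then journal order - interior nodes replay early. */
static int seq_cmp(const void *_l, const void *_r)
{
        const struct jkey *l = _l, *r = _r;

        if (l->level != r->level)
                return r->level - l->level;
        return l->seq - r->seq;
}

static void replay(const struct jkey *k)
{
        printf("replay btree %d level %d seq %d\n", k->btree_id, k->level, k->seq);
}

int main(void)
{
        struct jkey keys[] = {
                { BTREE_EXTENTS, 0, 1 },
                { BTREE_ALLOC,   0, 2 },
                { BTREE_INODES,  1, 3 },
        };
        size_t nr = 3;

        /* Pass 1: alloc keys, so bucket state is right before anything else */
        for (size_t i = 0; i < nr; i++)
                if (!keys[i].level && keys[i].btree_id == BTREE_ALLOC)
                        replay(&keys[i]);

        qsort(keys, nr, sizeof(keys[0]), seq_cmp);

        /* Pass 2: interior node keys (level > 0), deepest levels first */
        for (size_t i = 0; i < nr; i++)
                if (keys[i].level)
                        replay(&keys[i]);

        /* Pass 3: remaining leaf keys */
        for (size_t i = 0; i < nr; i++)
                if (!keys[i].level && keys[i].btree_id != BTREE_ALLOC)
                        replay(&keys[i]);

        return 0;
}
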
@@ -641,47 +662,8 @@ static int bch2_journal_replay(struct bch_fs *c,
        bch2_journal_flush_all_pins(j);
        return bch2_journal_error(j);
 err:
-       bch_err(c, "journal replay: error %d while replaying key", ret);
-       return ret;
-}
-
-static bool journal_empty(struct list_head *journal)
-{
-       return list_empty(journal) ||
-               journal_entry_empty(&list_last_entry(journal,
-                                       struct journal_replay, list)->j);
-}
-
-static int
-verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c,
-                                                 struct list_head *journal)
-{
-       struct journal_replay *i =
-               list_last_entry(journal, struct journal_replay, list);
-       u64 start_seq   = le64_to_cpu(i->j.last_seq);
-       u64 end_seq     = le64_to_cpu(i->j.seq);
-       u64 seq         = start_seq;
-       int ret = 0;
-
-       list_for_each_entry(i, journal, list) {
-               if (le64_to_cpu(i->j.seq) < start_seq)
-                       continue;
-
-               fsck_err_on(seq != le64_to_cpu(i->j.seq), c,
-                       "journal entries %llu-%llu missing! (replaying %llu-%llu)",
-                       seq, le64_to_cpu(i->j.seq) - 1,
-                       start_seq, end_seq);
-
-               seq = le64_to_cpu(i->j.seq);
-
-               fsck_err_on(bch2_journal_seq_is_blacklisted(c, seq, false), c,
-                           "found blacklisted journal entry %llu", seq);
-
-               do {
-                       seq++;
-               } while (bch2_journal_seq_is_blacklisted(c, seq, false));
-       }
-fsck_err:
+       bch_err(c, "journal replay: error %d while replaying key at btree %s level %u",
+               ret, bch2_btree_ids[i->btree_id], i->level);
        return ret;
 }
 
@@ -738,10 +720,31 @@ static int journal_replay_entry_early(struct bch_fs *c,
        case BCH_JSET_ENTRY_data_usage: {
                struct jset_entry_data_usage *u =
                        container_of(entry, struct jset_entry_data_usage, entry);
+
                ret = bch2_replicas_set_usage(c, &u->r,
                                              le64_to_cpu(u->v));
                break;
        }
+       case BCH_JSET_ENTRY_dev_usage: {
+               struct jset_entry_dev_usage *u =
+                       container_of(entry, struct jset_entry_dev_usage, entry);
+               struct bch_dev *ca = bch_dev_bkey_exists(c, u->dev);
+               unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+               unsigned nr_types = (bytes - sizeof(struct jset_entry_dev_usage)) /
+                       sizeof(struct jset_entry_dev_usage_type);
+               unsigned i;
+
+               ca->usage_base->buckets_ec              = le64_to_cpu(u->buckets_ec);
+               ca->usage_base->buckets_unavailable     = le64_to_cpu(u->buckets_unavailable);
+
+               for (i = 0; i < nr_types; i++) {
+                       ca->usage_base->d[i].buckets    = le64_to_cpu(u->d[i].buckets);
+                       ca->usage_base->d[i].sectors    = le64_to_cpu(u->d[i].sectors);
+                       ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented);
+               }
+
+               break;
+       }
        case BCH_JSET_ENTRY_blacklist: {
                struct jset_entry_blacklist *bl_entry =
                        container_of(entry, struct jset_entry_blacklist, entry);
@@ -760,6 +763,12 @@ static int journal_replay_entry_early(struct bch_fs *c,
                                le64_to_cpu(bl_entry->end) + 1);
                break;
        }
+       case BCH_JSET_ENTRY_clock: {
+               struct jset_entry_clock *clock =
+                       container_of(entry, struct jset_entry_clock, entry);
+
+               atomic64_set(&c->io_clock[clock->rw].now, clock->time);
+       }
        }
 
        return ret;
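
In the new BCH_JSET_ENTRY_dev_usage case above, the number of per-data-type counters is not stored explicitly; it is derived from the entry size: total bytes = jset_u64s(u64s) * sizeof(u64), minus the fixed header, divided by the size of one per-type record. A worked version of that arithmetic with simplified stand-in structs (the sizes here are illustrative, not the on-disk layout):

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the on-disk structures: */
struct usage_type { uint64_t buckets, sectors, fragmented; };
struct usage_hdr  { uint64_t hdr[3]; /* entry header, dev, buckets_ec, ... */ };

/* jset_u64s(): payload u64s plus the jset_entry header, assumed one u64 here. */
static unsigned jset_u64s(unsigned u64s)
{
        return u64s + 1;
}

int main(void)
{
        unsigned u64s     = 2 + 3 * 3;  /* 2 header fields + 3 data types, say */
        unsigned bytes    = jset_u64s(u64s) * sizeof(uint64_t);
        unsigned nr_types = (bytes - sizeof(struct usage_hdr)) /
                sizeof(struct usage_type);

        printf("entry bytes %u -> %u per-type counters\n", bytes, nr_types);
        /* entry bytes 96 -> 3 per-type counters */
        return 0;
}
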
@@ -769,13 +778,11 @@ static int journal_replay_early(struct bch_fs *c,
                                struct bch_sb_field_clean *clean,
                                struct list_head *journal)
 {
+       struct journal_replay *i;
        struct jset_entry *entry;
        int ret;
 
        if (clean) {
-               c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock);
-               c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock);
-
                for (entry = clean->start;
                     entry != vstruct_end(&clean->field);
                     entry = vstruct_next(entry)) {
@@ -784,18 +791,16 @@ static int journal_replay_early(struct bch_fs *c,
                                return ret;
                }
        } else {
-               struct journal_replay *i =
-                       list_last_entry(journal, struct journal_replay, list);
-
-               c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
-               c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);
+               list_for_each_entry(i, journal, list) {
+                       if (i->ignore)
+                               continue;
 
-               list_for_each_entry(i, journal, list)
                        vstruct_for_each(&i->j, entry) {
                                ret = journal_replay_entry_early(c, entry);
                                if (ret)
                                        return ret;
                        }
+               }
        }
 
        bch2_fs_usage_initialize(c);
@@ -844,9 +849,6 @@ static int verify_superblock_clean(struct bch_fs *c,
        struct bch_sb_field_clean *clean = *cleanp;
        int ret = 0;
 
-       if (!c->sb.clean || !j)
-               return 0;
-
        if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
                        "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
                        le64_to_cpu(clean->journal_seq),
@@ -856,13 +858,6 @@ static int verify_superblock_clean(struct bch_fs *c,
                return 0;
        }
 
-       mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
-                       "superblock read clock %u doesn't match journal %u after clean shutdown",
-                       clean->read_clock, j->read_clock);
-       mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
-                       "superblock write clock %u doesn't match journal %u after clean shutdown",
-                       clean->write_clock, j->write_clock);
-
        for (i = 0; i < BTREE_ID_NR; i++) {
                char buf1[200], buf2[200];
                struct bkey_i *k1, *k2;
@@ -913,9 +908,11 @@ static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c)
                return ERR_PTR(-ENOMEM);
        }
 
-       if (le16_to_cpu(c->disk_sb.sb->version) <
-           bcachefs_metadata_version_bkey_renumber)
-               bch2_sb_clean_renumber(clean, READ);
+       ret = bch2_sb_clean_validate(c, clean, READ);
+       if (ret) {
+               mutex_unlock(&c->sb_lock);
+               return ERR_PTR(ret);
+       }
 
        mutex_unlock(&c->sb_lock);
 
@@ -936,29 +933,29 @@ static int read_btree_roots(struct bch_fs *c)
                if (!r->alive)
                        continue;
 
-               if (i == BTREE_ID_ALLOC &&
+               if (i == BTREE_ID_alloc &&
                    c->opts.reconstruct_alloc) {
-                       c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
+                       c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
                        continue;
                }
 
                if (r->error) {
-                       __fsck_err(c, i == BTREE_ID_ALLOC
+                       __fsck_err(c, i == BTREE_ID_alloc
                                   ? FSCK_CAN_IGNORE : 0,
                                   "invalid btree root %s",
                                   bch2_btree_ids[i]);
-                       if (i == BTREE_ID_ALLOC)
-                               c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
+                       if (i == BTREE_ID_alloc)
+                               c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
                }
 
                ret = bch2_btree_root_read(c, i, &r->key, r->level);
                if (ret) {
-                       __fsck_err(c, i == BTREE_ID_ALLOC
+                       __fsck_err(c, i == BTREE_ID_alloc
                                   ? FSCK_CAN_IGNORE : 0,
                                   "error reading btree root %s",
                                   bch2_btree_ids[i]);
-                       if (i == BTREE_ID_ALLOC)
-                               c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
+                       if (i == BTREE_ID_alloc)
+                               c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
                }
        }
 
@@ -973,8 +970,9 @@ int bch2_fs_recovery(struct bch_fs *c)
 {
        const char *err = "cannot allocate memory";
        struct bch_sb_field_clean *clean = NULL;
-       u64 journal_seq;
-       bool write_sb = false, need_write_alloc = false;
+       struct jset *last_journal_entry = NULL;
+       u64 blacklist_seq, journal_seq;
+       bool write_sb = false;
        int ret;
 
        if (c->sb.clean)
@@ -987,30 +985,70 @@ int bch2_fs_recovery(struct bch_fs *c)
                bch_info(c, "recovering from clean shutdown, journal seq %llu",
                         le64_to_cpu(clean->journal_seq));
 
+       if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) {
+               bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported");
+               ret = -EINVAL;
+               goto err;
+       }
+
+       if (!c->sb.clean &&
+           !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) {
+               bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix");
+               ret = -EINVAL;
+               goto err;
+       }
+
+       if (!(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) {
+               bch_err(c, "filesystem may have incompatible bkey formats; run fsck from the compat branch to fix");
+               ret = -EINVAL;
+               goto err;
+
+       }
+
+       if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) {
+               bch_info(c, "alloc_v2 feature bit not set, fsck required");
+               c->opts.fsck = true;
+               c->opts.fix_errors = FSCK_OPT_YES;
+       }
+
        if (!c->replicas.entries ||
            c->opts.rebuild_replicas) {
                bch_info(c, "building replicas info");
                set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
        }
 
+       ret = bch2_blacklist_table_initialize(c);
+       if (ret) {
+               bch_err(c, "error initializing blacklist table");
+               goto err;
+       }
+
        if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
-               struct jset *j;
+               struct journal_replay *i;
 
-               ret = bch2_journal_read(c, &c->journal_entries);
+               ret = bch2_journal_read(c, &c->journal_entries,
+                                       &blacklist_seq, &journal_seq);
                if (ret)
                        goto err;
 
-               if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&c->journal_entries), c,
+               list_for_each_entry_reverse(i, &c->journal_entries, list)
+                       if (!i->ignore) {
+                               last_journal_entry = &i->j;
+                               break;
+                       }
+
+               if (mustfix_fsck_err_on(c->sb.clean &&
+                                       last_journal_entry &&
+                                       !journal_entry_empty(last_journal_entry), c,
                                "filesystem marked clean but journal not empty")) {
-                       c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
+                       c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
                        SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
                        c->sb.clean = false;
                }
 
-               if (!c->sb.clean && list_empty(&c->journal_entries)) {
-                       bch_err(c, "no journal entries found");
-                       ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
-                       goto err;
+               if (!last_journal_entry) {
+                       fsck_err_on(!c->sb.clean, c, "no journal entries found");
+                       goto use_clean;
                }
 
                c->journal_keys = journal_keys_sort(&c->journal_entries);
@@ -1019,27 +1057,25 @@ int bch2_fs_recovery(struct bch_fs *c)
                        goto err;
                }
 
-               j = &list_last_entry(&c->journal_entries,
-                                    struct journal_replay, list)->j;
-
-               ret = verify_superblock_clean(c, &clean, j);
-               if (ret)
-                       goto err;
-
-               journal_seq = le64_to_cpu(j->seq) + 1;
+               if (c->sb.clean && last_journal_entry) {
+                       ret = verify_superblock_clean(c, &clean,
+                                                     last_journal_entry);
+                       if (ret)
+                               goto err;
+               }
        } else {
-               journal_seq = le64_to_cpu(clean->journal_seq) + 1;
-       }
+use_clean:
+               if (!clean) {
+                       bch_err(c, "no superblock clean section found");
+                       ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
+                       goto err;
 
-       if (!c->sb.clean &&
-           !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) {
-               bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix");
-               ret = -EINVAL;
-               goto err;
+               }
+               blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1;
        }
 
        if (c->opts.reconstruct_alloc) {
-               c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
+               c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
                drop_alloc_keys(&c->journal_keys);
        }
 
@@ -1047,30 +1083,23 @@ int bch2_fs_recovery(struct bch_fs *c)
        if (ret)
                goto err;
 
-       if (!c->sb.clean) {
+       /*
+        * After an unclean shutdown, skip the next few journal sequence
+        * numbers as they may have been referenced by btree writes that
+        * happened before their corresponding journal writes - those btree
+        * writes need to be ignored, by skipping and blacklisting the next few
+        * journal sequence numbers:
+        */
+       if (!c->sb.clean)
+               journal_seq += 8;
+
+       if (blacklist_seq != journal_seq) {
                ret = bch2_journal_seq_blacklist_add(c,
-                                                    journal_seq,
-                                                    journal_seq + 4);
+                                       blacklist_seq, journal_seq);
                if (ret) {
                        bch_err(c, "error creating new journal seq blacklist entry");
                        goto err;
                }
-
-               journal_seq += 4;
-
-               /*
-                * The superblock needs to be written before we do any btree
-                * node writes: it will be in the read_write() path
-                */
-       }
-
-       ret = bch2_blacklist_table_initialize(c);
-
-       if (!list_empty(&c->journal_entries)) {
-               ret = verify_journal_entries_not_blacklisted_or_missing(c,
-                                                       &c->journal_entries);
-               if (ret)
-                       goto err;
        }
 
        ret = bch2_fs_journal_start(&c->journal, journal_seq,
@@ -1098,36 +1127,20 @@ int bch2_fs_recovery(struct bch_fs *c)
 
        set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
 
-       if ((c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) &&
-           !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA))) {
-               /*
-                * interior btree node updates aren't consistent with the
-                * journal; after an unclean shutdown we have to walk all
-                * pointers to metadata:
-                */
-               bch_info(c, "starting metadata mark and sweep");
-               err = "error in mark and sweep";
-               ret = bch2_gc(c, &c->journal_keys, true, true);
-               if (ret < 0)
-                       goto err;
-               if (ret)
-                       need_write_alloc = true;
-               bch_verbose(c, "mark and sweep done");
-       }
-
        if (c->opts.fsck ||
-           !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) ||
+           !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)) ||
+           !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_metadata)) ||
            test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) {
                bch_info(c, "starting mark and sweep");
                err = "error in mark and sweep";
-               ret = bch2_gc(c, &c->journal_keys, true, false);
-               if (ret < 0)
-                       goto err;
+               ret = bch2_gc(c, true);
                if (ret)
-                       need_write_alloc = true;
+                       goto err;
                bch_verbose(c, "mark and sweep done");
        }
 
+       bch2_stripes_heap_start(c);
+
        clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
        set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
 
@@ -1148,7 +1161,8 @@ int bch2_fs_recovery(struct bch_fs *c)
                goto err;
        bch_verbose(c, "journal replay done");
 
-       if (need_write_alloc && !c->opts.nochanges) {
+       if (test_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags) &&
+           !c->opts.nochanges) {
                /*
                 * note that even when filesystem was clean there might be work
                 * to do here, if we ran gc (because of fsck) which recalculated
@@ -1163,8 +1177,6 @@ int bch2_fs_recovery(struct bch_fs *c)
                        goto err;
                }
                bch_verbose(c, "alloc write done");
-
-               set_bit(BCH_FS_ALLOC_WRITTEN, &c->flags);
        }
 
        if (!c->sb.clean) {
@@ -1203,18 +1215,30 @@ int bch2_fs_recovery(struct bch_fs *c)
                bch_verbose(c, "quotas done");
        }
 
+       if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) ||
+           !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) {
+               struct bch_move_stats stats = { 0 };
+
+               bch_info(c, "scanning for old btree nodes");
+               ret = bch2_fs_read_write(c);
+               if (ret)
+                       goto err;
+
+               ret = bch2_scan_old_btree_nodes(c, &stats);
+               if (ret)
+                       goto err;
+               bch_info(c, "scanning for old btree nodes done");
+       }
+
        mutex_lock(&c->sb_lock);
        if (c->opts.version_upgrade) {
-               if (c->sb.version < bcachefs_metadata_version_new_versioning)
-                       c->disk_sb.sb->version_min =
-                               le16_to_cpu(bcachefs_metadata_version_min);
                c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current);
                c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL;
                write_sb = true;
        }
 
        if (!test_bit(BCH_FS_ERROR, &c->flags)) {
-               c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO;
+               c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_info;
                write_sb = true;
        }
 
@@ -1265,17 +1289,17 @@ int bch2_fs_initialize(struct bch_fs *c)
        bch_notice(c, "initializing new filesystem");
 
        mutex_lock(&c->sb_lock);
-       for_each_online_member(ca, c, i)
-               bch2_mark_dev_superblock(c, ca, 0);
-       mutex_unlock(&c->sb_lock);
+       c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_extents_above_btree_updates_done;
+       c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_bformat_overflow_done;
 
-       mutex_lock(&c->sb_lock);
-       c->disk_sb.sb->version = c->disk_sb.sb->version_min =
-               le16_to_cpu(bcachefs_metadata_version_current);
-       c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink;
-       c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL;
+       if (c->opts.version_upgrade) {
+               c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current);
+               c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL;
+               bch2_write_super(c);
+       }
 
-       bch2_write_super(c);
+       for_each_online_member(ca, c, i)
+               bch2_mark_dev_superblock(c, ca, 0);
        mutex_unlock(&c->sb_lock);
 
        set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
@@ -1320,10 +1344,11 @@ int bch2_fs_initialize(struct bch_fs *c)
        bch2_inode_init(c, &root_inode, 0, 0,
                        S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
        root_inode.bi_inum = BCACHEFS_ROOT_INO;
-       bch2_inode_pack(&packed_inode, &root_inode);
+       bch2_inode_pack(c, &packed_inode, &root_inode);
+       packed_inode.inode.k.p.snapshot = U32_MAX;
 
        err = "error creating root directory";
-       ret = bch2_btree_insert(c, BTREE_ID_INODES,
+       ret = bch2_btree_insert(c, BTREE_ID_inodes,
                                &packed_inode.inode.k_i,
                                NULL, NULL, 0);
        if (ret)
@@ -1338,8 +1363,10 @@ int bch2_fs_initialize(struct bch_fs *c)
                                  &lostfound,
                                  0, 0, S_IFDIR|0700, 0,
                                  NULL, NULL));
-       if (ret)
+       if (ret) {
+               bch_err(c, "error creating lost+found");
                goto err;
+       }
 
        if (enabled_qtypes(c)) {
                ret = bch2_fs_quota_read(c);
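
Note: the recovery and initialize hunks above gate work on single bits in a 64-bit compat mask (BCH_COMPAT_alloc_info, BCH_COMPAT_extents_above_btree_updates_done, and so on), testing them in c->sb.compat and setting them in c->disk_sb.sb->compat[0]. A minimal, self-contained sketch of that bit-flag pattern; the enum names and values below are placeholders, not the real on-disk numbering:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for the on-disk compat feature numbers: */
enum compat_feature {
        COMPAT_alloc_info,
        COMPAT_alloc_metadata,
        COMPAT_extents_above_btree_updates_done,
};

static bool compat_test(uint64_t compat, enum compat_feature f)
{
        return compat & (1ULL << f);
}

static void compat_set(uint64_t *compat, enum compat_feature f)
{
        *compat |= 1ULL << f;
}

int main(void)
{
        uint64_t compat = 0;

        /* recovery only trusts existing alloc info if the bit survived shutdown: */
        if (!compat_test(compat, COMPAT_alloc_info))
                printf("would run mark and sweep\n");

        /* ...and sets it again once the filesystem is known consistent: */
        compat_set(&compat, COMPAT_alloc_info);
        printf("alloc_info now set: %d\n", compat_test(compat, COMPAT_alloc_info));
        return 0;
}

In the hunks above the bit is only set again, and the superblock flagged for rewrite, when no errors were seen during recovery.
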
index a66827c9addf71a4b3eaeb08a0151e0d0c5cd9c2..fa91851b9ed7a2e890cb498b9012fe451c327813 100644
@@ -6,10 +6,11 @@
        for (i = (keys).d; i < (keys).d + (keys).nr; (i)++)
 
 struct journal_iter {
+       struct list_head        list;
        enum btree_id           btree_id;
        unsigned                level;
+       size_t                  idx;
        struct journal_keys     *keys;
-       struct journal_key      *k;
 };
 
 /*
@@ -17,8 +18,6 @@ struct journal_iter {
  */
 
 struct btree_and_journal_iter {
-       struct btree_iter       *btree;
-
        struct btree            *b;
        struct btree_node_iter  node_iter;
        struct bkey             unpacked;
@@ -32,16 +31,18 @@ struct btree_and_journal_iter {
        }                       last;
 };
 
+int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
+                           unsigned, struct bkey_i *);
+int bch2_journal_key_delete(struct bch_fs *, enum btree_id,
+                           unsigned, struct bpos);
+
 void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
 struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
 struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *);
 
-void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *,
-                                     struct btree_trans *,
-                                     struct journal_keys *,
-                                     enum btree_id, struct bpos);
+void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
 void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
-                                               struct journal_keys *,
+                                               struct bch_fs *,
                                                struct btree *);
 
 typedef int (*btree_walk_node_fn)(struct bch_fs *c, struct btree *b);
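
Note: the iterator structs above drop the raw journal_key pointer in favour of a list head plus an index into the keys array, which stays valid while bch2_journal_key_insert()/bch2_journal_key_delete() modify that array. Conceptually the combined iterator merges two sorted key streams and lets the journal copy shadow the btree copy at equal positions; a toy sketch of that merge, with plain integers standing in for bpos and none of the real API:

#include <stddef.h>
#include <stdio.h>

/*
 * Toy merge of "btree" keys with "journal" keys overlaid on top: equal
 * positions resolve in favour of the journal copy, which then shadows
 * the btree copy.
 */
static void merge_with_overlay(const int *btree, size_t nb,
                               const int *journal, size_t nj)
{
        size_t i = 0, j = 0;

        while (i < nb || j < nj) {
                if (j < nj && (i >= nb || journal[j] <= btree[i])) {
                        printf("journal %d\n", journal[j]);
                        if (i < nb && btree[i] == journal[j])
                                i++;    /* shadowed btree key */
                        j++;
                } else {
                        printf("btree %d\n", btree[i++]);
                }
        }
}

int main(void)
{
        const int btree[]   = { 1, 3, 5 };
        const int journal[] = { 3, 4 };

        merge_with_overlay(btree, 3, journal, 2);
        return 0;
}
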
index 8abcbfb3bd645c4ce337a6619d6944178c0ee41f..0978ad92614c649d61b3ed40cf0f89cb772f740c 100644
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "bcachefs.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
 #include "btree_update.h"
 #include "extents.h"
 #include "inode.h"
@@ -119,7 +119,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
        if (orig->k.type == KEY_TYPE_inline_data)
                bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data);
 
-       for_each_btree_key(trans, reflink_iter, BTREE_ID_REFLINK,
+       for_each_btree_key(trans, reflink_iter, BTREE_ID_reflink,
                           POS(0, c->reflink_hint),
                           BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) {
                if (reflink_iter->pos.inode) {
@@ -157,8 +157,10 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
        bch2_trans_update(trans, reflink_iter, r_v, 0);
 
        r_p = bch2_trans_kmalloc(trans, sizeof(*r_p));
-       if (IS_ERR(r_p))
-               return PTR_ERR(r_p);
+       if (IS_ERR(r_p)) {
+               ret = PTR_ERR(r_p);
+               goto err;
+       }
 
        orig->k.type = KEY_TYPE_reflink_p;
        r_p = bkey_i_to_reflink_p(orig);
@@ -198,16 +200,12 @@ s64 bch2_remap_range(struct bch_fs *c,
        struct btree_trans trans;
        struct btree_iter *dst_iter, *src_iter;
        struct bkey_s_c src_k;
-       BKEY_PADDED(k) new_dst;
-       struct bkey_on_stack new_src;
+       struct bkey_buf new_dst, new_src;
        struct bpos dst_end = dst_start, src_end = src_start;
        struct bpos dst_want, src_want;
        u64 src_done, dst_done;
        int ret = 0, ret2 = 0;
 
-       if (!c->opts.reflink)
-               return -EOPNOTSUPP;
-
        if (!percpu_ref_tryget(&c->writes))
                return -EROFS;
 
@@ -216,28 +214,27 @@ s64 bch2_remap_range(struct bch_fs *c,
        dst_end.offset += remap_sectors;
        src_end.offset += remap_sectors;
 
-       bkey_on_stack_init(&new_src);
+       bch2_bkey_buf_init(&new_dst);
+       bch2_bkey_buf_init(&new_src);
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096);
 
-       src_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start,
+       src_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, src_start,
                                       BTREE_ITER_INTENT);
-       dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, dst_start,
+       dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, dst_start,
                                       BTREE_ITER_INTENT);
 
-       while (1) {
+       while (ret == 0 || ret == -EINTR) {
                bch2_trans_begin(&trans);
 
-               trans.mem_top = 0;
-
                if (fatal_signal_pending(current)) {
                        ret = -EINTR;
-                       goto err;
+                       break;
                }
 
                src_k = get_next_src(src_iter, src_end);
                ret = bkey_err(src_k);
                if (ret)
-                       goto btree_err;
+                       continue;
 
                src_done = bpos_min(src_iter->pos, src_end).offset -
                        src_start.offset;
@@ -246,8 +243,6 @@ s64 bch2_remap_range(struct bch_fs *c,
                if (bkey_cmp(dst_iter->pos, dst_want) < 0) {
                        ret = bch2_fpunch_at(&trans, dst_iter, dst_want,
                                             journal_seq, i_sectors_delta);
-                       if (ret)
-                               goto btree_err;
                        continue;
                }
 
@@ -257,7 +252,7 @@ s64 bch2_remap_range(struct bch_fs *c,
                        break;
 
                if (src_k.k->type != KEY_TYPE_reflink_p) {
-                       bkey_on_stack_reassemble(&new_src, c, src_k);
+                       bch2_bkey_buf_reassemble(&new_src, c, src_k);
                        src_k = bkey_i_to_s_c(new_src.k);
 
                        bch2_cut_front(src_iter->pos,   new_src.k);
@@ -266,7 +261,7 @@ s64 bch2_remap_range(struct bch_fs *c,
                        ret = bch2_make_extent_indirect(&trans, src_iter,
                                                new_src.k);
                        if (ret)
-                               goto btree_err;
+                               continue;
 
                        BUG_ON(src_k.k->type != KEY_TYPE_reflink_p);
                }
@@ -275,7 +270,7 @@ s64 bch2_remap_range(struct bch_fs *c,
                        struct bkey_s_c_reflink_p src_p =
                                bkey_s_c_to_reflink_p(src_k);
                        struct bkey_i_reflink_p *dst_p =
-                               bkey_reflink_p_init(&new_dst.k);
+                               bkey_reflink_p_init(new_dst.k);
 
                        u64 offset = le64_to_cpu(src_p.v->idx) +
                                (src_iter->pos.offset -
@@ -286,29 +281,25 @@ s64 bch2_remap_range(struct bch_fs *c,
                        BUG();
                }
 
-               new_dst.k.k.p = dst_iter->pos;
-               bch2_key_resize(&new_dst.k.k,
+               new_dst.k->k.p = dst_iter->pos;
+               bch2_key_resize(&new_dst.k->k,
                                min(src_k.k->p.offset - src_iter->pos.offset,
                                    dst_end.offset - dst_iter->pos.offset));
 
-               ret = bch2_extent_update(&trans, dst_iter, &new_dst.k,
+               ret = bch2_extent_update(&trans, dst_iter, new_dst.k,
                                         NULL, journal_seq,
                                         new_i_size, i_sectors_delta);
                if (ret)
-                       goto btree_err;
+                       continue;
 
                dst_done = dst_iter->pos.offset - dst_start.offset;
                src_want = POS(src_start.inode, src_start.offset + dst_done);
                bch2_btree_iter_set_pos(src_iter, src_want);
-btree_err:
-               if (ret == -EINTR)
-                       ret = 0;
-               if (ret)
-                       goto err;
        }
+       bch2_trans_iter_put(&trans, dst_iter);
+       bch2_trans_iter_put(&trans, src_iter);
 
-       BUG_ON(bkey_cmp(dst_iter->pos, dst_end));
-err:
+       BUG_ON(!ret && bkey_cmp(dst_iter->pos, dst_end));
        BUG_ON(bkey_cmp(dst_iter->pos, dst_end) > 0);
 
        dst_done = dst_iter->pos.offset - dst_start.offset;
@@ -330,10 +321,13 @@ err:
                        ret2  = bch2_inode_write(&trans, inode_iter, &inode_u) ?:
                                bch2_trans_commit(&trans, NULL, journal_seq, 0);
                }
+
+               bch2_trans_iter_put(&trans, inode_iter);
        } while (ret2 == -EINTR);
 
        ret = bch2_trans_exit(&trans) ?: ret;
-       bkey_on_stack_exit(&new_src, c);
+       bch2_bkey_buf_exit(&new_src, c);
+       bch2_bkey_buf_exit(&new_dst, c);
 
        percpu_ref_put(&c->writes);
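
Note: the rewritten bch2_remap_range() loop above replaces the old btree_err label with a loop condition: every failing step just continues, and the loop keeps running while ret is 0 or -EINTR (a transaction restart), so only hard errors fall out. A rough sketch of that control-flow shape, with a hypothetical stub standing in for the real btree work:

#include <errno.h>
#include <stdio.h>

/*
 * Stub for one unit of transactional work: returns 0 on success, -EINTR
 * when the transaction had to restart, or another negative error.  This
 * one "restarts" on its first call and succeeds afterwards.
 */
static int do_one_step(int *attempts)
{
        return (*attempts)++ == 0 ? -EINTR : 0;
}

int main(void)
{
        int attempts = 0, steps_done = 0, ret = 0;

        /* keep going on success or restart; any other error falls out */
        while (ret == 0 || ret == -EINTR) {
                ret = do_one_step(&attempts);
                if (ret)
                        continue;       /* -EINTR: retry from the top */

                if (++steps_done == 3) /* pretend we reached the end position */
                        break;
        }

        printf("done: ret %d after %d attempts\n", ret, attempts);
        return 0;
}
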
 
index 91518c0d67948b5a8d3832dc2168f90763614953..1e297171b0fae4ae781a3081fc92a574a3b3dee8 100644
@@ -11,11 +11,6 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
 
 /* Replicas tracking - in memory: */
 
-static inline int u8_cmp(u8 l, u8 r)
-{
-       return cmp_int(l, r);
-}
-
 static void verify_replicas_entry(struct bch_replicas_entry *e)
 {
 #ifdef CONFIG_BCACHEFS_DEBUG
@@ -31,7 +26,7 @@ static void verify_replicas_entry(struct bch_replicas_entry *e)
 #endif
 }
 
-static void replicas_entry_sort(struct bch_replicas_entry *e)
+void bch2_replicas_entry_sort(struct bch_replicas_entry *e)
 {
        bubble_sort(e->devs, e->nr_devs, u8_cmp);
 }
@@ -127,7 +122,7 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
                break;
        }
 
-       replicas_entry_sort(e);
+       bch2_replicas_entry_sort(e);
 }
 
 void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
@@ -147,7 +142,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
        for (i = 0; i < devs.nr; i++)
                e->devs[e->nr_devs++] = devs.devs[i];
 
-       replicas_entry_sort(e);
+       bch2_replicas_entry_sort(e);
 }
 
 static struct bch_replicas_cpu
@@ -164,7 +159,7 @@ cpu_replicas_add_entry(struct bch_replicas_cpu *old,
        BUG_ON(!new_entry->data_type);
        verify_replicas_entry(new_entry);
 
-       new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO);
+       new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL);
        if (!new.entries)
                return new;
 
@@ -202,7 +197,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
 int bch2_replicas_entry_idx(struct bch_fs *c,
                            struct bch_replicas_entry *search)
 {
-       replicas_entry_sort(search);
+       bch2_replicas_entry_sort(search);
 
        return __replicas_entry_idx(&c->replicas, search);
 }
@@ -275,53 +270,57 @@ static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p,
 static int replicas_table_update(struct bch_fs *c,
                                 struct bch_replicas_cpu *new_r)
 {
-       struct bch_fs_usage __percpu *new_usage[2] = { NULL, NULL };
-       struct bch_fs_usage *new_scratch = NULL;
+       struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR];
+       struct bch_fs_usage_online *new_scratch = NULL;
        struct bch_fs_usage __percpu *new_gc = NULL;
        struct bch_fs_usage *new_base = NULL;
-       unsigned bytes = sizeof(struct bch_fs_usage) +
+       unsigned i, bytes = sizeof(struct bch_fs_usage) +
+               sizeof(u64) * new_r->nr;
+       unsigned scratch_bytes = sizeof(struct bch_fs_usage_online) +
                sizeof(u64) * new_r->nr;
-       int ret = -ENOMEM;
-
-       if (!(new_base = kzalloc(bytes, GFP_NOIO)) ||
-           !(new_usage[0] = __alloc_percpu_gfp(bytes, sizeof(u64),
-                                               GFP_NOIO)) ||
-           !(new_usage[1] = __alloc_percpu_gfp(bytes, sizeof(u64),
-                                               GFP_NOIO)) ||
-           !(new_scratch  = kmalloc(bytes, GFP_NOIO)) ||
+       int ret = 0;
+
+       memset(new_usage, 0, sizeof(new_usage));
+
+       for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+               if (!(new_usage[i] = __alloc_percpu_gfp(bytes,
+                                       sizeof(u64), GFP_KERNEL)))
+                       goto err;
+
+       if (!(new_base = kzalloc(bytes, GFP_KERNEL)) ||
+           !(new_scratch  = kmalloc(scratch_bytes, GFP_KERNEL)) ||
            (c->usage_gc &&
-            !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO)))) {
-               bch_err(c, "error updating replicas table: memory allocation failure");
+            !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL))))
                goto err;
-       }
 
+       for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+               if (c->usage[i])
+                       __replicas_table_update_pcpu(new_usage[i], new_r,
+                                                    c->usage[i], &c->replicas);
        if (c->usage_base)
                __replicas_table_update(new_base,               new_r,
                                        c->usage_base,          &c->replicas);
-       if (c->usage[0])
-               __replicas_table_update_pcpu(new_usage[0],      new_r,
-                                            c->usage[0],       &c->replicas);
-       if (c->usage[1])
-               __replicas_table_update_pcpu(new_usage[1],      new_r,
-                                            c->usage[1],       &c->replicas);
        if (c->usage_gc)
                __replicas_table_update_pcpu(new_gc,            new_r,
                                             c->usage_gc,       &c->replicas);
 
+       for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+               swap(c->usage[i],       new_usage[i]);
        swap(c->usage_base,     new_base);
-       swap(c->usage[0],       new_usage[0]);
-       swap(c->usage[1],       new_usage[1]);
        swap(c->usage_scratch,  new_scratch);
        swap(c->usage_gc,       new_gc);
        swap(c->replicas,       *new_r);
-       ret = 0;
-err:
+out:
        free_percpu(new_gc);
        kfree(new_scratch);
        free_percpu(new_usage[1]);
        free_percpu(new_usage[0]);
        kfree(new_base);
        return ret;
+err:
+       bch_err(c, "error updating replicas table: memory allocation failure");
+       ret = -ENOMEM;
+       goto out;
 }
 
 static unsigned reserve_journal_replicas(struct bch_fs *c,
@@ -465,6 +464,36 @@ static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k,
        return 0;
 }
 
+/* replicas delta list: */
+
+bool bch2_replicas_delta_list_marked(struct bch_fs *c,
+                                    struct replicas_delta_list *r)
+{
+       struct replicas_delta *d = r->d;
+       struct replicas_delta *top = (void *) r->d + r->used;
+
+       percpu_rwsem_assert_held(&c->mark_lock);
+
+       for (d = r->d; d != top; d = replicas_delta_next(d))
+               if (bch2_replicas_entry_idx(c, &d->r) < 0)
+                       return false;
+       return true;
+}
+
+int bch2_replicas_delta_list_mark(struct bch_fs *c,
+                                 struct replicas_delta_list *r)
+{
+       struct replicas_delta *d = r->d;
+       struct replicas_delta *top = (void *) r->d + r->used;
+       int ret = 0;
+
+       for (d = r->d; !ret && d != top; d = replicas_delta_next(d))
+               ret = bch2_mark_replicas(c, &d->r);
+       return ret;
+}
+
+/* bkey replicas: */
+
 bool bch2_bkey_replicas_marked(struct bch_fs *c,
                               struct bkey_s_c k)
 {
@@ -476,6 +505,11 @@ int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
        return __bch2_mark_bkey_replicas(c, k, false);
 }
 
+/*
+ * Old replicas_gc mechanism: only used for journal replicas entries now, should
+ * die at some point:
+ */
+
 int bch2_replicas_gc_end(struct bch_fs *c, int ret)
 {
        unsigned i;
@@ -496,9 +530,7 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret)
                struct bch_replicas_cpu n;
 
                if (!__replicas_has_entry(&c->replicas_gc, e) &&
-                   (c->usage_base->replicas[i] ||
-                    percpu_u64_get(&c->usage[0]->replicas[i]) ||
-                    percpu_u64_get(&c->usage[1]->replicas[i]))) {
+                   bch2_fs_usage_read_one(c, &c->usage_base->replicas[i])) {
                        n = cpu_replicas_add_entry(&c->replicas_gc, e);
                        if (!n.entries) {
                                ret = -ENOSPC;
@@ -553,7 +585,7 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
 
        c->replicas_gc.entries = kcalloc(c->replicas_gc.nr,
                                         c->replicas_gc.entry_size,
-                                        GFP_NOIO);
+                                        GFP_KERNEL);
        if (!c->replicas_gc.entries) {
                mutex_unlock(&c->sb_lock);
                bch_err(c, "error allocating c->replicas_gc");
@@ -571,6 +603,8 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
        return 0;
 }
 
+/* New much simpler mechanism for clearing out unneeded replicas entries: */
+
 int bch2_replicas_gc2(struct bch_fs *c)
 {
        struct bch_replicas_cpu new = { 0 };
@@ -605,7 +639,9 @@ retry:
                if (e->data_type == BCH_DATA_journal ||
                    c->usage_base->replicas[i] ||
                    percpu_u64_get(&c->usage[0]->replicas[i]) ||
-                   percpu_u64_get(&c->usage[1]->replicas[i]))
+                   percpu_u64_get(&c->usage[1]->replicas[i]) ||
+                   percpu_u64_get(&c->usage[2]->replicas[i]) ||
+                   percpu_u64_get(&c->usage[3]->replicas[i]))
                        memcpy(cpu_replicas_entry(&new, new.nr++),
                               e, new.entry_size);
        }
@@ -674,7 +710,7 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
                nr++;
        }
 
-       cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO);
+       cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
        if (!cpu_r->entries)
                return -ENOMEM;
 
@@ -684,7 +720,7 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
        for_each_replicas_entry(sb_r, e) {
                dst = cpu_replicas_entry(cpu_r, idx++);
                memcpy(dst, e, replicas_entry_bytes(e));
-               replicas_entry_sort(dst);
+               bch2_replicas_entry_sort(dst);
        }
 
        return 0;
@@ -706,7 +742,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
        entry_size += sizeof(struct bch_replicas_entry) -
                sizeof(struct bch_replicas_entry_v0);
 
-       cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO);
+       cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
        if (!cpu_r->entries)
                return -ENOMEM;
 
@@ -721,7 +757,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
                dst->nr_devs    = e->nr_devs;
                dst->nr_required = 1;
                memcpy(dst->devs, e->devs, e->nr_devs);
-               replicas_entry_sort(dst);
+               bch2_replicas_entry_sort(dst);
        }
 
        return 0;
@@ -961,92 +997,53 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
 
 /* Query replicas: */
 
-struct replicas_status __bch2_replicas_status(struct bch_fs *c,
-                                             struct bch_devs_mask online_devs)
+bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
+                          unsigned flags, bool print)
 {
-       struct bch_sb_field_members *mi;
        struct bch_replicas_entry *e;
-       unsigned i, nr_online, nr_offline;
-       struct replicas_status ret;
-
-       memset(&ret, 0, sizeof(ret));
-
-       for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
-               ret.replicas[i].redundancy = INT_MAX;
-
-       mi = bch2_sb_get_members(c->disk_sb.sb);
+       bool ret = true;
 
        percpu_down_read(&c->mark_lock);
-
        for_each_cpu_replicas_entry(&c->replicas, e) {
-               if (e->data_type >= ARRAY_SIZE(ret.replicas))
-                       panic("e %p data_type %u\n", e, e->data_type);
-
-               nr_online = nr_offline = 0;
+               unsigned i, nr_online = 0, nr_failed = 0, dflags = 0;
+               bool metadata = e->data_type < BCH_DATA_user;
 
                for (i = 0; i < e->nr_devs; i++) {
-                       BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi,
-                                               e->devs[i]));
+                       struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]);
 
-                       if (test_bit(e->devs[i], online_devs.d))
-                               nr_online++;
-                       else
-                               nr_offline++;
+                       nr_online += test_bit(e->devs[i], devs.d);
+                       nr_failed += ca->mi.state == BCH_MEMBER_STATE_failed;
                }
 
-               ret.replicas[e->data_type].redundancy =
-                       min(ret.replicas[e->data_type].redundancy,
-                           (int) nr_online - (int) e->nr_required);
-
-               ret.replicas[e->data_type].nr_offline =
-                       max(ret.replicas[e->data_type].nr_offline,
-                           nr_offline);
-       }
-
-       percpu_up_read(&c->mark_lock);
-
-       for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
-               if (ret.replicas[i].redundancy == INT_MAX)
-                       ret.replicas[i].redundancy = 0;
+               if (nr_failed == e->nr_devs)
+                       continue;
 
-       return ret;
-}
+               if (nr_online < e->nr_required)
+                       dflags |= metadata
+                               ? BCH_FORCE_IF_METADATA_LOST
+                               : BCH_FORCE_IF_DATA_LOST;
 
-struct replicas_status bch2_replicas_status(struct bch_fs *c)
-{
-       return __bch2_replicas_status(c, bch2_online_devs(c));
-}
+               if (nr_online < e->nr_devs)
+                       dflags |= metadata
+                               ? BCH_FORCE_IF_METADATA_DEGRADED
+                               : BCH_FORCE_IF_DATA_DEGRADED;
 
-static bool have_enough_devs(struct replicas_status s,
-                            enum bch_data_type type,
-                            bool force_if_degraded,
-                            bool force_if_lost)
-{
-       return (!s.replicas[type].nr_offline || force_if_degraded) &&
-               (s.replicas[type].redundancy >= 0 || force_if_lost);
-}
+               if (dflags & ~flags) {
+                       if (print) {
+                               char buf[100];
 
-bool bch2_have_enough_devs(struct replicas_status s, unsigned flags)
-{
-       return (have_enough_devs(s, BCH_DATA_journal,
-                                flags & BCH_FORCE_IF_METADATA_DEGRADED,
-                                flags & BCH_FORCE_IF_METADATA_LOST) &&
-               have_enough_devs(s, BCH_DATA_btree,
-                                flags & BCH_FORCE_IF_METADATA_DEGRADED,
-                                flags & BCH_FORCE_IF_METADATA_LOST) &&
-               have_enough_devs(s, BCH_DATA_user,
-                                flags & BCH_FORCE_IF_DATA_DEGRADED,
-                                flags & BCH_FORCE_IF_DATA_LOST));
-}
+                               bch2_replicas_entry_to_text(&PBUF(buf), e);
+                               bch_err(c, "insufficient devices online (%u) for replicas entry %s",
+                                       nr_online, buf);
+                       }
+                       ret = false;
+                       break;
+               }
 
-int bch2_replicas_online(struct bch_fs *c, bool meta)
-{
-       struct replicas_status s = bch2_replicas_status(c);
+       }
+       percpu_up_read(&c->mark_lock);
 
-       return (meta
-               ? min(s.replicas[BCH_DATA_journal].redundancy,
-                     s.replicas[BCH_DATA_btree].redundancy)
-               : s.replicas[BCH_DATA_user].redundancy) + 1;
+       return ret;
 }
 
 unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
@@ -1068,8 +1065,9 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
 
 int bch2_fs_replicas_init(struct bch_fs *c)
 {
-       c->journal.entry_u64s_reserved +=
-               reserve_journal_replicas(c, &c->replicas);
+       bch2_journal_entry_res_resize(&c->journal,
+                       &c->replicas_journal_res,
+                       reserve_journal_replicas(c, &c->replicas));
 
        return replicas_table_update(c, &c->replicas);
 }
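
Note: bch2_have_enough_devs() now walks every in-memory replicas entry, counts online and failed member devices, derives which FORCE_IF_* conditions the entry is in, and refuses unless the caller passed matching flags. A simplified sketch of that per-entry decision; the struct and flag names below are stand-ins, not the bcachefs types:

#include <stdbool.h>
#include <stdio.h>

#define FORCE_IF_DATA_DEGRADED  (1 << 0)
#define FORCE_IF_DATA_LOST      (1 << 1)

/* Simplified stand-in for one replicas entry: */
struct entry {
        unsigned nr_devs;
        unsigned nr_required;
        unsigned nr_online;     /* of nr_devs, how many are currently online */
};

/*
 * Per-entry "do we have enough devices" check: work out which degraded/
 * lost conditions the entry is in, then allow it only if the caller
 * passed the matching force flags.
 */
static bool entry_ok(const struct entry *e, unsigned allowed_flags)
{
        unsigned dflags = 0;

        if (e->nr_online < e->nr_required)
                dflags |= FORCE_IF_DATA_LOST;
        if (e->nr_online < e->nr_devs)
                dflags |= FORCE_IF_DATA_DEGRADED;

        return !(dflags & ~allowed_flags);
}

int main(void)
{
        struct entry e = { .nr_devs = 2, .nr_required = 1, .nr_online = 1 };

        printf("plain mount allowed:    %d\n", entry_ok(&e, 0));
        printf("degraded mount allowed: %d\n", entry_ok(&e, FORCE_IF_DATA_DEGRADED));
        return 0;
}

In the hunk above, entries whose devices have all failed are skipped entirely, so a fully failed device does not by itself block mounting when the matching force flags are given.
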
index 8b95164fbb56636fbee194ae4d61a0e4f63178bc..c77e873efc340555368b30786f74b308543556d2 100644
@@ -5,6 +5,7 @@
 #include "eytzinger.h"
 #include "replicas_types.h"
 
+void bch2_replicas_entry_sort(struct bch_replicas_entry *);
 void bch2_replicas_entry_to_text(struct printbuf *,
                                 struct bch_replicas_entry *);
 void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
@@ -25,6 +26,31 @@ bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *);
 int bch2_mark_replicas(struct bch_fs *,
                       struct bch_replicas_entry *);
 
+struct replicas_delta {
+       s64                     delta;
+       struct bch_replicas_entry r;
+} __packed;
+
+struct replicas_delta_list {
+       unsigned                size;
+       unsigned                used;
+
+       struct                  {} memset_start;
+       u64                     nr_inodes;
+       u64                     persistent_reserved[BCH_REPLICAS_MAX];
+       struct                  {} memset_end;
+       struct replicas_delta   d[0];
+};
+
+static inline struct replicas_delta *
+replicas_delta_next(struct replicas_delta *d)
+{
+       return (void *) d + replicas_entry_bytes(&d->r) + 8;
+}
+
+bool bch2_replicas_delta_list_marked(struct bch_fs *, struct replicas_delta_list *);
+int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *);
+
 void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c);
 bool bch2_bkey_replicas_marked(struct bch_fs *, struct bkey_s_c);
 int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c);
@@ -38,19 +64,9 @@ static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e,
        e->devs[0]      = dev;
 }
 
-struct replicas_status {
-       struct {
-               int             redundancy;
-               unsigned        nr_offline;
-       }                       replicas[BCH_DATA_NR];
-};
-
-struct replicas_status __bch2_replicas_status(struct bch_fs *,
-                                             struct bch_devs_mask);
-struct replicas_status bch2_replicas_status(struct bch_fs *);
-bool bch2_have_enough_devs(struct replicas_status, unsigned);
+bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask,
+                          unsigned, bool);
 
-int bch2_replicas_online(struct bch_fs *, bool);
 unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
 
 int bch2_replicas_gc_end(struct bch_fs *, int);
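
Note: replicas_delta_list packs variable-length replicas_delta records back to back, and replicas_delta_next() walks them by byte offset (the entry size plus the 8-byte delta). A small sketch of walking such a packed list; the record layout here is simplified and rounded up for alignment, whereas the real struct is __packed and walks raw byte offsets:

#include <stdalign.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Simplified variable-length record: a delta followed by a payload. */
struct rec {
        int64_t         delta;
        uint32_t        nr_bytes;       /* payload length */
        uint8_t         payload[];
};

static size_t rec_bytes(const struct rec *r)
{
        /*
         * Round each record up to the struct's alignment so the next
         * header lands on an aligned boundary; the kernel structure is
         * __packed and simply advances by entry bytes + 8 instead.
         */
        size_t b = sizeof(*r) + r->nr_bytes;

        return (b + alignof(struct rec) - 1) & ~(alignof(struct rec) - 1);
}

static struct rec *rec_next(struct rec *r)
{
        return (struct rec *) ((char *) r + rec_bytes(r));
}

int main(void)
{
        alignas(struct rec) char buf[64];
        struct rec *r = (struct rec *) buf;
        char *end;

        /* pack two records back to back, then walk them */
        r->delta = 5;   r->nr_bytes = 2; memcpy(r->payload, "ab", 2);
        r = rec_next(r);
        r->delta = -1;  r->nr_bytes = 1; memcpy(r->payload, "c", 1);
        end = (char *) rec_next(r);

        for (r = (struct rec *) buf; (char *) r != end; r = rec_next(r))
                printf("delta %lld, %u payload bytes\n",
                       (long long) r->delta, r->nr_bytes);
        return 0;
}
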
index dea9b7252b88b9a24e59f91f21cd53e44e1a6e2d..9f0bd44051991263b33e32eedff96aa07b0690db 100644
@@ -18,11 +18,11 @@ static inline enum bch_str_hash_type
 bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
 {
        switch (opt) {
-       case BCH_STR_HASH_OPT_CRC32C:
+       case BCH_STR_HASH_OPT_crc32c:
                return BCH_STR_HASH_CRC32C;
-       case BCH_STR_HASH_OPT_CRC64:
+       case BCH_STR_HASH_OPT_crc64:
                return BCH_STR_HASH_CRC64;
-       case BCH_STR_HASH_OPT_SIPHASH:
+       case BCH_STR_HASH_OPT_siphash:
                return c->sb.features & (1ULL << BCH_FEATURE_new_siphash)
                        ? BCH_STR_HASH_SIPHASH
                        : BCH_STR_HASH_SIPHASH_OLD;
@@ -156,7 +156,7 @@ bch2_hash_lookup(struct btree_trans *trans,
                if (k.k->type == desc.key_type) {
                        if (!desc.cmp_key(k, key))
                                return iter;
-               } else if (k.k->type == KEY_TYPE_whiteout) {
+               } else if (k.k->type == KEY_TYPE_hash_whiteout) {
                        ;
                } else {
                        /* hole, not found */
@@ -205,14 +205,12 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans,
        int ret;
 
        iter = bch2_trans_copy_iter(trans, start);
-       if (IS_ERR(iter))
-               return PTR_ERR(iter);
 
        bch2_btree_iter_next_slot(iter);
 
        for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) {
                if (k.k->type != desc.key_type &&
-                   k.k->type != KEY_TYPE_whiteout)
+                   k.k->type != KEY_TYPE_hash_whiteout)
                        break;
 
                if (k.k->type == desc.key_type &&
@@ -253,13 +251,10 @@ int bch2_hash_set(struct btree_trans *trans,
                }
 
                if (!slot &&
-                   !(flags & BCH_HASH_SET_MUST_REPLACE)) {
+                   !(flags & BCH_HASH_SET_MUST_REPLACE))
                        slot = bch2_trans_copy_iter(trans, iter);
-                       if (IS_ERR(slot))
-                               return PTR_ERR(slot);
-               }
 
-               if (k.k->type != KEY_TYPE_whiteout)
+               if (k.k->type != KEY_TYPE_hash_whiteout)
                        goto not_found;
        }
 
@@ -308,7 +303,7 @@ int bch2_hash_delete_at(struct btree_trans *trans,
 
        bkey_init(&delete->k);
        delete->k.p = iter->pos;
-       delete->k.type = ret ? KEY_TYPE_whiteout : KEY_TYPE_deleted;
+       delete->k.type = ret ? KEY_TYPE_hash_whiteout : KEY_TYPE_deleted;
 
        bch2_trans_update(trans, iter, delete, 0);
        return 0;
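
Note: the delete path above only leaves a KEY_TYPE_hash_whiteout when bch2_hash_needs_whiteout() finds later entries in the same probe chain; otherwise the slot reverts to KEY_TYPE_deleted. That is the usual tombstone rule for linearly probed hash tables, illustrated by this toy table (not the on-disk str_hash format):

#include <stdio.h>

#define TABLE_SIZE 8

enum slot_type { EMPTY, USED, WHITEOUT };

struct slot {
        enum slot_type  type;
        int             key;
};

static unsigned hash(int key)
{
        return (unsigned) key % TABLE_SIZE;
}

/*
 * Deleting a key: if the next slot in the probe chain is still occupied,
 * leave a whiteout so lookups keep probing past this slot; otherwise the
 * slot can simply become empty again.
 */
static void delete_key(struct slot *t, unsigned idx)
{
        unsigned next = (idx + 1) % TABLE_SIZE;

        t[idx].type = t[next].type != EMPTY ? WHITEOUT : EMPTY;
}

int main(void)
{
        struct slot t[TABLE_SIZE] = {{ EMPTY, 0 }};

        t[hash(1)] = (struct slot) { USED, 1 };  /* lands in slot 1 */
        t[2]       = (struct slot) { USED, 9 };  /* 9 also hashes to 1, probed to 2 */

        delete_key(t, 1);
        printf("slot 1 is now %s\n",
               t[1].type == WHITEOUT ? "a whiteout" : "empty");
        return 0;
}
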
index cee6cc9387340c7a4b288d3427fd3fa6b415eeb7..17936974d8a02afd6d766cfa97799a931d9a025b 100644
@@ -9,6 +9,7 @@
 #include "error.h"
 #include "io.h"
 #include "journal.h"
+#include "journal_io.h"
 #include "journal_seq_blacklist.h"
 #include "replicas.h"
 #include "quota.h"
@@ -276,19 +277,19 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb)
                return "Bad number of member devices";
 
        if (!BCH_SB_META_REPLICAS_WANT(sb) ||
-           BCH_SB_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
+           BCH_SB_META_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX)
                return "Invalid number of metadata replicas";
 
        if (!BCH_SB_META_REPLICAS_REQ(sb) ||
-           BCH_SB_META_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
+           BCH_SB_META_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX)
                return "Invalid number of metadata replicas";
 
        if (!BCH_SB_DATA_REPLICAS_WANT(sb) ||
-           BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
+           BCH_SB_DATA_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX)
                return "Invalid number of data replicas";
 
        if (!BCH_SB_DATA_REPLICAS_REQ(sb) ||
-           BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
+           BCH_SB_DATA_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX)
                return "Invalid number of data replicas";
 
        if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
@@ -361,6 +362,7 @@ static void bch2_sb_update(struct bch_fs *c)
        c->sb.uuid              = src->uuid;
        c->sb.user_uuid         = src->user_uuid;
        c->sb.version           = le16_to_cpu(src->version);
+       c->sb.version_min       = le16_to_cpu(src->version_min);
        c->sb.nr_devices        = src->nr_devices;
        c->sb.clean             = BCH_SB_CLEAN(src);
        c->sb.encryption_type   = BCH_SB_ENCRYPTION_TYPE(src);
@@ -375,7 +377,6 @@ static void bch2_sb_update(struct bch_fs *c)
                ca->mi = bch2_mi_to_cpu(mi->members + i);
 }
 
-/* doesn't copy member info */
 static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src)
 {
        struct bch_sb_field *src_f, *dst_f;
@@ -614,9 +615,6 @@ got_super:
            bdev_logical_block_size(sb->bdev))
                goto err;
 
-       if (sb->mode & FMODE_WRITE)
-               bdev_get_queue(sb->bdev)->backing_dev_info->capabilities
-                       |= BDI_CAP_STABLE_WRITES;
        ret = 0;
        sb->have_layout = true;
 out:
@@ -636,7 +634,7 @@ static void write_super_endio(struct bio *bio)
 
        /* XXX: return errors directly */
 
-       if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write: %s",
+       if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write error: %s",
                               bch2_blk_status_to_str(bio->bi_status)))
                ca->sb_write_error = 1;
 
@@ -712,6 +710,8 @@ int bch2_write_super(struct bch_fs *c)
        if (test_bit(BCH_FS_ERROR, &c->flags))
                SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1);
 
+       SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN);
+
        for_each_online_member(ca, c, i)
                bch2_sb_from_fs(c, ca);
 
@@ -770,15 +770,13 @@ int bch2_write_super(struct bch_fs *c)
        nr_wrote = dev_mask_nr(&sb_written);
 
        can_mount_with_written =
-               bch2_have_enough_devs(__bch2_replicas_status(c, sb_written),
-                                     BCH_FORCE_IF_DEGRADED);
+               bch2_have_enough_devs(c, sb_written, BCH_FORCE_IF_DEGRADED, false);
 
        for (i = 0; i < ARRAY_SIZE(sb_written.d); i++)
                sb_written.d[i] = ~sb_written.d[i];
 
        can_mount_without_written =
-               bch2_have_enough_devs(__bch2_replicas_status(c, sb_written),
-                                     BCH_FORCE_IF_DEGRADED);
+               bch2_have_enough_devs(c, sb_written, BCH_FORCE_IF_DEGRADED, false);
 
        /*
         * If we would be able to mount _without_ the devices we successfully
@@ -789,6 +787,7 @@ int bch2_write_super(struct bch_fs *c)
         * mount with the devices we did successfully write to:
         */
        if (bch2_fs_fatal_err_on(!nr_wrote ||
+                                !can_mount_with_written ||
                                 (can_mount_without_written &&
                                  !can_mount_with_written), c,
                "Unable to write superblock to sufficient devices"))
@@ -936,14 +935,23 @@ static const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
 
 /* BCH_SB_FIELD_clean: */
 
-void bch2_sb_clean_renumber(struct bch_sb_field_clean *clean, int write)
+int bch2_sb_clean_validate(struct bch_fs *c, struct bch_sb_field_clean *clean, int write)
 {
        struct jset_entry *entry;
+       int ret;
 
        for (entry = clean->start;
             entry < (struct jset_entry *) vstruct_end(&clean->field);
-            entry = vstruct_next(entry))
-               bch2_bkey_renumber(BKEY_TYPE_BTREE, bkey_to_packed(entry->start), write);
+            entry = vstruct_next(entry)) {
+               ret = bch2_journal_entry_validate(c, "superblock", entry,
+                                                 le16_to_cpu(c->disk_sb.sb->version),
+                                                 BCH_SB_BIG_ENDIAN(c->disk_sb.sb),
+                                                 write);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
 }
 
 int bch2_fs_mark_dirty(struct bch_fs *c)
@@ -957,104 +965,118 @@ int bch2_fs_mark_dirty(struct bch_fs *c)
 
        mutex_lock(&c->sb_lock);
        SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
-       c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite;
-       c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates;
-       c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_updates_journalled;
+       c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALWAYS;
        ret = bch2_write_super(c);
        mutex_unlock(&c->sb_lock);
 
        return ret;
 }
 
-static void
-entry_init_u64s(struct jset_entry *entry, unsigned u64s)
+static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
 {
-       memset(entry, 0, u64s * sizeof(u64));
+       struct jset_entry *entry = *end;
+       unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
 
+       memset(entry, 0, u64s * sizeof(u64));
        /*
         * The u64s field counts from the start of data, ignoring the shared
         * fields.
         */
        entry->u64s = u64s - 1;
-}
 
-static void
-entry_init_size(struct jset_entry *entry, size_t size)
-{
-       unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
-       entry_init_u64s(entry, u64s);
+       *end = vstruct_next(*end);
+       return entry;
 }
 
-struct jset_entry *
-bch2_journal_super_entries_add_common(struct bch_fs *c,
-                                     struct jset_entry *entry,
-                                     u64 journal_seq)
+void bch2_journal_super_entries_add_common(struct bch_fs *c,
+                                          struct jset_entry **end,
+                                          u64 journal_seq)
 {
-       unsigned i;
+       struct bch_dev *ca;
+       unsigned i, dev;
 
-       percpu_down_write(&c->mark_lock);
+       percpu_down_read(&c->mark_lock);
 
        if (!journal_seq) {
-               bch2_fs_usage_acc_to_base(c, 0);
-               bch2_fs_usage_acc_to_base(c, 1);
+               for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+                       bch2_fs_usage_acc_to_base(c, i);
        } else {
-               bch2_fs_usage_acc_to_base(c, journal_seq & 1);
+               bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK);
        }
 
        {
                struct jset_entry_usage *u =
-                       container_of(entry, struct jset_entry_usage, entry);
+                       container_of(jset_entry_init(end, sizeof(*u)),
+                                    struct jset_entry_usage, entry);
 
-               entry_init_size(entry, sizeof(*u));
                u->entry.type   = BCH_JSET_ENTRY_usage;
                u->entry.btree_id = FS_USAGE_INODES;
                u->v            = cpu_to_le64(c->usage_base->nr_inodes);
-
-               entry = vstruct_next(entry);
        }
 
        {
                struct jset_entry_usage *u =
-                       container_of(entry, struct jset_entry_usage, entry);
+                       container_of(jset_entry_init(end, sizeof(*u)),
+                                    struct jset_entry_usage, entry);
 
-               entry_init_size(entry, sizeof(*u));
                u->entry.type   = BCH_JSET_ENTRY_usage;
                u->entry.btree_id = FS_USAGE_KEY_VERSION;
                u->v            = cpu_to_le64(atomic64_read(&c->key_version));
-
-               entry = vstruct_next(entry);
        }
 
        for (i = 0; i < BCH_REPLICAS_MAX; i++) {
                struct jset_entry_usage *u =
-                       container_of(entry, struct jset_entry_usage, entry);
+                       container_of(jset_entry_init(end, sizeof(*u)),
+                                    struct jset_entry_usage, entry);
 
-               entry_init_size(entry, sizeof(*u));
                u->entry.type   = BCH_JSET_ENTRY_usage;
                u->entry.btree_id = FS_USAGE_RESERVED;
                u->entry.level  = i;
                u->v            = cpu_to_le64(c->usage_base->persistent_reserved[i]);
-
-               entry = vstruct_next(entry);
        }
 
        for (i = 0; i < c->replicas.nr; i++) {
                struct bch_replicas_entry *e =
                        cpu_replicas_entry(&c->replicas, i);
                struct jset_entry_data_usage *u =
-                       container_of(entry, struct jset_entry_data_usage, entry);
+                       container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs),
+                                    struct jset_entry_data_usage, entry);
 
-               entry_init_size(entry, sizeof(*u) + e->nr_devs);
                u->entry.type   = BCH_JSET_ENTRY_data_usage;
                u->v            = cpu_to_le64(c->usage_base->replicas[i]);
                memcpy(&u->r, e, replicas_entry_bytes(e));
+       }
 
-               entry = vstruct_next(entry);
+       for_each_member_device(ca, c, dev) {
+               unsigned b = sizeof(struct jset_entry_dev_usage) +
+                       sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR;
+               struct jset_entry_dev_usage *u =
+                       container_of(jset_entry_init(end, b),
+                                    struct jset_entry_dev_usage, entry);
+
+               u->entry.type = BCH_JSET_ENTRY_dev_usage;
+               u->dev = cpu_to_le32(dev);
+               u->buckets_ec           = cpu_to_le64(ca->usage_base->buckets_ec);
+               u->buckets_unavailable  = cpu_to_le64(ca->usage_base->buckets_unavailable);
+
+               for (i = 0; i < BCH_DATA_NR; i++) {
+                       u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets);
+                       u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors);
+                       u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented);
+               }
        }
 
-       percpu_up_write(&c->mark_lock);
+       percpu_up_read(&c->mark_lock);
 
-       return entry;
+       for (i = 0; i < 2; i++) {
+               struct jset_entry_clock *clock =
+                       container_of(jset_entry_init(end, sizeof(*clock)),
+                                    struct jset_entry_clock, entry);
+
+               clock->entry.type = BCH_JSET_ENTRY_clock;
+               clock->rw       = i;
+               clock->time     = atomic64_read(&c->io_clock[i].now);
+       }
 }
 
 void bch2_fs_mark_clean(struct bch_fs *c)
@@ -1062,6 +1084,7 @@ void bch2_fs_mark_clean(struct bch_fs *c)
        struct bch_sb_field_clean *sb_clean;
        struct jset_entry *entry;
        unsigned u64s;
+       int ret;
 
        mutex_lock(&c->sb_lock);
        if (BCH_SB_CLEAN(c->disk_sb.sb))
@@ -1069,8 +1092,8 @@ void bch2_fs_mark_clean(struct bch_fs *c)
 
        SET_BCH_SB_CLEAN(c->disk_sb.sb, true);
 
-       c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO;
-       c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA;
+       c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_info;
+       c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_metadata;
        c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates);
        c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_btree_updates_journalled);
 
@@ -1083,24 +1106,28 @@ void bch2_fs_mark_clean(struct bch_fs *c)
        }
 
        sb_clean->flags         = 0;
-       sb_clean->read_clock    = cpu_to_le16(c->bucket_clock[READ].hand);
-       sb_clean->write_clock   = cpu_to_le16(c->bucket_clock[WRITE].hand);
        sb_clean->journal_seq   = cpu_to_le64(journal_cur_seq(&c->journal) - 1);
 
        /* Trying to catch outstanding bug: */
        BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
 
        entry = sb_clean->start;
-       entry = bch2_journal_super_entries_add_common(c, entry, 0);
+       bch2_journal_super_entries_add_common(c, &entry, 0);
        entry = bch2_btree_roots_to_journal_entries(c, entry, entry);
        BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
 
        memset(entry, 0,
               vstruct_end(&sb_clean->field) - (void *) entry);
 
-       if (le16_to_cpu(c->disk_sb.sb->version) <
-           bcachefs_metadata_version_bkey_renumber)
-               bch2_sb_clean_renumber(sb_clean, WRITE);
+       /*
+        * this should be in the write path, and we should be validating every
+        * superblock section:
+        */
+       ret = bch2_sb_clean_validate(c, sb_clean, WRITE);
+       if (ret) {
+               bch_err(c, "error writing marking filesystem clean: validate error");
+               goto out;
+       }
 
        bch2_write_super(c);
 out:
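
Note: the new jset_entry_init() helper above replaces entry_init_u64s()/entry_init_size(): it rounds the requested byte size up to u64s, zeroes the entry, stores u64s minus the one-word shared header, and advances the caller's end pointer. The size arithmetic in isolation, as a small sketch (not the real jset_entry layout):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

/*
 * A journal-set entry's u64s field counts the 64-bit words after the
 * one-word shared header, so an entry of `size` bytes in total stores
 * DIV_ROUND_UP(size, 8) - 1.
 */
static unsigned entry_u64s_for_size(size_t size)
{
        return DIV_ROUND_UP(size, sizeof(uint64_t)) - 1;
}

int main(void)
{
        /* a 20-byte entry occupies 3 u64s; the header word is not counted */
        printf("size 20 -> u64s %u\n", entry_u64s_for_size(20));
        printf("size  8 -> u64s %u\n", entry_u64s_for_size(8));
        return 0;
}

bch2_journal_super_entries_add_common() then emits the usage, data-usage, per-device usage and clock entries through that one helper, each sized with sizeof of its entry type.
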
index 7a068158efcae906103b729488bc0ed83e2280a4..b64ac2fbbf8bde6fdaff7f258fc84de88ba9948d 100644
@@ -122,11 +122,10 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
 
 /* BCH_SB_FIELD_clean: */
 
-struct jset_entry *
-bch2_journal_super_entries_add_common(struct bch_fs *,
-                                     struct jset_entry *, u64);
+void bch2_journal_super_entries_add_common(struct bch_fs *,
+                                          struct jset_entry **, u64);
 
-void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int);
+int bch2_sb_clean_validate(struct bch_fs *, struct bch_sb_field_clean *, int);
 
 int bch2_fs_mark_dirty(struct bch_fs *);
 void bch2_fs_mark_clean(struct bch_fs *);
index 015bbd9f21fd933774c0982b285a42014da3ba35..670e9cdceb1eaeefc8e27e6bd8d694dacbd0eacd 100644
@@ -49,7 +49,6 @@
 #include <linux/device.h>
 #include <linux/genhd.h>
 #include <linux/idr.h>
-#include <linux/kthread.h>
 #include <linux/module.h>
 #include <linux/percpu.h>
 #include <linux/random.h>
@@ -149,6 +148,23 @@ struct bch_fs *bch2_uuid_to_fs(uuid_le uuid)
        return c;
 }
 
+static void bch2_dev_usage_journal_reserve(struct bch_fs *c)
+{
+       struct bch_dev *ca;
+       unsigned i, nr = 0, u64s =
+               ((sizeof(struct jset_entry_dev_usage) +
+                 sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR)) /
+               sizeof(u64);
+
+       rcu_read_lock();
+       for_each_member_device_rcu(ca, c, i, NULL)
+               nr++;
+       rcu_read_unlock();
+
+       bch2_journal_entry_res_resize(&c->journal,
+                       &c->dev_usage_journal_res, u64s * nr);
+}
+
 /* Filesystem RO/RW: */
 
 /*
@@ -175,9 +191,6 @@ static void __bch2_fs_read_only(struct bch_fs *c)
        bch2_copygc_stop(c);
        bch2_gc_thread_stop(c);
 
-       bch2_io_timer_del(&c->io_clock[READ], &c->bucket_clock[READ].rescale);
-       bch2_io_timer_del(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale);
-
        /*
         * Flush journal before stopping allocators, because flushing journal
         * blacklist entries involves allocating new btree nodes:
@@ -236,10 +249,7 @@ nowrote_alloc:
         * the journal kicks off btree writes via reclaim - wait for in flight
         * writes after stopping journal:
         */
-       if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
-               bch2_btree_flush_all_writes(c);
-       else
-               bch2_btree_verify_flushed(c);
+       bch2_btree_flush_all_writes(c);
 
        /*
         * After stopping journal:
@@ -259,7 +269,7 @@ static void bch2_writes_disabled(struct percpu_ref *writes)
 void bch2_fs_read_only(struct bch_fs *c)
 {
        if (!test_bit(BCH_FS_RW, &c->flags)) {
-               cancel_delayed_work_sync(&c->journal.reclaim_work);
+               BUG_ON(c->journal.reclaim_thread);
                return;
        }
 
@@ -386,6 +396,8 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
             (!early || c->opts.read_only)))
                return -EROFS;
 
+       bch_info(c, "going read-write");
+
        ret = bch2_fs_mark_dirty(c);
        if (ret)
                goto err;
@@ -403,9 +415,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
                bch2_dev_allocator_add(c, ca);
        bch2_recalc_capacity(c);
 
-       bch2_io_timer_add(&c->io_clock[READ], &c->bucket_clock[READ].rescale);
-       bch2_io_timer_add(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale);
-
        for_each_rw_member(ca, c, i) {
                ret = bch2_dev_allocator_start(ca);
                if (ret) {
@@ -417,6 +426,15 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
 
        set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
 
+       for_each_rw_member(ca, c, i)
+               bch2_wake_allocator(ca);
+
+       ret = bch2_journal_reclaim_start(&c->journal);
+       if (ret) {
+               bch_err(c, "error starting journal reclaim: %i", ret);
+               return ret;
+       }
+
        if (!early) {
                ret = bch2_fs_read_write_late(c);
                if (ret)
@@ -425,9 +443,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
 
        percpu_ref_reinit(&c->writes);
        set_bit(BCH_FS_RW, &c->flags);
-
-       queue_delayed_work(c->journal_reclaim_wq,
-                          &c->journal.reclaim_work, 0);
        return 0;
 err:
        __bch2_fs_read_only(c);
@@ -451,6 +466,7 @@ int bch2_fs_read_write_early(struct bch_fs *c)
 static void __bch2_fs_free(struct bch_fs *c)
 {
        unsigned i;
+       int cpu;
 
        for (i = 0; i < BCH_TIME_STAT_NR; i++)
                bch2_time_stats_exit(&c->times[i]);
@@ -472,9 +488,16 @@ static void __bch2_fs_free(struct bch_fs *c)
        bch2_journal_entries_free(&c->journal_entries);
        percpu_free_rwsem(&c->mark_lock);
        kfree(c->usage_scratch);
-       free_percpu(c->usage[1]);
-       free_percpu(c->usage[0]);
+       for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+               free_percpu(c->usage[i]);
        kfree(c->usage_base);
+
+       if (c->btree_iters_bufs)
+               for_each_possible_cpu(cpu)
+                       kfree(per_cpu_ptr(c->btree_iters_bufs, cpu)->iter);
+
+       free_percpu(c->online_reserved);
+       free_percpu(c->btree_iters_bufs);
        free_percpu(c->pcpu);
        mempool_exit(&c->large_bkey_pool);
        mempool_exit(&c->btree_bounce_pool);
@@ -485,10 +508,9 @@ static void __bch2_fs_free(struct bch_fs *c)
        kfree(c->replicas_gc.entries);
        kfree(rcu_dereference_protected(c->disk_groups, 1));
        kfree(c->journal_seq_blacklist_table);
+       kfree(c->unused_inode_hints);
        free_heap(&c->copygc_heap);
 
-       if (c->journal_reclaim_wq)
-               destroy_workqueue(c->journal_reclaim_wq);
        if (c->copygc_wq)
                destroy_workqueue(c->copygc_wq);
        if (c->wq)
@@ -679,6 +701,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
                  bch2_blacklist_entries_gc);
 
        INIT_LIST_HEAD(&c->journal_entries);
+       INIT_LIST_HEAD(&c->journal_iters);
 
        INIT_LIST_HEAD(&c->fsck_errors);
        mutex_init(&c->fsck_error_lock);
@@ -708,6 +731,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 
        bch2_fs_btree_cache_init_early(&c->btree_cache);
 
+       mutex_init(&c->sectors_available_lock);
+
        if (percpu_init_rwsem(&c->mark_lock))
                goto err;
 
@@ -736,12 +761,12 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
                (btree_blocks(c) + 1) * 2 *
                sizeof(struct sort_iter_set);
 
+       c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus()));
+
        if (!(c->wq = alloc_workqueue("bcachefs",
                                WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
-           !(c->copygc_wq = alloc_workqueue("bcache_copygc",
+           !(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
                                WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
-           !(c->journal_reclaim_wq = alloc_workqueue("bcache_journal",
-                               WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
            percpu_ref_init(&c->writes, bch2_writes_disabled,
                            PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
            mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
@@ -750,9 +775,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
                            offsetof(struct btree_write_bio, wbio.bio)),
                        BIOSET_NEED_BVECS) ||
            !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
+           !(c->btree_iters_bufs = alloc_percpu(struct btree_iter_buf)) ||
+           !(c->online_reserved = alloc_percpu(u64)) ||
            mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
                                        btree_bytes(c)) ||
            mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
+           !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
+                                             sizeof(u64), GFP_KERNEL)) ||
            bch2_io_clock_init(&c->io_clock[READ]) ||
            bch2_io_clock_init(&c->io_clock[WRITE]) ||
            bch2_fs_journal_init(&c->journal) ||
@@ -774,6 +803,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
                    bch2_dev_alloc(c, i))
                        goto err;
 
+       bch2_journal_entry_res_resize(&c->journal,
+                       &c->btree_root_journal_res,
+                       BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX));
+       bch2_dev_usage_journal_reserve(c);
+       bch2_journal_entry_res_resize(&c->journal,
+                       &c->clock_journal_res,
+                       (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2);
+
        mutex_lock(&bch_fs_list_lock);
        err = bch2_fs_online(c);
        mutex_unlock(&bch_fs_list_lock);
@@ -971,6 +1008,8 @@ static void bch2_dev_release(struct kobject *kobj)
 
 static void bch2_dev_free(struct bch_dev *ca)
 {
+       bch2_dev_allocator_stop(ca);
+
        cancel_work_sync(&ca->io_error_work);
 
        if (ca->kobj.state_in_sysfs &&
@@ -1139,6 +1178,14 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
        if (!ca)
                goto err;
 
+       ca->fs = c;
+
+       if (ca->mi.state == BCH_MEMBER_STATE_rw &&
+           bch2_dev_allocator_start(ca)) {
+               bch2_dev_free(ca);
+               goto err;
+       }
+
        bch2_dev_attach(c, ca, dev_idx);
 out:
        pr_verbose_init(c->opts, "ret %i", ret);
@@ -1209,13 +1256,6 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
        if (ret)
                return ret;
 
-       if (test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags) &&
-           !percpu_u64_get(&ca->usage[0]->buckets[BCH_DATA_sb])) {
-               mutex_lock(&c->sb_lock);
-               bch2_mark_dev_superblock(ca->fs, ca, 0);
-               mutex_unlock(&c->sb_lock);
-       }
-
        bch2_dev_sysfs_online(c, ca);
 
        if (c->sb.nr_devices == 1)
@@ -1241,23 +1281,22 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
                            enum bch_member_state new_state, int flags)
 {
        struct bch_devs_mask new_online_devs;
-       struct replicas_status s;
        struct bch_dev *ca2;
        int i, nr_rw = 0, required;
 
        lockdep_assert_held(&c->state_lock);
 
        switch (new_state) {
-       case BCH_MEMBER_STATE_RW:
+       case BCH_MEMBER_STATE_rw:
                return true;
-       case BCH_MEMBER_STATE_RO:
-               if (ca->mi.state != BCH_MEMBER_STATE_RW)
+       case BCH_MEMBER_STATE_ro:
+               if (ca->mi.state != BCH_MEMBER_STATE_rw)
                        return true;
 
                /* do we have enough devices to write to?  */
                for_each_member_device(ca2, c, i)
                        if (ca2 != ca)
-                               nr_rw += ca2->mi.state == BCH_MEMBER_STATE_RW;
+                               nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw;
 
                required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
                               ? c->opts.metadata_replicas
@@ -1267,19 +1306,17 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
                               : c->opts.data_replicas_required);
 
                return nr_rw >= required;
-       case BCH_MEMBER_STATE_FAILED:
-       case BCH_MEMBER_STATE_SPARE:
-               if (ca->mi.state != BCH_MEMBER_STATE_RW &&
-                   ca->mi.state != BCH_MEMBER_STATE_RO)
+       case BCH_MEMBER_STATE_failed:
+       case BCH_MEMBER_STATE_spare:
+               if (ca->mi.state != BCH_MEMBER_STATE_rw &&
+                   ca->mi.state != BCH_MEMBER_STATE_ro)
                        return true;
 
                /* do we have enough devices to read from?  */
                new_online_devs = bch2_online_devs(c);
                __clear_bit(ca->dev_idx, new_online_devs.d);
 
-               s = __bch2_replicas_status(c, new_online_devs);
-
-               return bch2_have_enough_devs(s, flags);
+               return bch2_have_enough_devs(c, new_online_devs, flags, false);
        default:
                BUG();
        }
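
Note on the BCH_MEMBER_STATE_RW -> BCH_MEMBER_STATE_rw rename running through this file (and the matching BTREE_ID_ALLOC -> BTREE_ID_alloc, BTREE_ID_EXTENTS -> BTREE_ID_extents renames later in the diff): lower-case tags let one x-macro list generate both the enum and the bch2_member_states[] name table that __bch2_dev_set_state() now prints from. A hedged sketch of that idiom; the actual macro in bcachefs_format.h may differ in detail:

    #include <stdio.h>

    /* Illustrative x-macro sketch -- not copied from bcachefs_format.h. */
    #define DEMO_MEMBER_STATES() \
        x(rw)     \
        x(ro)     \
        x(failed) \
        x(spare)

    enum demo_member_state {
    #define x(t) DEMO_MEMBER_STATE_##t,
        DEMO_MEMBER_STATES()
    #undef x
        DEMO_MEMBER_STATE_NR
    };

    static const char * const demo_member_states[] = {
    #define x(t) #t,
        DEMO_MEMBER_STATES()
    #undef x
        NULL
    };

    int main(void)
    {
        printf("%s\n", demo_member_states[DEMO_MEMBER_STATE_ro]); /* "ro" */
        return 0;
    }
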
@@ -1287,14 +1324,18 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
 
 static bool bch2_fs_may_start(struct bch_fs *c)
 {
-       struct replicas_status s;
        struct bch_sb_field_members *mi;
        struct bch_dev *ca;
-       unsigned i, flags = c->opts.degraded
-               ? BCH_FORCE_IF_DEGRADED
-               : 0;
+       unsigned i, flags = 0;
+
+       if (c->opts.very_degraded)
+               flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST;
+
+       if (c->opts.degraded)
+               flags |= BCH_FORCE_IF_DEGRADED;
 
-       if (!c->opts.degraded) {
+       if (!c->opts.degraded &&
+           !c->opts.very_degraded) {
                mutex_lock(&c->sb_lock);
                mi = bch2_sb_get_members(c->disk_sb.sb);
 
@@ -1305,8 +1346,8 @@ static bool bch2_fs_may_start(struct bch_fs *c)
                        ca = bch_dev_locked(c, i);
 
                        if (!bch2_dev_is_online(ca) &&
-                           (ca->mi.state == BCH_MEMBER_STATE_RW ||
-                            ca->mi.state == BCH_MEMBER_STATE_RO)) {
+                           (ca->mi.state == BCH_MEMBER_STATE_rw ||
+                            ca->mi.state == BCH_MEMBER_STATE_ro)) {
                                mutex_unlock(&c->sb_lock);
                                return false;
                        }
@@ -1314,9 +1355,7 @@ static bool bch2_fs_may_start(struct bch_fs *c)
                mutex_unlock(&c->sb_lock);
        }
 
-       s = bch2_replicas_status(c);
-
-       return bch2_have_enough_devs(s, flags);
+       return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true);
 }
 
 static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
@@ -1341,7 +1380,7 @@ static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
 {
        lockdep_assert_held(&c->state_lock);
 
-       BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW);
+       BUG_ON(ca->mi.state != BCH_MEMBER_STATE_rw);
 
        bch2_dev_allocator_add(c, ca);
        bch2_recalc_capacity(c);
@@ -1364,10 +1403,10 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
        if (!bch2_dev_state_allowed(c, ca, new_state, flags))
                return -EINVAL;
 
-       if (new_state != BCH_MEMBER_STATE_RW)
+       if (new_state != BCH_MEMBER_STATE_rw)
                __bch2_dev_read_only(c, ca);
 
-       bch_notice(ca, "%s", bch2_dev_state[new_state]);
+       bch_notice(ca, "%s", bch2_member_states[new_state]);
 
        mutex_lock(&c->sb_lock);
        mi = bch2_sb_get_members(c->disk_sb.sb);
@@ -1375,7 +1414,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
        bch2_write_super(c);
        mutex_unlock(&c->sb_lock);
 
-       if (new_state == BCH_MEMBER_STATE_RW &&
+       if (new_state == BCH_MEMBER_STATE_rw &&
            __bch2_dev_read_write(c, ca))
                ret = -ENOMEM;
 
@@ -1408,7 +1447,7 @@ int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
 
        for (i = 0; i < ca->mi.nbuckets; i++) {
                ret = bch2_btree_key_cache_flush(&trans,
-                               BTREE_ID_ALLOC, POS(ca->dev_idx, i));
+                               BTREE_ID_alloc, POS(ca->dev_idx, i));
                if (ret)
                        break;
        }
@@ -1417,7 +1456,7 @@ int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
        if (ret)
                return ret;
 
-       return bch2_btree_delete_range(c, BTREE_ID_ALLOC,
+       return bch2_btree_delete_range(c, BTREE_ID_alloc,
                                       POS(ca->dev_idx, 0),
                                       POS(ca->dev_idx + 1, 0),
                                       NULL);
@@ -1437,7 +1476,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
         */
        percpu_ref_put(&ca->ref);
 
-       if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) {
+       if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
                bch_err(ca, "Cannot remove without losing data");
                goto err;
        }
@@ -1517,28 +1556,17 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
 
        mutex_unlock(&c->sb_lock);
        up_write(&c->state_lock);
+
+       bch2_dev_usage_journal_reserve(c);
        return 0;
 err:
-       if (ca->mi.state == BCH_MEMBER_STATE_RW &&
+       if (ca->mi.state == BCH_MEMBER_STATE_rw &&
            !percpu_ref_is_zero(&ca->io_ref))
                __bch2_dev_read_write(c, ca);
        up_write(&c->state_lock);
        return ret;
 }
 
-static void dev_usage_clear(struct bch_dev *ca)
-{
-       struct bucket_array *buckets;
-
-       percpu_memset(ca->usage[0], 0, sizeof(*ca->usage[0]));
-
-       down_read(&ca->bucket_lock);
-       buckets = bucket_array(ca);
-
-       memset(buckets->b, 0, sizeof(buckets->b[0]) * buckets->nbuckets);
-       up_read(&ca->bucket_lock);
-}
-
 /* Add new device to running filesystem: */
 int bch2_dev_add(struct bch_fs *c, const char *path)
 {
@@ -1589,15 +1617,13 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
         * allocate the journal, reset all the marks, then remark after we
         * attach...
         */
-       bch2_mark_dev_superblock(ca->fs, ca, 0);
+       bch2_mark_dev_superblock(NULL, ca, 0);
 
        err = "journal alloc failed";
        ret = bch2_dev_journal_alloc(ca);
        if (ret)
                goto err;
 
-       dev_usage_clear(ca);
-
        down_write(&c->state_lock);
        mutex_lock(&c->sb_lock);
 
@@ -1648,17 +1674,17 @@ have_slot:
        ca->disk_sb.sb->dev_idx = dev_idx;
        bch2_dev_attach(c, ca, dev_idx);
 
-       bch2_mark_dev_superblock(c, ca, 0);
-
        bch2_write_super(c);
        mutex_unlock(&c->sb_lock);
 
-       err = "alloc write failed";
-       ret = bch2_dev_alloc_write(c, ca, 0);
+       bch2_dev_usage_journal_reserve(c);
+
+       err = "error marking superblock";
+       ret = bch2_trans_mark_dev_sb(c, NULL, ca);
        if (ret)
-               goto err;
+               goto err_late;
 
-       if (ca->mi.state == BCH_MEMBER_STATE_RW) {
+       if (ca->mi.state == BCH_MEMBER_STATE_rw) {
                err = __bch2_dev_read_write(c, ca);
                if (err)
                        goto err_late;
@@ -1677,6 +1703,7 @@ err:
        bch_err(c, "Unable to add device: %s", err);
        return ret;
 err_late:
+       up_write(&c->state_lock);
        bch_err(c, "Error going rw after adding device: %s", err);
        return -EINVAL;
 }
@@ -1712,7 +1739,13 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
        }
 
        ca = bch_dev_locked(c, dev_idx);
-       if (ca->mi.state == BCH_MEMBER_STATE_RW) {
+
+       if (bch2_trans_mark_dev_sb(c, NULL, ca)) {
+               err = "bch2_trans_mark_dev_sb() error";
+               goto err;
+       }
+
+       if (ca->mi.state == BCH_MEMBER_STATE_rw) {
                err = __bch2_dev_read_write(c, ca);
                if (err)
                        goto err;
@@ -1746,7 +1779,7 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
                return 0;
        }
 
-       if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) {
+       if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
                bch_err(ca, "Cannot offline required disk");
                up_write(&c->state_lock);
                return -EINVAL;
@@ -2005,6 +2038,7 @@ static void bcachefs_exit(void)
        bch2_debug_exit();
        bch2_vfs_exit();
        bch2_chardev_exit();
+       bch2_btree_key_cache_exit();
        if (bcachefs_kset)
                kset_unregister(bcachefs_kset);
 }
@@ -2012,9 +2046,9 @@ static void bcachefs_exit(void)
 static int __init bcachefs_init(void)
 {
        bch2_bkey_pack_test();
-       bch2_inode_pack_test();
 
        if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) ||
+           bch2_btree_key_cache_init() ||
            bch2_chardev_init() ||
            bch2_vfs_init() ||
            bch2_debug_init())
index 02c81f3555c3559d7a7d2b2f65bce28ded807899..bef27906e4809d5d5a049746be96c84f13d71195 100644 (file)
@@ -34,7 +34,7 @@ static inline bool bch2_dev_is_online(struct bch_dev *ca)
 static inline bool bch2_dev_is_readable(struct bch_dev *ca)
 {
        return bch2_dev_is_online(ca) &&
-               ca->mi.state != BCH_MEMBER_STATE_FAILED;
+               ca->mi.state != BCH_MEMBER_STATE_failed;
 }
 
 static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw)
@@ -42,8 +42,8 @@ static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw)
        if (!percpu_ref_tryget(&ca->io_ref))
                return false;
 
-       if (ca->mi.state == BCH_MEMBER_STATE_RW ||
-           (ca->mi.state == BCH_MEMBER_STATE_RO && rw == READ))
+       if (ca->mi.state == BCH_MEMBER_STATE_rw ||
+           (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ))
                return true;
 
        percpu_ref_put(&ca->io_ref);
@@ -158,11 +158,11 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
        __for_each_online_member(ca, c, iter, ~0)
 
 #define for_each_rw_member(ca, c, iter)                                        \
-       __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_RW)
+       __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw)
 
 #define for_each_readable_member(ca, c, iter)                          \
        __for_each_online_member(ca, c, iter,                           \
-               (1 << BCH_MEMBER_STATE_RW)|(1 << BCH_MEMBER_STATE_RO))
+               (1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro))
 
 /*
  * If a key exists that references a device, the device won't be going away and
index 20406ebd6f5bad7cd89252a293366262155e9cef..069973a38f12d7b1398c1b12a858d5a7e263be7c 100644 (file)
@@ -20,7 +20,7 @@ struct bch_devs_mask {
 
 struct bch_devs_list {
        u8                      nr;
-       u8                      devs[BCH_REPLICAS_MAX + 1];
+       u8                      devs[BCH_BKEY_PTRS_MAX];
 };
 
 struct bch_member_cpu {
index 0cb29f43d99d9a28c6e70139bc637d45b1ba0d6f..2d008979b256011435ea8312628e3925d48c8cff 100644 (file)
@@ -153,6 +153,8 @@ read_attribute(io_latency_stats_read);
 read_attribute(io_latency_stats_write);
 read_attribute(congested);
 
+read_attribute(btree_avg_write_size);
+
 read_attribute(bucket_quantiles_last_read);
 read_attribute(bucket_quantiles_last_write);
 read_attribute(bucket_quantiles_fragmentation);
@@ -165,6 +167,7 @@ read_attribute(journal_debug);
 read_attribute(journal_pins);
 read_attribute(btree_updates);
 read_attribute(dirty_btree_nodes);
+read_attribute(btree_cache);
 read_attribute(btree_key_cache);
 read_attribute(btree_transactions);
 read_attribute(stripes_heap);
@@ -198,9 +201,6 @@ read_attribute(new_stripes);
 
 rw_attribute(pd_controllers_update_seconds);
 
-read_attribute(meta_replicas_have);
-read_attribute(data_replicas_have);
-
 read_attribute(io_timers_read);
 read_attribute(io_timers_write);
 
@@ -208,12 +208,6 @@ read_attribute(io_timers_write);
 write_attribute(perf_test);
 #endif /* CONFIG_BCACHEFS_TESTS */
 
-#define BCH_DEBUG_PARAM(name, description)                             \
-       rw_attribute(name);
-
-       BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
 #define x(_name)                                               \
        static struct attribute sysfs_time_stat_##_name =               \
                { .name = #_name, .mode = S_IRUGO };
@@ -238,9 +232,17 @@ static size_t bch2_btree_cache_size(struct bch_fs *c)
        return ret;
 }
 
+static size_t bch2_btree_avg_write_size(struct bch_fs *c)
+{
+       u64 nr = atomic64_read(&c->btree_writes_nr);
+       u64 sectors = atomic64_read(&c->btree_writes_sectors);
+
+       return nr ? div64_u64(sectors, nr) : 0;
+}
+
 static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
 {
-       struct bch_fs_usage *fs_usage = bch2_fs_usage_read(c);
+       struct bch_fs_usage_online *fs_usage = bch2_fs_usage_read(c);
 
        if (!fs_usage)
                return -ENOMEM;
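
Note: the new bch2_btree_avg_write_size() above reports the mean btree write size as total sectors over number of writes, with the nr check avoiding a divide by zero before any writes have happened (div64_u64() is the kernel's 64-bit division helper). A trivial standalone illustration, with plain '/' standing in for div64_u64():

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t demo_avg_write_size(uint64_t nr, uint64_t sectors)
    {
        return nr ? sectors / nr : 0;   /* never divides by zero */
    }

    int main(void)
    {
        printf("%llu\n", (unsigned long long)demo_avg_write_size(0, 0));     /* 0   */
        printf("%llu\n", (unsigned long long)demo_avg_write_size(16, 2048)); /* 128 */
        return 0;
    }
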
@@ -269,7 +271,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, 0, k, ret)
+       for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, 0, k, ret)
                if (k.k->type == KEY_TYPE_extent) {
                        struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
                        const union bch_extent_entry *entry;
@@ -326,6 +328,7 @@ SHOW(bch2_fs)
        sysfs_print(block_size,                 block_bytes(c));
        sysfs_print(btree_node_size,            btree_bytes(c));
        sysfs_hprint(btree_cache_size,          bch2_btree_cache_size(c));
+       sysfs_hprint(btree_avg_write_size,      bch2_btree_avg_write_size(c));
 
        sysfs_print(read_realloc_races,
                    atomic_long_read(&c->read_realloc_races));
@@ -352,9 +355,6 @@ SHOW(bch2_fs)
 
        sysfs_print(promote_whole_extents,      c->promote_whole_extents);
 
-       sysfs_printf(meta_replicas_have, "%i",  bch2_replicas_online(c, true));
-       sysfs_printf(data_replicas_have, "%i",  bch2_replicas_online(c, false));
-
        /* Debugging: */
 
        if (attr == &sysfs_alloc_debug)
@@ -380,6 +380,11 @@ SHOW(bch2_fs)
                return out.pos - buf;
        }
 
+       if (attr == &sysfs_btree_cache) {
+               bch2_btree_cache_to_text(&out, c);
+               return out.pos - buf;
+       }
+
        if (attr == &sysfs_btree_key_cache) {
                bch2_btree_key_cache_to_text(&out, &c->btree_key_cache);
                return out.pos - buf;
@@ -414,10 +419,6 @@ SHOW(bch2_fs)
                return out.pos - buf;
        }
 
-#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name);
-       BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
        return 0;
 }
 
@@ -462,17 +463,13 @@ STORE(bch2_fs)
 
        /* Debugging: */
 
-#define BCH_DEBUG_PARAM(name, description) sysfs_strtoul(name, c->name);
-       BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
        if (!test_bit(BCH_FS_STARTED, &c->flags))
                return -EPERM;
 
        /* Debugging: */
 
        if (attr == &sysfs_trigger_journal_flush)
-               bch2_journal_meta_async(&c->journal, NULL);
+               bch2_journal_meta(&c->journal);
 
        if (attr == &sysfs_trigger_btree_coalesce)
                bch2_coalesce(c);
@@ -483,7 +480,7 @@ STORE(bch2_fs)
                 */
 #if 0
                down_read(&c->state_lock);
-               bch2_gc(c, NULL, false, false);
+               bch2_gc(c, false, false);
                up_read(&c->state_lock);
 #else
                bch2_gc_gens(c);
@@ -511,10 +508,11 @@ STORE(bch2_fs)
                if (threads_str &&
                    !(ret = kstrtouint(threads_str, 10, &threads)) &&
                    !(ret = bch2_strtoull_h(nr_str, &nr)))
-                       bch2_btree_perf_test(c, test, nr, threads);
-               else
-                       size = ret;
+                       ret = bch2_btree_perf_test(c, test, nr, threads);
                kfree(tmp);
+
+               if (ret)
+                       size = ret;
        }
 #endif
        return size;
@@ -526,9 +524,7 @@ struct attribute *bch2_fs_files[] = {
        &sysfs_block_size,
        &sysfs_btree_node_size,
        &sysfs_btree_cache_size,
-
-       &sysfs_meta_replicas_have,
-       &sysfs_data_replicas_have,
+       &sysfs_btree_avg_write_size,
 
        &sysfs_journal_write_delay_ms,
        &sysfs_journal_reclaim_delay_ms,
@@ -564,6 +560,7 @@ struct attribute *bch2_fs_internal_files[] = {
        &sysfs_journal_pins,
        &sysfs_btree_updates,
        &sysfs_dirty_btree_nodes,
+       &sysfs_btree_cache,
        &sysfs_btree_key_cache,
        &sysfs_btree_transactions,
        &sysfs_stripes_heap,
@@ -590,11 +587,6 @@ struct attribute *bch2_fs_internal_files[] = {
        &sysfs_io_timers_write,
 
        &sysfs_internal_uuid,
-
-#define BCH_DEBUG_PARAM(name, description) &sysfs_##name,
-       BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
        NULL
 };
 
@@ -716,7 +708,7 @@ static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca,
 {
        int rw = (private ? 1 : 0);
 
-       return bucket_last_io(c, bucket(ca, b), rw);
+       return atomic64_read(&c->io_clock[rw].now) - bucket(ca, b)->io_time[rw];
 }
 
 static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
@@ -729,7 +721,7 @@ static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
 static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca,
                                     size_t b, void *private)
 {
-       return bucket_gc_gen(ca, b);
+       return bucket_gc_gen(bucket(ca, b));
 }
 
 static int unsigned_cmp(const void *_l, const void *_r)
@@ -808,63 +800,40 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
                nr[c->open_buckets[i].type]++;
 
        pr_buf(out,
-               "free_inc:               %zu/%zu\n"
-               "free[RESERVE_BTREE]:    %zu/%zu\n"
-               "free[RESERVE_MOVINGGC]: %zu/%zu\n"
-               "free[RESERVE_NONE]:     %zu/%zu\n"
-               "buckets:\n"
-               "    capacity:           %llu\n"
-               "    alloc:              %llu\n"
-               "    sb:                 %llu\n"
-               "    journal:            %llu\n"
-               "    meta:               %llu\n"
-               "    user:               %llu\n"
-               "    cached:             %llu\n"
-               "    erasure coded:      %llu\n"
-               "    available:          %lli\n"
-               "sectors:\n"
-               "    sb:                 %llu\n"
-               "    journal:            %llu\n"
-               "    meta:               %llu\n"
-               "    user:               %llu\n"
-               "    cached:             %llu\n"
-               "    erasure coded:      %llu\n"
-               "    fragmented:         %llu\n"
-               "    copygc threshold:   %llu\n"
-               "freelist_wait:          %s\n"
-               "open buckets:           %u/%u (reserved %u)\n"
-               "open_buckets_wait:      %s\n"
-               "open_buckets_btree:     %u\n"
-               "open_buckets_user:      %u\n"
-               "btree reserve cache:    %u\n",
-               fifo_used(&ca->free_inc),               ca->free_inc.size,
-               fifo_used(&ca->free[RESERVE_BTREE]),    ca->free[RESERVE_BTREE].size,
-               fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
-               fifo_used(&ca->free[RESERVE_NONE]),     ca->free[RESERVE_NONE].size,
-               ca->mi.nbuckets - ca->mi.first_bucket,
-               stats.buckets_alloc,
-               stats.buckets[BCH_DATA_sb],
-               stats.buckets[BCH_DATA_journal],
-               stats.buckets[BCH_DATA_btree],
-               stats.buckets[BCH_DATA_user],
-               stats.buckets[BCH_DATA_cached],
-               stats.buckets_ec,
-               __dev_buckets_available(ca, stats),
-               stats.sectors[BCH_DATA_sb],
-               stats.sectors[BCH_DATA_journal],
-               stats.sectors[BCH_DATA_btree],
-               stats.sectors[BCH_DATA_user],
-               stats.sectors[BCH_DATA_cached],
-               stats.sectors_ec,
-               stats.sectors_fragmented,
-               c->copygc_threshold,
-               c->freelist_wait.list.first             ? "waiting" : "empty",
-               c->open_buckets_nr_free, OPEN_BUCKETS_COUNT,
-               BTREE_NODE_OPEN_BUCKET_RESERVE,
-               c->open_buckets_wait.list.first         ? "waiting" : "empty",
-               nr[BCH_DATA_btree],
-               nr[BCH_DATA_user],
-               c->btree_reserve_cache_nr);
+              "\t\t buckets\t sectors      fragmented\n"
+              "capacity%16llu\n",
+              ca->mi.nbuckets - ca->mi.first_bucket);
+
+       for (i = 1; i < BCH_DATA_NR; i++)
+               pr_buf(out, "%-8s%16llu%16llu%16llu\n",
+                      bch2_data_types[i], stats.d[i].buckets,
+                      stats.d[i].sectors, stats.d[i].fragmented);
+
+       pr_buf(out,
+              "ec\t%16llu\n"
+              "available%15llu\n"
+              "\n"
+              "free_inc\t\t%zu/%zu\n"
+              "free[RESERVE_MOVINGGC]\t%zu/%zu\n"
+              "free[RESERVE_NONE]\t%zu/%zu\n"
+              "freelist_wait\t\t%s\n"
+              "open buckets\t\t%u/%u (reserved %u)\n"
+              "open_buckets_wait\t%s\n"
+              "open_buckets_btree\t%u\n"
+              "open_buckets_user\t%u\n"
+              "btree reserve cache\t%u\n",
+              stats.buckets_ec,
+              __dev_buckets_available(ca, stats),
+              fifo_used(&ca->free_inc),                ca->free_inc.size,
+              fifo_used(&ca->free[RESERVE_MOVINGGC]),  ca->free[RESERVE_MOVINGGC].size,
+              fifo_used(&ca->free[RESERVE_NONE]),      ca->free[RESERVE_NONE].size,
+              c->freelist_wait.list.first              ? "waiting" : "empty",
+              c->open_buckets_nr_free, OPEN_BUCKETS_COUNT,
+              BTREE_NODE_OPEN_BUCKET_RESERVE,
+              c->open_buckets_wait.list.first          ? "waiting" : "empty",
+              nr[BCH_DATA_btree],
+              nr[BCH_DATA_user],
+              c->btree_reserve_cache_nr);
 }
 
 static const char * const bch2_rw[] = {
@@ -930,7 +899,7 @@ SHOW(bch2_dev)
        }
 
        if (attr == &sysfs_state_rw) {
-               bch2_string_opt_to_text(&out, bch2_dev_state,
+               bch2_string_opt_to_text(&out, bch2_member_states,
                                        ca->mi.state);
                pr_buf(&out, "\n");
                return out.pos - buf;
index 4dcace650416750b59ca4ae9c16db5544e196eca..7507b6bcc13f2e47e2b57bfabc831e8d56254dcd 100644 (file)
@@ -13,12 +13,12 @@ static void delete_test_keys(struct bch_fs *c)
 {
        int ret;
 
-       ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS,
+       ret = bch2_btree_delete_range(c, BTREE_ID_extents,
                                      POS(0, 0), POS(0, U64_MAX),
                                      NULL);
        BUG_ON(ret);
 
-       ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS,
+       ret = bch2_btree_delete_range(c, BTREE_ID_xattrs,
                                      POS(0, 0), POS(0, U64_MAX),
                                      NULL);
        BUG_ON(ret);
@@ -26,7 +26,7 @@ static void delete_test_keys(struct bch_fs *c)
 
 /* unit tests */
 
-static void test_delete(struct bch_fs *c, u64 nr)
+static int test_delete(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
        struct btree_iter *iter;
@@ -37,28 +37,42 @@ static void test_delete(struct bch_fs *c, u64 nr)
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, k.k.p,
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, k.k.p,
                                   BTREE_ITER_INTENT);
 
        ret = bch2_btree_iter_traverse(iter);
-       BUG_ON(ret);
+       if (ret) {
+               bch_err(c, "lookup error in test_delete: %i", ret);
+               goto err;
+       }
 
        ret = __bch2_trans_do(&trans, NULL, NULL, 0,
                bch2_trans_update(&trans, iter, &k.k_i, 0));
-       BUG_ON(ret);
+       if (ret) {
+               bch_err(c, "update error in test_delete: %i", ret);
+               goto err;
+       }
 
        pr_info("deleting once");
        ret = bch2_btree_delete_at(&trans, iter, 0);
-       BUG_ON(ret);
+       if (ret) {
+               bch_err(c, "delete error (first) in test_delete: %i", ret);
+               goto err;
+       }
 
        pr_info("deleting twice");
        ret = bch2_btree_delete_at(&trans, iter, 0);
-       BUG_ON(ret);
-
+       if (ret) {
+               bch_err(c, "delete error (second) in test_delete: %i", ret);
+               goto err;
+       }
+err:
+       bch2_trans_iter_put(&trans, iter);
        bch2_trans_exit(&trans);
+       return ret;
 }
 
-static void test_delete_written(struct bch_fs *c, u64 nr)
+static int test_delete_written(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
        struct btree_iter *iter;
@@ -69,31 +83,42 @@ static void test_delete_written(struct bch_fs *c, u64 nr)
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, k.k.p,
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, k.k.p,
                                   BTREE_ITER_INTENT);
 
        ret = bch2_btree_iter_traverse(iter);
-       BUG_ON(ret);
+       if (ret) {
+               bch_err(c, "lookup error in test_delete_written: %i", ret);
+               goto err;
+       }
 
        ret = __bch2_trans_do(&trans, NULL, NULL, 0,
                bch2_trans_update(&trans, iter, &k.k_i, 0));
-       BUG_ON(ret);
+       if (ret) {
+               bch_err(c, "update error in test_delete_written: %i", ret);
+               goto err;
+       }
 
        bch2_journal_flush_all_pins(&c->journal);
 
        ret = bch2_btree_delete_at(&trans, iter, 0);
-       BUG_ON(ret);
-
+       if (ret) {
+               bch_err(c, "delete error in test_delete_written: %i", ret);
+               goto err;
+       }
+err:
+       bch2_trans_iter_put(&trans, iter);
        bch2_trans_exit(&trans);
+       return ret;
 }
 
-static void test_iterate(struct bch_fs *c, u64 nr)
+static int test_iterate(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter *iter = NULL;
        struct bkey_s_c k;
        u64 i;
-       int ret;
+       int ret = 0;
 
        bch2_trans_init(&trans, c, 0, 0);
 
@@ -107,16 +132,19 @@ static void test_iterate(struct bch_fs *c, u64 nr)
                bkey_cookie_init(&k.k_i);
                k.k.p.offset = i;
 
-               ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i,
+               ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i,
                                        NULL, NULL, 0);
-               BUG_ON(ret);
+               if (ret) {
+                       bch_err(c, "insert error in test_iterate: %i", ret);
+                       goto err;
+               }
        }
 
        pr_info("iterating forwards");
 
        i = 0;
 
-       for_each_btree_key(&trans, iter, BTREE_ID_XATTRS,
+       for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
                           POS_MIN, 0, k, ret) {
                if (k.k->p.inode)
                        break;
@@ -132,17 +160,19 @@ static void test_iterate(struct bch_fs *c, u64 nr)
                BUG_ON(k.k->p.offset != --i);
 
        BUG_ON(i);
-
+err:
+       bch2_trans_iter_put(&trans, iter);
        bch2_trans_exit(&trans);
+       return ret;
 }
 
-static void test_iterate_extents(struct bch_fs *c, u64 nr)
+static int test_iterate_extents(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter *iter = NULL;
        struct bkey_s_c k;
        u64 i;
-       int ret;
+       int ret = 0;
 
        bch2_trans_init(&trans, c, 0, 0);
 
@@ -157,16 +187,19 @@ static void test_iterate_extents(struct bch_fs *c, u64 nr)
                k.k.p.offset = i + 8;
                k.k.size = 8;
 
-               ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i,
+               ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i,
                                        NULL, NULL, 0);
-               BUG_ON(ret);
+               if (ret) {
+                       bch_err(c, "insert error in test_iterate_extents: %i", ret);
+                       goto err;
+               }
        }
 
        pr_info("iterating forwards");
 
        i = 0;
 
-       for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
+       for_each_btree_key(&trans, iter, BTREE_ID_extents,
                           POS_MIN, 0, k, ret) {
                BUG_ON(bkey_start_offset(k.k) != i);
                i = k.k->p.offset;
@@ -182,17 +215,19 @@ static void test_iterate_extents(struct bch_fs *c, u64 nr)
        }
 
        BUG_ON(i);
-
+err:
+       bch2_trans_iter_put(&trans, iter);
        bch2_trans_exit(&trans);
+       return ret;
 }
 
-static void test_iterate_slots(struct bch_fs *c, u64 nr)
+static int test_iterate_slots(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c k;
        u64 i;
-       int ret;
+       int ret = 0;
 
        bch2_trans_init(&trans, c, 0, 0);
 
@@ -206,16 +241,19 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr)
                bkey_cookie_init(&k.k_i);
                k.k.p.offset = i * 2;
 
-               ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i,
+               ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i,
                                        NULL, NULL, 0);
-               BUG_ON(ret);
+               if (ret) {
+                       bch_err(c, "insert error in test_iterate_slots: %i", ret);
+                       goto err;
+               }
        }
 
        pr_info("iterating forwards");
 
        i = 0;
 
-       for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN,
+       for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN,
                           0, k, ret) {
                if (k.k->p.inode)
                        break;
@@ -223,7 +261,7 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr)
                BUG_ON(k.k->p.offset != i);
                i += 2;
        }
-       bch2_trans_iter_free(&trans, iter);
+       bch2_trans_iter_put(&trans, iter);
 
        BUG_ON(i != nr * 2);
 
@@ -231,7 +269,7 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr)
 
        i = 0;
 
-       for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN,
+       for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN,
                           BTREE_ITER_SLOTS, k, ret) {
                BUG_ON(k.k->p.offset != i);
                BUG_ON(bkey_deleted(k.k) != (i & 1));
@@ -240,17 +278,19 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr)
                if (i == nr * 2)
                        break;
        }
-
+       bch2_trans_iter_put(&trans, iter);
+err:
        bch2_trans_exit(&trans);
+       return ret;
 }
 
-static void test_iterate_slots_extents(struct bch_fs *c, u64 nr)
+static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c k;
        u64 i;
-       int ret;
+       int ret = 0;
 
        bch2_trans_init(&trans, c, 0, 0);
 
@@ -265,22 +305,25 @@ static void test_iterate_slots_extents(struct bch_fs *c, u64 nr)
                k.k.p.offset = i + 16;
                k.k.size = 8;
 
-               ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i,
+               ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i,
                                        NULL, NULL, 0);
-               BUG_ON(ret);
+               if (ret) {
+                       bch_err(c, "insert error in test_iterate_slots_extents: %i", ret);
+                       goto err;
+               }
        }
 
        pr_info("iterating forwards");
 
        i = 0;
 
-       for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN,
+       for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN,
                           0, k, ret) {
                BUG_ON(bkey_start_offset(k.k) != i + 8);
                BUG_ON(k.k->size != 8);
                i += 16;
        }
-       bch2_trans_iter_free(&trans, iter);
+       bch2_trans_iter_put(&trans, iter);
 
        BUG_ON(i != nr);
 
@@ -288,7 +331,7 @@ static void test_iterate_slots_extents(struct bch_fs *c, u64 nr)
 
        i = 0;
 
-       for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN,
+       for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN,
                           BTREE_ITER_SLOTS, k, ret) {
                BUG_ON(bkey_deleted(k.k) != !(i % 16));
 
@@ -299,15 +342,17 @@ static void test_iterate_slots_extents(struct bch_fs *c, u64 nr)
                if (i == nr)
                        break;
        }
-
+       bch2_trans_iter_put(&trans, iter);
+err:
        bch2_trans_exit(&trans);
+       return 0;
 }
 
 /*
  * XXX: we really want to make sure we've got a btree with depth > 0 for these
  * tests
  */
-static void test_peek_end(struct bch_fs *c, u64 nr)
+static int test_peek_end(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
        struct btree_iter *iter;
@@ -315,7 +360,7 @@ static void test_peek_end(struct bch_fs *c, u64 nr)
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS_MIN, 0);
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0);
 
        k = bch2_btree_iter_peek(iter);
        BUG_ON(k.k);
@@ -323,10 +368,13 @@ static void test_peek_end(struct bch_fs *c, u64 nr)
        k = bch2_btree_iter_peek(iter);
        BUG_ON(k.k);
 
+       bch2_trans_iter_put(&trans, iter);
+
        bch2_trans_exit(&trans);
+       return 0;
 }
 
-static void test_peek_end_extents(struct bch_fs *c, u64 nr)
+static int test_peek_end_extents(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
        struct btree_iter *iter;
@@ -334,7 +382,7 @@ static void test_peek_end_extents(struct bch_fs *c, u64 nr)
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, 0);
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN, 0);
 
        k = bch2_btree_iter_peek(iter);
        BUG_ON(k.k);
@@ -342,15 +390,18 @@ static void test_peek_end_extents(struct bch_fs *c, u64 nr)
        k = bch2_btree_iter_peek(iter);
        BUG_ON(k.k);
 
+       bch2_trans_iter_put(&trans, iter);
+
        bch2_trans_exit(&trans);
+       return 0;
 }
 
 /* extent unit tests */
 
 u64 test_version;
 
-static void insert_test_extent(struct bch_fs *c,
-                              u64 start, u64 end)
+static int insert_test_extent(struct bch_fs *c,
+                             u64 start, u64 end)
 {
        struct bkey_i_cookie k;
        int ret;
@@ -362,44 +413,49 @@ static void insert_test_extent(struct bch_fs *c,
        k.k_i.k.size = end - start;
        k.k_i.k.version.lo = test_version++;
 
-       ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i,
+       ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i,
                                NULL, NULL, 0);
-       BUG_ON(ret);
+       if (ret)
+               bch_err(c, "insert error in insert_test_extent: %i", ret);
+       return ret;
 }
 
-static void __test_extent_overwrite(struct bch_fs *c,
+static int __test_extent_overwrite(struct bch_fs *c,
                                    u64 e1_start, u64 e1_end,
                                    u64 e2_start, u64 e2_end)
 {
-       insert_test_extent(c, e1_start, e1_end);
-       insert_test_extent(c, e2_start, e2_end);
+       int ret;
+
+       ret   = insert_test_extent(c, e1_start, e1_end) ?:
+               insert_test_extent(c, e2_start, e2_end);
 
        delete_test_keys(c);
+       return ret;
 }
 
-static void test_extent_overwrite_front(struct bch_fs *c, u64 nr)
+static int test_extent_overwrite_front(struct bch_fs *c, u64 nr)
 {
-       __test_extent_overwrite(c, 0, 64, 0, 32);
-       __test_extent_overwrite(c, 8, 64, 0, 32);
+       return  __test_extent_overwrite(c, 0, 64, 0, 32) ?:
+               __test_extent_overwrite(c, 8, 64, 0, 32);
 }
 
-static void test_extent_overwrite_back(struct bch_fs *c, u64 nr)
+static int test_extent_overwrite_back(struct bch_fs *c, u64 nr)
 {
-       __test_extent_overwrite(c, 0, 64, 32, 64);
-       __test_extent_overwrite(c, 0, 64, 32, 72);
+       return  __test_extent_overwrite(c, 0, 64, 32, 64) ?:
+               __test_extent_overwrite(c, 0, 64, 32, 72);
 }
 
-static void test_extent_overwrite_middle(struct bch_fs *c, u64 nr)
+static int test_extent_overwrite_middle(struct bch_fs *c, u64 nr)
 {
-       __test_extent_overwrite(c, 0, 64, 32, 40);
+       return __test_extent_overwrite(c, 0, 64, 32, 40);
 }
 
-static void test_extent_overwrite_all(struct bch_fs *c, u64 nr)
+static int test_extent_overwrite_all(struct bch_fs *c, u64 nr)
 {
-       __test_extent_overwrite(c, 32, 64,  0,  64);
-       __test_extent_overwrite(c, 32, 64,  0, 128);
-       __test_extent_overwrite(c, 32, 64, 32,  64);
-       __test_extent_overwrite(c, 32, 64, 32, 128);
+       return  __test_extent_overwrite(c, 32, 64,  0,  64) ?:
+               __test_extent_overwrite(c, 32, 64,  0, 128) ?:
+               __test_extent_overwrite(c, 32, 64, 32,  64) ?:
+               __test_extent_overwrite(c, 32, 64, 32, 128);
 }
 
 /* perf tests */
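
Note on the rewritten extent-overwrite tests above: they chain their steps with the GCC/Clang binary conditional operator. "a ?: b" yields a when a is nonzero, so the chain returns the first failing call's error and skips the rest. A small standalone illustration of that behaviour (relies on the same compiler extension the kernel uses):

    #include <stdio.h>

    static int step_ok(void)    { return 0; }
    static int step_fail(void)  { return -5; }  /* pretend -EIO */
    static int step_never(void) { puts("never printed"); return 0; }

    int main(void)
    {
        int ret = step_ok() ?: step_fail() ?: step_never();

        printf("ret = %d\n", ret);  /* -5; step_never() is not evaluated */
        return 0;
    }
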
@@ -415,11 +471,11 @@ static u64 test_rand(void)
        return v;
 }
 
-static void rand_insert(struct bch_fs *c, u64 nr)
+static int rand_insert(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
        struct bkey_i_cookie k;
-       int ret;
+       int ret = 0;
        u64 i;
 
        bch2_trans_init(&trans, c, 0, 0);
@@ -427,51 +483,67 @@ static void rand_insert(struct bch_fs *c, u64 nr)
        for (i = 0; i < nr; i++) {
                bkey_cookie_init(&k.k_i);
                k.k.p.offset = test_rand();
+               k.k.p.snapshot = U32_MAX;
 
                ret = __bch2_trans_do(&trans, NULL, NULL, 0,
-                       __bch2_btree_insert(&trans, BTREE_ID_XATTRS, &k.k_i));
-
-               BUG_ON(ret);
+                       __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i));
+               if (ret) {
+                       bch_err(c, "error in rand_insert: %i", ret);
+                       break;
+               }
        }
 
        bch2_trans_exit(&trans);
+       return ret;
 }
 
-static void rand_lookup(struct bch_fs *c, u64 nr)
+static int rand_lookup(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c k;
+       int ret = 0;
        u64 i;
 
        bch2_trans_init(&trans, c, 0, 0);
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0);
 
        for (i = 0; i < nr; i++) {
-               iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS,
-                                          POS(0, test_rand()), 0);
+               bch2_btree_iter_set_pos(iter, POS(0, test_rand()));
 
                k = bch2_btree_iter_peek(iter);
-               bch2_trans_iter_free(&trans, iter);
+               ret = bkey_err(k);
+               if (ret) {
+                       bch_err(c, "error in rand_lookup: %i", ret);
+                       break;
+               }
        }
 
+       bch2_trans_iter_put(&trans, iter);
        bch2_trans_exit(&trans);
+       return ret;
 }
 
-static void rand_mixed(struct bch_fs *c, u64 nr)
+static int rand_mixed(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c k;
-       int ret;
+       int ret = 0;
        u64 i;
 
        bch2_trans_init(&trans, c, 0, 0);
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0);
 
        for (i = 0; i < nr; i++) {
-               iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS,
-                                          POS(0, test_rand()), 0);
+               bch2_btree_iter_set_pos(iter, POS(0, test_rand()));
 
                k = bch2_btree_iter_peek(iter);
+               ret = bkey_err(k);
+               if (ret) {
+                       bch_err(c, "lookup error in rand_mixed: %i", ret);
+                       break;
+               }
 
                if (!(i & 3) && k.k) {
                        struct bkey_i_cookie k;
@@ -481,14 +553,16 @@ static void rand_mixed(struct bch_fs *c, u64 nr)
 
                        ret = __bch2_trans_do(&trans, NULL, NULL, 0,
                                bch2_trans_update(&trans, iter, &k.k_i, 0));
-
-                       BUG_ON(ret);
+                       if (ret) {
+                               bch_err(c, "update error in rand_mixed: %i", ret);
+                               break;
+                       }
                }
-
-               bch2_trans_iter_free(&trans, iter);
        }
 
+       bch2_trans_iter_put(&trans, iter);
        bch2_trans_exit(&trans);
+       return ret;
 }
 
 static int __do_delete(struct btree_trans *trans, struct bpos pos)
@@ -498,17 +572,16 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos)
        struct bkey_s_c k;
        int ret = 0;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_XATTRS, pos,
+       iter = bch2_trans_get_iter(trans, BTREE_ID_xattrs, pos,
                                   BTREE_ITER_INTENT);
-       ret = PTR_ERR_OR_ZERO(iter);
-       if (ret)
-               goto err;
-
        k = bch2_btree_iter_peek(iter);
        ret = bkey_err(k);
        if (ret)
                goto err;
 
+       if (!k.k)
+               goto err;
+
        bkey_init(&delete.k);
        delete.k.p = k.k->p;
 
@@ -518,10 +591,10 @@ err:
        return ret;
 }
 
-static void rand_delete(struct bch_fs *c, u64 nr)
+static int rand_delete(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
-       int ret;
+       int ret = 0;
        u64 i;
 
        bch2_trans_init(&trans, c, 0, 0);
@@ -531,64 +604,76 @@ static void rand_delete(struct bch_fs *c, u64 nr)
 
                ret = __bch2_trans_do(&trans, NULL, NULL, 0,
                        __do_delete(&trans, pos));
-               BUG_ON(ret);
+               if (ret) {
+                       bch_err(c, "error in rand_delete: %i", ret);
+                       break;
+               }
        }
 
        bch2_trans_exit(&trans);
+       return ret;
 }
 
-static void seq_insert(struct bch_fs *c, u64 nr)
+static int seq_insert(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c k;
        struct bkey_i_cookie insert;
-       int ret;
+       int ret = 0;
        u64 i = 0;
 
        bkey_cookie_init(&insert.k_i);
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN,
+       for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN,
                           BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
                insert.k.p = iter->pos;
 
                ret = __bch2_trans_do(&trans, NULL, NULL, 0,
                        bch2_trans_update(&trans, iter, &insert.k_i, 0));
-
-               BUG_ON(ret);
+               if (ret) {
+                       bch_err(c, "error in seq_insert: %i", ret);
+                       break;
+               }
 
                if (++i == nr)
                        break;
        }
+       bch2_trans_iter_put(&trans, iter);
+
        bch2_trans_exit(&trans);
+       return ret;
 }
 
-static void seq_lookup(struct bch_fs *c, u64 nr)
+static int seq_lookup(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c k;
-       int ret;
+       int ret = 0;
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, 0, k, ret)
+       for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, 0, k, ret)
                ;
+       bch2_trans_iter_put(&trans, iter);
+
        bch2_trans_exit(&trans);
+       return ret;
 }
 
-static void seq_overwrite(struct bch_fs *c, u64 nr)
+static int seq_overwrite(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c k;
-       int ret;
+       int ret = 0;
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN,
+       for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN,
                           BTREE_ITER_INTENT, k, ret) {
                struct bkey_i_cookie u;
 
@@ -596,23 +681,30 @@ static void seq_overwrite(struct bch_fs *c, u64 nr)
 
                ret = __bch2_trans_do(&trans, NULL, NULL, 0,
                        bch2_trans_update(&trans, iter, &u.k_i, 0));
-
-               BUG_ON(ret);
+               if (ret) {
+                       bch_err(c, "error in seq_overwrite: %i", ret);
+                       break;
+               }
        }
+       bch2_trans_iter_put(&trans, iter);
+
        bch2_trans_exit(&trans);
+       return ret;
 }
 
-static void seq_delete(struct bch_fs *c, u64 nr)
+static int seq_delete(struct bch_fs *c, u64 nr)
 {
        int ret;
 
-       ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS,
+       ret = bch2_btree_delete_range(c, BTREE_ID_xattrs,
                                      POS(0, 0), POS(0, U64_MAX),
                                      NULL);
-       BUG_ON(ret);
+       if (ret)
+               bch_err(c, "error in seq_delete: %i", ret);
+       return ret;
 }
 
-typedef void (*perf_test_fn)(struct bch_fs *, u64);
+typedef int (*perf_test_fn)(struct bch_fs *, u64);
 
 struct test_job {
        struct bch_fs                   *c;
@@ -628,11 +720,13 @@ struct test_job {
 
        u64                             start;
        u64                             finish;
+       int                             ret;
 };
 
 static int btree_perf_test_thread(void *data)
 {
        struct test_job *j = data;
+       int ret;
 
        if (atomic_dec_and_test(&j->ready)) {
                wake_up(&j->ready_wait);
@@ -641,7 +735,9 @@ static int btree_perf_test_thread(void *data)
                wait_event(j->ready_wait, !atomic_read(&j->ready));
        }
 
-       j->fn(j->c, j->nr / j->nr_threads);
+       ret = j->fn(j->c, j->nr / j->nr_threads);
+       if (ret)
+               j->ret = ret;
 
        if (atomic_dec_and_test(&j->done)) {
                j->finish = sched_clock();
@@ -651,8 +747,8 @@ static int btree_perf_test_thread(void *data)
        return 0;
 }
 
-void bch2_btree_perf_test(struct bch_fs *c, const char *testname,
-                         u64 nr, unsigned nr_threads)
+int bch2_btree_perf_test(struct bch_fs *c, const char *testname,
+                        u64 nr, unsigned nr_threads)
 {
        struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads };
        char name_buf[20], nr_buf[20], per_sec_buf[20];
@@ -695,7 +791,7 @@ void bch2_btree_perf_test(struct bch_fs *c, const char *testname,
 
        if (!j.fn) {
                pr_err("unknown test %s", testname);
-               return;
+               return -EINVAL;
        }
 
        //pr_info("running test %s:", testname);
@@ -720,6 +816,7 @@ void bch2_btree_perf_test(struct bch_fs *c, const char *testname,
                time / NSEC_PER_SEC,
                time * nr_threads / nr,
                per_sec_buf);
+       return j.ret;
 }
 
 #endif /* CONFIG_BCACHEFS_TESTS */
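
Note: with the perf tests now returning int, btree_perf_test_thread() forwards each worker's error into the shared test_job so bch2_btree_perf_test() (and ultimately the sysfs perf_test write) can report it. A minimal pthread-based sketch of that pattern (build with -pthread; all names below are invented for illustration):

    #include <pthread.h>
    #include <stdio.h>

    struct demo_job {
        int ret;                /* workers store their error here */
    };

    static int demo_work(void)
    {
        return -22;             /* pretend the btree test failed with -EINVAL */
    }

    static void *demo_worker(void *arg)
    {
        struct demo_job *j = arg;
        int ret = demo_work();

        if (ret)
            j->ret = ret;       /* mirrors "if (ret) j->ret = ret;" above */
        return NULL;
    }

    int main(void)
    {
        struct demo_job j = { 0 };
        pthread_t t;

        if (pthread_create(&t, NULL, demo_worker, &j))
            return 1;
        pthread_join(&t, NULL);
        printf("job ret = %d\n", j.ret);    /* -22 */
        return 0;
    }
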
index 551d0764225ecf47eb79658b2fd2bf5eaa5c8d6d..c73b18aea7e01d01aece515369a8e38fd37c1870 100644 (file)
@@ -6,7 +6,7 @@ struct bch_fs;
 
 #ifdef CONFIG_BCACHEFS_TESTS
 
-void bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned);
+int bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned);
 
 #else
 
index fd4044a6a08fbafd487e4cd80dfb507dff61a264..2709163e02b538b0b0a6df075a56ba48b4ed8e6c 100644 (file)
@@ -520,7 +520,7 @@ int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask)
 {
        while (size) {
                struct page *page = alloc_page(gfp_mask);
-               unsigned len = min(PAGE_SIZE, size);
+               unsigned len = min_t(size_t, PAGE_SIZE, size);
 
                if (!page)
                        return -ENOMEM;
index f48c6380684f67ec5f6052507c07906148ee66f6..c69b05deec41dc69ba387803305d1f364551849d 100644 (file)
@@ -37,17 +37,6 @@ struct closure;
 #define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0)
 #define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0)
 
-#define memcpy(dst, src, len)                                          \
-({                                                                     \
-       void *_dst = (dst);                                             \
-       const void *_src = (src);                                       \
-       size_t _len = (len);                                            \
-                                                                       \
-       BUG_ON(!((void *) (_dst) >= (void *) (_src) + (_len) ||         \
-                (void *) (_dst) + (_len) <= (void *) (_src)));         \
-       memcpy(_dst, _src, _len);                                       \
-})
-
 #else /* DEBUG */
 
 #define EBUG_ON(cond)
@@ -758,4 +747,9 @@ u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned);
 
 #define cmp_int(l, r)          ((l > r) - (l < r))
 
+static inline int u8_cmp(u8 l, u8 r)
+{
+       return cmp_int(l, r);
+}
+
 #endif /* _BCACHEFS_UTIL_H */
diff --git a/libbcachefs/varint.c b/libbcachefs/varint.c
new file mode 100644 (file)
index 0000000..a3d252c
--- /dev/null
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bitops.h>
+#include <asm/unaligned.h>
+
+#include "varint.h"
+
+int bch2_varint_encode(u8 *out, u64 v)
+{
+       unsigned bits = fls64(v|1);
+       unsigned bytes = DIV_ROUND_UP(bits, 7);
+
+       if (likely(bytes < 9)) {
+               v <<= bytes;
+               v |= ~(~0 << (bytes - 1));
+       } else {
+               *out++ = 255;
+               bytes = 9;
+       }
+
+       put_unaligned_le64(v, out);
+       return bytes;
+}
+
+int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out)
+{
+       u64 v = get_unaligned_le64(in);
+       unsigned bytes = ffz(v & 255) + 1;
+
+       if (unlikely(in + bytes > end))
+               return -1;
+
+       if (likely(bytes < 9)) {
+               v >>= bytes;
+               v &= ~(~0ULL << (7 * bytes));
+       } else {
+               v = get_unaligned_le64(++in);
+       }
+
+       *out = v;
+       return bytes;
+}
diff --git a/libbcachefs/varint.h b/libbcachefs/varint.h
new file mode 100644 (file)
index 0000000..8daf813
--- /dev/null
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_VARINT_H
+#define _BCACHEFS_VARINT_H
+
+int bch2_varint_encode(u8 *, u64);
+int bch2_varint_decode(const u8 *, const u8 *, u64 *);
+
+#endif /* _BCACHEFS_VARINT_H */
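
Note on the new varint code above: each encoded byte carries 7 payload bits, and the length is stored in the first byte itself -- its low (bytes - 1) bits are 1s followed by a 0, so the decoder recovers the length with ffz() on the first byte alone; values needing more than 56 bits use a 0xff escape byte followed by 8 raw little-endian bytes. A portable re-statement of the encoder for illustration only (not part of the patch; __builtin_clzll stands in for fls64(), and the little-endian store is done with explicit shifts):

    #include <stdint.h>
    #include <stdio.h>

    static unsigned demo_varint_encode(uint8_t *out, uint64_t v)
    {
        unsigned bits = 64 - __builtin_clzll(v | 1);    /* like fls64(v | 1) */
        unsigned bytes = (bits + 6) / 7;
        unsigned i;

        if (bytes < 9) {
            /* low (bytes - 1) bits set to 1, then a 0, then the value */
            v = (v << bytes) | ((1ULL << (bytes - 1)) - 1);
        } else {
            *out++ = 255;       /* escape: 8 raw little-endian bytes follow */
            bytes = 9;
        }

        for (i = 0; i < 8; i++)         /* little-endian store */
            out[i] = v >> (8 * i);
        return bytes;
    }

    int main(void)
    {
        uint8_t buf[16] = { 0 };
        unsigned n = demo_varint_encode(buf, 300);

        /* 300 needs 9 value bits -> 2 encoded bytes: b1 04 */
        printf("%u bytes: %02x %02x\n", n, buf[0], buf[1]);
        return 0;
    }
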
index 21f64cb7e40247738a0624a89cba8608a74ca372..858aa87660533ef1498e73b3ff27b290c3e21646 100644 (file)
@@ -61,7 +61,7 @@ static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
 }
 
 const struct bch_hash_desc bch2_xattr_hash_desc = {
-       .btree_id       = BTREE_ID_XATTRS,
+       .btree_id       = BTREE_ID_xattrs,
        .key_type       = KEY_TYPE_xattr,
        .hash_key       = xattr_hash_key,
        .hash_bkey      = xattr_hash_bkey,
@@ -121,6 +121,7 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c,
 int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode,
                   const char *name, void *buffer, size_t size, int type)
 {
+       struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c_xattr xattr;
@@ -128,16 +129,13 @@ int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode,
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc,
-                               &inode->ei_str_hash, inode->v.i_ino,
+       iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, &hash,
+                               inode->v.i_ino,
                                &X_SEARCH(type, name, strlen(name)),
                                0);
-       if (IS_ERR(iter)) {
-               bch2_trans_exit(&trans);
-               BUG_ON(PTR_ERR(iter) == -EINTR);
-
-               return PTR_ERR(iter) == -ENOENT ? -ENODATA : PTR_ERR(iter);
-       }
+       ret = PTR_ERR_OR_ZERO(iter);
+       if (ret)
+               goto err;
 
        xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));
        ret = le16_to_cpu(xattr.v->x_val_len);
@@ -147,9 +145,12 @@ int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode,
                else
                        memcpy(buffer, xattr_val(xattr.v), ret);
        }
-
+       bch2_trans_iter_put(&trans, iter);
+err:
        bch2_trans_exit(&trans);
-       return ret;
+
+       BUG_ON(ret == -EINTR);
+       return ret == -ENOENT ? -ENODATA : ret;
 }
 
 int bch2_xattr_set(struct btree_trans *trans, u64 inum,
@@ -239,7 +240,7 @@ static int bch2_xattr_emit(struct dentry *dentry,
 }
 
 static int bch2_xattr_list_bcachefs(struct bch_fs *c,
-                                   struct bch_inode_info *inode,
+                                   struct bch_inode_unpacked *inode,
                                    struct xattr_buf *buf,
                                    bool all)
 {
@@ -249,12 +250,12 @@ static int bch2_xattr_list_bcachefs(struct bch_fs *c,
        u64 v;
 
        for (id = 0; id < Inode_opt_nr; id++) {
-               v = bch2_inode_opt_get(&inode->ei_inode, id);
+               v = bch2_inode_opt_get(inode, id);
                if (!v)
                        continue;
 
                if (!all &&
-                   !(inode->ei_inode.bi_fields_set & (1 << id)))
+                   !(inode->bi_fields_set & (1 << id)))
                        continue;
 
                ret = __bch2_xattr_emit(prefix, bch2_inode_opts[id],
@@ -279,7 +280,7 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       for_each_btree_key(&trans, iter, BTREE_ID_XATTRS,
+       for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
                           POS(inum, 0), 0, k, ret) {
                BUG_ON(k.k->p.inode < inum);
 
@@ -293,16 +294,18 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
                if (ret)
                        break;
        }
+       bch2_trans_iter_put(&trans, iter);
+
        ret = bch2_trans_exit(&trans) ?: ret;
 
        if (ret)
                return ret;
 
-       ret = bch2_xattr_list_bcachefs(c, inode, &buf, false);
+       ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false);
        if (ret)
                return ret;
 
-       ret = bch2_xattr_list_bcachefs(c, inode, &buf, true);
+       ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true);
        if (ret)
                return ret;
 
@@ -326,10 +329,10 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler,
 {
        struct bch_inode_info *inode = to_bch_ei(vinode);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
 
        return bch2_trans_do(c, NULL, &inode->ei_journal_seq, 0,
-                       bch2_xattr_set(&trans, inode->v.i_ino,
-                                      &inode->ei_str_hash,
+                       bch2_xattr_set(&trans, inode->v.i_ino, &hash,
                                       name, value, size,
                                       handler->flags, flags));
 }
index 4f43d0bb4e9a8a668ed6f718336da69db4998bee..7857017c1d48a11c6ceab7e4069af4908b46a415 100644 (file)
@@ -147,6 +147,10 @@ void *__genradix_iter_peek(struct genradix_iter *iter,
        struct genradix_root *r;
        struct genradix_node *n;
        unsigned level, i;
+
+       if (iter->offset == SIZE_MAX)
+               return NULL;
+
 restart:
        r = READ_ONCE(radix->root);
        if (!r)
@@ -165,10 +169,17 @@ restart:
                        (GENRADIX_ARY - 1);
 
                while (!n->children[i]) {
+                       size_t objs_per_ptr = genradix_depth_size(level);
+
+                       if (iter->offset + objs_per_ptr < iter->offset) {
+                               iter->offset    = SIZE_MAX;
+                               iter->pos       = SIZE_MAX;
+                               return NULL;
+                       }
+
                        i++;
-                       iter->offset = round_down(iter->offset +
-                                          genradix_depth_size(level),
-                                          genradix_depth_size(level));
+                       iter->offset = round_down(iter->offset + objs_per_ptr,
+                                                 objs_per_ptr);
                        iter->pos = (iter->offset >> PAGE_SHIFT) *
                                objs_per_page;
                        if (i == GENRADIX_ARY)
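The guard added to __genradix_iter_peek() above uses the standard unsigned wrap-around test: for unsigned arithmetic, a + b < a holds exactly when the addition overflowed. A minimal, self-contained sketch of the same idiom (hypothetical helper, not part of this patch):

    #include <stdbool.h>
    #include <stddef.h>

    /* Returns false (and leaves *sum untouched) when a + b would wrap past
     * SIZE_MAX; for unsigned types, a + b < a is true exactly on overflow. */
    static bool size_add_ok(size_t a, size_t b, size_t *sum)
    {
            if (a + b < a)
                    return false;
            *sum = a + b;
            return true;
    }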
index 65e824b4cc0843c36d0b905e7b9c266dabcb98d7..41bfca2f8d522b111fb967f27d14cb1f24d2385f 100644 (file)
@@ -80,7 +80,7 @@ struct task_struct *kthread_create(int (*thread_fn)(void *data),
 
        ret = pthread_create(&p->thread, &attr, kthread_start_fn, p);
        if (ret)
-               die("pthread_create error %s", strerror(ret));
+               return ERR_PTR(-ret);
        pthread_setname_np(p->thread, p->comm);
        return p;
 }
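Since kthread_create() no longer die()s on failure, callers are expected to check the ERR_PTR-encoded return value themselves. A hedged sketch of the calling convention (the thread function, its argument, and the name are placeholders):

    struct task_struct *p;

    p = kthread_create(my_thread_fn, my_arg, "my-worker");
    if (IS_ERR(p))
            return PTR_ERR(p);      /* negative errno from pthread_create() */
    wake_up_process(p);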
index 351eac7946a4950398c85d87648119ee788daedf..ba2196fc4ac1eb887c4ae66588999bf6f8d9fa07 100644 (file)
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Resizable, Scalable, Concurrent Hash Table
  *
@@ -8,27 +9,29 @@
  * Code partially derived from nft_hash
  * Rewritten with rehash code from br_multicast plus single list
  * pointer as suggested by Josh Triplett
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 
 #include <linux/atomic.h>
-#include <linux/cpumask.h>
 #include <linux/kernel.h>
 #include <linux/log2.h>
 #include <linux/sched.h>
+#include <linux/rculist.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/jhash.h>
+#include <linux/overflow.h>
 #include <linux/random.h>
 #include <linux/rhashtable.h>
 #include <linux/err.h>
+#include <linux/export.h>
 
 #define HASH_DEFAULT_SIZE      64UL
 #define HASH_MIN_SIZE          4U
-#define BUCKET_LOCKS_PER_CPU   32UL
+
+union nested_table {
+       union nested_table __rcu *table;
+       struct rhash_lock_head __rcu *bucket;
+};
 
 static u32 head_hashfn(struct rhashtable *ht,
                       const struct bucket_table *tbl,
@@ -37,40 +40,75 @@ static u32 head_hashfn(struct rhashtable *ht,
        return rht_head_hashfn(ht, tbl, he, ht->p);
 }
 
-static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl,
-                             gfp_t gfp)
-{
-       unsigned int i, size;
-       unsigned int nr_pcpus = num_possible_cpus();
+#ifdef CONFIG_PROVE_LOCKING
+#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT))
 
-       nr_pcpus = min_t(unsigned int, nr_pcpus, 64UL);
-       size = roundup_pow_of_two(nr_pcpus * ht->p.locks_mul);
+int lockdep_rht_mutex_is_held(struct rhashtable *ht)
+{
+       return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1;
+}
+EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);
 
-       /* Never allocate more than 0.5 locks per bucket */
-       size = min_t(unsigned int, size, tbl->size >> 1);
+int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash)
+{
+       if (!debug_locks)
+               return 1;
+       if (unlikely(tbl->nest))
+               return 1;
+       return bit_spin_is_locked(0, (unsigned long *)&tbl->buckets[hash]);
+}
+EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held);
+#else
+#define ASSERT_RHT_MUTEX(HT)
+#endif
 
-       if (sizeof(spinlock_t) != 0) {
-               tbl->locks = NULL;
-               if (gfp != GFP_KERNEL)
-                       gfp |= __GFP_NOWARN | __GFP_NORETRY;
+static inline union nested_table *nested_table_top(
+       const struct bucket_table *tbl)
+{
+       /* The top-level bucket entry does not need RCU protection
+        * because it's set at the same time as tbl->nest.
+        */
+       return (void *)rcu_dereference_protected(tbl->buckets[0], 1);
+}
 
-               if (!tbl->locks)
-                       tbl->locks = kmalloc_array(size, sizeof(spinlock_t),
-                                                  gfp);
-               if (!tbl->locks)
-                       return -ENOMEM;
-               for (i = 0; i < size; i++)
-                       spin_lock_init(&tbl->locks[i]);
+static void nested_table_free(union nested_table *ntbl, unsigned int size)
+{
+       const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+       const unsigned int len = 1 << shift;
+       unsigned int i;
+
+       ntbl = rcu_dereference_protected(ntbl->table, 1);
+       if (!ntbl)
+               return;
+
+       if (size > len) {
+               size >>= shift;
+               for (i = 0; i < len; i++)
+                       nested_table_free(ntbl + i, size);
        }
-       tbl->locks_mask = size - 1;
 
-       return 0;
+       kfree(ntbl);
+}
+
+static void nested_bucket_table_free(const struct bucket_table *tbl)
+{
+       unsigned int size = tbl->size >> tbl->nest;
+       unsigned int len = 1 << tbl->nest;
+       union nested_table *ntbl;
+       unsigned int i;
+
+       ntbl = nested_table_top(tbl);
+
+       for (i = 0; i < len; i++)
+               nested_table_free(ntbl + i, size);
+
+       kfree(ntbl);
 }
 
 static void bucket_table_free(struct bucket_table *tbl)
 {
-       if (tbl)
-               kvfree(tbl->locks);
+       if (tbl->nest)
+               nested_bucket_table_free(tbl);
 
        kvfree(tbl);
 }
@@ -80,6 +118,59 @@ static void bucket_table_free_rcu(struct rcu_head *head)
        bucket_table_free(container_of(head, struct bucket_table, rcu));
 }
 
+static union nested_table *nested_table_alloc(struct rhashtable *ht,
+                                             union nested_table __rcu **prev,
+                                             bool leaf)
+{
+       union nested_table *ntbl;
+       int i;
+
+       ntbl = rcu_dereference(*prev);
+       if (ntbl)
+               return ntbl;
+
+       ntbl = kzalloc(PAGE_SIZE, GFP_ATOMIC);
+
+       if (ntbl && leaf) {
+               for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0]); i++)
+                       INIT_RHT_NULLS_HEAD(ntbl[i].bucket);
+       }
+
+       if (cmpxchg((union nested_table **)prev, NULL, ntbl) == NULL)
+               return ntbl;
+       /* Raced with another thread. */
+       kfree(ntbl);
+       return rcu_dereference(*prev);
+}
+
+static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht,
+                                                     size_t nbuckets,
+                                                     gfp_t gfp)
+{
+       const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+       struct bucket_table *tbl;
+       size_t size;
+
+       if (nbuckets < (1 << (shift + 1)))
+               return NULL;
+
+       size = sizeof(*tbl) + sizeof(tbl->buckets[0]);
+
+       tbl = kzalloc(size, gfp);
+       if (!tbl)
+               return NULL;
+
+       if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets,
+                               false)) {
+               kfree(tbl);
+               return NULL;
+       }
+
+       tbl->nest = (ilog2(nbuckets) - 1) % shift + 1;
+
+       return tbl;
+}
+
 static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
                                               size_t nbuckets,
                                               gfp_t gfp)
@@ -88,28 +179,27 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
        size_t size;
        int i;
 
-       size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]);
-       if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER) ||
-           gfp != GFP_KERNEL)
-               tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY);
-       if (tbl == NULL && gfp == GFP_KERNEL)
-               tbl = vzalloc(size);
-       if (tbl == NULL)
-               return NULL;
+       tbl = kvzalloc(struct_size(tbl, buckets, nbuckets), gfp);
 
-       tbl->size = nbuckets;
+       size = nbuckets;
 
-       if (alloc_bucket_locks(ht, tbl, gfp) < 0) {
-               bucket_table_free(tbl);
-               return NULL;
+       if (tbl == NULL && (gfp & ~__GFP_NOFAIL) != GFP_KERNEL) {
+               tbl = nested_bucket_table_alloc(ht, nbuckets, gfp);
+               nbuckets = 0;
        }
 
+       if (tbl == NULL)
+               return NULL;
+
+       tbl->size = size;
+
+       rcu_head_init(&tbl->rcu);
        INIT_LIST_HEAD(&tbl->walkers);
 
-       get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
+       tbl->hash_rnd = get_random_u32();
 
        for (i = 0; i < nbuckets; i++)
-               INIT_RHT_NULLS_HEAD(tbl->buckets[i], ht, i);
+               INIT_RHT_NULLS_HEAD(tbl->buckets[i]);
 
        return tbl;
 }
@@ -127,18 +217,24 @@ static struct bucket_table *rhashtable_last_table(struct rhashtable *ht,
        return new_tbl;
 }
 
-static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
+static int rhashtable_rehash_one(struct rhashtable *ht,
+                                struct rhash_lock_head __rcu **bkt,
+                                unsigned int old_hash)
 {
        struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
-       struct bucket_table *new_tbl = rhashtable_last_table(ht,
-               rht_dereference_rcu(old_tbl->future_tbl, ht));
-       struct rhash_head __rcu **pprev = &old_tbl->buckets[old_hash];
-       int err = -ENOENT;
+       struct bucket_table *new_tbl = rhashtable_last_table(ht, old_tbl);
+       int err = -EAGAIN;
        struct rhash_head *head, *next, *entry;
-       spinlock_t *new_bucket_lock;
+       struct rhash_head __rcu **pprev = NULL;
        unsigned int new_hash;
 
-       rht_for_each(entry, old_tbl, old_hash) {
+       if (new_tbl->nest)
+               goto out;
+
+       err = -ENOENT;
+
+       rht_for_each_from(entry, rht_ptr(bkt, old_tbl, old_hash),
+                         old_tbl, old_hash) {
                err = 0;
                next = rht_dereference_bucket(entry->next, old_tbl, old_hash);
 
@@ -153,57 +249,58 @@ static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
 
        new_hash = head_hashfn(ht, new_tbl, entry);
 
-       new_bucket_lock = rht_bucket_lock(new_tbl, new_hash);
+       rht_lock(new_tbl, &new_tbl->buckets[new_hash]);
 
-       spin_lock_nested(new_bucket_lock, SINGLE_DEPTH_NESTING);
-       head = rht_dereference_bucket(new_tbl->buckets[new_hash],
-                                     new_tbl, new_hash);
+       head = rht_ptr(new_tbl->buckets + new_hash, new_tbl, new_hash);
 
        RCU_INIT_POINTER(entry->next, head);
 
-       rcu_assign_pointer(new_tbl->buckets[new_hash], entry);
-       spin_unlock(new_bucket_lock);
+       rht_assign_unlock(new_tbl, &new_tbl->buckets[new_hash], entry);
 
-       rcu_assign_pointer(*pprev, next);
+       if (pprev)
+               rcu_assign_pointer(*pprev, next);
+       else
+                       /* Need to preserve the bit lock. */
+               rht_assign_locked(bkt, next);
 
 out:
        return err;
 }
 
-static void rhashtable_rehash_chain(struct rhashtable *ht,
+static int rhashtable_rehash_chain(struct rhashtable *ht,
                                    unsigned int old_hash)
 {
        struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
-       spinlock_t *old_bucket_lock;
+       struct rhash_lock_head __rcu **bkt = rht_bucket_var(old_tbl, old_hash);
+       int err;
 
-       old_bucket_lock = rht_bucket_lock(old_tbl, old_hash);
+       if (!bkt)
+               return 0;
+       rht_lock(old_tbl, bkt);
 
-       spin_lock_bh(old_bucket_lock);
-       while (!rhashtable_rehash_one(ht, old_hash))
+       while (!(err = rhashtable_rehash_one(ht, bkt, old_hash)))
                ;
-       old_tbl->rehash++;
-       spin_unlock_bh(old_bucket_lock);
+
+       if (err == -ENOENT)
+               err = 0;
+       rht_unlock(old_tbl, bkt);
+
+       return err;
 }
 
 static int rhashtable_rehash_attach(struct rhashtable *ht,
                                    struct bucket_table *old_tbl,
                                    struct bucket_table *new_tbl)
 {
-       /* Protect future_tbl using the first bucket lock. */
-       spin_lock_bh(old_tbl->locks);
-
-       /* Did somebody beat us to it? */
-       if (rcu_access_pointer(old_tbl->future_tbl)) {
-               spin_unlock_bh(old_tbl->locks);
-               return -EEXIST;
-       }
-
        /* Make insertions go into the new, empty table right away. Deletions
         * and lookups will be attempted in both tables until we synchronize.
+        * As cmpxchg() provides strong barriers, we do not need
+        * rcu_assign_pointer().
         */
-       rcu_assign_pointer(old_tbl->future_tbl, new_tbl);
 
-       spin_unlock_bh(old_tbl->locks);
+       if (cmpxchg((struct bucket_table **)&old_tbl->future_tbl, NULL,
+                   new_tbl) != NULL)
+               return -EEXIST;
 
        return 0;
 }
@@ -214,13 +311,18 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
        struct bucket_table *new_tbl;
        struct rhashtable_walker *walker;
        unsigned int old_hash;
+       int err;
 
        new_tbl = rht_dereference(old_tbl->future_tbl, ht);
        if (!new_tbl)
                return 0;
 
-       for (old_hash = 0; old_hash < old_tbl->size; old_hash++)
-               rhashtable_rehash_chain(ht, old_hash);
+       for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
+               err = rhashtable_rehash_chain(ht, old_hash);
+               if (err)
+                       return err;
+               cond_resched();
+       }
 
        /* Publish the new table pointer. */
        rcu_assign_pointer(ht->tbl, new_tbl);
@@ -228,25 +330,30 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
        spin_lock(&ht->lock);
        list_for_each_entry(walker, &old_tbl->walkers, list)
                walker->tbl = NULL;
-       spin_unlock(&ht->lock);
 
        /* Wait for readers. All new readers will see the new
         * table, and thus no references to the old table will
         * remain.
+        * We do this inside the locked region so that
+        * rhashtable_walk_stop() can use rcu_head_after_call_rcu()
+        * to check if it should not re-link the table.
         */
        call_rcu(&old_tbl->rcu, bucket_table_free_rcu);
+       spin_unlock(&ht->lock);
 
        return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0;
 }
 
-static int rhashtable_expand(struct rhashtable *ht)
+static int rhashtable_rehash_alloc(struct rhashtable *ht,
+                                  struct bucket_table *old_tbl,
+                                  unsigned int size)
 {
-       struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
+       struct bucket_table *new_tbl;
        int err;
 
-       old_tbl = rhashtable_last_table(ht, old_tbl);
+       ASSERT_RHT_MUTEX(ht);
 
-       new_tbl = bucket_table_alloc(ht, old_tbl->size * 2, GFP_KERNEL);
+       new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
        if (new_tbl == NULL)
                return -ENOMEM;
 
@@ -257,12 +364,27 @@ static int rhashtable_expand(struct rhashtable *ht)
        return err;
 }
 
+/**
+ * rhashtable_shrink - Shrink hash table while allowing concurrent lookups
+ * @ht:                the hash table to shrink
+ *
+ * This function shrinks the hash table to fit, i.e., the smallest
+ * size would not cause it to expand right away automatically.
+ *
+ * The caller must ensure that no concurrent resizing occurs by holding
+ * ht->mutex.
+ *
+ * The caller must ensure that no concurrent table mutations take place.
+ * It is however valid to have concurrent lookups if they are RCU protected.
+ *
+ * It is valid to have concurrent insertions and deletions protected by per
+ * bucket locks or concurrent RCU protected lookups and traversals.
+ */
 static int rhashtable_shrink(struct rhashtable *ht)
 {
-       struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
+       struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
        unsigned int nelems = atomic_read(&ht->nelems);
        unsigned int size = 0;
-       int err;
 
        if (nelems)
                size = roundup_pow_of_two(nelems * 3 / 2);
@@ -275,15 +397,7 @@ static int rhashtable_shrink(struct rhashtable *ht)
        if (rht_dereference(old_tbl->future_tbl, ht))
                return -EEXIST;
 
-       new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
-       if (new_tbl == NULL)
-               return -ENOMEM;
-
-       err = rhashtable_rehash_attach(ht, old_tbl, new_tbl);
-       if (err)
-               bucket_table_free(new_tbl);
-
-       return err;
+       return rhashtable_rehash_alloc(ht, old_tbl, size);
 }
 
 static void rht_deferred_worker(struct work_struct *work)
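For orientation, the sizing rule in rhashtable_shrink() above works out as follows (illustrative numbers; assumes the 75% growth threshold used by rht_grow_above_75()):

    /*
     * Worked example:
     *   nelems = 500
     *   500 * 3 / 2              = 750
     *   roundup_pow_of_two(750)  = 1024 buckets
     *   growth threshold at 1024 = 768 > 500, so the shrunken table will not
     *   expand again right away; the result is still clamped to ht->p.min_size
     *   and the shrink is skipped if that is already the current size.
     */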
@@ -299,11 +413,18 @@ static void rht_deferred_worker(struct work_struct *work)
        tbl = rhashtable_last_table(ht, tbl);
 
        if (rht_grow_above_75(ht, tbl))
-               rhashtable_expand(ht);
+               err = rhashtable_rehash_alloc(ht, tbl, tbl->size * 2);
        else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl))
-               rhashtable_shrink(ht);
+               err = rhashtable_shrink(ht);
+       else if (tbl->nest)
+               err = rhashtable_rehash_alloc(ht, tbl, tbl->size);
+
+       if (!err || err == -EEXIST) {
+               int nerr;
 
-       err = rhashtable_rehash_table(ht);
+               nerr = rhashtable_rehash_table(ht);
+               err = err ?: nerr;
+       }
 
        mutex_unlock(&ht->mutex);
 
@@ -311,22 +432,8 @@ static void rht_deferred_worker(struct work_struct *work)
                schedule_work(&ht->run_work);
 }
 
-static bool rhashtable_check_elasticity(struct rhashtable *ht,
-                                       struct bucket_table *tbl,
-                                       unsigned int hash)
-{
-       unsigned int elasticity = ht->elasticity;
-       struct rhash_head *head;
-
-       rht_for_each(head, tbl, hash)
-               if (!--elasticity)
-                       return true;
-
-       return false;
-}
-
-int rhashtable_insert_rehash(struct rhashtable *ht,
-                            struct bucket_table *tbl)
+static int rhashtable_insert_rehash(struct rhashtable *ht,
+                                   struct bucket_table *tbl)
 {
        struct bucket_table *old_tbl;
        struct bucket_table *new_tbl;
@@ -347,7 +454,7 @@ int rhashtable_insert_rehash(struct rhashtable *ht,
 
        err = -ENOMEM;
 
-       new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC);
+       new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC | __GFP_NOWARN);
        if (new_tbl == NULL)
                goto fail;
 
@@ -363,7 +470,7 @@ int rhashtable_insert_rehash(struct rhashtable *ht,
 
 fail:
        /* Do not fail the insert if someone else did a rehash. */
-       if (likely(rcu_dereference_raw(tbl->future_tbl)))
+       if (likely(rcu_access_pointer(tbl->future_tbl)))
                return 0;
 
        /* Schedule async rehash to retry allocation in process context. */
@@ -373,57 +480,485 @@ fail:
        return err;
 }
 
-struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht,
-                                           const void *key,
-                                           struct rhash_head *obj,
-                                           struct bucket_table *tbl)
+static void *rhashtable_lookup_one(struct rhashtable *ht,
+                                  struct rhash_lock_head __rcu **bkt,
+                                  struct bucket_table *tbl, unsigned int hash,
+                                  const void *key, struct rhash_head *obj)
 {
+       struct rhashtable_compare_arg arg = {
+               .ht = ht,
+               .key = key,
+       };
+       struct rhash_head __rcu **pprev = NULL;
        struct rhash_head *head;
-       unsigned int hash;
-       int err;
+       int elasticity;
+
+       elasticity = RHT_ELASTICITY;
+       rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) {
+               struct rhlist_head *list;
+               struct rhlist_head *plist;
+
+               elasticity--;
+               if (!key ||
+                   (ht->p.obj_cmpfn ?
+                    ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) :
+                    rhashtable_compare(&arg, rht_obj(ht, head)))) {
+                       pprev = &head->next;
+                       continue;
+               }
 
-       tbl = rhashtable_last_table(ht, tbl);
-       hash = head_hashfn(ht, tbl, obj);
-       spin_lock_nested(rht_bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING);
+               if (!ht->rhlist)
+                       return rht_obj(ht, head);
 
-       err = -EEXIST;
-       if (key && rhashtable_lookup_fast(ht, key, ht->p))
-               goto exit;
+               list = container_of(obj, struct rhlist_head, rhead);
+               plist = container_of(head, struct rhlist_head, rhead);
 
-       err = -E2BIG;
-       if (unlikely(rht_grow_above_max(ht, tbl)))
-               goto exit;
+               RCU_INIT_POINTER(list->next, plist);
+               head = rht_dereference_bucket(head->next, tbl, hash);
+               RCU_INIT_POINTER(list->rhead.next, head);
+               if (pprev)
+                       rcu_assign_pointer(*pprev, obj);
+               else
+                       /* Need to preserve the bit lock */
+                       rht_assign_locked(bkt, obj);
+
+               return NULL;
+       }
+
+       if (elasticity <= 0)
+               return ERR_PTR(-EAGAIN);
+
+       return ERR_PTR(-ENOENT);
+}
+
+static struct bucket_table *rhashtable_insert_one(
+       struct rhashtable *ht, struct rhash_lock_head __rcu **bkt,
+       struct bucket_table *tbl, unsigned int hash, struct rhash_head *obj,
+       void *data)
+{
+       struct bucket_table *new_tbl;
+       struct rhash_head *head;
+
+       if (!IS_ERR_OR_NULL(data))
+               return ERR_PTR(-EEXIST);
+
+       if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT)
+               return ERR_CAST(data);
+
+       new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+       if (new_tbl)
+               return new_tbl;
+
+       if (PTR_ERR(data) != -ENOENT)
+               return ERR_CAST(data);
 
-       err = -EAGAIN;
-       if (rhashtable_check_elasticity(ht, tbl, hash) ||
-           rht_grow_above_100(ht, tbl))
-               goto exit;
+       if (unlikely(rht_grow_above_max(ht, tbl)))
+               return ERR_PTR(-E2BIG);
 
-       err = 0;
+       if (unlikely(rht_grow_above_100(ht, tbl)))
+               return ERR_PTR(-EAGAIN);
 
-       head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash);
+       head = rht_ptr(bkt, tbl, hash);
 
        RCU_INIT_POINTER(obj->next, head);
+       if (ht->rhlist) {
+               struct rhlist_head *list;
 
-       rcu_assign_pointer(tbl->buckets[hash], obj);
+               list = container_of(obj, struct rhlist_head, rhead);
+               RCU_INIT_POINTER(list->next, NULL);
+       }
+
+       /* bkt is always the head of the list, so it holds
+        * the lock, which we need to preserve
+        */
+       rht_assign_locked(bkt, obj);
 
        atomic_inc(&ht->nelems);
+       if (rht_grow_above_75(ht, tbl))
+               schedule_work(&ht->run_work);
+
+       return NULL;
+}
+
+static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
+                                  struct rhash_head *obj)
+{
+       struct bucket_table *new_tbl;
+       struct bucket_table *tbl;
+       struct rhash_lock_head __rcu **bkt;
+       unsigned int hash;
+       void *data;
+
+       new_tbl = rcu_dereference(ht->tbl);
+
+       do {
+               tbl = new_tbl;
+               hash = rht_head_hashfn(ht, tbl, obj, ht->p);
+               if (rcu_access_pointer(tbl->future_tbl))
+                       /* Failure is OK */
+                       bkt = rht_bucket_var(tbl, hash);
+               else
+                       bkt = rht_bucket_insert(ht, tbl, hash);
+               if (bkt == NULL) {
+                       new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+                       data = ERR_PTR(-EAGAIN);
+               } else {
+                       rht_lock(tbl, bkt);
+                       data = rhashtable_lookup_one(ht, bkt, tbl,
+                                                    hash, key, obj);
+                       new_tbl = rhashtable_insert_one(ht, bkt, tbl,
+                                                       hash, obj, data);
+                       if (PTR_ERR(new_tbl) != -EEXIST)
+                               data = ERR_CAST(new_tbl);
+
+                       rht_unlock(tbl, bkt);
+               }
+       } while (!IS_ERR_OR_NULL(new_tbl));
+
+       if (PTR_ERR(data) == -EAGAIN)
+               data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?:
+                              -EAGAIN);
+
+       return data;
+}
+
+void *rhashtable_insert_slow(struct rhashtable *ht, const void *key,
+                            struct rhash_head *obj)
+{
+       void *data;
+
+       do {
+               rcu_read_lock();
+               data = rhashtable_try_insert(ht, key, obj);
+               rcu_read_unlock();
+       } while (PTR_ERR(data) == -EAGAIN);
 
-exit:
-       spin_unlock(rht_bucket_lock(tbl, hash));
+       return data;
+}
+EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
 
-       if (err == 0)
+/**
+ * rhashtable_walk_enter - Initialise an iterator
+ * @ht:                Table to walk over
+ * @iter:      Hash table Iterator
+ *
+ * This function prepares a hash table walk.
+ *
+ * Note that if you restart a walk after rhashtable_walk_stop you
+ * may see the same object twice.  Also, you may miss objects if
+ * there are removals in between rhashtable_walk_stop and the next
+ * call to rhashtable_walk_start.
+ *
+ * For a completely stable walk you should construct your own data
+ * structure outside the hash table.
+ *
+ * This function may be called from any process context, including
+ * non-preemptable context, but cannot be called from softirq or
+ * hardirq context.
+ *
+ * You must call rhashtable_walk_exit after this function returns.
+ */
+void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter)
+{
+       iter->ht = ht;
+       iter->p = NULL;
+       iter->slot = 0;
+       iter->skip = 0;
+       iter->end_of_table = 0;
+
+       spin_lock(&ht->lock);
+       iter->walker.tbl =
+               rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock));
+       list_add(&iter->walker.list, &iter->walker.tbl->walkers);
+       spin_unlock(&ht->lock);
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_enter);
+
+/**
+ * rhashtable_walk_exit - Free an iterator
+ * @iter:      Hash table Iterator
+ *
+ * This function frees resources allocated by rhashtable_walk_enter.
+ */
+void rhashtable_walk_exit(struct rhashtable_iter *iter)
+{
+       spin_lock(&iter->ht->lock);
+       if (iter->walker.tbl)
+               list_del(&iter->walker.list);
+       spin_unlock(&iter->ht->lock);
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_exit);
+
+/**
+ * rhashtable_walk_start_check - Start a hash table walk
+ * @iter:      Hash table iterator
+ *
+ * Start a hash table walk at the current iterator position.  Note that we take
+ * the RCU lock in all cases including when we return an error.  So you must
+ * always call rhashtable_walk_stop to clean up.
+ *
+ * Returns zero if successful.
+ *
+ * Returns -EAGAIN if resize event occurred.  Note that the iterator
+ * will rewind back to the beginning and you may use it immediately
+ * by calling rhashtable_walk_next.
+ *
+ * rhashtable_walk_start is defined as an inline variant that returns
+ * void. This is preferred in cases where the caller would ignore
+ * resize events and always continue.
+ */
+int rhashtable_walk_start_check(struct rhashtable_iter *iter)
+       __acquires(RCU)
+{
+       struct rhashtable *ht = iter->ht;
+       bool rhlist = ht->rhlist;
+
+       rcu_read_lock();
+
+       spin_lock(&ht->lock);
+       if (iter->walker.tbl)
+               list_del(&iter->walker.list);
+       spin_unlock(&ht->lock);
+
+       if (iter->end_of_table)
+               return 0;
+       if (!iter->walker.tbl) {
+               iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht);
+               iter->slot = 0;
+               iter->skip = 0;
+               return -EAGAIN;
+       }
+
+       if (iter->p && !rhlist) {
+               /*
+                * We need to validate that 'p' is still in the table, and
+                * if so, update 'skip'
+                */
+               struct rhash_head *p;
+               int skip = 0;
+               rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
+                       skip++;
+                       if (p == iter->p) {
+                               iter->skip = skip;
+                               goto found;
+                       }
+               }
+               iter->p = NULL;
+       } else if (iter->p && rhlist) {
+               /* Need to validate that 'list' is still in the table, and
+                * if so, update 'skip' and 'p'.
+                */
+               struct rhash_head *p;
+               struct rhlist_head *list;
+               int skip = 0;
+               rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
+                       for (list = container_of(p, struct rhlist_head, rhead);
+                            list;
+                            list = rcu_dereference(list->next)) {
+                               skip++;
+                               if (list == iter->list) {
+                                       iter->p = p;
+                                       iter->skip = skip;
+                                       goto found;
+                               }
+                       }
+               }
+               iter->p = NULL;
+       }
+found:
+       return 0;
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_start_check);
+
+/**
+ * __rhashtable_walk_find_next - Find the next element in a table (or the first
+ * one in case of a new walk).
+ *
+ * @iter:      Hash table iterator
+ *
+ * Returns the found object or NULL when the end of the table is reached.
+ *
+ * Returns -EAGAIN if resize event occurred.
+ */
+static void *__rhashtable_walk_find_next(struct rhashtable_iter *iter)
+{
+       struct bucket_table *tbl = iter->walker.tbl;
+       struct rhlist_head *list = iter->list;
+       struct rhashtable *ht = iter->ht;
+       struct rhash_head *p = iter->p;
+       bool rhlist = ht->rhlist;
+
+       if (!tbl)
                return NULL;
-       else if (err == -EAGAIN)
-               return tbl;
+
+       for (; iter->slot < tbl->size; iter->slot++) {
+               int skip = iter->skip;
+
+               rht_for_each_rcu(p, tbl, iter->slot) {
+                       if (rhlist) {
+                               list = container_of(p, struct rhlist_head,
+                                                   rhead);
+                               do {
+                                       if (!skip)
+                                               goto next;
+                                       skip--;
+                                       list = rcu_dereference(list->next);
+                               } while (list);
+
+                               continue;
+                       }
+                       if (!skip)
+                               break;
+                       skip--;
+               }
+
+next:
+               if (!rht_is_a_nulls(p)) {
+                       iter->skip++;
+                       iter->p = p;
+                       iter->list = list;
+                       return rht_obj(ht, rhlist ? &list->rhead : p);
+               }
+
+               iter->skip = 0;
+       }
+
+       iter->p = NULL;
+
+       /* Ensure we see any new tables. */
+       smp_rmb();
+
+       iter->walker.tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+       if (iter->walker.tbl) {
+               iter->slot = 0;
+               iter->skip = 0;
+               return ERR_PTR(-EAGAIN);
+       } else {
+               iter->end_of_table = true;
+       }
+
+       return NULL;
+}
+
+/**
+ * rhashtable_walk_next - Return the next object and advance the iterator
+ * @iter:      Hash table iterator
+ *
+ * Note that you must call rhashtable_walk_stop when you are finished
+ * with the walk.
+ *
+ * Returns the next object or NULL when the end of the table is reached.
+ *
+ * Returns -EAGAIN if resize event occurred.  Note that the iterator
+ * will rewind back to the beginning and you may continue to use it.
+ */
+void *rhashtable_walk_next(struct rhashtable_iter *iter)
+{
+       struct rhlist_head *list = iter->list;
+       struct rhashtable *ht = iter->ht;
+       struct rhash_head *p = iter->p;
+       bool rhlist = ht->rhlist;
+
+       if (p) {
+               if (!rhlist || !(list = rcu_dereference(list->next))) {
+                       p = rcu_dereference(p->next);
+                       list = container_of(p, struct rhlist_head, rhead);
+               }
+               if (!rht_is_a_nulls(p)) {
+                       iter->skip++;
+                       iter->p = p;
+                       iter->list = list;
+                       return rht_obj(ht, rhlist ? &list->rhead : p);
+               }
+
+               /* At the end of this slot, switch to next one and then find
+                * next entry from that point.
+                */
+               iter->skip = 0;
+               iter->slot++;
+       }
+
+       return __rhashtable_walk_find_next(iter);
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_next);
+
+/**
+ * rhashtable_walk_peek - Return the next object but don't advance the iterator
+ * @iter:      Hash table iterator
+ *
+ * Returns the next object or NULL when the end of the table is reached.
+ *
+ * Returns -EAGAIN if resize event occurred.  Note that the iterator
+ * will rewind back to the beginning and you may continue to use it.
+ */
+void *rhashtable_walk_peek(struct rhashtable_iter *iter)
+{
+       struct rhlist_head *list = iter->list;
+       struct rhashtable *ht = iter->ht;
+       struct rhash_head *p = iter->p;
+
+       if (p)
+               return rht_obj(ht, ht->rhlist ? &list->rhead : p);
+
+       /* No object found in current iter, find next one in the table. */
+
+       if (iter->skip) {
+               /* A nonzero skip value points to the next entry in the table
+                * beyond the last one that was found. Decrement skip so
+                * we find the current value. __rhashtable_walk_find_next
+                * will restore the original value of skip assuming that
+                * the table hasn't changed.
+                */
+               iter->skip--;
+       }
+
+       return __rhashtable_walk_find_next(iter);
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_peek);
+
+/**
+ * rhashtable_walk_stop - Finish a hash table walk
+ * @iter:      Hash table iterator
+ *
+ * Finish a hash table walk.  Does not reset the iterator to the start of the
+ * hash table.
+ */
+void rhashtable_walk_stop(struct rhashtable_iter *iter)
+       __releases(RCU)
+{
+       struct rhashtable *ht;
+       struct bucket_table *tbl = iter->walker.tbl;
+
+       if (!tbl)
+               goto out;
+
+       ht = iter->ht;
+
+       spin_lock(&ht->lock);
+       if (rcu_head_after_call_rcu(&tbl->rcu, bucket_table_free_rcu))
+               /* This bucket table is being freed, don't re-link it. */
+               iter->walker.tbl = NULL;
        else
-               return ERR_PTR(err);
+               list_add(&iter->walker.list, &tbl->walkers);
+       spin_unlock(&ht->lock);
+
+out:
+       rcu_read_unlock();
 }
+EXPORT_SYMBOL_GPL(rhashtable_walk_stop);
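Taken together, the walk helpers above are typically driven with an enter/start/next/stop/exit loop. A minimal illustrative sketch (my_ht and the loop body are placeholders), restarting on the -EAGAIN resize case described in the comments:

    struct rhashtable_iter iter;
    void *obj;

    rhashtable_walk_enter(&my_ht, &iter);
    do {
            rhashtable_walk_start(&iter);

            while ((obj = rhashtable_walk_next(&iter)) && !IS_ERR(obj)) {
                    /* obj is valid here; RCU read lock is held, so no sleeping */
            }

            rhashtable_walk_stop(&iter);
    } while (obj == ERR_PTR(-EAGAIN));      /* table resized: iterator rewound */
    rhashtable_walk_exit(&iter);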
 
 static size_t rounded_hashtable_size(const struct rhashtable_params *params)
 {
-       return max(roundup_pow_of_two(params->nelem_hint * 4 / 3),
-                  (unsigned long)params->min_size);
+       size_t retsize;
+
+       if (params->nelem_hint)
+               retsize = max(roundup_pow_of_two(params->nelem_hint * 4 / 3),
+                             (unsigned long)params->min_size);
+       else
+               retsize = max(HASH_DEFAULT_SIZE,
+                             (unsigned long)params->min_size);
+
+       return retsize;
 }
 
 static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed)
@@ -431,21 +966,58 @@ static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed)
        return jhash2(key, length, seed);
 }
 
+/**
+ * rhashtable_init - initialize a new hash table
+ * @ht:                hash table to be initialized
+ * @params:    configuration parameters
+ *
+ * Initializes a new hash table based on the provided configuration
+ * parameters. A table can be configured either with a variable or
+ * fixed length key:
+ *
+ * Configuration Example 1: Fixed length keys
+ * struct test_obj {
+ *     int                     key;
+ *     void *                  my_member;
+ *     struct rhash_head       node;
+ * };
+ *
+ * struct rhashtable_params params = {
+ *     .head_offset = offsetof(struct test_obj, node),
+ *     .key_offset = offsetof(struct test_obj, key),
+ *     .key_len = sizeof(int),
+ *     .hashfn = jhash,
+ * };
+ *
+ * Configuration Example 2: Variable length keys
+ * struct test_obj {
+ *     [...]
+ *     struct rhash_head       node;
+ * };
+ *
+ * u32 my_hash_fn(const void *data, u32 len, u32 seed)
+ * {
+ *     struct test_obj *obj = data;
+ *
+ *     return [... hash ...];
+ * }
+ *
+ * struct rhashtable_params params = {
+ *     .head_offset = offsetof(struct test_obj, node),
+ *     .hashfn = jhash,
+ *     .obj_hashfn = my_hash_fn,
+ * };
+ */
 int rhashtable_init(struct rhashtable *ht,
                    const struct rhashtable_params *params)
 {
        struct bucket_table *tbl;
        size_t size;
 
-       size = HASH_DEFAULT_SIZE;
-
        if ((!params->key_len && !params->obj_hashfn) ||
            (params->obj_hashfn && !params->obj_cmpfn))
                return -EINVAL;
 
-       if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT))
-               return -EINVAL;
-
        memset(ht, 0, sizeof(*ht));
        mutex_init(&ht->mutex);
        spin_lock_init(&ht->lock);
@@ -454,39 +1026,18 @@ int rhashtable_init(struct rhashtable *ht,
        if (params->min_size)
                ht->p.min_size = roundup_pow_of_two(params->min_size);
 
-       if (params->max_size)
-               ht->p.max_size = rounddown_pow_of_two(params->max_size);
+       /* Cap total entries at 2^31 to avoid nelems overflow. */
+       ht->max_elems = 1u << 31;
 
-       if (params->insecure_max_entries)
-               ht->p.insecure_max_entries =
-                       rounddown_pow_of_two(params->insecure_max_entries);
-       else
-               ht->p.insecure_max_entries = ht->p.max_size * 2;
-
-       ht->p.min_size = max(ht->p.min_size, HASH_MIN_SIZE);
+       if (params->max_size) {
+               ht->p.max_size = rounddown_pow_of_two(params->max_size);
+               if (ht->p.max_size < ht->max_elems / 2)
+                       ht->max_elems = ht->p.max_size * 2;
+       }
 
-       if (params->nelem_hint)
-               size = rounded_hashtable_size(&ht->p);
-
-       /* The maximum (not average) chain length grows with the
-        * size of the hash table, at a rate of (log N)/(log log N).
-        * The value of 16 is selected so that even if the hash
-        * table grew to 2^32 you would not expect the maximum
-        * chain length to exceed it unless we are under attack
-        * (or extremely unlucky).
-        *
-        * As this limit is only to detect attacks, we don't need
-        * to set it to a lower value as you'd need the chain
-        * length to vastly exceed 16 to have any real effect
-        * on the system.
-        */
-       if (!params->insecure_elasticity)
-               ht->elasticity = 16;
+       ht->p.min_size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE);
 
-       if (params->locks_mul)
-               ht->p.locks_mul = roundup_pow_of_two(params->locks_mul);
-       else
-               ht->p.locks_mul = BUCKET_LOCKS_PER_CPU;
+       size = rounded_hashtable_size(&ht->p);
 
        ht->key_len = ht->p.key_len;
        if (!params->hashfn) {
@@ -498,9 +1049,16 @@ int rhashtable_init(struct rhashtable *ht,
                }
        }
 
+       /*
+        * This is api initialization and thus we need to guarantee the
+        * initial rhashtable allocation. Upon failure, retry with the
+        * smallest possible size with __GFP_NOFAIL semantics.
+        */
        tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
-       if (tbl == NULL)
-               return -ENOMEM;
+       if (unlikely(tbl == NULL)) {
+               size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE);
+               tbl = bucket_table_alloc(ht, size, GFP_KERNEL | __GFP_NOFAIL);
+       }
 
        atomic_set(&ht->nelems, 0);
 
@@ -510,15 +1068,170 @@ int rhashtable_init(struct rhashtable *ht,
 
        return 0;
 }
+EXPORT_SYMBOL_GPL(rhashtable_init);
 
-void rhashtable_destroy(struct rhashtable *ht)
+/**
+ * rhltable_init - initialize a new hash list table
+ * @hlt:       hash list table to be initialized
+ * @params:    configuration parameters
+ *
+ * Initializes a new hash list table.
+ *
+ * See documentation for rhashtable_init.
+ */
+int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params)
 {
-       struct bucket_table *tbl;
+       int err;
+
+       err = rhashtable_init(&hlt->ht, params);
+       hlt->ht.rhlist = true;
+       return err;
+}
+EXPORT_SYMBOL_GPL(rhltable_init);
+
+static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj,
+                               void (*free_fn)(void *ptr, void *arg),
+                               void *arg)
+{
+       struct rhlist_head *list;
+
+       if (!ht->rhlist) {
+               free_fn(rht_obj(ht, obj), arg);
+               return;
+       }
+
+       list = container_of(obj, struct rhlist_head, rhead);
+       do {
+               obj = &list->rhead;
+               list = rht_dereference(list->next, ht);
+               free_fn(rht_obj(ht, obj), arg);
+       } while (list);
+}
+
+/**
+ * rhashtable_free_and_destroy - free elements and destroy hash table
+ * @ht:                the hash table to destroy
+ * @free_fn:   callback to release resources of element
+ * @arg:       pointer passed to free_fn
+ *
+ * Stops an eventual async resize. If defined, invokes free_fn for each
+ * element to release resources. Please note that RCU protected
+ * readers may still be accessing the elements. Releasing of resources
+ * must occur in a compatible manner. Then frees the bucket array.
+ *
+ * This function will eventually sleep to wait for an async resize
+ * to complete. The caller is responsible that no further write operations
+ * occurs in parallel.
+ */
+void rhashtable_free_and_destroy(struct rhashtable *ht,
+                                void (*free_fn)(void *ptr, void *arg),
+                                void *arg)
+{
+       struct bucket_table *tbl, *next_tbl;
+       unsigned int i;
 
        cancel_work_sync(&ht->run_work);
 
        mutex_lock(&ht->mutex);
        tbl = rht_dereference(ht->tbl, ht);
+restart:
+       if (free_fn) {
+               for (i = 0; i < tbl->size; i++) {
+                       struct rhash_head *pos, *next;
+
+                       cond_resched();
+                       for (pos = rht_ptr_exclusive(rht_bucket(tbl, i)),
+                            next = !rht_is_a_nulls(pos) ?
+                                       rht_dereference(pos->next, ht) : NULL;
+                            !rht_is_a_nulls(pos);
+                            pos = next,
+                            next = !rht_is_a_nulls(pos) ?
+                                       rht_dereference(pos->next, ht) : NULL)
+                               rhashtable_free_one(ht, pos, free_fn, arg);
+               }
+       }
+
+       next_tbl = rht_dereference(tbl->future_tbl, ht);
        bucket_table_free(tbl);
+       if (next_tbl) {
+               tbl = next_tbl;
+               goto restart;
+       }
        mutex_unlock(&ht->mutex);
 }
+EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy);
+
+void rhashtable_destroy(struct rhashtable *ht)
+{
+       return rhashtable_free_and_destroy(ht, NULL, NULL);
+}
+EXPORT_SYMBOL_GPL(rhashtable_destroy);
+
+struct rhash_lock_head __rcu **__rht_bucket_nested(
+       const struct bucket_table *tbl, unsigned int hash)
+{
+       const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+       unsigned int index = hash & ((1 << tbl->nest) - 1);
+       unsigned int size = tbl->size >> tbl->nest;
+       unsigned int subhash = hash;
+       union nested_table *ntbl;
+
+       ntbl = nested_table_top(tbl);
+       ntbl = rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash);
+       subhash >>= tbl->nest;
+
+       while (ntbl && size > (1 << shift)) {
+               index = subhash & ((1 << shift) - 1);
+               ntbl = rht_dereference_bucket_rcu(ntbl[index].table,
+                                                 tbl, hash);
+               size >>= shift;
+               subhash >>= shift;
+       }
+
+       if (!ntbl)
+               return NULL;
+
+       return &ntbl[subhash].bucket;
+
+}
+EXPORT_SYMBOL_GPL(__rht_bucket_nested);
+
+struct rhash_lock_head __rcu **rht_bucket_nested(
+       const struct bucket_table *tbl, unsigned int hash)
+{
+       static struct rhash_lock_head __rcu *rhnull;
+
+       if (!rhnull)
+               INIT_RHT_NULLS_HEAD(rhnull);
+       return __rht_bucket_nested(tbl, hash) ?: &rhnull;
+}
+EXPORT_SYMBOL_GPL(rht_bucket_nested);
+
+struct rhash_lock_head __rcu **rht_bucket_nested_insert(
+       struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash)
+{
+       const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+       unsigned int index = hash & ((1 << tbl->nest) - 1);
+       unsigned int size = tbl->size >> tbl->nest;
+       union nested_table *ntbl;
+
+       ntbl = nested_table_top(tbl);
+       hash >>= tbl->nest;
+       ntbl = nested_table_alloc(ht, &ntbl[index].table,
+                                 size <= (1 << shift));
+
+       while (ntbl && size > (1 << shift)) {
+               index = hash & ((1 << shift) - 1);
+               size >>= shift;
+               hash >>= shift;
+               ntbl = nested_table_alloc(ht, &ntbl[index].table,
+                                         size <= (1 << shift));
+       }
+
+       if (!ntbl)
+               return NULL;
+
+       return &ntbl[hash].bucket;
+
+}
+EXPORT_SYMBOL_GPL(rht_bucket_nested_insert);
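For orientation, the nested-table index arithmetic above works out as follows on a common configuration (a worked example, not normative): with 4 KiB pages and 8-byte pointers, shift = PAGE_SHIFT - ilog2(sizeof(void *)) = 12 - 3 = 9, i.e. each nested level is a 512-entry page.

    /*
     * Example: nbuckets = 1 << 20, shift = 9:
     *   tbl->nest = (ilog2(1 << 20) - 1) % 9 + 1 = 19 % 9 + 1 = 2
     *   top level:   1 << 2 = 4 entries, each covering 1 << 18 buckets
     *   next level:  512-way tables, consuming 9 hash bits per step
     *   leaf level:  512 bucket pointers per page
     *   4 * 512 * 512 = 1 << 20 addressable buckets in total.
     */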
index d926e20f58d2cb37f814e300f667f55ff08a095c..1c7198d279bc9f534c946f2f56b65011a88ff452 100644 (file)
@@ -106,8 +106,6 @@ static void sched_init(void)
 {
        struct task_struct *p = malloc(sizeof(*p));
 
-       mlockall(MCL_CURRENT|MCL_FUTURE);
-
        memset(p, 0, sizeof(*p));
 
        p->state        = TASK_RUNNING;
index 7926be06e29f85983e331725877e351a56915680..f6c979aa6ae1bf751139ad604c3e56ead7cc777f 100644 (file)
@@ -28,7 +28,6 @@ void unregister_shrinker(struct shrinker *shrinker)
 struct meminfo {
        u64             total;
        u64             available;
-
 };
 
 static u64 parse_meminfo_line(const char *line)
@@ -50,7 +49,7 @@ static struct meminfo read_meminfo(void)
 
        f = fopen("/proc/meminfo", "r");
        if (!f)
-               die("error opening /proc/meminfo: %m");
+               return ret;
 
        while ((len = getline(&line, &n, f)) != -1) {
                if ((v = strcmp_prefix(line, "MemTotal:")))
@@ -77,10 +76,18 @@ void run_shrinkers(void)
                return;
 
        info = read_meminfo();
-       want_shrink = (info.total >> 2) - info.available;
 
-       if (want_shrink <= 0)
-               return;
+       if (info.total && info.available) {
+               want_shrink = (info.total >> 2) - info.available;
+
+               if (want_shrink <= 0)
+                       return;
+       } else {
+               /* If we weren't able to read /proc/meminfo, we must be pretty
+                * low: */
+
+               want_shrink = 8 << 20;
+       }
 
        mutex_lock(&shrinker_lock);
        list_for_each_entry(shrinker, &shrinker_list, list) {
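The fallback above boils down to: try to keep roughly a quarter of total memory available, and if /proc/meminfo cannot be read, assume memory pressure and free a fixed 8 MiB. A small worked example (illustrative numbers only):

    /*
     * total = 16 GiB, available = 3 GiB:
     *   want_shrink = (16 GiB >> 2) - 3 GiB = 1 GiB   -> shrink by ~1 GiB
     * total = 16 GiB, available = 6 GiB:
     *   want_shrink = 4 GiB - 6 GiB < 0               -> nothing to do
     * /proc/meminfo unreadable (total == available == 0):
     *   want_shrink = 8 << 20                         -> free a fixed 8 MiB
     */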
index 49d46ed2e18e9b8c5916e4f59e39b3ce24aa205b..fca1208720b67dfd7e96915679572737dd626dba 100644 (file)
@@ -2,11 +2,13 @@
 
 #include <linux/export.h>
 #include <linux/log2.h>
+#include <linux/percpu.h>
 #include <linux/preempt.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
 #include <linux/sched/rt.h>
 #include <linux/six.h>
+#include <linux/slab.h>
 
 #ifdef DEBUG
 #define EBUG_ON(cond)          BUG_ON(cond)
@@ -41,7 +43,7 @@ struct six_lock_vals {
 #define LOCK_VALS {                                                    \
        [SIX_LOCK_read] = {                                             \
                .lock_val       = __SIX_VAL(read_lock, 1),              \
-               .lock_fail      = __SIX_LOCK_HELD_write,                \
+               .lock_fail      = __SIX_LOCK_HELD_write + __SIX_VAL(write_locking, 1),\
                .unlock_val     = -__SIX_VAL(read_lock, 1),             \
                .held_mask      = __SIX_LOCK_HELD_read,                 \
                .unlock_wakeup  = SIX_LOCK_write,                       \
@@ -76,36 +78,196 @@ static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type,
        }
 }
 
+static inline unsigned pcpu_read_count(struct six_lock *lock)
+{
+       unsigned read_count = 0;
+       int cpu;
+
+       for_each_possible_cpu(cpu)
+               read_count += *per_cpu_ptr(lock->readers, cpu);
+       return read_count;
+}
+
+struct six_lock_waiter {
+       struct list_head        list;
+       struct task_struct      *task;
+};
+
+/* This is probably up there with the more evil things I've done */
+#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l))
+
+static inline void six_lock_wakeup(struct six_lock *lock,
+                                  union six_lock_state state,
+                                  unsigned waitlist_id)
+{
+       if (waitlist_id == SIX_LOCK_write) {
+               if (state.write_locking && !state.read_lock) {
+                       struct task_struct *p = READ_ONCE(lock->owner);
+                       if (p)
+                               wake_up_process(p);
+               }
+       } else {
+               struct list_head *wait_list = &lock->wait_list[waitlist_id];
+               struct six_lock_waiter *w, *next;
+
+               if (!(state.waiters & (1 << waitlist_id)))
+                       return;
+
+               clear_bit(waitlist_bitnr(waitlist_id),
+                         (unsigned long *) &lock->state.v);
+
+               raw_spin_lock(&lock->wait_lock);
+
+               list_for_each_entry_safe(w, next, wait_list, list) {
+                       list_del_init(&w->list);
+
+                       if (wake_up_process(w->task) &&
+                           waitlist_id != SIX_LOCK_read) {
+                               if (!list_empty(wait_list))
+                                       set_bit(waitlist_bitnr(waitlist_id),
+                                               (unsigned long *) &lock->state.v);
+                               break;
+                       }
+               }
+
+               raw_spin_unlock(&lock->wait_lock);
+       }
+}
+
 static __always_inline bool do_six_trylock_type(struct six_lock *lock,
-                                               enum six_lock_type type)
+                                               enum six_lock_type type,
+                                               bool try)
 {
        const struct six_lock_vals l[] = LOCK_VALS;
-       union six_lock_state old;
-       u64 v = READ_ONCE(lock->state.v);
+       union six_lock_state old, new;
+       bool ret;
+       u64 v;
 
        EBUG_ON(type == SIX_LOCK_write && lock->owner != current);
+       EBUG_ON(type == SIX_LOCK_write && (lock->state.seq & 1));
 
-       do {
-               old.v = v;
+       EBUG_ON(type == SIX_LOCK_write && (try != !(lock->state.write_locking)));
 
-               EBUG_ON(type == SIX_LOCK_write &&
-                       ((old.v & __SIX_LOCK_HELD_write) ||
-                        !(old.v & __SIX_LOCK_HELD_intent)));
+       /*
+        * Percpu reader mode:
+        *
+        * The basic idea behind this algorithm is that you can implement a lock
+        * between two threads without any atomics, just memory barriers:
+        *
+        * For two threads you'll need two variables, one variable for "thread a
+        * has the lock" and another for "thread b has the lock".
+        *
+        * To take the lock, a thread sets its variable indicating that it holds
+        * the lock, then issues a full memory barrier, then reads from the
+        * other thread's variable to check if the other thread thinks it has
+        * the lock. If we raced, we backoff and retry/sleep.
+        */
 
-               if (old.v & l[type].lock_fail)
-                       return false;
-       } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
-                               old.v,
-                               old.v + l[type].lock_val)) != old.v);
+       if (type == SIX_LOCK_read && lock->readers) {
+retry:
+               preempt_disable();
+               this_cpu_inc(*lock->readers); /* signal that we own lock */
 
-       six_set_owner(lock, type, old);
-       return true;
+               smp_mb();
+
+               old.v = READ_ONCE(lock->state.v);
+               ret = !(old.v & l[type].lock_fail);
+
+               this_cpu_sub(*lock->readers, !ret);
+               preempt_enable();
+
+               /*
+                * If we failed because a writer was trying to take the
+                * lock, issue a wakeup because we might have caused a
+                * spurious trylock failure:
+                */
+               if (old.write_locking) {
+                       struct task_struct *p = READ_ONCE(lock->owner);
+
+                       if (p)
+                               wake_up_process(p);
+               }
+
+               /*
+                * If we failed from the lock path and the waiting bit wasn't
+                * set, set it:
+                */
+               if (!try && !ret) {
+                       v = old.v;
+
+                       do {
+                               new.v = old.v = v;
+
+                               if (!(old.v & l[type].lock_fail))
+                                       goto retry;
+
+                               if (new.waiters & (1 << type))
+                                       break;
+
+                               new.waiters |= 1 << type;
+                       } while ((v = atomic64_cmpxchg(&lock->state.counter,
+                                                      old.v, new.v)) != old.v);
+               }
+       } else if (type == SIX_LOCK_write && lock->readers) {
+               if (try) {
+                       atomic64_add(__SIX_VAL(write_locking, 1),
+                                    &lock->state.counter);
+                       smp_mb__after_atomic();
+               }
+
+               ret = !pcpu_read_count(lock);
+
+               /*
+                * On success, we increment lock->seq; also we clear
+                * write_locking unless we failed from the lock path:
+                */
+               v = 0;
+               if (ret)
+                       v += __SIX_VAL(seq, 1);
+               if (ret || try)
+                       v -= __SIX_VAL(write_locking, 1);
+
+               if (try && !ret) {
+                       old.v = atomic64_add_return(v, &lock->state.counter);
+                       six_lock_wakeup(lock, old, SIX_LOCK_read);
+               } else {
+                       atomic64_add(v, &lock->state.counter);
+               }
+       } else {
+               v = READ_ONCE(lock->state.v);
+               do {
+                       new.v = old.v = v;
+
+                       if (!(old.v & l[type].lock_fail)) {
+                               new.v += l[type].lock_val;
+
+                               if (type == SIX_LOCK_write)
+                                       new.write_locking = 0;
+                       } else if (!try && type != SIX_LOCK_write &&
+                                  !(new.waiters & (1 << type)))
+                               new.waiters |= 1 << type;
+                       else
+                               break; /* waiting bit already set */
+               } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
+                                       old.v, new.v)) != old.v);
+
+               ret = !(old.v & l[type].lock_fail);
+
+               EBUG_ON(ret && !(lock->state.v & l[type].held_mask));
+       }
+
+       if (ret)
+               six_set_owner(lock, type, old);
+
+       EBUG_ON(type == SIX_LOCK_write && (try || ret) && (lock->state.write_locking));
+
+       return ret;
 }
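
The "Percpu reader mode" comment above describes the classic two-flag, barrier-only exclusion scheme. As a rough illustration only (not code from this commit), the same protocol can be written in userspace C11, with atomic_thread_fence() standing in for smp_mb(); the names below are invented for the sketch:

#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool want[2];	/* want[i]: thread i claims the lock */

static void flag_lock(int me)
{
	int other = !me;

	for (;;) {
		/* claim the lock... */
		atomic_store_explicit(&want[me], true, memory_order_relaxed);
		/* ...full barrier (smp_mb() in the kernel)... */
		atomic_thread_fence(memory_order_seq_cst);
		/* ...then check whether the other side also claims it */
		if (!atomic_load_explicit(&want[other], memory_order_relaxed))
			return;

		/* raced: back off and retry (a real lock would sleep) */
		atomic_store_explicit(&want[me], false, memory_order_relaxed);
		sched_yield();
	}
}

static void flag_unlock(int me)
{
	atomic_store_explicit(&want[me], false, memory_order_release);
}

In the SIX_LOCK_read fast path above, the per-cpu counter plays the reader's "flag" and write_locking plays the writer's, with smp_mb() as the full barrier between setting one and reading the other.
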
 
 __always_inline __flatten
 static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type)
 {
-       if (!do_six_trylock_type(lock, type))
+       if (!do_six_trylock_type(lock, type, true))
                return false;
 
        if (type != SIX_LOCK_write)
@@ -119,8 +281,43 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
 {
        const struct six_lock_vals l[] = LOCK_VALS;
        union six_lock_state old;
-       u64 v = READ_ONCE(lock->state.v);
+       u64 v;
+
+       EBUG_ON(type == SIX_LOCK_write);
+
+       if (type == SIX_LOCK_read &&
+           lock->readers) {
+               bool ret;
 
+               preempt_disable();
+               this_cpu_inc(*lock->readers);
+
+               smp_mb();
+
+               old.v = READ_ONCE(lock->state.v);
+               ret = !(old.v & l[type].lock_fail) && old.seq == seq;
+
+               this_cpu_sub(*lock->readers, !ret);
+               preempt_enable();
+
+               /*
+                * Similar to the lock path, we may have caused a spurious write
+                * lock failure and need to issue a wakeup:
+                */
+               if (old.write_locking) {
+                       struct task_struct *p = READ_ONCE(lock->owner);
+
+                       if (p)
+                               wake_up_process(p);
+               }
+
+               if (ret)
+                       six_acquire(&lock->dep_map, 1);
+
+               return ret;
+       }
+
+       v = READ_ONCE(lock->state.v);
        do {
                old.v = v;
 
@@ -136,14 +333,6 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
        return true;
 }
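
Usage sketch (not part of this diff): the seq comparison in __six_relock_type() lets a caller drop a read lock and later revalidate that no writer intervened, since seq changes whenever the write lock is taken. The sketch assumes the six_lock_read()/six_unlock_read()/six_relock_read() wrappers declared in include/linux/six.h; the function name is invented:

static bool read_then_revalidate(struct six_lock *lock)
{
	u32 seq;

	six_lock_read(lock, NULL, NULL);
	seq = lock->state.seq;		/* even while no writer holds the lock */
	six_unlock_read(lock);

	/* ... work whose results are only valid if no writer ran ... */

	/* succeeds only if seq is unchanged, i.e. no write lock was taken */
	return six_relock_read(lock, seq);
}
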
 
-struct six_lock_waiter {
-       struct list_head        list;
-       struct task_struct      *task;
-};
-
-/* This is probably up there with the more evil things I've done */
-#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l))
-
 #ifdef CONFIG_LOCK_SPIN_ON_OWNER
 
 static inline int six_can_spin_on_owner(struct six_lock *lock)
@@ -218,7 +407,7 @@ static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type
                if (owner && !six_spin_on_owner(lock, owner))
                        break;
 
-               if (do_six_trylock_type(lock, type)) {
+               if (do_six_trylock_type(lock, type, false)) {
                        osq_unlock(&lock->osq);
                        preempt_enable();
                        return true;
@@ -270,18 +459,22 @@ noinline
 static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type,
                                    six_lock_should_sleep_fn should_sleep_fn, void *p)
 {
-       const struct six_lock_vals l[] = LOCK_VALS;
-       union six_lock_state old, new;
+       union six_lock_state old;
        struct six_lock_waiter wait;
        int ret = 0;
-       u64 v;
+
+       if (type == SIX_LOCK_write) {
+               EBUG_ON(lock->state.write_locking);
+               atomic64_add(__SIX_VAL(write_locking, 1), &lock->state.counter);
+               smp_mb__after_atomic();
+       }
 
        ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
        if (ret)
-               return ret;
+               goto out_before_sleep;
 
        if (six_optimistic_spin(lock, type))
-               return 0;
+               goto out_before_sleep;
 
        lock_contended(&lock->dep_map, _RET_IP_);
 
@@ -298,32 +491,16 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty
                        raw_spin_unlock(&lock->wait_lock);
                }
 
-               ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
-               if (ret)
+               if (do_six_trylock_type(lock, type, false))
                        break;
 
-               v = READ_ONCE(lock->state.v);
-               do {
-                       new.v = old.v = v;
-
-                       if (!(old.v & l[type].lock_fail))
-                               new.v += l[type].lock_val;
-                       else if (!(new.waiters & (1 << type)))
-                               new.waiters |= 1 << type;
-                       else
-                               break; /* waiting bit already set */
-               } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
-                                       old.v, new.v)) != old.v);
-
-               if (!(old.v & l[type].lock_fail))
+               ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
+               if (ret)
                        break;
 
                schedule();
        }
 
-       if (!ret)
-               six_set_owner(lock, type, old);
-
        __set_current_state(TASK_RUNNING);
 
        if (!list_empty_careful(&wait.list)) {
@@ -331,6 +508,12 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty
                list_del_init(&wait.list);
                raw_spin_unlock(&lock->wait_lock);
        }
+out_before_sleep:
+       if (ret && type == SIX_LOCK_write) {
+               old.v = atomic64_sub_return(__SIX_VAL(write_locking, 1),
+                                           &lock->state.counter);
+               six_lock_wakeup(lock, old, SIX_LOCK_read);
+       }
 
        return ret;
 }
@@ -344,7 +527,7 @@ static int __six_lock_type(struct six_lock *lock, enum six_lock_type type,
        if (type != SIX_LOCK_write)
                six_acquire(&lock->dep_map, 0);
 
-       ret = do_six_trylock_type(lock, type) ? 0
+       ret = do_six_trylock_type(lock, type, true) ? 0
                : __six_lock_type_slowpath(lock, type, should_sleep_fn, p);
 
        if (ret && type != SIX_LOCK_write)
@@ -355,54 +538,12 @@ static int __six_lock_type(struct six_lock *lock, enum six_lock_type type,
        return ret;
 }
 
-static inline void six_lock_wakeup(struct six_lock *lock,
-                                  union six_lock_state state,
-                                  unsigned waitlist_id)
-{
-       struct list_head *wait_list = &lock->wait_list[waitlist_id];
-       struct six_lock_waiter *w, *next;
-
-       if (waitlist_id == SIX_LOCK_write && state.read_lock)
-               return;
-
-       if (!(state.waiters & (1 << waitlist_id)))
-               return;
-
-       clear_bit(waitlist_bitnr(waitlist_id),
-                 (unsigned long *) &lock->state.v);
-
-       if (waitlist_id == SIX_LOCK_write) {
-               struct task_struct *p = READ_ONCE(lock->owner);
-
-               if (p)
-                       wake_up_process(p);
-               return;
-       }
-
-       raw_spin_lock(&lock->wait_lock);
-
-       list_for_each_entry_safe(w, next, wait_list, list) {
-               list_del_init(&w->list);
-
-               if (wake_up_process(w->task) &&
-                   waitlist_id != SIX_LOCK_read) {
-                       if (!list_empty(wait_list))
-                               set_bit(waitlist_bitnr(waitlist_id),
-                                       (unsigned long *) &lock->state.v);
-                       break;
-               }
-       }
-
-       raw_spin_unlock(&lock->wait_lock);
-}
-
 __always_inline __flatten
 static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type)
 {
        const struct six_lock_vals l[] = LOCK_VALS;
        union six_lock_state state;
 
-       EBUG_ON(!(lock->state.v & l[type].held_mask));
        EBUG_ON(type == SIX_LOCK_write &&
                !(lock->state.v & __SIX_LOCK_HELD_intent));
 
@@ -420,8 +561,18 @@ static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type)
                lock->owner = NULL;
        }
 
-       state.v = atomic64_add_return_release(l[type].unlock_val,
-                                             &lock->state.counter);
+       if (type == SIX_LOCK_read &&
+           lock->readers) {
+               smp_mb(); /* unlock barrier */
+               this_cpu_dec(*lock->readers);
+               smp_mb(); /* between unlocking and checking for waiters */
+               state.v = READ_ONCE(lock->state.v);
+       } else {
+               EBUG_ON(!(lock->state.v & l[type].held_mask));
+               state.v = atomic64_add_return_release(l[type].unlock_val,
+                                                     &lock->state.counter);
+       }
+
        six_lock_wakeup(lock, state, l[type].unlock_wakeup);
 }
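
A note on the two smp_mb() calls in the percpu read-unlock path above: the first orders the critical section before the per-cpu decrement, and the second pairs with the smp_mb__after_atomic() a writer issues after setting write_locking, so either the writer's reader-count sum observes the decrement or the unlocking reader sees the writer's state update and wakes it. A rough userspace model of that handshake (illustrative only; the names and the fixed-size "per-cpu" array are invented) is:

#include <stdatomic.h>
#include <stdbool.h>

#define NR_CPUS 4

static _Atomic unsigned readers[NR_CPUS];	/* stand-in for lock->readers */
static atomic_bool write_locking;		/* stand-in for state.write_locking */
static atomic_bool writer_needs_wakeup;		/* stand-in for wake_up_process() */

static bool writer_sees_no_readers(void)
{
	unsigned sum = 0, i;

	atomic_store_explicit(&write_locking, true, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb__after_atomic() */

	for (i = 0; i < NR_CPUS; i++)
		sum += atomic_load_explicit(&readers[i], memory_order_relaxed);

	return sum == 0;	/* on failure the writer sleeps until woken */
}

static void reader_unlock(unsigned cpu)
{
	atomic_thread_fence(memory_order_seq_cst);	/* "unlock barrier" */
	atomic_fetch_sub_explicit(&readers[cpu], 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* before checking for waiters */

	if (atomic_load_explicit(&write_locking, memory_order_relaxed))
		atomic_store_explicit(&writer_needs_wakeup, true,
				      memory_order_relaxed);
}

With both fences in place, the store-buffering outcome where the writer misses the decrement and the reader misses write_locking is forbidden, so a wakeup cannot be lost.
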
 
@@ -467,26 +618,28 @@ EXPORT_SYMBOL_GPL(six_lock_downgrade);
 
 bool six_lock_tryupgrade(struct six_lock *lock)
 {
-       const struct six_lock_vals l[] = LOCK_VALS;
        union six_lock_state old, new;
        u64 v = READ_ONCE(lock->state.v);
 
        do {
                new.v = old.v = v;
 
-               EBUG_ON(!(old.v & l[SIX_LOCK_read].held_mask));
-
-               new.v += l[SIX_LOCK_read].unlock_val;
-
-               if (new.v & l[SIX_LOCK_intent].lock_fail)
+               if (new.intent_lock)
                        return false;
 
-               new.v += l[SIX_LOCK_intent].lock_val;
+               if (!lock->readers) {
+                       EBUG_ON(!new.read_lock);
+                       new.read_lock--;
+               }
+
+               new.intent_lock = 1;
        } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
                                old.v, new.v)) != old.v);
 
+       if (lock->readers)
+               this_cpu_dec(*lock->readers);
+
        six_set_owner(lock, SIX_LOCK_intent, old);
-       six_lock_wakeup(lock, new, l[SIX_LOCK_read].unlock_wakeup);
 
        return true;
 }
@@ -518,16 +671,22 @@ void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
 {
        const struct six_lock_vals l[] = LOCK_VALS;
 
-       EBUG_ON(type == SIX_LOCK_write);
        six_acquire(&lock->dep_map, 0);
 
        /* XXX: assert already locked, and that we don't overflow: */
 
        switch (type) {
        case SIX_LOCK_read:
-               atomic64_add(l[type].lock_val, &lock->state.counter);
+               if (lock->readers) {
+                       this_cpu_inc(*lock->readers);
+               } else {
+                       EBUG_ON(!lock->state.read_lock &&
+                               !lock->state.intent_lock);
+                       atomic64_add(l[type].lock_val, &lock->state.counter);
+               }
                break;
        case SIX_LOCK_intent:
+               EBUG_ON(!lock->state.intent_lock);
                lock->intent_lock_recurse++;
                break;
        case SIX_LOCK_write:
@@ -551,3 +710,50 @@ void six_lock_wakeup_all(struct six_lock *lock)
        raw_spin_unlock(&lock->wait_lock);
 }
 EXPORT_SYMBOL_GPL(six_lock_wakeup_all);
+
+struct free_pcpu_rcu {
+       struct rcu_head         rcu;
+       void __percpu           *p;
+};
+
+static void free_pcpu_rcu_fn(struct rcu_head *_rcu)
+{
+       struct free_pcpu_rcu *rcu =
+               container_of(_rcu, struct free_pcpu_rcu, rcu);
+
+       free_percpu(rcu->p);
+       kfree(rcu);
+}
+
+void six_lock_pcpu_free_rcu(struct six_lock *lock)
+{
+       struct free_pcpu_rcu *rcu = kzalloc(sizeof(*rcu), GFP_KERNEL);
+
+       if (!rcu)
+               return;
+
+       rcu->p = lock->readers;
+       lock->readers = NULL;
+
+       call_rcu(&rcu->rcu, free_pcpu_rcu_fn);
+}
+EXPORT_SYMBOL_GPL(six_lock_pcpu_free_rcu);
+
+void six_lock_pcpu_free(struct six_lock *lock)
+{
+       BUG_ON(lock->readers && pcpu_read_count(lock));
+       BUG_ON(lock->state.read_lock);
+
+       free_percpu(lock->readers);
+       lock->readers = NULL;
+}
+EXPORT_SYMBOL_GPL(six_lock_pcpu_free);
+
+void six_lock_pcpu_alloc(struct six_lock *lock)
+{
+#ifdef __KERNEL__
+       if (!lock->readers)
+               lock->readers = alloc_percpu(unsigned);
+#endif
+}
+EXPORT_SYMBOL_GPL(six_lock_pcpu_alloc);
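
A hedged lifecycle sketch (not part of this diff) of how a caller might opt a lock into per-cpu reader mode and tear it down again; it assumes the six_lock_init() macro from include/linux/six.h, and the function name is invented:

static void pcpu_mode_lifecycle(struct six_lock *lock)
{
	six_lock_init(lock);

	/*
	 * Opt into per-cpu reader mode; on allocation failure lock->readers
	 * stays NULL and the lock simply keeps using the atomic counter:
	 */
	six_lock_pcpu_alloc(lock);

	/* ... readers now take the lock->readers fast paths above ... */

	/*
	 * Tear down: six_lock_pcpu_free() requires that no readers remain;
	 * six_lock_pcpu_free_rcu() instead defers the free_percpu() past a
	 * grace period for users that may still dereference lock->readers:
	 */
	six_lock_pcpu_free_rcu(lock);
}
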
index 88e923cb22e3b242970ee6e4dd123fb8dc38230f..361419aec5fc09a09611ef9e84f9d8e393076d1b 100644 (file)
@@ -663,3 +663,22 @@ int dev_mounted(char *dev)
                return 1;
        return 2;
 }
+
+struct bpos bpos_parse(char *buf)
+{
+       char *s = buf, *field;
+       u64 inode_v = 0, offset_v = 0;
+
+       if (!(field = strsep(&s, ":")) ||
+           kstrtoull(field, 10, &inode_v))
+               die("invalid bpos %s", buf);
+
+       if ((field = strsep(&s, ":")) &&
+           kstrtoull(field, 10, &offset_v))
+               die("invalid bpos %s", buf);
+
+       if (s)
+               die("invalid bpos %s", buf);
+
+       return (struct bpos) { .inode = inode_v, .offset = offset_v };
+}
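
Usage note (example values invented): bpos_parse() accepts either "inode" or "inode:offset" in decimal and die()s on anything else. Because strsep() writes NUL bytes into its argument, the caller must pass a writable buffer rather than a string literal:

	char range_start[] = "4096";		/* offset defaults to 0 */
	char range_end[]   = "4096:1024";

	struct bpos start = bpos_parse(range_start);	/* { .inode = 4096, .offset = 0 }    */
	struct bpos end   = bpos_parse(range_end);	/* { .inode = 4096, .offset = 1024 } */
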
index d6814bcd971d40468279cc9052ae8b7b0ef91be2..01898e21d050c275d0b03c732237d1731d21c5a0 100644 (file)
@@ -172,4 +172,6 @@ do {                                                                        \
        _ret;                                                           \
 })
 
+struct bpos bpos_parse(char *);
+
 #endif /* _TOOLS_UTIL_H */