From ff247cc54875d28a82245371f174e15eb304c367 Mon Sep 17 00:00:00 2001 From: Jonathan Carter Date: Tue, 6 Apr 2021 15:19:46 +0200 Subject: [PATCH] New upstream snapshot --- Makefile | 5 + bcachefs.c | 5 + cmd_data.c | 82 +- cmd_debug.c | 47 +- cmd_device.c | 219 ++++- cmd_format.c | 25 +- cmd_fs.c | 13 +- cmd_fsck.c | 8 +- cmd_migrate.c | 19 +- cmds.h | 2 + debian/changelog | 9 + debian/control | 2 +- debian/files | 2 +- include/linux/bitops.h | 11 + include/linux/cpumask.h | 2 + include/linux/generic-radix-tree.h | 8 + include/linux/kernel.h | 2 + include/linux/list.h | 10 +- include/linux/list_nulls.h | 145 +++ include/linux/overflow.h | 346 +++++++ include/linux/page.h | 2 + include/linux/poison.h | 85 ++ include/linux/random.h | 1 + include/linux/rcupdate.h | 28 + include/linux/rhashtable-types.h | 135 +++ include/linux/rhashtable.h | 1205 ++++++++++++++++++++---- include/linux/sched/mm.h | 15 +- include/linux/six.h | 8 +- include/linux/slab.h | 34 +- include/linux/srcu.h | 31 + include/linux/types.h | 3 + include/linux/vmalloc.h | 2 + include/linux/wait.h | 1 + include/trace/events/bcachefs.h | 120 ++- libbcachefs.c | 113 ++- libbcachefs.h | 24 +- libbcachefs/acl.c | 35 +- libbcachefs/acl.h | 4 +- libbcachefs/alloc_background.c | 620 ++++++------- libbcachefs/alloc_background.h | 51 +- libbcachefs/alloc_foreground.c | 100 +- libbcachefs/alloc_types.h | 37 +- libbcachefs/bcachefs.h | 92 +- libbcachefs/bcachefs_format.h | 285 ++++-- libbcachefs/bcachefs_ioctl.h | 34 +- libbcachefs/bkey.c | 40 +- libbcachefs/bkey.h | 177 ++-- libbcachefs/bkey_buf.h | 60 ++ libbcachefs/bkey_methods.c | 124 ++- libbcachefs/bkey_methods.h | 1 - libbcachefs/bkey_on_stack.h | 43 - libbcachefs/bkey_sort.c | 310 +------ libbcachefs/bkey_sort.h | 8 - libbcachefs/bset.c | 236 ++--- libbcachefs/bset.h | 93 +- libbcachefs/btree_cache.c | 216 ++--- libbcachefs/btree_cache.h | 14 +- libbcachefs/btree_gc.c | 675 +++++++++----- libbcachefs/btree_gc.h | 15 +- libbcachefs/btree_io.c | 506 ++++------ libbcachefs/btree_io.h | 55 +- libbcachefs/btree_iter.c | 1340 +++++++++++++-------------- libbcachefs/btree_iter.h | 112 ++- libbcachefs/btree_key_cache.c | 427 ++++++--- libbcachefs/btree_key_cache.h | 28 + libbcachefs/btree_locking.h | 32 +- libbcachefs/btree_types.h | 130 +-- libbcachefs/btree_update.h | 8 +- libbcachefs/btree_update_interior.c | 667 +++++++------ libbcachefs/btree_update_interior.h | 32 +- libbcachefs/btree_update_leaf.c | 690 ++++++++------ libbcachefs/buckets.c | 1117 ++++++++++++---------- libbcachefs/buckets.h | 64 +- libbcachefs/buckets_types.h | 46 +- libbcachefs/chardev.c | 38 +- libbcachefs/checksum.h | 6 +- libbcachefs/clock.c | 8 +- libbcachefs/clock_types.h | 2 +- libbcachefs/compress.c | 15 +- libbcachefs/debug.c | 20 +- libbcachefs/debug.h | 33 +- libbcachefs/dirent.c | 31 +- libbcachefs/dirent.h | 6 +- libbcachefs/ec.c | 871 +++++++++-------- libbcachefs/ec.h | 87 +- libbcachefs/ec_types.h | 10 +- libbcachefs/error.c | 10 +- libbcachefs/error.h | 29 +- libbcachefs/extent_update.c | 47 +- libbcachefs/extents.c | 180 ++-- libbcachefs/extents.h | 31 +- libbcachefs/fs-common.c | 74 +- libbcachefs/fs-io.c | 334 ++++--- libbcachefs/fs-io.h | 3 +- libbcachefs/fs-ioctl.c | 4 +- libbcachefs/fs.c | 161 ++-- libbcachefs/fs.h | 10 +- libbcachefs/fsck.c | 355 +++---- libbcachefs/inode.c | 414 ++++++--- libbcachefs/inode.h | 26 +- libbcachefs/io.c | 390 ++++---- libbcachefs/io.h | 50 +- libbcachefs/io_types.h | 14 +- libbcachefs/journal.c | 639 +++++++------ libbcachefs/journal.h | 110 ++- 
libbcachefs/journal_io.c | 765 +++++++++++---- libbcachefs/journal_io.h | 10 +- libbcachefs/journal_reclaim.c | 583 ++++++++---- libbcachefs/journal_reclaim.h | 53 +- libbcachefs/journal_seq_blacklist.c | 5 +- libbcachefs/journal_types.h | 75 +- libbcachefs/migrate.c | 35 +- libbcachefs/move.c | 294 ++++-- libbcachefs/move.h | 8 +- libbcachefs/movinggc.c | 36 +- libbcachefs/opts.c | 51 +- libbcachefs/opts.h | 28 +- libbcachefs/quota.c | 17 +- libbcachefs/rebalance.c | 30 +- libbcachefs/rebalance_types.h | 2 +- libbcachefs/recovery.c | 755 +++++++-------- libbcachefs/recovery.h | 17 +- libbcachefs/reflink.c | 62 +- libbcachefs/replicas.c | 228 +++-- libbcachefs/replicas.h | 40 +- libbcachefs/str_hash.h | 21 +- libbcachefs/super-io.c | 151 +-- libbcachefs/super-io.h | 7 +- libbcachefs/super.c | 202 ++-- libbcachefs/super.h | 10 +- libbcachefs/super_types.h | 2 +- libbcachefs/sysfs.c | 159 ++-- libbcachefs/tests.c | 327 ++++--- libbcachefs/tests.h | 2 +- libbcachefs/util.c | 2 +- libbcachefs/util.h | 16 +- libbcachefs/varint.c | 42 + libbcachefs/varint.h | 8 + libbcachefs/xattr.c | 41 +- linux/generic-radix-tree.c | 17 +- linux/kthread.c | 2 +- linux/rhashtable.c | 1081 +++++++++++++++++---- linux/sched.c | 2 - linux/shrinker.c | 17 +- linux/six.c | 418 ++++++--- tools-util.c | 19 + tools-util.h | 2 + 147 files changed, 12785 insertions(+), 8158 deletions(-) create mode 100644 include/linux/list_nulls.h create mode 100644 include/linux/overflow.h create mode 100644 include/linux/poison.h create mode 100644 include/linux/rhashtable-types.h create mode 100644 include/linux/srcu.h create mode 100644 libbcachefs/bkey_buf.h delete mode 100644 libbcachefs/bkey_on_stack.h create mode 100644 libbcachefs/varint.c create mode 100644 libbcachefs/varint.h diff --git a/Makefile b/Makefile index cc00ac6..3fe9604 100644 --- a/Makefile +++ b/Makefile @@ -6,6 +6,7 @@ PYTEST=pytest-3 CFLAGS+=-std=gnu89 -O2 -g -MMD -Wall \ -Wno-pointer-sign \ -Wno-zero-length-bounds \ + -Wno-stringop-overflow \ -fno-strict-aliasing \ -fno-delete-null-pointer-checks \ -I. 
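Illustrative sketch, not part of the diff: with the bch_ioctl_data range now expressed as (start_btree, start_pos)..(end_btree, end_pos) instead of a single start/end pair, a caller that wants to confine a job to one btree fills both btree fields with the same id, as cmd_data_job() does for -b. The helper name below is invented; BCH_DATA_OP_REREPLICATE, BTREE_ID_extents, POS_MIN/POS_MAX, bcache_fs_open() and bchu_data() all appear elsewhere in this patch.

        /* Hypothetical helper: rereplicate only the extents btree. */
        static int rereplicate_extents_only(const char *fs_path)
        {
                struct bch_ioctl_data job = {
                        .op             = BCH_DATA_OP_REREPLICATE,
                        .start_btree    = BTREE_ID_extents,
                        .start_pos      = POS_MIN,
                        .end_btree      = BTREE_ID_extents,     /* same id: single-btree range */
                        .end_pos        = POS_MAX,
                };

                return bchu_data(bcache_fs_open(fs_path), job);
        }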
-Iinclude -Iraid \ @@ -155,6 +156,10 @@ update-bcachefs-sources: git add linux/six.c cp $(LINUX_DIR)/include/linux/six.h include/linux/ git add include/linux/six.h + cp $(LINUX_DIR)/include/linux/list_nulls.h include/linux/ + git add include/linux/list_nulls.h + cp $(LINUX_DIR)/include/linux/poison.h include/linux/ + git add include/linux/poison.h $(RM) libbcachefs/*.mod.c git -C $(LINUX_DIR) rev-parse HEAD | tee .bcachefs_revision git add .bcachefs_revision diff --git a/bcachefs.c b/bcachefs.c index b4958f1..e9ff7d1 100644 --- a/bcachefs.c +++ b/bcachefs.c @@ -59,6 +59,7 @@ static void usage(void) "\n" "Commands for managing filesystem data:\n" " data rereplicate Rereplicate degraded data\n" + " data job Kick off low level data jobs\n" "\n" "Encryption:\n" " unlock Unlock an encrypted filesystem prior to running/mounting\n" @@ -128,6 +129,8 @@ static int device_cmds(int argc, char *argv[]) return cmd_device_set_state(argc, argv); if (!strcmp(cmd, "resize")) return cmd_device_resize(argc, argv); + if (!strcmp(cmd, "resize-journal")) + return cmd_device_resize_journal(argc, argv); usage(); return 0; @@ -139,6 +142,8 @@ static int data_cmds(int argc, char *argv[]) if (!strcmp(cmd, "rereplicate")) return cmd_data_rereplicate(argc, argv); + if (!strcmp(cmd, "job")) + return cmd_data_job(argc, argv); usage(); return 0; diff --git a/cmd_data.c b/cmd_data.c index f495b6c..25a2dcb 100644 --- a/cmd_data.c +++ b/cmd_data.c @@ -4,6 +4,7 @@ #include #include "libbcachefs/bcachefs_ioctl.h" +#include "libbcachefs/btree_cache.h" #include "cmds.h" #include "libbcachefs.h" @@ -41,8 +42,83 @@ int cmd_data_rereplicate(int argc, char *argv[]) die("too many arguments"); return bchu_data(bcache_fs_open(fs_path), (struct bch_ioctl_data) { - .op = BCH_DATA_OP_REREPLICATE, - .start = POS_MIN, - .end = POS_MAX, + .op = BCH_DATA_OP_REREPLICATE, + .start_btree = 0, + .start_pos = POS_MIN, + .end_btree = BTREE_ID_NR, + .end_pos = POS_MAX, }); } + +static void data_job_usage(void) +{ + puts("bcachefs data job\n" + "Usage: bcachefs data job [job} filesystem\n" + "\n" + "Kick off a data job and report progress\n" + "\n" + "job: one of scrub, rereplicate, migrate, or rewrite_old_nodes\n" + "\n" + "Options:\n" + " -b btree btree to operate on\n" + " -s inode:offset start position\n" + " -e inode:offset end position\n" + " -h, --help display this help and exit\n" + "Report bugs to "); + exit(EXIT_SUCCESS); +} + +const char * const data_jobs[] = { + "scrub", + "rereplicate", + "migrate", + "rewrite_old_nodes", + NULL +}; + +int cmd_data_job(int argc, char *argv[]) +{ + struct bch_ioctl_data op = { + .start_btree = 0, + .start_pos = POS_MIN, + .end_btree = BTREE_ID_NR, + .end_pos = POS_MAX, + }; + int opt; + + while ((opt = getopt(argc, argv, "s:e:h")) != -1) + switch (opt) { + case 'b': + op.start_btree = read_string_list_or_die(optarg, + bch2_btree_ids, "btree id"); + op.end_btree = op.start_btree; + break; + case 's': + op.start_pos = bpos_parse(optarg); + break; + op.end_pos = bpos_parse(optarg); + case 'e': + break; + case 'h': + data_job_usage(); + } + args_shift(optind); + + char *job = arg_pop(); + if (!job) + die("please specify which type of job"); + + op.op = read_string_list_or_die(job, data_jobs, "bad job type"); + + if (op.op == BCH_DATA_OP_SCRUB) + die("scrub not implemented yet"); + + char *fs_path = arg_pop(); + if (!fs_path) + fs_path = "."; + + if (argc) + die("too many arguments"); + + return bchu_data(bcache_fs_open(fs_path), op); +} diff --git a/cmd_debug.c b/cmd_debug.c index 4616447..4938ec0 100644 --- 
a/cmd_debug.c +++ b/cmd_debug.c @@ -114,7 +114,7 @@ int cmd_dump(int argc, char *argv[]) opt_set(opts, nochanges, true); opt_set(opts, norecovery, true); opt_set(opts, degraded, true); - opt_set(opts, errors, BCH_ON_ERROR_CONTINUE); + opt_set(opts, errors, BCH_ON_ERROR_continue); opt_set(opts, fix_errors, FSCK_OPT_YES); while ((opt = getopt(argc, argv, "o:fvh")) != -1) @@ -317,13 +317,13 @@ static void print_node_ondisk(struct bch_fs *c, struct btree *b) sectors = vstruct_sectors(bne, c->block_bits); } - fprintf(stdout, " offset %u journal seq %llu\n", - offset, le64_to_cpu(i->journal_seq)); + fprintf(stdout, " offset %u version %u, journal seq %llu\n", + offset, + le16_to_cpu(i->version), + le64_to_cpu(i->journal_seq)); offset += sectors; - for (k = i->start; - k != vstruct_last(i); - k = bkey_next_skip_noops(k, vstruct_last(i))) { + for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) { struct bkey u; char buf[4096]; @@ -387,25 +387,6 @@ static void list_nodes_keys(struct bch_fs *c, enum btree_id btree_id, bch2_trans_exit(&trans); } -static struct bpos parse_pos(char *buf) -{ - char *s = buf, *field; - u64 inode_v = 0, offset_v = 0; - - if (!(field = strsep(&s, ":")) || - kstrtoull(field, 10, &inode_v)) - die("invalid bpos %s", buf); - - if ((field = strsep(&s, ":")) && - kstrtoull(field, 10, &offset_v)) - die("invalid bpos %s", buf); - - if (s) - die("invalid bpos %s", buf); - - return (struct bpos) { .inode = inode_v, .offset = offset_v }; -} - static void list_keys_usage(void) { puts("bcachefs list - list filesystem metadata to stdout\n" @@ -445,7 +426,7 @@ int cmd_list(int argc, char *argv[]) opt_set(opts, nochanges, true); opt_set(opts, norecovery, true); opt_set(opts, degraded, true); - opt_set(opts, errors, BCH_ON_ERROR_CONTINUE); + opt_set(opts, errors, BCH_ON_ERROR_continue); while ((opt = getopt(argc, argv, "b:s:e:i:m:fvh")) != -1) switch (opt) { @@ -455,10 +436,10 @@ int cmd_list(int argc, char *argv[]) btree_id_end = btree_id_start + 1; break; case 's': - start = parse_pos(optarg); + start = bpos_parse(optarg); break; case 'e': - end = parse_pos(optarg); + end = bpos_parse(optarg); break; case 'i': if (kstrtoull(optarg, 10, &inum)) @@ -538,7 +519,7 @@ int cmd_list_journal(int argc, char *argv[]) opt_set(opts, nochanges, true); opt_set(opts, norecovery, true); opt_set(opts, degraded, true); - opt_set(opts, errors, BCH_ON_ERROR_CONTINUE); + opt_set(opts, errors, BCH_ON_ERROR_continue); opt_set(opts, fix_errors, FSCK_OPT_YES); opt_set(opts, keep_journal, true); @@ -570,14 +551,10 @@ int cmd_list_journal(int argc, char *argv[]) printf("journal entry %8llu\n" " version %8u\n" " last seq %8llu\n" - " read clock %8u\n" - " write clock %8u\n" , le64_to_cpu(p->j.seq), - le32_to_cpu(p->j.seq), - le64_to_cpu(p->j.last_seq), - le16_to_cpu(p->j.read_clock), - le16_to_cpu(p->j.write_clock)); + le32_to_cpu(p->j.version), + le64_to_cpu(p->j.last_seq)); for_each_jset_key(k, _n, entry, &p->j) { char buf[200]; diff --git a/cmd_device.c b/cmd_device.c index c311324..f9e975a 100644 --- a/cmd_device.c +++ b/cmd_device.c @@ -12,7 +12,9 @@ #include #include +#include "libbcachefs/bcachefs.h" #include "libbcachefs/bcachefs_ioctl.h" +#include "libbcachefs/journal.h" #include "libbcachefs/super-io.h" #include "cmds.h" #include "libbcachefs.h" @@ -121,11 +123,9 @@ static void device_remove_usage(void) { puts("bcachefs device_remove - remove a device from a filesystem\n" "Usage:\n" - " bcachefs device remove device\n" - " bcachefs device remove --by-id path devid\n" + " bcachefs device remove | \n" 
"\n" "Options:\n" - " -i, --by-id Remove device by device id\n" " -f, --force Force removal, even if some data\n" " couldn't be migrated\n" " -F, --force-metadata Force removal, even if some metadata\n" @@ -146,14 +146,10 @@ int cmd_device_remove(int argc, char *argv[]) }; struct bchfs_handle fs; bool by_id = false; - int opt, flags = BCH_FORCE_IF_DEGRADED; - unsigned dev_idx; + int opt, flags = BCH_FORCE_IF_DEGRADED, dev_idx; while ((opt = getopt_long(argc, argv, "fh", longopts, NULL)) != -1) switch (opt) { - case 'i': - by_id = true; - break; case 'f': flags |= BCH_FORCE_IF_DATA_LOST; break; @@ -165,27 +161,31 @@ int cmd_device_remove(int argc, char *argv[]) } args_shift(optind); - if (by_id) { - char *path = arg_pop(); - if (!path) - die("Please supply filesystem to remove device from"); + char *dev_str = arg_pop(); + if (!dev_str) + die("Please supply a device"); - dev_idx = (intptr_t) arg_pop(); - if (!dev_idx) - die("Please supply device id"); + char *end; + dev_idx = strtoul(dev_str, &end, 10); + if (*dev_str && !*end) + by_id = true; - fs = bcache_fs_open(path); + char *fs_path = arg_pop(); + if (fs_path) { + fs = bcache_fs_open(fs_path); + + if (!by_id) { + dev_idx = bchu_dev_path_to_idx(fs, dev_str); + if (dev_idx < 0) + die("%s does not seem to be a member of %s", + dev_str, fs_path); + } + } else if (!by_id) { + fs = bchu_fs_open_by_dev(dev_str, &dev_idx); } else { - char *dev = arg_pop(); - if (!dev) - die("Please supply a device to remove"); - - fs = bchu_fs_open_by_dev(dev, &dev_idx); + die("Filesystem path required when specifying device by id"); } - if (argc) - die("too many arguments"); - bchu_disk_remove(fs, dev_idx, flags); return 0; } @@ -220,7 +220,7 @@ int cmd_device_online(int argc, char *argv[]) if (argc) die("too many arguments"); - unsigned dev_idx; + int dev_idx; struct bchfs_handle fs = bchu_fs_open_by_dev(dev, &dev_idx); bchu_disk_online(fs, dev); return 0; @@ -265,7 +265,7 @@ int cmd_device_offline(int argc, char *argv[]) if (argc) die("too many arguments"); - unsigned dev_idx; + int dev_idx; struct bchfs_handle fs = bchu_fs_open_by_dev(dev, &dev_idx); bchu_disk_offline(fs, dev_idx, flags); return 0; @@ -301,20 +301,22 @@ int cmd_device_evacuate(int argc, char *argv[]) if (argc) die("too many arguments"); - unsigned dev_idx; + int dev_idx; struct bchfs_handle fs = bchu_fs_open_by_dev(dev_path, &dev_idx); struct bch_ioctl_dev_usage u = bchu_dev_usage(fs, dev_idx); - if (u.state == BCH_MEMBER_STATE_RW) { + if (u.state == BCH_MEMBER_STATE_rw) { printf("Setting %s readonly\n", dev_path); - bchu_disk_set_state(fs, dev_idx, BCH_MEMBER_STATE_RO, 0); + bchu_disk_set_state(fs, dev_idx, BCH_MEMBER_STATE_ro, 0); } return bchu_data(fs, (struct bch_ioctl_data) { .op = BCH_DATA_OP_MIGRATE, - .start = POS_MIN, - .end = POS_MAX, + .start_btree = 0, + .start_pos = POS_MIN, + .end_btree = BTREE_ID_NR, + .end_pos = POS_MAX, .migrate.dev = dev_idx, }); } @@ -322,7 +324,10 @@ int cmd_device_evacuate(int argc, char *argv[]) static void device_set_state_usage(void) { puts("bcachefs device set-state\n" - "Usage: bcachefs device set-state device new-state\n" + "Usage: bcachefs device set-state | \n" + "\n" + ": one of rw, ro, failed or spare\n" + ": path to mounted filesystem, optional unless specifying device by id\n" "\n" "Options:\n" " -f, --force Force, if data redundancy will be degraded\n" @@ -340,7 +345,9 @@ int cmd_device_set_state(int argc, char *argv[]) { "help", 0, NULL, 'h' }, { NULL } }; - int opt, flags = 0; + struct bchfs_handle fs; + bool by_id = false; + int opt, 
flags = 0, dev_idx; bool offline = false; while ((opt = getopt_long(argc, argv, "foh", longopts, NULL)) != -1) @@ -356,31 +363,32 @@ int cmd_device_set_state(int argc, char *argv[]) } args_shift(optind); - char *dev_path = arg_pop(); - if (!dev_path) - die("Please supply a device"); - char *new_state_str = arg_pop(); if (!new_state_str) die("Please supply a device state"); unsigned new_state = read_string_list_or_die(new_state_str, - bch2_dev_state, "device state"); + bch2_member_states, "device state"); - if (!offline) { - unsigned dev_idx; - struct bchfs_handle fs = bchu_fs_open_by_dev(dev_path, &dev_idx); + char *dev_str = arg_pop(); + if (!dev_str) + die("Please supply a device"); - bchu_disk_set_state(fs, dev_idx, new_state, flags); + char *end; + dev_idx = strtoul(dev_str, &end, 10); + if (*dev_str && !*end) + by_id = true; - bcache_fs_close(fs); - } else { + if (offline) { struct bch_opts opts = bch2_opts_empty(); struct bch_sb_handle sb = { NULL }; - int ret = bch2_read_super(dev_path, &opts, &sb); + if (by_id) + die("Cannot specify offline device by id"); + + int ret = bch2_read_super(dev_str, &opts, &sb); if (ret) - die("error opening %s: %s", dev_path, strerror(-ret)); + die("error opening %s: %s", dev_str, strerror(-ret)); struct bch_member *m = bch2_sb_get_members(sb.sb)->members + sb.sb->dev_idx; @@ -390,8 +398,27 @@ int cmd_device_set_state(int argc, char *argv[]) bch2_super_write(sb.bdev->bd_fd, sb.sb); bch2_free_super(&sb); + return 0; + } + + char *fs_path = arg_pop(); + if (fs_path) { + fs = bcache_fs_open(fs_path); + + if (!by_id) { + dev_idx = bchu_dev_path_to_idx(fs, dev_str); + if (dev_idx < 0) + die("%s does not seem to be a member of %s", + dev_str, fs_path); + } + } else if (!by_id) { + fs = bchu_fs_open_by_dev(dev_str, &dev_idx); + } else { + die("Filesystem path required when specifying device by id"); } + bchu_disk_set_state(fs, dev_idx, new_state, flags); + return 0; } @@ -496,3 +523,103 @@ int cmd_device_resize(int argc, char *argv[]) } return 0; } + +static void device_resize_journal_usage(void) +{ + puts("bcachefs device resize-journal \n" + "Usage: bcachefs device resize-journal device [ size ]\n" + "\n" + "Options:\n" + " -h, --help display this help and exit\n" + "Report bugs to "); + exit(EXIT_SUCCESS); +} + +int cmd_device_resize_journal(int argc, char *argv[]) +{ + static const struct option longopts[] = { + { "help", 0, NULL, 'h' }, + { NULL } + }; + u64 size; + int opt; + + while ((opt = getopt_long(argc, argv, "h", longopts, NULL)) != -1) + switch (opt) { + case 'h': + device_resize_journal_usage(); + } + args_shift(optind); + + char *dev = arg_pop(); + if (!dev) + die("Please supply a device"); + + int dev_fd = xopen(dev, O_RDONLY); + + char *size_arg = arg_pop(); + if (!size_arg) + size = get_size(dev, dev_fd); + else if (bch2_strtoull_h(size_arg, &size)) + die("invalid size"); + + size >>= 9; + + if (argc) + die("Too many arguments"); + + struct stat dev_stat = xfstat(dev_fd); + + struct mntent *mount = dev_to_mount(dev); + if (mount) { + if (!S_ISBLK(dev_stat.st_mode)) + die("%s is mounted but isn't a block device?!", dev); + + struct bchfs_handle fs = bcache_fs_open(mount->mnt_dir); + + unsigned idx = bchu_disk_get_idx(fs, dev_stat.st_rdev); + + struct bch_sb *sb = bchu_read_super(fs, -1); + if (idx >= sb->nr_devices) + die("error reading superblock: dev idx >= sb->nr_devices"); + + struct bch_sb_field_members *mi = bch2_sb_get_members(sb); + if (!mi) + die("error reading superblock: no member info"); + + /* could also just read this out of 
sysfs... meh */ + struct bch_member *m = mi->members + idx; + + u64 nbuckets = size / le16_to_cpu(m->bucket_size); + + printf("resizing journal on %s to %llu buckets\n", dev, nbuckets); + bchu_disk_resize_journal(fs, idx, nbuckets); + } else { + printf("%s is offline - starting:\n", dev); + + struct bch_fs *c = bch2_fs_open(&dev, 1, bch2_opts_empty()); + if (IS_ERR(c)) + die("error opening %s: %s", dev, strerror(-PTR_ERR(c))); + + struct bch_dev *ca, *resize = NULL; + unsigned i; + + for_each_online_member(ca, c, i) { + if (resize) + die("confused: more than one online device?"); + resize = ca; + percpu_ref_get(&resize->io_ref); + } + + u64 nbuckets = size / le16_to_cpu(resize->mi.bucket_size); + + printf("resizing journal on %s to %llu buckets\n", dev, nbuckets); + int ret = bch2_set_nr_journal_buckets(c, resize, nbuckets); + if (ret) + fprintf(stderr, "resize error: %s\n", strerror(-ret)); + + percpu_ref_put(&resize->io_ref); + bch2_fs_stop(c); + } + return 0; +} diff --git a/cmd_format.c b/cmd_format.c index 673c63a..b88ffe9 100644 --- a/cmd_format.c +++ b/cmd_format.c @@ -36,11 +36,14 @@ x(0, no_passphrase, no_argument) \ x('L', label, required_argument) \ x('U', uuid, required_argument) \ x(0, fs_size, required_argument) \ +x(0, superblock_size, required_argument) \ x(0, bucket_size, required_argument) \ x('g', group, required_argument) \ x(0, discard, no_argument) \ x(0, data_allowed, required_argument) \ x(0, durability, required_argument) \ +x(0, version, required_argument) \ +x(0, no_initialize, no_argument) \ x('f', force, no_argument) \ x('q', quiet, no_argument) \ x('h', help, no_argument) @@ -60,6 +63,7 @@ static void usage(void) " --no_passphrase Don't encrypt master encryption key\n" " -L, --label=label\n" " -U, --uuid=uuid\n" + " --superblock_size=size\n" "\n" "Device specific options:"); @@ -112,7 +116,7 @@ int cmd_format(int argc, char *argv[]) darray(char *) device_paths; struct format_opts opts = format_opts_default(); struct dev_opts dev_opts = dev_opts_default(), *dev; - bool force = false, no_passphrase = false, quiet = false; + bool force = false, no_passphrase = false, quiet = false, initialize = true; unsigned v; int opt; @@ -162,6 +166,12 @@ int cmd_format(int argc, char *argv[]) dev_opts.size >>= 9; break; + case O_superblock_size: + if (bch2_strtouint_h(optarg, &opts.superblock_size)) + die("invalid filesystem size"); + + opts.superblock_size >>= 9; + break; case O_bucket_size: dev_opts.bucket_size = hatoi_validate(optarg, "bucket size"); @@ -183,6 +193,13 @@ int cmd_format(int argc, char *argv[]) dev_opts.durability > BCH_REPLICAS_MAX) die("invalid durability"); break; + case O_version: + if (kstrtouint(optarg, 10, &opts.version)) + die("invalid version"); + break; + case O_no_initialize: + initialize = false; + break; case O_no_opt: darray_append(device_paths, optarg); dev_opts.path = optarg; @@ -206,8 +223,10 @@ int cmd_format(int argc, char *argv[]) if (darray_empty(devices)) die("Please supply a device"); - if (opts.encrypted && !no_passphrase) + if (opts.encrypted && !no_passphrase) { opts.passphrase = read_passphrase_twice("Enter passphrase: "); + initialize = false; + } darray_foreach(dev, devices) dev->fd = open_for_format(dev->path, force); @@ -229,7 +248,7 @@ int cmd_format(int argc, char *argv[]) darray_free(devices); - if (!opts.passphrase) { + if (initialize) { /* * Start the filesystem once, to allocate the journal and create * the root directory: diff --git a/cmd_fs.c b/cmd_fs.c index f0b67b6..8b9d91b 100644 --- a/cmd_fs.c +++ b/cmd_fs.c @@ 
-22,7 +22,7 @@ static void print_dev_usage_type(const char *type, u64 frag = max((s64) buckets * bucket_size - (s64) sectors, 0LL); printf_pad(20, " %s:", type); - printf("%12s%12llu%12s\n", + printf(" %15s %15llu %15s\n", pr_units(sectors, units), buckets, pr_units(frag, units)); @@ -37,18 +37,17 @@ static void print_dev_usage(struct bchfs_handle fs, printf("\n"); printf_pad(20, "%s (device %u):", d->label ?: "(no label)", d->idx); - printf("%24s%12s\n", d->dev ?: "(device not found)", bch2_dev_state[u.state]); + printf("%30s%16s\n", d->dev ?: "(device not found)", bch2_member_states[u.state]); - printf("%-20s%12s%12s%12s\n", + printf("%-20s%16s%16s%16s\n", "", "data", "buckets", "fragmented"); - for (i = BCH_DATA_sb; i < BCH_DATA_NR; i++) { + for (i = BCH_DATA_sb; i < BCH_DATA_NR; i++) print_dev_usage_type(bch2_data_types[i], u.bucket_size, u.buckets[i], u.sectors[i], units); - } print_dev_usage_type("erasure coded", u.bucket_size, @@ -57,12 +56,12 @@ static void print_dev_usage(struct bchfs_handle fs, units); printf_pad(20, " available:"); - printf("%12s%12llu\n", + printf(" %15s %15llu\n", pr_units(u.available_buckets * u.bucket_size, units), u.available_buckets); printf_pad(20, " capacity:"); - printf("%12s%12llu\n", + printf(" %15s %15llu\n", pr_units(u.nr_buckets * u.bucket_size, units), u.nr_buckets); } diff --git a/cmd_fsck.c b/cmd_fsck.c index 9ef69ad..247e207 100644 --- a/cmd_fsck.c +++ b/cmd_fsck.c @@ -93,10 +93,14 @@ int cmd_fsck(int argc, char *argv[]) exit(8); } - if (test_bit(BCH_FS_ERRORS_FIXED, &c->flags)) + if (test_bit(BCH_FS_ERRORS_FIXED, &c->flags)) { + fprintf(stderr, "%s: errors fixed\n", c->name); ret |= 1; - if (test_bit(BCH_FS_ERROR, &c->flags)) + } + if (test_bit(BCH_FS_ERROR, &c->flags)) { + fprintf(stderr, "%s: still has errors\n", c->name); ret |= 4; + } bch2_fs_stop(c); return ret; diff --git a/cmd_migrate.c b/cmd_migrate.c index 797c51e..a0d2742 100644 --- a/cmd_migrate.c +++ b/cmd_migrate.c @@ -122,8 +122,8 @@ static void update_inode(struct bch_fs *c, struct bkey_inode_buf packed; int ret; - bch2_inode_pack(&packed, inode); - ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, + bch2_inode_pack(c, &packed, inode); + ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i, NULL, NULL, 0); if (ret) die("error updating inode: %s", strerror(-ret)); @@ -301,7 +301,7 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst, while (length) { struct bkey_i_extent *e; - BKEY_PADDED(k) k; + __BKEY_PADDED(k, BKEY_EXTENT_VAL_U64s_MAX) k; u64 b = sector_to_bucket(ca, physical); struct disk_reservation res; unsigned sectors; @@ -329,7 +329,7 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst, bch2_mark_bkey_replicas(c, extent_i_to_s_c(e).s_c); - ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &e->k_i, + ret = bch2_btree_insert(c, BTREE_ID_extents, &e->k_i, &res, NULL, 0); if (ret) die("btree insert error %s", strerror(-ret)); @@ -599,7 +599,9 @@ static void copy_fs(struct bch_fs *c, int src_fd, const char *src_path, bch2_alloc_write(c, false); } -static void find_superblock_space(ranges extents, struct dev_opts *dev) +static void find_superblock_space(ranges extents, + struct format_opts opts, + struct dev_opts *dev) { struct range *i; @@ -609,9 +611,10 @@ static void find_superblock_space(ranges extents, struct dev_opts *dev) u64 end = round_down(i->end, dev->bucket_size << 9); - if (start + (128 << 10) <= end) { + /* Need space for two superblocks: */ + if (start + (opts.superblock_size << 9) * 2 <= end) { 
dev->sb_offset = start >> 9; - dev->sb_end = dev->sb_offset + 256; + dev->sb_end = dev->sb_offset + opts.superblock_size * 2; return; } } @@ -673,7 +676,7 @@ static int migrate_fs(const char *fs_path, get_size(dev.path, dev.fd) / 5, &bcachefs_inum, stat.st_dev, force); - find_superblock_space(extents, &dev); + find_superblock_space(extents, format_opts, &dev); struct bch_sb *sb = bch2_format(fs_opt_strs, fs_opts,format_opts, &dev, 1); diff --git a/cmds.h b/cmds.h index bcd27ad..cc49084 100644 --- a/cmds.h +++ b/cmds.h @@ -28,8 +28,10 @@ int cmd_device_offline(int argc, char *argv[]); int cmd_device_evacuate(int argc, char *argv[]); int cmd_device_set_state(int argc, char *argv[]); int cmd_device_resize(int argc, char *argv[]); +int cmd_device_resize_journal(int argc, char *argv[]); int cmd_data_rereplicate(int argc, char *argv[]); +int cmd_data_job(int argc, char *argv[]); int cmd_unlock(int argc, char *argv[]); int cmd_set_passphrase(int argc, char *argv[]); diff --git a/debian/changelog b/debian/changelog index 684da48..3bd5898 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,12 @@ +bcachefs-tools (0.1+git20210404.ce906d66-1) UNRELEASED; urgency=medium + + * New upstream snapshot + * Update standards version to 4.5.1 + + Currently unreleased due to test failures. + + -- Jonathan Carter Tue, 06 Apr 2021 15:11:27 +0200 + bcachefs-tools (0.1+git20201025.742dbbdb-1) unstable; urgency=medium * New upstream snapshot diff --git a/debian/control b/debian/control index 81ffb46..caf1b0d 100644 --- a/debian/control +++ b/debian/control @@ -2,7 +2,7 @@ Source: bcachefs-tools Maintainer: Jonathan Carter Section: utils Priority: optional -Standards-Version: 4.5.0 +Standards-Version: 4.5.1 Rules-Requires-Root: no Build-Depends: debhelper-compat (= 13), pkg-config, diff --git a/debian/files b/debian/files index ba38766..d1acbd3 100644 --- a/debian/files +++ b/debian/files @@ -1 +1 @@ -bcachefs-tools_0.1+git20201025.742dbbdb-1_source.buildinfo utils optional +bcachefs-tools_0.1+git20210404.ce906d66-1_source.buildinfo utils optional diff --git a/include/linux/bitops.h b/include/linux/bitops.h index f2183d5..2fe736e 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -85,6 +85,17 @@ static inline bool test_and_set_bit(long nr, volatile unsigned long *addr) return (old & mask) != 0; } +static inline bool test_and_clear_bit(long nr, volatile unsigned long *addr) +{ + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *) addr) + BIT_WORD(nr); + unsigned long old; + + old = __atomic_fetch_and(p, ~mask, __ATOMIC_RELAXED); + + return (old & mask) != 0; +} + static inline void clear_bit_unlock(long nr, volatile unsigned long *addr) { unsigned long mask = BIT_MASK(nr); diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 024d645..bfab7ea 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -10,6 +10,8 @@ #define cpu_present(cpu) ((cpu) == 0) #define cpu_active(cpu) ((cpu) == 0) +#define raw_smp_processor_id() 0U + #define for_each_cpu(cpu, mask) \ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) #define for_each_cpu_not(cpu, mask) \ diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h index 3a91130..f09689d 100644 --- a/include/linux/generic-radix-tree.h +++ b/include/linux/generic-radix-tree.h @@ -183,6 +183,14 @@ void *__genradix_iter_peek(struct genradix_iter *, struct __genradix *, size_t); static inline void __genradix_iter_advance(struct genradix_iter *iter, size_t obj_size) { + size_t 
new_offset = iter->offset + obj_size; + + if (new_offset < iter->offset) { + iter->offset = SIZE_MAX; + iter->pos = SIZE_MAX; + return; + } + iter->offset += obj_size; if (!is_power_of_2(obj_size) && diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 10d94c5..4b45306 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -219,4 +219,6 @@ struct qstr { #define POISON_FREE 0x6b +static inline void dump_stack(void) {} + #endif diff --git a/include/linux/list.h b/include/linux/list.h index 4a31709..3639dc9 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -26,7 +26,6 @@ #define list_for_each_entry(p, h, m) cds_list_for_each_entry(p, h, m) #define list_for_each_entry_reverse(p, h, m) cds_list_for_each_entry_reverse(p, h, m) #define list_for_each_entry_safe(p, n, h, m) cds_list_for_each_entry_safe(p, n, h, m) -#define list_for_each_entry_safe_reverse(p, n, h, m) cds_list_for_each_entry_safe_reverse(p, n, h, m) static inline int list_empty_careful(const struct list_head *head) { @@ -54,6 +53,15 @@ static inline void list_splice_init(struct list_head *list, #define list_first_entry_or_null(ptr, type, member) \ (!list_empty(ptr) ? list_first_entry(ptr, type, member) : NULL) +#define list_prev_entry(pos, member) \ + list_entry((pos)->member.prev, typeof(*(pos)), member) + +#define list_for_each_entry_safe_reverse(pos, n, head, member) \ + for (pos = list_last_entry(head, typeof(*pos), member), \ + n = list_prev_entry(pos, member); \ + &pos->member != (head); \ + pos = n, n = list_prev_entry(n, member)) + /* hlists: */ #include diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h new file mode 100644 index 0000000..fa6e847 --- /dev/null +++ b/include/linux/list_nulls.h @@ -0,0 +1,145 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_LIST_NULLS_H +#define _LINUX_LIST_NULLS_H + +#include +#include + +/* + * Special version of lists, where end of list is not a NULL pointer, + * but a 'nulls' marker, which can have many different values. + * (up to 2^31 different values guaranteed on all platforms) + * + * In the standard hlist, termination of a list is the NULL pointer. + * In this special 'nulls' variant, we use the fact that objects stored in + * a list are aligned on a word (4 or 8 bytes alignment). + * We therefore use the last significant bit of 'ptr' : + * Set to 1 : This is a 'nulls' end-of-list marker (ptr >> 1) + * Set to 0 : This is a pointer to some object (ptr) + */ + +struct hlist_nulls_head { + struct hlist_nulls_node *first; +}; + +struct hlist_nulls_node { + struct hlist_nulls_node *next, **pprev; +}; +#define NULLS_MARKER(value) (1UL | (((long)value) << 1)) +#define INIT_HLIST_NULLS_HEAD(ptr, nulls) \ + ((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls)) + +#define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member) + +#define hlist_nulls_entry_safe(ptr, type, member) \ + ({ typeof(ptr) ____ptr = (ptr); \ + !is_a_nulls(____ptr) ? 
hlist_nulls_entry(____ptr, type, member) : NULL; \ + }) +/** + * ptr_is_a_nulls - Test if a ptr is a nulls + * @ptr: ptr to be tested + * + */ +static inline int is_a_nulls(const struct hlist_nulls_node *ptr) +{ + return ((unsigned long)ptr & 1); +} + +/** + * get_nulls_value - Get the 'nulls' value of the end of chain + * @ptr: end of chain + * + * Should be called only if is_a_nulls(ptr); + */ +static inline unsigned long get_nulls_value(const struct hlist_nulls_node *ptr) +{ + return ((unsigned long)ptr) >> 1; +} + +/** + * hlist_nulls_unhashed - Has node been removed and reinitialized? + * @h: Node to be checked + * + * Not that not all removal functions will leave a node in unhashed state. + * For example, hlist_del_init_rcu() leaves the node in unhashed state, + * but hlist_nulls_del() does not. + */ +static inline int hlist_nulls_unhashed(const struct hlist_nulls_node *h) +{ + return !h->pprev; +} + +/** + * hlist_nulls_unhashed_lockless - Has node been removed and reinitialized? + * @h: Node to be checked + * + * Not that not all removal functions will leave a node in unhashed state. + * For example, hlist_del_init_rcu() leaves the node in unhashed state, + * but hlist_nulls_del() does not. Unlike hlist_nulls_unhashed(), this + * function may be used locklessly. + */ +static inline int hlist_nulls_unhashed_lockless(const struct hlist_nulls_node *h) +{ + return !READ_ONCE(h->pprev); +} + +static inline int hlist_nulls_empty(const struct hlist_nulls_head *h) +{ + return is_a_nulls(READ_ONCE(h->first)); +} + +static inline void hlist_nulls_add_head(struct hlist_nulls_node *n, + struct hlist_nulls_head *h) +{ + struct hlist_nulls_node *first = h->first; + + n->next = first; + WRITE_ONCE(n->pprev, &h->first); + h->first = n; + if (!is_a_nulls(first)) + WRITE_ONCE(first->pprev, &n->next); +} + +static inline void __hlist_nulls_del(struct hlist_nulls_node *n) +{ + struct hlist_nulls_node *next = n->next; + struct hlist_nulls_node **pprev = n->pprev; + + WRITE_ONCE(*pprev, next); + if (!is_a_nulls(next)) + WRITE_ONCE(next->pprev, pprev); +} + +static inline void hlist_nulls_del(struct hlist_nulls_node *n) +{ + __hlist_nulls_del(n); + WRITE_ONCE(n->pprev, LIST_POISON2); +} + +/** + * hlist_nulls_for_each_entry - iterate over list of given type + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct hlist_node to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the hlist_node within the struct. + * + */ +#define hlist_nulls_for_each_entry(tpos, pos, head, member) \ + for (pos = (head)->first; \ + (!is_a_nulls(pos)) && \ + ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \ + pos = pos->next) + +/** + * hlist_nulls_for_each_entry_from - iterate over a hlist continuing from current point + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct hlist_node to use as a loop cursor. + * @member: the name of the hlist_node within the struct. 
+ * + */ +#define hlist_nulls_for_each_entry_from(tpos, pos, member) \ + for (; (!is_a_nulls(pos)) && \ + ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \ + pos = pos->next) + +#endif diff --git a/include/linux/overflow.h b/include/linux/overflow.h new file mode 100644 index 0000000..ef74051 --- /dev/null +++ b/include/linux/overflow.h @@ -0,0 +1,346 @@ +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ +#ifndef __LINUX_OVERFLOW_H +#define __LINUX_OVERFLOW_H + +#include +#include + +/* + * In the fallback code below, we need to compute the minimum and + * maximum values representable in a given type. These macros may also + * be useful elsewhere, so we provide them outside the + * COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW block. + * + * It would seem more obvious to do something like + * + * #define type_min(T) (T)(is_signed_type(T) ? (T)1 << (8*sizeof(T)-1) : 0) + * #define type_max(T) (T)(is_signed_type(T) ? ((T)1 << (8*sizeof(T)-1)) - 1 : ~(T)0) + * + * Unfortunately, the middle expressions, strictly speaking, have + * undefined behaviour, and at least some versions of gcc warn about + * the type_max expression (but not if -fsanitize=undefined is in + * effect; in that case, the warning is deferred to runtime...). + * + * The slightly excessive casting in type_min is to make sure the + * macros also produce sensible values for the exotic type _Bool. [The + * overflow checkers only almost work for _Bool, but that's + * a-feature-not-a-bug, since people shouldn't be doing arithmetic on + * _Bools. Besides, the gcc builtins don't allow _Bool* as third + * argument.] + * + * Idea stolen from + * https://mail-index.netbsd.org/tech-misc/2007/02/05/0000.html - + * credit to Christian Biere. + */ +#define is_signed_type(type) (((type)(-1)) < (type)1) +#define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type))) +#define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T))) +#define type_min(T) ((T)((T)-type_max(T)-(T)1)) + +/* + * Avoids triggering -Wtype-limits compilation warning, + * while using unsigned data types to check a < 0. + */ +#define is_non_negative(a) ((a) > 0 || (a) == 0) +#define is_negative(a) (!(is_non_negative(a))) + +/* + * Allows for effectively applying __must_check to a macro so we can have + * both the type-agnostic benefits of the macros while also being able to + * enforce that the return value is, in fact, checked. + */ +static inline bool __must_check __must_check_overflow(bool overflow) +{ + return unlikely(overflow); +} + +#ifdef COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW +/* + * For simplicity and code hygiene, the fallback code below insists on + * a, b and *d having the same type (similar to the min() and max() + * macros), whereas gcc's type-generic overflow checkers accept + * different types. Hence we don't just make check_add_overflow an + * alias for __builtin_add_overflow, but add type checks similar to + * below. 
+ */ +#define check_add_overflow(a, b, d) __must_check_overflow(({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + __builtin_add_overflow(__a, __b, __d); \ +})) + +#define check_sub_overflow(a, b, d) __must_check_overflow(({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + __builtin_sub_overflow(__a, __b, __d); \ +})) + +#define check_mul_overflow(a, b, d) __must_check_overflow(({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + __builtin_mul_overflow(__a, __b, __d); \ +})) + +#else + + +/* Checking for unsigned overflow is relatively easy without causing UB. */ +#define __unsigned_add_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = __a + __b; \ + *__d < __a; \ +}) +#define __unsigned_sub_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = __a - __b; \ + __a < __b; \ +}) +/* + * If one of a or b is a compile-time constant, this avoids a division. + */ +#define __unsigned_mul_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = __a * __b; \ + __builtin_constant_p(__b) ? \ + __b > 0 && __a > type_max(typeof(__a)) / __b : \ + __a > 0 && __b > type_max(typeof(__b)) / __a; \ +}) + +/* + * For signed types, detecting overflow is much harder, especially if + * we want to avoid UB. But the interface of these macros is such that + * we must provide a result in *d, and in fact we must produce the + * result promised by gcc's builtins, which is simply the possibly + * wrapped-around value. Fortunately, we can just formally do the + * operations in the widest relevant unsigned type (u64) and then + * truncate the result - gcc is smart enough to generate the same code + * with and without the (u64) casts. + */ + +/* + * Adding two signed integers can overflow only if they have the same + * sign, and overflow has happened iff the result has the opposite + * sign. + */ +#define __signed_add_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = (u64)__a + (u64)__b; \ + (((~(__a ^ __b)) & (*__d ^ __a)) \ + & type_min(typeof(__a))) != 0; \ +}) + +/* + * Subtraction is similar, except that overflow can now happen only + * when the signs are opposite. In this case, overflow has happened if + * the result has the opposite sign of a. + */ +#define __signed_sub_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = (u64)__a - (u64)__b; \ + ((((__a ^ __b)) & (*__d ^ __a)) \ + & type_min(typeof(__a))) != 0; \ +}) + +/* + * Signed multiplication is rather hard. gcc always follows C99, so + * division is truncated towards 0. 
This means that we can write the + * overflow check like this: + * + * (a > 0 && (b > MAX/a || b < MIN/a)) || + * (a < -1 && (b > MIN/a || b < MAX/a) || + * (a == -1 && b == MIN) + * + * The redundant casts of -1 are to silence an annoying -Wtype-limits + * (included in -Wextra) warning: When the type is u8 or u16, the + * __b_c_e in check_mul_overflow obviously selects + * __unsigned_mul_overflow, but unfortunately gcc still parses this + * code and warns about the limited range of __b. + */ + +#define __signed_mul_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + typeof(a) __tmax = type_max(typeof(a)); \ + typeof(a) __tmin = type_min(typeof(a)); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = (u64)__a * (u64)__b; \ + (__b > 0 && (__a > __tmax/__b || __a < __tmin/__b)) || \ + (__b < (typeof(__b))-1 && (__a > __tmin/__b || __a < __tmax/__b)) || \ + (__b == (typeof(__b))-1 && __a == __tmin); \ +}) + + +#define check_add_overflow(a, b, d) __must_check_overflow( \ + __builtin_choose_expr(is_signed_type(typeof(a)), \ + __signed_add_overflow(a, b, d), \ + __unsigned_add_overflow(a, b, d))) + +#define check_sub_overflow(a, b, d) __must_check_overflow( \ + __builtin_choose_expr(is_signed_type(typeof(a)), \ + __signed_sub_overflow(a, b, d), \ + __unsigned_sub_overflow(a, b, d))) + +#define check_mul_overflow(a, b, d) __must_check_overflow( \ + __builtin_choose_expr(is_signed_type(typeof(a)), \ + __signed_mul_overflow(a, b, d), \ + __unsigned_mul_overflow(a, b, d))) + +#endif /* COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW */ + +/** check_shl_overflow() - Calculate a left-shifted value and check overflow + * + * @a: Value to be shifted + * @s: How many bits left to shift + * @d: Pointer to where to store the result + * + * Computes *@d = (@a << @s) + * + * Returns true if '*d' cannot hold the result or when 'a << s' doesn't + * make sense. Example conditions: + * - 'a << s' causes bits to be lost when stored in *d. + * - 's' is garbage (e.g. negative) or so large that the result of + * 'a << s' is guaranteed to be 0. + * - 'a' is negative. + * - 'a << s' sets the sign bit, if any, in '*d'. + * + * '*d' will hold the results of the attempted shift, but is not + * considered "safe for use" if false is returned. + */ +#define check_shl_overflow(a, s, d) __must_check_overflow(({ \ + typeof(a) _a = a; \ + typeof(s) _s = s; \ + typeof(d) _d = d; \ + u64 _a_full = _a; \ + unsigned int _to_shift = \ + is_non_negative(_s) && _s < 8 * sizeof(*d) ? _s : 0; \ + *_d = (_a_full << _to_shift); \ + (_to_shift != _s || is_negative(*_d) || is_negative(_a) || \ + (*_d >> _to_shift) != _a); \ +})) + +/** + * array_size() - Calculate size of 2-dimensional array. + * + * @a: dimension one + * @b: dimension two + * + * Calculates size of 2-dimensional array: @a * @b. + * + * Returns: number of bytes needed to represent the array or SIZE_MAX on + * overflow. + */ +static inline __must_check size_t array_size(size_t a, size_t b) +{ + size_t bytes; + + if (check_mul_overflow(a, b, &bytes)) + return SIZE_MAX; + + return bytes; +} + +/** + * array3_size() - Calculate size of 3-dimensional array. + * + * @a: dimension one + * @b: dimension two + * @c: dimension three + * + * Calculates size of 3-dimensional array: @a * @b * @c. + * + * Returns: number of bytes needed to represent the array or SIZE_MAX on + * overflow. 
+ */ +static inline __must_check size_t array3_size(size_t a, size_t b, size_t c) +{ + size_t bytes; + + if (check_mul_overflow(a, b, &bytes)) + return SIZE_MAX; + if (check_mul_overflow(bytes, c, &bytes)) + return SIZE_MAX; + + return bytes; +} + +/* + * Compute a*b+c, returning SIZE_MAX on overflow. Internal helper for + * struct_size() below. + */ +static inline __must_check size_t __ab_c_size(size_t a, size_t b, size_t c) +{ + size_t bytes; + + if (check_mul_overflow(a, b, &bytes)) + return SIZE_MAX; + if (check_add_overflow(bytes, c, &bytes)) + return SIZE_MAX; + + return bytes; +} + +/** + * struct_size() - Calculate size of structure with trailing array. + * @p: Pointer to the structure. + * @member: Name of the array member. + * @count: Number of elements in the array. + * + * Calculates size of memory needed for structure @p followed by an + * array of @count number of @member elements. + * + * Return: number of bytes needed or SIZE_MAX on overflow. + */ +#define struct_size(p, member, count) \ + __ab_c_size(count, \ + sizeof(*(p)->member) + __must_be_array((p)->member),\ + sizeof(*(p))) + +/** + * flex_array_size() - Calculate size of a flexible array member + * within an enclosing structure. + * + * @p: Pointer to the structure. + * @member: Name of the flexible array member. + * @count: Number of elements in the array. + * + * Calculates size of a flexible array of @count number of @member + * elements, at the end of structure @p. + * + * Return: number of bytes needed or SIZE_MAX on overflow. + */ +#define flex_array_size(p, member, count) \ + array_size(count, \ + sizeof(*(p)->member) + __must_be_array((p)->member)) + +#endif /* __LINUX_OVERFLOW_H */ diff --git a/include/linux/page.h b/include/linux/page.h index 87be064..310b3ed 100644 --- a/include/linux/page.h +++ b/include/linux/page.h @@ -21,6 +21,8 @@ struct page; #define kmap_atomic(page) page_address(page) #define kunmap_atomic(addr) do {} while (0) +#define PageHighMem(page) false + static const char zero_page[PAGE_SIZE]; #define ZERO_PAGE(o) ((struct page *) &zero_page[0]) diff --git a/include/linux/poison.h b/include/linux/poison.h new file mode 100644 index 0000000..dc8ae5d --- /dev/null +++ b/include/linux/poison.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_POISON_H +#define _LINUX_POISON_H + +/********** include/linux/list.h **********/ + +/* + * Architectures might want to move the poison pointer offset + * into some well-recognized area such as 0xdead000000000000, + * that is also not mappable by user-space exploits: + */ +#ifdef CONFIG_ILLEGAL_POINTER_VALUE +# define POISON_POINTER_DELTA _AC(CONFIG_ILLEGAL_POINTER_VALUE, UL) +#else +# define POISON_POINTER_DELTA 0 +#endif + +/* + * These are non-NULL pointers that will result in page faults + * under normal circumstances, used to verify that nobody uses + * non-initialized list entries. + */ +#define LIST_POISON1 ((void *) 0x100 + POISON_POINTER_DELTA) +#define LIST_POISON2 ((void *) 0x122 + POISON_POINTER_DELTA) + +/********** include/linux/timer.h **********/ +#define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA) + +/********** mm/page_poison.c **********/ +#ifdef CONFIG_PAGE_POISONING_ZERO +#define PAGE_POISON 0x00 +#else +#define PAGE_POISON 0xaa +#endif + +/********** mm/page_alloc.c ************/ + +#define TAIL_MAPPING ((void *) 0x400 + POISON_POINTER_DELTA) + +/********** mm/slab.c **********/ +/* + * Magic nums for obj red zoning. + * Placed in the first word before and the first word after an obj. 
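Illustrative sketch, not part of the diff: the check_*_overflow() helpers introduced in include/linux/overflow.h above store the (possibly wrapped) result into *d and return true only when the mathematical result does not fit, so callers chain them and bail out before sizing an allocation. The wrapper name below is invented; it mirrors what __ab_c_size()/struct_size() do in the header itself.

        /* bytes = hdr + nmemb * elem_size, saturating to SIZE_MAX instead of wrapping */
        static inline size_t sized_alloc_bytes(size_t hdr, size_t nmemb, size_t elem_size)
        {
                size_t bytes;

                if (check_mul_overflow(nmemb, elem_size, &bytes) ||
                    check_add_overflow(bytes, hdr, &bytes))
                        return SIZE_MAX;        /* caller treats SIZE_MAX as "too big" */

                return bytes;
        }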
+ */ +#define RED_INACTIVE 0x09F911029D74E35BULL /* when obj is inactive */ +#define RED_ACTIVE 0xD84156C5635688C0ULL /* when obj is active */ + +#define SLUB_RED_INACTIVE 0xbb +#define SLUB_RED_ACTIVE 0xcc + +/* ...and for poisoning */ +#define POISON_INUSE 0x5a /* for use-uninitialised poisoning */ +#define POISON_FREE 0x6b /* for use-after-free poisoning */ +#define POISON_END 0xa5 /* end-byte of poisoning */ + +/********** arch/$ARCH/mm/init.c **********/ +#define POISON_FREE_INITMEM 0xcc + +/********** arch/ia64/hp/common/sba_iommu.c **********/ +/* + * arch/ia64/hp/common/sba_iommu.c uses a 16-byte poison string with a + * value of "SBAIOMMU POISON\0" for spill-over poisoning. + */ + +/********** fs/jbd/journal.c **********/ +#define JBD_POISON_FREE 0x5b +#define JBD2_POISON_FREE 0x5c + +/********** drivers/base/dmapool.c **********/ +#define POOL_POISON_FREED 0xa7 /* !inuse */ +#define POOL_POISON_ALLOCATED 0xa9 /* !initted */ + +/********** drivers/atm/ **********/ +#define ATM_POISON_FREE 0x12 +#define ATM_POISON 0xdeadbeef + +/********** kernel/mutexes **********/ +#define MUTEX_DEBUG_INIT 0x11 +#define MUTEX_DEBUG_FREE 0x22 +#define MUTEX_POISON_WW_CTX ((void *) 0x500 + POISON_POINTER_DELTA) + +/********** security/ **********/ +#define KEY_DESTROY 0xbd + +#endif diff --git a/include/linux/random.h b/include/linux/random.h index c38ae46..28c595a 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -45,6 +45,7 @@ static inline type get_random_##type(void) \ get_random_type(int); get_random_type(long); +get_random_type(u32); get_random_type(u64); #endif /* _LINUX_RANDOM_H */ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index c99d78a..ae29224 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -13,4 +13,32 @@ #define RCU_INIT_POINTER(p, v) WRITE_ONCE(p, v) +/* Has the specified rcu_head structure been handed to call_rcu()? */ + +/** + * rcu_head_init - Initialize rcu_head for rcu_head_after_call_rcu() + * @rhp: The rcu_head structure to initialize. + * + * If you intend to invoke rcu_head_after_call_rcu() to test whether a + * given rcu_head structure has already been passed to call_rcu(), then + * you must also invoke this rcu_head_init() function on it just after + * allocating that structure. Calls to this function must not race with + * calls to call_rcu(), rcu_head_after_call_rcu(), or callback invocation. + */ +static inline void rcu_head_init(struct rcu_head *rhp) +{ + rhp->func = (void *)~0L; +} + +static inline bool +rcu_head_after_call_rcu(struct rcu_head *rhp, + void (*f)(struct rcu_head *head)) +{ + void (*func)(struct rcu_head *head) = READ_ONCE(rhp->func); + + if (func == f) + return true; + return false; +} + #endif /* __TOOLS_LINUX_RCUPDATE_H */ diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h new file mode 100644 index 0000000..57467cb --- /dev/null +++ b/include/linux/rhashtable-types.h @@ -0,0 +1,135 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Resizable, Scalable, Concurrent Hash Table + * + * Simple structures that might be needed in include + * files. 
+ */ + +#ifndef _LINUX_RHASHTABLE_TYPES_H +#define _LINUX_RHASHTABLE_TYPES_H + +#include +#include +#include +#include + +struct rhash_head { + struct rhash_head __rcu *next; +}; + +struct rhlist_head { + struct rhash_head rhead; + struct rhlist_head __rcu *next; +}; + +struct bucket_table; + +/** + * struct rhashtable_compare_arg - Key for the function rhashtable_compare + * @ht: Hash table + * @key: Key to compare against + */ +struct rhashtable_compare_arg { + struct rhashtable *ht; + const void *key; +}; + +typedef u32 (*rht_hashfn_t)(const void *data, u32 len, u32 seed); +typedef u32 (*rht_obj_hashfn_t)(const void *data, u32 len, u32 seed); +typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg, + const void *obj); + +/** + * struct rhashtable_params - Hash table construction parameters + * @nelem_hint: Hint on number of elements, should be 75% of desired size + * @key_len: Length of key + * @key_offset: Offset of key in struct to be hashed + * @head_offset: Offset of rhash_head in struct to be hashed + * @max_size: Maximum size while expanding + * @min_size: Minimum size while shrinking + * @automatic_shrinking: Enable automatic shrinking of tables + * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash) + * @obj_hashfn: Function to hash object + * @obj_cmpfn: Function to compare key with object + */ +struct rhashtable_params { + u16 nelem_hint; + u16 key_len; + u16 key_offset; + u16 head_offset; + unsigned int max_size; + u16 min_size; + bool automatic_shrinking; + rht_hashfn_t hashfn; + rht_obj_hashfn_t obj_hashfn; + rht_obj_cmpfn_t obj_cmpfn; +}; + +/** + * struct rhashtable - Hash table handle + * @tbl: Bucket table + * @key_len: Key length for hashfn + * @max_elems: Maximum number of elements in table + * @p: Configuration parameters + * @rhlist: True if this is an rhltable + * @run_work: Deferred worker to expand/shrink asynchronously + * @mutex: Mutex to protect current/future table swapping + * @lock: Spin lock to protect walker list + * @nelems: Number of elements in table + */ +struct rhashtable { + struct bucket_table __rcu *tbl; + unsigned int key_len; + unsigned int max_elems; + struct rhashtable_params p; + bool rhlist; + struct work_struct run_work; + struct mutex mutex; + spinlock_t lock; + atomic_t nelems; +}; + +/** + * struct rhltable - Hash table with duplicate objects in a list + * @ht: Underlying rhtable + */ +struct rhltable { + struct rhashtable ht; +}; + +/** + * struct rhashtable_walker - Hash table walker + * @list: List entry on list of walkers + * @tbl: The table that we were walking over + */ +struct rhashtable_walker { + struct list_head list; + struct bucket_table *tbl; +}; + +/** + * struct rhashtable_iter - Hash table iterator + * @ht: Table to iterate through + * @p: Current pointer + * @list: Current hash list pointer + * @walker: Associated rhashtable walker + * @slot: Current slot + * @skip: Number of entries to skip in slot + */ +struct rhashtable_iter { + struct rhashtable *ht; + struct rhash_head *p; + struct rhlist_head *list; + struct rhashtable_walker walker; + unsigned int slot; + unsigned int skip; + bool end_of_table; +}; + +int rhashtable_init(struct rhashtable *ht, + const struct rhashtable_params *params); +int rhltable_init(struct rhltable *hlt, + const struct rhashtable_params *params); + +#endif /* _LINUX_RHASHTABLE_TYPES_H */ diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 8dbe153..6cf8c25 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -1,7 +1,8 @@ 
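Illustrative sketch, not part of the diff: struct rhashtable_params from rhashtable-types.h above is normally a file-scope const, so the params argument is a compile-time constant and the inline hash helpers later in rhashtable.h can specialize on key_len. The object, field and function names below are invented; rhashtable_init() and rhashtable_lookup_fast() are the standard interfaces these headers provide.

        struct cached_obj {
                u64                     key;
                struct rhash_head       hash;
                /* payload ... */
        };

        static const struct rhashtable_params cached_obj_params = {
                .key_len        = sizeof(u64),
                .key_offset     = offsetof(struct cached_obj, key),
                .head_offset    = offsetof(struct cached_obj, hash),
                .automatic_shrinking = true,
        };

        static int cached_obj_table_init(struct rhashtable *ht)
        {
                return rhashtable_init(ht, &cached_obj_params);
        }

        static struct cached_obj *cached_obj_find(struct rhashtable *ht, u64 key)
        {
                return rhashtable_lookup_fast(ht, &key, cached_obj_params);
        }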
+/* SPDX-License-Identifier: GPL-2.0 */ /* * Resizable, Scalable, Concurrent Hash Table * - * Copyright (c) 2015 Herbert Xu + * Copyright (c) 2015-2016 Herbert Xu * Copyright (c) 2014-2015 Thomas Graf * Copyright (c) 2008-2014 Patrick McHardy * @@ -17,92 +18,93 @@ #ifndef _LINUX_RHASHTABLE_H #define _LINUX_RHASHTABLE_H -#include -#include -#include #include #include #include -#include -#include -#include +#include #include +#include +#include +#include -#define RHT_BASE_BITS 4 -#define RHT_HASH_BITS 27 -#define RHT_BASE_SHIFT RHT_HASH_BITS -#define RHT_HASH_RESERVED_SPACE (RHT_BASE_BITS + 1) +#define BIT(nr) (1UL << (nr)) -struct rhash_head { - struct rhash_head __rcu *next; -}; +#include +/* + * Objects in an rhashtable have an embedded struct rhash_head + * which is linked into as hash chain from the hash table - or one + * of two or more hash tables when the rhashtable is being resized. + * The end of the chain is marked with a special nulls marks which has + * the least significant bit set but otherwise stores the address of + * the hash bucket. This allows us to be sure we've found the end + * of the right list. + * The value stored in the hash bucket has BIT(0) used as a lock bit. + * This bit must be atomically set before any changes are made to + * the chain. To avoid dereferencing this pointer without clearing + * the bit first, we use an opaque 'struct rhash_lock_head *' for the + * pointer stored in the bucket. This struct needs to be defined so + * that rcu_dereference() works on it, but it has no content so a + * cast is needed for it to be useful. This ensures it isn't + * used by mistake with clearing the lock bit first. + */ +struct rhash_lock_head {}; +/* Maximum chain length before rehash + * + * The maximum (not average) chain length grows with the size of the hash + * table, at a rate of (log N)/(log log N). + * + * The value of 16 is selected so that even if the hash table grew to + * 2^32 you would not expect the maximum chain length to exceed it + * unless we are under attack (or extremely unlucky). + * + * As this limit is only to detect attacks, we don't need to set it to a + * lower value as you'd need the chain length to vastly exceed 16 to have + * any real effect on the system. + */ +#define RHT_ELASTICITY 16u + +/** + * struct bucket_table - Table of hash buckets + * @size: Number of hash buckets + * @nest: Number of bits of first-level nested table. + * @rehash: Current bucket being rehashed + * @hash_rnd: Random seed to fold into hash + * @walkers: List of active walkers + * @rcu: RCU structure for freeing the table + * @future_tbl: Table under construction during rehashing + * @ntbl: Nested table used when out of memory. 
+ * @buckets: size * hash buckets + */ struct bucket_table { unsigned int size; - unsigned int rehash; + unsigned int nest; u32 hash_rnd; - unsigned int locks_mask; - spinlock_t *locks; struct list_head walkers; struct rcu_head rcu; struct bucket_table __rcu *future_tbl; - struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp; -}; - -struct rhashtable_compare_arg { - struct rhashtable *ht; - const void *key; + struct rhash_lock_head __rcu *buckets[] ____cacheline_aligned_in_smp; }; -typedef u32 (*rht_hashfn_t)(const void *data, u32 len, u32 seed); -typedef u32 (*rht_obj_hashfn_t)(const void *data, u32 len, u32 seed); -typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg, - const void *obj); - -struct rhashtable_params { - size_t nelem_hint; - size_t key_len; - size_t key_offset; - size_t head_offset; - unsigned int insecure_max_entries; - unsigned int max_size; - unsigned int min_size; - u32 nulls_base; - bool insecure_elasticity; - bool automatic_shrinking; - size_t locks_mul; - rht_hashfn_t hashfn; - rht_obj_hashfn_t obj_hashfn; - rht_obj_cmpfn_t obj_cmpfn; -}; - -struct rhashtable { - struct bucket_table __rcu *tbl; - atomic_t nelems; - unsigned int key_len; - unsigned int elasticity; - struct rhashtable_params p; - struct work_struct run_work; - struct mutex mutex; - spinlock_t lock; -}; - -struct rhashtable_walker { - struct list_head list; - struct bucket_table *tbl; -}; - -#define NULLS_MARKER(value) (1UL | (((long)value) << 1)) - -static inline unsigned long rht_marker(const struct rhashtable *ht, u32 hash) -{ - return NULLS_MARKER(ht->p.nulls_base + hash); -} - -#define INIT_RHT_NULLS_HEAD(ptr, ht, hash) \ - ((ptr) = (typeof(ptr)) rht_marker(ht, hash)) +/* + * NULLS_MARKER() expects a hash value with the low + * bits mostly likely to be significant, and it discards + * the msb. + * We give it an address, in which the bottom bit is + * always 0, and the msb might be significant. + * So we shift the address down one bit to align with + * expectations and avoid losing a significant bit. + * + * We never store the NULLS_MARKER in the hash table + * itself as we need the lsb for locking. + * Instead we store a NULL + */ +#define RHT_NULLS_MARKER(ptr) \ + ((void *)NULLS_MARKER(((unsigned long) (ptr)) >> 1)) +#define INIT_RHT_NULLS_HEAD(ptr) \ + ((ptr) = NULL) static inline bool rht_is_a_nulls(const struct rhash_head *ptr) { @@ -118,37 +120,45 @@ static inline void *rht_obj(const struct rhashtable *ht, static inline unsigned int rht_bucket_index(const struct bucket_table *tbl, unsigned int hash) { - return (hash >> RHT_HASH_RESERVED_SPACE) & (tbl->size - 1); + return hash & (tbl->size - 1); } -static inline unsigned int rht_key_hashfn( - struct rhashtable *ht, const struct bucket_table *tbl, - const void *key, const struct rhashtable_params params) +static inline unsigned int rht_key_get_hash(struct rhashtable *ht, + const void *key, const struct rhashtable_params params, + unsigned int hash_rnd) { unsigned int hash; /* params must be equal to ht->p if it isn't constant. 
*/ if (!__builtin_constant_p(params.key_len)) - hash = ht->p.hashfn(key, ht->key_len, tbl->hash_rnd); + hash = ht->p.hashfn(key, ht->key_len, hash_rnd); else if (params.key_len) { unsigned int key_len = params.key_len; if (params.hashfn) - hash = params.hashfn(key, key_len, tbl->hash_rnd); + hash = params.hashfn(key, key_len, hash_rnd); else if (key_len & (sizeof(u32) - 1)) - hash = jhash(key, key_len, tbl->hash_rnd); + hash = jhash(key, key_len, hash_rnd); else - hash = jhash2(key, key_len / sizeof(u32), - tbl->hash_rnd); + hash = jhash2(key, key_len / sizeof(u32), hash_rnd); } else { unsigned int key_len = ht->p.key_len; if (params.hashfn) - hash = params.hashfn(key, key_len, tbl->hash_rnd); + hash = params.hashfn(key, key_len, hash_rnd); else - hash = jhash(key, key_len, tbl->hash_rnd); + hash = jhash(key, key_len, hash_rnd); } + return hash; +} + +static inline unsigned int rht_key_hashfn( + struct rhashtable *ht, const struct bucket_table *tbl, + const void *key, const struct rhashtable_params params) +{ + unsigned int hash = rht_key_get_hash(ht, key, params, tbl->hash_rnd); + return rht_bucket_index(tbl, hash); } @@ -165,6 +175,11 @@ static inline unsigned int rht_head_hashfn( rht_key_hashfn(ht, tbl, ptr + params.key_offset, params); } +/** + * rht_grow_above_75 - returns true if nelems > 0.75 * table-size + * @ht: hash table + * @tbl: current table + */ static inline bool rht_grow_above_75(const struct rhashtable *ht, const struct bucket_table *tbl) { @@ -173,6 +188,11 @@ static inline bool rht_grow_above_75(const struct rhashtable *ht, (!ht->p.max_size || tbl->size < ht->p.max_size); } +/** + * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size + * @ht: hash table + * @tbl: current table + */ static inline bool rht_shrink_below_30(const struct rhashtable *ht, const struct bucket_table *tbl) { @@ -181,6 +201,11 @@ static inline bool rht_shrink_below_30(const struct rhashtable *ht, tbl->size > ht->p.min_size; } +/** + * rht_grow_above_100 - returns true if nelems > table-size + * @ht: hash table + * @tbl: current table + */ static inline bool rht_grow_above_100(const struct rhashtable *ht, const struct bucket_table *tbl) { @@ -188,62 +213,353 @@ static inline bool rht_grow_above_100(const struct rhashtable *ht, (!ht->p.max_size || tbl->size < ht->p.max_size); } +/** + * rht_grow_above_max - returns true if table is above maximum + * @ht: hash table + * @tbl: current table + */ static inline bool rht_grow_above_max(const struct rhashtable *ht, const struct bucket_table *tbl) { - return ht->p.insecure_max_entries && - atomic_read(&ht->nelems) >= ht->p.insecure_max_entries; + return atomic_read(&ht->nelems) >= ht->max_elems; } -static inline spinlock_t *rht_bucket_lock(const struct bucket_table *tbl, - unsigned int hash) +#ifdef CONFIG_PROVE_LOCKING +int lockdep_rht_mutex_is_held(struct rhashtable *ht); +int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash); +#else +static inline int lockdep_rht_mutex_is_held(struct rhashtable *ht) { - return &tbl->locks[hash & tbl->locks_mask]; + return 1; } -int rhashtable_insert_rehash(struct rhashtable *, struct bucket_table *); -struct bucket_table *rhashtable_insert_slow(struct rhashtable *, - const void *, - struct rhash_head *, - struct bucket_table *); +static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, + u32 hash) +{ + return 1; +} +#endif /* CONFIG_PROVE_LOCKING */ + +void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, + struct rhash_head *obj); -int 
rhashtable_init(struct rhashtable *, const struct rhashtable_params *); -void rhashtable_destroy(struct rhashtable *); +void rhashtable_walk_enter(struct rhashtable *ht, + struct rhashtable_iter *iter); +void rhashtable_walk_exit(struct rhashtable_iter *iter); +int rhashtable_walk_start_check(struct rhashtable_iter *iter) __acquires(RCU); -#define rht_dereference(p, ht) rcu_dereference(p) -#define rht_dereference_rcu(p, ht) rcu_dereference(p) -#define rht_dereference_bucket(p, tbl, hash) rcu_dereference(p) -#define rht_dereference_bucket_rcu(p, tbl, hash) rcu_dereference(p) +static inline void rhashtable_walk_start(struct rhashtable_iter *iter) +{ + (void)rhashtable_walk_start_check(iter); +} + +void *rhashtable_walk_next(struct rhashtable_iter *iter); +void *rhashtable_walk_peek(struct rhashtable_iter *iter); +void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU); + +void rhashtable_free_and_destroy(struct rhashtable *ht, + void (*free_fn)(void *ptr, void *arg), + void *arg); +void rhashtable_destroy(struct rhashtable *ht); + +struct rhash_lock_head __rcu **rht_bucket_nested( + const struct bucket_table *tbl, unsigned int hash); +struct rhash_lock_head __rcu **__rht_bucket_nested( + const struct bucket_table *tbl, unsigned int hash); +struct rhash_lock_head __rcu **rht_bucket_nested_insert( + struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash); + +#define rht_dereference(p, ht) \ + rcu_dereference(p) + +#define rht_dereference_rcu(p, ht) \ + rcu_dereference(p) + +#define rht_dereference_bucket(p, tbl, hash) \ + rcu_dereference(p) + +#define rht_dereference_bucket_rcu(p, tbl, hash) \ + rcu_dereference(p) #define rht_entry(tpos, pos, member) \ ({ tpos = container_of(pos, typeof(*tpos), member); 1; }) -#define rht_for_each_continue(pos, head, tbl, hash) \ - for (pos = rht_dereference_bucket(head, tbl, hash); \ - !rht_is_a_nulls(pos); \ +static inline struct rhash_lock_head __rcu *const *rht_bucket( + const struct bucket_table *tbl, unsigned int hash) +{ + return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) : + &tbl->buckets[hash]; +} + +static inline struct rhash_lock_head __rcu **rht_bucket_var( + struct bucket_table *tbl, unsigned int hash) +{ + return unlikely(tbl->nest) ? __rht_bucket_nested(tbl, hash) : + &tbl->buckets[hash]; +} + +static inline struct rhash_lock_head __rcu **rht_bucket_insert( + struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash) +{ + return unlikely(tbl->nest) ? rht_bucket_nested_insert(ht, tbl, hash) : + &tbl->buckets[hash]; +} + +/* + * We lock a bucket by setting BIT(0) in the pointer - this is always + * zero in real pointers. The NULLS mark is never stored in the bucket, + * rather we store NULL if the bucket is empty. + * bit_spin_locks do not handle contention well, but the whole point + * of the hashtable design is to achieve minimum per-bucket contention. + * A nested hash table might not have a bucket pointer. In that case + * we cannot get a lock. For remove and replace the bucket cannot be + * interesting and doesn't need locking. + * For insert we allocate the bucket if this is the last bucket_table, + * and then take the lock. + * Sometimes we unlock a bucket by writing a new pointer there. In that + * case we don't need to unlock, but we do need to reset state such as + * local_bh. For that we have rht_assign_unlock(). As rcu_assign_pointer() + * provides the same release semantics that bit_spin_unlock() provides, + * this is safe. 
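The rhashtable_walk_enter/start/next/stop/exit declarations above form the iteration API. A sketch of the usual calling sequence, as an illustration only (not from this patch), assuming the upstream kernel semantics where rhashtable_walk_next() returns ERR_PTR(-EAGAIN) when the table is resized mid-walk, and reusing the hypothetical struct test_obj from the first sketch:

static void test_walk_all(struct rhashtable *ht)
{
	struct rhashtable_iter iter;
	struct test_obj *obj;

	rhashtable_walk_enter(ht, &iter);
	rhashtable_walk_start(&iter);

	while ((obj = rhashtable_walk_next(&iter)) != NULL) {
		if (IS_ERR(obj)) {
			if (PTR_ERR(obj) == -EAGAIN)
				continue;	/* table was resized; entries may repeat */
			break;
		}
		/* use obj; it may be seen twice or missed if modified concurrently */
	}

	rhashtable_walk_stop(&iter);
	rhashtable_walk_exit(&iter);
}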
+ * When we write to a bucket without unlocking, we use rht_assign_locked(). + */ + +static inline void rht_lock(struct bucket_table *tbl, + struct rhash_lock_head __rcu **bkt) +{ + bit_spin_lock(0, (unsigned long *)bkt); +} + +static inline void rht_lock_nested(struct bucket_table *tbl, + struct rhash_lock_head __rcu **bucket, + unsigned int subclass) +{ + bit_spin_lock(0, (unsigned long *)bucket); +} + +static inline void rht_unlock(struct bucket_table *tbl, + struct rhash_lock_head __rcu **bkt) +{ + bit_spin_unlock(0, (unsigned long *)bkt); +} + +static inline struct rhash_head *__rht_ptr( + struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt) +{ + return (struct rhash_head *) + ((unsigned long)p & ~BIT(0) ?: + (unsigned long)RHT_NULLS_MARKER(bkt)); +} + +/* + * Where 'bkt' is a bucket and might be locked: + * rht_ptr_rcu() dereferences that pointer and clears the lock bit. + * rht_ptr() dereferences in a context where the bucket is locked. + * rht_ptr_exclusive() dereferences in a context where exclusive + * access is guaranteed, such as when destroying the table. + */ +static inline struct rhash_head *rht_ptr_rcu( + struct rhash_lock_head __rcu *const *bkt) +{ + return __rht_ptr(rcu_dereference(*bkt), bkt); +} + +static inline struct rhash_head *rht_ptr( + struct rhash_lock_head __rcu *const *bkt, + struct bucket_table *tbl, + unsigned int hash) +{ + return __rht_ptr(rht_dereference_bucket(*bkt, tbl, hash), bkt); +} + +static inline struct rhash_head *rht_ptr_exclusive( + struct rhash_lock_head __rcu *const *bkt) +{ + return __rht_ptr(rcu_dereference(*bkt), bkt); +} + +static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt, + struct rhash_head *obj) +{ + if (rht_is_a_nulls(obj)) + obj = NULL; + rcu_assign_pointer(*bkt, (void *)((unsigned long)obj | BIT(0))); +} + +static inline void rht_assign_unlock(struct bucket_table *tbl, + struct rhash_lock_head __rcu **bkt, + struct rhash_head *obj) +{ + if (rht_is_a_nulls(obj)) + obj = NULL; + rcu_assign_pointer(*bkt, (void *)obj); + preempt_enable(); + __release(bitlock); +} + +/** + * rht_for_each_from - iterate over hash chain from given head + * @pos: the &struct rhash_head to use as a loop cursor. + * @head: the &struct rhash_head to start from + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + */ +#define rht_for_each_from(pos, head, tbl, hash) \ + for (pos = head; \ + !rht_is_a_nulls(pos); \ pos = rht_dereference_bucket((pos)->next, tbl, hash)) +/** + * rht_for_each - iterate over hash chain + * @pos: the &struct rhash_head to use as a loop cursor. + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + */ #define rht_for_each(pos, tbl, hash) \ - rht_for_each_continue(pos, (tbl)->buckets[hash], tbl, hash) + rht_for_each_from(pos, rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ + tbl, hash) + +/** + * rht_for_each_entry_from - iterate over hash chain from given head + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rhash_head to use as a loop cursor. + * @head: the &struct rhash_head to start from + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * @member: name of the &struct rhash_head within the hashable struct. 
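rht_lock(), rht_ptr() and rht_assign_unlock() above are the building blocks of the bit-lock scheme the preceding comment describes. A compressed sketch of how they compose on the insert path, purely illustrative (the real logic is in __rhashtable_insert_fast() further down in this file):

static void example_publish_at_head(struct bucket_table *tbl,
				    struct rhash_lock_head __rcu **bkt,
				    unsigned int hash,
				    struct rhash_head *obj)
{
	struct rhash_head *head;

	rht_lock(tbl, bkt);			/* sets BIT(0) of the bucket pointer */
	head = rht_ptr(bkt, tbl, hash);		/* current chain head, lock bit masked off */
	RCU_INIT_POINTER(obj->next, head);
	rht_assign_unlock(tbl, bkt, obj);	/* one store publishes obj and drops the lock */
}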
+ */ +#define rht_for_each_entry_from(tpos, pos, head, tbl, hash, member) \ + for (pos = head; \ + (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ + pos = rht_dereference_bucket((pos)->next, tbl, hash)) -#define rht_for_each_rcu_continue(pos, head, tbl, hash) \ +/** + * rht_for_each_entry - iterate over hash chain of given type + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rhash_head to use as a loop cursor. + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * @member: name of the &struct rhash_head within the hashable struct. + */ +#define rht_for_each_entry(tpos, pos, tbl, hash, member) \ + rht_for_each_entry_from(tpos, pos, \ + rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ + tbl, hash, member) + +/** + * rht_for_each_entry_safe - safely iterate over hash chain of given type + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rhash_head to use as a loop cursor. + * @next: the &struct rhash_head to use as next in loop cursor. + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * @member: name of the &struct rhash_head within the hashable struct. + * + * This hash chain list-traversal primitive allows for the looped code to + * remove the loop cursor from the list. + */ +#define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member) \ + for (pos = rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ + next = !rht_is_a_nulls(pos) ? \ + rht_dereference_bucket(pos->next, tbl, hash) : NULL; \ + (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ + pos = next, \ + next = !rht_is_a_nulls(pos) ? \ + rht_dereference_bucket(pos->next, tbl, hash) : NULL) + +/** + * rht_for_each_rcu_from - iterate over rcu hash chain from given head + * @pos: the &struct rhash_head to use as a loop cursor. + * @head: the &struct rhash_head to start from + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * + * This hash chain list-traversal primitive may safely run concurrently with + * the _rcu mutation primitives such as rhashtable_insert() as long as the + * traversal is guarded by rcu_read_lock(). + */ +#define rht_for_each_rcu_from(pos, head, tbl, hash) \ for (({barrier(); }), \ - pos = rht_dereference_bucket_rcu(head, tbl, hash); \ + pos = head; \ !rht_is_a_nulls(pos); \ pos = rcu_dereference_raw(pos->next)) -#define rht_for_each_rcu(pos, tbl, hash) \ - rht_for_each_rcu_continue(pos, (tbl)->buckets[hash], tbl, hash) +/** + * rht_for_each_rcu - iterate over rcu hash chain + * @pos: the &struct rhash_head to use as a loop cursor. + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * + * This hash chain list-traversal primitive may safely run concurrently with + * the _rcu mutation primitives such as rhashtable_insert() as long as the + * traversal is guarded by rcu_read_lock(). + */ +#define rht_for_each_rcu(pos, tbl, hash) \ + for (({barrier(); }), \ + pos = rht_ptr_rcu(rht_bucket(tbl, hash)); \ + !rht_is_a_nulls(pos); \ + pos = rcu_dereference_raw(pos->next)) -#define rht_for_each_entry_rcu_continue(tpos, pos, head, tbl, hash, member) \ +/** + * rht_for_each_entry_rcu_from - iterated over rcu hash chain from given head + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rhash_head to use as a loop cursor. + * @head: the &struct rhash_head to start from + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * @member: name of the &struct rhash_head within the hashable struct. 
+ * + * This hash chain list-traversal primitive may safely run concurrently with + * the _rcu mutation primitives such as rhashtable_insert() as long as the + * traversal is guarded by rcu_read_lock(). + */ +#define rht_for_each_entry_rcu_from(tpos, pos, head, tbl, hash, member) \ for (({barrier(); }), \ - pos = rht_dereference_bucket_rcu(head, tbl, hash); \ + pos = head; \ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ pos = rht_dereference_bucket_rcu(pos->next, tbl, hash)) -#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \ - rht_for_each_entry_rcu_continue(tpos, pos, (tbl)->buckets[hash],\ - tbl, hash, member) +/** + * rht_for_each_entry_rcu - iterate over rcu hash chain of given type + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rhash_head to use as a loop cursor. + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * @member: name of the &struct rhash_head within the hashable struct. + * + * This hash chain list-traversal primitive may safely run concurrently with + * the _rcu mutation primitives such as rhashtable_insert() as long as the + * traversal is guarded by rcu_read_lock(). + */ +#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \ + rht_for_each_entry_rcu_from(tpos, pos, \ + rht_ptr_rcu(rht_bucket(tbl, hash)), \ + tbl, hash, member) + +/** + * rhl_for_each_rcu - iterate over rcu hash table list + * @pos: the &struct rlist_head to use as a loop cursor. + * @list: the head of the list + * + * This hash chain list-traversal primitive should be used on the + * list returned by rhltable_lookup. + */ +#define rhl_for_each_rcu(pos, list) \ + for (pos = list; pos; pos = rcu_dereference_raw(pos->next)) + +/** + * rhl_for_each_entry_rcu - iterate over rcu hash table list of given type + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rlist_head to use as a loop cursor. + * @list: the head of the list + * @member: name of the &struct rlist_head within the hashable struct. + * + * This hash chain list-traversal primitive should be used on the + * list returned by rhltable_lookup. + */ +#define rhl_for_each_entry_rcu(tpos, pos, list, member) \ + for (pos = list; pos && rht_entry(tpos, pos, member); \ + pos = rcu_dereference_raw(pos->next)) static inline int rhashtable_compare(struct rhashtable_compare_arg *arg, const void *obj) @@ -254,7 +570,8 @@ static inline int rhashtable_compare(struct rhashtable_compare_arg *arg, return memcmp(ptr + ht->p.key_offset, arg->key, ht->p.key_len); } -static inline void *rhashtable_lookup_fast( +/* Internal function, do not use. */ +static inline struct rhash_head *__rhashtable_lookup( struct rhashtable *ht, const void *key, const struct rhashtable_params params) { @@ -262,23 +579,27 @@ static inline void *rhashtable_lookup_fast( .ht = ht, .key = key, }; - const struct bucket_table *tbl; + struct rhash_lock_head __rcu *const *bkt; + struct bucket_table *tbl; struct rhash_head *he; unsigned int hash; - rcu_read_lock(); - tbl = rht_dereference_rcu(ht->tbl, ht); restart: hash = rht_key_hashfn(ht, tbl, key, params); - rht_for_each_rcu(he, tbl, hash) { - if (params.obj_cmpfn ? - params.obj_cmpfn(&arg, rht_obj(ht, he)) : - rhashtable_compare(&arg, rht_obj(ht, he))) - continue; - rcu_read_unlock(); - return rht_obj(ht, he); - } + bkt = rht_bucket(tbl, hash); + do { + rht_for_each_rcu_from(he, rht_ptr_rcu(bkt), tbl, hash) { + if (params.obj_cmpfn ? 
+ params.obj_cmpfn(&arg, rht_obj(ht, he)) : + rhashtable_compare(&arg, rht_obj(ht, he))) + continue; + return he; + } + /* An object might have been moved to a different hash chain, + * while we walk along it - better check and retry. + */ + } while (he != RHT_NULLS_MARKER(bkt)); /* Ensure we see any new tables. */ smp_rmb(); @@ -286,149 +607,593 @@ restart: tbl = rht_dereference_rcu(tbl->future_tbl, ht); if (unlikely(tbl)) goto restart; - rcu_read_unlock(); return NULL; } -static inline int __rhashtable_insert_fast( - struct rhashtable *ht, const void *key, struct rhash_head *obj, +/** + * rhashtable_lookup - search hash table + * @ht: hash table + * @key: the pointer to the key + * @params: hash table parameters + * + * Computes the hash value for the key and traverses the bucket chain looking + * for a entry with an identical key. The first matching entry is returned. + * + * This must only be called under the RCU read lock. + * + * Returns the first entry on which the compare function returned true. + */ +static inline void *rhashtable_lookup( + struct rhashtable *ht, const void *key, + const struct rhashtable_params params) +{ + struct rhash_head *he = __rhashtable_lookup(ht, key, params); + + return he ? rht_obj(ht, he) : NULL; +} + +/** + * rhashtable_lookup_fast - search hash table, without RCU read lock + * @ht: hash table + * @key: the pointer to the key + * @params: hash table parameters + * + * Computes the hash value for the key and traverses the bucket chain looking + * for a entry with an identical key. The first matching entry is returned. + * + * Only use this function when you have other mechanisms guaranteeing + * that the object won't go away after the RCU read lock is released. + * + * Returns the first entry on which the compare function returned true. + */ +static inline void *rhashtable_lookup_fast( + struct rhashtable *ht, const void *key, + const struct rhashtable_params params) +{ + void *obj; + + rcu_read_lock(); + obj = rhashtable_lookup(ht, key, params); + rcu_read_unlock(); + + return obj; +} + +/** + * rhltable_lookup - search hash list table + * @hlt: hash table + * @key: the pointer to the key + * @params: hash table parameters + * + * Computes the hash value for the key and traverses the bucket chain looking + * for a entry with an identical key. All matching entries are returned + * in a list. + * + * This must only be called under the RCU read lock. + * + * Returns the list of entries that match the given key. + */ +static inline struct rhlist_head *rhltable_lookup( + struct rhltable *hlt, const void *key, const struct rhashtable_params params) +{ + struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params); + + return he ? container_of(he, struct rhlist_head, rhead) : NULL; +} + +/* Internal function, please use rhashtable_insert_fast() instead. This + * function returns the existing element already in hashes in there is a clash, + * otherwise it returns an error via ERR_PTR(). 
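Per the kernel-doc above, rhashtable_lookup() requires the caller to hold the RCU read lock, rhashtable_lookup_fast() takes it internally, and rhltable_lookup() returns the whole duplicate list. A lookup sketch, illustrative only, reusing the hypothetical test_obj/test_params from the first sketch:

static struct test_obj *test_find(struct rhashtable *ht, u32 key)
{
	struct test_obj *obj;

	rcu_read_lock();
	obj = rhashtable_lookup(ht, &key, test_params);
	/* obj is only guaranteed valid inside this RCU section unless some
	 * other reference-counting scheme protects it. */
	rcu_read_unlock();

	return obj;
}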
+ */ +static inline void *__rhashtable_insert_fast( + struct rhashtable *ht, const void *key, struct rhash_head *obj, + const struct rhashtable_params params, bool rhlist) { struct rhashtable_compare_arg arg = { .ht = ht, .key = key, }; - struct bucket_table *tbl, *new_tbl; + struct rhash_lock_head __rcu **bkt; + struct rhash_head __rcu **pprev; + struct bucket_table *tbl; struct rhash_head *head; - spinlock_t *lock; - unsigned int elasticity; unsigned int hash; - int err; + int elasticity; + void *data; -restart: rcu_read_lock(); tbl = rht_dereference_rcu(ht->tbl, ht); + hash = rht_head_hashfn(ht, tbl, obj, params); + elasticity = RHT_ELASTICITY; + bkt = rht_bucket_insert(ht, tbl, hash); + data = ERR_PTR(-ENOMEM); + if (!bkt) + goto out; + pprev = NULL; + rht_lock(tbl, bkt); - /* All insertions must grab the oldest table containing - * the hashed bucket that is yet to be rehashed. - */ - for (;;) { - hash = rht_head_hashfn(ht, tbl, obj, params); - lock = rht_bucket_lock(tbl, hash); - spin_lock_bh(lock); + if (unlikely(rcu_access_pointer(tbl->future_tbl))) { +slow_path: + rht_unlock(tbl, bkt); + rcu_read_unlock(); + return rhashtable_insert_slow(ht, key, obj); + } - if (tbl->rehash <= hash) - break; + rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) { + struct rhlist_head *plist; + struct rhlist_head *list; - spin_unlock_bh(lock); - tbl = rht_dereference_rcu(tbl->future_tbl, ht); - } + elasticity--; + if (!key || + (params.obj_cmpfn ? + params.obj_cmpfn(&arg, rht_obj(ht, head)) : + rhashtable_compare(&arg, rht_obj(ht, head)))) { + pprev = &head->next; + continue; + } - new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); - if (unlikely(new_tbl)) { - tbl = rhashtable_insert_slow(ht, key, obj, new_tbl); - if (!IS_ERR_OR_NULL(tbl)) - goto slow_path; + data = rht_obj(ht, head); - err = PTR_ERR(tbl); - goto out; - } + if (!rhlist) + goto out_unlock; - err = -E2BIG; - if (unlikely(rht_grow_above_max(ht, tbl))) - goto out; - if (unlikely(rht_grow_above_100(ht, tbl))) { -slow_path: - spin_unlock_bh(lock); - err = rhashtable_insert_rehash(ht, tbl); - rcu_read_unlock(); - if (err) - return err; + list = container_of(obj, struct rhlist_head, rhead); + plist = container_of(head, struct rhlist_head, rhead); - goto restart; + RCU_INIT_POINTER(list->next, plist); + head = rht_dereference_bucket(head->next, tbl, hash); + RCU_INIT_POINTER(list->rhead.next, head); + if (pprev) { + rcu_assign_pointer(*pprev, obj); + rht_unlock(tbl, bkt); + } else + rht_assign_unlock(tbl, bkt, obj); + data = NULL; + goto out; } - err = -EEXIST; - elasticity = ht->elasticity; - rht_for_each(head, tbl, hash) { - if (key && - unlikely(!(params.obj_cmpfn ? - params.obj_cmpfn(&arg, rht_obj(ht, head)) : - rhashtable_compare(&arg, rht_obj(ht, head))))) - goto out; - if (!--elasticity) - goto slow_path; - } + if (elasticity <= 0) + goto slow_path; + + data = ERR_PTR(-E2BIG); + if (unlikely(rht_grow_above_max(ht, tbl))) + goto out_unlock; - err = 0; + if (unlikely(rht_grow_above_100(ht, tbl))) + goto slow_path; - head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); + /* Inserting at head of list makes unlocking free. 
*/ + head = rht_ptr(bkt, tbl, hash); RCU_INIT_POINTER(obj->next, head); + if (rhlist) { + struct rhlist_head *list; - rcu_assign_pointer(tbl->buckets[hash], obj); + list = container_of(obj, struct rhlist_head, rhead); + RCU_INIT_POINTER(list->next, NULL); + } atomic_inc(&ht->nelems); + rht_assign_unlock(tbl, bkt, obj); + if (rht_grow_above_75(ht, tbl)) schedule_work(&ht->run_work); + data = NULL; out: - spin_unlock_bh(lock); rcu_read_unlock(); - return err; + return data; + +out_unlock: + rht_unlock(tbl, bkt); + goto out; } +/** + * rhashtable_insert_fast - insert object into hash table + * @ht: hash table + * @obj: pointer to hash head inside object + * @params: hash table parameters + * + * Will take the per bucket bitlock to protect against mutual mutations + * on the same bucket. Multiple insertions may occur in parallel unless + * they map to the same bucket. + * + * It is safe to call this function from atomic context. + * + * Will trigger an automatic deferred table resizing if residency in the + * table grows beyond 70%. + */ +static inline int rhashtable_insert_fast( + struct rhashtable *ht, struct rhash_head *obj, + const struct rhashtable_params params) +{ + void *ret; + + ret = __rhashtable_insert_fast(ht, NULL, obj, params, false); + if (IS_ERR(ret)) + return PTR_ERR(ret); + + return ret == NULL ? 0 : -EEXIST; +} + +/** + * rhltable_insert_key - insert object into hash list table + * @hlt: hash list table + * @key: the pointer to the key + * @list: pointer to hash list head inside object + * @params: hash table parameters + * + * Will take the per bucket bitlock to protect against mutual mutations + * on the same bucket. Multiple insertions may occur in parallel unless + * they map to the same bucket. + * + * It is safe to call this function from atomic context. + * + * Will trigger an automatic deferred table resizing if residency in the + * table grows beyond 70%. + */ +static inline int rhltable_insert_key( + struct rhltable *hlt, const void *key, struct rhlist_head *list, + const struct rhashtable_params params) +{ + return PTR_ERR(__rhashtable_insert_fast(&hlt->ht, key, &list->rhead, + params, true)); +} + +/** + * rhltable_insert - insert object into hash list table + * @hlt: hash list table + * @list: pointer to hash list head inside object + * @params: hash table parameters + * + * Will take the per bucket bitlock to protect against mutual mutations + * on the same bucket. Multiple insertions may occur in parallel unless + * they map to the same bucket. + * + * It is safe to call this function from atomic context. + * + * Will trigger an automatic deferred table resizing if residency in the + * table grows beyond 70%. + */ +static inline int rhltable_insert( + struct rhltable *hlt, struct rhlist_head *list, + const struct rhashtable_params params) +{ + const char *key = rht_obj(&hlt->ht, &list->rhead); + + key += params.key_offset; + + return rhltable_insert_key(hlt, key, list, params); +} + +/** + * rhashtable_lookup_insert_fast - lookup and insert object into hash table + * @ht: hash table + * @obj: pointer to hash head inside object + * @params: hash table parameters + * + * This lookup function may only be used for fixed key hash table (key_len + * parameter set). It will BUG() if used inappropriately. + * + * It is safe to call this function from atomic context. + * + * Will trigger an automatic deferred table resizing if residency in the + * table grows beyond 70%. 
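rhashtable_insert_fast() and rhltable_insert() above are the plain insertion entry points; note that insert_fast hands a NULL key to __rhashtable_insert_fast(), so duplicates are not rejected on this path - the lookup_insert variants below are the ones that enforce uniqueness. A sketch, illustrative only, with the hypothetical test_obj/test_params as before:

static int test_add(struct rhashtable *ht, struct test_obj *obj)
{
	/* Returns 0 on success, or a negative errno such as -ENOMEM or
	 * -E2BIG (both visible in __rhashtable_insert_fast() above). */
	return rhashtable_insert_fast(ht, &obj->node, test_params);
}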
+ */ static inline int rhashtable_lookup_insert_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { const char *key = rht_obj(ht, obj); + void *ret; BUG_ON(ht->p.obj_hashfn); - return __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, - params); + ret = __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params, + false); + if (IS_ERR(ret)) + return PTR_ERR(ret); + + return ret == NULL ? 0 : -EEXIST; } -static inline int __rhashtable_remove_fast( +/** + * rhashtable_lookup_get_insert_fast - lookup and insert object into hash table + * @ht: hash table + * @obj: pointer to hash head inside object + * @params: hash table parameters + * + * Just like rhashtable_lookup_insert_fast(), but this function returns the + * object if it exists, NULL if it did not and the insertion was successful, + * and an ERR_PTR otherwise. + */ +static inline void *rhashtable_lookup_get_insert_fast( + struct rhashtable *ht, struct rhash_head *obj, + const struct rhashtable_params params) +{ + const char *key = rht_obj(ht, obj); + + BUG_ON(ht->p.obj_hashfn); + + return __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params, + false); +} + +/** + * rhashtable_lookup_insert_key - search and insert object to hash table + * with explicit key + * @ht: hash table + * @key: key + * @obj: pointer to hash head inside object + * @params: hash table parameters + * + * Lookups may occur in parallel with hashtable mutations and resizing. + * + * Will trigger an automatic deferred table resizing if residency in the + * table grows beyond 70%. + * + * Returns zero on success. + */ +static inline int rhashtable_lookup_insert_key( + struct rhashtable *ht, const void *key, struct rhash_head *obj, + const struct rhashtable_params params) +{ + void *ret; + + BUG_ON(!ht->p.obj_hashfn || !key); + + ret = __rhashtable_insert_fast(ht, key, obj, params, false); + if (IS_ERR(ret)) + return PTR_ERR(ret); + + return ret == NULL ? 0 : -EEXIST; +} + +/** + * rhashtable_lookup_get_insert_key - lookup and insert object into hash table + * @ht: hash table + * @key: key + * @obj: pointer to hash head inside object + * @params: hash table parameters + * + * Just like rhashtable_lookup_insert_key(), but this function returns the + * object if it exists, NULL if it does not and the insertion was successful, + * and an ERR_PTR otherwise. 
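rhashtable_lookup_get_insert_fast() above implements "insert unless an equal key already exists" and hands back the clashing object instead of just an error code. A sketch of the usual calling pattern, illustrative only, with the hypothetical test_obj/test_params as before:

static struct test_obj *test_add_or_get(struct rhashtable *ht,
					struct test_obj *new)
{
	struct test_obj *old;

	old = rhashtable_lookup_get_insert_fast(ht, &new->node, test_params);
	if (IS_ERR(old))
		return old;		/* e.g. -ENOMEM from the insert path */
	return old ?: new;		/* existing object, or 'new' was inserted */
}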
+ */ +static inline void *rhashtable_lookup_get_insert_key( + struct rhashtable *ht, const void *key, struct rhash_head *obj, + const struct rhashtable_params params) +{ + BUG_ON(!ht->p.obj_hashfn || !key); + + return __rhashtable_insert_fast(ht, key, obj, params, false); +} + +/* Internal function, please use rhashtable_remove_fast() instead */ +static inline int __rhashtable_remove_fast_one( struct rhashtable *ht, struct bucket_table *tbl, - struct rhash_head *obj, const struct rhashtable_params params) + struct rhash_head *obj, const struct rhashtable_params params, + bool rhlist) { + struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct rhash_head *he; - spinlock_t * lock; unsigned int hash; int err = -ENOENT; hash = rht_head_hashfn(ht, tbl, obj, params); - lock = rht_bucket_lock(tbl, hash); + bkt = rht_bucket_var(tbl, hash); + if (!bkt) + return -ENOENT; + pprev = NULL; + rht_lock(tbl, bkt); - spin_lock_bh(lock); + rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) { + struct rhlist_head *list; + + list = container_of(he, struct rhlist_head, rhead); - pprev = &tbl->buckets[hash]; - rht_for_each(he, tbl, hash) { if (he != obj) { + struct rhlist_head __rcu **lpprev; + pprev = &he->next; - continue; + + if (!rhlist) + continue; + + do { + lpprev = &list->next; + list = rht_dereference_bucket(list->next, + tbl, hash); + } while (list && obj != &list->rhead); + + if (!list) + continue; + + list = rht_dereference_bucket(list->next, tbl, hash); + RCU_INIT_POINTER(*lpprev, list); + err = 0; + break; } - rcu_assign_pointer(*pprev, obj->next); + obj = rht_dereference_bucket(obj->next, tbl, hash); + err = 1; + + if (rhlist) { + list = rht_dereference_bucket(list->next, tbl, hash); + if (list) { + RCU_INIT_POINTER(list->rhead.next, obj); + obj = &list->rhead; + err = 0; + } + } + + if (pprev) { + rcu_assign_pointer(*pprev, obj); + rht_unlock(tbl, bkt); + } else { + rht_assign_unlock(tbl, bkt, obj); + } + goto unlocked; + } + + rht_unlock(tbl, bkt); +unlocked: + if (err > 0) { + atomic_dec(&ht->nelems); + if (unlikely(ht->p.automatic_shrinking && + rht_shrink_below_30(ht, tbl))) + schedule_work(&ht->run_work); err = 0; - break; } - spin_unlock_bh(lock); + return err; +} + +/* Internal function, please use rhashtable_remove_fast() instead */ +static inline int __rhashtable_remove_fast( + struct rhashtable *ht, struct rhash_head *obj, + const struct rhashtable_params params, bool rhlist) +{ + struct bucket_table *tbl; + int err; + + rcu_read_lock(); + + tbl = rht_dereference_rcu(ht->tbl, ht); + + /* Because we have already taken (and released) the bucket + * lock in old_tbl, if we find that future_tbl is not yet + * visible then that guarantees the entry to still be in + * the old tbl if it exists. + */ + while ((err = __rhashtable_remove_fast_one(ht, tbl, obj, params, + rhlist)) && + (tbl = rht_dereference_rcu(tbl->future_tbl, ht))) + ; + + rcu_read_unlock(); return err; } +/** + * rhashtable_remove_fast - remove object from hash table + * @ht: hash table + * @obj: pointer to hash head inside object + * @params: hash table parameters + * + * Since the hash chain is single linked, the removal operation needs to + * walk the bucket chain upon removal. The removal operation is thus + * considerable slow if the hash table is not correctly sized. + * + * Will automatically shrink the table if permitted when residency drops + * below 30%. + * + * Returns zero on success, -ENOENT if the entry could not be found. 
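rhashtable_remove_fast() below unlinks the object, but because lookups run under RCU the memory must not be reused until a grace period has passed. A removal sketch, illustrative only, assuming the hypothetical struct test_obj additionally carries a struct rcu_head member named rcu:

static void test_del(struct rhashtable *ht, struct test_obj *obj)
{
	if (rhashtable_remove_fast(ht, &obj->node, test_params) == 0)
		kfree_rcu(obj, rcu);	/* free only after concurrent readers are done */
}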
+ */ static inline int rhashtable_remove_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) +{ + return __rhashtable_remove_fast(ht, obj, params, false); +} + +/** + * rhltable_remove - remove object from hash list table + * @hlt: hash list table + * @list: pointer to hash list head inside object + * @params: hash table parameters + * + * Since the hash chain is single linked, the removal operation needs to + * walk the bucket chain upon removal. The removal operation is thus + * considerable slow if the hash table is not correctly sized. + * + * Will automatically shrink the table if permitted when residency drops + * below 30% + * + * Returns zero on success, -ENOENT if the entry could not be found. + */ +static inline int rhltable_remove( + struct rhltable *hlt, struct rhlist_head *list, + const struct rhashtable_params params) +{ + return __rhashtable_remove_fast(&hlt->ht, &list->rhead, params, true); +} + +/* Internal function, please use rhashtable_replace_fast() instead */ +static inline int __rhashtable_replace_fast( + struct rhashtable *ht, struct bucket_table *tbl, + struct rhash_head *obj_old, struct rhash_head *obj_new, + const struct rhashtable_params params) +{ + struct rhash_lock_head __rcu **bkt; + struct rhash_head __rcu **pprev; + struct rhash_head *he; + unsigned int hash; + int err = -ENOENT; + + /* Minimally, the old and new objects must have same hash + * (which should mean identifiers are the same). + */ + hash = rht_head_hashfn(ht, tbl, obj_old, params); + if (hash != rht_head_hashfn(ht, tbl, obj_new, params)) + return -EINVAL; + + bkt = rht_bucket_var(tbl, hash); + if (!bkt) + return -ENOENT; + + pprev = NULL; + rht_lock(tbl, bkt); + + rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) { + if (he != obj_old) { + pprev = &he->next; + continue; + } + + rcu_assign_pointer(obj_new->next, obj_old->next); + if (pprev) { + rcu_assign_pointer(*pprev, obj_new); + rht_unlock(tbl, bkt); + } else { + rht_assign_unlock(tbl, bkt, obj_new); + } + err = 0; + goto unlocked; + } + + rht_unlock(tbl, bkt); + +unlocked: + return err; +} + +/** + * rhashtable_replace_fast - replace an object in hash table + * @ht: hash table + * @obj_old: pointer to hash head inside object being replaced + * @obj_new: pointer to hash head inside object which is new + * @params: hash table parameters + * + * Replacing an object doesn't affect the number of elements in the hash table + * or bucket, so we don't need to worry about shrinking or expanding the + * table here. + * + * Returns zero on success, -ENOENT if the entry could not be found, + * -EINVAL if hash is not the same for the old and new objects. + */ +static inline int rhashtable_replace_fast( + struct rhashtable *ht, struct rhash_head *obj_old, + struct rhash_head *obj_new, + const struct rhashtable_params params) { struct bucket_table *tbl; int err; @@ -442,22 +1207,62 @@ static inline int rhashtable_remove_fast( * visible then that guarantees the entry to still be in * the old tbl if it exists. 
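rhltable_remove() above rounds out the duplicate-friendly rhltable API (rhltable_init() in rhashtable-types.h, rhltable_insert(), rhltable_lookup(), rhl_for_each_entry_rcu()). A sketch of visiting every object that shares one key, illustrative only; struct test_dup and test_dup_params are hypothetical, with head_offset pointing at the embedded struct rhlist_head:

struct test_dup {
	u32			id;
	struct rhlist_head	node;	/* duplicates chain through node.next */
};

static const struct rhashtable_params test_dup_params = {
	.key_len	= sizeof(u32),
	.key_offset	= offsetof(struct test_dup, id),
	.head_offset	= offsetof(struct test_dup, node),
	.automatic_shrinking = true,
};

static void test_dup_for_each(struct rhltable *hlt, u32 id)
{
	struct rhlist_head *list, *pos;
	struct test_dup *d;

	rcu_read_lock();
	list = rhltable_lookup(hlt, &id, test_dup_params);
	rhl_for_each_entry_rcu(d, pos, list, node) {
		/* every object whose id matches is visited here as 'd' */
	}
	rcu_read_unlock();
}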
*/ - while ((err = __rhashtable_remove_fast(ht, tbl, obj, params)) && + while ((err = __rhashtable_replace_fast(ht, tbl, obj_old, + obj_new, params)) && (tbl = rht_dereference_rcu(tbl->future_tbl, ht))) ; - if (err) - goto out; - - atomic_dec(&ht->nelems); - if (unlikely(ht->p.automatic_shrinking && - rht_shrink_below_30(ht, tbl))) - schedule_work(&ht->run_work); - -out: rcu_read_unlock(); return err; } +/** + * rhltable_walk_enter - Initialise an iterator + * @hlt: Table to walk over + * @iter: Hash table Iterator + * + * This function prepares a hash table walk. + * + * Note that if you restart a walk after rhashtable_walk_stop you + * may see the same object twice. Also, you may miss objects if + * there are removals in between rhashtable_walk_stop and the next + * call to rhashtable_walk_start. + * + * For a completely stable walk you should construct your own data + * structure outside the hash table. + * + * This function may be called from any process context, including + * non-preemptable context, but cannot be called from softirq or + * hardirq context. + * + * You must call rhashtable_walk_exit after this function returns. + */ +static inline void rhltable_walk_enter(struct rhltable *hlt, + struct rhashtable_iter *iter) +{ + return rhashtable_walk_enter(&hlt->ht, iter); +} + +/** + * rhltable_free_and_destroy - free elements and destroy hash list table + * @hlt: the hash list table to destroy + * @free_fn: callback to release resources of element + * @arg: pointer passed to free_fn + * + * See documentation for rhashtable_free_and_destroy. + */ +static inline void rhltable_free_and_destroy(struct rhltable *hlt, + void (*free_fn)(void *ptr, + void *arg), + void *arg) +{ + return rhashtable_free_and_destroy(&hlt->ht, free_fn, arg); +} + +static inline void rhltable_destroy(struct rhltable *hlt) +{ + return rhltable_free_and_destroy(hlt, NULL, NULL); +} + #endif /* _LINUX_RHASHTABLE_H */ diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 347105c..03feda7 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -1,7 +1,8 @@ #ifndef _LINUX_SCHED_MM_H #define _LINUX_SCHED_MM_H -#define PF_MEMALLOC_NOFS 0 +#define PF_MEMALLOC 0x00000800 /* Allocating memory */ +#define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */ static inline unsigned int memalloc_nofs_save(void) { @@ -15,4 +16,16 @@ static inline void memalloc_nofs_restore(unsigned int flags) current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags; } +static inline unsigned int memalloc_noreclaim_save(void) +{ + unsigned int flags = current->flags & PF_MEMALLOC; + current->flags |= PF_MEMALLOC; + return flags; +} + +static inline void memalloc_noreclaim_restore(unsigned int flags) +{ + current->flags = (current->flags & ~PF_MEMALLOC) | flags; +} + #endif /* _LINUX_SCHED_MM_H */ diff --git a/include/linux/six.h b/include/linux/six.h index a16e94f..477c33e 100644 --- a/include/linux/six.h +++ b/include/linux/six.h @@ -80,7 +80,8 @@ union six_lock_state { }; struct { - unsigned read_lock:28; + unsigned read_lock:27; + unsigned write_locking:1; unsigned intent_lock:1; unsigned waiters:3; /* @@ -107,6 +108,7 @@ struct six_lock { unsigned intent_lock_recurse; struct task_struct *owner; struct optimistic_spin_queue osq; + unsigned __percpu *readers; raw_spinlock_t wait_lock; struct list_head wait_list[2]; @@ -194,4 +196,8 @@ void six_lock_increment(struct six_lock *, enum six_lock_type); void six_lock_wakeup_all(struct six_lock *); +void six_lock_pcpu_free_rcu(struct 
six_lock *); +void six_lock_pcpu_free(struct six_lock *); +void six_lock_pcpu_alloc(struct six_lock *); + #endif /* _LINUX_SIX_H */ diff --git a/include/linux/slab.h b/include/linux/slab.h index 32ffa55..775b7e3 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -58,7 +58,7 @@ static inline void *krealloc(void *old, size_t size, gfp_t flags) #define kzalloc(size, flags) kmalloc(size, flags|__GFP_ZERO) #define kmalloc_array(n, size, flags) \ ((size) != 0 && (n) > SIZE_MAX / (size) \ - ? NULL : kmalloc(n * size, flags)) + ? NULL : kmalloc((n) * (size), flags)) #define kcalloc(n, size, flags) kmalloc_array(n, size, flags|__GFP_ZERO) @@ -66,6 +66,7 @@ static inline void *krealloc(void *old, size_t size, gfp_t flags) #define kzfree(p) free(p) #define kvmalloc(size, flags) kmalloc(size, flags) +#define kvzalloc(size, flags) kzalloc(size, flags) #define kvfree(p) kfree(p) static inline struct page *alloc_pages(gfp_t flags, unsigned int order) @@ -132,4 +133,35 @@ static inline void *kmemdup(const void *src, size_t len, gfp_t gfp) return p; } +struct kmem_cache { + size_t obj_size; +}; + +static inline void *kmem_cache_alloc(struct kmem_cache *c, gfp_t gfp) +{ + return kmalloc(c->obj_size, gfp); +} + +static inline void kmem_cache_free(struct kmem_cache *c, void *p) +{ + kfree(p); +} + +static inline void kmem_cache_destroy(struct kmem_cache *p) +{ + kfree(p); +} + +static inline struct kmem_cache *kmem_cache_create(size_t obj_size) +{ + struct kmem_cache *p = kmalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return NULL; + + p->obj_size = obj_size; + return p; +} + +#define KMEM_CACHE(_struct, _flags) kmem_cache_create(sizeof(struct _struct)) + #endif /* __TOOLS_LINUX_SLAB_H */ diff --git a/include/linux/srcu.h b/include/linux/srcu.h new file mode 100644 index 0000000..75823cf --- /dev/null +++ b/include/linux/srcu.h @@ -0,0 +1,31 @@ +#ifndef __TOOLS_LINUX_SRCU_H +#define __TOOLS_LINUX_SRCU_H + +struct srcu_struct { +}; + +static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx) {} + +static inline int srcu_read_lock(struct srcu_struct *ssp) +{ + return 0; +} + +static inline bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) +{ + return false; +} + +static inline unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp) +{ + return 0; +} + +static inline void cleanup_srcu_struct(struct srcu_struct *ssp) {} + +static inline int init_srcu_struct(struct srcu_struct *ssp) +{ + return 0; +} + +#endif /* __TOOLS_LINUX_SRCU_H */ diff --git a/include/linux/types.h b/include/linux/types.h index 387c383..c9886cb 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -11,6 +11,8 @@ #define __SANE_USERSPACE_TYPES__ /* For PPC64, to get LL64 types */ #include +#include + #define BITS_PER_LONG __BITS_PER_LONG struct page; @@ -31,6 +33,7 @@ typedef unsigned gfp_t; #define __GFP_IO 0 #define __GFP_NOWARN 0 #define __GFP_NORETRY 0 +#define __GFP_NOFAIL 0 #define __GFP_ZERO 1 #define PAGE_ALLOC_COSTLY_ORDER 6 diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index efcc191..c674d9a 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -16,6 +16,8 @@ static inline void *__vmalloc(unsigned long size, gfp_t gfp_mask) { void *p; + size = round_up(size, PAGE_SIZE); + run_shrinkers(); p = aligned_alloc(PAGE_SIZE, size); diff --git a/include/linux/wait.h b/include/linux/wait.h index 62d15e5..c3d9824 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -91,6 +91,7 @@ do { \ } while (0) #define 
wait_event_killable(wq, condition) ({wait_event(wq, condition); 0; }) +#define wait_event_interruptible(wq, condition) ({wait_event(wq, condition); 0; }) #define __wait_event_timeout(wq, condition, timeout) \ ___wait_event(wq, ___wait_cond_timeout(condition), \ diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h index 9b4e829..d4cb7a2 100644 --- a/include/trace/events/bcachefs.h +++ b/include/trace/events/bcachefs.h @@ -121,6 +121,65 @@ DEFINE_EVENT(bio, journal_write, TP_ARGS(bio) ); +TRACE_EVENT(journal_reclaim_start, + TP_PROTO(struct bch_fs *c, u64 min_nr, + u64 prereserved, u64 prereserved_total, + u64 btree_cache_dirty, u64 btree_cache_total, + u64 btree_key_cache_dirty, u64 btree_key_cache_total), + TP_ARGS(c, min_nr, prereserved, prereserved_total, + btree_cache_dirty, btree_cache_total, + btree_key_cache_dirty, btree_key_cache_total), + + TP_STRUCT__entry( + __array(char, uuid, 16 ) + __field(u64, min_nr ) + __field(u64, prereserved ) + __field(u64, prereserved_total ) + __field(u64, btree_cache_dirty ) + __field(u64, btree_cache_total ) + __field(u64, btree_key_cache_dirty ) + __field(u64, btree_key_cache_total ) + ), + + TP_fast_assign( + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); + __entry->min_nr = min_nr; + __entry->prereserved = prereserved; + __entry->prereserved_total = prereserved_total; + __entry->btree_cache_dirty = btree_cache_dirty; + __entry->btree_cache_total = btree_cache_total; + __entry->btree_key_cache_dirty = btree_key_cache_dirty; + __entry->btree_key_cache_total = btree_key_cache_total; + ), + + TP_printk("%pU min %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu", + __entry->uuid, + __entry->min_nr, + __entry->prereserved, + __entry->prereserved_total, + __entry->btree_cache_dirty, + __entry->btree_cache_total, + __entry->btree_key_cache_dirty, + __entry->btree_key_cache_total) +); + +TRACE_EVENT(journal_reclaim_finish, + TP_PROTO(struct bch_fs *c, u64 nr_flushed), + TP_ARGS(c, nr_flushed), + + TP_STRUCT__entry( + __array(char, uuid, 16 ) + __field(u64, nr_flushed ) + ), + + TP_fast_assign( + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); + __entry->nr_flushed = nr_flushed; + ), + + TP_printk("%pU flushed %llu", __entry->uuid, __entry->nr_flushed) +); + /* bset.c: */ DEFINE_EVENT(bpos, bkey_pack_pos_fail, @@ -513,7 +572,7 @@ TRACE_EVENT(transaction_restart_ip, __entry->ip = ip; ), - TP_printk("%pF %pF", (void *) __entry->caller, (void *) __entry->ip) + TP_printk("%ps %pS", (void *) __entry->caller, (void *) __entry->ip) ); DECLARE_EVENT_CLASS(transaction_restart, @@ -528,7 +587,7 @@ DECLARE_EVENT_CLASS(transaction_restart, __entry->ip = ip; ), - TP_printk("%pf", (void *) __entry->ip) + TP_printk("%ps", (void *) __entry->ip) ); DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused, @@ -536,9 +595,46 @@ DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused, TP_ARGS(ip) ); -DEFINE_EVENT(transaction_restart, trans_restart_would_deadlock, - TP_PROTO(unsigned long ip), - TP_ARGS(ip) +TRACE_EVENT(trans_restart_would_deadlock, + TP_PROTO(unsigned long trans_ip, + unsigned long caller_ip, + unsigned reason, + enum btree_id have_btree_id, + unsigned have_iter_type, + enum btree_id want_btree_id, + unsigned want_iter_type), + TP_ARGS(trans_ip, caller_ip, reason, + have_btree_id, have_iter_type, + want_btree_id, want_iter_type), + + TP_STRUCT__entry( + __field(unsigned long, trans_ip ) + __field(unsigned long, caller_ip ) + __field(u8, reason ) + __field(u8, have_btree_id ) + __field(u8, 
have_iter_type ) + __field(u8, want_btree_id ) + __field(u8, want_iter_type ) + ), + + TP_fast_assign( + __entry->trans_ip = trans_ip; + __entry->caller_ip = caller_ip; + __entry->reason = reason; + __entry->have_btree_id = have_btree_id; + __entry->have_iter_type = have_iter_type; + __entry->want_btree_id = want_btree_id; + __entry->want_iter_type = want_iter_type; + ), + + TP_printk("%ps %pS because %u have %u:%u want %u:%u", + (void *) __entry->trans_ip, + (void *) __entry->caller_ip, + __entry->reason, + __entry->have_btree_id, + __entry->have_iter_type, + __entry->want_btree_id, + __entry->want_iter_type) ); TRACE_EVENT(trans_restart_iters_realloced, @@ -555,7 +651,7 @@ TRACE_EVENT(trans_restart_iters_realloced, __entry->nr = nr; ), - TP_printk("%pf nr %u", (void *) __entry->ip, __entry->nr) + TP_printk("%ps nr %u", (void *) __entry->ip, __entry->nr) ); TRACE_EVENT(trans_restart_mem_realloced, @@ -572,7 +668,7 @@ TRACE_EVENT(trans_restart_mem_realloced, __entry->bytes = bytes; ), - TP_printk("%pf bytes %lu", (void *) __entry->ip, __entry->bytes) + TP_printk("%ps bytes %lu", (void *) __entry->ip, __entry->bytes) ); DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get, @@ -585,6 +681,11 @@ DEFINE_EVENT(transaction_restart, trans_restart_journal_preres_get, TP_ARGS(ip) ); +DEFINE_EVENT(transaction_restart, trans_restart_journal_reclaim, + TP_PROTO(unsigned long ip), + TP_ARGS(ip) +); + DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas, TP_PROTO(unsigned long ip), TP_ARGS(ip) @@ -620,11 +721,6 @@ DEFINE_EVENT(transaction_restart, trans_restart_traverse, TP_ARGS(ip) ); -DEFINE_EVENT(transaction_restart, trans_restart_atomic, - TP_PROTO(unsigned long ip), - TP_ARGS(ip) -); - DECLARE_EVENT_CLASS(node_lock_fail, TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), TP_ARGS(level, iter_seq, node, node_seq), diff --git a/libbcachefs.c b/libbcachefs.c index 7ff02b8..e5dcfd8 100644 --- a/libbcachefs.c +++ b/libbcachefs.c @@ -35,52 +35,35 @@ static u64 min_size(unsigned bucket_size) return BCH_MIN_NR_NBUCKETS * bucket_size; } -static void init_layout(struct bch_sb_layout *l, unsigned block_size, +static void init_layout(struct bch_sb_layout *l, + unsigned block_size, + unsigned sb_size, u64 start, u64 end) { - unsigned sb_size; - u64 backup; /* offset of 2nd sb */ + unsigned i; memset(l, 0, sizeof(*l)); - if (start != BCH_SB_SECTOR) - start = round_up(start, block_size); - end = round_down(end, block_size); - - if (start >= end) - die("insufficient space for superblocks"); - - /* - * Create two superblocks in the allowed range: reserve a maximum of 64k - */ - sb_size = min_t(u64, 128, end - start / 2); - - backup = start + sb_size; - backup = round_up(backup, block_size); - - backup = min(backup, end); - - sb_size = min(end - backup, backup- start); - sb_size = rounddown_pow_of_two(sb_size); - - if (sb_size < 8) - die("insufficient space for superblocks"); - l->magic = BCACHE_MAGIC; l->layout_type = 0; l->nr_superblocks = 2; l->sb_max_size_bits = ilog2(sb_size); - l->sb_offset[0] = cpu_to_le64(start); - l->sb_offset[1] = cpu_to_le64(backup); + + /* Create two superblocks in the allowed range: */ + for (i = 0; i < l->nr_superblocks; i++) { + if (start != BCH_SB_SECTOR) + start = round_up(start, block_size); + + l->sb_offset[i] = cpu_to_le64(start); + start += sb_size; + } + + if (start >= end) + die("insufficient space for superblocks"); } void bch2_pick_bucket_size(struct bch_opts opts, struct dev_opts *dev) { - if (!dev->sb_offset) { - dev->sb_offset = 
BCH_SB_SECTOR; - dev->sb_end = BCH_SB_SECTOR + 256; - } - if (!dev->size) dev->size = get_size(dev->path, dev->fd) >> 9; @@ -202,13 +185,16 @@ struct bch_sb *bch2_format(struct bch_opt_strs fs_opt_strs, if (bch2_sb_realloc(&sb, 0)) die("insufficient memory"); - sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); - sb.sb->version_min = le16_to_cpu(bcachefs_metadata_version_current); + sb.sb->version = le16_to_cpu(opts.version); + sb.sb->version_min = le16_to_cpu(opts.version); sb.sb->magic = BCACHE_MAGIC; sb.sb->block_size = cpu_to_le16(fs_opts.block_size); sb.sb->user_uuid = opts.uuid; sb.sb->nr_devices = nr_devs; + if (opts.version == bcachefs_metadata_version_current) + sb.sb->features[0] |= BCH_SB_FEATURES_ALL; + uuid_generate(sb.sb->uuid.b); if (opts.label) @@ -255,7 +241,7 @@ struct bch_sb *bch2_format(struct bch_opt_strs fs_opt_strs, m->first_bucket = 0; m->bucket_size = cpu_to_le16(i->bucket_size); - SET_BCH_MEMBER_REPLACEMENT(m, CACHE_REPLACEMENT_LRU); + SET_BCH_MEMBER_REPLACEMENT(m, BCH_CACHE_REPLACEMENT_lru); SET_BCH_MEMBER_DISCARD(m, i->discard); SET_BCH_MEMBER_DATA_ALLOWED(m, i->data_allowed); SET_BCH_MEMBER_DURABILITY(m, i->durability + 1); @@ -282,6 +268,8 @@ struct bch_sb *bch2_format(struct bch_opt_strs fs_opt_strs, parse_target(&sb, devs, nr_devs, fs_opt_strs.background_target)); SET_BCH_SB_PROMOTE_TARGET(sb.sb, parse_target(&sb, devs, nr_devs, fs_opt_strs.promote_target)); + SET_BCH_SB_METADATA_TARGET(sb.sb, + parse_target(&sb, devs, nr_devs, fs_opt_strs.metadata_target)); /* Crypt: */ if (opts.encrypted) { @@ -295,7 +283,13 @@ struct bch_sb *bch2_format(struct bch_opt_strs fs_opt_strs, for (i = devs; i < devs + nr_devs; i++) { sb.sb->dev_idx = i - devs; + if (!i->sb_offset) { + i->sb_offset = BCH_SB_SECTOR; + i->sb_end = i->size; + } + init_layout(&sb.sb->layout, fs_opts.block_size, + opts.superblock_size, i->sb_offset, i->sb_end); if (i->sb_offset == BCH_SB_SECTOR) { @@ -533,14 +527,14 @@ static void bch2_sb_print_members(struct bch_sb *sb, struct bch_sb_field *f, time_str, BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR - ? bch2_dev_state[BCH_MEMBER_STATE(m)] + ? bch2_member_states[BCH_MEMBER_STATE(m)] : "unknown", group, data_allowed_str, data_has_str, - BCH_MEMBER_REPLACEMENT(m) < CACHE_REPLACEMENT_NR + BCH_MEMBER_REPLACEMENT(m) < BCH_CACHE_REPLACEMENT_NR ? 
bch2_cache_replacement_policies[BCH_MEMBER_REPLACEMENT(m)] : "unknown", @@ -619,6 +613,11 @@ static void bch2_sb_print_disk_groups(struct bch_sb *sb, struct bch_sb_field *f, static void bch2_sb_print_clean(struct bch_sb *sb, struct bch_sb_field *f, enum units units) { + struct bch_sb_field_clean *clean = field_to_type(f, clean); + + + printf(" flags: %x", le32_to_cpu(clean->flags)); + printf(" journal seq: %llx", le64_to_cpu(clean->journal_seq)); } static void bch2_sb_print_journal_seq_blacklist(struct bch_sb *sb, struct bch_sb_field *f, @@ -669,13 +668,15 @@ void bch2_sb_print(struct bch_sb *sb, bool print_layout, { struct bch_sb_field_members *mi; char user_uuid_str[40], internal_uuid_str[40]; - char features_str[200]; + char features_str[500]; + char compat_features_str[500]; char fields_have_str[200]; char label[BCH_SB_LABEL_SIZE + 1]; char time_str[64]; char foreground_str[64]; char background_str[64]; char promote_str[64]; + char metadata_str[64]; struct bch_sb_field *f; u64 fields_have = 0; unsigned nr_devices = 0; @@ -715,10 +716,17 @@ void bch2_sb_print(struct bch_sb *sb, bool print_layout, bch2_sb_get_target(sb, promote_str, sizeof(promote_str), BCH_SB_PROMOTE_TARGET(sb)); + bch2_sb_get_target(sb, metadata_str, sizeof(metadata_str), + BCH_SB_METADATA_TARGET(sb)); + bch2_flags_to_text(&PBUF(features_str), bch2_sb_features, le64_to_cpu(sb->features[0])); + bch2_flags_to_text(&PBUF(compat_features_str), + bch2_sb_compat, + le64_to_cpu(sb->compat[0])); + vstruct_for_each(sb, f) fields_have |= 1 << le32_to_cpu(f->type); bch2_flags_to_text(&PBUF(fields_have_str), @@ -726,8 +734,10 @@ void bch2_sb_print(struct bch_sb *sb, bool print_layout, printf("External UUID: %s\n" "Internal UUID: %s\n" + "Device index: %u\n" "Label: %s\n" - "Version: %llu\n" + "Version: %u\n" + "Oldest version on disk: %u\n" "Created: %s\n" "Squence number: %llu\n" "Block_size: %s\n" @@ -735,6 +745,7 @@ void bch2_sb_print(struct bch_sb *sb, bool print_layout, "Error action: %s\n" "Clean: %llu\n" "Features: %s\n" + "Compat features: %s\n" "Metadata replicas: %llu\n" "Data replicas: %llu\n" @@ -746,6 +757,7 @@ void bch2_sb_print(struct bch_sb *sb, bool print_layout, "Foreground write target: %s\n" "Background write target: %s\n" "Promote target: %s\n" + "Metadata target: %s\n" "String hash type: %s (%llu)\n" "32 bit inodes: %llu\n" @@ -757,19 +769,22 @@ void bch2_sb_print(struct bch_sb *sb, bool print_layout, "Superblock size: %llu\n", user_uuid_str, internal_uuid_str, + sb->dev_idx, label, - le64_to_cpu(sb->version), + le16_to_cpu(sb->version), + le16_to_cpu(sb->version_min), time_str, le64_to_cpu(sb->seq), pr_units(le16_to_cpu(sb->block_size), units), pr_units(BCH_SB_BTREE_NODE_SIZE(sb), units), - BCH_SB_ERROR_ACTION(sb) < BCH_NR_ERROR_ACTIONS + BCH_SB_ERROR_ACTION(sb) < BCH_ON_ERROR_NR ? bch2_error_actions[BCH_SB_ERROR_ACTION(sb)] : "unknown", BCH_SB_CLEAN(sb), features_str, + compat_features_str, BCH_SB_META_REPLICAS_WANT(sb), BCH_SB_DATA_REPLICAS_WANT(sb), @@ -792,6 +807,7 @@ void bch2_sb_print(struct bch_sb *sb, bool print_layout, foreground_str, background_str, promote_str, + metadata_str, BCH_SB_STR_HASH_TYPE(sb) < BCH_STR_HASH_NR ? 
bch2_str_hash_types[BCH_SB_STR_HASH_TYPE(sb)] @@ -891,7 +907,7 @@ struct bchfs_handle bcache_fs_open(const char *path) * Given a path to a block device, open the filesystem it belongs to; also * return the device's idx: */ -struct bchfs_handle bchu_fs_open_by_dev(const char *path, unsigned *idx) +struct bchfs_handle bchu_fs_open_by_dev(const char *path, int *idx) { char buf[1024], *uuid_str; @@ -935,6 +951,17 @@ struct bchfs_handle bchu_fs_open_by_dev(const char *path, unsigned *idx) return bcache_fs_open(uuid_str); } +int bchu_dev_path_to_idx(struct bchfs_handle fs, const char *dev_path) +{ + int idx; + struct bchfs_handle fs2 = bchu_fs_open_by_dev(dev_path, &idx); + + if (memcmp(&fs.uuid, &fs2.uuid, sizeof(fs.uuid))) + idx = -1; + bcache_fs_close(fs2); + return idx; +} + int bchu_data(struct bchfs_handle fs, struct bch_ioctl_data cmd) { int progress_fd = xioctl(fs.ioctl_fd, BCH_IOCTL_DATA, &cmd); diff --git a/libbcachefs.h b/libbcachefs.h index 30add92..45d2f87 100644 --- a/libbcachefs.h +++ b/libbcachefs.h @@ -12,6 +12,8 @@ /* option parsing */ +#define SUPERBLOCK_SIZE_DEFAULT 2048 /* 1 MB */ + struct bch_opt_strs { union { char *by_id[bch2_opts_nr]; @@ -30,9 +32,9 @@ void bch2_opts_usage(unsigned); struct format_opts { char *label; uuid_le uuid; - + unsigned version; + unsigned superblock_size; unsigned encoded_extent_max; - bool encrypted; char *passphrase; }; @@ -40,6 +42,8 @@ struct format_opts { static inline struct format_opts format_opts_default() { return (struct format_opts) { + .version = bcachefs_metadata_version_current, + .superblock_size = SUPERBLOCK_SIZE_DEFAULT, .encoded_extent_max = 128, }; } @@ -90,7 +94,8 @@ struct bchfs_handle { void bcache_fs_close(struct bchfs_handle); struct bchfs_handle bcache_fs_open(const char *); -struct bchfs_handle bchu_fs_open_by_dev(const char *, unsigned *); +struct bchfs_handle bchu_fs_open_by_dev(const char *, int *); +int bchu_dev_path_to_idx(struct bchfs_handle, const char *); static inline void bchu_disk_add(struct bchfs_handle fs, char *dev) { @@ -214,6 +219,19 @@ static inline void bchu_disk_resize(struct bchfs_handle fs, xioctl(fs.ioctl_fd, BCH_IOCTL_DISK_RESIZE, &i); } +static inline void bchu_disk_resize_journal(struct bchfs_handle fs, + unsigned idx, + u64 nbuckets) +{ + struct bch_ioctl_disk_resize i = { + .flags = BCH_BY_INDEX, + .dev = idx, + .nbuckets = nbuckets, + }; + + xioctl(fs.ioctl_fd, BCH_IOCTL_DISK_RESIZE_JOURNAL, &i); +} + int bchu_data(struct bchfs_handle, struct bch_ioctl_data); struct dev_name { diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c index 76c98dd..0f2d743 100644 --- a/libbcachefs/acl.c +++ b/libbcachefs/acl.c @@ -216,6 +216,7 @@ struct posix_acl *bch2_get_acl(struct inode *vinode, int type) { struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c_xattr xattr; @@ -226,7 +227,7 @@ retry: bch2_trans_begin(&trans); iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, - &inode->ei_str_hash, inode->v.i_ino, + &hash, inode->v.i_ino, &X_SEARCH(acl_to_xattr_type(type), "", 0), 0); if (IS_ERR(iter)) { @@ -239,12 +240,12 @@ retry: } xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); - acl = bch2_acl_from_disk(xattr_val(xattr.v), le16_to_cpu(xattr.v->x_val_len)); if (!IS_ERR(acl)) set_cached_acl(&inode->v, type, acl); + bch2_trans_iter_put(&trans, iter); out: bch2_trans_exit(&trans); return acl; @@ -287,6 +288,7 @@ int 
bch2_set_acl(struct inode *vinode, struct posix_acl *_acl, int type) struct btree_trans trans; struct btree_iter *inode_iter; struct bch_inode_unpacked inode_u; + struct bch_hash_info hash_info; struct posix_acl *acl; umode_t mode; int ret; @@ -308,12 +310,12 @@ retry: if (type == ACL_TYPE_ACCESS) { ret = posix_acl_update_mode(&inode->v, &mode, &acl); if (ret) - goto err; + goto btree_err; } - ret = bch2_set_acl_trans(&trans, &inode_u, - &inode->ei_str_hash, - acl, type); + hash_info = bch2_hash_info_init(c, &inode_u); + + ret = bch2_set_acl_trans(&trans, &inode_u, &hash_info, acl, type); if (ret) goto btree_err; @@ -325,6 +327,8 @@ retry: &inode->ei_journal_seq, BTREE_INSERT_NOUNLOCK); btree_err: + bch2_trans_iter_put(&trans, inode_iter); + if (ret == -EINTR) goto retry; if (unlikely(ret)) @@ -342,29 +346,31 @@ err: } int bch2_acl_chmod(struct btree_trans *trans, - struct bch_inode_info *inode, + struct bch_inode_unpacked *inode, umode_t mode, struct posix_acl **new_acl) { + struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode); struct btree_iter *iter; struct bkey_s_c_xattr xattr; struct bkey_i_xattr *new; struct posix_acl *acl; - int ret = 0; + int ret; iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc, - &inode->ei_str_hash, inode->v.i_ino, + &hash_info, inode->bi_inum, &X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0), BTREE_ITER_INTENT); - if (IS_ERR(iter)) - return PTR_ERR(iter) != -ENOENT ? PTR_ERR(iter) : 0; + ret = PTR_ERR_OR_ZERO(iter); + if (ret) + return ret == -ENOENT ? 0 : ret; xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); - acl = bch2_acl_from_disk(xattr_val(xattr.v), le16_to_cpu(xattr.v->x_val_len)); - if (IS_ERR_OR_NULL(acl)) - return PTR_ERR(acl); + ret = PTR_ERR_OR_ZERO(acl); + if (ret || !acl) + goto err; ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); if (ret) @@ -381,6 +387,7 @@ int bch2_acl_chmod(struct btree_trans *trans, *new_acl = acl; acl = NULL; err: + bch2_trans_iter_put(trans, iter); kfree(acl); return ret; } diff --git a/libbcachefs/acl.h b/libbcachefs/acl.h index cb62d50..ba210c2 100644 --- a/libbcachefs/acl.h +++ b/libbcachefs/acl.h @@ -33,7 +33,7 @@ int bch2_set_acl_trans(struct btree_trans *, const struct bch_hash_info *, struct posix_acl *, int); int bch2_set_acl(struct inode *, struct posix_acl *, int); -int bch2_acl_chmod(struct btree_trans *, struct bch_inode_info *, +int bch2_acl_chmod(struct btree_trans *, struct bch_inode_unpacked *, umode_t, struct posix_acl **); #else @@ -47,7 +47,7 @@ static inline int bch2_set_acl_trans(struct btree_trans *trans, } static inline int bch2_acl_chmod(struct btree_trans *trans, - struct bch_inode_info *inode, + struct bch_inode_unpacked *inode, umode_t mode, struct posix_acl **new_acl) { diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index 97508de..48971fc 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -14,6 +14,7 @@ #include "ec.h" #include "error.h" #include "recovery.h" +#include "varint.h" #include #include @@ -24,15 +25,12 @@ #include #include -static const char * const bch2_alloc_field_names[] = { -#define x(name, bytes) #name, - BCH_ALLOC_FIELDS() +static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { +#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8, + BCH_ALLOC_FIELDS_V1() #undef x - NULL }; -static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int); - /* Ratelimiting/PD controllers */ static void pd_controllers_update(struct work_struct *work) @@ -54,10 +52,10 @@ static 
void pd_controllers_update(struct work_struct *work) * reclaimed by copy GC */ fragmented += max_t(s64, 0, (bucket_to_sector(ca, - stats.buckets[BCH_DATA_user] + - stats.buckets[BCH_DATA_cached]) - - (stats.sectors[BCH_DATA_user] + - stats.sectors[BCH_DATA_cached])) << 9); + stats.d[BCH_DATA_user].buckets + + stats.d[BCH_DATA_cached].buckets) - + (stats.d[BCH_DATA_user].sectors + + stats.d[BCH_DATA_cached].sectors)) << 9); } bch2_pd_controller_update(&c->copygc_pd, free, fragmented, -1); @@ -67,10 +65,10 @@ static void pd_controllers_update(struct work_struct *work) /* Persistent alloc info: */ -static inline u64 get_alloc_field(const struct bch_alloc *a, - const void **p, unsigned field) +static inline u64 alloc_field_v1_get(const struct bch_alloc *a, + const void **p, unsigned field) { - unsigned bytes = BCH_ALLOC_FIELD_BYTES[field]; + unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field]; u64 v; if (!(a->fields & (1 << field))) @@ -97,10 +95,10 @@ static inline u64 get_alloc_field(const struct bch_alloc *a, return v; } -static inline void put_alloc_field(struct bkey_i_alloc *a, void **p, - unsigned field, u64 v) +static inline void alloc_field_v1_put(struct bkey_i_alloc *a, void **p, + unsigned field, u64 v) { - unsigned bytes = BCH_ALLOC_FIELD_BYTES[field]; + unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field]; if (!v) return; @@ -127,55 +125,127 @@ static inline void put_alloc_field(struct bkey_i_alloc *a, void **p, *p += bytes; } -struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) +static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out, + struct bkey_s_c k) { - struct bkey_alloc_unpacked ret = { .gen = 0 }; + const struct bch_alloc *in = bkey_s_c_to_alloc(k).v; + const void *d = in->data; + unsigned idx = 0; - if (k.k->type == KEY_TYPE_alloc) { - const struct bch_alloc *a = bkey_s_c_to_alloc(k).v; - const void *d = a->data; - unsigned idx = 0; + out->gen = in->gen; - ret.gen = a->gen; +#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++); + BCH_ALLOC_FIELDS_V1() +#undef x +} -#define x(_name, _bits) ret._name = get_alloc_field(a, &d, idx++); - BCH_ALLOC_FIELDS() +static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out, + struct bkey_s_c k) +{ + struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k); + const u8 *in = a.v->data; + const u8 *end = bkey_val_end(a); + unsigned fieldnr = 0; + int ret; + u64 v; + + out->gen = a.v->gen; + out->oldest_gen = a.v->oldest_gen; + out->data_type = a.v->data_type; + +#define x(_name, _bits) \ + if (fieldnr < a.v->nr_fields) { \ + ret = bch2_varint_decode(in, end, &v); \ + if (ret < 0) \ + return ret; \ + in += ret; \ + } else { \ + v = 0; \ + } \ + out->_name = v; \ + if (v != out->_name) \ + return -1; \ + fieldnr++; + + BCH_ALLOC_FIELDS_V2() #undef x - } - return ret; + return 0; } -void bch2_alloc_pack(struct bkey_i_alloc *dst, - const struct bkey_alloc_unpacked src) +static void bch2_alloc_pack_v2(struct bkey_alloc_buf *dst, + const struct bkey_alloc_unpacked src) { - unsigned idx = 0; - void *d = dst->v.data; + struct bkey_i_alloc_v2 *a = bkey_alloc_v2_init(&dst->k); + unsigned nr_fields = 0, last_nonzero_fieldnr = 0; + u8 *out = a->v.data; + u8 *end = (void *) &dst[1]; + u8 *last_nonzero_field = out; unsigned bytes; - dst->v.fields = 0; - dst->v.gen = src.gen; + a->k.p = POS(src.dev, src.bucket); + a->v.gen = src.gen; + a->v.oldest_gen = src.oldest_gen; + a->v.data_type = src.data_type; + +#define x(_name, _bits) \ + nr_fields++; \ + \ + if (src._name) { \ + out += bch2_varint_encode(out, src._name); \ + 
\ + last_nonzero_field = out; \ + last_nonzero_fieldnr = nr_fields; \ + } else { \ + *out++ = 0; \ + } -#define x(_name, _bits) put_alloc_field(dst, &d, idx++, src._name); - BCH_ALLOC_FIELDS() + BCH_ALLOC_FIELDS_V2() #undef x + BUG_ON(out > end); + + out = last_nonzero_field; + a->v.nr_fields = last_nonzero_fieldnr; + + bytes = (u8 *) out - (u8 *) &a->v; + set_bkey_val_bytes(&a->k, bytes); + memset_u64s_tail(&a->v, 0, bytes); +} + +struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) +{ + struct bkey_alloc_unpacked ret = { + .dev = k.k->p.inode, + .bucket = k.k->p.offset, + .gen = 0, + }; + + if (k.k->type == KEY_TYPE_alloc_v2) + bch2_alloc_unpack_v2(&ret, k); + else if (k.k->type == KEY_TYPE_alloc) + bch2_alloc_unpack_v1(&ret, k); + + return ret; +} - bytes = (void *) d - (void *) &dst->v; - set_bkey_val_bytes(&dst->k, bytes); - memset_u64s_tail(&dst->v, 0, bytes); +void bch2_alloc_pack(struct bch_fs *c, + struct bkey_alloc_buf *dst, + const struct bkey_alloc_unpacked src) +{ + bch2_alloc_pack_v2(dst, src); } static unsigned bch_alloc_val_u64s(const struct bch_alloc *a) { unsigned i, bytes = offsetof(struct bch_alloc, data); - for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_FIELD_BYTES); i++) + for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++) if (a->fields & (1 << i)) - bytes += BCH_ALLOC_FIELD_BYTES[i]; + bytes += BCH_ALLOC_V1_FIELD_BYTES[i]; return DIV_ROUND_UP(bytes, sizeof(u64)); } -const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k) +const char *bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k) { struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); @@ -190,20 +260,30 @@ const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k) return NULL; } -void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) +const char *bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k) { - struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); - const void *d = a.v->data; - unsigned i; + struct bkey_alloc_unpacked u; - pr_buf(out, "gen %u", a.v->gen); + if (k.k->p.inode >= c->sb.nr_devices || + !c->devs[k.k->p.inode]) + return "invalid device"; + + if (bch2_alloc_unpack_v2(&u, k)) + return "unpack error"; - for (i = 0; i < BCH_ALLOC_FIELD_NR; i++) - if (a.v->fields & (1 << i)) - pr_buf(out, " %s %llu", - bch2_alloc_field_names[i], - get_alloc_field(a.v, &d, i)); + return NULL; +} + +void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); + + pr_buf(out, "gen %u oldest_gen %u data_type %u", + u.gen, u.oldest_gen, u.data_type); +#define x(_name, ...) 
pr_buf(out, #_name " %llu ", (u64) u._name); + BCH_ALLOC_FIELDS_V2() +#undef x } static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id, @@ -213,11 +293,13 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id, struct bucket *g; struct bkey_alloc_unpacked u; - if (level || k.k->type != KEY_TYPE_alloc) + if (level || + (k.k->type != KEY_TYPE_alloc && + k.k->type != KEY_TYPE_alloc_v2)) return 0; ca = bch_dev_bkey_exists(c, k.k->p.inode); - g = __bucket(ca, k.k->p.offset, 0); + g = bucket(ca, k.k->p.offset); u = bch2_alloc_unpack(k); g->_mark.gen = u.gen; @@ -234,12 +316,10 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id, int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) { - struct bch_dev *ca; - unsigned i; - int ret = 0; + int ret; down_read(&c->gc_lock); - ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC, + ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_alloc, NULL, bch2_alloc_read_fn); up_read(&c->gc_lock); @@ -248,26 +328,6 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) return ret; } - percpu_down_write(&c->mark_lock); - bch2_dev_usage_from_buckets(c); - percpu_up_write(&c->mark_lock); - - mutex_lock(&c->bucket_clock[READ].lock); - for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - bch2_recalc_oldest_io(c, ca, READ); - up_read(&ca->bucket_lock); - } - mutex_unlock(&c->bucket_clock[READ].lock); - - mutex_lock(&c->bucket_clock[WRITE].lock); - for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - bch2_recalc_oldest_io(c, ca, WRITE); - up_read(&ca->bucket_lock); - } - mutex_unlock(&c->bucket_clock[WRITE].lock); - return 0; } @@ -278,18 +338,16 @@ static int bch2_alloc_write_key(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bkey_s_c k; struct bch_dev *ca; - struct bucket_array *ba; struct bucket *g; struct bucket_mark m; struct bkey_alloc_unpacked old_u, new_u; - __BKEY_PADDED(k, 8) alloc_key; /* hack: */ - struct bkey_i_alloc *a; + struct bkey_alloc_buf a; int ret; retry: bch2_trans_begin(trans); ret = bch2_btree_key_cache_flush(trans, - BTREE_ID_ALLOC, iter->pos); + BTREE_ID_alloc, iter->pos); if (ret) goto err; @@ -302,193 +360,60 @@ retry: percpu_down_read(&c->mark_lock); ca = bch_dev_bkey_exists(c, iter->pos.inode); - ba = bucket_array(ca); - - g = &ba->b[iter->pos.offset]; + g = bucket(ca, iter->pos.offset); m = READ_ONCE(g->mark); - new_u = alloc_mem_to_key(g, m); + new_u = alloc_mem_to_key(iter, g, m); percpu_up_read(&c->mark_lock); if (!bkey_alloc_unpacked_cmp(old_u, new_u)) return 0; - a = bkey_alloc_init(&alloc_key.k); - a->k.p = iter->pos; - bch2_alloc_pack(a, new_u); - - bch2_trans_update(trans, iter, &a->k_i, + bch2_alloc_pack(c, &a, new_u); + bch2_trans_update(trans, iter, &a.k, BTREE_TRIGGER_NORUN); ret = bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - flags); + BTREE_INSERT_NOFAIL|flags); err: if (ret == -EINTR) goto retry; return ret; } -int bch2_dev_alloc_write(struct bch_fs *c, struct bch_dev *ca, unsigned flags) +int bch2_alloc_write(struct bch_fs *c, unsigned flags) { struct btree_trans trans; struct btree_iter *iter; - u64 first_bucket, nbuckets; + struct bch_dev *ca; + unsigned i; int ret = 0; - percpu_down_read(&c->mark_lock); - first_bucket = bucket_array(ca)->first_bucket; - nbuckets = bucket_array(ca)->nbuckets; - percpu_up_read(&c->mark_lock); - - BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - - iter = 
bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, - POS(ca->dev_idx, first_bucket), + iter = bch2_trans_get_iter(&trans, BTREE_ID_alloc, POS_MIN, BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - while (iter->pos.offset < nbuckets) { - bch2_trans_cond_resched(&trans); - - ret = bch2_alloc_write_key(&trans, iter, flags); - if (ret) - break; - bch2_btree_iter_next_slot(iter); - } - - bch2_trans_exit(&trans); - - return ret; -} + for_each_member_device(ca, c, i) { + bch2_btree_iter_set_pos(iter, + POS(ca->dev_idx, ca->mi.first_bucket)); -int bch2_alloc_write(struct bch_fs *c, unsigned flags) -{ - struct bch_dev *ca; - unsigned i; - int ret = 0; + while (iter->pos.offset < ca->mi.nbuckets) { + bch2_trans_cond_resched(&trans); - for_each_rw_member(ca, c, i) { - bch2_dev_alloc_write(c, ca, flags); - if (ret) { - percpu_ref_put(&ca->io_ref); - break; + ret = bch2_alloc_write_key(&trans, iter, flags); + if (ret) { + percpu_ref_put(&ca->io_ref); + goto err; + } + bch2_btree_iter_next_slot(iter); } } - +err: + bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); return ret; } /* Bucket IO clocks: */ -static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw) -{ - struct bucket_clock *clock = &c->bucket_clock[rw]; - struct bucket_array *buckets = bucket_array(ca); - struct bucket *g; - u16 max_last_io = 0; - unsigned i; - - lockdep_assert_held(&c->bucket_clock[rw].lock); - - /* Recalculate max_last_io for this device: */ - for_each_bucket(g, buckets) - max_last_io = max(max_last_io, bucket_last_io(c, g, rw)); - - ca->max_last_bucket_io[rw] = max_last_io; - - /* Recalculate global max_last_io: */ - max_last_io = 0; - - for_each_member_device(ca, c, i) - max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]); - - clock->max_last_io = max_last_io; -} - -static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw) -{ - struct bucket_clock *clock = &c->bucket_clock[rw]; - struct bucket_array *buckets; - struct bch_dev *ca; - struct bucket *g; - unsigned i; - - trace_rescale_prios(c); - - for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - - for_each_bucket(g, buckets) - g->io_time[rw] = clock->hand - - bucket_last_io(c, g, rw) / 2; - - bch2_recalc_oldest_io(c, ca, rw); - - up_read(&ca->bucket_lock); - } -} - -static inline u64 bucket_clock_freq(u64 capacity) -{ - return max(capacity >> 10, 2028ULL); -} - -static void bch2_inc_clock_hand(struct io_timer *timer) -{ - struct bucket_clock *clock = container_of(timer, - struct bucket_clock, rescale); - struct bch_fs *c = container_of(clock, - struct bch_fs, bucket_clock[clock->rw]); - struct bch_dev *ca; - u64 capacity; - unsigned i; - - mutex_lock(&clock->lock); - - /* if clock cannot be advanced more, rescale prio */ - if (clock->max_last_io >= U16_MAX - 2) - bch2_rescale_bucket_io_times(c, clock->rw); - - BUG_ON(clock->max_last_io >= U16_MAX - 2); - - for_each_member_device(ca, c, i) - ca->max_last_bucket_io[clock->rw]++; - clock->max_last_io++; - clock->hand++; - - mutex_unlock(&clock->lock); - - capacity = READ_ONCE(c->capacity); - - if (!capacity) - return; - - /* - * we only increment when 0.1% of the filesystem capacity has been read - * or written too, this determines if it's time - * - * XXX: we shouldn't really be going off of the capacity of devices in - * RW mode (that will be 0 when we're RO, yet we can still service - * reads) - */ - timer->expire += bucket_clock_freq(capacity); - - bch2_io_timer_add(&c->io_clock[clock->rw], timer); -} - -static void 
bch2_bucket_clock_init(struct bch_fs *c, int rw) -{ - struct bucket_clock *clock = &c->bucket_clock[rw]; - - clock->hand = 1; - clock->rw = rw; - clock->rescale.fn = bch2_inc_clock_hand; - clock->rescale.expire = bucket_clock_freq(c->capacity); - mutex_init(&clock->lock); -} - int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, size_t bucket_nr, int rw) { @@ -496,40 +421,38 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, struct bch_dev *ca = bch_dev_bkey_exists(c, dev); struct btree_iter *iter; struct bucket *g; - struct bkey_i_alloc *a; + struct bkey_alloc_buf *a; struct bkey_alloc_unpacked u; - u16 *time; + u64 *time, now; int ret = 0; - iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, POS(dev, bucket_nr), + iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, POS(dev, bucket_nr), BTREE_ITER_CACHED| BTREE_ITER_CACHED_NOFILL| BTREE_ITER_INTENT); - if (IS_ERR(iter)) - return PTR_ERR(iter); + ret = bch2_btree_iter_traverse(iter); + if (ret) + goto out; - a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); + a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); ret = PTR_ERR_OR_ZERO(a); if (ret) goto out; percpu_down_read(&c->mark_lock); g = bucket(ca, bucket_nr); - u = alloc_mem_to_key(g, READ_ONCE(g->mark)); + u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark)); percpu_up_read(&c->mark_lock); - bkey_alloc_init(&a->k_i); - a->k.p = iter->pos; - time = rw == READ ? &u.read_time : &u.write_time; - if (*time == c->bucket_clock[rw].hand) + now = atomic64_read(&c->io_clock[rw].now); + if (*time == now) goto out; - *time = c->bucket_clock[rw].hand; - - bch2_alloc_pack(a, u); + *time = now; - ret = bch2_trans_update(trans, iter, &a->k_i, 0) ?: + bch2_alloc_pack(c, a, u); + ret = bch2_trans_update(trans, iter, &a->k, 0) ?: bch2_trans_commit(trans, NULL, NULL, 0); out: bch2_trans_iter_put(trans, iter); @@ -553,7 +476,8 @@ out: static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) { unsigned long gc_count = c->gc_count; - u64 available; + s64 available; + unsigned i; int ret = 0; ca->allocator_state = ALLOCATOR_BLOCKED; @@ -569,13 +493,19 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) if (gc_count != c->gc_count) ca->inc_gen_really_needs_gc = 0; - available = max_t(s64, 0, dev_buckets_available(ca) - - ca->inc_gen_really_needs_gc); + available = dev_buckets_available(ca); + available -= ca->inc_gen_really_needs_gc; + + spin_lock(&c->freelist_lock); + for (i = 0; i < RESERVE_NR; i++) + available -= fifo_used(&ca->free[i]); + spin_unlock(&c->freelist_lock); + + available = max(available, 0LL); if (available > fifo_free(&ca->free_inc) || (available && - (!fifo_full(&ca->free[RESERVE_BTREE]) || - !fifo_full(&ca->free[RESERVE_MOVINGGC])))) + !fifo_full(&ca->free[RESERVE_MOVINGGC]))) break; up_read(&c->gc_lock); @@ -591,20 +521,22 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) return ret; } -static bool bch2_can_invalidate_bucket(struct bch_dev *ca, - size_t bucket, - struct bucket_mark mark) +static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b, + struct bucket_mark m) { u8 gc_gen; - if (!is_available_bucket(mark)) + if (!is_available_bucket(m)) + return false; + + if (m.owned_by_allocator) return false; if (ca->buckets_nouse && - test_bit(bucket, ca->buckets_nouse)) + test_bit(b, ca->buckets_nouse)) return false; - gc_gen = bucket_gc_gen(ca, bucket); + gc_gen = bucket_gc_gen(bucket(ca, b)); if (gc_gen >= BUCKET_GC_GEN_MAX / 2) ca->inc_gen_needs_gc++; @@ -618,43 
+550,33 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, /* * Determines what order we're going to reuse buckets, smallest bucket_key() * first. - * - * - * - We take into account the read prio of the bucket, which gives us an - * indication of how hot the data is -- we scale the prio so that the prio - * farthest from the clock is worth 1/8th of the closest. - * - * - The number of sectors of cached data in the bucket, which gives us an - * indication of the cost in cache misses this eviction will cause. - * - * - If hotness * sectors used compares equal, we pick the bucket with the - * smallest bucket_gc_gen() - since incrementing the same bucket's generation - * number repeatedly forces us to run mark and sweep gc to avoid generation - * number wraparound. */ -static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca, - size_t b, struct bucket_mark m) +static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m, + u64 now, u64 last_seq_ondisk) { - unsigned last_io = bucket_last_io(c, bucket(ca, b), READ); - unsigned max_last_io = ca->max_last_bucket_io[READ]; - - /* - * Time since last read, scaled to [0, 8) where larger value indicates - * more recently read data: - */ - unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io; + unsigned used = bucket_sectors_used(m); - /* How much we want to keep the data in this bucket: */ - unsigned long data_wantness = - (hotness + 1) * bucket_sectors_used(m); - - unsigned long needs_journal_commit = - bucket_needs_journal_commit(m, c->journal.last_seq_ondisk); + if (used) { + /* + * Prefer to keep buckets that have been read more recently, and + * buckets that have more data in them: + */ + u64 last_read = max_t(s64, 0, now - g->io_time[READ]); + u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used)); - return (data_wantness << 9) | - (needs_journal_commit << 8) | - (bucket_gc_gen(ca, b) / 16); + return -last_read_scaled; + } else { + /* + * Prefer to use buckets with smaller gc_gen so that we don't + * have to walk the btree and recalculate oldest_gen - but shift + * off the low bits so that buckets will still have equal sort + * keys when there's only a small difference, so that we can + * keep sequential buckets together: + */ + return (bucket_needs_journal_commit(m, last_seq_ondisk) << 4)| + (bucket_gc_gen(g) >> 4); + } } static inline int bucket_alloc_cmp(alloc_heap *h, @@ -677,16 +599,15 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) { struct bucket_array *buckets; struct alloc_heap_entry e = { 0 }; + u64 now, last_seq_ondisk; size_t b, i, nr = 0; - ca->alloc_heap.used = 0; - - mutex_lock(&c->bucket_clock[READ].lock); down_read(&ca->bucket_lock); buckets = bucket_array(ca); - - bch2_recalc_oldest_io(c, ca, READ); + ca->alloc_heap.used = 0; + now = atomic64_read(&c->io_clock[READ].now); + last_seq_ondisk = c->journal.last_seq_ondisk; /* * Find buckets with lowest read priority, by building a maxheap sorted @@ -694,8 +615,9 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) * all buckets have been visited. 
*/ for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) { - struct bucket_mark m = READ_ONCE(buckets->b[b].mark); - unsigned long key = bucket_sort_key(c, ca, b, m); + struct bucket *g = &buckets->b[b]; + struct bucket_mark m = READ_ONCE(g->mark); + unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk); if (!bch2_can_invalidate_bucket(ca, b, m)) continue; @@ -730,7 +652,6 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) } up_read(&ca->bucket_lock); - mutex_unlock(&c->bucket_clock[READ].lock); } static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) @@ -810,13 +731,13 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) ca->inc_gen_needs_gc = 0; switch (ca->mi.replacement) { - case CACHE_REPLACEMENT_LRU: + case BCH_CACHE_REPLACEMENT_lru: find_reclaimable_buckets_lru(c, ca); break; - case CACHE_REPLACEMENT_FIFO: + case BCH_CACHE_REPLACEMENT_fifo: find_reclaimable_buckets_fifo(c, ca); break; - case CACHE_REPLACEMENT_RANDOM: + case BCH_CACHE_REPLACEMENT_random: find_reclaimable_buckets_random(c, ca); break; } @@ -875,14 +796,8 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, struct btree_iter *iter, u64 *journal_seq, unsigned flags) { -#if 0 - __BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key; -#else - /* hack: */ - __BKEY_PADDED(k, 8) alloc_key; -#endif struct bch_fs *c = trans->c; - struct bkey_i_alloc *a; + struct bkey_alloc_buf a; struct bkey_alloc_unpacked u; struct bucket *g; struct bucket_mark m; @@ -896,34 +811,33 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, /* first, put on free_inc and mark as owned by allocator: */ percpu_down_read(&c->mark_lock); - spin_lock(&c->freelist_lock); - - verify_not_on_freelist(c, ca, b); - - BUG_ON(!fifo_push(&ca->free_inc, b)); - g = bucket(ca, b); m = READ_ONCE(g->mark); - invalidating_cached_data = m.cached_sectors != 0; + BUG_ON(m.dirty_sectors); + + bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0); + + spin_lock(&c->freelist_lock); + verify_not_on_freelist(c, ca, b); + BUG_ON(!fifo_push(&ca->free_inc, b)); + spin_unlock(&c->freelist_lock); /* * If we're not invalidating cached data, we only increment the bucket * gen in memory here, the incremented gen will be updated in the btree * by bch2_trans_mark_pointer(): */ + if (!m.cached_sectors && + !bucket_needs_journal_commit(m, c->journal.last_seq_ondisk)) { + BUG_ON(m.data_type); + bucket_cmpxchg(g, m, m.gen++); + percpu_up_read(&c->mark_lock); + goto out; + } - if (!invalidating_cached_data) - bch2_invalidate_bucket(c, ca, b, &m); - else - bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0); - - spin_unlock(&c->freelist_lock); percpu_up_read(&c->mark_lock); - if (!invalidating_cached_data) - goto out; - /* * If the read-only path is trying to shut down, we can't be generating * new btree updates: @@ -933,8 +847,6 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, goto out; } - BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); - bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b)); retry: ret = bch2_btree_iter_traverse(iter); @@ -944,7 +856,7 @@ retry: percpu_down_read(&c->mark_lock); g = bucket(ca, iter->pos.offset); m = READ_ONCE(g->mark); - u = alloc_mem_to_key(g, m); + u = alloc_mem_to_key(iter, g, m); percpu_up_read(&c->mark_lock); @@ -954,14 +866,11 @@ retry: u.data_type = 0; u.dirty_sectors = 0; u.cached_sectors = 0; - u.read_time = c->bucket_clock[READ].hand; - u.write_time = c->bucket_clock[WRITE].hand; - - a = 
bkey_alloc_init(&alloc_key.k); - a->k.p = iter->pos; - bch2_alloc_pack(a, u); + u.read_time = atomic64_read(&c->io_clock[READ].now); + u.write_time = atomic64_read(&c->io_clock[WRITE].now); - bch2_trans_update(trans, iter, &a->k_i, + bch2_alloc_pack(c, &a, u); + bch2_trans_update(trans, iter, &a.k, BTREE_TRIGGER_BUCKET_INVALIDATE); /* @@ -976,8 +885,7 @@ retry: BTREE_INSERT_NOUNLOCK| BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_USE_ALLOC_RESERVE| + BTREE_INSERT_JOURNAL_RESERVED| flags); if (ret == -EINTR) goto retry; @@ -1029,8 +937,7 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) int ret = 0; bch2_trans_init(&trans, c, 0, 0); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, + iter = bch2_trans_get_iter(&trans, BTREE_ID_alloc, POS(ca->dev_idx, 0), BTREE_ITER_CACHED| BTREE_ITER_CACHED_NOFILL| @@ -1045,6 +952,7 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) (!fifo_empty(&ca->free_inc) ? BTREE_INSERT_NOWAIT : 0)); + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); /* If we used NOWAIT, don't return the error: */ @@ -1138,6 +1046,12 @@ static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca) return 0; } +static inline bool allocator_thread_running(struct bch_dev *ca) +{ + return ca->mi.state == BCH_MEMBER_STATE_rw && + test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags); +} + /** * bch_allocator_thread - move buckets from free_inc to reserves * @@ -1154,9 +1068,16 @@ static int bch2_allocator_thread(void *arg) int ret; set_freezable(); - ca->allocator_state = ALLOCATOR_RUNNING; while (1) { + if (!allocator_thread_running(ca)) { + ca->allocator_state = ALLOCATOR_STOPPED; + if (kthread_wait_freezable(allocator_thread_running(ca))) + break; + } + + ca->allocator_state = ALLOCATOR_RUNNING; + cond_resched(); if (kthread_should_stop()) break; @@ -1456,9 +1377,12 @@ int bch2_dev_allocator_start(struct bch_dev *ca) return 0; p = kthread_create(bch2_allocator_thread, ca, - "bch_alloc[%s]", ca->name); - if (IS_ERR(p)) + "bch-alloc/%s", ca->name); + if (IS_ERR(p)) { + bch_err(ca->fs, "error creating allocator thread: %li", + PTR_ERR(p)); return PTR_ERR(p); + } get_task_struct(p); rcu_assign_pointer(ca->alloc_thread, p); @@ -1469,8 +1393,6 @@ int bch2_dev_allocator_start(struct bch_dev *ca) void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); - bch2_bucket_clock_init(c, READ); - bch2_bucket_clock_init(c, WRITE); c->pd_controllers_update_seconds = 5; INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h index cbaff56..6fededc 100644 --- a/libbcachefs/alloc_background.h +++ b/libbcachefs/alloc_background.h @@ -7,12 +7,33 @@ #include "debug.h" struct bkey_alloc_unpacked { + u64 bucket; + u8 dev; u8 gen; + u8 oldest_gen; + u8 data_type; #define x(_name, _bits) u##_bits _name; - BCH_ALLOC_FIELDS() + BCH_ALLOC_FIELDS_V2() #undef x }; +struct bkey_alloc_buf { + struct bkey_i k; + + union { + struct { +#define x(_name, _bits) + _bits / 8 + u8 _pad[8 + BCH_ALLOC_FIELDS_V1()]; +#undef x + } _v1; + struct { +#define x(_name, _bits) + 8 + _bits / 8 + u8 _pad[8 + BCH_ALLOC_FIELDS_V2()]; +#undef x + } _v2; + }; +} __attribute__((packed, aligned(8))); + /* How out of date a pointer gen is allowed to be: */ #define BUCKET_GC_GEN_MAX 96U @@ -20,23 +41,28 @@ struct bkey_alloc_unpacked { static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked 
l, struct bkey_alloc_unpacked r) { - return l.gen != r.gen -#define x(_name, _bits) || l._name != r._name - BCH_ALLOC_FIELDS() + return l.gen != r.gen || + l.oldest_gen != r.oldest_gen || + l.data_type != r.data_type +#define x(_name, ...) || l._name != r._name + BCH_ALLOC_FIELDS_V2() #undef x ; } struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c); -void bch2_alloc_pack(struct bkey_i_alloc *, +void bch2_alloc_pack(struct bch_fs *, struct bkey_alloc_buf *, const struct bkey_alloc_unpacked); int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); static inline struct bkey_alloc_unpacked -alloc_mem_to_key(struct bucket *g, struct bucket_mark m) +alloc_mem_to_key(struct btree_iter *iter, + struct bucket *g, struct bucket_mark m) { return (struct bkey_alloc_unpacked) { + .dev = iter->pos.inode, + .bucket = iter->pos.offset, .gen = m.gen, .oldest_gen = g->oldest_gen, .data_type = m.data_type, @@ -49,11 +75,17 @@ alloc_mem_to_key(struct bucket *g, struct bucket_mark m) #define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) -const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c); +const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c); +const char *bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_alloc (struct bkey_ops) { \ - .key_invalid = bch2_alloc_invalid, \ + .key_invalid = bch2_alloc_v1_invalid, \ + .val_to_text = bch2_alloc_to_text, \ +} + +#define bch2_bkey_ops_alloc_v2 (struct bkey_ops) { \ + .key_invalid = bch2_alloc_v2_invalid, \ .val_to_text = bch2_alloc_to_text, \ } @@ -76,7 +108,7 @@ static inline void bch2_wake_allocator(struct bch_dev *ca) static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca, size_t bucket) { - if (expensive_debug_checks(c)) { + if (bch2_expensive_debug_checks) { size_t iter; long i; unsigned j; @@ -98,7 +130,6 @@ void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_stop(struct bch_dev *); int bch2_dev_allocator_start(struct bch_dev *); -int bch2_dev_alloc_write(struct bch_fs *, struct bch_dev *, unsigned); int bch2_alloc_write(struct bch_fs *, unsigned); void bch2_fs_allocator_background_init(struct bch_fs *); diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index 7a92e3d..8f0b94f 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -192,8 +192,9 @@ long bch2_bucket_alloc_new_fs(struct bch_dev *ca) rcu_read_lock(); buckets = bucket_array(ca); - for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) - if (is_available_bucket(buckets->b[b].mark)) + for (b = buckets->first_bucket; b < buckets->nbuckets; b++) + if (is_available_bucket(buckets->b[b].mark) && + !buckets->b[b].mark.owned_by_allocator) goto success; b = -1; success: @@ -204,9 +205,10 @@ success: static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) { switch (reserve) { - case RESERVE_ALLOC: - return 0; case RESERVE_BTREE: + case RESERVE_BTREE_MOVINGGC: + return 0; + case RESERVE_MOVINGGC: return OPEN_BUCKETS_COUNT / 4; default: return OPEN_BUCKETS_COUNT / 2; @@ -223,9 +225,8 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, bool may_alloc_partial, struct closure *cl) { - struct bucket_array *buckets; struct open_bucket *ob; - long bucket = 0; + long b = 0; spin_lock(&c->freelist_lock); @@ -259,22 +260,13 @@ struct open_bucket 
*bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, return ERR_PTR(-OPEN_BUCKETS_EMPTY); } - if (likely(fifo_pop(&ca->free[RESERVE_NONE], bucket))) + if (likely(fifo_pop(&ca->free[RESERVE_NONE], b))) goto out; switch (reserve) { - case RESERVE_ALLOC: - if (fifo_pop(&ca->free[RESERVE_BTREE], bucket)) - goto out; - break; - case RESERVE_BTREE: - if (fifo_used(&ca->free[RESERVE_BTREE]) * 2 >= - ca->free[RESERVE_BTREE].size && - fifo_pop(&ca->free[RESERVE_BTREE], bucket)) - goto out; - break; + case RESERVE_BTREE_MOVINGGC: case RESERVE_MOVINGGC: - if (fifo_pop(&ca->free[RESERVE_MOVINGGC], bucket)) + if (fifo_pop(&ca->free[RESERVE_MOVINGGC], b)) goto out; break; default: @@ -292,20 +284,19 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, trace_bucket_alloc_fail(ca, reserve); return ERR_PTR(-FREELIST_EMPTY); out: - verify_not_on_freelist(c, ca, bucket); + verify_not_on_freelist(c, ca, b); ob = bch2_open_bucket_alloc(c); spin_lock(&ob->lock); - buckets = bucket_array(ca); ob->valid = true; ob->sectors_free = ca->mi.bucket_size; ob->alloc_reserve = reserve; ob->ptr = (struct bch_extent_ptr) { .type = 1 << BCH_EXTENT_ENTRY_ptr, - .gen = buckets->b[bucket].mark.gen, - .offset = bucket_to_sector(ca, bucket), + .gen = bucket(ca, b)->mark.gen, + .offset = bucket_to_sector(ca, b), .dev = ca->dev_idx, }; @@ -458,16 +449,18 @@ bch2_bucket_alloc_set(struct bch_fs *c, * it's to a device we don't want: */ -static void bucket_alloc_from_stripe(struct bch_fs *c, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_mask *devs_may_alloc, - u16 target, - unsigned erasure_code, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - unsigned flags) +static enum bucket_alloc_ret +bucket_alloc_from_stripe(struct bch_fs *c, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_mask *devs_may_alloc, + u16 target, + unsigned erasure_code, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, + unsigned flags, + struct closure *cl) { struct dev_alloc_list devs_sorted; struct ec_stripe_head *h; @@ -476,31 +469,39 @@ static void bucket_alloc_from_stripe(struct bch_fs *c, unsigned i, ec_idx; if (!erasure_code) - return; + return 0; if (nr_replicas < 2) - return; + return 0; if (ec_open_bucket(c, ptrs)) - return; + return 0; - h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1); + h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1, + wp == &c->copygc_write_point, + cl); + if (IS_ERR(h)) + return -PTR_ERR(h); if (!h) - return; + return 0; devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); for (i = 0; i < devs_sorted.nr; i++) - open_bucket_for_each(c, &h->s->blocks, ob, ec_idx) + for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { + if (!h->s->blocks[ec_idx]) + continue; + + ob = c->open_buckets + h->s->blocks[ec_idx]; if (ob->ptr.dev == devs_sorted.devs[i] && - !test_and_set_bit(h->s->data_block_idx[ec_idx], - h->s->blocks_allocated)) + !test_and_set_bit(ec_idx, h->s->blocks_allocated)) goto got_bucket; + } goto out_put_head; got_bucket: ca = bch_dev_bkey_exists(c, ob->ptr.dev); - ob->ec_idx = h->s->data_block_idx[ec_idx]; + ob->ec_idx = ec_idx; ob->ec = h->s; add_new_bucket(c, ptrs, devs_may_alloc, @@ -508,6 +509,7 @@ got_bucket: atomic_inc(&h->s->pin); out_put_head: bch2_ec_stripe_head_put(c, h); + return 0; } /* Sector allocator */ @@ -585,10 +587,13 @@ open_bucket_add_buckets(struct bch_fs *c, } if (!ec_open_bucket(c, ptrs)) { - bucket_alloc_from_stripe(c, ptrs, wp, &devs, + ret = 
bucket_alloc_from_stripe(c, ptrs, wp, &devs, target, erasure_code, nr_replicas, nr_effective, - have_cache, flags); + have_cache, flags, _cl); + if (ret == FREELIST_EMPTY || + ret == OPEN_BUCKETS_EMPTY) + return ret; if (*nr_effective >= nr_replicas) return 0; } @@ -634,10 +639,13 @@ void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca, if (!drop && ob->ec) { mutex_lock(&ob->ec->lock); - open_bucket_for_each(c, &ob->ec->blocks, ob2, j) - drop |= ob2->ptr.dev == ca->dev_idx; - open_bucket_for_each(c, &ob->ec->parity, ob2, j) + for (j = 0; j < ob->ec->new_stripe.key.v.nr_blocks; j++) { + if (!ob->ec->blocks[j]) + continue; + + ob2 = c->open_buckets + ob->ec->blocks[j]; drop |= ob2->ptr.dev == ca->dev_idx; + } mutex_unlock(&ob->ec->lock); } diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h index 2070546..be164d6 100644 --- a/libbcachefs/alloc_types.h +++ b/libbcachefs/alloc_types.h @@ -10,38 +10,12 @@ struct ec_bucket_buf; -/* There's two of these clocks, one for reads and one for writes: */ -struct bucket_clock { - /* - * "now" in (read/write) IO time - incremented whenever we do X amount - * of reads or writes. - * - * Goes with the bucket read/write prios: when we read or write to a - * bucket we reset the bucket's prio to the current hand; thus hand - - * prio = time since bucket was last read/written. - * - * The units are some amount (bytes/sectors) of data read/written, and - * the units can change on the fly if we need to rescale to fit - * everything in a u16 - your only guarantee is that the units are - * consistent. - */ - u16 hand; - u16 max_last_io; - - int rw; - - struct io_timer rescale; - struct mutex lock; -}; - -/* There is one reserve for each type of btree, one for prios and gens - * and one for moving GC */ enum alloc_reserve { - RESERVE_ALLOC = -1, - RESERVE_BTREE = 0, - RESERVE_MOVINGGC = 1, - RESERVE_NONE = 2, - RESERVE_NR = 3, + RESERVE_BTREE_MOVINGGC = -2, + RESERVE_BTREE = -1, + RESERVE_MOVINGGC = 0, + RESERVE_NONE = 1, + RESERVE_NR = 2, }; typedef FIFO(long) alloc_fifo; @@ -89,7 +63,6 @@ struct write_point { u64 last_used; unsigned long write_point; enum bch_data_type type; - bool is_ec; /* calculated based on how many pointers we're actually going to use: */ unsigned sectors_free; diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 29f4116..549cded 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -193,6 +193,7 @@ #include #include #include +#include #include #include #include @@ -213,9 +214,11 @@ dynamic_fault("bcachefs:meta:write:" name) #ifdef __KERNEL__ -#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name) +#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name) +#define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum) #else -#define bch2_fmt(_c, fmt) fmt "\n" +#define bch2_fmt(_c, fmt) fmt "\n" +#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum) #endif #define bch_info(c, fmt, ...) \ @@ -228,8 +231,11 @@ printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_err(c, fmt, ...) \ printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) + #define bch_err_ratelimited(c, fmt, ...) \ printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) +#define bch_err_inum_ratelimited(c, _inum, fmt, ...) \ + printk_ratelimited(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) #define bch_verbose(c, fmt, ...) 
\ do { \ @@ -265,6 +271,8 @@ do { \ BCH_DEBUG_PARAM(debug_check_bkeys, \ "Run bkey_debugcheck (primarily checking GC/allocation "\ "information) when iterating over keys") \ + BCH_DEBUG_PARAM(debug_check_btree_accounting, \ + "Verify btree accounting for keys within a node") \ BCH_DEBUG_PARAM(verify_btree_ondisk, \ "Reread btree nodes at various points to verify the " \ "mergesort in the read path against modifications " \ @@ -295,6 +303,16 @@ do { \ #define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS() #endif +#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name; +BCH_DEBUG_PARAMS() +#undef BCH_DEBUG_PARAM + +#ifndef CONFIG_BCACHEFS_DEBUG +#define BCH_DEBUG_PARAM(name, description) static const bool bch2_##name; +BCH_DEBUG_PARAMS_DEBUG() +#undef BCH_DEBUG_PARAM +#endif + #define BCH_TIME_STATS() \ x(btree_node_mem_alloc) \ x(btree_node_split) \ @@ -351,14 +369,14 @@ enum gc_phase { GC_PHASE_START, GC_PHASE_SB, - GC_PHASE_BTREE_EC, - GC_PHASE_BTREE_EXTENTS, - GC_PHASE_BTREE_INODES, - GC_PHASE_BTREE_DIRENTS, - GC_PHASE_BTREE_XATTRS, - GC_PHASE_BTREE_ALLOC, - GC_PHASE_BTREE_QUOTAS, - GC_PHASE_BTREE_REFLINK, + GC_PHASE_BTREE_stripes, + GC_PHASE_BTREE_extents, + GC_PHASE_BTREE_inodes, + GC_PHASE_BTREE_dirents, + GC_PHASE_BTREE_xattrs, + GC_PHASE_BTREE_alloc, + GC_PHASE_BTREE_quotas, + GC_PHASE_BTREE_reflink, GC_PHASE_PENDING_DELETE, GC_PHASE_ALLOC, @@ -411,7 +429,9 @@ struct bch_dev { unsigned long *buckets_nouse; struct rw_semaphore bucket_lock; - struct bch_dev_usage __percpu *usage[2]; + struct bch_dev_usage *usage_base; + struct bch_dev_usage __percpu *usage[JOURNAL_BUF_NR]; + struct bch_dev_usage __percpu *usage_gc; /* Allocator: */ struct task_struct __rcu *alloc_thread; @@ -433,9 +453,6 @@ struct bch_dev { size_t fifo_last_bucket; - /* last calculated minimum prio */ - u16 max_last_bucket_io[2]; - size_t inc_gen_needs_gc; size_t inc_gen_really_needs_gc; @@ -455,6 +472,7 @@ struct bch_dev { atomic64_t rebalance_work; struct journal_device journal; + u64 prev_journal_sector; struct work_struct io_error_work; @@ -491,8 +509,9 @@ enum { BCH_FS_ERRORS_FIXED, /* misc: */ - BCH_FS_FIXED_GENS, - BCH_FS_ALLOC_WRITTEN, + BCH_FS_NEED_ANOTHER_GC, + BCH_FS_DELETED_NODES, + BCH_FS_NEED_ALLOC_WRITE, BCH_FS_REBUILD_REPLICAS, BCH_FS_HOLD_BTREE_WRITES, }; @@ -521,14 +540,20 @@ struct journal_keys { struct journal_key { enum btree_id btree_id:8; unsigned level:8; + bool allocated; struct bkey_i *k; u32 journal_seq; u32 journal_offset; } *d; size_t nr; + size_t size; u64 journal_seq_base; }; +struct btree_iter_buf { + struct btree_iter *iter; +}; + struct bch_fs { struct closure cl; @@ -557,7 +582,10 @@ struct bch_fs { struct bch_replicas_cpu replicas_gc; struct mutex replicas_gc_lock; + struct journal_entry_res btree_root_journal_res; struct journal_entry_res replicas_journal_res; + struct journal_entry_res clock_journal_res; + struct journal_entry_res dev_usage_journal_res; struct bch_disk_groups_cpu __rcu *disk_groups; @@ -569,6 +597,7 @@ struct bch_fs { uuid_le user_uuid; u16 version; + u16 version_min; u16 encoded_extent_max; u8 nr_devices; @@ -624,13 +653,15 @@ struct bch_fs { struct mutex btree_trans_lock; struct list_head btree_trans_list; mempool_t btree_iters_pool; + struct btree_iter_buf __percpu *btree_iters_bufs; + + struct srcu_struct btree_trans_barrier; struct btree_key_cache btree_key_cache; struct workqueue_struct *wq; /* copygc needs its own workqueue for index updates.. 
*/ struct workqueue_struct *copygc_wq; - struct workqueue_struct *journal_reclaim_wq; /* ALLOCATION */ struct delayed_work pd_controllers_update; @@ -649,6 +680,7 @@ struct bch_fs { unsigned bucket_size_max; atomic64_t sectors_available; + struct mutex sectors_available_lock; struct bch_fs_pcpu __percpu *pcpu; @@ -656,20 +688,13 @@ struct bch_fs { seqcount_t usage_lock; struct bch_fs_usage *usage_base; - struct bch_fs_usage __percpu *usage[2]; + struct bch_fs_usage __percpu *usage[JOURNAL_BUF_NR]; struct bch_fs_usage __percpu *usage_gc; + u64 __percpu *online_reserved; /* single element mempool: */ struct mutex usage_scratch_lock; - struct bch_fs_usage *usage_scratch; - - /* - * When we invalidate buckets, we use both the priority and the amount - * of good data to determine which buckets to reuse first - to weight - * those together consistently we keep track of the smallest nonzero - * priority of any bucket. - */ - struct bucket_clock bucket_clock[2]; + struct bch_fs_usage_online *usage_scratch; struct io_clock io_clock[2]; @@ -705,7 +730,7 @@ struct bch_fs { * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos] * has been marked by GC. * - * gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.) + * gc_cur_phase is a superset of btree_ids (BTREE_ID_extents etc.) * * Protected by gc_pos_lock. Only written to by GC thread, so GC thread * can read without a lock. @@ -780,6 +805,9 @@ struct bch_fs { struct bio_set dio_write_bioset; struct bio_set dio_read_bioset; + + atomic64_t btree_writes_nr; + atomic64_t btree_writes_sectors; struct bio_list btree_write_error_list; struct work_struct btree_write_error_work; spinlock_t btree_write_error_lock; @@ -801,7 +829,8 @@ struct bch_fs { struct mutex verify_lock; #endif - u64 unused_inode_hint; + u64 *unused_inode_hints; + unsigned inode_shard_bits; /* * A btree node on disk could have too many bsets for an iterator to fit @@ -814,6 +843,7 @@ struct bch_fs { struct journal journal; struct list_head journal_entries; struct journal_keys journal_keys; + struct list_head journal_iters; u64 last_bucket_seq_cleanup; @@ -826,10 +856,6 @@ struct bch_fs { unsigned copy_gc_enabled:1; bool promote_whole_extents; -#define BCH_DEBUG_PARAM(name, description) bool name; - BCH_DEBUG_PARAMS_ALL() -#undef BCH_DEBUG_PARAM - struct time_stats times[BCH_TIME_STAT_NR]; }; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 2926c64..ead7268 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -138,19 +138,18 @@ struct bpos { #define KEY_SNAPSHOT_MAX ((__u32)~0U) #define KEY_SIZE_MAX ((__u32)~0U) -static inline struct bpos POS(__u64 inode, __u64 offset) +static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot) { - struct bpos ret; - - ret.inode = inode; - ret.offset = offset; - ret.snapshot = 0; - - return ret; + return (struct bpos) { + .inode = inode, + .offset = offset, + .snapshot = snapshot, + }; } -#define POS_MIN POS(0, 0) -#define POS_MAX POS(KEY_INODE_MAX, KEY_OFFSET_MAX) +#define POS_MIN SPOS(0, 0, 0) +#define POS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX) +#define POS(_inode, _offset) SPOS(_inode, _offset, 0) /* Empty placeholder struct, for container_of() */ struct bch_val { @@ -326,7 +325,7 @@ static inline void bkey_init(struct bkey *k) x(discard, 1) \ x(error, 2) \ x(cookie, 3) \ - x(whiteout, 4) \ + x(hash_whiteout, 4) \ x(btree_ptr, 5) \ x(extent, 6) \ x(reservation, 7) \ @@ -341,7 +340,8 @@ static inline void bkey_init(struct bkey *k) 
x(reflink_v, 16) \ x(inline_data, 17) \ x(btree_ptr_v2, 18) \ - x(indirect_inline_data, 19) + x(indirect_inline_data, 19) \ + x(alloc_v2, 20) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -350,11 +350,27 @@ enum bch_bkey_type { KEY_TYPE_MAX, }; +struct bch_deleted { + struct bch_val v; +}; + +struct bch_discard { + struct bch_val v; +}; + +struct bch_error { + struct bch_val v; +}; + struct bch_cookie { struct bch_val v; __le64 cookie; }; +struct bch_hash_whiteout { + struct bch_val v; +}; + /* Extents */ /* @@ -551,9 +567,11 @@ struct bch_extent_stripe_ptr { #if defined(__LITTLE_ENDIAN_BITFIELD) __u64 type:5, block:8, - idx:51; + redundancy:4, + idx:47; #elif defined (__BIG_ENDIAN_BITFIELD) - __u64 idx:51, + __u64 idx:47, + redundancy:4, block:8, type:5; #endif @@ -603,13 +621,14 @@ struct bch_btree_ptr_v2 { __u64 mem_ptr; __le64 seq; __le16 sectors_written; - /* In case we ever decide to do variable size btree nodes: */ - __le16 sectors; + __le16 flags; struct bpos min_key; struct bch_extent_ptr start[0]; __u64 _data[0]; } __attribute__((packed, aligned(8))); +LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1); + struct bch_extent { struct bch_val v; @@ -634,8 +653,6 @@ struct bch_reservation { #define BKEY_EXTENT_VAL_U64s_MAX \ (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) -#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX) - /* * Maximum possible size of an entire extent, key + value: */ #define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) @@ -669,10 +686,10 @@ struct bch_inode_generation { } __attribute__((packed, aligned(8))); #define BCH_INODE_FIELDS() \ - x(bi_atime, 64) \ - x(bi_ctime, 64) \ - x(bi_mtime, 64) \ - x(bi_otime, 64) \ + x(bi_atime, 96) \ + x(bi_ctime, 96) \ + x(bi_mtime, 96) \ + x(bi_otime, 96) \ x(bi_size, 64) \ x(bi_sectors, 64) \ x(bi_uid, 32) \ @@ -689,7 +706,9 @@ struct bch_inode_generation { x(bi_foreground_target, 16) \ x(bi_background_target, 16) \ x(bi_erasure_code, 16) \ - x(bi_fields_set, 16) + x(bi_fields_set, 16) \ + x(bi_dir, 64) \ + x(bi_dir_offset, 64) /* subset of BCH_INODE_FIELDS */ #define BCH_INODE_OPTS() \ @@ -725,6 +744,7 @@ enum { __BCH_INODE_I_SIZE_DIRTY= 5, __BCH_INODE_I_SECTORS_DIRTY= 6, __BCH_INODE_UNLINKED = 7, + __BCH_INODE_BACKPTR_UNTRUSTED = 8, /* bits 20+ reserved for packed fields below: */ }; @@ -737,9 +757,11 @@ enum { #define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY) #define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY) #define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED) +#define BCH_INODE_BACKPTR_UNTRUSTED (1 << __BCH_INODE_BACKPTR_UNTRUSTED) LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); -LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 32); +LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); +LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32); /* Dirents */ @@ -799,35 +821,40 @@ struct bch_alloc { __u8 data[]; } __attribute__((packed, aligned(8))); -#define BCH_ALLOC_FIELDS() \ +#define BCH_ALLOC_FIELDS_V1() \ x(read_time, 16) \ x(write_time, 16) \ x(data_type, 8) \ x(dirty_sectors, 16) \ x(cached_sectors, 16) \ - x(oldest_gen, 8) + x(oldest_gen, 8) \ + x(stripe, 32) \ + x(stripe_redundancy, 8) + +struct bch_alloc_v2 { + struct bch_val v; + __u8 nr_fields; + __u8 gen; + __u8 oldest_gen; + __u8 data_type; + __u8 data[]; +} __attribute__((packed, aligned(8))); + +#define BCH_ALLOC_FIELDS_V2() \ + x(read_time, 64) \ + x(write_time, 64) \ + 
x(dirty_sectors, 16) \ + x(cached_sectors, 16) \ + x(stripe, 32) \ + x(stripe_redundancy, 8) enum { -#define x(name, bytes) BCH_ALLOC_FIELD_##name, - BCH_ALLOC_FIELDS() +#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, + BCH_ALLOC_FIELDS_V1() #undef x BCH_ALLOC_FIELD_NR }; -static const unsigned BCH_ALLOC_FIELD_BYTES[] = { -#define x(name, bits) [BCH_ALLOC_FIELD_##name] = bits / 8, - BCH_ALLOC_FIELDS() -#undef x -}; - -#define x(name, bits) + (bits / 8) -static const unsigned BKEY_ALLOC_VAL_U64s_MAX = - DIV_ROUND_UP(offsetof(struct bch_alloc, data) - BCH_ALLOC_FIELDS(), sizeof(u64)); -#undef x - -#define BKEY_ALLOC_U64s_MAX (BKEY_U64s + BKEY_ALLOC_VAL_U64s_MAX) - /* Quotas: */ enum quota_types { @@ -963,19 +990,29 @@ LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); #endif +#define BCH_MEMBER_STATES() \ + x(rw, 0) \ + x(ro, 1) \ + x(failed, 2) \ + x(spare, 3) + enum bch_member_state { - BCH_MEMBER_STATE_RW = 0, - BCH_MEMBER_STATE_RO = 1, - BCH_MEMBER_STATE_FAILED = 2, - BCH_MEMBER_STATE_SPARE = 3, - BCH_MEMBER_STATE_NR = 4, +#define x(t, n) BCH_MEMBER_STATE_##t = n, + BCH_MEMBER_STATES() +#undef x + BCH_MEMBER_STATE_NR }; -enum cache_replacement { - CACHE_REPLACEMENT_LRU = 0, - CACHE_REPLACEMENT_FIFO = 1, - CACHE_REPLACEMENT_RANDOM = 2, - CACHE_REPLACEMENT_NR = 3, +#define BCH_CACHE_REPLACEMENT_POLICIES() \ + x(lru, 0) \ + x(fifo, 1) \ + x(random, 2) + +enum bch_cache_replacement_policies { +#define x(t, n) BCH_CACHE_REPLACEMENT_##t = n, + BCH_CACHE_REPLACEMENT_POLICIES() +#undef x + BCH_CACHE_REPLACEMENT_NR }; struct bch_sb_field_members { @@ -1131,8 +1168,8 @@ struct bch_sb_field_clean { struct bch_sb_field field; __le32 flags; - __le16 read_clock; - __le16 write_clock; + __le16 _read_clock; /* no longer used */ + __le16 _write_clock; __le64 journal_seq; union { @@ -1170,7 +1207,9 @@ enum bcachefs_metadata_version { bcachefs_metadata_version_new_versioning = 10, bcachefs_metadata_version_bkey_renumber = 10, bcachefs_metadata_version_inode_btree_change = 11, - bcachefs_metadata_version_max = 12, + bcachefs_metadata_version_snapshot = 12, + bcachefs_metadata_version_inode_backpointers = 13, + bcachefs_metadata_version_max = 14, }; #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) @@ -1275,7 +1314,8 @@ LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60); LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61); -LE64_BITMASK(BCH_SB_REFLINK, struct bch_sb, flags[0], 61, 62); +/* bit 61 was reflink option */ +LE64_BITMASK(BCH_SB_BIG_ENDIAN, struct bch_sb, flags[0], 62, 63); /* 61-64 unused */ @@ -1305,6 +1345,7 @@ LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE, LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); +LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28); /* * Features: @@ -1330,13 +1371,25 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); x(btree_ptr_v2, 11) \ x(extents_above_btree_updates, 12) \ x(btree_updates_journalled, 13) \ - x(reflink_inline_data, 14) + x(reflink_inline_data, 14) \ + x(new_varint, 15) \ + x(journal_no_flush, 16) \ + x(alloc_v2, 17) \ + x(extents_across_btree_nodes, 18) + +#define BCH_SB_FEATURES_ALWAYS \ + ((1ULL << BCH_FEATURE_new_extent_overwrite)| \ + (1ULL << BCH_FEATURE_extents_above_btree_updates)|\ + (1ULL << BCH_FEATURE_btree_updates_journalled)|\ + (1ULL << 
BCH_FEATURE_alloc_v2)|\ + (1ULL << BCH_FEATURE_extents_across_btree_nodes)) #define BCH_SB_FEATURES_ALL \ - ((1ULL << BCH_FEATURE_new_siphash)| \ - (1ULL << BCH_FEATURE_new_extent_overwrite)| \ + (BCH_SB_FEATURES_ALWAYS| \ + (1ULL << BCH_FEATURE_new_siphash)| \ (1ULL << BCH_FEATURE_btree_ptr_v2)| \ - (1ULL << BCH_FEATURE_extents_above_btree_updates)) + (1ULL << BCH_FEATURE_new_varint)| \ + (1ULL << BCH_FEATURE_journal_no_flush)) enum bch_sb_feature { #define x(f, n) BCH_FEATURE_##f, @@ -1345,20 +1398,35 @@ enum bch_sb_feature { BCH_FEATURE_NR, }; +#define BCH_SB_COMPAT() \ + x(alloc_info, 0) \ + x(alloc_metadata, 1) \ + x(extents_above_btree_updates_done, 2) \ + x(bformat_overflow_done, 3) + enum bch_sb_compat { - BCH_COMPAT_FEAT_ALLOC_INFO = 0, - BCH_COMPAT_FEAT_ALLOC_METADATA = 1, +#define x(f, n) BCH_COMPAT_##f, + BCH_SB_COMPAT() +#undef x + BCH_COMPAT_NR, }; /* options: */ #define BCH_REPLICAS_MAX 4U +#define BCH_BKEY_PTRS_MAX 16U + +#define BCH_ERROR_ACTIONS() \ + x(continue, 0) \ + x(ro, 1) \ + x(panic, 2) + enum bch_error_actions { - BCH_ON_ERROR_CONTINUE = 0, - BCH_ON_ERROR_RO = 1, - BCH_ON_ERROR_PANIC = 2, - BCH_NR_ERROR_ACTIONS = 3, +#define x(t, n) BCH_ON_ERROR_##t = n, + BCH_ERROR_ACTIONS() +#undef x + BCH_ON_ERROR_NR }; enum bch_str_hash_type { @@ -1369,11 +1437,16 @@ enum bch_str_hash_type { BCH_STR_HASH_NR = 4, }; +#define BCH_STR_HASH_OPTS() \ + x(crc32c, 0) \ + x(crc64, 1) \ + x(siphash, 2) + enum bch_str_hash_opts { - BCH_STR_HASH_OPT_CRC32C = 0, - BCH_STR_HASH_OPT_CRC64 = 1, - BCH_STR_HASH_OPT_SIPHASH = 2, - BCH_STR_HASH_OPT_NR = 3, +#define x(t, n) BCH_STR_HASH_OPT_##t = n, + BCH_STR_HASH_OPTS() +#undef x + BCH_STR_HASH_OPT_NR }; enum bch_csum_type { @@ -1408,11 +1481,16 @@ static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) } } +#define BCH_CSUM_OPTS() \ + x(none, 0) \ + x(crc32c, 1) \ + x(crc64, 2) + enum bch_csum_opts { - BCH_CSUM_OPT_NONE = 0, - BCH_CSUM_OPT_CRC32C = 1, - BCH_CSUM_OPT_CRC64 = 2, - BCH_CSUM_OPT_NR = 3, +#define x(t, n) BCH_CSUM_OPT_##t = n, + BCH_CSUM_OPTS() +#undef x + BCH_CSUM_OPT_NR }; #define BCH_COMPRESSION_TYPES() \ @@ -1424,7 +1502,7 @@ enum bch_csum_opts { x(incompressible, 5) enum bch_compression_type { -#define x(t, n) BCH_COMPRESSION_TYPE_##t, +#define x(t, n) BCH_COMPRESSION_TYPE_##t = n, BCH_COMPRESSION_TYPES() #undef x BCH_COMPRESSION_TYPE_NR @@ -1437,7 +1515,7 @@ enum bch_compression_type { x(zstd, 3) enum bch_compression_opts { -#define x(t, n) BCH_COMPRESSION_OPT_##t, +#define x(t, n) BCH_COMPRESSION_OPT_##t = n, BCH_COMPRESSION_OPTS() #undef x BCH_COMPRESSION_OPT_NR @@ -1487,7 +1565,9 @@ static inline __u64 __bset_magic(struct bch_sb *sb) x(blacklist, 3) \ x(blacklist_v2, 4) \ x(usage, 5) \ - x(data_usage, 6) + x(data_usage, 6) \ + x(clock, 7) \ + x(dev_usage, 8) enum { #define x(f, nr) BCH_JSET_ENTRY_##f = nr, @@ -1535,6 +1615,30 @@ struct jset_entry_data_usage { struct bch_replicas_entry r; } __attribute__((packed)); +struct jset_entry_clock { + struct jset_entry entry; + __u8 rw; + __u8 pad[7]; + __le64 time; +} __attribute__((packed)); + +struct jset_entry_dev_usage_type { + __le64 buckets; + __le64 sectors; + __le64 fragmented; +} __attribute__((packed)); + +struct jset_entry_dev_usage { + struct jset_entry entry; + __le32 dev; + __u32 pad; + + __le64 buckets_ec; + __le64 buckets_unavailable; + + struct jset_entry_dev_usage_type d[]; +} __attribute__((packed)); + /* * On disk format for a journal entry: * seq is monotonically increasing; every journal entry has its own unique @@ -1557,8 +1661,8 @@ struct 
jset { __u8 encrypted_start[0]; - __le16 read_clock; - __le16 write_clock; + __le16 _read_clock; /* no longer used */ + __le16 _write_clock; /* Sequence number of oldest dirty journal entry */ __le64 last_seq; @@ -1572,23 +1676,24 @@ struct jset { LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4); LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); +LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); #define BCH_JOURNAL_BUCKETS_MIN 8 /* Btree: */ -#define BCH_BTREE_IDS() \ - x(EXTENTS, 0, "extents") \ - x(INODES, 1, "inodes") \ - x(DIRENTS, 2, "dirents") \ - x(XATTRS, 3, "xattrs") \ - x(ALLOC, 4, "alloc") \ - x(QUOTAS, 5, "quotas") \ - x(EC, 6, "stripes") \ - x(REFLINK, 7, "reflink") +#define BCH_BTREE_IDS() \ + x(extents, 0) \ + x(inodes, 1) \ + x(dirents, 2) \ + x(xattrs, 3) \ + x(alloc, 4) \ + x(quotas, 5) \ + x(stripes, 6) \ + x(reflink, 7) enum btree_id { -#define x(kwd, val, name) BTREE_ID_##kwd = val, +#define x(kwd, val) BTREE_ID_##kwd = val, BCH_BTREE_IDS() #undef x BTREE_ID_NR @@ -1642,7 +1747,7 @@ struct btree_node { /* Closed interval: */ struct bpos min_key; struct bpos max_key; - struct bch_extent_ptr ptr; + struct bch_extent_ptr _ptr; /* not used anymore */ struct bkey_format format; union { diff --git a/libbcachefs/bcachefs_ioctl.h b/libbcachefs/bcachefs_ioctl.h index d71157a..f679fc2 100644 --- a/libbcachefs/bcachefs_ioctl.h +++ b/libbcachefs/bcachefs_ioctl.h @@ -14,6 +14,9 @@ #define BCH_FORCE_IF_DATA_DEGRADED (1 << 2) #define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3) +#define BCH_FORCE_IF_LOST \ + (BCH_FORCE_IF_DATA_LOST| \ + BCH_FORCE_IF_METADATA_LOST) #define BCH_FORCE_IF_DEGRADED \ (BCH_FORCE_IF_DATA_DEGRADED| \ BCH_FORCE_IF_METADATA_DEGRADED) @@ -73,6 +76,7 @@ struct bch_ioctl_incremental { #define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super) #define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx) #define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize) +#define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc,15, struct bch_ioctl_disk_resize_journal) /* ioctl below act on a particular file, not the filesystem as a whole: */ @@ -167,10 +171,11 @@ struct bch_ioctl_disk_set_state { }; enum bch_data_ops { - BCH_DATA_OP_SCRUB = 0, - BCH_DATA_OP_REREPLICATE = 1, - BCH_DATA_OP_MIGRATE = 2, - BCH_DATA_OP_NR = 3, + BCH_DATA_OP_SCRUB = 0, + BCH_DATA_OP_REREPLICATE = 1, + BCH_DATA_OP_MIGRATE = 2, + BCH_DATA_OP_REWRITE_OLD_NODES = 3, + BCH_DATA_OP_NR = 4, }; /* @@ -183,11 +188,13 @@ enum bch_data_ops { * job. The file descriptor is O_CLOEXEC. 
*/ struct bch_ioctl_data { - __u32 op; + __u16 op; + __u8 start_btree; + __u8 end_btree; __u32 flags; - struct bpos start; - struct bpos end; + struct bpos start_pos; + struct bpos end_pos; union { struct { @@ -329,4 +336,17 @@ struct bch_ioctl_disk_resize { __u64 nbuckets; }; +/* + * BCH_IOCTL_DISK_RESIZE_JOURNAL: resize journal on a device + * + * @dev - member to resize + * @nbuckets - new number of buckets + */ +struct bch_ioctl_disk_resize_journal { + __u32 flags; + __u32 pad; + __u64 dev; + __u64 nbuckets; +}; + #endif /* _BCACHEFS_IOCTL_H */ diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c index 4d0c912..3af5606 100644 --- a/libbcachefs/bkey.c +++ b/libbcachefs/bkey.c @@ -411,7 +411,7 @@ static bool bkey_packed_successor(struct bkey_packed *out, if ((*p & mask) != mask) { *p += 1ULL << offset; - EBUG_ON(bkey_cmp_packed(b, out, &k) <= 0); + EBUG_ON(bch2_bkey_cmp_packed(b, out, &k) <= 0); return true; } @@ -551,7 +551,12 @@ void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p) static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i, unsigned bits, u64 offset) { - offset = bits == 64 ? 0 : min(offset, U64_MAX - ((1ULL << bits) - 1)); + unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; + u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); + + bits = min(bits, unpacked_bits); + + offset = bits == unpacked_bits ? 0 : min(offset, unpacked_max - ((1ULL << bits) - 1)); f->bits_per_field[i] = bits; f->field_offset[i] = cpu_to_le64(offset); @@ -609,15 +614,19 @@ const char *bch2_bkey_format_validate(struct bkey_format *f) return "incorrect number of fields"; for (i = 0; i < f->nr_fields; i++) { + unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; + u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1)); u64 field_offset = le64_to_cpu(f->field_offset[i]); - if (f->bits_per_field[i] > 64) + if (f->bits_per_field[i] > unpacked_bits) return "field too large"; - if (field_offset && - (f->bits_per_field[i] == 64 || - (field_offset + ((1ULL << f->bits_per_field[i]) - 1) < - field_offset))) + if ((f->bits_per_field[i] == unpacked_bits) && field_offset) + return "offset + bits overflow"; + + if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) & + unpacked_mask) < + field_offset) return "offset + bits overflow"; bits += f->bits_per_field[i]; @@ -1040,7 +1049,7 @@ int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l, high_word(f, r), b->nr_key_bits); - EBUG_ON(ret != bkey_cmp(bkey_unpack_pos(b, l), + EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l), bkey_unpack_pos(b, r))); return ret; } @@ -1050,13 +1059,13 @@ int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b, const struct bkey_packed *l, const struct bpos *r) { - return bkey_cmp(bkey_unpack_pos_format_checked(b, l), *r); + return bpos_cmp(bkey_unpack_pos_format_checked(b, l), *r); } __pure __flatten -int __bch2_bkey_cmp_packed(const struct bkey_packed *l, - const struct bkey_packed *r, - const struct btree *b) +int bch2_bkey_cmp_packed(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r) { struct bkey unpacked; @@ -1071,7 +1080,7 @@ int __bch2_bkey_cmp_packed(const struct bkey_packed *l, r = (void*) &unpacked; } - return bkey_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p); + return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p); } __pure __flatten @@ -1082,7 +1091,7 @@ int __bch2_bkey_cmp_left_packed(const struct btree *b, const struct bkey *l_unpacked; return 
unlikely(l_unpacked = packed_to_bkey_c(l)) - ? bkey_cmp(l_unpacked->p, *r) + ? bpos_cmp(l_unpacked->p, *r) : __bch2_bkey_cmp_left_packed_format_checked(b, l, r); } @@ -1118,11 +1127,12 @@ void bch2_bkey_pack_test(void) struct bkey_packed p; struct bkey_format test_format = { - .key_u64s = 2, + .key_u64s = 3, .nr_fields = BKEY_NR_FIELDS, .bits_per_field = { 13, 64, + 32, }, }; diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h index 80ea488..2e45d88 100644 --- a/libbcachefs/bkey.h +++ b/libbcachefs/bkey.h @@ -33,16 +33,6 @@ struct bkey_s { #define bkey_next(_k) vstruct_next(_k) -static inline struct bkey_packed *bkey_next_skip_noops(struct bkey_packed *k, - struct bkey_packed *end) -{ - k = bkey_next(k); - - while (k != end && !k->u64s) - k = (void *) ((u64 *) k + 1); - return k; -} - #define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) static inline size_t bkey_val_bytes(const struct bkey *k) @@ -67,13 +57,6 @@ static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) #define bkey_whiteout(_k) \ ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_discard) -#define bkey_packed_typecheck(_k) \ -({ \ - BUILD_BUG_ON(!type_is(_k, struct bkey *) && \ - !type_is(_k, struct bkey_packed *)); \ - type_is(_k, struct bkey_packed *); \ -}) - enum bkey_lr_packed { BKEY_PACKED_BOTH, BKEY_PACKED_RIGHT, @@ -81,9 +64,6 @@ enum bkey_lr_packed { BKEY_PACKED_NONE, }; -#define bkey_lr_packed_typecheck(_l, _r) \ - (!bkey_packed_typecheck(_l) + ((!bkey_packed_typecheck(_r)) << 1)) - #define bkey_lr_packed(_l, _r) \ ((_l)->format + ((_r)->format << 1)) @@ -132,9 +112,9 @@ int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *, const struct bpos *); __pure -int __bch2_bkey_cmp_packed(const struct bkey_packed *, - const struct bkey_packed *, - const struct btree *); +int bch2_bkey_cmp_packed(const struct btree *, + const struct bkey_packed *, + const struct bkey_packed *); __pure int __bch2_bkey_cmp_left_packed(const struct btree *, @@ -160,55 +140,58 @@ static inline int bkey_cmp_left_packed_byval(const struct btree *b, return bkey_cmp_left_packed(b, l, &r); } -/* - * If @_l or @_r are struct bkey * (not bkey_packed *), uses type information to - * skip dispatching on k->format: - */ -#define bkey_cmp_packed(_b, _l, _r) \ -({ \ - int _cmp; \ - \ - switch (bkey_lr_packed_typecheck(_l, _r)) { \ - case BKEY_PACKED_NONE: \ - _cmp = bkey_cmp(((struct bkey *) (_l))->p, \ - ((struct bkey *) (_r))->p); \ - break; \ - case BKEY_PACKED_LEFT: \ - _cmp = bkey_cmp_left_packed((_b), \ - (struct bkey_packed *) (_l), \ - &((struct bkey *) (_r))->p); \ - break; \ - case BKEY_PACKED_RIGHT: \ - _cmp = -bkey_cmp_left_packed((_b), \ - (struct bkey_packed *) (_r), \ - &((struct bkey *) (_l))->p); \ - break; \ - case BKEY_PACKED_BOTH: \ - _cmp = __bch2_bkey_cmp_packed((void *) (_l), \ - (void *) (_r), (_b)); \ - break; \ - } \ - _cmp; \ -}) - -#if 1 +static __always_inline int bpos_cmp(struct bpos l, struct bpos r) +{ + return cmp_int(l.inode, r.inode) ?: + cmp_int(l.offset, r.offset) ?: + cmp_int(l.snapshot, r.snapshot); +} + static __always_inline int bkey_cmp(struct bpos l, struct bpos r) { - if (l.inode != r.inode) - return l.inode < r.inode ? -1 : 1; - if (l.offset != r.offset) - return l.offset < r.offset ? -1 : 1; - if (l.snapshot != r.snapshot) - return l.snapshot < r.snapshot ? 
-1 : 1; - return 0; + return cmp_int(l.inode, r.inode) ?: + cmp_int(l.offset, r.offset); } -#else -int bkey_cmp(struct bpos l, struct bpos r); -#endif static inline struct bpos bpos_min(struct bpos l, struct bpos r) { - return bkey_cmp(l, r) < 0 ? l : r; + return bpos_cmp(l, r) < 0 ? l : r; +} + +static inline struct bpos bpos_max(struct bpos l, struct bpos r) +{ + return bpos_cmp(l, r) > 0 ? l : r; +} + +#define sbb(a, b, borrow) \ +do { \ + typeof(a) d1, d2; \ + \ + d1 = a - borrow; \ + borrow = d1 > a; \ + \ + d2 = d1 - b; \ + borrow += d2 > d1; \ + a = d2; \ +} while (0) + +/* returns a - b: */ +static inline struct bpos bpos_sub(struct bpos a, struct bpos b) +{ + int borrow = 0; + + sbb(a.snapshot, b.snapshot, borrow); + sbb(a.offset, b.offset, borrow); + sbb(a.inode, b.inode, borrow); + return a; +} + +static inline struct bpos bpos_diff(struct bpos l, struct bpos r) +{ + if (bpos_cmp(l, r) > 0) + swap(l, r); + + return bpos_sub(r, l); } void bch2_bpos_swab(struct bpos *); @@ -267,24 +250,46 @@ static inline unsigned bkey_format_key_bits(const struct bkey_format *format) format->bits_per_field[BKEY_FIELD_SNAPSHOT]; } -static inline struct bpos bkey_successor(struct bpos p) +static inline struct bpos bpos_successor(struct bpos p) { - struct bpos ret = p; + if (!++p.snapshot && + !++p.offset && + !++p.inode) + BUG(); - if (!++ret.offset) - BUG_ON(!++ret.inode); + return p; +} - return ret; +static inline struct bpos bpos_predecessor(struct bpos p) +{ + if (!p.snapshot-- && + !p.offset-- && + !p.inode--) + BUG(); + + return p; } -static inline struct bpos bkey_predecessor(struct bpos p) +static inline struct bpos bpos_nosnap_successor(struct bpos p) { - struct bpos ret = p; + p.snapshot = 0; - if (!ret.offset--) - BUG_ON(!ret.inode--); + if (!++p.offset && + !++p.inode) + BUG(); - return ret; + return p; +} + +static inline struct bpos bpos_nosnap_predecessor(struct bpos p) +{ + p.snapshot = 0; + + if (!p.offset-- && + !p.inode--) + BUG(); + + return p; } static inline u64 bkey_start_offset(const struct bkey *k) @@ -439,7 +444,7 @@ static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion * functions. */ -#define BKEY_VAL_ACCESSORS(name) \ +#define x(name, ...) 
\ struct bkey_i_##name { \ union { \ struct bkey k; \ @@ -550,22 +555,8 @@ static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ return k; \ } -BKEY_VAL_ACCESSORS(cookie); -BKEY_VAL_ACCESSORS(btree_ptr); -BKEY_VAL_ACCESSORS(extent); -BKEY_VAL_ACCESSORS(reservation); -BKEY_VAL_ACCESSORS(inode); -BKEY_VAL_ACCESSORS(inode_generation); -BKEY_VAL_ACCESSORS(dirent); -BKEY_VAL_ACCESSORS(xattr); -BKEY_VAL_ACCESSORS(alloc); -BKEY_VAL_ACCESSORS(quota); -BKEY_VAL_ACCESSORS(stripe); -BKEY_VAL_ACCESSORS(reflink_p); -BKEY_VAL_ACCESSORS(reflink_v); -BKEY_VAL_ACCESSORS(inline_data); -BKEY_VAL_ACCESSORS(btree_ptr_v2); -BKEY_VAL_ACCESSORS(indirect_inline_data); +BCH_BKEY_TYPES(); +#undef x /* byte order helpers */ diff --git a/libbcachefs/bkey_buf.h b/libbcachefs/bkey_buf.h new file mode 100644 index 0000000..0d7c67a --- /dev/null +++ b/libbcachefs/bkey_buf.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_BUF_H +#define _BCACHEFS_BKEY_BUF_H + +#include "bcachefs.h" + +struct bkey_buf { + struct bkey_i *k; + u64 onstack[12]; +}; + +static inline void bch2_bkey_buf_realloc(struct bkey_buf *s, + struct bch_fs *c, unsigned u64s) +{ + if (s->k == (void *) s->onstack && + u64s > ARRAY_SIZE(s->onstack)) { + s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS); + memcpy(s->k, s->onstack, sizeof(s->onstack)); + } +} + +static inline void bch2_bkey_buf_reassemble(struct bkey_buf *s, + struct bch_fs *c, + struct bkey_s_c k) +{ + bch2_bkey_buf_realloc(s, c, k.k->u64s); + bkey_reassemble(s->k, k); +} + +static inline void bch2_bkey_buf_copy(struct bkey_buf *s, + struct bch_fs *c, + struct bkey_i *src) +{ + bch2_bkey_buf_realloc(s, c, src->k.u64s); + bkey_copy(s->k, src); +} + +static inline void bch2_bkey_buf_unpack(struct bkey_buf *s, + struct bch_fs *c, + struct btree *b, + struct bkey_packed *src) +{ + bch2_bkey_buf_realloc(s, c, BKEY_U64s + + bkeyp_val_u64s(&b->format, src)); + bch2_bkey_unpack(b, s->k, src); +} + +static inline void bch2_bkey_buf_init(struct bkey_buf *s) +{ + s->k = (void *) s->onstack; +} + +static inline void bch2_bkey_buf_exit(struct bkey_buf *s, struct bch_fs *c) +{ + if (s->k != (void *) s->onstack) + mempool_free(s->k, &c->large_bkey_pool); + s->k = NULL; +} + +#endif /* _BCACHEFS_BKEY_BUF_H */ diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c index 3284922..6fe95b8 100644 --- a/libbcachefs/bkey_methods.c +++ b/libbcachefs/bkey_methods.c @@ -59,7 +59,7 @@ static const char *key_type_cookie_invalid(const struct bch_fs *c, .key_invalid = key_type_cookie_invalid, \ } -#define bch2_bkey_ops_whiteout (struct bkey_ops) { \ +#define bch2_bkey_ops_hash_whiteout (struct bkey_ops) { \ .key_invalid = empty_val_key_invalid, \ } @@ -104,7 +104,7 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, if (k.k->u64s < BKEY_U64s) return "u64s too small"; - if (type == BKEY_TYPE_BTREE && + if (type == BKEY_TYPE_btree && bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) return "value too big"; @@ -119,10 +119,17 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, return "nonzero size field"; } - if (k.k->p.snapshot) + if (type != BKEY_TYPE_btree && + !btree_type_has_snapshots(type) && + k.k->p.snapshot) return "nonzero snapshot"; - if (type != BKEY_TYPE_BTREE && + if (type != BKEY_TYPE_btree && + btree_type_has_snapshots(type) && + k.k->p.snapshot != U32_MAX) + return "invalid snapshot field"; + + if (type != BKEY_TYPE_btree && !bkey_cmp(k.k->p, POS_MAX)) return "POS_MAX key"; @@ -138,10 +145,10 @@ 
const char *bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k) { - if (bkey_cmp(k.k->p, b->data->min_key) < 0) + if (bpos_cmp(k.k->p, b->data->min_key) < 0) return "key before start of btree node"; - if (bkey_cmp(k.k->p, b->data->max_key) > 0) + if (bpos_cmp(k.k->p, b->data->max_key) > 0) return "key past end of btree node"; return NULL; @@ -149,7 +156,6 @@ const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k) void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) { - const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; const char *invalid; BUG_ON(!k.k->u64s); @@ -161,33 +167,46 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) bch2_bkey_val_to_text(&PBUF(buf), c, k); bch2_fs_inconsistent(c, "invalid bkey %s: %s", buf, invalid); - return; } - - if (ops->key_debugcheck) - ops->key_debugcheck(c, k); } void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) { - if (!bkey_cmp(pos, POS_MIN)) + if (!bpos_cmp(pos, POS_MIN)) pr_buf(out, "POS_MIN"); - else if (!bkey_cmp(pos, POS_MAX)) + else if (!bpos_cmp(pos, POS_MAX)) pr_buf(out, "POS_MAX"); - else - pr_buf(out, "%llu:%llu", pos.inode, pos.offset); + else { + if (pos.inode == U64_MAX) + pr_buf(out, "U64_MAX"); + else + pr_buf(out, "%llu", pos.inode); + pr_buf(out, ":"); + if (pos.offset == U64_MAX) + pr_buf(out, "U64_MAX"); + else + pr_buf(out, "%llu", pos.offset); + pr_buf(out, ":"); + if (pos.snapshot == U32_MAX) + pr_buf(out, "U32_MAX"); + else + pr_buf(out, "%u", pos.snapshot); + } } void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) { if (k) { - pr_buf(out, "u64s %u type %s ", k->u64s, - bch2_bkey_types[k->type]); + pr_buf(out, "u64s %u type ", k->u64s); + + if (k->type < KEY_TYPE_MAX) + pr_buf(out, "%s ", bch2_bkey_types[k->type]); + else + pr_buf(out, "%u ", k->type); bch2_bpos_to_text(out, k->p); - pr_buf(out, " snap %u len %u ver %llu", - k->p.snapshot, k->size, k->version.lo); + pr_buf(out, " len %u ver %llu", k->size, k->version.lo); } else { pr_buf(out, "(null)"); } @@ -196,10 +215,14 @@ void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) void bch2_val_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; + if (k.k->type < KEY_TYPE_MAX) { + const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; - if (likely(ops->val_to_text)) - ops->val_to_text(out, c, k); + if (likely(ops->val_to_text)) + ops->val_to_text(out, c, k); + } else { + pr_buf(out, "(invalid type %u)", k.k->type); + } } void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, @@ -236,11 +259,11 @@ enum merge_result bch2_bkey_merge(struct bch_fs *c, const struct bkey_ops *ops = &bch2_bkey_ops[l.k->type]; enum merge_result ret; - if (key_merging_disabled(c) || + if (bch2_key_merging_disabled || !ops->key_merge || l.k->type != r.k->type || bversion_cmp(l.k->version, r.k->version) || - bkey_cmp(l.k->p, bkey_start_pos(r.k))) + bpos_cmp(l.k->p, bkey_start_pos(r.k))) return BCH_MERGE_NOMERGE; ret = ops->key_merge(c, l, r); @@ -255,18 +278,18 @@ static const struct old_bkey_type { u8 old; u8 new; } bkey_renumber_table[] = { - {BKEY_TYPE_BTREE, 128, KEY_TYPE_btree_ptr }, - {BKEY_TYPE_EXTENTS, 128, KEY_TYPE_extent }, - {BKEY_TYPE_EXTENTS, 129, KEY_TYPE_extent }, - {BKEY_TYPE_EXTENTS, 130, KEY_TYPE_reservation }, - {BKEY_TYPE_INODES, 128, KEY_TYPE_inode }, - {BKEY_TYPE_INODES, 130, KEY_TYPE_inode_generation }, - 
{BKEY_TYPE_DIRENTS, 128, KEY_TYPE_dirent }, - {BKEY_TYPE_DIRENTS, 129, KEY_TYPE_whiteout }, - {BKEY_TYPE_XATTRS, 128, KEY_TYPE_xattr }, - {BKEY_TYPE_XATTRS, 129, KEY_TYPE_whiteout }, - {BKEY_TYPE_ALLOC, 128, KEY_TYPE_alloc }, - {BKEY_TYPE_QUOTAS, 128, KEY_TYPE_quota }, + {BKEY_TYPE_btree, 128, KEY_TYPE_btree_ptr }, + {BKEY_TYPE_extents, 128, KEY_TYPE_extent }, + {BKEY_TYPE_extents, 129, KEY_TYPE_extent }, + {BKEY_TYPE_extents, 130, KEY_TYPE_reservation }, + {BKEY_TYPE_inodes, 128, KEY_TYPE_inode }, + {BKEY_TYPE_inodes, 130, KEY_TYPE_inode_generation }, + {BKEY_TYPE_dirents, 128, KEY_TYPE_dirent }, + {BKEY_TYPE_dirents, 129, KEY_TYPE_hash_whiteout }, + {BKEY_TYPE_xattrs, 128, KEY_TYPE_xattr }, + {BKEY_TYPE_xattrs, 129, KEY_TYPE_hash_whiteout }, + {BKEY_TYPE_alloc, 128, KEY_TYPE_alloc }, + {BKEY_TYPE_quotas, 128, KEY_TYPE_quota }, }; void bch2_bkey_renumber(enum btree_node_type btree_node_type, @@ -294,14 +317,15 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, const struct bkey_ops *ops; struct bkey uk; struct bkey_s u; + unsigned nr_compat = 5; int i; /* * Do these operations in reverse order in the write path: */ - for (i = 0; i < 4; i++) - switch (!write ? i : 3 - i) { + for (i = 0; i < nr_compat; i++) + switch (!write ? i : nr_compat - 1 - i) { case 0: if (big_endian != CPU_BIG_ENDIAN) bch2_bkey_swab_key(f, k); @@ -312,7 +336,7 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, break; case 2: if (version < bcachefs_metadata_version_inode_btree_change && - btree_id == BTREE_ID_INODES) { + btree_id == BTREE_ID_inodes) { if (!bkey_packed(k)) { struct bkey_i *u = packed_to_bkey(k); swap(u->k.p.inode, u->k.p.offset); @@ -335,6 +359,28 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, } break; case 3: + if (version < bcachefs_metadata_version_snapshot && + (level || btree_type_has_snapshots(btree_id))) { + struct bkey_i *u = packed_to_bkey(k); + + if (u) { + u->k.p.snapshot = write + ? 0 : U32_MAX; + } else { + u64 min_packed = f->field_offset[BKEY_FIELD_SNAPSHOT]; + u64 max_packed = min_packed + + ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]); + + uk = __bch2_bkey_unpack_key(f, k); + uk.p.snapshot = write + ? 
min_packed : min_t(u64, U32_MAX, max_packed); + + BUG_ON(!bch2_bkey_pack_key(k, &uk, f)); + } + } + + break; + case 4: if (!bkey_packed(k)) { u = bkey_i_to_s(packed_to_bkey(k)); } else { diff --git a/libbcachefs/bkey_methods.h b/libbcachefs/bkey_methods.h index 0bca725..bfa6f11 100644 --- a/libbcachefs/bkey_methods.h +++ b/libbcachefs/bkey_methods.h @@ -26,7 +26,6 @@ struct bkey_ops { /* Returns reason for being invalid if invalid, else NULL: */ const char * (*key_invalid)(const struct bch_fs *, struct bkey_s_c); - void (*key_debugcheck)(struct bch_fs *, struct bkey_s_c); void (*val_to_text)(struct printbuf *, struct bch_fs *, struct bkey_s_c); void (*swab)(struct bkey_s); diff --git a/libbcachefs/bkey_on_stack.h b/libbcachefs/bkey_on_stack.h deleted file mode 100644 index f607a0c..0000000 --- a/libbcachefs/bkey_on_stack.h +++ /dev/null @@ -1,43 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BKEY_ON_STACK_H -#define _BCACHEFS_BKEY_ON_STACK_H - -#include "bcachefs.h" - -struct bkey_on_stack { - struct bkey_i *k; - u64 onstack[12]; -}; - -static inline void bkey_on_stack_realloc(struct bkey_on_stack *s, - struct bch_fs *c, unsigned u64s) -{ - if (s->k == (void *) s->onstack && - u64s > ARRAY_SIZE(s->onstack)) { - s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS); - memcpy(s->k, s->onstack, sizeof(s->onstack)); - } -} - -static inline void bkey_on_stack_reassemble(struct bkey_on_stack *s, - struct bch_fs *c, - struct bkey_s_c k) -{ - bkey_on_stack_realloc(s, c, k.k->u64s); - bkey_reassemble(s->k, k); -} - -static inline void bkey_on_stack_init(struct bkey_on_stack *s) -{ - s->k = (void *) s->onstack; -} - -static inline void bkey_on_stack_exit(struct bkey_on_stack *s, - struct bch_fs *c) -{ - if (s->k != (void *) s->onstack) - mempool_free(s->k, &c->large_bkey_pool); - s->k = NULL; -} - -#endif /* _BCACHEFS_BKEY_ON_STACK_H */ diff --git a/libbcachefs/bkey_sort.c b/libbcachefs/bkey_sort.c index 839e78d..537ab79 100644 --- a/libbcachefs/bkey_sort.c +++ b/libbcachefs/bkey_sort.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "bkey_on_stack.h" +#include "bkey_buf.h" #include "bkey_sort.h" #include "bset.h" #include "extents.h" @@ -14,9 +14,8 @@ static inline bool sort_iter_end(struct sort_iter *iter) return !iter->used; } -static inline void __sort_iter_sift(struct sort_iter *iter, - unsigned from, - sort_cmp_fn cmp) +static inline void sort_iter_sift(struct sort_iter *iter, unsigned from, + sort_cmp_fn cmp) { unsigned i; @@ -27,18 +26,12 @@ static inline void __sort_iter_sift(struct sort_iter *iter, swap(iter->data[i], iter->data[i + 1]); } -static inline void sort_iter_sift(struct sort_iter *iter, sort_cmp_fn cmp) -{ - - __sort_iter_sift(iter, 0, cmp); -} - static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp) { unsigned i = iter->used; while (i--) - __sort_iter_sift(iter, i, cmp); + sort_iter_sift(iter, i, cmp); } static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) @@ -46,26 +39,20 @@ static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) return !sort_iter_end(iter) ? 
iter->data->k : NULL; } -static inline void __sort_iter_advance(struct sort_iter *iter, - unsigned idx, sort_cmp_fn cmp) +static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) { - struct sort_iter_set *i = iter->data + idx; + struct sort_iter_set *i = iter->data; - BUG_ON(idx >= iter->used); + BUG_ON(!iter->used); - i->k = bkey_next_skip_noops(i->k, i->end); + i->k = bkey_next(i->k); BUG_ON(i->k > i->end); if (i->k == i->end) - array_remove_item(iter->data, iter->used, idx); + array_remove_item(iter->data, iter->used, 0); else - __sort_iter_sift(iter, idx, cmp); -} - -static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) -{ - __sort_iter_advance(iter, 0, cmp); + sort_iter_sift(iter, 0, cmp); } static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, @@ -86,7 +73,7 @@ static inline int key_sort_fix_overlapping_cmp(struct btree *b, struct bkey_packed *l, struct bkey_packed *r) { - return bkey_cmp_packed(b, l, r) ?: + return bch2_bkey_cmp_packed(b, l, r) ?: cmp_int((unsigned long) l, (unsigned long) r); } @@ -98,7 +85,7 @@ static inline bool should_drop_next_key(struct sort_iter *iter) * and should be dropped. */ return iter->used >= 2 && - !bkey_cmp_packed(iter->b, + !bch2_bkey_cmp_packed(iter->b, iter->data[0].k, iter->data[1].k); } @@ -116,7 +103,7 @@ bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, sort_iter_sort(iter, key_sort_fix_overlapping_cmp); while ((k = sort_iter_peek(iter))) { - if (!bkey_whiteout(k) && + if (!bkey_deleted(k) && !should_drop_next_key(iter)) { bkey_copy(out, k); btree_keys_account_key_add(&nr, 0, out); @@ -136,7 +123,7 @@ static void extent_sort_append(struct bch_fs *c, struct bkey_packed **out, struct bkey_s k) { - if (!bkey_whiteout(k.k)) { + if (!bkey_deleted(k.k)) { if (!bch2_bkey_pack_key(*out, k.k, f)) memcpy_u64s_small(*out, k.k, BKEY_U64s); @@ -161,7 +148,7 @@ bch2_sort_repack(struct bset *dst, struct btree *src, memset(&nr, 0, sizeof(nr)); while ((in = bch2_btree_node_iter_next_all(src_iter, src))) { - if (filter_whiteouts && bkey_whiteout(in)) + if (filter_whiteouts && bkey_deleted(in)) continue; if (bch2_bkey_transform(out_f, out, bkey_packed(in) @@ -187,14 +174,14 @@ bch2_sort_repack_merge(struct bch_fs *c, bool filter_whiteouts) { struct bkey_packed *out = vstruct_last(dst), *k_packed; - struct bkey_on_stack k; + struct bkey_buf k; struct btree_nr_keys nr; memset(&nr, 0, sizeof(nr)); - bkey_on_stack_init(&k); + bch2_bkey_buf_init(&k); while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) { - if (filter_whiteouts && bkey_whiteout(k_packed)) + if (filter_whiteouts && bkey_deleted(k_packed)) continue; /* @@ -204,7 +191,7 @@ bch2_sort_repack_merge(struct bch_fs *c, * node; we have to make a copy of the entire key before calling * normalize */ - bkey_on_stack_realloc(&k, c, k_packed->u64s + BKEY_U64s); + bch2_bkey_buf_realloc(&k, c, k_packed->u64s + BKEY_U64s); bch2_bkey_unpack(src, k.k, k_packed); if (filter_whiteouts && @@ -215,7 +202,7 @@ bch2_sort_repack_merge(struct bch_fs *c, } dst->u64s = cpu_to_le16((u64 *) out - dst->_data); - bkey_on_stack_exit(&k, c); + bch2_bkey_buf_exit(&k, c); return nr; } @@ -223,7 +210,7 @@ static inline int sort_keys_cmp(struct btree *b, struct bkey_packed *l, struct bkey_packed *r) { - return bkey_cmp_packed(b, l, r) ?: + return bch2_bkey_cmp_packed(b, l, r) ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: (int) l->needs_whiteout - (int) r->needs_whiteout; } @@ -240,19 +227,19 @@ unsigned bch2_sort_keys(struct bkey_packed *dst, 
while ((in = sort_iter_next(iter, sort_keys_cmp))) { bool needs_whiteout = false; - if (bkey_whiteout(in) && + if (bkey_deleted(in) && (filter_whiteouts || !in->needs_whiteout)) continue; while ((next = sort_iter_peek(iter)) && - !bkey_cmp_packed(iter->b, in, next)) { + !bch2_bkey_cmp_packed(iter->b, in, next)) { BUG_ON(in->needs_whiteout && next->needs_whiteout); needs_whiteout |= in->needs_whiteout; in = sort_iter_next(iter, sort_keys_cmp); } - if (bkey_whiteout(in)) { + if (bkey_deleted(in)) { memcpy_u64s(out, in, bkeyp_key_u64s(f, in)); set_bkeyp_val_u64s(f, out, 0); } else { @@ -264,252 +251,3 @@ unsigned bch2_sort_keys(struct bkey_packed *dst, return (u64 *) out - (u64 *) dst; } - -/* Compat code for btree_node_old_extent_overwrite: */ - -/* - * If keys compare equal, compare by pointer order: - * - * Necessary for sort_fix_overlapping() - if there are multiple keys that - * compare equal in different sets, we have to process them newest to oldest. - */ -static inline int extent_sort_fix_overlapping_cmp(struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) -{ - struct bkey ul = bkey_unpack_key(b, l); - struct bkey ur = bkey_unpack_key(b, r); - - return bkey_cmp(bkey_start_pos(&ul), - bkey_start_pos(&ur)) ?: - cmp_int((unsigned long) r, (unsigned long) l); -} - -/* - * The algorithm in extent_sort_fix_overlapping() relies on keys in the same - * bset being ordered by start offset - but 0 size whiteouts (which are always - * KEY_TYPE_deleted) break this ordering, so we need to skip over them: - */ -static void extent_iter_advance(struct sort_iter *iter, unsigned idx) -{ - struct sort_iter_set *i = iter->data + idx; - - do { - i->k = bkey_next_skip_noops(i->k, i->end); - } while (i->k != i->end && bkey_deleted(i->k)); - - if (i->k == i->end) - array_remove_item(iter->data, iter->used, idx); - else - __sort_iter_sift(iter, idx, extent_sort_fix_overlapping_cmp); -} - -struct btree_nr_keys -bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, - struct sort_iter *iter) -{ - struct btree *b = iter->b; - struct bkey_format *f = &b->format; - struct sort_iter_set *_l = iter->data, *_r = iter->data + 1; - struct bkey_packed *out = dst->start; - struct bkey l_unpacked, r_unpacked; - struct bkey_s l, r; - struct btree_nr_keys nr; - struct bkey_on_stack split; - unsigned i; - - memset(&nr, 0, sizeof(nr)); - bkey_on_stack_init(&split); - - sort_iter_sort(iter, extent_sort_fix_overlapping_cmp); - for (i = 0; i < iter->used;) { - if (bkey_deleted(iter->data[i].k)) - __sort_iter_advance(iter, i, - extent_sort_fix_overlapping_cmp); - else - i++; - } - - while (!sort_iter_end(iter)) { - l = __bkey_disassemble(b, _l->k, &l_unpacked); - - if (iter->used == 1) { - extent_sort_append(c, f, &nr, &out, l); - extent_iter_advance(iter, 0); - continue; - } - - r = __bkey_disassemble(b, _r->k, &r_unpacked); - - /* If current key and next key don't overlap, just append */ - if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) { - extent_sort_append(c, f, &nr, &out, l); - extent_iter_advance(iter, 0); - continue; - } - - /* Skip 0 size keys */ - if (!r.k->size) { - extent_iter_advance(iter, 1); - continue; - } - - /* - * overlap: keep the newer key and trim the older key so they - * don't overlap. comparing pointers tells us which one is - * newer, since the bsets are appended one after the other. 
- */ - - /* can't happen because of comparison func */ - BUG_ON(_l->k < _r->k && - !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k))); - - if (_l->k > _r->k) { - /* l wins, trim r */ - if (bkey_cmp(l.k->p, r.k->p) >= 0) { - extent_iter_advance(iter, 1); - } else { - bch2_cut_front_s(l.k->p, r); - extent_save(b, _r->k, r.k); - __sort_iter_sift(iter, 1, - extent_sort_fix_overlapping_cmp); - } - } else if (bkey_cmp(l.k->p, r.k->p) > 0) { - - /* - * r wins, but it overlaps in the middle of l - split l: - */ - bkey_on_stack_reassemble(&split, c, l.s_c); - bch2_cut_back(bkey_start_pos(r.k), split.k); - - bch2_cut_front_s(r.k->p, l); - extent_save(b, _l->k, l.k); - - __sort_iter_sift(iter, 0, - extent_sort_fix_overlapping_cmp); - - extent_sort_append(c, f, &nr, &out, - bkey_i_to_s(split.k)); - } else { - bch2_cut_back_s(bkey_start_pos(r.k), l); - extent_save(b, _l->k, l.k); - } - } - - dst->u64s = cpu_to_le16((u64 *) out - dst->_data); - - bkey_on_stack_exit(&split, c); - return nr; -} - -static inline int sort_extents_cmp(struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) -{ - return bkey_cmp_packed(b, l, r) ?: - (int) bkey_deleted(l) - (int) bkey_deleted(r); -} - -unsigned bch2_sort_extents(struct bkey_packed *dst, - struct sort_iter *iter, - bool filter_whiteouts) -{ - struct bkey_packed *in, *out = dst; - - sort_iter_sort(iter, sort_extents_cmp); - - while ((in = sort_iter_next(iter, sort_extents_cmp))) { - if (bkey_deleted(in)) - continue; - - if (bkey_whiteout(in) && - (filter_whiteouts || !in->needs_whiteout)) - continue; - - bkey_copy(out, in); - out = bkey_next(out); - } - - return (u64 *) out - (u64 *) dst; -} - -static inline int sort_extent_whiteouts_cmp(struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) -{ - struct bkey ul = bkey_unpack_key(b, l); - struct bkey ur = bkey_unpack_key(b, r); - - return bkey_cmp(bkey_start_pos(&ul), bkey_start_pos(&ur)); -} - -unsigned bch2_sort_extent_whiteouts(struct bkey_packed *dst, - struct sort_iter *iter) -{ - const struct bkey_format *f = &iter->b->format; - struct bkey_packed *in, *out = dst; - struct bkey_i l, r; - bool prev = false, l_packed = false; - u64 max_packed_size = bkey_field_max(f, BKEY_FIELD_SIZE); - u64 max_packed_offset = bkey_field_max(f, BKEY_FIELD_OFFSET); - u64 new_size; - - max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX); - - sort_iter_sort(iter, sort_extent_whiteouts_cmp); - - while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) { - if (bkey_deleted(in)) - continue; - - EBUG_ON(bkeyp_val_u64s(f, in)); - EBUG_ON(in->type != KEY_TYPE_discard); - - r.k = bkey_unpack_key(iter->b, in); - - if (prev && - bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) { - if (bkey_cmp(l.k.p, r.k.p) >= 0) - continue; - - new_size = l_packed - ? 
min(max_packed_size, max_packed_offset - - bkey_start_offset(&l.k)) - : KEY_SIZE_MAX; - - new_size = min(new_size, r.k.p.offset - - bkey_start_offset(&l.k)); - - BUG_ON(new_size < l.k.size); - - bch2_key_resize(&l.k, new_size); - - if (bkey_cmp(l.k.p, r.k.p) >= 0) - continue; - - bch2_cut_front(l.k.p, &r); - } - - if (prev) { - if (!bch2_bkey_pack(out, &l, f)) { - BUG_ON(l_packed); - bkey_copy(out, &l); - } - out = bkey_next(out); - } - - l = r; - prev = true; - l_packed = bkey_packed(in); - } - - if (prev) { - if (!bch2_bkey_pack(out, &l, f)) { - BUG_ON(l_packed); - bkey_copy(out, &l); - } - out = bkey_next(out); - } - - return (u64 *) out - (u64 *) dst; -} diff --git a/libbcachefs/bkey_sort.h b/libbcachefs/bkey_sort.h index 458a051..1059996 100644 --- a/libbcachefs/bkey_sort.h +++ b/libbcachefs/bkey_sort.h @@ -32,9 +32,6 @@ static inline void sort_iter_add(struct sort_iter *iter, struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bch_fs *, struct bset *, struct sort_iter *); -struct btree_nr_keys -bch2_extent_sort_fix_overlapping(struct bch_fs *, struct bset *, - struct sort_iter *); struct btree_nr_keys bch2_sort_repack(struct bset *, struct btree *, @@ -48,10 +45,5 @@ bch2_sort_repack_merge(struct bch_fs *, unsigned bch2_sort_keys(struct bkey_packed *, struct sort_iter *, bool); -unsigned bch2_sort_extents(struct bkey_packed *, - struct sort_iter *, bool); - -unsigned bch2_sort_extent_whiteouts(struct bkey_packed *, - struct sort_iter *); #endif /* _BCACHEFS_BKEY_SORT_H */ diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c index f7c2841..f92a757 100644 --- a/libbcachefs/bset.c +++ b/libbcachefs/bset.c @@ -78,7 +78,7 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, for (_k = i->start; _k < vstruct_last(i); _k = _n) { - _n = bkey_next_skip_noops(_k, vstruct_last(i)); + _n = bkey_next(_k); k = bkey_disassemble(b, _k, &uk); if (c) @@ -93,13 +93,13 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, n = bkey_unpack_key(b, _n); - if (bkey_cmp(bkey_start_pos(&n), k.k->p) < 0) { + if (bpos_cmp(n.p, k.k->p) < 0) { printk(KERN_ERR "Key skipped backwards\n"); continue; } if (!bkey_deleted(k.k) && - !bkey_cmp(n.p, k.k->p)) + !bpos_cmp(n.p, k.k->p)) printk(KERN_ERR "Duplicate keys\n"); } } @@ -144,7 +144,7 @@ void __bch2_verify_btree_nr_keys(struct btree *b) for_each_bset(b, t) bset_tree_for_each_key(b, t, k) - if (!bkey_whiteout(k)) + if (!bkey_deleted(k)) btree_keys_account_key_add(&nr, t - b->set, k); BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); @@ -369,10 +369,10 @@ static struct bkey_float *bkey_float(const struct btree *b, return ro_aux_tree_base(b, t)->f + idx; } -static void bset_aux_tree_verify(struct btree *b) +static void bset_aux_tree_verify(const struct btree *b) { #ifdef CONFIG_BCACHEFS_DEBUG - struct bset_tree *t; + const struct bset_tree *t; for_each_bset(b, t) { if (t->aux_data_offset == U16_MAX) @@ -388,15 +388,13 @@ static void bset_aux_tree_verify(struct btree *b) #endif } -void bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks) +void bch2_btree_keys_init(struct btree *b) { unsigned i; b->nsets = 0; memset(&b->nr, 0, sizeof(b->nr)); -#ifdef CONFIG_BCACHEFS_DEBUG - b->expensive_debug_checks = expensive_debug_checks; -#endif + for (i = 0; i < MAX_BSETS; i++) b->set[i].data_offset = U16_MAX; @@ -522,7 +520,7 @@ static void bch2_bset_verify_rw_aux_tree(struct btree *b, struct bkey_packed *k = btree_bkey_first(b, t); unsigned j = 0; - if (!btree_keys_expensive_checks(b)) + if (!bch2_expensive_debug_checks) return; BUG_ON(bset_has_ro_aux_tree(t)); 
@@ -536,7 +534,7 @@ static void bch2_bset_verify_rw_aux_tree(struct btree *b, goto start; while (1) { if (rw_aux_to_bkey(b, t, j) == k) { - BUG_ON(bkey_cmp(rw_aux_tree(b, t)[j].k, + BUG_ON(bpos_cmp(rw_aux_tree(b, t)[j].k, bkey_unpack_pos(b, k))); start: if (++j == t->size) @@ -546,7 +544,7 @@ start: rw_aux_tree(b, t)[j - 1].offset); } - k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); + k = bkey_next(k); BUG_ON(k >= btree_bkey_last(b, t)); } } @@ -606,53 +604,23 @@ static inline unsigned bkey_mantissa(const struct bkey_packed *k, return (u16) v; } -static void make_bfloat(struct btree *b, struct bset_tree *t, - unsigned j, - struct bkey_packed *min_key, - struct bkey_packed *max_key) +__always_inline +static inline void __make_bfloat(struct btree *b, struct bset_tree *t, + unsigned j, + struct bkey_packed *min_key, + struct bkey_packed *max_key) { struct bkey_float *f = bkey_float(b, t, j); struct bkey_packed *m = tree_to_bkey(b, t, j); - struct bkey_packed *l, *r; + struct bkey_packed *l = is_power_of_2(j) + ? min_key + : tree_to_prev_bkey(b, t, j >> ffs(j)); + struct bkey_packed *r = is_power_of_2(j + 1) + ? max_key + : tree_to_bkey(b, t, j >> (ffz(j) + 1)); unsigned mantissa; int shift, exponent, high_bit; - if (is_power_of_2(j)) { - l = min_key; - - if (!l->u64s) { - if (!bkey_pack_pos(l, b->data->min_key, b)) { - struct bkey_i tmp; - - bkey_init(&tmp.k); - tmp.k.p = b->data->min_key; - bkey_copy(l, &tmp); - } - } - } else { - l = tree_to_prev_bkey(b, t, j >> ffs(j)); - - EBUG_ON(m < l); - } - - if (is_power_of_2(j + 1)) { - r = max_key; - - if (!r->u64s) { - if (!bkey_pack_pos(r, t->max_key, b)) { - struct bkey_i tmp; - - bkey_init(&tmp.k); - tmp.k.p = t->max_key; - bkey_copy(r, &tmp); - } - } - } else { - r = tree_to_bkey(b, t, j >> (ffz(j) + 1)); - - EBUG_ON(m > r); - } - /* * for failed bfloats, the lookup code falls back to comparing against * the original key. 
@@ -709,26 +677,54 @@ static void make_bfloat(struct btree *b, struct bset_tree *t, f->mantissa = mantissa; } +static void make_bfloat(struct btree *b, struct bset_tree *t, + unsigned j, + struct bkey_packed *min_key, + struct bkey_packed *max_key) +{ + struct bkey_i *k; + + if (is_power_of_2(j) && + !min_key->u64s) { + if (!bkey_pack_pos(min_key, b->data->min_key, b)) { + k = (void *) min_key; + bkey_init(&k->k); + k->k.p = b->data->min_key; + } + } + + if (is_power_of_2(j + 1) && + !max_key->u64s) { + if (!bkey_pack_pos(max_key, b->data->max_key, b)) { + k = (void *) max_key; + bkey_init(&k->k); + k->k.p = b->data->max_key; + } + } + + __make_bfloat(b, t, j, min_key, max_key); +} + /* bytes remaining - only valid for last bset: */ -static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t) +static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t) { bset_aux_tree_verify(b); return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64); } -static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t) +static unsigned bset_ro_tree_capacity(const struct btree *b, const struct bset_tree *t) { return __bset_tree_capacity(b, t) / (sizeof(struct bkey_float) + sizeof(u8)); } -static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t) +static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_tree *t) { return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree); } -static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) +static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) { struct bkey_packed *k; @@ -747,15 +743,12 @@ static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) } } -static void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) +static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) { struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t); - struct bkey_packed min_key, max_key; + struct bkey_i min_key, max_key; unsigned j, cacheline = 1; - /* signal to make_bfloat() that they're uninitialized: */ - min_key.u64s = max_key.u64s = 0; - t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)), bset_ro_tree_capacity(b, t)); retry: @@ -770,7 +763,7 @@ retry: /* First we figure out where the first key in each cacheline is */ eytzinger1_for_each(j, t->size) { while (bkey_to_cacheline(b, t, k) < cacheline) - prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); + prev = k, k = bkey_next(k); if (k >= btree_bkey_last(b, t)) { /* XXX: this path sucks */ @@ -787,13 +780,23 @@ retry: } while (k != btree_bkey_last(b, t)) - prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); + prev = k, k = bkey_next(k); + + if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) { + bkey_init(&min_key.k); + min_key.k.p = b->data->min_key; + } - t->max_key = bkey_unpack_pos(b, prev); + if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) { + bkey_init(&max_key.k); + max_key.k.p = b->data->max_key; + } /* Then we build the tree */ eytzinger1_for_each(j, t->size) - make_bfloat(b, t, j, &min_key, &max_key); + __make_bfloat(b, t, j, + bkey_to_packed(&min_key), + bkey_to_packed(&max_key)); } static void bset_alloc_tree(struct btree *b, struct bset_tree *t) @@ -915,21 +918,21 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, struct bkey_packed *p, *i, *ret = NULL, *orig_k = k; while ((p = __bkey_prev(b, t, k)) && !ret) { - for (i = p; i != k; i = bkey_next_skip_noops(i, 
k)) + for (i = p; i != k; i = bkey_next(i)) if (i->type >= min_key_type) ret = i; k = p; } - if (btree_keys_expensive_checks(b)) { + if (bch2_expensive_debug_checks) { BUG_ON(ret >= orig_k); for (i = ret - ? bkey_next_skip_noops(ret, orig_k) + ? bkey_next(ret) : btree_bkey_first(b, t); i != orig_k; - i = bkey_next_skip_noops(i, orig_k)) + i = bkey_next(i)) BUG_ON(i->type >= min_key_type); } @@ -964,9 +967,7 @@ static void ro_aux_tree_fix_invalidated_key(struct btree *b, /* signal to make_bfloat() that they're uninitialized: */ min_key.u64s = max_key.u64s = 0; - if (bkey_next_skip_noops(k, btree_bkey_last(b, t)) == btree_bkey_last(b, t)) { - t->max_key = bkey_unpack_pos(b, k); - + if (bkey_next(k) == btree_bkey_last(b, t)) { for (j = 1; j < t->size; j = j * 2 + 1) make_bfloat(b, t, j, &min_key, &max_key); } @@ -1088,7 +1089,7 @@ static void bch2_bset_fix_lookup_table(struct btree *b, struct bkey_packed *k = start; while (1) { - k = bkey_next_skip_noops(k, end); + k = bkey_next(k); if (k == end) break; @@ -1124,7 +1125,7 @@ void bch2_bset_insert(struct btree *b, if (bch2_bkey_pack_key(&packed, &insert->k, f)) src = &packed; - if (!bkey_whiteout(&insert->k)) + if (!bkey_deleted(&insert->k)) btree_keys_account_key_add(&b->nr, t - b->set, src); if (src->u64s != clobber_u64s) { @@ -1174,15 +1175,14 @@ void bch2_bset_delete(struct btree *b, __flatten static struct bkey_packed *bset_search_write_set(const struct btree *b, struct bset_tree *t, - struct bpos *search, - const struct bkey_packed *packed_search) + struct bpos *search) { unsigned l = 0, r = t->size; while (l + 1 != r) { unsigned m = (l + r) >> 1; - if (bkey_cmp(rw_aux_tree(b, t)[m].k, *search) < 0) + if (bpos_cmp(rw_aux_tree(b, t)[m].k, *search) < 0) l = m; else r = m; @@ -1227,8 +1227,8 @@ static inline bool bkey_mantissa_bits_dropped(const struct btree *b, __flatten static struct bkey_packed *bset_search_tree(const struct btree *b, - struct bset_tree *t, - struct bpos *search, + const struct bset_tree *t, + const struct bpos *search, const struct bkey_packed *packed_search) { struct ro_aux_tree *base = ro_aux_tree_base(b, t); @@ -1242,9 +1242,6 @@ static struct bkey_packed *bset_search_tree(const struct btree *b, prefetch(&base->f[n << 4]); f = &base->f[n]; - - if (!unlikely(packed_search)) - goto slowpath; if (unlikely(f->exponent >= BFLOAT_FAILED)) goto slowpath; @@ -1308,18 +1305,8 @@ struct bkey_packed *__bch2_bset_search(struct btree *b, case BSET_NO_AUX_TREE: return btree_bkey_first(b, t); case BSET_RW_AUX_TREE: - return bset_search_write_set(b, t, search, lossy_packed_search); + return bset_search_write_set(b, t, search); case BSET_RO_AUX_TREE: - /* - * Each node in the auxiliary search tree covers a certain range - * of bits, and keys above and below the set it covers might - * differ outside those bits - so we have to special case the - * start and end - handle that here: - */ - - if (bkey_cmp(*search, t->max_key) > 0) - return btree_bkey_last(b, t); - return bset_search_tree(b, t, search, lossy_packed_search); default: unreachable(); @@ -1338,14 +1325,14 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b, while (m != btree_bkey_last(b, t) && bkey_iter_cmp_p_or_unp(b, m, lossy_packed_search, search) < 0) - m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); + m = bkey_next(m); if (!packed_search) while (m != btree_bkey_last(b, t) && bkey_iter_pos_cmp(b, m, search) < 0) - m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); + m = bkey_next(m); - if (btree_keys_expensive_checks(b)) { + if 
(bch2_expensive_debug_checks) { struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); BUG_ON(prev && @@ -1356,23 +1343,6 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b, return m; } -/* - * Returns the first key greater than or equal to @search - */ -static __always_inline __flatten -struct bkey_packed *bch2_bset_search(struct btree *b, - struct bset_tree *t, - struct bpos *search, - struct bkey_packed *packed_search, - const struct bkey_packed *lossy_packed_search) -{ - struct bkey_packed *m = __bch2_bset_search(b, t, search, - lossy_packed_search); - - return bch2_bset_search_linear(b, t, search, - packed_search, lossy_packed_search, m); -} - /* Btree node iterator */ static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter, @@ -1407,16 +1377,15 @@ noinline __flatten __attribute__((cold)) static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, struct btree *b, struct bpos *search) { - struct bset_tree *t; + struct bkey_packed *k; trace_bkey_pack_pos_fail(search); - for_each_bset(b, t) - __bch2_btree_node_iter_push(iter, b, - bch2_bset_search(b, t, search, NULL, NULL), - btree_bkey_last(b, t)); + bch2_btree_node_iter_init_from_start(iter, b); - bch2_btree_node_iter_sort(iter, b); + while ((k = bch2_btree_node_iter_peek(iter, b)) && + bkey_iter_pos_cmp(b, k, search) < 0) + bch2_btree_node_iter_advance(iter, b); } /** @@ -1450,7 +1419,7 @@ static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, * to the search key is going to have 0 sectors after the search key. * * But this does mean that we can't just search for - * bkey_successor(start_of_range) to get the first extent that overlaps with + * bpos_successor(start_of_range) to get the first extent that overlaps with * the range we want - if we're unlucky and there's an extent that ends * exactly where we searched, then there could be a deleted key at the same * position and we'd get that when we search instead of the preceding extent @@ -1468,7 +1437,8 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter, struct bkey_packed *k[MAX_BSETS]; unsigned i; - EBUG_ON(bkey_cmp(*search, b->data->min_key) < 0); + EBUG_ON(bpos_cmp(*search, b->data->min_key) < 0); + EBUG_ON(bpos_cmp(*search, b->data->max_key) > 0); bset_aux_tree_verify(b); memset(iter, 0, sizeof(*iter)); @@ -1601,7 +1571,7 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, void bch2_btree_node_iter_advance(struct btree_node_iter *iter, struct btree *b) { - if (btree_keys_expensive_checks(b)) { + if (bch2_expensive_debug_checks) { bch2_btree_node_iter_verify(iter, b); bch2_btree_node_iter_next_check(iter, b); } @@ -1620,7 +1590,7 @@ struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, struct bset_tree *t; unsigned end = 0; - if (btree_keys_expensive_checks(b)) + if (bch2_expensive_debug_checks) bch2_btree_node_iter_verify(iter, b); for_each_bset(b, t) { @@ -1656,20 +1626,19 @@ found: iter->data[0].k = __btree_node_key_to_offset(b, prev); iter->data[0].end = end; - if (btree_keys_expensive_checks(b)) + if (bch2_expensive_debug_checks) bch2_btree_node_iter_verify(iter, b); return prev; } -struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *iter, - struct btree *b, - unsigned min_key_type) +struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *iter, + struct btree *b) { struct bkey_packed *prev; do { prev = bch2_btree_node_iter_prev_all(iter, b); - } while (prev && prev->type < min_key_type); + } while (prev 
&& bkey_deleted(prev)); return prev; } @@ -1734,9 +1703,10 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, uk = bkey_unpack_key(b, k); pr_buf(out, " failed unpacked at depth %u\n" - "\t%llu:%llu\n", - ilog2(j), - uk.p.inode, uk.p.offset); + "\t", + ilog2(j)); + bch2_bpos_to_text(out, uk.p); + pr_buf(out, "\n"); break; } } diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h index 5921cf6..506da4e 100644 --- a/libbcachefs/bset.h +++ b/libbcachefs/bset.h @@ -5,7 +5,7 @@ #include #include -#include "bcachefs_format.h" +#include "bcachefs.h" #include "bkey.h" #include "bkey_methods.h" #include "btree_types.h" @@ -147,17 +147,6 @@ * first key in that range of bytes again. */ -extern bool bch2_expensive_debug_checks; - -static inline bool btree_keys_expensive_checks(const struct btree *b) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - return bch2_expensive_debug_checks || *b->expensive_debug_checks; -#else - return false; -#endif -} - enum bset_aux_tree_type { BSET_NO_AUX_TREE, BSET_RO_AUX_TREE, @@ -201,17 +190,17 @@ static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree #define BSET_CACHELINE 128 -static inline size_t btree_keys_cachelines(struct btree *b) +static inline size_t btree_keys_cachelines(const struct btree *b) { return (1U << b->byte_order) / BSET_CACHELINE; } -static inline size_t btree_aux_data_bytes(struct btree *b) +static inline size_t btree_aux_data_bytes(const struct btree *b) { return btree_keys_cachelines(b) * 8; } -static inline size_t btree_aux_data_u64s(struct btree *b) +static inline size_t btree_aux_data_u64s(const struct btree *b) { return btree_aux_data_bytes(b) / sizeof(u64); } @@ -228,7 +217,7 @@ __bkey_unpack_key_format_checked(const struct btree *b, compiled_unpack_fn unpack_fn = b->aux_data; unpack_fn(dst, src); - if (btree_keys_expensive_checks(b)) { + if (bch2_expensive_debug_checks) { struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); @@ -316,7 +305,7 @@ static inline struct bkey_s __bkey_disassemble(struct btree *b, #define bset_tree_for_each_key(_b, _t, _k) \ for (_k = btree_bkey_first(_b, _t); \ _k != btree_bkey_last(_b, _t); \ - _k = bkey_next_skip_noops(_k, btree_bkey_last(_b, _t))) + _k = bkey_next(_k)) static inline bool bset_has_ro_aux_tree(struct bset_tree *t) { @@ -366,7 +355,7 @@ static inline struct bset *bset_next_set(struct btree *b, return ((void *) i) + round_up(vstruct_bytes(i), block_bytes); } -void bch2_btree_keys_init(struct btree *, bool *); +void bch2_btree_keys_init(struct btree *); void bch2_bset_init_first(struct btree *, struct bset *); void bch2_bset_init_next(struct bch_fs *, struct btree *, @@ -389,7 +378,7 @@ static inline int bkey_cmp_p_or_unp(const struct btree *b, EBUG_ON(r_packed && !bkey_packed(r_packed)); if (unlikely(!bkey_packed(l))) - return bkey_cmp(packed_to_bkey_c(l)->p, *r); + return bpos_cmp(packed_to_bkey_c(l)->p, *r); if (likely(r_packed)) return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b); @@ -411,25 +400,7 @@ bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k) static inline struct bkey_packed * bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k) { - return bch2_bkey_prev_filter(b, t, k, KEY_TYPE_discard + 1); -} - -enum bch_extent_overlap { - BCH_EXTENT_OVERLAP_ALL = 0, - BCH_EXTENT_OVERLAP_BACK = 1, - BCH_EXTENT_OVERLAP_FRONT = 2, - BCH_EXTENT_OVERLAP_MIDDLE = 3, -}; - -/* Returns how k overlaps with m */ -static inline enum bch_extent_overlap bch2_extent_overlap(const 
struct bkey *k, - const struct bkey *m) -{ - int cmp1 = bkey_cmp(k->p, m->p) < 0; - int cmp2 = bkey_cmp(bkey_start_pos(k), - bkey_start_pos(m)) > 0; - - return (cmp1 << 1) + cmp2; + return bch2_bkey_prev_filter(b, t, k, 1); } /* Btree key iteration */ @@ -477,7 +448,7 @@ static inline int bkey_iter_cmp(const struct btree *b, const struct bkey_packed *l, const struct bkey_packed *r) { - return bkey_cmp_packed(b, l, r) + return bch2_bkey_cmp_packed(b, l, r) ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: cmp_int(l, r); } @@ -517,33 +488,23 @@ __bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, } static inline struct bkey_packed * -bch2_btree_node_iter_peek_filter(struct btree_node_iter *iter, - struct btree *b, - unsigned min_key_type) +bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, struct btree *b) { - while (!bch2_btree_node_iter_end(iter)) { - struct bkey_packed *k = __bch2_btree_node_iter_peek_all(iter, b); - - if (k->type >= min_key_type) - return k; - - bch2_btree_node_iter_advance(iter, b); - } - - return NULL; -} - -static inline struct bkey_packed * -bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, - struct btree *b) -{ - return bch2_btree_node_iter_peek_filter(iter, b, 0); + return !bch2_btree_node_iter_end(iter) + ? __btree_node_offset_to_key(b, iter->data->k) + : NULL; } static inline struct bkey_packed * bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b) { - return bch2_btree_node_iter_peek_filter(iter, b, KEY_TYPE_discard + 1); + struct bkey_packed *k; + + while ((k = bch2_btree_node_iter_peek_all(iter, b)) && + bkey_deleted(k)) + bch2_btree_node_iter_advance(iter, b); + + return k; } static inline struct bkey_packed * @@ -559,14 +520,8 @@ bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b) struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *, struct btree *); -struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *, - struct btree *, unsigned); - -static inline struct bkey_packed * -bch2_btree_node_iter_prev(struct btree_node_iter *iter, struct btree *b) -{ - return bch2_btree_node_iter_prev_filter(iter, b, KEY_TYPE_discard + 1); -} +struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *, + struct btree *); struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *, struct btree *, @@ -654,7 +609,7 @@ static inline void bch2_verify_insert_pos(struct btree *b, static inline void bch2_verify_btree_nr_keys(struct btree *b) { - if (btree_keys_expensive_checks(b)) + if (bch2_debug_check_btree_accounting) __bch2_verify_btree_nr_keys(b); } diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index bb94fa2..1abc50f 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -1,23 +1,18 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey_buf.h" #include "btree_cache.h" #include "btree_io.h" #include "btree_iter.h" #include "btree_locking.h" #include "debug.h" +#include "error.h" #include #include #include -const char * const bch2_btree_ids[] = { -#define x(kwd, val, name) name, - BCH_BTREE_IDS() -#undef x - NULL -}; - void bch2_recalc_btree_reserve(struct bch_fs *c) { unsigned i, reserve = 16; @@ -151,6 +146,11 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, b->c.level = level; b->c.btree_id = id; + if (level) + six_lock_pcpu_alloc(&b->c.lock); + else + six_lock_pcpu_free_rcu(&b->c.lock); + mutex_lock(&bc->lock); ret = 
__bch2_btree_node_hash_insert(bc, b); if (!ret) @@ -211,7 +211,7 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) * - unless btree verify mode is enabled, since it runs out of * the post write cleanup: */ - if (verify_btree_ondisk(c)) + if (bch2_verify_btree_ondisk) bch2_btree_node_write(c, b, SIX_LOCK_intent); else __bch2_btree_node_write(c, b, SIX_LOCK_read); @@ -254,7 +254,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, unsigned long freed = 0; unsigned i, flags; - if (btree_shrinker_disabled(c)) + if (bch2_btree_shrinker_disabled) return SHRINK_STOP; /* Return -1 if we can't do anything right now */ @@ -328,9 +328,9 @@ restart: clear_btree_node_accessed(b); } - memalloc_nofs_restore(flags); mutex_unlock(&bc->lock); out: + memalloc_nofs_restore(flags); return (unsigned long) freed * btree_pages(c); } @@ -341,7 +341,7 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink, btree_cache.shrink); struct btree_cache *bc = &c->btree_cache; - if (btree_shrinker_disabled(c)) + if (bch2_btree_shrinker_disabled) return 0; return btree_cache_can_free(bc) * btree_pages(c); @@ -381,14 +381,17 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) if (btree_node_dirty(b)) bch2_btree_complete_write(c, b, btree_current_write(b)); - clear_btree_node_dirty(b); + clear_btree_node_dirty(c, b); btree_node_data_free(c, b); } + BUG_ON(atomic_read(&c->btree_cache.dirty)); + while (!list_empty(&bc->freed)) { b = list_first_entry(&bc->freed, struct btree, list); list_del(&b->list); + six_lock_pcpu_free(&b->c.lock); kfree(b); } @@ -445,7 +448,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) bc->shrink.scan_objects = bch2_btree_cache_scan; bc->shrink.seeks = 4; bc->shrink.batch = btree_pages(c) * 2; - register_shrinker(&bc->shrink); + ret = register_shrinker(&bc->shrink); out: pr_verbose_init(c->opts, "ret %i", ret); return ret; @@ -590,7 +593,7 @@ out: b->sib_u64s[0] = 0; b->sib_u64s[1] = 0; b->whiteout_u64s = 0; - bch2_btree_keys_init(b, &c->expensive_debug_checks); + bch2_btree_keys_init(b); bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], start_time); @@ -705,7 +708,8 @@ static int lock_node_check_fn(struct six_lock *lock, void *p) */ struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter, const struct bkey_i *k, unsigned level, - enum six_lock_type lock_type) + enum six_lock_type lock_type, + unsigned long trace_ip) { struct btree_cache *bc = &c->btree_cache; struct btree *b; @@ -767,7 +771,7 @@ lock_node: btree_node_unlock(iter, level + 1); if (!btree_node_lock(b, k->k.p, level, iter, lock_type, - lock_node_check_fn, (void *) k)) { + lock_node_check_fn, (void *) k, trace_ip)) { if (b->hash_val != btree_ptr_hash_val(k)) goto retry; return ERR_PTR(-EINTR); @@ -808,9 +812,12 @@ lock_node: return ERR_PTR(-EIO); } - EBUG_ON(b->c.btree_id != iter->btree_id || - BTREE_NODE_LEVEL(b->data) != level || - bkey_cmp(b->data->max_key, k->k.p)); + EBUG_ON(b->c.btree_id != iter->btree_id); + EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); + EBUG_ON(bpos_cmp(b->data->max_key, k->k.p)); + EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && + bpos_cmp(b->data->min_key, + bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)); return b; } @@ -818,7 +825,8 @@ lock_node: struct btree *bch2_btree_node_get_noiter(struct bch_fs *c, const struct bkey_i *k, enum btree_id btree_id, - unsigned level) + unsigned level, + bool nofill) { struct btree_cache *bc = &c->btree_cache; struct btree *b; @@ -833,6 +841,9 @@ struct btree 
*bch2_btree_node_get_noiter(struct bch_fs *c, retry: b = btree_cache_find(bc, k); if (unlikely(!b)) { + if (nofill) + goto out; + b = bch2_btree_node_fill(c, NULL, k, btree_id, level, SIX_LOCK_read, true); @@ -840,8 +851,12 @@ retry: if (!b) goto retry; + if (IS_ERR(b) && + !bch2_btree_cache_cannibalize_lock(c, NULL)) + goto retry; + if (IS_ERR(b)) - return b; + goto out; } else { lock_node: ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k); @@ -876,143 +891,36 @@ lock_node: if (unlikely(btree_node_read_error(b))) { six_unlock_read(&b->c.lock); - return ERR_PTR(-EIO); - } - - EBUG_ON(b->c.btree_id != btree_id || - BTREE_NODE_LEVEL(b->data) != level || - bkey_cmp(b->data->max_key, k->k.p)); - - return b; -} - -struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, - struct btree_iter *iter, - struct btree *b, - enum btree_node_sibling sib) -{ - struct btree_trans *trans = iter->trans; - struct btree *parent; - struct btree_node_iter node_iter; - struct bkey_packed *k; - BKEY_PADDED(k) tmp; - struct btree *ret = NULL; - unsigned level = b->c.level; - - parent = btree_iter_node(iter, level + 1); - if (!parent) - return NULL; - - /* - * There's a corner case where a btree_iter might have a node locked - * that is just outside its current pos - when - * bch2_btree_iter_set_pos_same_leaf() gets to the end of the node. - * - * But the lock ordering checks in __bch2_btree_node_lock() go off of - * iter->pos, not the node's key: so if the iterator is marked as - * needing to be traversed, we risk deadlock if we don't bail out here: - */ - if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) - return ERR_PTR(-EINTR); - - if (!bch2_btree_node_relock(iter, level + 1)) { - ret = ERR_PTR(-EINTR); + b = ERR_PTR(-EIO); goto out; } - node_iter = iter->l[parent->c.level].iter; - - k = bch2_btree_node_iter_peek_all(&node_iter, parent); - BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p)); - - k = sib == btree_prev_sib - ? 
bch2_btree_node_iter_prev(&node_iter, parent) - : (bch2_btree_node_iter_advance(&node_iter, parent), - bch2_btree_node_iter_peek(&node_iter, parent)); - if (!k) - goto out; - - bch2_bkey_unpack(parent, &tmp.k, k); - - ret = bch2_btree_node_get(c, iter, &tmp.k, level, - SIX_LOCK_intent); - - if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) { - struct btree_iter *linked; - - if (!bch2_btree_node_relock(iter, level + 1)) - goto out; - - /* - * We might have got -EINTR because trylock failed, and we're - * holding other locks that would cause us to deadlock: - */ - trans_for_each_iter(trans, linked) - if (btree_iter_cmp(iter, linked) < 0) - __bch2_btree_iter_unlock(linked); - - if (sib == btree_prev_sib) - btree_node_unlock(iter, level); - - ret = bch2_btree_node_get(c, iter, &tmp.k, level, - SIX_LOCK_intent); - - /* - * before btree_iter_relock() calls btree_iter_verify_locks(): - */ - if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) - btree_node_unlock(iter, level + 1); - - if (!bch2_btree_node_relock(iter, level)) { - btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); - - if (!IS_ERR(ret)) { - six_unlock_intent(&ret->c.lock); - ret = ERR_PTR(-EINTR); - } - } - - bch2_trans_relock(trans); - } + EBUG_ON(b->c.btree_id != btree_id); + EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); + EBUG_ON(bpos_cmp(b->data->max_key, k->k.p)); + EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && + bpos_cmp(b->data->min_key, + bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)); out: - if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) - btree_node_unlock(iter, level + 1); - - if (PTR_ERR_OR_ZERO(ret) == -EINTR) - bch2_btree_iter_upgrade(iter, level + 2); - - BUG_ON(!IS_ERR(ret) && !btree_node_locked(iter, level)); - - if (!IS_ERR_OR_NULL(ret)) { - struct btree *n1 = ret, *n2 = b; - - if (sib != btree_prev_sib) - swap(n1, n2); - - BUG_ON(bkey_cmp(bkey_successor(n1->key.k.p), - n2->data->min_key)); - } - - bch2_btree_trans_verify_locks(trans); - - return ret; + bch2_btree_cache_cannibalize_unlock(c); + return b; } void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, - const struct bkey_i *k, unsigned level) + const struct bkey_i *k, + enum btree_id btree_id, unsigned level) { struct btree_cache *bc = &c->btree_cache; struct btree *b; - BUG_ON(!btree_node_locked(iter, level + 1)); + BUG_ON(iter && !btree_node_locked(iter, level + 1)); BUG_ON(level >= BTREE_MAX_DEPTH); b = btree_cache_find(bc, k); if (b) return; - bch2_btree_node_fill(c, iter, k, iter->btree_id, - level, SIX_LOCK_read, false); + bch2_btree_node_fill(c, iter, k, btree_id, level, SIX_LOCK_read, false); } void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, @@ -1025,20 +933,19 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, bch2_btree_keys_stats(b, &stats); - pr_buf(out, - "l %u %llu:%llu - %llu:%llu:\n" - " ptrs: ", - b->c.level, - b->data->min_key.inode, - b->data->min_key.offset, - b->data->max_key.inode, - b->data->max_key.offset); + pr_buf(out, "l %u ", b->c.level); + bch2_bpos_to_text(out, b->data->min_key); + pr_buf(out, " - "); + bch2_bpos_to_text(out, b->data->max_key); + pr_buf(out, ":\n" + " ptrs: "); bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key)); + pr_buf(out, "\n" " format: u64s %u fields %u %u %u %u %u\n" " unpack fn len: %u\n" " bytes used %zu/%zu (%zu%% full)\n" - " sib u64s: %u, %u (merge threshold %zu)\n" + " sib u64s: %u, %u (merge threshold %u)\n" " nr packed keys %u\n" " nr unpacked keys %u\n" " floats %zu\n" @@ -1055,9 +962,16 @@ void 
bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, b->nr.live_u64s * 100 / btree_max_u64s(c), b->sib_u64s[0], b->sib_u64s[1], - BTREE_FOREGROUND_MERGE_THRESHOLD(c), + c->btree_foreground_merge_threshold, b->nr.packed_keys, b->nr.unpacked_keys, stats.floats, stats.failed); } + +void bch2_btree_cache_to_text(struct printbuf *out, struct bch_fs *c) +{ + pr_buf(out, "nr nodes:\t\t%u\n", c->btree_cache.used); + pr_buf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty)); + pr_buf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock); +} diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h index d0d3a85..4791c3b 100644 --- a/libbcachefs/btree_cache.h +++ b/libbcachefs/btree_cache.h @@ -7,8 +7,6 @@ struct btree_iter; -extern const char * const bch2_btree_ids[]; - void bch2_recalc_btree_reserve(struct bch_fs *); void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); @@ -23,16 +21,13 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *); struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, const struct bkey_i *, unsigned, - enum six_lock_type); + enum six_lock_type, unsigned long); struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, - enum btree_id, unsigned); - -struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *, - struct btree *, enum btree_node_sibling); + enum btree_id, unsigned, bool); void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *, - const struct bkey_i *, unsigned); + const struct bkey_i *, enum btree_id, unsigned); void bch2_fs_btree_cache_exit(struct bch_fs *); int bch2_fs_btree_cache_init(struct bch_fs *); @@ -94,11 +89,12 @@ static inline unsigned btree_blocks(struct bch_fs *c) #define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3) #define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \ - (BTREE_FOREGROUND_MERGE_THRESHOLD(c) << 2)) + (BTREE_FOREGROUND_MERGE_THRESHOLD(c) >> 2)) #define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->c.btree_id].b) void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, struct btree *); +void bch2_btree_cache_to_text(struct printbuf *, struct bch_fs *); #endif /* _BCACHEFS_BTREE_CACHE_H */ diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index e8c1e75..268e007 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -8,7 +8,7 @@ #include "alloc_background.h" #include "alloc_foreground.h" #include "bkey_methods.h" -#include "bkey_on_stack.h" +#include "bkey_buf.h" #include "btree_locking.h" #include "btree_update_interior.h" #include "btree_io.h" @@ -50,39 +50,248 @@ static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) __gc_pos_set(c, new_pos); } +/* + * Missing: if an interior btree node is empty, we need to do something - + * perhaps just kill it + */ static int bch2_gc_check_topology(struct bch_fs *c, - struct bkey_s_c k, - struct bpos *expected_start, - struct bpos expected_end, + struct btree *b, + struct bkey_buf *prev, + struct bkey_buf cur, bool is_last) { + struct bpos node_start = b->data->min_key; + struct bpos node_end = b->data->max_key; + struct bpos expected_start = bkey_deleted(&prev->k->k) + ? 
node_start + : bpos_successor(prev->k->k.p); + char buf1[200], buf2[200]; + bool update_min = false; + bool update_max = false; int ret = 0; - if (k.k->type == KEY_TYPE_btree_ptr_v2) { - struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); + if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) { + struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k); - if (fsck_err_on(bkey_cmp(*expected_start, bp.v->min_key), c, - "btree node with incorrect min_key: got %llu:%llu, should be %llu:%llu", - bp.v->min_key.inode, - bp.v->min_key.offset, - expected_start->inode, - expected_start->offset)) { - BUG(); + if (bkey_deleted(&prev->k->k)) { + struct printbuf out = PBUF(buf1); + pr_buf(&out, "start of node: "); + bch2_bpos_to_text(&out, node_start); + } else { + bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k)); } - } - *expected_start = bkey_cmp(k.k->p, POS_MAX) - ? bkey_successor(k.k->p) - : k.k->p; + if (fsck_err_on(bpos_cmp(expected_start, bp->v.min_key), c, + "btree node with incorrect min_key at btree %s level %u:\n" + " prev %s\n" + " cur %s", + bch2_btree_ids[b->c.btree_id], b->c.level, + buf1, + (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2))) + update_min = true; + } if (fsck_err_on(is_last && - bkey_cmp(k.k->p, expected_end), c, - "btree node with incorrect max_key: got %llu:%llu, should be %llu:%llu", - k.k->p.inode, - k.k->p.offset, - expected_end.inode, - expected_end.offset)) { - BUG(); + bpos_cmp(cur.k->k.p, node_end), c, + "btree node with incorrect max_key at btree %s level %u:\n" + " %s\n" + " expected %s", + bch2_btree_ids[b->c.btree_id], b->c.level, + (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1), + (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2))) + update_max = true; + + bch2_bkey_buf_copy(prev, c, cur.k); + + if (update_min || update_max) { + struct bkey_i *new; + struct bkey_i_btree_ptr_v2 *bp = NULL; + struct btree *n; + + if (update_max) { + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, cur.k->k.p); + if (ret) + return ret; + } + + new = kmalloc(bkey_bytes(&cur.k->k), GFP_KERNEL); + if (!new) { + bch_err(c, "%s: error allocating new key", __func__); + return -ENOMEM; + } + + bkey_copy(new, cur.k); + + if (new->k.type == KEY_TYPE_btree_ptr_v2) + bp = bkey_i_to_btree_ptr_v2(new); + + if (update_min) + bp->v.min_key = expected_start; + if (update_max) + new->k.p = node_end; + if (bp) + SET_BTREE_PTR_RANGE_UPDATED(&bp->v, true); + + ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level, new); + if (ret) { + kfree(new); + return ret; + } + + n = bch2_btree_node_get_noiter(c, cur.k, b->c.btree_id, + b->c.level - 1, true); + if (n) { + mutex_lock(&c->btree_cache.lock); + bch2_btree_node_hash_remove(&c->btree_cache, n); + + bkey_copy(&n->key, new); + if (update_min) + n->data->min_key = expected_start; + if (update_max) + n->data->max_key = node_end; + + ret = __bch2_btree_node_hash_insert(&c->btree_cache, n); + BUG_ON(ret); + mutex_unlock(&c->btree_cache.lock); + six_unlock_read(&n->c.lock); + } + } +fsck_err: + return ret; +} + +static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + unsigned level, bool is_root, + struct bkey_s_c *k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p = { 0 }; + bool do_update = false; + int ret = 0; + + bkey_for_each_ptr_decode(k->k, ptrs, p, entry) { + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct bucket *g = PTR_BUCKET(ca, &p.ptr, true); + struct bucket 
*g2 = PTR_BUCKET(ca, &p.ptr, false); + + if (fsck_err_on(!g->gen_valid, c, + "bucket %u:%zu data type %s ptr gen %u missing in alloc btree", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen)) { + if (p.ptr.cached) { + g2->_mark.gen = g->_mark.gen = p.ptr.gen; + g2->gen_valid = g->gen_valid = true; + set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); + } else { + do_update = true; + } + } + + if (fsck_err_on(gen_cmp(p.ptr.gen, g->mark.gen) > 0, c, + "bucket %u:%zu data type %s ptr gen in the future: %u > %u", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, g->mark.gen)) { + if (p.ptr.cached) { + g2->_mark.gen = g->_mark.gen = p.ptr.gen; + g2->gen_valid = g->gen_valid = true; + g2->_mark.data_type = 0; + g2->_mark.dirty_sectors = 0; + g2->_mark.cached_sectors = 0; + set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); + } else { + do_update = true; + } + } + + if (fsck_err_on(!p.ptr.cached && + gen_cmp(p.ptr.gen, g->mark.gen) < 0, c, + "bucket %u:%zu data type %s stale dirty ptr: %u < %u", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, g->mark.gen)) + do_update = true; + + if (p.has_ec) { + struct stripe *m = genradix_ptr(&c->stripes[true], p.ec.idx); + + if (fsck_err_on(!m || !m->alive, c, + "pointer to nonexistent stripe %llu", + (u64) p.ec.idx)) + do_update = true; + + if (fsck_err_on(!bch2_ptr_matches_stripe_m(m, p), c, + "pointer does not match stripe %llu", + (u64) p.ec.idx)) + do_update = true; + } + } + + if (do_update) { + struct bkey_ptrs ptrs; + union bch_extent_entry *entry; + struct bch_extent_ptr *ptr; + struct bkey_i *new; + + if (is_root) { + bch_err(c, "cannot update btree roots yet"); + return -EINVAL; + } + + new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); + if (!new) { + bch_err(c, "%s: error allocating new key", __func__); + return -ENOMEM; + } + + bkey_reassemble(new, *k); + + bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({ + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g = PTR_BUCKET(ca, ptr, true); + + (ptr->cached && + (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) || + (!ptr->cached && + gen_cmp(ptr->gen, g->mark.gen) < 0); + })); +again: + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + bkey_extent_entry_for_each(ptrs, entry) { + if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) { + struct stripe *m = genradix_ptr(&c->stripes[true], + entry->stripe_ptr.idx); + union bch_extent_entry *next_ptr; + + bkey_extent_entry_for_each_from(ptrs, next_ptr, entry) + if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr) + goto found; + next_ptr = NULL; +found: + if (!next_ptr) { + bch_err(c, "aieee, found stripe ptr with no data ptr"); + continue; + } + + if (!m || !m->alive || + !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block], + &next_ptr->ptr, + m->sectors)) { + bch2_bkey_extent_entry_drop(new, entry); + goto again; + } + } + } + + ret = bch2_journal_key_insert(c, btree_id, level, new); + if (ret) + kfree(new); + else + *k = bkey_i_to_s_c(new); } fsck_err: return ret; @@ -90,7 +299,9 @@ fsck_err: /* marking of btree keys/nodes: */ -static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, +static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, + unsigned level, bool is_root, + struct bkey_s_c k, u8 *max_stale, bool initial) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -101,10 +312,9 @@ static int 
bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, int ret = 0; if (initial) { - BUG_ON(journal_seq_verify(c) && + BUG_ON(bch2_journal_seq_verify && k.k->version.lo > journal_cur_seq(&c->journal)); - /* XXX change to fsck check */ if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c, "key version number higher than recorded: %llu > %llu", k.k->version.lo, @@ -116,37 +326,13 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, "superblock not marked as containing replicas (type %u)", k.k->type)) { ret = bch2_mark_bkey_replicas(c, k); - if (ret) - return ret; - } - - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr, true); - struct bucket *g2 = PTR_BUCKET(ca, ptr, false); - - if (mustfix_fsck_err_on(!g->gen_valid, c, - "bucket %u:%zu data type %s ptr gen %u missing in alloc btree", - ptr->dev, PTR_BUCKET_NR(ca, ptr), - bch2_data_types[ptr_data_type(k.k, ptr)], - ptr->gen)) { - g2->_mark.gen = g->_mark.gen = ptr->gen; - g2->gen_valid = g->gen_valid = true; - } - - if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c, - "bucket %u:%zu data type %s ptr gen in the future: %u > %u", - ptr->dev, PTR_BUCKET_NR(ca, ptr), - bch2_data_types[ptr_data_type(k.k, ptr)], - ptr->gen, g->mark.gen)) { - g2->_mark.gen = g->_mark.gen = ptr->gen; - g2->gen_valid = g->gen_valid = true; - g2->_mark.data_type = 0; - g2->_mark.dirty_sectors = 0; - g2->_mark.cached_sectors = 0; - set_bit(BCH_FS_FIXED_GENS, &c->flags); + if (ret) { + bch_err(c, "error marking bkey replicas: %i", ret); + goto err; } } + + ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, &k); } bkey_for_each_ptr(ptrs, ptr) { @@ -161,16 +347,19 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, bch2_mark_key(c, k, 0, k.k->size, NULL, 0, flags); fsck_err: +err: + if (ret) + bch_err(c, "%s: ret %i", __func__, ret); return ret; } static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, bool initial) { - struct bpos next_node_start = b->data->min_key; struct btree_node_iter iter; struct bkey unpacked; struct bkey_s_c k; + struct bkey_buf prev, cur; int ret = 0; *max_stale = 0; @@ -179,37 +368,40 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, return 0; bch2_btree_node_iter_init_from_start(&iter, b); + bch2_bkey_buf_init(&prev); + bch2_bkey_buf_init(&cur); + bkey_init(&prev.k->k); while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { - bch2_bkey_debugcheck(c, b, k); - - ret = bch2_gc_mark_key(c, k, max_stale, initial); + ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false, + k, max_stale, initial); if (ret) break; bch2_btree_node_iter_advance(&iter, b); if (b->c.level) { - ret = bch2_gc_check_topology(c, k, - &next_node_start, - b->data->max_key, + bch2_bkey_buf_reassemble(&cur, c, k); + + ret = bch2_gc_check_topology(c, b, &prev, cur, bch2_btree_node_iter_end(&iter)); if (ret) break; } } + bch2_bkey_buf_exit(&cur, c); + bch2_bkey_buf_exit(&prev, c); return ret; } static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, - bool initial, bool metadata_only) + bool initial) { struct btree_trans trans; struct btree_iter *iter; struct btree *b; - unsigned depth = metadata_only ? 1 - : expensive_debug_checks(c) ? 0 + unsigned depth = bch2_expensive_debug_checks ? 0 : !btree_node_type_needs_gc(btree_id) ? 
1 : 0; u8 max_stale = 0; @@ -233,11 +425,10 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, if (max_stale > 64) bch2_btree_node_rewrite(c, iter, b->data->keys.seq, - BTREE_INSERT_USE_RESERVE| BTREE_INSERT_NOWAIT| BTREE_INSERT_GC_LOCK_HELD); - else if (!btree_gc_rewrite_disabled(c) && - (btree_gc_always_rewrite(c) || max_stale > 16)) + else if (!bch2_btree_gc_rewrite_disabled && + (bch2_btree_gc_always_rewrite || max_stale > 16)) bch2_btree_node_rewrite(c, iter, b->data->keys.seq, BTREE_INSERT_NOWAIT| @@ -246,6 +437,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, bch2_trans_cond_resched(&trans); } + bch2_trans_iter_put(&trans, iter); + ret = bch2_trans_exit(&trans) ?: ret; if (ret) return ret; @@ -253,7 +446,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, mutex_lock(&c->btree_root_lock); b = c->btree_roots[btree_id].b; if (!btree_node_fake(b)) - ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), + ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true, + bkey_i_to_s_c(&b->key), &max_stale, initial); gc_pos_set(c, gc_pos_btree_root(b->c.btree_id)); mutex_unlock(&c->btree_root_lock); @@ -262,76 +456,102 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, } static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, - struct journal_keys *journal_keys, unsigned target_depth) { struct btree_and_journal_iter iter; struct bkey_s_c k; - struct bpos next_node_start = b->data->min_key; + struct bkey_buf cur, prev; u8 max_stale = 0; int ret = 0; - bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); + bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + bch2_bkey_buf_init(&prev); + bch2_bkey_buf_init(&cur); + bkey_init(&prev.k->k); while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - bch2_bkey_debugcheck(c, b, k); - - BUG_ON(bkey_cmp(k.k->p, b->data->min_key) < 0); - BUG_ON(bkey_cmp(k.k->p, b->data->max_key) > 0); + BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0); + BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0); - ret = bch2_gc_mark_key(c, k, &max_stale, true); - if (ret) + ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false, + k, &max_stale, true); + if (ret) { + bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret); break; + } if (b->c.level) { - struct btree *child; - BKEY_PADDED(k) tmp; - - bkey_reassemble(&tmp.k, k); - k = bkey_i_to_s_c(&tmp.k); + bch2_bkey_buf_reassemble(&cur, c, k); + k = bkey_i_to_s_c(cur.k); bch2_btree_and_journal_iter_advance(&iter); - ret = bch2_gc_check_topology(c, k, - &next_node_start, - b->data->max_key, + ret = bch2_gc_check_topology(c, b, + &prev, cur, !bch2_btree_and_journal_iter_peek(&iter).k); if (ret) break; + } else { + bch2_btree_and_journal_iter_advance(&iter); + } + } - if (b->c.level > target_depth) { - child = bch2_btree_node_get_noiter(c, &tmp.k, - b->c.btree_id, b->c.level - 1); - ret = PTR_ERR_OR_ZERO(child); - if (ret) - break; + if (b->c.level > target_depth) { + bch2_btree_and_journal_iter_exit(&iter); + bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); - ret = bch2_gc_btree_init_recurse(c, child, - journal_keys, target_depth); - six_unlock_read(&child->c.lock); + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { + struct btree *child; + bch2_bkey_buf_reassemble(&cur, c, k); + bch2_btree_and_journal_iter_advance(&iter); + + child = bch2_btree_node_get_noiter(c, cur.k, + b->c.btree_id, b->c.level - 1, + false); + ret = PTR_ERR_OR_ZERO(child); + + if (fsck_err_on(ret == -EIO, c, + "unreadable 
btree node")) { + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, cur.k->k.p); if (ret) - break; + return ret; + + set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + continue; } - } else { - bch2_btree_and_journal_iter_advance(&iter); + + if (ret) { + bch_err(c, "%s: error %i getting btree node", + __func__, ret); + break; + } + + ret = bch2_gc_btree_init_recurse(c, child, + target_depth); + six_unlock_read(&child->c.lock); + + if (ret) + break; } } - +fsck_err: + bch2_bkey_buf_exit(&cur, c); + bch2_bkey_buf_exit(&prev, c); + bch2_btree_and_journal_iter_exit(&iter); return ret; } static int bch2_gc_btree_init(struct bch_fs *c, - struct journal_keys *journal_keys, - enum btree_id btree_id, - bool metadata_only) + enum btree_id btree_id) { struct btree *b; - unsigned target_depth = metadata_only ? 1 - : expensive_debug_checks(c) ? 0 - : !btree_node_type_needs_gc(btree_id) ? 1 + unsigned target_depth = bch2_expensive_debug_checks ? 0 + : !btree_node_type_needs_gc(btree_id) ? 1 : 0; u8 max_stale = 0; + char buf[100]; int ret = 0; b = c->btree_roots[btree_id].b; @@ -340,30 +560,30 @@ static int bch2_gc_btree_init(struct bch_fs *c, return 0; six_lock_read(&b->c.lock, NULL, NULL); - if (fsck_err_on(bkey_cmp(b->data->min_key, POS_MIN), c, - "btree root with incorrect min_key: %llu:%llu", - b->data->min_key.inode, - b->data->min_key.offset)) { + if (fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c, + "btree root with incorrect min_key: %s", + (bch2_bpos_to_text(&PBUF(buf), b->data->min_key), buf))) { BUG(); } - if (fsck_err_on(bkey_cmp(b->data->max_key, POS_MAX), c, - "btree root with incorrect min_key: %llu:%llu", - b->data->max_key.inode, - b->data->max_key.offset)) { + if (fsck_err_on(bpos_cmp(b->data->max_key, POS_MAX), c, + "btree root with incorrect max_key: %s", + (bch2_bpos_to_text(&PBUF(buf), b->data->max_key), buf))) { BUG(); } if (b->c.level >= target_depth) - ret = bch2_gc_btree_init_recurse(c, b, - journal_keys, target_depth); + ret = bch2_gc_btree_init_recurse(c, b, target_depth); if (!ret) - ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), + ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true, + bkey_i_to_s_c(&b->key), &max_stale, true); fsck_err: six_unlock_read(&b->c.lock); + if (ret) + bch_err(c, "%s: ret %i", __func__, ret); return ret; } @@ -373,8 +593,7 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) (int) btree_id_to_gc_phase(r); } -static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, - bool initial, bool metadata_only) +static int bch2_gc_btrees(struct bch_fs *c, bool initial) { enum btree_id ids[BTREE_ID_NR]; unsigned i; @@ -386,11 +605,12 @@ static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, for (i = 0; i < BTREE_ID_NR; i++) { enum btree_id id = ids[i]; int ret = initial - ? bch2_gc_btree_init(c, journal_keys, - id, metadata_only) - : bch2_gc_btree(c, id, initial, metadata_only); - if (ret) + ? 
bch2_gc_btree_init(c, id) + : bch2_gc_btree(c, id, initial); + if (ret) { + bch_err(c, "%s: ret %i", __func__, ret); return ret; + } } return 0; @@ -546,8 +766,8 @@ static void bch2_gc_free(struct bch_fs *c) ca->mi.nbuckets * sizeof(struct bucket)); ca->buckets[1] = NULL; - free_percpu(ca->usage[1]); - ca->usage[1] = NULL; + free_percpu(ca->usage_gc); + ca->usage_gc = NULL; } free_percpu(c->usage_gc); @@ -555,13 +775,12 @@ static void bch2_gc_free(struct bch_fs *c) } static int bch2_gc_done(struct bch_fs *c, - bool initial, bool metadata_only) + bool initial) { struct bch_dev *ca; - bool verify = !metadata_only && - (!initial || - (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO))); - unsigned i; + bool verify = (!initial || + (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); + unsigned i, dev; int ret = 0; #define copy_field(_f, _msg, ...) \ @@ -570,18 +789,17 @@ static int bch2_gc_done(struct bch_fs *c, fsck_err(c, _msg ": got %llu, should be %llu" \ , ##__VA_ARGS__, dst->_f, src->_f); \ dst->_f = src->_f; \ - ret = 1; \ + set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ } #define copy_stripe_field(_f, _msg, ...) \ if (dst->_f != src->_f) { \ if (verify) \ fsck_err(c, "stripe %zu has wrong "_msg \ ": got %u, should be %u", \ - dst_iter.pos, ##__VA_ARGS__, \ + iter.pos, ##__VA_ARGS__, \ dst->_f, src->_f); \ dst->_f = src->_f; \ - dst->dirty = true; \ - ret = 1; \ + set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ } #define copy_bucket_field(_f) \ if (dst->b[b].mark._f != src->b[b].mark._f) { \ @@ -592,49 +810,46 @@ static int bch2_gc_done(struct bch_fs *c, bch2_data_types[dst->b[b].mark.data_type],\ dst->b[b].mark._f, src->b[b].mark._f); \ dst->b[b]._mark._f = src->b[b].mark._f; \ - ret = 1; \ + set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ } #define copy_dev_field(_f, _msg, ...) \ copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__) #define copy_fs_field(_f, _msg, ...) 
\ copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) - if (!metadata_only) { - struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0); - struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0); + { + struct genradix_iter iter = genradix_iter_init(&c->stripes[1], 0); struct stripe *dst, *src; - unsigned i; - - c->ec_stripes_heap.used = 0; - - while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) && - (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) { - BUG_ON(src_iter.pos != dst_iter.pos); - copy_stripe_field(alive, "alive"); - copy_stripe_field(sectors, "sectors"); - copy_stripe_field(algorithm, "algorithm"); - copy_stripe_field(nr_blocks, "nr_blocks"); - copy_stripe_field(nr_redundant, "nr_redundant"); - copy_stripe_field(blocks_nonempty, - "blocks_nonempty"); + while ((src = genradix_iter_peek(&iter, &c->stripes[1]))) { + dst = genradix_ptr_alloc(&c->stripes[0], iter.pos, GFP_KERNEL); + + if (dst->alive != src->alive || + dst->sectors != src->sectors || + dst->algorithm != src->algorithm || + dst->nr_blocks != src->nr_blocks || + dst->nr_redundant != src->nr_redundant) { + bch_err(c, "unexpected stripe inconsistency at bch2_gc_done, confused"); + ret = -EINVAL; + goto fsck_err; + } for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++) copy_stripe_field(block_sectors[i], "block_sectors[%u]", i); - if (dst->alive) { - spin_lock(&c->ec_stripes_heap_lock); - bch2_stripes_heap_insert(c, dst, dst_iter.pos); - spin_unlock(&c->ec_stripes_heap_lock); - } + dst->blocks_nonempty = 0; + for (i = 0; i < dst->nr_blocks; i++) + dst->blocks_nonempty += dst->block_sectors[i] != 0; - genradix_iter_advance(&dst_iter, &c->stripes[0]); - genradix_iter_advance(&src_iter, &c->stripes[1]); + genradix_iter_advance(&iter, &c->stripes[1]); } } - for_each_member_device(ca, c, i) { + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + bch2_fs_usage_acc_to_base(c, i); + + for_each_member_device(ca, c, dev) { struct bucket_array *dst = __bucket_array(ca, 0); struct bucket_array *src = __bucket_array(ca, 1); size_t b; @@ -649,12 +864,23 @@ static int bch2_gc_done(struct bch_fs *c, dst->b[b].oldest_gen = src->b[b].oldest_gen; } - }; - bch2_fs_usage_acc_to_base(c, 0); - bch2_fs_usage_acc_to_base(c, 1); + { + struct bch_dev_usage *dst = ca->usage_base; + struct bch_dev_usage *src = (void *) + bch2_acc_percpu_u64s((void *) ca->usage_gc, + dev_usage_u64s()); - bch2_dev_usage_from_buckets(c); + copy_dev_field(buckets_ec, "buckets_ec"); + copy_dev_field(buckets_unavailable, "buckets_unavailable"); + + for (i = 0; i < BCH_DATA_NR; i++) { + copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]); + copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]); + copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]); + } + } + }; { unsigned nr = fs_usage_u64s(c); @@ -664,28 +890,20 @@ static int bch2_gc_done(struct bch_fs *c, copy_fs_field(hidden, "hidden"); copy_fs_field(btree, "btree"); + copy_fs_field(data, "data"); + copy_fs_field(cached, "cached"); + copy_fs_field(reserved, "reserved"); + copy_fs_field(nr_inodes,"nr_inodes"); - if (!metadata_only) { - copy_fs_field(data, "data"); - copy_fs_field(cached, "cached"); - copy_fs_field(reserved, "reserved"); - copy_fs_field(nr_inodes,"nr_inodes"); - - for (i = 0; i < BCH_REPLICAS_MAX; i++) - copy_fs_field(persistent_reserved[i], - "persistent_reserved[%i]", i); - } + for (i = 0; i < BCH_REPLICAS_MAX; i++) + copy_fs_field(persistent_reserved[i], + "persistent_reserved[%i]", i); for (i = 0; i < c->replicas.nr; i++) { struct 
bch_replicas_entry *e = cpu_replicas_entry(&c->replicas, i); char buf[80]; - if (metadata_only && - (e->data_type == BCH_DATA_user || - e->data_type == BCH_DATA_cached)) - continue; - bch2_replicas_entry_to_text(&PBUF(buf), e); copy_fs_field(replicas[i], "%s", buf); @@ -698,11 +916,12 @@ static int bch2_gc_done(struct bch_fs *c, #undef copy_stripe_field #undef copy_field fsck_err: + if (ret) + bch_err(c, "%s: ret %i", __func__, ret); return ret; } -static int bch2_gc_start(struct bch_fs *c, - bool metadata_only) +static int bch2_gc_start(struct bch_fs *c) { struct bch_dev *ca; unsigned i; @@ -719,7 +938,7 @@ static int bch2_gc_start(struct bch_fs *c, for_each_member_device(ca, c, i) { BUG_ON(ca->buckets[1]); - BUG_ON(ca->usage[1]); + BUG_ON(ca->usage_gc); ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket), @@ -730,9 +949,9 @@ static int bch2_gc_start(struct bch_fs *c, return -ENOMEM; } - ca->usage[1] = alloc_percpu(struct bch_dev_usage); - if (!ca->usage[1]) { - bch_err(c, "error allocating ca->usage[gc]"); + ca->usage_gc = alloc_percpu(struct bch_dev_usage); + if (!ca->usage_gc) { + bch_err(c, "error allocating ca->usage_gc"); percpu_ref_put(&ca->ref); return -ENOMEM; } @@ -766,13 +985,6 @@ static int bch2_gc_start(struct bch_fs *c, d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen; d->gen_valid = s->gen_valid; - - if (metadata_only && - (s->mark.data_type == BCH_DATA_user || - s->mark.data_type == BCH_DATA_cached)) { - d->_mark = s->mark; - d->_mark.owned_by_allocator = 0; - } } }; @@ -799,8 +1011,7 @@ static int bch2_gc_start(struct bch_fs *c, * move around - if references move backwards in the ordering GC * uses, GC could skip past them */ -int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys, - bool initial, bool metadata_only) +int bch2_gc(struct bch_fs *c, bool initial) { struct bch_dev *ca; u64 start_time = local_clock(); @@ -816,13 +1027,13 @@ int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys, closure_wait_event(&c->btree_interior_update_wait, !bch2_btree_interior_updates_nr_pending(c)); again: - ret = bch2_gc_start(c, metadata_only); + ret = bch2_gc_start(c); if (ret) goto out; bch2_mark_superblocks(c); - ret = bch2_gc_btrees(c, journal_keys, initial, metadata_only); + ret = bch2_gc_btrees(c, initial); if (ret) goto out; @@ -832,16 +1043,15 @@ again: bch2_mark_allocator_buckets(c); c->gc_count++; -out: - if (!ret && - (test_bit(BCH_FS_FIXED_GENS, &c->flags) || - (!iter && test_restart_gc(c)))) { + + if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) || + (!iter && bch2_test_restart_gc)) { /* * XXX: make sure gens we fixed got saved */ if (iter++ <= 2) { - bch_info(c, "Fixed gens, restarting mark and sweep:"); - clear_bit(BCH_FS_FIXED_GENS, &c->flags); + bch_info(c, "Second GC pass needed, restarting:"); + clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); percpu_down_write(&c->mark_lock); @@ -856,12 +1066,12 @@ out: bch_info(c, "Unable to fix bucket gens, looping"); ret = -EINVAL; } - +out: if (!ret) { bch2_journal_block(&c->journal); percpu_down_write(&c->mark_lock); - ret = bch2_gc_done(c, initial, metadata_only); + ret = bch2_gc_done(c, initial); bch2_journal_unblock(&c->journal); } else { @@ -931,19 +1141,21 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - struct bkey_on_stack sk; + struct bkey_buf sk; int ret = 0; - bkey_on_stack_init(&sk); + bch2_bkey_buf_init(&sk); 
bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, - BTREE_ITER_PREFETCH); + BTREE_ITER_PREFETCH| + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k))) { if (gc_btree_gens_key(c, k)) { - bkey_on_stack_reassemble(&sk, c, k); + bch2_bkey_buf_reassemble(&sk, c, k); bch2_extent_normalize(c, bkey_i_to_s(sk.k)); bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); @@ -959,11 +1171,12 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) } } - bch2_btree_iter_next(iter); + bch2_btree_iter_advance(iter); } + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); - bkey_on_stack_exit(&sk, c); + bch2_bkey_buf_exit(&sk, c); return ret; } @@ -1061,6 +1274,9 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, /* Find a format that all keys in @old_nodes can pack into */ bch2_bkey_format_init(&format_state); + /* + * XXX: this won't correctly take it account the new min/max keys: + */ for (i = 0; i < nr_old_nodes; i++) __bch2_btree_calc_format(&format_state, old_nodes[i]); @@ -1075,17 +1291,16 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, } if (bch2_keylist_realloc(&keylist, NULL, 0, - (BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) { + BKEY_BTREE_PTR_U64s_MAX * nr_old_nodes)) { trace_btree_gc_coalesce_fail(c, BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC); return; } - as = bch2_btree_update_start(iter->trans, iter->btree_id, + as = bch2_btree_update_start(iter, old_nodes[0]->c.level, btree_update_reserve_required(c, parent) + nr_old_nodes, BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE, - NULL); + BTREE_INSERT_USE_RESERVE); if (IS_ERR(as)) { trace_btree_gc_coalesce_fail(c, BTREE_GC_COALESCE_FAIL_RESERVE_GET); @@ -1123,7 +1338,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, k < vstruct_last(s2) && vstruct_blocks_plus(n1->data, c->block_bits, u64s + k->u64s) <= blocks; - k = bkey_next_skip_noops(k, vstruct_last(s2))) { + k = bkey_next(k)) { last = k; u64s += k->u64s; } @@ -1152,7 +1367,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, n1->key.k.p = n1->data->max_key = bkey_unpack_pos(n1, last); - n2->data->min_key = bkey_successor(n1->data->max_key); + n2->data->min_key = bpos_successor(n1->data->max_key); memcpy_u64s(vstruct_last(s1), s2->start, u64s); @@ -1195,7 +1410,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, unsigned j; for (j = 0; j < nr_new_nodes; j++) - if (!bkey_cmp(old_nodes[i]->key.k.p, + if (!bpos_cmp(old_nodes[i]->key.k.p, new_nodes[j]->key.k.p)) goto next; @@ -1258,6 +1473,7 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) struct btree *b; bool kthread = (current->flags & PF_KTHREAD) != 0; unsigned i; + int ret = 0; /* Sliding window of adjacent btree nodes */ struct btree *merge[GC_MERGE_NODES]; @@ -1306,8 +1522,8 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) lock_seq[0] = merge[0]->c.lock.state.seq; if (kthread && kthread_should_stop()) { - bch2_trans_exit(&trans); - return -ESHUTDOWN; + ret = -ESHUTDOWN; + break; } bch2_trans_cond_resched(&trans); @@ -1322,7 +1538,9 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) memset(merge + 1, 0, (GC_MERGE_NODES - 1) * sizeof(merge[0])); } - return bch2_trans_exit(&trans); + bch2_trans_iter_put(&trans, iter); + + return bch2_trans_exit(&trans) ?: ret; } /** @@ -1355,7 +1573,7 @@ 
static int bch2_gc_thread(void *arg) { struct bch_fs *c = arg; struct io_clock *clock = &c->io_clock[WRITE]; - unsigned long last = atomic_long_read(&clock->now); + unsigned long last = atomic64_read(&clock->now); unsigned last_kick = atomic_read(&c->kick_gc); int ret; @@ -1376,7 +1594,7 @@ static int bch2_gc_thread(void *arg) if (c->btree_gc_periodic) { unsigned long next = last + c->capacity / 16; - if (atomic_long_read(&clock->now) >= next) + if (atomic64_read(&clock->now) >= next) break; bch2_io_clock_schedule_timeout(clock, next); @@ -1388,14 +1606,14 @@ static int bch2_gc_thread(void *arg) } __set_current_state(TASK_RUNNING); - last = atomic_long_read(&clock->now); + last = atomic64_read(&clock->now); last_kick = atomic_read(&c->kick_gc); /* * Full gc is currently incompatible with btree key cache: */ #if 0 - ret = bch2_gc(c, NULL, false, false); + ret = bch2_gc(c, false, false); #else ret = bch2_gc_gens(c); #endif @@ -1425,11 +1643,14 @@ int bch2_gc_thread_start(struct bch_fs *c) { struct task_struct *p; - BUG_ON(c->gc_thread); + if (c->gc_thread) + return 0; - p = kthread_create(bch2_gc_thread, c, "bch_gc"); - if (IS_ERR(p)) + p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name); + if (IS_ERR(p)) { + bch_err(c, "error creating gc thread: %li", PTR_ERR(p)); return PTR_ERR(p); + } get_task_struct(p); c->gc_thread = p; diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h index 3694a3d..b1362a9 100644 --- a/libbcachefs/btree_gc.h +++ b/libbcachefs/btree_gc.h @@ -6,8 +6,7 @@ void bch2_coalesce(struct bch_fs *); -struct journal_keys; -int bch2_gc(struct bch_fs *, struct journal_keys *, bool, bool); +int bch2_gc(struct bch_fs *, bool); int bch2_gc_gens(struct bch_fs *); void bch2_gc_thread_stop(struct bch_fs *); int bch2_gc_thread_start(struct bch_fs *); @@ -46,19 +45,15 @@ static inline struct gc_pos gc_phase(enum gc_phase phase) static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) { - if (l.phase != r.phase) - return l.phase < r.phase ? -1 : 1; - if (bkey_cmp(l.pos, r.pos)) - return bkey_cmp(l.pos, r.pos); - if (l.level != r.level) - return l.level < r.level ? -1 : 1; - return 0; + return cmp_int(l.phase, r.phase) ?: + bpos_cmp(l.pos, r.pos) ?: + cmp_int(l.level, r.level); } static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) { switch (id) { -#define x(n, v, s) case BTREE_ID_##n: return GC_PHASE_BTREE_##n; +#define x(name, v) case BTREE_ID_##name: return GC_PHASE_BTREE_##name; BCH_BTREE_IDS() #undef x default: diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 682f599..ec1290f 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -24,8 +24,7 @@ static void verify_no_dups(struct btree *b, struct bkey_packed *start, - struct bkey_packed *end, - bool extents) + struct bkey_packed *end) { #ifdef CONFIG_BCACHEFS_DEBUG struct bkey_packed *k, *p; @@ -33,16 +32,13 @@ static void verify_no_dups(struct btree *b, if (start == end) return; - for (p = start, k = bkey_next_skip_noops(start, end); + for (p = start, k = bkey_next(start); k != end; - p = k, k = bkey_next_skip_noops(k, end)) { + p = k, k = bkey_next(k)) { struct bkey l = bkey_unpack_key(b, p); struct bkey r = bkey_unpack_key(b, k); - BUG_ON(extents - ? 
bkey_cmp(l.p, bkey_start_pos(&r)) > 0 - : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0); - //BUG_ON(bkey_cmp_packed(&b->format, p, k) >= 0); + BUG_ON(bpos_cmp(l.p, bkey_start_pos(&r)) >= 0); } #endif } @@ -51,9 +47,7 @@ static void set_needs_whiteout(struct bset *i, int v) { struct bkey_packed *k; - for (k = i->start; - k != vstruct_last(i); - k = bkey_next_skip_noops(k, vstruct_last(i))) + for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) k->needs_whiteout = v; } @@ -102,14 +96,14 @@ static void sort_bkey_ptrs(const struct btree *bt, break; for (b = a; c = 2 * b + 1, (d = c + 1) < n;) - b = bkey_cmp_packed(bt, + b = bch2_bkey_cmp_packed(bt, ptrs[c], ptrs[d]) >= 0 ? c : d; if (d == n) b = c; while (b != a && - bkey_cmp_packed(bt, + bch2_bkey_cmp_packed(bt, ptrs[a], ptrs[b]) >= 0) b = (b - 1) / 2; @@ -150,8 +144,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) } verify_no_dups(b, new_whiteouts, - (void *) ((u64 *) new_whiteouts + b->whiteout_u64s), - btree_node_old_extent_overwrite(b)); + (void *) ((u64 *) new_whiteouts + b->whiteout_u64s)); memcpy_u64s(unwritten_whiteouts_start(c, b), new_whiteouts, b->whiteout_u64s); @@ -176,144 +169,6 @@ static bool should_compact_bset(struct btree *b, struct bset_tree *t, } } -static bool bch2_compact_extent_whiteouts(struct bch_fs *c, - struct btree *b, - enum compact_mode mode) -{ - const struct bkey_format *f = &b->format; - struct bset_tree *t; - struct bkey_packed *whiteouts = NULL; - struct bkey_packed *u_start, *u_pos; - struct sort_iter sort_iter; - unsigned bytes, whiteout_u64s = 0, u64s; - bool used_mempool, compacting = false; - - BUG_ON(!btree_node_is_extents(b)); - - for_each_bset(b, t) - if (should_compact_bset(b, t, whiteout_u64s != 0, mode)) - whiteout_u64s += bset_dead_u64s(b, t); - - if (!whiteout_u64s) - return false; - - bch2_sort_whiteouts(c, b); - - sort_iter_init(&sort_iter, b); - - whiteout_u64s += b->whiteout_u64s; - bytes = whiteout_u64s * sizeof(u64); - - whiteouts = btree_bounce_alloc(c, bytes, &used_mempool); - u_start = u_pos = whiteouts; - - memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b), - b->whiteout_u64s); - u_pos = (void *) u_pos + b->whiteout_u64s * sizeof(u64); - - sort_iter_add(&sort_iter, u_start, u_pos); - - for_each_bset(b, t) { - struct bset *i = bset(b, t); - struct bkey_packed *k, *n, *out, *start, *end; - struct btree_node_entry *src = NULL, *dst = NULL; - - if (t != b->set && !bset_written(b, i)) { - src = container_of(i, struct btree_node_entry, keys); - dst = max(write_block(b), - (void *) btree_bkey_last(b, t - 1)); - } - - if (src != dst) - compacting = true; - - if (!should_compact_bset(b, t, compacting, mode)) { - if (src != dst) { - memmove(dst, src, sizeof(*src) + - le16_to_cpu(src->keys.u64s) * - sizeof(u64)); - i = &dst->keys; - set_btree_bset(b, t, i); - } - continue; - } - - compacting = true; - u_start = u_pos; - start = i->start; - end = vstruct_last(i); - - if (src != dst) { - memmove(dst, src, sizeof(*src)); - i = &dst->keys; - set_btree_bset(b, t, i); - } - - out = i->start; - - for (k = start; k != end; k = n) { - n = bkey_next_skip_noops(k, end); - - if (bkey_deleted(k)) - continue; - - BUG_ON(bkey_whiteout(k) && - k->needs_whiteout && - bkey_written(b, k)); - - if (bkey_whiteout(k) && !k->needs_whiteout) - continue; - - if (bkey_whiteout(k)) { - memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k)); - set_bkeyp_val_u64s(f, u_pos, 0); - u_pos = bkey_next(u_pos); - } else { - bkey_copy(out, k); - out = bkey_next(out); - } - } - - sort_iter_add(&sort_iter, u_start, 
u_pos); - - i->u64s = cpu_to_le16((u64 *) out - i->_data); - set_btree_bset_end(b, t); - bch2_bset_set_no_aux_tree(b, t); - } - - b->whiteout_u64s = (u64 *) u_pos - (u64 *) whiteouts; - - BUG_ON((void *) unwritten_whiteouts_start(c, b) < - (void *) btree_bkey_last(b, bset_tree_last(b))); - - u64s = bch2_sort_extent_whiteouts(unwritten_whiteouts_start(c, b), - &sort_iter); - - BUG_ON(u64s > b->whiteout_u64s); - BUG_ON(u_pos != whiteouts && !u64s); - - if (u64s != b->whiteout_u64s) { - void *src = unwritten_whiteouts_start(c, b); - - b->whiteout_u64s = u64s; - memmove_u64s_up(unwritten_whiteouts_start(c, b), src, u64s); - } - - verify_no_dups(b, - unwritten_whiteouts_start(c, b), - unwritten_whiteouts_end(c, b), - true); - - btree_bounce_free(c, bytes, used_mempool, whiteouts); - - bch2_btree_build_aux_trees(b); - - bch_btree_keys_u64s_remaining(c, b); - bch2_verify_btree_nr_keys(b); - - return true; -} - static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) { struct bset_tree *t; @@ -356,9 +211,9 @@ static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) out = i->start; for (k = start; k != end; k = n) { - n = bkey_next_skip_noops(k, end); + n = bkey_next(k); - if (!bkey_whiteout(k)) { + if (!bkey_deleted(k)) { bkey_copy(out, k); out = bkey_next(out); } else { @@ -382,9 +237,7 @@ static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, enum compact_mode mode) { - return !btree_node_old_extent_overwrite(b) - ? bch2_drop_whiteouts(b, mode) - : bch2_compact_extent_whiteouts(c, b, mode); + return bch2_drop_whiteouts(b, mode); } static void btree_node_sort(struct bch_fs *c, struct btree *b, @@ -422,14 +275,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, start_time = local_clock(); - if (btree_node_old_extent_overwrite(b)) - filter_whiteouts = bset_written(b, start_bset); - - u64s = (btree_node_old_extent_overwrite(b) - ? bch2_sort_extents - : bch2_sort_keys)(out->keys.start, - &sort_iter, - filter_whiteouts); + u64s = bch2_sort_keys(out->keys.start, &sort_iter, filter_whiteouts); out->keys.u64s = cpu_to_le16(u64s); @@ -597,18 +443,30 @@ void bch2_btree_init_next(struct bch_fs *c, struct btree *b, bch2_btree_iter_reinit_node(iter, b); } -static void btree_err_msg(struct printbuf *out, struct bch_fs *c, - struct btree *b, struct bset *i, - unsigned offset, int write) +static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c, + struct btree *b) { - pr_buf(out, "error validating btree node %sat btree %u level %u/%u\n" - "pos ", - write ? "before write " : "", - b->c.btree_id, b->c.level, + pr_buf(out, "%s level %u/%u\n ", + bch2_btree_ids[b->c.btree_id], + b->c.level, c->btree_roots[b->c.btree_id].level); bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); +} - pr_buf(out, " node offset %u", b->written); +static void btree_err_msg(struct printbuf *out, struct bch_fs *c, + struct bch_dev *ca, + struct btree *b, struct bset *i, + unsigned offset, int write) +{ + pr_buf(out, "error validating btree node "); + if (write) + pr_buf(out, "before write "); + if (ca) + pr_buf(out, "on %s ", ca->name); + pr_buf(out, "at btree "); + btree_pos_to_text(out, c, b); + + pr_buf(out, "\n node offset %u", b->written); if (i) pr_buf(out, " bset u64s %u", le16_to_cpu(i->u64s)); } @@ -624,25 +482,30 @@ enum btree_validate_ret { BTREE_RETRY_READ = 64, }; -#define btree_err(type, c, b, i, msg, ...) \ +#define btree_err(type, c, ca, b, i, msg, ...) 
\ ({ \ __label__ out; \ char _buf[300]; \ + char *_buf2 = _buf; \ struct printbuf out = PBUF(_buf); \ \ - btree_err_msg(&out, c, b, i, b->written, write); \ + _buf2 = kmalloc(4096, GFP_ATOMIC); \ + if (_buf2) \ + out = _PBUF(_buf2, 4986); \ + \ + btree_err_msg(&out, c, ca, b, i, b->written, write); \ pr_buf(&out, ": " msg, ##__VA_ARGS__); \ \ if (type == BTREE_ERR_FIXABLE && \ write == READ && \ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \ - mustfix_fsck_err(c, "%s", _buf); \ + mustfix_fsck_err(c, "%s", _buf2); \ goto out; \ } \ \ switch (write) { \ case READ: \ - bch_err(c, "%s", _buf); \ + bch_err(c, "%s", _buf2); \ \ switch (type) { \ case BTREE_ERR_FIXABLE: \ @@ -663,7 +526,7 @@ enum btree_validate_ret { } \ break; \ case WRITE: \ - bch_err(c, "corrupt metadata before write: %s", _buf); \ + bch_err(c, "corrupt metadata before write: %s", _buf2); \ \ if (bch2_fs_inconsistent(c)) { \ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ @@ -672,34 +535,62 @@ enum btree_validate_ret { break; \ } \ out: \ + if (_buf2 != _buf) \ + kfree(_buf2); \ true; \ }) #define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false) -static int validate_bset(struct bch_fs *c, struct btree *b, - struct bset *i, unsigned sectors, - int write, bool have_retry) +static int validate_bset(struct bch_fs *c, struct bch_dev *ca, + struct btree *b, struct bset *i, + unsigned sectors, int write, bool have_retry) { unsigned version = le16_to_cpu(i->version); const char *err; + char buf1[100]; + char buf2[100]; int ret = 0; btree_err_on((version != BCH_BSET_VERSION_OLD && version < bcachefs_metadata_version_min) || version >= bcachefs_metadata_version_max, - BTREE_ERR_FATAL, c, b, i, + BTREE_ERR_FATAL, c, ca, b, i, "unsupported bset version"); + if (btree_err_on(version < c->sb.version_min, + BTREE_ERR_FIXABLE, c, NULL, b, i, + "bset version %u older than superblock version_min %u", + version, c->sb.version_min)) { + mutex_lock(&c->sb_lock); + c->disk_sb.sb->version_min = cpu_to_le16(version); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } + + if (btree_err_on(version > c->sb.version, + BTREE_ERR_FIXABLE, c, NULL, b, i, + "bset version %u newer than superblock version %u", + version, c->sb.version)) { + mutex_lock(&c->sb_lock); + c->disk_sb.sb->version = cpu_to_le16(version); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } + + btree_err_on(BSET_SEPARATE_WHITEOUTS(i), + BTREE_ERR_FATAL, c, ca, b, i, + "BSET_SEPARATE_WHITEOUTS no longer supported"); + if (btree_err_on(b->written + sectors > c->opts.btree_node_size, - BTREE_ERR_FIXABLE, c, b, i, + BTREE_ERR_FIXABLE, c, ca, b, i, "bset past end of btree node")) { i->u64s = 0; return 0; } btree_err_on(b->written && !i->u64s, - BTREE_ERR_FIXABLE, c, b, i, + BTREE_ERR_FIXABLE, c, ca, b, i, "empty bset"); if (!b->written) { @@ -713,24 +604,18 @@ static int validate_bset(struct bch_fs *c, struct btree *b, /* XXX endianness */ btree_err_on(bp->seq != bn->keys.seq, - BTREE_ERR_MUST_RETRY, c, b, NULL, + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, "incorrect sequence number (wrong btree node)"); } btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id, - BTREE_ERR_MUST_RETRY, c, b, i, + BTREE_ERR_MUST_RETRY, c, ca, b, i, "incorrect btree id"); btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level, - BTREE_ERR_MUST_RETRY, c, b, i, + BTREE_ERR_MUST_RETRY, c, ca, b, i, "incorrect level"); - if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) { - u64 *p = (u64 *) &bn->ptr; - - *p = swab64(*p); - } - if (!write) compat_btree_node(b->c.level, b->c.btree_id, version, BSET_BIG_ENDIAN(i), write, bn); @@ 
-739,42 +624,30 @@ static int validate_bset(struct bch_fs *c, struct btree *b, struct bch_btree_ptr_v2 *bp = &bkey_i_to_btree_ptr_v2(&b->key)->v; - btree_err_on(bkey_cmp(b->data->min_key, bp->min_key), - BTREE_ERR_MUST_RETRY, c, b, NULL, - "incorrect min_key: got %llu:%llu should be %llu:%llu", - b->data->min_key.inode, - b->data->min_key.offset, - bp->min_key.inode, - bp->min_key.offset); + if (BTREE_PTR_RANGE_UPDATED(bp)) { + b->data->min_key = bp->min_key; + b->data->max_key = b->key.k.p; + } + + btree_err_on(bpos_cmp(b->data->min_key, bp->min_key), + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + "incorrect min_key: got %s should be %s", + (bch2_bpos_to_text(&PBUF(buf1), bn->min_key), buf1), + (bch2_bpos_to_text(&PBUF(buf2), bp->min_key), buf2)); } - btree_err_on(bkey_cmp(bn->max_key, b->key.k.p), - BTREE_ERR_MUST_RETRY, c, b, i, - "incorrect max key %llu:%llu", - bn->max_key.inode, - bn->max_key.offset); + btree_err_on(bpos_cmp(bn->max_key, b->key.k.p), + BTREE_ERR_MUST_RETRY, c, ca, b, i, + "incorrect max key %s", + (bch2_bpos_to_text(&PBUF(buf1), bn->max_key), buf1)); if (write) compat_btree_node(b->c.level, b->c.btree_id, version, BSET_BIG_ENDIAN(i), write, bn); - /* XXX: ideally we would be validating min_key too */ -#if 0 - /* - * not correct anymore, due to btree node write error - * handling - * - * need to add bn->seq to btree keys and verify - * against that - */ - btree_err_on(!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key), - bn->ptr), - BTREE_ERR_FATAL, c, b, i, - "incorrect backpointer"); -#endif err = bch2_bkey_format_validate(&bn->format); btree_err_on(err, - BTREE_ERR_FATAL, c, b, i, + BTREE_ERR_FATAL, c, ca, b, i, "invalid bkey format: %s", err); compat_bformat(b->c.level, b->c.btree_id, version, @@ -791,14 +664,8 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, { unsigned version = le16_to_cpu(i->version); struct bkey_packed *k, *prev = NULL; - bool seen_non_whiteout = false; int ret = 0; - if (!BSET_SEPARATE_WHITEOUTS(i)) { - seen_non_whiteout = true; - *whiteout_u64s = 0; - } - for (k = i->start; k != vstruct_last(i);) { struct bkey_s u; @@ -806,14 +673,14 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, const char *invalid; if (btree_err_on(bkey_next(k) > vstruct_last(i), - BTREE_ERR_FIXABLE, c, b, i, + BTREE_ERR_FIXABLE, c, NULL, b, i, "key extends past end of bset")) { i->u64s = cpu_to_le16((u64 *) k - i->_data); break; } if (btree_err_on(k->format > KEY_FORMAT_CURRENT, - BTREE_ERR_FIXABLE, c, b, i, + BTREE_ERR_FIXABLE, c, NULL, b, i, "invalid bkey format %u", k->format)) { i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); memmove_u64s_down(k, bkey_next(k), @@ -836,8 +703,8 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, char buf[160]; bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); - btree_err(BTREE_ERR_FIXABLE, c, b, i, - "invalid bkey:\n%s\n%s", invalid, buf); + btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, + "invalid bkey: %s\n%s", invalid, buf); i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); memmove_u64s_down(k, bkey_next(k), @@ -850,18 +717,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, BSET_BIG_ENDIAN(i), write, &b->format, k); - /* - * with the separate whiteouts thing (used for extents), the - * second set of keys actually can have whiteouts too, so we - * can't solely go off bkey_whiteout()... 
- */ - - if (!seen_non_whiteout && - (!bkey_whiteout(k) || - (prev && bkey_iter_cmp(b, prev, k) > 0))) { - *whiteout_u64s = k->_data - i->_data; - seen_non_whiteout = true; - } else if (prev && bkey_iter_cmp(b, prev, k) > 0) { + if (prev && bkey_iter_cmp(b, prev, k) > 0) { char buf1[80]; char buf2[80]; struct bkey up = bkey_unpack_key(b, prev); @@ -870,20 +726,26 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, bch2_bkey_to_text(&PBUF(buf2), u.k); bch2_dump_bset(c, b, i, 0); - btree_err(BTREE_ERR_FATAL, c, b, i, - "keys out of order: %s > %s", - buf1, buf2); - /* XXX: repair this */ + + if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, + "keys out of order: %s > %s", + buf1, buf2)) { + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_next(k), + (u64 *) vstruct_end(i) - (u64 *) k); + continue; + } } prev = k; - k = bkey_next_skip_noops(k, vstruct_last(i)); + k = bkey_next(k); } fsck_err: return ret; } -int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry) +int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + struct btree *b, bool have_retry) { struct btree_node_entry *bne; struct sort_iter *iter; @@ -895,20 +757,22 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry unsigned u64s; int ret, retry_read = 0, write = READ; + b->version_ondisk = U16_MAX; + iter = mempool_alloc(&c->fill_iter, GFP_NOIO); sort_iter_init(iter, b); iter->size = (btree_blocks(c) + 1) * 2; if (bch2_meta_read_fault("btree")) - btree_err(BTREE_ERR_MUST_RETRY, c, b, NULL, + btree_err(BTREE_ERR_MUST_RETRY, c, ca, b, NULL, "dynamic fault"); btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), - BTREE_ERR_MUST_RETRY, c, b, NULL, + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, "bad magic"); btree_err_on(!b->data->keys.seq, - BTREE_ERR_MUST_RETRY, c, b, NULL, + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, "bad btree header"); if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { @@ -916,7 +780,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry &bkey_i_to_btree_ptr_v2(&b->key)->v; btree_err_on(b->data->keys.seq != bp->seq, - BTREE_ERR_MUST_RETRY, c, b, NULL, + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, "got wrong btree node (seq %llx want %llx)", b->data->keys.seq, bp->seq); } @@ -931,7 +795,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry i = &b->data->keys; btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), - BTREE_ERR_WANT_RETRY, c, b, i, + BTREE_ERR_WANT_RETRY, c, ca, b, i, "unknown checksum type %llu", BSET_CSUM_TYPE(i)); @@ -939,16 +803,15 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); btree_err_on(bch2_crc_cmp(csum, b->data->csum), - BTREE_ERR_WANT_RETRY, c, b, i, + BTREE_ERR_WANT_RETRY, c, ca, b, i, "invalid checksum"); bset_encrypt(c, i, b->written << 9); - if (btree_node_is_extents(b) && - !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) { - set_btree_node_old_extent_overwrite(b); - set_btree_node_need_rewrite(b); - } + btree_err_on(btree_node_is_extents(b) && + !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), + BTREE_ERR_FATAL, c, NULL, b, NULL, + "btree node does not have NEW_EXTENT_OVERWRITE set"); sectors = vstruct_sectors(b->data, c->block_bits); } else { @@ -959,7 +822,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry break; btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), - BTREE_ERR_WANT_RETRY, 
c, b, i, + BTREE_ERR_WANT_RETRY, c, ca, b, i, "unknown checksum type %llu", BSET_CSUM_TYPE(i)); @@ -967,7 +830,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); btree_err_on(bch2_crc_cmp(csum, bne->csum), - BTREE_ERR_WANT_RETRY, c, b, i, + BTREE_ERR_WANT_RETRY, c, ca, b, i, "invalid checksum"); bset_encrypt(c, i, b->written << 9); @@ -975,7 +838,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry sectors = vstruct_sectors(bne, c->block_bits); } - ret = validate_bset(c, b, i, sectors, + b->version_ondisk = min(b->version_ondisk, + le16_to_cpu(i->version)); + + ret = validate_bset(c, ca, b, i, sectors, READ, have_retry); if (ret) goto fsck_err; @@ -997,7 +863,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry true); btree_err_on(blacklisted && first, - BTREE_ERR_FIXABLE, c, b, i, + BTREE_ERR_FIXABLE, c, ca, b, i, "first btree node bset has blacklisted journal seq"); if (blacklisted && !first) continue; @@ -1014,7 +880,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry bset_byte_offset(b, bne) < btree_bytes(c); bne = (void *) bne + block_bytes(c)) btree_err_on(bne->keys.seq == b->data->keys.seq, - BTREE_ERR_WANT_RETRY, c, b, NULL, + BTREE_ERR_WANT_RETRY, c, ca, b, NULL, "found bset signature after last bset"); sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool); @@ -1022,9 +888,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry set_btree_bset(b, b->set, &b->data->keys); - b->nr = (btree_node_old_extent_overwrite(b) - ? bch2_extent_sort_fix_overlapping - : bch2_key_sort_fix_overlapping)(c, &sorted->keys, iter); + b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter); u64s = le16_to_cpu(sorted->keys.u64s); *sorted = *b->data; @@ -1044,12 +908,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry const char *invalid = bch2_bkey_val_invalid(c, u.s_c); if (invalid || - (inject_invalid_keys(c) && + (bch2_inject_invalid_keys && !bversion_cmp(u.k->version, MAX_VERSION))) { char buf[160]; bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); - btree_err(BTREE_ERR_FIXABLE, c, b, i, + btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "invalid bkey %s: %s", buf, invalid); btree_keys_account_key_drop(&b->nr, 0, k); @@ -1067,7 +931,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry bp.v->mem_ptr = 0; } - k = bkey_next_skip_noops(k, vstruct_last(i)); + k = bkey_next(k); } bch2_bset_build_aux_tree(b, b->set, false); @@ -1079,7 +943,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - if (ca->mi.state != BCH_MEMBER_STATE_RW) + if (ca->mi.state != BCH_MEMBER_STATE_rw) set_btree_node_need_rewrite(b); } out: @@ -1104,6 +968,8 @@ static void btree_node_read_work(struct work_struct *work) struct btree *b = rb->bio.bi_private; struct bio *bio = &rb->bio; struct bch_io_failures failed = { .nr = 0 }; + char buf[200]; + struct printbuf out; bool can_retry; goto start; @@ -1123,8 +989,10 @@ static void btree_node_read_work(struct work_struct *work) bio->bi_status = BLK_STS_REMOVED; } start: - bch2_dev_io_err_on(bio->bi_status, ca, "btree read: %s", - bch2_blk_status_to_str(bio->bi_status)); + out = PBUF(buf); + btree_pos_to_text(&out, c, b); + 
bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s", + bch2_blk_status_to_str(bio->bi_status), buf); if (rb->have_ioref) percpu_ref_put(&ca->io_ref); rb->have_ioref = false; @@ -1136,7 +1004,7 @@ start: &failed, &rb->pick) > 0; if (!bio->bi_status && - !bch2_btree_node_read_done(c, b, can_retry)) + !bch2_btree_node_read_done(c, ca, b, can_retry)) break; if (!can_retry) { @@ -1302,12 +1170,13 @@ static void bch2_btree_node_write_error(struct bch_fs *c, struct btree_write_bio *wbio) { struct btree *b = wbio->wbio.bio.bi_private; - __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; + struct bkey_buf k; struct bch_extent_ptr *ptr; struct btree_trans trans; struct btree_iter *iter; int ret; + bch2_bkey_buf_init(&k); bch2_trans_init(&trans, c, 0, 0); iter = bch2_trans_get_node_iter(&trans, b->c.btree_id, b->key.k.p, @@ -1326,21 +1195,23 @@ retry: BUG_ON(!btree_node_hashed(b)); - bkey_copy(&tmp.k, &b->key); + bch2_bkey_buf_copy(&k, c, &b->key); - bch2_bkey_drop_ptrs(bkey_i_to_s(&tmp.k), ptr, + bch2_bkey_drop_ptrs(bkey_i_to_s(k.k), ptr, bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); - if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&tmp.k))) + if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(k.k))) goto err; - ret = bch2_btree_node_update_key(c, iter, b, &tmp.k); + ret = bch2_btree_node_update_key(c, iter, b, k.k); if (ret == -EINTR) goto retry; if (ret) goto err; out: + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&k, c); bio_put(&wbio->wbio.bio); btree_node_write_done(c, b); return; @@ -1408,7 +1279,7 @@ static void btree_node_write_endio(struct bio *bio) if (wbio->have_ioref) bch2_latency_acct(ca, wbio->submit_time, WRITE); - if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write: %s", + if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write error: %s", bch2_blk_status_to_str(bio->bi_status)) || bch2_meta_write_fault("btree")) { spin_lock_irqsave(&c->btree_write_error_lock, flags); @@ -1437,13 +1308,15 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, unsigned whiteout_u64s = 0; int ret; - if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_BTREE)) + if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_btree)) return -1; - ret = validate_bset(c, b, i, sectors, WRITE, false) ?: - validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false); - if (ret) + ret = validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false) ?: + validate_bset(c, NULL, b, i, sectors, WRITE, false); + if (ret) { bch2_inconsistent_error(c); + dump_stack(); + } return ret; } @@ -1456,7 +1329,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, struct bset *i; struct btree_node *bn = NULL; struct btree_node_entry *bne = NULL; - BKEY_PADDED(key) k; + struct bkey_buf k; struct bch_extent_ptr *ptr; struct sort_iter sort_iter; struct nonce nonce; @@ -1467,6 +1340,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool validate_before_checksum = false; void *data; + bch2_bkey_buf_init(&k); + if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) return; @@ -1486,6 +1361,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, if (!btree_node_may_write(b)) return; + if (old & (1 << BTREE_NODE_never_write)) + return; + if (old & (1 << BTREE_NODE_write_in_flight)) { btree_node_wait_on_io(b); continue; @@ -1498,6 +1376,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, new ^= (1 << BTREE_NODE_write_idx); } while (cmpxchg_acquire(&b->flags, old, new) != old); + atomic_dec(&c->btree_cache.dirty); + 
BUG_ON(btree_node_fake(b)); BUG_ON((b->will_make_reachable != 0) != !b->written); @@ -1530,6 +1410,11 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, seq = max(seq, le64_to_cpu(i->journal_seq)); } + BUG_ON(b->written && !seq); + + /* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */ + bytes += 8; + data = btree_bounce_alloc(c, bytes, &used_mempool); if (!b->written) { @@ -1545,24 +1430,14 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, i->journal_seq = cpu_to_le64(seq); i->u64s = 0; - if (!btree_node_old_extent_overwrite(b)) { - sort_iter_add(&sort_iter, - unwritten_whiteouts_start(c, b), - unwritten_whiteouts_end(c, b)); - SET_BSET_SEPARATE_WHITEOUTS(i, false); - } else { - memcpy_u64s(i->start, - unwritten_whiteouts_start(c, b), - b->whiteout_u64s); - i->u64s = cpu_to_le16(b->whiteout_u64s); - SET_BSET_SEPARATE_WHITEOUTS(i, true); - } + sort_iter_add(&sort_iter, + unwritten_whiteouts_start(c, b), + unwritten_whiteouts_end(c, b)); + SET_BSET_SEPARATE_WHITEOUTS(i, false); b->whiteout_u64s = 0; - u64s = btree_node_old_extent_overwrite(b) - ? bch2_sort_extents(vstruct_last(i), &sort_iter, false) - : bch2_sort_keys(i->start, &sort_iter, false); + u64s = bch2_sort_keys(i->start, &sort_iter, false); le16_add_cpu(&i->u64s, u64s); set_needs_whiteout(i, false); @@ -1590,7 +1465,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, validate_before_checksum = true; /* validate_bset will be modifying: */ - if (le16_to_cpu(i->version) < bcachefs_metadata_version_max) + if (le16_to_cpu(i->version) < bcachefs_metadata_version_current) validate_before_checksum = true; /* if we're going to be encrypting, check metadata validity first: */ @@ -1665,15 +1540,19 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, * just make all btree node writes FUA to keep things sane. 
*/ - bkey_copy(&k.key, &b->key); + bch2_bkey_buf_copy(&k, c, &b->key); - bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&k.key)), ptr) + bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(k.k)), ptr) ptr->offset += b->written; b->written += sectors_to_write; + atomic64_inc(&c->btree_writes_nr); + atomic64_add(sectors_to_write, &c->btree_writes_sectors); + /* XXX: submitting IO with btree locks held: */ - bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, &k.key); + bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, k.k); + bch2_bkey_buf_exit(&k, c); return; err: set_btree_node_noevict(b); @@ -1793,23 +1672,6 @@ void bch2_btree_flush_all_writes(struct bch_fs *c) __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); } -void bch2_btree_verify_flushed(struct bch_fs *c) -{ - struct bucket_table *tbl; - struct rhash_head *pos; - struct btree *b; - unsigned i; - - rcu_read_lock(); - for_each_cached_btree(b, c, tbl, i, pos) { - unsigned long flags = READ_ONCE(b->flags); - - BUG_ON((flags & (1 << BTREE_NODE_dirty)) || - (flags & (1 << BTREE_NODE_write_in_flight))); - } - rcu_read_unlock(); -} - void bch2_dirty_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c) { struct bucket_table *tbl; diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h index 626d0f0..9c14cd3 100644 --- a/libbcachefs/btree_io.h +++ b/libbcachefs/btree_io.h @@ -14,6 +14,23 @@ struct btree_write; struct btree; struct btree_iter; +static inline bool btree_node_dirty(struct btree *b) +{ + return test_bit(BTREE_NODE_dirty, &b->flags); +} + +static inline void set_btree_node_dirty(struct bch_fs *c, struct btree *b) +{ + if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags)) + atomic_inc(&c->btree_cache.dirty); +} + +static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b) +{ + if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags)) + atomic_dec(&c->btree_cache.dirty); +} + struct btree_read_bio { struct bch_fs *c; u64 start_time; @@ -117,7 +134,8 @@ void bch2_btree_build_aux_trees(struct btree *); void bch2_btree_init_next(struct bch_fs *, struct btree *, struct btree_iter *); -int bch2_btree_node_read_done(struct bch_fs *, struct btree *, bool); +int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *, + struct btree *, bool); void bch2_btree_node_read(struct bch_fs *, struct btree *, bool); int bch2_btree_root_read(struct bch_fs *, enum btree_id, const struct bkey_i *, unsigned); @@ -168,20 +186,29 @@ do { \ void bch2_btree_flush_all_reads(struct bch_fs *); void bch2_btree_flush_all_writes(struct bch_fs *); -void bch2_btree_verify_flushed(struct bch_fs *); void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *); static inline void compat_bformat(unsigned level, enum btree_id btree_id, - unsigned version, unsigned big_endian, - int write, struct bkey_format *f) + unsigned version, unsigned big_endian, + int write, struct bkey_format *f) { if (version < bcachefs_metadata_version_inode_btree_change && - btree_id == BTREE_ID_INODES) { + btree_id == BTREE_ID_inodes) { swap(f->bits_per_field[BKEY_FIELD_INODE], f->bits_per_field[BKEY_FIELD_OFFSET]); swap(f->field_offset[BKEY_FIELD_INODE], f->field_offset[BKEY_FIELD_OFFSET]); } + + if (version < bcachefs_metadata_version_snapshot && + (level || btree_type_has_snapshots(btree_id))) { + u64 max_packed = + ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]); + + f->field_offset[BKEY_FIELD_SNAPSHOT] = write + ? 
0 + : U32_MAX - max_packed; + } } static inline void compat_bpos(unsigned level, enum btree_id btree_id, @@ -192,7 +219,7 @@ static inline void compat_bpos(unsigned level, enum btree_id btree_id, bch2_bpos_swab(p); if (version < bcachefs_metadata_version_inode_btree_change && - btree_id == BTREE_ID_INODES) + btree_id == BTREE_ID_inodes) swap(p->inode, p->offset); } @@ -203,18 +230,26 @@ static inline void compat_btree_node(unsigned level, enum btree_id btree_id, { if (version < bcachefs_metadata_version_inode_btree_change && btree_node_type_is_extents(btree_id) && - bkey_cmp(bn->min_key, POS_MIN) && + bpos_cmp(bn->min_key, POS_MIN) && write) - bn->min_key = bkey_predecessor(bn->min_key); + bn->min_key = bpos_nosnap_predecessor(bn->min_key); + + if (version < bcachefs_metadata_version_snapshot && + write) + bn->max_key.snapshot = 0; compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key); compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key); + if (version < bcachefs_metadata_version_snapshot && + !write) + bn->max_key.snapshot = U32_MAX; + if (version < bcachefs_metadata_version_inode_btree_change && btree_node_type_is_extents(btree_id) && - bkey_cmp(bn->min_key, POS_MIN) && + bpos_cmp(bn->min_key, POS_MIN) && !write) - bn->min_key = bkey_successor(bn->min_key); + bn->min_key = bpos_nosnap_successor(bn->min_key); } #endif /* _BCACHEFS_BTREE_IO_H */ diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 6fab76c..425c9ad 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -2,18 +2,53 @@ #include "bcachefs.h" #include "bkey_methods.h" +#include "bkey_buf.h" #include "btree_cache.h" #include "btree_iter.h" #include "btree_key_cache.h" #include "btree_locking.h" #include "btree_update.h" #include "debug.h" +#include "error.h" #include "extents.h" #include "journal.h" +#include "replicas.h" #include #include +static void btree_iter_set_search_pos(struct btree_iter *, struct bpos); + +static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) +{ + EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES); + + /* Are we iterating over keys in all snapshots? */ + if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { + p = bpos_successor(p); + } else { + p = bpos_nosnap_successor(p); + p.snapshot = iter->snapshot; + } + + return p; +} + +static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p) +{ + EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES); + + /* Are we iterating over keys in all snapshots? 
*/ + if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { + p = bpos_predecessor(p); + } else { + p = bpos_nosnap_predecessor(p); + p.snapshot = iter->snapshot; + } + + return p; +} + static inline bool is_btree_node(struct btree_iter *iter, unsigned l) { return l < BTREE_MAX_DEPTH && @@ -26,20 +61,20 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter) if ((iter->flags & BTREE_ITER_IS_EXTENTS) && bkey_cmp(pos, POS_MAX)) - pos = bkey_successor(pos); + pos = bkey_successor(iter, pos); return pos; } static inline bool btree_iter_pos_before_node(struct btree_iter *iter, struct btree *b) { - return bkey_cmp(btree_iter_search_key(iter), b->data->min_key) < 0; + return bpos_cmp(iter->real_pos, b->data->min_key) < 0; } static inline bool btree_iter_pos_after_node(struct btree_iter *iter, struct btree *b) { - return bkey_cmp(b->key.k.p, btree_iter_search_key(iter)) < 0; + return bpos_cmp(b->key.k.p, iter->real_pos) < 0; } static inline bool btree_iter_pos_in_node(struct btree_iter *iter, @@ -197,13 +232,14 @@ static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b, bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, unsigned level, struct btree_iter *iter, enum six_lock_type type, - six_lock_should_sleep_fn should_sleep_fn, - void *p) + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) { struct btree_trans *trans = iter->trans; - struct btree_iter *linked; + struct btree_iter *linked, *deadlock_iter = NULL; u64 start_time = local_clock(); - bool ret = true; + unsigned reason = 9; + bool ret; /* Check if it's safe to block: */ trans_for_each_iter(trans, linked) { @@ -224,15 +260,33 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, */ if (type == SIX_LOCK_intent && linked->nodes_locked != linked->nodes_intent_locked) { - if (!(trans->nounlock)) { - linked->locks_want = max_t(unsigned, - linked->locks_want, - __fls(linked->nodes_locked) + 1); - if (!btree_iter_get_locks(linked, true, false)) - ret = false; - } else { - ret = false; + linked->locks_want = max_t(unsigned, + linked->locks_want, + __fls(linked->nodes_locked) + 1); + if (!btree_iter_get_locks(linked, true, false)) { + deadlock_iter = linked; + reason = 1; + } + } + + if (linked->btree_id != iter->btree_id) { + if (linked->btree_id > iter->btree_id) { + deadlock_iter = linked; + reason = 3; } + continue; + } + + /* + * Within the same btree, cached iterators come before non + * cached iterators: + */ + if (btree_iter_is_cached(linked) != btree_iter_is_cached(iter)) { + if (btree_iter_is_cached(iter)) { + deadlock_iter = linked; + reason = 4; + } + continue; } /* @@ -240,30 +294,24 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, * another iterator has possible descendants locked of the node * we're about to lock, it must have the ancestors locked too: */ - if (linked->btree_id == iter->btree_id && - level > __fls(linked->nodes_locked)) { - if (!(trans->nounlock)) { - linked->locks_want = - max(level + 1, max_t(unsigned, - linked->locks_want, - iter->locks_want)); - if (!btree_iter_get_locks(linked, true, false)) - ret = false; - } else { - ret = false; + if (level > __fls(linked->nodes_locked)) { + linked->locks_want = + max(level + 1, max_t(unsigned, + linked->locks_want, + iter->locks_want)); + if (!btree_iter_get_locks(linked, true, false)) { + deadlock_iter = linked; + reason = 5; } } /* Must lock btree nodes in key order: */ - if ((cmp_int(iter->btree_id, linked->btree_id) ?: - -cmp_int(btree_iter_type(iter), btree_iter_type(linked))) < 0) - 
ret = false; - - if (iter->btree_id == linked->btree_id && - btree_node_locked(linked, level) && - bkey_cmp(pos, btree_node_pos((void *) linked->l[level].b, - btree_iter_type(linked))) <= 0) - ret = false; + if (btree_node_locked(linked, level) && + bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b, + btree_iter_type(linked))) <= 0) { + deadlock_iter = linked; + reason = 7; + } /* * Recheck if this is a node we already have locked - since one @@ -277,20 +325,36 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, } } - if (unlikely(!ret)) { - trace_trans_restart_would_deadlock(iter->trans->ip); + if (unlikely(deadlock_iter)) { + trace_trans_restart_would_deadlock(iter->trans->ip, ip, + reason, + deadlock_iter->btree_id, + btree_iter_type(deadlock_iter), + iter->btree_id, + btree_iter_type(iter)); return false; } if (six_trylock_type(&b->c.lock, type)) return true; - if (six_lock_type(&b->c.lock, type, should_sleep_fn, p)) - return false; +#ifdef CONFIG_BCACHEFS_DEBUG + trans->locking_iter_idx = iter->idx; + trans->locking_pos = pos; + trans->locking_btree_id = iter->btree_id; + trans->locking_level = level; + trans->locking = b; +#endif - bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)], - start_time); - return true; + ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0; + +#ifdef CONFIG_BCACHEFS_DEBUG + trans->locking = NULL; +#endif + if (ret) + bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)], + start_time); + return ret; } /* Btree iterator locking: */ @@ -319,7 +383,7 @@ void bch2_btree_trans_verify_locks(struct btree_trans *trans) { struct btree_iter *iter; - trans_for_each_iter_all(trans, iter) + trans_for_each_iter(trans, iter) bch2_btree_iter_verify_locks(iter); } #else @@ -360,50 +424,25 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter, return false; } -bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *iter, - unsigned new_locks_want) +void __bch2_btree_iter_downgrade(struct btree_iter *iter, + unsigned new_locks_want) { - unsigned l = iter->level; + unsigned l; - EBUG_ON(iter->locks_want >= new_locks_want); + EBUG_ON(iter->locks_want < new_locks_want); iter->locks_want = new_locks_want; - do { - if (!btree_iter_node(iter, l)) - break; - - if (!bch2_btree_node_upgrade(iter, l)) { - iter->locks_want = l; - return false; - } - - l++; - } while (l < iter->locks_want); - - return true; -} - -void __bch2_btree_iter_downgrade(struct btree_iter *iter, - unsigned downgrade_to) -{ - unsigned l, new_locks_want = downgrade_to ?: - (iter->flags & BTREE_ITER_INTENT ? 
1 : 0); - - if (iter->locks_want < downgrade_to) { - iter->locks_want = new_locks_want; - - while (iter->nodes_locked && - (l = __fls(iter->nodes_locked)) >= iter->locks_want) { - if (l > iter->level) { - btree_node_unlock(iter, l); - } else { - if (btree_node_intent_locked(iter, l)) { - six_lock_downgrade(&iter->l[l].b->c.lock); - iter->nodes_intent_locked ^= 1 << l; - } - break; + while (iter->nodes_locked && + (l = __fls(iter->nodes_locked)) >= iter->locks_want) { + if (l > iter->level) { + btree_node_unlock(iter, l); + } else { + if (btree_node_intent_locked(iter, l)) { + six_lock_downgrade(&iter->l[l].b->c.lock); + iter->nodes_intent_locked ^= 1 << l; } + break; } } @@ -423,13 +462,12 @@ void bch2_trans_downgrade(struct btree_trans *trans) bool bch2_trans_relock(struct btree_trans *trans) { struct btree_iter *iter; - bool ret = true; trans_for_each_iter(trans, iter) - if (iter->uptodate == BTREE_ITER_NEED_RELOCK) - ret &= bch2_btree_iter_relock(iter, true); - - return ret; + if (btree_iter_keep(trans, iter) && + !bch2_btree_iter_relock(iter, true)) + return false; + return true; } void bch2_trans_unlock(struct btree_trans *trans) @@ -463,17 +501,20 @@ static void bch2_btree_iter_verify_cached(struct btree_iter *iter) static void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned level) { - struct bpos pos = btree_iter_search_key(iter); - struct btree_iter_level *l = &iter->l[level]; - struct btree_node_iter tmp = l->iter; - bool locked = btree_node_locked(iter, level); + struct btree_iter_level *l; + struct btree_node_iter tmp; + bool locked; struct bkey_packed *p, *k; - char buf1[100], buf2[100]; + char buf1[100], buf2[100], buf3[100]; const char *msg; - if (!debug_check_iterators(iter->trans->c)) + if (!bch2_debug_check_iterators) return; + l = &iter->l[level]; + tmp = l->iter; + locked = btree_node_locked(iter, level); + if (btree_iter_type(iter) == BTREE_ITER_CACHED) { if (!level) bch2_btree_iter_verify_cached(iter); @@ -488,12 +529,7 @@ static void bch2_btree_iter_verify_level(struct btree_iter *iter, if (!bch2_btree_node_relock(iter, level)) return; - /* - * Ideally this invariant would always be true, and hopefully in the - * future it will be, but for now set_pos_same_leaf() breaks it: - */ - BUG_ON(iter->uptodate < BTREE_ITER_NEED_TRAVERSE && - !btree_iter_pos_in_node(iter, l->b)); + BUG_ON(!btree_iter_pos_in_node(iter, l->b)); /* * node iterators don't use leaf node iterator: @@ -512,16 +548,16 @@ static void bch2_btree_iter_verify_level(struct btree_iter *iter, * whiteouts) */ p = level || btree_node_type_is_extents(iter->btree_id) - ? bch2_btree_node_iter_prev_filter(&tmp, l->b, KEY_TYPE_discard) + ? 
bch2_btree_node_iter_prev(&tmp, l->b) : bch2_btree_node_iter_prev_all(&tmp, l->b); k = bch2_btree_node_iter_peek_all(&l->iter, l->b); - if (p && bkey_iter_pos_cmp(l->b, p, &pos) >= 0) { + if (p && bkey_iter_pos_cmp(l->b, p, &iter->real_pos) >= 0) { msg = "before"; goto err; } - if (k && bkey_iter_pos_cmp(l->b, k, &pos) < 0) { + if (k && bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) { msg = "after"; goto err; } @@ -530,44 +566,72 @@ unlock: btree_node_unlock(iter, level); return; err: - strcpy(buf1, "(none)"); strcpy(buf2, "(none)"); + strcpy(buf3, "(none)"); + + bch2_bpos_to_text(&PBUF(buf1), iter->real_pos); if (p) { struct bkey uk = bkey_unpack_key(l->b, p); - bch2_bkey_to_text(&PBUF(buf1), &uk); + bch2_bkey_to_text(&PBUF(buf2), &uk); } if (k) { struct bkey uk = bkey_unpack_key(l->b, k); - bch2_bkey_to_text(&PBUF(buf2), &uk); + bch2_bkey_to_text(&PBUF(buf3), &uk); } panic("iterator should be %s key at level %u:\n" - "iter pos %s %llu:%llu\n" + "iter pos %s\n" "prev key %s\n" "cur key %s\n", - msg, level, - iter->flags & BTREE_ITER_IS_EXTENTS ? ">" : "=>", - iter->pos.inode, iter->pos.offset, - buf1, buf2); + msg, level, buf1, buf2, buf3); } static void bch2_btree_iter_verify(struct btree_iter *iter) { + enum btree_iter_type type = btree_iter_type(iter); unsigned i; - bch2_btree_trans_verify_locks(iter->trans); + EBUG_ON(iter->btree_id >= BTREE_ID_NR); + + BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && + iter->pos.snapshot != iter->snapshot); + + BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && + (iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); + + BUG_ON(type == BTREE_ITER_NODES && + !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); + + BUG_ON(type != BTREE_ITER_NODES && + (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && + !btree_type_has_snapshots(iter->btree_id)); + + bch2_btree_iter_verify_locks(iter); for (i = 0; i < BTREE_MAX_DEPTH; i++) bch2_btree_iter_verify_level(iter, i); } +static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) +{ + enum btree_iter_type type = btree_iter_type(iter); + + BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && + iter->pos.snapshot != iter->snapshot); + + BUG_ON((type == BTREE_ITER_KEYS || + type == BTREE_ITER_CACHED) && + (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || + bkey_cmp(iter->pos, iter->k.p) > 0)); +} + void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b) { struct btree_iter *iter; - if (!debug_check_iterators(trans->c)) + if (!bch2_debug_check_iterators) return; trans_for_each_iter_with_node(trans, b, iter) @@ -578,6 +642,7 @@ void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b) static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned l) {} static inline void bch2_btree_iter_verify(struct btree_iter *iter) {} +static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {} #endif @@ -603,12 +668,11 @@ static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter, struct bkey_packed *where) { struct btree_iter_level *l = &iter->l[b->c.level]; - struct bpos pos = btree_iter_search_key(iter); if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b)) return; - if (bkey_iter_pos_cmp(l->b, where, &pos) < 0) + if (bkey_iter_pos_cmp(l->b, where, &iter->real_pos) < 0) bch2_btree_node_iter_advance(&l->iter, l->b); btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); @@ -643,7 +707,6 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, bool iter_current_key_modified = orig_iter_pos >= offset && orig_iter_pos <= offset + 
clobber_u64s; - struct bpos iter_pos = btree_iter_search_key(iter); btree_node_iter_for_each(node_iter, set) if (set->end == old_end) @@ -651,7 +714,7 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, /* didn't find the bset in the iterator - might have to readd it: */ if (new_u64s && - bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) { + bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) { bch2_btree_node_iter_push(node_iter, b, where, end); goto fixup_done; } else { @@ -666,7 +729,7 @@ found: return; if (new_u64s && - bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) { + bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) { set->k = offset; } else if (set->k < offset + clobber_u64s) { set->k = offset + new_u64s; @@ -739,7 +802,7 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter, __bch2_btree_node_iter_fix(iter, b, node_iter, t, where, clobber_u64s, new_u64s); - if (debug_check_iterators(iter->trans->c)) + if (bch2_debug_check_iterators) bch2_btree_node_iter_verify(node_iter, b); } @@ -769,45 +832,50 @@ static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, ret = bkey_disassemble(l->b, k, u); - if (debug_check_bkeys(iter->trans->c)) + if (bch2_debug_check_bkeys) bch2_bkey_debugcheck(iter->trans->c, l->b, ret); return ret; } /* peek_all() doesn't skip deleted keys */ -static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter, - struct btree_iter_level *l, - struct bkey *u) +static inline struct bkey_s_c btree_iter_level_peek_all(struct btree_iter *iter, + struct btree_iter_level *l, + struct bkey *u) { return __btree_iter_unpack(iter, l, u, bch2_btree_node_iter_peek_all(&l->iter, l->b)); } -static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, - struct btree_iter_level *l) +static inline struct bkey_s_c btree_iter_level_peek(struct btree_iter *iter, + struct btree_iter_level *l) { - return __btree_iter_unpack(iter, l, &iter->k, + struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k, bch2_btree_node_iter_peek(&l->iter, l->b)); + + iter->real_pos = k.k ? k.k->p : l->b->key.k.p; + return k; } -static inline struct bkey_s_c __btree_iter_prev(struct btree_iter *iter, - struct btree_iter_level *l) +static inline struct bkey_s_c btree_iter_level_prev(struct btree_iter *iter, + struct btree_iter_level *l) { - return __btree_iter_unpack(iter, l, &iter->k, + struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k, bch2_btree_node_iter_prev(&l->iter, l->b)); + + iter->real_pos = k.k ? 
k.k->p : l->b->data->min_key; + return k; } static inline bool btree_iter_advance_to_pos(struct btree_iter *iter, struct btree_iter_level *l, int max_advance) { - struct bpos pos = btree_iter_search_key(iter); struct bkey_packed *k; int nr_advanced = 0; while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) && - bkey_iter_pos_cmp(l->b, k, &pos) < 0) { + bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) { if (max_advance > 0 && nr_advanced >= max_advance) return false; @@ -845,12 +913,23 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) if (!k || bkey_deleted(k) || bkey_cmp_left_packed(l->b, k, &b->key.k.p)) { - char buf[100]; + char buf1[100]; + char buf2[100]; + char buf3[100]; + char buf4[100]; struct bkey uk = bkey_unpack_key(b, k); - bch2_bkey_to_text(&PBUF(buf), &uk); - panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n", - buf, b->key.k.p.inode, b->key.k.p.offset); + bch2_dump_btree_node(iter->trans->c, l->b); + bch2_bpos_to_text(&PBUF(buf1), iter->real_pos); + bch2_bkey_to_text(&PBUF(buf2), &uk); + bch2_bpos_to_text(&PBUF(buf3), b->data->min_key); + bch2_bpos_to_text(&PBUF(buf4), b->data->max_key); + panic("parent iter doesn't point to new node:\n" + "iter pos %s %s\n" + "iter key %s\n" + "new node %s-%s\n", + bch2_btree_ids[iter->btree_id], buf1, + buf2, buf3, buf4); } if (!parent_locked) @@ -860,10 +939,16 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) static inline void __btree_iter_init(struct btree_iter *iter, unsigned level) { - struct bpos pos = btree_iter_search_key(iter); struct btree_iter_level *l = &iter->l[level]; - bch2_btree_node_iter_init(&l->iter, l->b, &pos); + bch2_btree_node_iter_init(&l->iter, l->b, &iter->real_pos); + + /* + * Iterators to interior nodes should always be pointed at the first non + * whiteout: + */ + if (level) + bch2_btree_node_iter_peek(&l->iter, l->b); btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); } @@ -919,7 +1004,7 @@ void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b) trans_for_each_iter(iter->trans, linked) if (linked->l[level].b == b) { - __btree_node_unlock(linked, level); + btree_node_unlock(linked, level); linked->l[level].b = BTREE_ITER_NO_NODE_DROP; } } @@ -945,7 +1030,8 @@ static int lock_root_check_fn(struct six_lock *lock, void *p) } static inline int btree_iter_lock_root(struct btree_iter *iter, - unsigned depth_want) + unsigned depth_want, + unsigned long trace_ip) { struct bch_fs *c = iter->trans->c; struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b; @@ -974,7 +1060,8 @@ static inline int btree_iter_lock_root(struct btree_iter *iter, lock_type = __btree_lock_want(iter, iter->level); if (unlikely(!btree_node_lock(b, POS_MAX, iter->level, iter, lock_type, - lock_root_check_fn, rootp))) + lock_root_check_fn, rootp, + trace_ip))) return -EINTR; if (likely(b == READ_ONCE(*rootp) && @@ -1002,27 +1089,32 @@ static void btree_iter_prefetch(struct btree_iter *iter) struct btree_iter_level *l = &iter->l[iter->level]; struct btree_node_iter node_iter = l->iter; struct bkey_packed *k; - BKEY_PADDED(k) tmp; + struct bkey_buf tmp; unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) ? (iter->level > 1 ? 0 : 2) : (iter->level > 1 ? 
1 : 16); bool was_locked = btree_node_locked(iter, iter->level); + bch2_bkey_buf_init(&tmp); + while (nr) { if (!bch2_btree_node_relock(iter, iter->level)) - return; + break; bch2_btree_node_iter_advance(&node_iter, l->b); k = bch2_btree_node_iter_peek(&node_iter, l->b); if (!k) break; - bch2_bkey_unpack(l->b, &tmp.k, k); - bch2_btree_node_prefetch(c, iter, &tmp.k, iter->level - 1); + bch2_bkey_buf_unpack(&tmp, c, l->b, k); + bch2_btree_node_prefetch(c, iter, tmp.k, iter->btree_id, + iter->level - 1); } if (!was_locked) btree_node_unlock(iter, iter->level); + + bch2_bkey_buf_exit(&tmp, c); } static noinline void btree_node_mem_ptr_set(struct btree_iter *iter, @@ -1046,45 +1138,45 @@ static noinline void btree_node_mem_ptr_set(struct btree_iter *iter, btree_node_unlock(iter, plevel); } -static __always_inline int btree_iter_down(struct btree_iter *iter) +static __always_inline int btree_iter_down(struct btree_iter *iter, + unsigned long trace_ip) { struct bch_fs *c = iter->trans->c; struct btree_iter_level *l = &iter->l[iter->level]; struct btree *b; unsigned level = iter->level - 1; enum six_lock_type lock_type = __btree_lock_want(iter, level); - BKEY_PADDED(k) tmp; + struct bkey_buf tmp; + int ret; EBUG_ON(!btree_node_locked(iter, iter->level)); - bch2_bkey_unpack(l->b, &tmp.k, + bch2_bkey_buf_init(&tmp); + bch2_bkey_buf_unpack(&tmp, c, l->b, bch2_btree_node_iter_peek(&l->iter, l->b)); - b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type); - if (unlikely(IS_ERR(b))) - return PTR_ERR(b); + b = bch2_btree_node_get(c, iter, tmp.k, level, lock_type, trace_ip); + ret = PTR_ERR_OR_ZERO(b); + if (unlikely(ret)) + goto err; mark_btree_node_locked(iter, level, lock_type); btree_iter_node_set(iter, b); - if (tmp.k.k.type == KEY_TYPE_btree_ptr_v2 && - unlikely(b != btree_node_mem_ptr(&tmp.k))) + if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 && + unlikely(b != btree_node_mem_ptr(tmp.k))) btree_node_mem_ptr_set(iter, level + 1, b); if (iter->flags & BTREE_ITER_PREFETCH) btree_iter_prefetch(iter); iter->level = level; - - return 0; -} - -static void btree_iter_up(struct btree_iter *iter) -{ - btree_node_unlock(iter, iter->level++); +err: + bch2_bkey_buf_exit(&tmp, c); + return ret; } -static int btree_iter_traverse_one(struct btree_iter *); +static int btree_iter_traverse_one(struct btree_iter *, unsigned long); static int __btree_iter_traverse_all(struct btree_trans *trans, int ret) { @@ -1104,11 +1196,12 @@ retry_all: sorted[nr_sorted++] = iter->idx; #define btree_iter_cmp_by_idx(_l, _r) \ - btree_iter_cmp(&trans->iters[_l], &trans->iters[_r]) + btree_iter_lock_cmp(&trans->iters[_l], &trans->iters[_r]) bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx); #undef btree_iter_cmp_by_idx bch2_trans_unlock(trans); + cond_resched(); if (unlikely(ret == -ENOMEM)) { struct closure cl; @@ -1139,7 +1232,7 @@ retry_all: if (!(trans->iters_linked & (1ULL << idx))) continue; - ret = btree_iter_traverse_one(&trans->iters[idx]); + ret = btree_iter_traverse_one(&trans->iters[idx], _THIS_IP_); if (ret) goto retry_all; } @@ -1171,9 +1264,9 @@ static inline bool btree_iter_good_node(struct btree_iter *iter, !bch2_btree_node_relock(iter, l)) return false; - if (check_pos <= 0 && btree_iter_pos_before_node(iter, iter->l[l].b)) + if (check_pos < 0 && btree_iter_pos_before_node(iter, iter->l[l].b)) return false; - if (check_pos >= 0 && btree_iter_pos_after_node(iter, iter->l[l].b)) + if (check_pos > 0 && btree_iter_pos_after_node(iter, iter->l[l].b)) return false; return true; } @@ -1202,7 +1295,8 @@ static inline 
unsigned btree_iter_up_until_good_node(struct btree_iter *iter, * On error, caller (peek_node()/peek_key()) must return NULL; the error is * stashed in the iterator and returned from bch2_trans_exit(). */ -static int btree_iter_traverse_one(struct btree_iter *iter) +static int btree_iter_traverse_one(struct btree_iter *iter, + unsigned long trace_ip) { unsigned depth_want = iter->level; @@ -1223,24 +1317,8 @@ static int btree_iter_traverse_one(struct btree_iter *iter) if (unlikely(iter->level >= BTREE_MAX_DEPTH)) return 0; - /* - * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos - * here unnecessary - */ iter->level = btree_iter_up_until_good_node(iter, 0); - /* - * If we've got a btree node locked (i.e. we aren't about to relock the - * root) - advance its node iterator if necessary: - * - * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary - */ - if (is_btree_node(iter, iter->level)) { - BUG_ON(!btree_iter_pos_in_node(iter, iter->l[iter->level].b)); - - btree_iter_advance_to_pos(iter, &iter->l[iter->level], -1); - } - /* * Note: iter->nodes[iter->level] may be temporarily NULL here - that * would indicate to other code that we got to the end of the btree, @@ -1249,8 +1327,8 @@ static int btree_iter_traverse_one(struct btree_iter *iter) */ while (iter->level > depth_want) { int ret = btree_iter_node(iter, iter->level) - ? btree_iter_down(iter) - : btree_iter_lock_root(iter, depth_want); + ? btree_iter_down(iter, trace_ip) + : btree_iter_lock_root(iter, depth_want, trace_ip); if (unlikely(ret)) { if (ret == 1) return 0; @@ -1275,32 +1353,41 @@ static int btree_iter_traverse_one(struct btree_iter *iter) return 0; } -int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) +static int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) { struct btree_trans *trans = iter->trans; int ret; ret = bch2_trans_cond_resched(trans) ?: - btree_iter_traverse_one(iter); + btree_iter_traverse_one(iter, _RET_IP_); if (unlikely(ret)) ret = __btree_iter_traverse_all(trans, ret); return ret; } -static inline void bch2_btree_iter_checks(struct btree_iter *iter) +/* + * Note: + * bch2_btree_iter_traverse() is for external users, btree_iter_traverse() is + * for internal btree iterator users + * + * bch2_btree_iter_traverse sets iter->real_pos to iter->pos, + * btree_iter_traverse() does not: + */ +static inline int __must_check +btree_iter_traverse(struct btree_iter *iter) { - enum btree_iter_type type = btree_iter_type(iter); - - EBUG_ON(iter->btree_id >= BTREE_ID_NR); + return iter->uptodate >= BTREE_ITER_NEED_RELOCK + ? 
__bch2_btree_iter_traverse(iter) + : 0; +} - BUG_ON((type == BTREE_ITER_KEYS || - type == BTREE_ITER_CACHED) && - (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || - bkey_cmp(iter->pos, iter->k.p) > 0)); +int __must_check +bch2_btree_iter_traverse(struct btree_iter *iter) +{ + btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); - bch2_btree_iter_verify_locks(iter); - bch2_btree_iter_verify_level(iter, iter->level); + return btree_iter_traverse(iter); } /* Iterate across nodes (leaf and interior nodes) */ @@ -1311,12 +1398,9 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) int ret; EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); - bch2_btree_iter_checks(iter); - - if (iter->uptodate == BTREE_ITER_UPTODATE) - return iter->l[iter->level].b; + bch2_btree_iter_verify(iter); - ret = bch2_btree_iter_traverse(iter); + ret = btree_iter_traverse(iter); if (ret) return NULL; @@ -1324,10 +1408,9 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) if (!b) return NULL; - BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0); + BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0); - iter->pos = b->key.k.p; - iter->uptodate = BTREE_ITER_UPTODATE; + iter->pos = iter->real_pos = b->key.k.p; bch2_btree_iter_verify(iter); @@ -1340,7 +1423,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) int ret; EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); - bch2_btree_iter_checks(iter); + bch2_btree_iter_verify(iter); /* already got to end? */ if (!btree_iter_node(iter, iter->level)) @@ -1348,12 +1431,12 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) bch2_trans_cond_resched(iter->trans); - btree_iter_up(iter); + btree_node_unlock(iter, iter->level); + iter->l[iter->level].b = BTREE_ITER_NO_NODE_UP; + iter->level++; - if (!bch2_btree_node_relock(iter, iter->level)) - btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); - - ret = bch2_btree_iter_traverse(iter); + btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + ret = btree_iter_traverse(iter); if (ret) return NULL; @@ -1362,34 +1445,28 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) if (!b) return NULL; - if (bkey_cmp(iter->pos, b->key.k.p) < 0) { + if (bpos_cmp(iter->pos, b->key.k.p) < 0) { /* * Haven't gotten to the end of the parent node: go back down to * the next child node */ + btree_iter_set_search_pos(iter, bpos_successor(iter->pos)); - /* - * We don't really want to be unlocking here except we can't - * directly tell btree_iter_traverse() "traverse to this level" - * except by setting iter->level, so we have to unlock so we - * don't screw up our lock invariants: - */ - if (btree_node_read_locked(iter, iter->level)) - btree_node_unlock(iter, iter->level); - - iter->pos = bkey_successor(iter->pos); - iter->level = iter->min_depth; + /* Unlock to avoid screwing up our lock invariants: */ + btree_node_unlock(iter, iter->level); + iter->level = iter->min_depth; btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - ret = bch2_btree_iter_traverse(iter); + bch2_btree_iter_verify(iter); + + ret = btree_iter_traverse(iter); if (ret) return NULL; b = iter->l[iter->level].b; } - iter->pos = b->key.k.p; - iter->uptodate = BTREE_ITER_UPTODATE; + iter->pos = iter->real_pos = b->key.k.p; bch2_btree_iter_verify(iter); @@ -1398,43 +1475,16 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) /* Iterate across keys (in leaf nodes only) */ -void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos) -{ - struct btree_iter_level *l = 
&iter->l[0]; - - EBUG_ON(iter->level != 0); - EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0); - EBUG_ON(!btree_node_locked(iter, 0)); - EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0); - - bkey_init(&iter->k); - iter->k.p = iter->pos = new_pos; - btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); - - btree_iter_advance_to_pos(iter, l, -1); - - /* - * XXX: - * keeping a node locked that's outside (even just outside) iter->pos - * breaks __bch2_btree_node_lock(). This seems to only affect - * bch2_btree_node_get_sibling so for now it's fixed there, but we - * should try to get rid of this corner case. - * - * (this behaviour is currently needed for BTREE_INSERT_NOUNLOCK) - */ - - if (bch2_btree_node_iter_end(&l->iter) && - btree_iter_pos_after_node(iter, l->b)) - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); -} - -static void btree_iter_pos_changed(struct btree_iter *iter, int cmp) +static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos) { + int cmp = bpos_cmp(new_pos, iter->real_pos); unsigned l = iter->level; if (!cmp) goto out; + iter->real_pos = new_pos; + if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) { btree_node_unlock(iter, 0); iter->l[0].b = BTREE_ITER_NO_NODE_UP; @@ -1464,245 +1514,162 @@ out: btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); else btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); + + bch2_btree_iter_verify(iter); } -void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos, - bool strictly_greater) +inline bool bch2_btree_iter_advance(struct btree_iter *iter) { - struct bpos old = btree_iter_search_key(iter); - int cmp; - - iter->flags &= ~BTREE_ITER_IS_EXTENTS; - iter->flags |= strictly_greater ? BTREE_ITER_IS_EXTENTS : 0; + struct bpos pos = iter->k.p; + bool ret = bpos_cmp(pos, POS_MAX) != 0; - bkey_init(&iter->k); - iter->k.p = iter->pos = new_pos; - - cmp = bkey_cmp(btree_iter_search_key(iter), old); - - btree_iter_pos_changed(iter, cmp); + if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + pos = bkey_successor(iter, pos); + bch2_btree_iter_set_pos(iter, pos); + return ret; } -void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) +inline bool bch2_btree_iter_rewind(struct btree_iter *iter) { - int cmp = bkey_cmp(new_pos, iter->pos); - - bkey_init(&iter->k); - iter->k.p = iter->pos = new_pos; + struct bpos pos = bkey_start_pos(&iter->k); + bool ret = bpos_cmp(pos, POS_MIN) != 0; - btree_iter_pos_changed(iter, cmp); + if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + pos = bkey_predecessor(iter, pos); + bch2_btree_iter_set_pos(iter, pos); + return ret; } static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) { - struct btree_iter_level *l = &iter->l[0]; - bool ret; - - bkey_init(&iter->k); - iter->k.p = iter->pos = l->b->key.k.p; + struct bpos next_pos = iter->l[0].b->key.k.p; + bool ret = bpos_cmp(next_pos, POS_MAX) != 0; - ret = bkey_cmp(iter->pos, POS_MAX) != 0; - if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) - iter->k.p = iter->pos = bkey_successor(iter->pos); + /* + * Typically, we don't want to modify iter->pos here, since that + * indicates where we searched from - unless we got to the end of the + * btree, in that case we want iter->pos to reflect that: + */ + if (ret) + btree_iter_set_search_pos(iter, bpos_successor(next_pos)); + else + bch2_btree_iter_set_pos(iter, POS_MAX); - btree_iter_pos_changed(iter, 1); return ret; } static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) { - struct btree_iter_level *l = 
&iter->l[0]; - bool ret; - - bkey_init(&iter->k); - iter->k.p = iter->pos = l->b->data->min_key; - iter->uptodate = BTREE_ITER_NEED_TRAVERSE; - - ret = bkey_cmp(iter->pos, POS_MIN) != 0; - if (ret) { - iter->k.p = iter->pos = bkey_predecessor(iter->pos); + struct bpos next_pos = iter->l[0].b->data->min_key; + bool ret = bpos_cmp(next_pos, POS_MIN) != 0; - if (iter->flags & BTREE_ITER_IS_EXTENTS) - iter->k.p = iter->pos = bkey_predecessor(iter->pos); - } + if (ret) + btree_iter_set_search_pos(iter, bpos_predecessor(next_pos)); + else + bch2_btree_iter_set_pos(iter, POS_MIN); - btree_iter_pos_changed(iter, -1); return ret; } -/** - * btree_iter_peek_uptodate - given an iterator that is uptodate, return the key - * it currently points to - */ -static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter) +static struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, + enum btree_id btree_id, struct bpos pos) { - struct btree_iter_level *l = &iter->l[0]; - struct bkey_s_c ret = { .k = &iter->k }; - - if (!bkey_deleted(&iter->k)) { - struct bkey_packed *_k = - __bch2_btree_node_iter_peek_all(&l->iter, l->b); - - ret.v = bkeyp_val(&l->b->format, _k); - - if (debug_check_iterators(iter->trans->c)) { - struct bkey k = bkey_unpack_key(l->b, _k); + struct btree_insert_entry *i; - BUG_ON(memcmp(&k, &iter->k, sizeof(k))); + trans_for_each_update2(trans, i) + if ((cmp_int(btree_id, i->iter->btree_id) ?: + bkey_cmp(pos, i->k->k.p)) <= 0) { + if (btree_id == i->iter->btree_id) + return i->k; + break; } - if (debug_check_bkeys(iter->trans->c)) - bch2_bkey_debugcheck(iter->trans->c, l->b, ret); - } - - return ret; + return NULL; } -/** - * bch2_btree_iter_peek: returns first key greater than or equal to iterator's - * current position - */ -struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) +static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bool with_updates) { - struct btree_iter_level *l = &iter->l[0]; + struct bpos search_key = btree_iter_search_key(iter); + struct bkey_i *next_update = with_updates + ? 
btree_trans_peek_updates(iter->trans, iter->btree_id, search_key) + : NULL; struct bkey_s_c k; int ret; EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); - bch2_btree_iter_checks(iter); + bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); - if (iter->uptodate == BTREE_ITER_UPTODATE && - !bkey_deleted(&iter->k)) - return btree_iter_peek_uptodate(iter); + btree_iter_set_search_pos(iter, search_key); while (1) { - ret = bch2_btree_iter_traverse(iter); + ret = btree_iter_traverse(iter); if (unlikely(ret)) return bkey_s_c_err(ret); - k = __btree_iter_peek(iter, l); - if (likely(k.k)) + k = btree_iter_level_peek(iter, &iter->l[0]); + + if (next_update && + bpos_cmp(next_update->k.p, iter->real_pos) <= 0) + k = bkey_i_to_s_c(next_update); + + if (likely(k.k)) { + if (bkey_deleted(k.k)) { + btree_iter_set_search_pos(iter, + bkey_successor(iter, k.k->p)); + continue; + } + break; + } if (!btree_iter_set_pos_to_next_leaf(iter)) return bkey_s_c_null; } /* - * iter->pos should always be equal to the key we just - * returned - except extents can straddle iter->pos: + * iter->pos should be monotonically increasing, and always be equal to + * the key we just returned - except extents can straddle iter->pos: */ - if (!(iter->flags & BTREE_ITER_IS_EXTENTS) || - bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) + if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) iter->pos = bkey_start_pos(k.k); - iter->uptodate = BTREE_ITER_UPTODATE; - - bch2_btree_iter_verify_level(iter, 0); + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); return k; } +/** + * bch2_btree_iter_peek: returns first key greater than or equal to iterator's + * current position + */ +struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) +{ + return __btree_iter_peek(iter, false); +} + /** * bch2_btree_iter_next: returns first key greater than iterator's current * position */ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) { - if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) + if (!bch2_btree_iter_advance(iter)) return bkey_s_c_null; - bch2_btree_iter_set_pos(iter, - (iter->flags & BTREE_ITER_IS_EXTENTS) - ? iter->k.p - : bkey_successor(iter->k.p)); - return bch2_btree_iter_peek(iter); } -static struct bkey_s_c __btree_trans_updates_peek(struct btree_iter *iter) -{ - struct bpos pos = btree_iter_search_key(iter); - struct btree_trans *trans = iter->trans; - struct btree_insert_entry *i; - - trans_for_each_update2(trans, i) - if ((cmp_int(iter->btree_id, i->iter->btree_id) ?: - bkey_cmp(pos, i->k->k.p)) <= 0) - break; - - return i < trans->updates2 + trans->nr_updates2 && - iter->btree_id == i->iter->btree_id - ? 
bkey_i_to_s_c(i->k) - : bkey_s_c_null; -} - -static struct bkey_s_c __bch2_btree_iter_peek_with_updates(struct btree_iter *iter) -{ - struct btree_iter_level *l = &iter->l[0]; - struct bkey_s_c k = __btree_iter_peek(iter, l); - struct bkey_s_c u = __btree_trans_updates_peek(iter); - - if (k.k && (!u.k || bkey_cmp(k.k->p, u.k->p) < 0)) - return k; - if (u.k && bkey_cmp(u.k->p, l->b->key.k.p) <= 0) { - iter->k = *u.k; - return u; - } - return bkey_s_c_null; -} - struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) { - struct bkey_s_c k; - int ret; - - EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); - bch2_btree_iter_checks(iter); - - while (1) { - ret = bch2_btree_iter_traverse(iter); - if (unlikely(ret)) - return bkey_s_c_err(ret); - - k = __bch2_btree_iter_peek_with_updates(iter); - - if (k.k && bkey_deleted(k.k)) { - bch2_btree_iter_set_pos(iter, - (iter->flags & BTREE_ITER_IS_EXTENTS) - ? iter->k.p - : bkey_successor(iter->k.p)); - continue; - } - - if (likely(k.k)) - break; - - if (!btree_iter_set_pos_to_next_leaf(iter)) - return bkey_s_c_null; - } - - /* - * iter->pos should always be equal to the key we just - * returned - except extents can straddle iter->pos: - */ - if (!(iter->flags & BTREE_ITER_IS_EXTENTS) || - bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) - iter->pos = bkey_start_pos(k.k); - - iter->uptodate = BTREE_ITER_UPTODATE; - return k; + return __btree_iter_peek(iter, true); } struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter) { - if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) + if (!bch2_btree_iter_advance(iter)) return bkey_s_c_null; - bch2_btree_iter_set_pos(iter, - (iter->flags & BTREE_ITER_IS_EXTENTS) - ? iter->k.p - : bkey_successor(iter->k.p)); - return bch2_btree_iter_peek_with_updates(iter); } @@ -1712,38 +1679,57 @@ struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter) */ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) { - struct bpos pos = iter->pos; struct btree_iter_level *l = &iter->l[0]; struct bkey_s_c k; int ret; EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); - bch2_btree_iter_checks(iter); + bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); - if (iter->uptodate == BTREE_ITER_UPTODATE && - !bkey_deleted(&iter->k)) - return btree_iter_peek_uptodate(iter); + btree_iter_set_search_pos(iter, iter->pos); while (1) { - ret = bch2_btree_iter_traverse(iter); - if (unlikely(ret)) - return bkey_s_c_err(ret); + ret = btree_iter_traverse(iter); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); + goto no_key; + } - k = __btree_iter_peek(iter, l); - if (!k.k || bkey_cmp(bkey_start_pos(k.k), pos) > 0) - k = __btree_iter_prev(iter, l); + k = btree_iter_level_peek(iter, l); + if (!k.k || + ((iter->flags & BTREE_ITER_IS_EXTENTS) + ? 
bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0 + : bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)) + k = btree_iter_level_prev(iter, l); if (likely(k.k)) break; - if (!btree_iter_set_pos_to_prev_leaf(iter)) - return bkey_s_c_null; + if (!btree_iter_set_pos_to_prev_leaf(iter)) { + k = bkey_s_c_null; + goto no_key; + } } - EBUG_ON(bkey_cmp(bkey_start_pos(k.k), pos) > 0); - iter->pos = bkey_start_pos(k.k); - iter->uptodate = BTREE_ITER_UPTODATE; + EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0); + + /* Extents can straddle iter->pos: */ + if (bkey_cmp(k.k->p, iter->pos) < 0) + iter->pos = k.k->p; +out: + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); return k; +no_key: + /* + * btree_iter_level_peek() may have set iter->k to a key we didn't want, and + * then we errored going to the previous leaf - make sure it's + * consistent with iter->pos: + */ + bkey_init(&iter->k); + iter->k.p = iter->pos; + goto out; } /** @@ -1752,81 +1738,52 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) */ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) { - struct bpos pos = bkey_start_pos(&iter->k); - - EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); - bch2_btree_iter_checks(iter); - - if (unlikely(!bkey_cmp(pos, POS_MIN))) + if (!bch2_btree_iter_rewind(iter)) return bkey_s_c_null; - bch2_btree_iter_set_pos(iter, bkey_predecessor(pos)); - return bch2_btree_iter_peek_prev(iter); } static inline struct bkey_s_c __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) { - struct btree_iter_level *l = &iter->l[0]; - struct btree_node_iter node_iter; struct bkey_s_c k; - struct bkey n; - int ret; + struct bpos pos, next_start; /* keys & holes can't span inode numbers: */ if (iter->pos.offset == KEY_OFFSET_MAX) { if (iter->pos.inode == KEY_INODE_MAX) return bkey_s_c_null; - bch2_btree_iter_set_pos(iter, bkey_successor(iter->pos)); - - ret = bch2_btree_iter_traverse(iter); - if (unlikely(ret)) - return bkey_s_c_err(ret); + bch2_btree_iter_set_pos(iter, bkey_successor(iter, iter->pos)); } - /* - * iterator is now at the correct position for inserting at iter->pos, - * but we need to keep iterating until we find the first non whiteout so - * we know how big a hole we have, if any: - */ - - node_iter = l->iter; - k = __btree_iter_unpack(iter, l, &iter->k, - bch2_btree_node_iter_peek(&node_iter, l->b)); - - if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) { - /* - * We're not setting iter->uptodate because the node iterator - * doesn't necessarily point at the key we're returning: - */ + pos = iter->pos; + k = bch2_btree_iter_peek(iter); + iter->pos = pos; - EBUG_ON(bkey_cmp(k.k->p, iter->pos) <= 0); - bch2_btree_iter_verify_level(iter, 0); + if (bkey_err(k)) return k; - } - /* hole */ + if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) + return k; - if (!k.k) - k.k = &l->b->key.k; + next_start = k.k ? bkey_start_pos(k.k) : POS_MAX; - bkey_init(&n); - n.p = iter->pos; - bch2_key_resize(&n, + bkey_init(&iter->k); + iter->k.p = iter->pos; + bch2_key_resize(&iter->k, min_t(u64, KEY_SIZE_MAX, - (k.k->p.inode == n.p.inode - ? bkey_start_offset(k.k) + (next_start.inode == iter->pos.inode + ? 
next_start.offset : KEY_OFFSET_MAX) - - n.p.offset)); + iter->pos.offset)); - EBUG_ON(!n.size); + EBUG_ON(!iter->k.size); - iter->k = n; - iter->uptodate = BTREE_ITER_UPTODATE; + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); - bch2_btree_iter_verify_level(iter, 0); return (struct bkey_s_c) { &iter->k, NULL }; } @@ -1837,19 +1794,19 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) int ret; EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); - bch2_btree_iter_checks(iter); + bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); - if (iter->uptodate == BTREE_ITER_UPTODATE) - return btree_iter_peek_uptodate(iter); - - ret = bch2_btree_iter_traverse(iter); - if (unlikely(ret)) - return bkey_s_c_err(ret); + btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); if (iter->flags & BTREE_ITER_IS_EXTENTS) return __bch2_btree_iter_peek_slot_extents(iter); - k = __btree_iter_peek_all(iter, l, &iter->k); + ret = btree_iter_traverse(iter); + if (unlikely(ret)) + return bkey_s_c_err(ret); + + k = btree_iter_level_peek_all(iter, l, &iter->k); EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0); @@ -1860,20 +1817,23 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) k = (struct bkey_s_c) { &iter->k, NULL }; } - iter->uptodate = BTREE_ITER_UPTODATE; - bch2_btree_iter_verify_level(iter, 0); + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); return k; } struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) { - if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) + if (!bch2_btree_iter_advance(iter)) return bkey_s_c_null; - bch2_btree_iter_set_pos(iter, - (iter->flags & BTREE_ITER_IS_EXTENTS) - ? iter->k.p - : bkey_successor(iter->k.p)); + return bch2_btree_iter_peek_slot(iter); +} + +struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) +{ + if (!bch2_btree_iter_rewind(iter)) + return bkey_s_c_null; return bch2_btree_iter_peek_slot(iter); } @@ -1884,9 +1844,9 @@ struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter) int ret; EBUG_ON(btree_iter_type(iter) != BTREE_ITER_CACHED); - bch2_btree_iter_checks(iter); + bch2_btree_iter_verify(iter); - ret = bch2_btree_iter_traverse(iter); + ret = btree_iter_traverse(iter); if (unlikely(ret)) return bkey_s_c_err(ret); @@ -1900,26 +1860,17 @@ struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter) } static inline void bch2_btree_iter_init(struct btree_trans *trans, - struct btree_iter *iter, enum btree_id btree_id, - struct bpos pos, unsigned flags) + struct btree_iter *iter, enum btree_id btree_id) { struct bch_fs *c = trans->c; unsigned i; - if (btree_node_type_is_extents(btree_id) && - !(flags & BTREE_ITER_NODES)) - flags |= BTREE_ITER_IS_EXTENTS; - iter->trans = trans; - iter->pos = pos; - bkey_init(&iter->k); - iter->k.p = pos; - iter->flags = flags; iter->uptodate = BTREE_ITER_NEED_TRAVERSE; iter->btree_id = btree_id; iter->level = 0; iter->min_depth = 0; - iter->locks_want = flags & BTREE_ITER_INTENT ? 
1 : 0; + iter->locks_want = 0; iter->nodes_locked = 0; iter->nodes_intent_locked = 0; for (i = 0; i < ARRAY_SIZE(iter->l); i++) @@ -1948,6 +1899,7 @@ int bch2_trans_iter_put(struct btree_trans *trans, return 0; BUG_ON(trans->iters + iter->idx != iter); + BUG_ON(!btree_iter_live(trans, iter)); ret = btree_iter_err(iter); @@ -1965,115 +1917,51 @@ int bch2_trans_iter_free(struct btree_trans *trans, if (IS_ERR_OR_NULL(iter)) return 0; - trans->iters_touched &= ~(1ULL << iter->idx); + set_btree_iter_dontneed(trans, iter); return bch2_trans_iter_put(trans, iter); } -static int bch2_trans_realloc_iters(struct btree_trans *trans, - unsigned new_size) +noinline __cold +static void btree_trans_iter_alloc_fail(struct btree_trans *trans) { - void *p, *new_iters, *new_updates, *new_updates2; - size_t iters_bytes; - size_t updates_bytes; - - new_size = roundup_pow_of_two(new_size); - - BUG_ON(new_size > BTREE_ITER_MAX); - - if (new_size <= trans->size) - return 0; - - BUG_ON(trans->used_mempool); - bch2_trans_unlock(trans); - - iters_bytes = sizeof(struct btree_iter) * new_size; - updates_bytes = sizeof(struct btree_insert_entry) * new_size; - - p = kmalloc(iters_bytes + - updates_bytes + - updates_bytes, GFP_NOFS); - if (p) - goto success; - - p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); - new_size = BTREE_ITER_MAX; - - trans->used_mempool = true; -success: - new_iters = p; p += iters_bytes; - new_updates = p; p += updates_bytes; - new_updates2 = p; p += updates_bytes; - - memcpy(new_iters, trans->iters, - sizeof(struct btree_iter) * trans->nr_iters); - memcpy(new_updates, trans->updates, - sizeof(struct btree_insert_entry) * trans->nr_updates); - memcpy(new_updates2, trans->updates2, - sizeof(struct btree_insert_entry) * trans->nr_updates2); - - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) - memset(trans->iters, POISON_FREE, - sizeof(struct btree_iter) * trans->nr_iters + - sizeof(struct btree_insert_entry) * trans->nr_iters); - - if (trans->iters != trans->iters_onstack) - kfree(trans->iters); - - trans->iters = new_iters; - trans->updates = new_updates; - trans->updates2 = new_updates2; - trans->size = new_size; + struct btree_iter *iter; + struct btree_insert_entry *i; + char buf[100]; - if (trans->iters_live) { - trace_trans_restart_iters_realloced(trans->ip, trans->size); - return -EINTR; + trans_for_each_iter(trans, iter) + printk(KERN_ERR "iter: btree %s pos %s%s%s%s %pS\n", + bch2_btree_ids[iter->btree_id], + (bch2_bpos_to_text(&PBUF(buf), iter->pos), buf), + btree_iter_live(trans, iter) ? " live" : "", + (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", + iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? 
" keep" : "", + (void *) iter->ip_allocated); + + trans_for_each_update(trans, i) { + char buf[300]; + + bch2_bkey_val_to_text(&PBUF(buf), trans->c, bkey_i_to_s_c(i->k)); + printk(KERN_ERR "update: btree %s %s\n", + bch2_btree_ids[i->iter->btree_id], buf); } - - return 0; + panic("trans iter oveflow\n"); } static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) { - unsigned idx = __ffs64(~trans->iters_linked); + unsigned idx; - if (idx < trans->nr_iters) - goto got_slot; + if (unlikely(trans->iters_linked == + ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) + btree_trans_iter_alloc_fail(trans); - if (trans->nr_iters == trans->size) { - int ret; - - if (trans->nr_iters >= BTREE_ITER_MAX) { - struct btree_iter *iter; - - trans_for_each_iter(trans, iter) { - pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps", - bch2_btree_ids[iter->btree_id], - iter->pos.inode, - iter->pos.offset, - (trans->iters_live & (1ULL << iter->idx)) ? " live" : "", - (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", - iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "", - (void *) iter->ip_allocated); - } - - panic("trans iter oveflow\n"); - } + idx = __ffs64(~trans->iters_linked); - ret = bch2_trans_realloc_iters(trans, trans->size * 2); - if (ret) - return ERR_PTR(ret); - } - - idx = trans->nr_iters++; - BUG_ON(trans->nr_iters > trans->size); - - trans->iters[idx].idx = idx; -got_slot: - BUG_ON(trans->iters_linked & (1ULL << idx)); - trans->iters_linked |= 1ULL << idx; - trans->iters[idx].flags = 0; + trans->iters_linked |= 1ULL << idx; + trans->iters[idx].idx = idx; + trans->iters[idx].flags = 0; return &trans->iters[idx]; } @@ -2095,21 +1983,21 @@ static inline void btree_iter_copy(struct btree_iter *dst, dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT; } -static inline struct bpos bpos_diff(struct bpos l, struct bpos r) -{ - if (bkey_cmp(l, r) > 0) - swap(l, r); - - return POS(r.inode - l.inode, r.offset - l.offset); -} - -static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, - unsigned btree_id, struct bpos pos, - unsigned flags) +struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, + unsigned btree_id, struct bpos pos, + unsigned locks_want, + unsigned depth, + unsigned flags) { struct btree_iter *iter, *best = NULL; - BUG_ON(trans->nr_iters > BTREE_ITER_MAX); + if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES && + !btree_type_has_snapshots(btree_id)) + flags &= ~BTREE_ITER_ALL_SNAPSHOTS; + + if (!(flags & BTREE_ITER_ALL_SNAPSHOTS)) + pos.snapshot = btree_type_has_snapshots(btree_id) + ? 
U32_MAX : 0; trans_for_each_iter(trans, iter) { if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE)) @@ -2119,8 +2007,8 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, continue; if (best && - bkey_cmp(bpos_diff(best->pos, pos), - bpos_diff(iter->pos, pos)) < 0) + bkey_cmp(bpos_diff(best->real_pos, pos), + bpos_diff(iter->real_pos, pos)) > 0) continue; best = iter; @@ -2128,52 +2016,50 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, if (!best) { iter = btree_trans_iter_alloc(trans); - if (IS_ERR(iter)) - return iter; - - bch2_btree_iter_init(trans, iter, btree_id, pos, flags); - } else if ((trans->iters_live & (1ULL << best->idx)) || - (best->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) { + bch2_btree_iter_init(trans, iter, btree_id); + } else if (btree_iter_keep(trans, best)) { iter = btree_trans_iter_alloc(trans); - if (IS_ERR(iter)) - return iter; - btree_iter_copy(iter, best); } else { iter = best; } - iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; - iter->flags &= ~BTREE_ITER_USER_FLAGS; - iter->flags |= flags & BTREE_ITER_USER_FLAGS; + trans->iters_live |= 1ULL << iter->idx; + trans->iters_touched |= 1ULL << iter->idx; - if (iter->flags & BTREE_ITER_INTENT) - bch2_btree_iter_upgrade(iter, 1); - else - bch2_btree_iter_downgrade(iter); + if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES && + btree_node_type_is_extents(btree_id) && + !(flags & BTREE_ITER_NOT_EXTENTS) && + !(flags & BTREE_ITER_ALL_SNAPSHOTS)) + flags |= BTREE_ITER_IS_EXTENTS; - BUG_ON(iter->btree_id != btree_id); - BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE); - BUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); - BUG_ON(iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT); - BUG_ON(trans->iters_live & (1ULL << iter->idx)); + iter->flags = flags; - trans->iters_live |= 1ULL << iter->idx; - trans->iters_touched |= 1ULL << iter->idx; + iter->snapshot = pos.snapshot; - return iter; -} + locks_want = min(locks_want, BTREE_MAX_DEPTH); -struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, - enum btree_id btree_id, - struct bpos pos, unsigned flags) -{ - struct btree_iter *iter = - __btree_trans_get_iter(trans, btree_id, pos, flags); + if (locks_want > iter->locks_want) { + iter->locks_want = locks_want; + btree_iter_get_locks(iter, true, false); + } else if (locks_want < iter->locks_want) { + __bch2_btree_iter_downgrade(iter, locks_want); + } + + while (iter->level < depth) { + btree_node_unlock(iter, iter->level); + iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; + iter->level++; + } + + while (iter->level > depth) + iter->l[--iter->level].b = BTREE_ITER_NO_NODE_INIT; + + iter->min_depth = depth; + + bch2_btree_iter_set_pos(iter, pos); + btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); - if (!IS_ERR(iter)) - __bch2_btree_iter_set_pos(iter, pos, - btree_node_type_is_extents(btree_id)); return iter; } @@ -2185,20 +2071,18 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, unsigned flags) { struct btree_iter *iter = - __btree_trans_get_iter(trans, btree_id, pos, - flags|BTREE_ITER_NODES); - unsigned i; + __bch2_trans_get_iter(trans, btree_id, pos, + locks_want, depth, + BTREE_ITER_NODES| + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS| + flags); - BUG_ON(IS_ERR(iter)); BUG_ON(bkey_cmp(iter->pos, pos)); - - iter->locks_want = locks_want; - iter->level = depth; - iter->min_depth = depth; - - for (i = 0; i < ARRAY_SIZE(iter->l); i++) - iter->l[i].b = NULL; - iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; + 
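
With the reallocation path removed above, a transaction's iterators live in a fixed table of BTREE_ITER_MAX entries and trans->iters_linked is a 64-bit occupancy bitmap, so claiming a slot is a single find-first-zero on the inverted mask. A standalone sketch of that pattern, assuming nothing from the patch itself — trans_alloc_slot() and the use of GCC's __builtin_ctzll() are illustrative stand-ins for btree_trans_iter_alloc() and the kernel's __ffs64():

#include <stdint.h>

/* Claim the lowest free slot in a 64-entry table; return -1 if full. */
static int trans_alloc_slot(uint64_t *linked)
{
	int idx;

	if (*linked == UINT64_MAX)		/* every slot already allocated */
		return -1;

	idx = __builtin_ctzll(~*linked);	/* index of the first zero bit */
	*linked |= 1ULL << idx;			/* mark the slot live */
	return idx;
}

The patch's version does not return an error in the full case: running out of iterators indicates a leak, so btree_trans_iter_alloc_fail() dumps every held iterator and pending update and panics instead.
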
BUG_ON(iter->locks_want != min(locks_want, BTREE_MAX_DEPTH)); + BUG_ON(iter->level != depth); + BUG_ON(iter->min_depth != depth); + iter->ip_allocated = _RET_IP_; return iter; } @@ -2209,9 +2093,6 @@ struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *iter; iter = btree_trans_iter_alloc(trans); - if (IS_ERR(iter)) - return iter; - btree_iter_copy(iter, src); trans->iters_live |= 1ULL << iter->idx; @@ -2219,7 +2100,7 @@ struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans, * We don't need to preserve this iter since it's cheap to copy it * again - this will cause trans_iter_put() to free it right away: */ - trans->iters_touched &= ~(1ULL << iter->idx); + set_btree_iter_dontneed(trans, iter); return iter; } @@ -2286,11 +2167,11 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) trans->iters_touched &= trans->iters_live; - trans->need_reset = 0; trans->nr_updates = 0; trans->nr_updates2 = 0; trans->mem_top = 0; + trans->hooks = NULL; trans->extra_journal_entries = NULL; trans->extra_journal_entry_u64s = 0; @@ -2301,35 +2182,52 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags) (void *) &trans->fs_usage_deltas->memset_start); } + if (!(flags & TRANS_RESET_NOUNLOCK)) + bch2_trans_cond_resched(trans); + if (!(flags & TRANS_RESET_NOTRAVERSE)) bch2_btree_iter_traverse_all(trans); } +static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c) +{ + size_t iters_bytes = sizeof(struct btree_iter) * BTREE_ITER_MAX; + size_t updates_bytes = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX; + void *p = NULL; + + BUG_ON(trans->used_mempool); + +#ifdef __KERNEL__ + p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL); +#endif + if (!p) + p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); + + trans->iters = p; p += iters_bytes; + trans->updates = p; p += updates_bytes; + trans->updates2 = p; p += updates_bytes; +} + void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned expected_nr_iters, size_t expected_mem_bytes) { - memset(trans, 0, offsetof(struct btree_trans, iters_onstack)); + memset(trans, 0, sizeof(*trans)); + trans->c = c; + trans->ip = _RET_IP_; /* * reallocating iterators currently completely breaks - * bch2_trans_iter_put(): + * bch2_trans_iter_put(), we always allocate the max: */ - expected_nr_iters = BTREE_ITER_MAX; - - trans->c = c; - trans->ip = _RET_IP_; - trans->size = ARRAY_SIZE(trans->iters_onstack); - trans->iters = trans->iters_onstack; - trans->updates = trans->updates_onstack; - trans->updates2 = trans->updates2_onstack; - trans->fs_usage_deltas = NULL; + bch2_trans_alloc_iters(trans, c); - if (expected_nr_iters > trans->size) - bch2_trans_realloc_iters(trans, expected_nr_iters); + if (expected_mem_bytes) { + trans->mem_bytes = roundup_pow_of_two(expected_mem_bytes); + trans->mem = kmalloc(trans->mem_bytes, GFP_KERNEL|__GFP_NOFAIL); + } - if (expected_mem_bytes) - bch2_trans_preload_mem(trans, expected_mem_bytes); + trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); #ifdef CONFIG_BCACHEFS_DEBUG trans->pid = current->pid; @@ -2341,37 +2239,74 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, int bch2_trans_exit(struct btree_trans *trans) { + struct bch_fs *c = trans->c; + bch2_trans_unlock(trans); #ifdef CONFIG_BCACHEFS_DEBUG + if (trans->iters_live) { + struct btree_iter *iter; + + bch_err(c, "btree iterators leaked!"); + trans_for_each_iter(trans, iter) + if (btree_iter_live(trans, iter)) + printk(KERN_ERR " btree %s 
allocated at %pS\n", + bch2_btree_ids[iter->btree_id], + (void *) iter->ip_allocated); + /* Be noisy about this: */ + bch2_fatal_error(c); + } + mutex_lock(&trans->c->btree_trans_lock); list_del(&trans->list); mutex_unlock(&trans->c->btree_trans_lock); #endif + srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); + bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); kfree(trans->fs_usage_deltas); kfree(trans->mem); - if (trans->used_mempool) + +#ifdef __KERNEL__ + /* + * Userspace doesn't have a real percpu implementation: + */ + trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters); +#endif + if (trans->iters) mempool_free(trans->iters, &trans->c->btree_iters_pool); - else if (trans->iters != trans->iters_onstack) - kfree(trans->iters); + trans->mem = (void *) 0x1; trans->iters = (void *) 0x1; return trans->error ? -EIO : 0; } -static void bch2_btree_iter_node_to_text(struct printbuf *out, - struct btree_bkey_cached_common *_b, - enum btree_iter_type type) +static void __maybe_unused +bch2_btree_iter_node_to_text(struct printbuf *out, + struct btree_bkey_cached_common *_b, + enum btree_iter_type type) { - pr_buf(out, " %px l=%u %s:", - _b, _b->level, bch2_btree_ids[_b->btree_id]); + pr_buf(out, " l=%u %s:", + _b->level, bch2_btree_ids[_b->btree_id]); bch2_bpos_to_text(out, btree_node_pos(_b, type)); } +#ifdef CONFIG_BCACHEFS_DEBUG +static bool trans_has_btree_nodes_locked(struct btree_trans *trans) +{ + struct btree_iter *iter; + + trans_for_each_iter(trans, iter) + if (btree_iter_type(iter) != BTREE_ITER_CACHED && + iter->nodes_locked) + return true; + return false; +} +#endif + void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) { #ifdef CONFIG_BCACHEFS_DEBUG @@ -2382,14 +2317,18 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) mutex_lock(&c->btree_trans_lock); list_for_each_entry(trans, &c->btree_trans_list, list) { - pr_buf(out, "%i %px %ps\n", trans->pid, trans, (void *) trans->ip); + if (!trans_has_btree_nodes_locked(trans)) + continue; + + pr_buf(out, "%i %ps\n", trans->pid, (void *) trans->ip); trans_for_each_iter(trans, iter) { if (!iter->nodes_locked) continue; - pr_buf(out, " iter %u %s:", + pr_buf(out, " iter %u %c %s:", iter->idx, + btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b', bch2_btree_ids[iter->btree_id]); bch2_bpos_to_text(out, iter->pos); pr_buf(out, "\n"); @@ -2408,17 +2347,18 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) b = READ_ONCE(trans->locking); if (b) { - pr_buf(out, " locking iter %u l=%u %s:", + iter = &trans->iters[trans->locking_iter_idx]; + pr_buf(out, " locking iter %u %c l=%u %s:", trans->locking_iter_idx, + btree_iter_type(iter) == BTREE_ITER_CACHED ? 
'c' : 'b', trans->locking_level, bch2_btree_ids[trans->locking_btree_id]); bch2_bpos_to_text(out, trans->locking_pos); - pr_buf(out, " node "); bch2_btree_iter_node_to_text(out, (void *) b, - btree_iter_type(&trans->iters[trans->locking_iter_idx])); + btree_iter_type(iter)); pr_buf(out, "\n"); } } @@ -2429,6 +2369,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) void bch2_fs_btree_iter_exit(struct bch_fs *c) { mempool_exit(&c->btree_iters_pool); + cleanup_srcu_struct(&c->btree_trans_barrier); } int bch2_fs_btree_iter_init(struct bch_fs *c) @@ -2438,7 +2379,8 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) INIT_LIST_HEAD(&c->btree_trans_list); mutex_init(&c->btree_trans_lock); - return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, + return init_srcu_struct(&c->btree_trans_barrier) ?: + mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, sizeof(struct btree_iter) * nr + sizeof(struct btree_insert_entry) * nr + sizeof(struct btree_insert_entry) * nr); diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index bd9ec3e..07d9b6d 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -48,21 +48,22 @@ static inline int btree_iter_err(const struct btree_iter *iter) /* Iterate over iters within a transaction: */ -#define trans_for_each_iter_all(_trans, _iter) \ - for (_iter = (_trans)->iters; \ - _iter < (_trans)->iters + (_trans)->nr_iters; \ - _iter++) - static inline struct btree_iter * __trans_next_iter(struct btree_trans *trans, unsigned idx) { - EBUG_ON(idx < trans->nr_iters && trans->iters[idx].idx != idx); + u64 l; + + if (idx == BTREE_ITER_MAX) + return NULL; - for (; idx < trans->nr_iters; idx++) - if (trans->iters_linked & (1ULL << idx)) - return &trans->iters[idx]; + l = trans->iters_linked >> idx; + if (!l) + return NULL; - return NULL; + idx += __ffs64(l); + EBUG_ON(idx >= BTREE_ITER_MAX); + EBUG_ON(trans->iters[idx].idx != idx); + return &trans->iters[idx]; } #define trans_for_each_iter(_trans, _iter) \ @@ -115,7 +116,6 @@ bool bch2_trans_relock(struct btree_trans *); void bch2_trans_unlock(struct btree_trans *); bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned); -bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *, unsigned); static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter, unsigned new_locks_want) @@ -123,9 +123,7 @@ static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter, new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); return iter->locks_want < new_locks_want - ? (!iter->trans->nounlock - ? __bch2_btree_iter_upgrade(iter, new_locks_want) - : __bch2_btree_iter_upgrade_nounlock(iter, new_locks_want)) + ? __bch2_btree_iter_upgrade(iter, new_locks_want) : iter->uptodate <= BTREE_ITER_NEED_PEEK; } @@ -133,8 +131,10 @@ void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned); static inline void bch2_btree_iter_downgrade(struct btree_iter *iter) { - if (iter->locks_want > (iter->flags & BTREE_ITER_INTENT) ? 1 : 0) - __bch2_btree_iter_downgrade(iter, 0); + unsigned new_locks_want = (iter->flags & BTREE_ITER_INTENT ? 
1 : 0); + + if (iter->locks_want > new_locks_want) + __bch2_btree_iter_downgrade(iter, new_locks_want); } void bch2_trans_downgrade(struct btree_trans *); @@ -144,15 +144,7 @@ void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *); void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *); -int __must_check __bch2_btree_iter_traverse(struct btree_iter *); - -static inline int __must_check -bch2_btree_iter_traverse(struct btree_iter *iter) -{ - return iter->uptodate >= BTREE_ITER_NEED_RELOCK - ? __bch2_btree_iter_traverse(iter) - : 0; -} +int __must_check bch2_btree_iter_traverse(struct btree_iter *); int bch2_btree_iter_traverse_all(struct btree_trans *); @@ -170,18 +162,31 @@ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *); struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *); -void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos); -void __bch2_btree_iter_set_pos(struct btree_iter *, struct bpos, bool); -void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); +bool bch2_btree_iter_advance(struct btree_iter *); +bool bch2_btree_iter_rewind(struct btree_iter *); -static inline int btree_iter_cmp(const struct btree_iter *l, - const struct btree_iter *r) +static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) +{ + if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) + new_pos.snapshot = iter->snapshot; + + iter->k.type = KEY_TYPE_deleted; + iter->k.p.inode = iter->pos.inode = new_pos.inode; + iter->k.p.offset = iter->pos.offset = new_pos.offset; + iter->k.p.snapshot = iter->pos.snapshot = new_pos.snapshot; + iter->k.size = 0; +} + +/* Sort order for locking btree iterators: */ +static inline int btree_iter_lock_cmp(const struct btree_iter *l, + const struct btree_iter *r) { return cmp_int(l->btree_id, r->btree_id) ?: - -cmp_int(btree_iter_type(l), btree_iter_type(r)) ?: + -cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?: bkey_cmp(l->pos, r->pos); } @@ -239,14 +244,11 @@ static inline int bkey_err(struct bkey_s_c k) #define for_each_btree_key(_trans, _iter, _btree_id, \ _start, _flags, _k, _ret) \ - for ((_ret) = PTR_ERR_OR_ZERO((_iter) = \ - bch2_trans_get_iter((_trans), (_btree_id), \ - (_start), (_flags))) ?: \ - PTR_ERR_OR_ZERO(((_k) = \ - __bch2_btree_iter_peek(_iter, _flags)).k); \ - !_ret && (_k).k; \ - (_ret) = PTR_ERR_OR_ZERO(((_k) = \ - __bch2_btree_iter_next(_iter, _flags)).k)) + for ((_iter) = bch2_trans_get_iter((_trans), (_btree_id), \ + (_start), (_flags)), \ + (_k) = __bch2_btree_iter_peek(_iter, _flags); \ + !((_ret) = bkey_err(_k)) && (_k).k; \ + (_k) = __bch2_btree_iter_next(_iter, _flags)) #define for_each_btree_key_continue(_iter, _flags, _k, _ret) \ for ((_k) = __bch2_btree_iter_peek(_iter, _flags); \ @@ -261,17 +263,18 @@ int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *); void bch2_trans_unlink_iters(struct btree_trans *); struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id, - struct bpos, unsigned); + struct bpos, unsigned, + unsigned, unsigned); static inline struct btree_iter * bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id, struct bpos pos, unsigned flags) { struct btree_iter *iter = - __bch2_trans_get_iter(trans, btree_id, pos, flags); - - if (!IS_ERR(iter)) - iter->ip_allocated = _THIS_IP_; + 
__bch2_trans_get_iter(trans, btree_id, pos, + (flags & BTREE_ITER_INTENT) != 0, 0, + flags); + iter->ip_allocated = _THIS_IP_; return iter; } @@ -283,17 +286,32 @@ bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src) struct btree_iter *iter = __bch2_trans_copy_iter(trans, src); - if (!IS_ERR(iter)) - iter->ip_allocated = _THIS_IP_; + iter->ip_allocated = _THIS_IP_; return iter; - } struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *, enum btree_id, struct bpos, unsigned, unsigned, unsigned); +static inline bool btree_iter_live(struct btree_trans *trans, struct btree_iter *iter) +{ + return (trans->iters_live & (1ULL << iter->idx)) != 0; +} + +static inline bool btree_iter_keep(struct btree_trans *trans, struct btree_iter *iter) +{ + return btree_iter_live(trans, iter) || + (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); +} + +static inline void set_btree_iter_dontneed(struct btree_trans *trans, struct btree_iter *iter) +{ + trans->iters_touched &= ~(1ULL << iter->idx); +} + #define TRANS_RESET_NOTRAVERSE (1 << 0) +#define TRANS_RESET_NOUNLOCK (1 << 1) void bch2_trans_reset(struct btree_trans *, unsigned); diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index 6166275..0d3c0a4 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -9,8 +9,11 @@ #include "journal.h" #include "journal_reclaim.h" +#include #include +static struct kmem_cache *bch2_key_cache; + static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, const void *obj) { @@ -18,7 +21,7 @@ static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, const struct bkey_cached_key *key = arg->key; return cmp_int(ck->key.btree_id, key->btree_id) ?: - bkey_cmp(ck->key.pos, key->pos); + bpos_cmp(ck->key.pos, key->pos); } static const struct rhashtable_params bch2_btree_key_cache_params = { @@ -29,8 +32,8 @@ static const struct rhashtable_params bch2_btree_key_cache_params = { }; __flatten -static inline struct bkey_cached * -btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) +inline struct bkey_cached * +bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) { struct bkey_cached_key key = { .btree_id = btree_id, @@ -66,12 +69,22 @@ static void bkey_cached_evict(struct btree_key_cache *c, BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash, bch2_btree_key_cache_params)); memset(&ck->key, ~0, sizeof(ck->key)); + + atomic_long_dec(&c->nr_keys); } -static void bkey_cached_free(struct btree_key_cache *c, +static void bkey_cached_free(struct btree_key_cache *bc, struct bkey_cached *ck) { - list_move(&ck->list, &c->freed); + struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); + + BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); + + ck->btree_trans_barrier_seq = + start_poll_synchronize_srcu(&c->btree_trans_barrier); + + list_move_tail(&ck->list, &bc->freed); + bc->nr_freed++; kfree(ck->k); ck->k = NULL; @@ -86,26 +99,50 @@ bkey_cached_alloc(struct btree_key_cache *c) { struct bkey_cached *ck; - list_for_each_entry(ck, &c->freed, list) - if (bkey_cached_lock_for_evict(ck)) - return ck; + ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO); + if (likely(ck)) { + INIT_LIST_HEAD(&ck->list); + six_lock_init(&ck->c.lock); + BUG_ON(!six_trylock_intent(&ck->c.lock)); + BUG_ON(!six_trylock_write(&ck->c.lock)); + return ck; + } + + return NULL; +} - list_for_each_entry(ck, &c->clean, list) +static struct bkey_cached * +bkey_cached_reuse(struct btree_key_cache 
*c) +{ + struct bucket_table *tbl; + struct rhash_head *pos; + struct bkey_cached *ck; + unsigned i; + + mutex_lock(&c->lock); + list_for_each_entry_reverse(ck, &c->freed, list) if (bkey_cached_lock_for_evict(ck)) { - bkey_cached_evict(c, ck); + c->nr_freed--; + list_del(&ck->list); + mutex_unlock(&c->lock); return ck; } + mutex_unlock(&c->lock); - ck = kzalloc(sizeof(*ck), GFP_NOFS); - if (!ck) - return NULL; - - INIT_LIST_HEAD(&ck->list); - six_lock_init(&ck->c.lock); - BUG_ON(!six_trylock_intent(&ck->c.lock)); - BUG_ON(!six_trylock_write(&ck->c.lock)); + rcu_read_lock(); + tbl = rht_dereference_rcu(c->table.tbl, &c->table); + for (i = 0; i < tbl->size; i++) + rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { + if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && + bkey_cached_lock_for_evict(ck)) { + bkey_cached_evict(c, ck); + rcu_read_unlock(); + return ck; + } + } + rcu_read_unlock(); - return ck; + return NULL; } static struct bkey_cached * @@ -114,28 +151,45 @@ btree_key_cache_create(struct btree_key_cache *c, struct bpos pos) { struct bkey_cached *ck; + bool was_new = true; ck = bkey_cached_alloc(c); - if (!ck) - return ERR_PTR(-ENOMEM); + + if (unlikely(!ck)) { + ck = bkey_cached_reuse(c); + if (unlikely(!ck)) + return ERR_PTR(-ENOMEM); + + was_new = false; + } ck->c.level = 0; ck->c.btree_id = btree_id; ck->key.btree_id = btree_id; ck->key.pos = pos; ck->valid = false; + ck->flags = 1U << BKEY_CACHED_ACCESSED; - BUG_ON(ck->flags); - - if (rhashtable_lookup_insert_fast(&c->table, + if (unlikely(rhashtable_lookup_insert_fast(&c->table, &ck->hash, - bch2_btree_key_cache_params)) { + bch2_btree_key_cache_params))) { /* We raced with another fill: */ - bkey_cached_free(c, ck); + + if (likely(was_new)) { + six_unlock_write(&ck->c.lock); + six_unlock_intent(&ck->c.lock); + kfree(ck); + } else { + mutex_lock(&c->lock); + bkey_cached_free(c, ck); + mutex_unlock(&c->lock); + } + return NULL; } - list_move(&ck->list, &c->clean); + atomic_long_inc(&c->nr_keys); + six_unlock_write(&ck->c.lock); return ck; @@ -153,28 +207,23 @@ static int btree_key_cache_fill(struct btree_trans *trans, iter = bch2_trans_get_iter(trans, ck->key.btree_id, ck->key.pos, BTREE_ITER_SLOTS); - if (IS_ERR(iter)) - return PTR_ERR(iter); - k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); - if (ret) { - bch2_trans_iter_put(trans, iter); - return ret; - } + if (ret) + goto err; if (!bch2_btree_node_relock(ck_iter, 0)) { - bch2_trans_iter_put(trans, iter); trace_transaction_restart_ip(trans->ip, _THIS_IP_); - return -EINTR; + ret = -EINTR; + goto err; } if (k.k->u64s > ck->u64s) { new_u64s = roundup_pow_of_two(k.k->u64s); new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS); if (!new_k) { - bch2_trans_iter_put(trans, iter); - return -ENOMEM; + ret = -ENOMEM; + goto err; } } @@ -190,9 +239,10 @@ static int btree_key_cache_fill(struct btree_trans *trans, bch2_btree_node_unlock_write(ck_iter->l[0].b, ck_iter); /* We're not likely to need this iterator again: */ - bch2_trans_iter_free(trans, iter); - - return 0; + set_btree_iter_dontneed(trans, iter); +err: + bch2_trans_iter_put(trans, iter); + return ret; } static int bkey_cached_check_fn(struct six_lock *lock, void *p) @@ -201,9 +251,10 @@ static int bkey_cached_check_fn(struct six_lock *lock, void *p) const struct btree_iter *iter = p; return ck->key.btree_id == iter->btree_id && - !bkey_cmp(ck->key.pos, iter->pos) ? 0 : -1; + !bpos_cmp(ck->key.pos, iter->pos) ? 
0 : -1; } +__flatten int bch2_btree_iter_traverse_cached(struct btree_iter *iter) { struct btree_trans *trans = iter->trans; @@ -218,18 +269,15 @@ int bch2_btree_iter_traverse_cached(struct btree_iter *iter) goto fill; } retry: - ck = btree_key_cache_find(c, iter->btree_id, iter->pos); + ck = bch2_btree_key_cache_find(c, iter->btree_id, iter->pos); if (!ck) { if (iter->flags & BTREE_ITER_CACHED_NOCREATE) { iter->l[0].b = NULL; return 0; } - mutex_lock(&c->btree_key_cache.lock); ck = btree_key_cache_create(&c->btree_key_cache, iter->btree_id, iter->pos); - mutex_unlock(&c->btree_key_cache.lock); - ret = PTR_ERR_OR_ZERO(ck); if (ret) goto err; @@ -242,9 +290,9 @@ retry: enum six_lock_type lock_want = __btree_lock_want(iter, 0); if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want, - bkey_cached_check_fn, iter)) { + bkey_cached_check_fn, iter, _THIS_IP_)) { if (ck->key.btree_id != iter->btree_id || - bkey_cmp(ck->key.pos, iter->pos)) { + bpos_cmp(ck->key.pos, iter->pos)) { goto retry; } @@ -254,7 +302,7 @@ retry: } if (ck->key.btree_id != iter->btree_id || - bkey_cmp(ck->key.pos, iter->pos)) { + bpos_cmp(ck->key.pos, iter->pos)) { six_unlock_type(&ck->c.lock, lock_want); goto retry; } @@ -279,8 +327,18 @@ fill: goto err; } + if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) + set_bit(BKEY_CACHED_ACCESSED, &ck->flags); + iter->uptodate = BTREE_ITER_NEED_PEEK; - bch2_btree_iter_downgrade(iter); + + if (!(iter->flags & BTREE_ITER_INTENT)) + bch2_btree_iter_downgrade(iter); + else if (!iter->locks_want) { + if (!__bch2_btree_iter_upgrade(iter, 1)) + ret = -EINTR; + } + return ret; err: if (ret != -EINTR) { @@ -294,29 +352,23 @@ err: static int btree_key_cache_flush_pos(struct btree_trans *trans, struct bkey_cached_key key, u64 journal_seq, + unsigned commit_flags, bool evict) { struct bch_fs *c = trans->c; struct journal *j = &c->journal; struct btree_iter *c_iter = NULL, *b_iter = NULL; - struct bkey_cached *ck; + struct bkey_cached *ck = NULL; int ret; b_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos, BTREE_ITER_SLOTS| BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(b_iter); - if (ret) - goto out; - c_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos, BTREE_ITER_CACHED| BTREE_ITER_CACHED_NOFILL| BTREE_ITER_CACHED_NOCREATE| BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(c_iter); - if (ret) - goto out; retry: ret = bch2_btree_iter_traverse(c_iter); if (ret) @@ -339,27 +391,33 @@ retry: BTREE_INSERT_NOUNLOCK| BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_USE_ALLOC_RESERVE| - BTREE_INSERT_JOURNAL_RESERVED| - BTREE_INSERT_JOURNAL_RECLAIM); + (ck->journal.seq == journal_last_seq(j) + ? 
BTREE_INSERT_JOURNAL_RESERVED + : 0)| + commit_flags); err: if (ret == -EINTR) goto retry; - BUG_ON(ret && !bch2_journal_error(j)); + if (ret == -EAGAIN) + goto out; - if (ret) + if (ret) { + bch2_fs_fatal_err_on(!bch2_journal_error(j), c, + "error flushing key cache: %i", ret); goto out; + } bch2_journal_pin_drop(j, &ck->journal); bch2_journal_preres_put(j, &ck->res); - clear_bit(BKEY_CACHED_DIRTY, &ck->flags); + + BUG_ON(!btree_node_locked(c_iter, 0)); if (!evict) { - mutex_lock(&c->btree_key_cache.lock); - list_move_tail(&ck->list, &c->btree_key_cache.clean); - mutex_unlock(&c->btree_key_cache.lock); + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + clear_bit(BKEY_CACHED_DIRTY, &ck->flags); + atomic_long_dec(&c->btree_key_cache.nr_dirty); + } } else { evict: BUG_ON(!btree_node_intent_locked(c_iter, 0)); @@ -369,8 +427,14 @@ evict: six_lock_write(&ck->c.lock, NULL, NULL); - mutex_lock(&c->btree_key_cache.lock); + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + clear_bit(BKEY_CACHED_DIRTY, &ck->flags); + atomic_long_dec(&c->btree_key_cache.nr_dirty); + } + bkey_cached_evict(&c->btree_key_cache, ck); + + mutex_lock(&c->btree_key_cache.lock); bkey_cached_free(&c->btree_key_cache, ck); mutex_unlock(&c->btree_key_cache.lock); } @@ -380,15 +444,17 @@ out: return ret; } -static void btree_key_cache_journal_flush(struct journal *j, - struct journal_entry_pin *pin, - u64 seq) +int bch2_btree_key_cache_journal_flush(struct journal *j, + struct journal_entry_pin *pin, u64 seq) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bkey_cached *ck = container_of(pin, struct bkey_cached, journal); struct bkey_cached_key key; struct btree_trans trans; + int ret = 0; + + int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); six_lock_read(&ck->c.lock, NULL, NULL); key = ck->key; @@ -396,13 +462,18 @@ static void btree_key_cache_journal_flush(struct journal *j, if (ck->journal.seq != seq || !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { six_unlock_read(&ck->c.lock); - return; + goto unlock; } six_unlock_read(&ck->c.lock); bch2_trans_init(&trans, c, 0, 0); - btree_key_cache_flush_pos(&trans, key, seq, false); + ret = btree_key_cache_flush_pos(&trans, key, seq, + BTREE_INSERT_JOURNAL_RECLAIM, false); bch2_trans_exit(&trans); +unlock: + srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); + + return ret; } /* @@ -415,10 +486,10 @@ int bch2_btree_key_cache_flush(struct btree_trans *trans, struct bkey_cached_key key = { id, pos }; /* Fastpath - assume it won't be found: */ - if (!btree_key_cache_find(c, id, pos)) + if (!bch2_btree_key_cache_find(c, id, pos)) return 0; - return btree_key_cache_flush_pos(trans, key, 0, true); + return btree_key_cache_flush_pos(trans, key, 0, 0, true); } bool bch2_btree_insert_key_cached(struct btree_trans *trans, @@ -427,6 +498,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bkey_cached *ck = (void *) iter->l[0].b; + bool kick_reclaim = false; BUG_ON(insert->u64s > ck->u64s); @@ -446,15 +518,18 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, ck->valid = true; if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - mutex_lock(&c->btree_key_cache.lock); - list_del_init(&ck->list); - set_bit(BKEY_CACHED_DIRTY, &ck->flags); - mutex_unlock(&c->btree_key_cache.lock); + atomic_long_inc(&c->btree_key_cache.nr_dirty); + + if (bch2_nr_btree_keys_need_flush(c)) + kick_reclaim = true; } bch2_journal_pin_update(&c->journal, trans->journal_res.seq, - &ck->journal, btree_key_cache_journal_flush); + &ck->journal, 
bch2_btree_key_cache_journal_flush); + + if (kick_reclaim) + journal_reclaim_kick(&c->journal); return true; } @@ -462,58 +537,192 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, void bch2_btree_key_cache_verify_clean(struct btree_trans *trans, enum btree_id id, struct bpos pos) { - BUG_ON(btree_key_cache_find(trans->c, id, pos)); + BUG_ON(bch2_btree_key_cache_find(trans->c, id, pos)); } #endif -void bch2_fs_btree_key_cache_exit(struct btree_key_cache *c) +static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct bch_fs *c = container_of(shrink, struct bch_fs, + btree_key_cache.shrink); + struct btree_key_cache *bc = &c->btree_key_cache; + struct bucket_table *tbl; + struct bkey_cached *ck, *t; + size_t scanned = 0, freed = 0, nr = sc->nr_to_scan; + unsigned start, flags; + int srcu_idx; + + /* Return -1 if we can't do anything right now */ + if (sc->gfp_mask & __GFP_FS) + mutex_lock(&bc->lock); + else if (!mutex_trylock(&bc->lock)) + return -1; + + srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + flags = memalloc_nofs_save(); + + /* + * Newest freed entries are at the end of the list - once we hit one + * that's too new to be freed, we can bail out: + */ + list_for_each_entry_safe(ck, t, &bc->freed, list) { + if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, + ck->btree_trans_barrier_seq)) + break; + + list_del(&ck->list); + kmem_cache_free(bch2_key_cache, ck); + bc->nr_freed--; + scanned++; + freed++; + } + + if (scanned >= nr) + goto out; + + rcu_read_lock(); + tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); + if (bc->shrink_iter >= tbl->size) + bc->shrink_iter = 0; + start = bc->shrink_iter; + + do { + struct rhash_head *pos, *next; + + pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter)); + + while (!rht_is_a_nulls(pos)) { + next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter); + ck = container_of(pos, struct bkey_cached, hash); + + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) + goto next; + + if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) + clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); + else if (bkey_cached_lock_for_evict(ck)) { + bkey_cached_evict(bc, ck); + bkey_cached_free(bc, ck); + } + + scanned++; + if (scanned >= nr) + break; +next: + pos = next; + } + + bc->shrink_iter++; + if (bc->shrink_iter >= tbl->size) + bc->shrink_iter = 0; + } while (scanned < nr && bc->shrink_iter != start); + + rcu_read_unlock(); +out: + memalloc_nofs_restore(flags); + srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); + mutex_unlock(&bc->lock); + + return freed; +} + +static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink, + struct shrink_control *sc) { + struct bch_fs *c = container_of(shrink, struct bch_fs, + btree_key_cache.shrink); + struct btree_key_cache *bc = &c->btree_key_cache; + + return atomic_long_read(&bc->nr_keys); +} + +void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) +{ + struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); + struct bucket_table *tbl; struct bkey_cached *ck, *n; + struct rhash_head *pos; + unsigned i; - mutex_lock(&c->lock); - list_for_each_entry_safe(ck, n, &c->clean, list) { + if (bc->shrink.list.next) + unregister_shrinker(&bc->shrink); + + mutex_lock(&bc->lock); + + rcu_read_lock(); + tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); + for (i = 0; i < tbl->size; i++) + rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { + bkey_cached_evict(bc, ck); + list_add(&ck->list, &bc->freed); + } + rcu_read_unlock(); + + 
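
The freed list above is reclaimed lazily: bkey_cached_free() stamps each entry with an SRCU grace-period cookie from btree_trans_barrier, and the shrinker only returns an entry to the allocator once poll_state_synchronize_srcu() reports that grace period complete, so lockless lookups running under srcu_read_lock() can never touch freed memory. A reduced sketch of the pattern, with struct cached_obj, obj_defer_free() and obj_reap() as hypothetical stand-ins for the bkey_cached machinery:

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/srcu.h>

struct cached_obj {
	struct list_head	list;
	unsigned long		barrier_seq;	/* cookie from start_poll_synchronize_srcu() */
};

/* Producer side: queue the object instead of freeing it immediately. */
static void obj_defer_free(struct srcu_struct *barrier,
			   struct list_head *freed, struct cached_obj *obj)
{
	obj->barrier_seq = start_poll_synchronize_srcu(barrier);
	list_add_tail(&obj->list, freed);	/* newest entries go at the tail */
}

/* Reclaim side: free entries whose grace period has elapsed, oldest first. */
static void obj_reap(struct srcu_struct *barrier, struct list_head *freed)
{
	struct cached_obj *obj, *t;

	list_for_each_entry_safe(obj, t, freed, list) {
		if (!poll_state_synchronize_srcu(barrier, obj->barrier_seq))
			break;			/* everything after this is newer */
		list_del(&obj->list);
		kfree(obj);
	}
}

The shrinker above follows the same early-break logic, which is why bkey_cached_free() switches to list_move_tail(): keeping the freed list ordered oldest-to-newest lets the scan stop at the first entry whose grace period is still pending.
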
list_for_each_entry_safe(ck, n, &bc->freed, list) { + cond_resched(); + + bch2_journal_pin_drop(&c->journal, &ck->journal); + bch2_journal_preres_put(&c->journal, &ck->res); + + list_del(&ck->list); kfree(ck->k); - kfree(ck); + kmem_cache_free(bch2_key_cache, ck); } - list_for_each_entry_safe(ck, n, &c->freed, list) - kfree(ck); - mutex_unlock(&c->lock); - rhashtable_destroy(&c->table); + BUG_ON(atomic_long_read(&bc->nr_dirty) && !bch2_journal_error(&c->journal)); + BUG_ON(atomic_long_read(&bc->nr_keys)); + + mutex_unlock(&bc->lock); + + if (bc->table_init_done) + rhashtable_destroy(&bc->table); } void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) { mutex_init(&c->lock); INIT_LIST_HEAD(&c->freed); - INIT_LIST_HEAD(&c->clean); } int bch2_fs_btree_key_cache_init(struct btree_key_cache *c) { - return rhashtable_init(&c->table, &bch2_btree_key_cache_params); + int ret; + + c->shrink.seeks = 1; + c->shrink.count_objects = bch2_btree_key_cache_count; + c->shrink.scan_objects = bch2_btree_key_cache_scan; + + ret = register_shrinker(&c->shrink); + if (ret) + return ret; + + ret = rhashtable_init(&c->table, &bch2_btree_key_cache_params); + if (ret) + return ret; + + c->table_init_done = true; + return 0; } void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) { - struct bucket_table *tbl; - struct bkey_cached *ck; - struct rhash_head *pos; - size_t i; + pr_buf(out, "nr_freed:\t%zu\n", c->nr_freed); + pr_buf(out, "nr_keys:\t%zu\n", atomic_long_read(&c->nr_keys)); + pr_buf(out, "nr_dirty:\t%zu\n", atomic_long_read(&c->nr_dirty)); +} - mutex_lock(&c->lock); - tbl = rht_dereference_rcu(c->table.tbl, &c->table); +void bch2_btree_key_cache_exit(void) +{ + if (bch2_key_cache) + kmem_cache_destroy(bch2_key_cache); +} - for (i = 0; i < tbl->size; i++) { - rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { - pr_buf(out, "%s:", - bch2_btree_ids[ck->key.btree_id]); - bch2_bpos_to_text(out, ck->key.pos); +int __init bch2_btree_key_cache_init(void) +{ + bch2_key_cache = KMEM_CACHE(bkey_cached, 0); + if (!bch2_key_cache) + return -ENOMEM; - if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) - pr_buf(out, " journal seq %llu", ck->journal.seq); - pr_buf(out, "\n"); - } - } - mutex_unlock(&c->lock); + return 0; } diff --git a/libbcachefs/btree_key_cache.h b/libbcachefs/btree_key_cache.h index b1756c6..7e2b0a0 100644 --- a/libbcachefs/btree_key_cache.h +++ b/libbcachefs/btree_key_cache.h @@ -1,6 +1,31 @@ #ifndef _BCACHEFS_BTREE_KEY_CACHE_H #define _BCACHEFS_BTREE_KEY_CACHE_H +static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) +{ + size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); + size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); + size_t max_dirty = 1024 + nr_keys / 2; + + return max_t(ssize_t, 0, nr_dirty - max_dirty); +} + +static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) +{ + size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); + size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); + size_t max_dirty = 4096 + (nr_keys * 3) / 4; + + return nr_dirty > max_dirty && + test_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags); +} + +int bch2_btree_key_cache_journal_flush(struct journal *, + struct journal_entry_pin *, u64); + +struct bkey_cached * +bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos); + int bch2_btree_iter_traverse_cached(struct btree_iter *); bool bch2_btree_insert_key_cached(struct btree_trans *, @@ -22,4 +47,7 @@ int bch2_fs_btree_key_cache_init(struct 
btree_key_cache *); void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *); +void bch2_btree_key_cache_exit(void); +int __init bch2_btree_key_cache_init(void); + #endif /* _BCACHEFS_BTREE_KEY_CACHE_H */ diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index 81fbf3e..7532bcd 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -95,7 +95,7 @@ btree_lock_want(struct btree_iter *iter, int level) return BTREE_NODE_UNLOCKED; } -static inline void __btree_node_unlock(struct btree_iter *iter, unsigned level) +static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) { int lock_type = btree_node_locked_type(iter, level); @@ -106,13 +106,6 @@ static inline void __btree_node_unlock(struct btree_iter *iter, unsigned level) mark_btree_node_unlocked(iter, level); } -static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) -{ - EBUG_ON(!level && iter->trans->nounlock); - - __btree_node_unlock(iter, level); -} - static inline void __bch2_btree_iter_unlock(struct btree_iter *iter) { btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); @@ -176,36 +169,25 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans, bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned, struct btree_iter *, enum six_lock_type, - six_lock_should_sleep_fn, void *); + six_lock_should_sleep_fn, void *, + unsigned long); static inline bool btree_node_lock(struct btree *b, struct bpos pos, unsigned level, struct btree_iter *iter, enum six_lock_type type, - six_lock_should_sleep_fn should_sleep_fn, void *p) + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) { struct btree_trans *trans = iter->trans; - bool ret; EBUG_ON(level >= BTREE_MAX_DEPTH); EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); -#ifdef CONFIG_BCACHEFS_DEBUG - trans->locking = b; - trans->locking_iter_idx = iter->idx; - trans->locking_pos = pos; - trans->locking_btree_id = iter->btree_id; - trans->locking_level = level; -#endif - ret = likely(six_trylock_type(&b->c.lock, type)) || + return likely(six_trylock_type(&b->c.lock, type)) || btree_node_lock_increment(trans, b, level, type) || __bch2_btree_node_lock(b, pos, level, iter, type, - should_sleep_fn, p); - -#ifdef CONFIG_BCACHEFS_DEBUG - trans->locking = NULL; -#endif - return ret; + should_sleep_fn, p, ip); } bool __bch2_btree_node_relock(struct btree_iter *, unsigned); diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index cc01bae..39e93da 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -47,8 +47,6 @@ struct bset_tree { u16 data_offset; u16 aux_data_offset; u16 end_offset; - - struct bpos max_key; }; struct btree_write { @@ -57,7 +55,7 @@ struct btree_write { struct btree_alloc { struct open_buckets ob; - BKEY_PADDED(k); + __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); }; struct btree_bkey_cached_common { @@ -76,6 +74,7 @@ struct btree { u16 written; u8 nsets; u8 nr_key_bits; + u16 version_ondisk; struct bkey_format format; @@ -97,6 +96,11 @@ struct btree { u8 byte_order; u8 unpack_fn_len; + struct btree_write writes[2]; + + /* Key/pointer for this btree node */ + __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); + /* * XXX: add a delete sequence number, so when bch2_btree_node_relock() * fails because the lock sequence number has changed - i.e. 
the @@ -127,15 +131,6 @@ struct btree { /* lru list */ struct list_head list; - - struct btree_write writes[2]; - -#ifdef CONFIG_BCACHEFS_DEBUG - bool *expensive_debug_checks; -#endif - - /* Key/pointer for this btree node */ - __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); }; struct btree_cache { @@ -162,6 +157,7 @@ struct btree_cache { /* Number of elements in live + freeable lists */ unsigned used; unsigned reserve; + atomic_t dirty; struct shrinker shrink; /* @@ -217,13 +213,8 @@ enum btree_iter_type { #define BTREE_ITER_SET_POS_AFTER_COMMIT (1 << 8) #define BTREE_ITER_CACHED_NOFILL (1 << 9) #define BTREE_ITER_CACHED_NOCREATE (1 << 10) - -#define BTREE_ITER_USER_FLAGS \ - (BTREE_ITER_SLOTS \ - |BTREE_ITER_INTENT \ - |BTREE_ITER_PREFETCH \ - |BTREE_ITER_CACHED_NOFILL \ - |BTREE_ITER_CACHED_NOCREATE) +#define BTREE_ITER_NOT_EXTENTS (1 << 11) +#define BTREE_ITER_ALL_SNAPSHOTS (1 << 12) enum btree_iter_uptodate { BTREE_ITER_UPTODATE = 0, @@ -250,7 +241,11 @@ enum btree_iter_uptodate { struct btree_iter { struct btree_trans *trans; struct bpos pos; + /* what we're searching for/what the iterator actually points to: */ + struct bpos real_pos; struct bpos pos_after_commit; + /* When we're filtering by snapshot, the snapshot ID we're looking for: */ + unsigned snapshot; u16 flags; u8 idx; @@ -283,6 +278,11 @@ btree_iter_type(const struct btree_iter *iter) return iter->flags & BTREE_ITER_TYPE; } +static inline bool btree_iter_is_cached(const struct btree_iter *iter) +{ + return btree_iter_type(iter) == BTREE_ITER_CACHED; +} + static inline struct btree_iter_level *iter_l(struct btree_iter *iter) { return iter->l + iter->level; @@ -291,8 +291,14 @@ static inline struct btree_iter_level *iter_l(struct btree_iter *iter) struct btree_key_cache { struct mutex lock; struct rhashtable table; + bool table_init_done; struct list_head freed; - struct list_head clean; + struct shrinker shrink; + unsigned shrink_iter; + + size_t nr_freed; + atomic_long_t nr_keys; + atomic_long_t nr_dirty; }; struct bkey_cached_key { @@ -300,7 +306,8 @@ struct bkey_cached_key { struct bpos pos; } __attribute__((packed, aligned(4))); -#define BKEY_CACHED_DIRTY 0 +#define BKEY_CACHED_ACCESSED 0 +#define BKEY_CACHED_DIRTY 1 struct bkey_cached { struct btree_bkey_cached_common c; @@ -308,6 +315,7 @@ struct bkey_cached { unsigned long flags; u8 u64s; bool valid; + u32 btree_trans_barrier_seq; struct bkey_cached_key key; struct rhash_head hash; @@ -321,7 +329,11 @@ struct bkey_cached { struct btree_insert_entry { unsigned trigger_flags; + u8 bkey_type; + enum btree_id btree_id:8; + u8 level; unsigned trans_triggers_run:1; + unsigned is_extent:1; struct bkey_i *k; struct btree_iter *iter; }; @@ -332,6 +344,14 @@ struct btree_insert_entry { #define BTREE_ITER_MAX 32 #endif +struct btree_trans_commit_hook; +typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *); + +struct btree_trans_commit_hook { + btree_trans_commit_hook_fn *fn; + struct btree_trans_commit_hook *next; +}; + struct btree_trans { struct bch_fs *c; #ifdef CONFIG_BCACHEFS_DEBUG @@ -344,21 +364,18 @@ struct btree_trans { pid_t pid; #endif unsigned long ip; + int srcu_idx; - u64 iters_linked; - u64 iters_live; - u64 iters_touched; - - u8 nr_iters; u8 nr_updates; u8 nr_updates2; - u8 size; unsigned used_mempool:1; unsigned error:1; - unsigned nounlock:1; - unsigned need_reset:1; unsigned in_traverse_all:1; + u64 iters_linked; + u64 iters_live; + u64 iters_touched; + unsigned mem_top; unsigned mem_bytes; void *mem; @@ -368,6 
+385,7 @@ struct btree_trans { struct btree_insert_entry *updates2; /* update path: */ + struct btree_trans_commit_hook *hooks; struct jset_entry *extra_journal_entries; unsigned extra_journal_entry_u64s; struct journal_entry_pin *journal_pin; @@ -380,10 +398,6 @@ struct btree_trans { unsigned journal_u64s; unsigned journal_preres_u64s; struct replicas_delta_list *fs_usage_deltas; - - struct btree_iter iters_onstack[2]; - struct btree_insert_entry updates_onstack[2]; - struct btree_insert_entry updates2_onstack[2]; }; #define BTREE_FLAG(flag) \ @@ -408,13 +422,12 @@ enum btree_flags { BTREE_NODE_just_written, BTREE_NODE_dying, BTREE_NODE_fake, - BTREE_NODE_old_extent_overwrite, BTREE_NODE_need_rewrite, + BTREE_NODE_never_write, }; BTREE_FLAG(read_in_flight); BTREE_FLAG(read_error); -BTREE_FLAG(dirty); BTREE_FLAG(need_write); BTREE_FLAG(noevict); BTREE_FLAG(write_idx); @@ -423,8 +436,8 @@ BTREE_FLAG(write_in_flight); BTREE_FLAG(just_written); BTREE_FLAG(dying); BTREE_FLAG(fake); -BTREE_FLAG(old_extent_overwrite); BTREE_FLAG(need_rewrite); +BTREE_FLAG(never_write); static inline struct btree_write *btree_current_write(struct btree *b) { @@ -536,16 +549,16 @@ static inline unsigned bset_byte_offset(struct btree *b, void *i) } enum btree_node_type { -#define x(kwd, val, name) BKEY_TYPE_##kwd = val, +#define x(kwd, val) BKEY_TYPE_##kwd = val, BCH_BTREE_IDS() #undef x - BKEY_TYPE_BTREE, + BKEY_TYPE_btree, }; /* Type of a key in btree @id at level @level: */ static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id) { - return level ? BKEY_TYPE_BTREE : (enum btree_node_type) id; + return level ? BKEY_TYPE_btree : (enum btree_node_type) id; } /* Type of keys @b contains: */ @@ -557,8 +570,8 @@ static inline enum btree_node_type btree_node_type(struct btree *b) static inline bool btree_node_type_is_extents(enum btree_node_type type) { switch (type) { - case BKEY_TYPE_EXTENTS: - case BKEY_TYPE_REFLINK: + case BKEY_TYPE_extents: + case BKEY_TYPE_reflink: return true; default: return false; @@ -580,19 +593,31 @@ static inline bool btree_iter_is_extents(struct btree_iter *iter) return btree_node_type_is_extents(btree_iter_key_type(iter)); } +#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ + ((1U << BKEY_TYPE_extents)| \ + (1U << BKEY_TYPE_inodes)| \ + (1U << BKEY_TYPE_stripes)| \ + (1U << BKEY_TYPE_reflink)| \ + (1U << BKEY_TYPE_btree)) + +#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \ + ((1U << BKEY_TYPE_alloc)| \ + (1U << BKEY_TYPE_stripes)) + #define BTREE_NODE_TYPE_HAS_TRIGGERS \ - ((1U << BKEY_TYPE_EXTENTS)| \ - (1U << BKEY_TYPE_ALLOC)| \ - (1U << BKEY_TYPE_INODES)| \ - (1U << BKEY_TYPE_REFLINK)| \ - (1U << BKEY_TYPE_EC)| \ - (1U << BKEY_TYPE_BTREE)) + (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ + BTREE_NODE_TYPE_HAS_MEM_TRIGGERS) -#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ - ((1U << BKEY_TYPE_EXTENTS)| \ - (1U << BKEY_TYPE_INODES)| \ - (1U << BKEY_TYPE_EC)| \ - (1U << BKEY_TYPE_REFLINK)) +#define BTREE_ID_HAS_SNAPSHOTS \ + ((1U << BTREE_ID_extents)| \ + (1U << BTREE_ID_inodes)| \ + (1U << BTREE_ID_dirents)| \ + (1U << BTREE_ID_xattrs)) + +static inline bool btree_type_has_snapshots(enum btree_id id) +{ + return (1 << id) & BTREE_ID_HAS_SNAPSHOTS; +} enum btree_trigger_flags { __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ @@ -643,6 +668,7 @@ enum btree_insert_ret { BTREE_INSERT_ENOSPC, BTREE_INSERT_NEED_MARK_REPLICAS, BTREE_INSERT_NEED_JOURNAL_RES, + BTREE_INSERT_NEED_JOURNAL_RECLAIM, }; enum btree_gc_coalesce_fail_reason { diff --git a/libbcachefs/btree_update.h 
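The per-btree properties introduced above (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS, BTREE_NODE_TYPE_HAS_MEM_TRIGGERS, BTREE_ID_HAS_SNAPSHOTS) are plain bitmasks indexed by the enum value, so a membership test such as btree_type_has_snapshots() is a single shift-and-AND; the commit path later in this patch uses the same test to decide which triggers to run for an update. A tiny self-contained illustration of the pattern follows, with a made-up enum rather than the real btree IDs.

#include <stdbool.h>
#include <stdio.h>

enum demo_btree_id {
	DEMO_ID_extents,
	DEMO_ID_inodes,
	DEMO_ID_dirents,
	DEMO_ID_alloc,
	DEMO_ID_NR,
};

#define DEMO_ID_HAS_SNAPSHOTS			\
	((1U << DEMO_ID_extents)|		\
	 (1U << DEMO_ID_inodes)|		\
	 (1U << DEMO_ID_dirents))

static inline bool demo_type_has_snapshots(enum demo_btree_id id)
{
	return (1U << id) & DEMO_ID_HAS_SNAPSHOTS;
}

int main(void)
{
	unsigned id;

	for (id = 0; id < DEMO_ID_NR; id++)
		printf("btree %u: snapshots=%d\n", id,
		       demo_type_has_snapshots(id));
	return 0;
}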
b/libbcachefs/btree_update.h index e0b1bde..4ce12ae 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -20,7 +20,6 @@ enum btree_insert_flags { __BTREE_INSERT_NOCHECK_RW, __BTREE_INSERT_LAZY_RW, __BTREE_INSERT_USE_RESERVE, - __BTREE_INSERT_USE_ALLOC_RESERVE, __BTREE_INSERT_JOURNAL_REPLAY, __BTREE_INSERT_JOURNAL_RESERVED, __BTREE_INSERT_JOURNAL_RECLAIM, @@ -43,7 +42,6 @@ enum btree_insert_flags { /* for copygc, or when merging btree nodes */ #define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE) -#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE) /* Insert is for journal replay - don't get journal reservations: */ #define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY) @@ -67,8 +65,8 @@ int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *); int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct disk_reservation *, u64 *, int flags); -int bch2_btree_delete_at_range(struct btree_trans *, struct btree_iter *, - struct bpos, u64 *); +int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, + struct bpos, struct bpos, u64 *); int bch2_btree_delete_range(struct bch_fs *, enum btree_id, struct bpos, struct bpos, u64 *); @@ -79,6 +77,8 @@ int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, int bch2_trans_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, enum btree_trigger_flags); +void bch2_trans_commit_hook(struct btree_trans *, + struct btree_trans_commit_hook *); int __bch2_trans_commit(struct btree_trans *); /** diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index a2604b0..0014470 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -11,6 +11,7 @@ #include "btree_iter.h" #include "btree_locking.h" #include "buckets.h" +#include "error.h" #include "extents.h" #include "journal.h" #include "journal_reclaim.h" @@ -34,6 +35,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) struct bkey_s_c k; struct bkey_s_c_btree_ptr_v2 bp; struct bkey unpacked; + char buf1[100], buf2[100]; BUG_ON(!b->c.level); @@ -48,16 +50,26 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) break; bp = bkey_s_c_to_btree_ptr_v2(k); - BUG_ON(bkey_cmp(next_node, bp.v->min_key)); + if (bpos_cmp(next_node, bp.v->min_key)) { + bch2_dump_btree_node(c, b); + panic("expected next min_key %s got %s\n", + (bch2_bpos_to_text(&PBUF(buf1), next_node), buf1), + (bch2_bpos_to_text(&PBUF(buf2), bp.v->min_key), buf2)); + } bch2_btree_node_iter_advance(&iter, b); if (bch2_btree_node_iter_end(&iter)) { - BUG_ON(bkey_cmp(k.k->p, b->key.k.p)); + if (bpos_cmp(k.k->p, b->key.k.p)) { + bch2_dump_btree_node(c, b); + panic("expected end %s got %s\n", + (bch2_bpos_to_text(&PBUF(buf1), b->key.k.p), buf1), + (bch2_bpos_to_text(&PBUF(buf2), k.k->p), buf2)); + } break; } - next_node = bkey_successor(k.k->p); + next_node = bpos_successor(k.k->p); } #endif } @@ -70,11 +82,9 @@ void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b) struct bset_tree *t; struct bkey uk; - bch2_bkey_format_add_pos(s, b->data->min_key); - for_each_bset(b, t) bset_tree_for_each_key(b, t, k) - if (!bkey_whiteout(k)) { + if (!bkey_deleted(k)) { uk = bkey_unpack_key(b, k); bch2_bkey_format_add_key(s, &uk); } @@ -85,6 +95,8 @@ static struct bkey_format bch2_btree_calc_format(struct btree *b) struct bkey_format_state s; bch2_bkey_format_init(&s); + 
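bch2_trans_commit_hook(), declared in the btree_update.h hunk above, pairs with struct btree_trans_commit_hook from btree_types.h: a caller attaches a callback that runs inside the locked commit path and can veto the transaction by returning nonzero, and the hooks are chained through ->next and walked in bch2_trans_commit_write_locked() further down in this patch. Below is a sketch of how such a hook chain is typically built and consumed; the demo_* names are simplified stand-ins, not the bcachefs types.

#include <stddef.h>
#include <stdio.h>

struct demo_trans;
struct demo_hook;
typedef int (demo_hook_fn)(struct demo_trans *, struct demo_hook *);

struct demo_hook {
	demo_hook_fn	 *fn;
	struct demo_hook *next;
};

struct demo_trans {
	struct demo_hook *hooks;
};

/* Registration just pushes onto a singly linked list on the transaction: */
static void demo_commit_hook(struct demo_trans *trans, struct demo_hook *h)
{
	h->next = trans->hooks;
	trans->hooks = h;
}

/* At commit time every hook runs, and any of them may abort the commit: */
static int demo_run_hooks(struct demo_trans *trans)
{
	struct demo_hook *h;
	int ret;

	for (h = trans->hooks; h; h = h->next) {
		ret = h->fn(trans, h);
		if (ret)
			return ret;
	}
	return 0;
}

/* A caller embeds the hook in its own state and recovers it via offsetof: */
struct demo_counter_hook {
	struct demo_hook hook;
	int		 count;
};

static int demo_counter_fn(struct demo_trans *trans, struct demo_hook *h)
{
	struct demo_counter_hook *c =
		(void *) ((char *) h - offsetof(struct demo_counter_hook, hook));

	c->count++;
	return 0;	/* nonzero would abort the commit */
}

int main(void)
{
	struct demo_trans trans = { NULL };
	struct demo_counter_hook c = { .hook.fn = demo_counter_fn };

	demo_commit_hook(&trans, &c.hook);
	printf("ret=%d count=%d\n", demo_run_hooks(&trans), c.count);
	return 0;
}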
bch2_bkey_format_add_pos(&s, b->data->min_key); + bch2_bkey_format_add_pos(&s, b->data->max_key); __bch2_btree_calc_format(&s, b); return bch2_bkey_format_done(&s); @@ -149,7 +161,7 @@ void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b) b->ob.nr = 0; - clear_btree_node_dirty(b); + clear_btree_node_dirty(c, b); btree_node_lock_type(c, b, SIX_LOCK_write); __btree_node_free(c, b); @@ -179,21 +191,18 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, { struct write_point *wp; struct btree *b; - BKEY_PADDED(k) tmp; + __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; struct open_buckets ob = { .nr = 0 }; struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; unsigned nr_reserve; enum alloc_reserve alloc_reserve; - if (flags & BTREE_INSERT_USE_ALLOC_RESERVE) { + if (flags & BTREE_INSERT_USE_RESERVE) { nr_reserve = 0; - alloc_reserve = RESERVE_ALLOC; - } else if (flags & BTREE_INSERT_USE_RESERVE) { - nr_reserve = BTREE_NODE_RESERVE / 2; - alloc_reserve = RESERVE_BTREE; + alloc_reserve = RESERVE_BTREE_MOVINGGC; } else { nr_reserve = BTREE_NODE_RESERVE; - alloc_reserve = RESERVE_NONE; + alloc_reserve = RESERVE_BTREE; } mutex_lock(&c->btree_reserve_cache_lock); @@ -209,7 +218,10 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, mutex_unlock(&c->btree_reserve_cache_lock); retry: - wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, 0, + wp = bch2_alloc_sectors_start(c, + c->opts.metadata_target ?: + c->opts.foreground_target, + 0, writepoint_ptr(&c->btree_write_point), &devs_have, res->nr_replicas, @@ -264,19 +276,19 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev b = as->prealloc_nodes[--as->nr_prealloc_nodes]; set_btree_node_accessed(b); - set_btree_node_dirty(b); + set_btree_node_dirty(c, b); set_btree_node_need_write(b); bch2_bset_init_first(b, &b->data->keys); b->c.level = level; b->c.btree_id = as->btree_id; + b->version_ondisk = c->sb.version; memset(&b->nr, 0, sizeof(b->nr)); b->data->magic = cpu_to_le64(bset_magic(c)); b->data->flags = 0; SET_BTREE_NODE_ID(b->data, as->btree_id); SET_BTREE_NODE_LEVEL(b->data, level); - b->data->ptr = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)).start->ptr; if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key); @@ -284,17 +296,9 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev bp->v.mem_ptr = 0; bp->v.seq = b->data->keys.seq; bp->v.sectors_written = 0; - bp->v.sectors = cpu_to_le16(c->opts.btree_node_size); } - if (c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite)) - SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true); - - if (btree_node_is_extents(b) && - !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) { - set_btree_node_old_extent_overwrite(b); - set_btree_node_need_rewrite(b); - } + SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true); bch2_btree_build_aux_trees(b); @@ -433,10 +437,6 @@ static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes, goto err_free; } - ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key)); - if (ret) - goto err_free; - as->prealloc_nodes[as->nr_prealloc_nodes++] = b; } @@ -454,6 +454,10 @@ static void bch2_btree_update_free(struct btree_update *as) { struct bch_fs *c = as->c; + if (as->took_gc_lock) + up_read(&c->gc_lock); + as->took_gc_lock = false; + bch2_journal_preres_put(&c->journal, &as->journal_preres); bch2_journal_pin_drop(&c->journal, &as->journal); @@ -503,14 +507,18 @@ static int 
btree_update_nodes_written_trans(struct btree_trans *trans, trans->journal_pin = &as->journal; for_each_keylist_key(&as->new_keys, k) { - ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k), + ret = bch2_trans_mark_key(trans, + bkey_s_c_null, + bkey_i_to_s_c(k), 0, 0, BTREE_TRIGGER_INSERT); if (ret) return ret; } for_each_keylist_key(&as->old_keys, k) { - ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k), + ret = bch2_trans_mark_key(trans, + bkey_i_to_s_c(k), + bkey_s_c_null, 0, 0, BTREE_TRIGGER_OVERWRITE); if (ret) return ret; @@ -523,10 +531,24 @@ static void btree_update_nodes_written(struct btree_update *as) { struct bch_fs *c = as->c; struct btree *b = as->b; + struct btree_trans trans; u64 journal_seq = 0; unsigned i; int ret; + /* + * If we're already in an error state, it might be because a btree node + * was never written, and we might be trying to free that same btree + * node here, but it won't have been marked as allocated and we'll see + * spurious disk usage inconsistencies in the transactional part below + * if we don't skip it: + */ + ret = bch2_journal_error(&c->journal); + if (ret) + goto err; + + BUG_ON(!journal_pin_active(&as->journal)); + /* * We did an update to a parent node where the pointers we added pointed * to child nodes that weren't written yet: now, the child nodes have @@ -540,16 +562,18 @@ static void btree_update_nodes_written(struct btree_update *as) * journal reclaim does btree updates when flushing bkey_cached entries, * which may require allocations as well. */ - ret = bch2_trans_do(c, &as->disk_res, &journal_seq, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_USE_ALLOC_RESERVE| - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_JOURNAL_RECLAIM| - BTREE_INSERT_JOURNAL_RESERVED, - btree_update_nodes_written_trans(&trans, as)); - BUG_ON(ret && !bch2_journal_error(&c->journal)); - + bch2_trans_init(&trans, c, 0, 512); + ret = __bch2_trans_do(&trans, &as->disk_res, &journal_seq, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_JOURNAL_RECLAIM| + BTREE_INSERT_JOURNAL_RESERVED, + btree_update_nodes_written_trans(&trans, as)); + bch2_trans_exit(&trans); + + bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c, + "error %i in btree_update_nodes_written()", ret); +err: if (b) { /* * @b is the node we did the final insert into: @@ -569,17 +593,30 @@ static void btree_update_nodes_written(struct btree_update *as) list_del(&as->write_blocked_list); - if (!ret && as->b == b) { + /* + * Node might have been freed, recheck under + * btree_interior_update_lock: + */ + if (as->b == b) { struct bset *i = btree_bset_last(b); BUG_ON(!b->c.level); BUG_ON(!btree_node_dirty(b)); - i->journal_seq = cpu_to_le64( - max(journal_seq, - le64_to_cpu(i->journal_seq))); - - bch2_btree_add_journal_pin(c, b, journal_seq); + if (!ret) { + i->journal_seq = cpu_to_le64( + max(journal_seq, + le64_to_cpu(i->journal_seq))); + + bch2_btree_add_journal_pin(c, b, journal_seq); + } else { + /* + * If we didn't get a journal sequence number we + * can't write this btree node, because recovery + * won't know to ignore this write: + */ + set_btree_node_never_write(b); + } } mutex_unlock(&c->btree_interior_update_lock); @@ -680,17 +717,7 @@ static void btree_update_reparent(struct btree_update *as, child->b = NULL; child->mode = BTREE_INTERIOR_UPDATING_AS; - /* - * When we write a new btree root, we have to drop our journal pin - * _before_ the new nodes are technically reachable; see - * btree_update_nodes_written(). 
- * - * This goes for journal pins that are recursively blocked on us - so, - * just transfer the journal pin to the new interior update so - * btree_update_nodes_written() can drop it. - */ bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL); - bch2_journal_pin_drop(&c->journal, &child->journal); } static void btree_update_updated_root(struct btree_update *as, struct btree *b) @@ -827,7 +854,7 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, closure_wake_up(&c->btree_interior_update_wait); } - clear_btree_node_dirty(b); + clear_btree_node_dirty(c, b); clear_btree_node_need_write(b); /* @@ -866,24 +893,33 @@ void bch2_btree_update_done(struct btree_update *as) { BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); + if (as->took_gc_lock) + up_read(&as->c->gc_lock); + as->took_gc_lock = false; + bch2_btree_reserve_put(as); continue_at(&as->cl, btree_update_set_nodes_written, system_freezable_wq); } struct btree_update * -bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, - unsigned nr_nodes, unsigned flags, - struct closure *cl) +bch2_btree_update_start(struct btree_iter *iter, unsigned level, + unsigned nr_nodes, unsigned flags) { + struct btree_trans *trans = iter->trans; struct bch_fs *c = trans->c; struct btree_update *as; + struct closure cl; int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) ? BCH_DISK_RESERVATION_NOFAIL : 0; - int journal_flags = (flags & BTREE_INSERT_JOURNAL_RESERVED) - ? JOURNAL_RES_GET_RECLAIM : 0; + int journal_flags = 0; int ret = 0; + if (flags & BTREE_INSERT_JOURNAL_RESERVED) + journal_flags |= JOURNAL_RES_GET_RESERVED; + + closure_init_stack(&cl); +retry: /* * This check isn't necessary for correctness - it's just to potentially * prevent us from doing a lot of work that'll end up being wasted: @@ -892,12 +928,36 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, if (ret) return ERR_PTR(ret); + /* + * XXX: figure out how far we might need to split, + * instead of locking/reserving all the way to the root: + */ + if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { + trace_trans_restart_iter_upgrade(trans->ip); + return ERR_PTR(-EINTR); + } + + if (flags & BTREE_INSERT_GC_LOCK_HELD) + lockdep_assert_held(&c->gc_lock); + else if (!down_read_trylock(&c->gc_lock)) { + if (flags & BTREE_INSERT_NOUNLOCK) + return ERR_PTR(-EINTR); + + bch2_trans_unlock(trans); + down_read(&c->gc_lock); + if (!bch2_trans_relock(trans)) { + up_read(&c->gc_lock); + return ERR_PTR(-EINTR); + } + } + as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO); memset(as, 0, sizeof(*as)); closure_init(&as->cl, NULL); as->c = c; as->mode = BTREE_INTERIOR_NO_UPDATE; - as->btree_id = id; + as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD); + as->btree_id = iter->btree_id; INIT_LIST_HEAD(&as->list); INIT_LIST_HEAD(&as->unwritten_list); INIT_LIST_HEAD(&as->write_blocked_list); @@ -909,16 +969,25 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, BTREE_UPDATE_JOURNAL_RES, journal_flags|JOURNAL_RES_GET_NONBLOCK); if (ret == -EAGAIN) { - if (flags & BTREE_INSERT_NOUNLOCK) - return ERR_PTR(-EINTR); + /* + * this would be cleaner if bch2_journal_preres_get() took a + * closure argument + */ + if (flags & BTREE_INSERT_NOUNLOCK) { + ret = -EINTR; + goto err; + } bch2_trans_unlock(trans); + if (flags & BTREE_INSERT_JOURNAL_RECLAIM) + goto err; + ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, BTREE_UPDATE_JOURNAL_RES, journal_flags); if (ret) - return ERR_PTR(ret); + goto err; if 
(!bch2_trans_relock(trans)) { ret = -EINTR; @@ -933,10 +1002,15 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, if (ret) goto err; - ret = bch2_btree_reserve_get(as, nr_nodes, flags, cl); + ret = bch2_btree_reserve_get(as, nr_nodes, flags, + !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL); if (ret) goto err; + bch2_journal_pin_add(&c->journal, + atomic64_read(&c->journal.seq), + &as->journal, NULL); + mutex_lock(&c->btree_interior_update_lock); list_add_tail(&as->list, &c->btree_interior_update_list); mutex_unlock(&c->btree_interior_update_lock); @@ -944,6 +1018,18 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, return as; err: bch2_btree_update_free(as); + + if (ret == -EAGAIN) { + BUG_ON(flags & BTREE_INSERT_NOUNLOCK); + + bch2_trans_unlock(trans); + closure_sync(&cl); + ret = -EINTR; + } + + if (ret == -EINTR && bch2_trans_relock(trans)) + goto retry; + return ERR_PTR(ret); } @@ -956,6 +1042,11 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) list_del_init(&b->list); mutex_unlock(&c->btree_cache.lock); + if (b->c.level) + six_lock_pcpu_alloc(&b->c.lock); + else + six_lock_pcpu_free(&b->c.lock); + mutex_lock(&c->btree_root_lock); BUG_ON(btree_node_root(c, b) && (b->c.level < btree_node_root(c, b)->c.level || @@ -1018,7 +1109,19 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b struct bkey_i *insert, struct btree_node_iter *node_iter) { + struct bch_fs *c = as->c; struct bkey_packed *k; + const char *invalid; + + invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?: + bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert)); + if (invalid) { + char buf[160]; + + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert)); + bch2_fs_inconsistent(c, "inserting invalid bkey %s: %s", buf, invalid); + dump_stack(); + } BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > ARRAY_SIZE(as->journal_entries)); @@ -1034,7 +1137,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b bch2_btree_node_iter_advance(node_iter, b); bch2_btree_bset_insert_key(iter, b, node_iter, insert); - set_btree_node_dirty(b); + set_btree_node_dirty(c, b); set_btree_node_need_write(b); } @@ -1046,10 +1149,12 @@ static struct btree *__btree_split_node(struct btree_update *as, struct btree *n1, struct btree_iter *iter) { + struct bkey_format_state s; size_t nr_packed = 0, nr_unpacked = 0; struct btree *n2; struct bset *set1, *set2; - struct bkey_packed *k, *prev = NULL; + struct bkey_packed *k, *set2_start, *set2_end, *out, *prev = NULL; + struct bpos n1_pos; n2 = bch2_btree_node_alloc(as, n1->c.level); bch2_btree_update_add_new_node(as, n2); @@ -1059,8 +1164,6 @@ static struct btree *__btree_split_node(struct btree_update *as, SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data)); n2->key.k.p = n1->key.k.p; - btree_node_set_format(n2, n2->data->format); - set1 = btree_bset_first(n1); set2 = btree_bset_first(n2); @@ -1070,7 +1173,7 @@ static struct btree *__btree_split_node(struct btree_update *as, */ k = set1->start; while (1) { - struct bkey_packed *n = bkey_next_skip_noops(k, vstruct_last(set1)); + struct bkey_packed *n = bkey_next(k); if (n == vstruct_last(set1)) break; @@ -1087,33 +1190,53 @@ static struct btree *__btree_split_node(struct btree_update *as, } BUG_ON(!prev); + set2_start = k; + set2_end = vstruct_last(set1); - btree_set_max(n1, bkey_unpack_pos(n1, prev)); - btree_set_min(n2, bkey_successor(n1->key.k.p)); - - set2->u64s = cpu_to_le16((u64 *) 
vstruct_end(set1) - (u64 *) k); - set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s)); - + set1->u64s = cpu_to_le16((u64 *) set2_start - set1->_data); set_btree_bset_end(n1, n1->set); - set_btree_bset_end(n2, n2->set); - - n2->nr.live_u64s = le16_to_cpu(set2->u64s); - n2->nr.bset_u64s[0] = le16_to_cpu(set2->u64s); - n2->nr.packed_keys = n1->nr.packed_keys - nr_packed; - n2->nr.unpacked_keys = n1->nr.unpacked_keys - nr_unpacked; n1->nr.live_u64s = le16_to_cpu(set1->u64s); n1->nr.bset_u64s[0] = le16_to_cpu(set1->u64s); n1->nr.packed_keys = nr_packed; n1->nr.unpacked_keys = nr_unpacked; + n1_pos = bkey_unpack_pos(n1, prev); + if (as->c->sb.version < bcachefs_metadata_version_snapshot) + n1_pos.snapshot = U32_MAX; + + btree_set_max(n1, n1_pos); + btree_set_min(n2, bpos_successor(n1->key.k.p)); + + bch2_bkey_format_init(&s); + bch2_bkey_format_add_pos(&s, n2->data->min_key); + bch2_bkey_format_add_pos(&s, n2->data->max_key); + + for (k = set2_start; k != set2_end; k = bkey_next(k)) { + struct bkey uk = bkey_unpack_key(n1, k); + bch2_bkey_format_add_key(&s, &uk); + } + + n2->data->format = bch2_bkey_format_done(&s); + btree_node_set_format(n2, n2->data->format); + + out = set2->start; + memset(&n2->nr, 0, sizeof(n2->nr)); + + for (k = set2_start; k != set2_end; k = bkey_next(k)) { + BUG_ON(!bch2_bkey_transform(&n2->format, out, bkey_packed(k) + ? &n1->format : &bch2_bkey_format_current, k)); + out->format = KEY_FORMAT_LOCAL_BTREE; + btree_keys_account_key_add(&n2->nr, 0, out); + out = bkey_next(out); + } + + set2->u64s = cpu_to_le16((u64 *) out - set2->_data); + set_btree_bset_end(n2, n2->set); + BUG_ON(!set1->u64s); BUG_ON(!set2->u64s); - memcpy_u64s(set2->start, - vstruct_end(set1), - le16_to_cpu(set2->u64s)); - btree_node_reset_sib_u64s(n1); btree_node_reset_sib_u64s(n2); @@ -1148,7 +1271,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, struct bkey_packed *src, *dst, *n; struct bset *i; - BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE); + BUG_ON(btree_node_type(b) != BKEY_TYPE_btree); bch2_btree_node_iter_init(&node_iter, b, &k->k.p); @@ -1167,7 +1290,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, i = btree_bset_first(b); src = dst = i->start; while (src != vstruct_last(i)) { - n = bkey_next_skip_noops(src, vstruct_last(i)); + n = bkey_next(src); if (!bkey_deleted(src)) { memmove_u64s_down(dst, src, src->u64s); dst = bkey_next(dst); @@ -1175,6 +1298,9 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, src = n; } + /* Also clear out the unwritten whiteouts area: */ + b->whiteout_u64s = 0; + i->u64s = cpu_to_le16((u64 *) dst - i->_data); set_btree_bset_end(b, b->set); @@ -1313,7 +1439,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, * the node the iterator points to: */ while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) && - (bkey_cmp_packed(b, k, &insert->k) >= 0)) + (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0)) ; for_each_keylist_key(keys, insert) @@ -1348,14 +1474,12 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; + lockdep_assert_held(&c->gc_lock); BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level)); BUG_ON(!b->c.level); BUG_ON(!as || as->b); bch2_verify_keylist_sorted(keys); - if (as->must_rewrite) - goto split; - bch2_btree_node_lock_for_insert(c, b, iter); if (!bch2_btree_node_insert_fits(c, b, 
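The rewritten __btree_split_node() above no longer copies the second half of the keys verbatim: it first builds a fresh bkey_format sized to n2's actual range (its min_key, max_key and every key that will live there), then repacks each key into that format with bch2_bkey_transform(), re-accounting the key counts as it goes. The toy program below illustrates the same two-step idea, compute a format from the range then re-encode, using a plain u64 "position" and a (base, bits) format; it is an analogy, not the bkey packing code.

#include <stdint.h>
#include <stdio.h>

struct demo_format {
	uint64_t base;
	unsigned bits;
};

static unsigned fls64_(uint64_t v)
{
	unsigned r = 0;

	while (v) {
		r++;
		v >>= 1;
	}
	return r;
}

/*
 * Like the bch2_bkey_format_add_pos()/_add_key() calls above: widen the
 * format until it can represent every value we intend to store.
 */
static struct demo_format format_for_range(const uint64_t *keys, unsigned nr,
					   uint64_t min_key, uint64_t max_key)
{
	struct demo_format f = { .base = min_key, .bits = 0 };
	unsigned i, b;

	f.bits = fls64_(max_key - min_key);
	for (i = 0; i < nr; i++) {
		b = fls64_(keys[i] - min_key);
		if (b > f.bits)
			f.bits = b;
	}
	return f;
}

/*
 * Like the bch2_bkey_transform() loop in __btree_split_node(): re-encode
 * each key as an offset in the new, tighter format.
 */
static void repack(const struct demo_format *f, const uint64_t *in,
		   uint64_t *out, unsigned nr)
{
	unsigned i;

	for (i = 0; i < nr; i++)
		out[i] = in[i] - f->base;	/* guaranteed to fit in f->bits */
}

int main(void)
{
	uint64_t keys[] = { 1000, 1010, 1042 }, packed[3];
	struct demo_format f = format_for_range(keys, 3, 1000, 1100);

	repack(&f, keys, packed, 3);
	printf("base=%llu bits=%u first=%llu\n",
	       (unsigned long long) f.base, f.bits,
	       (unsigned long long) packed[0]);
	return 0;
}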
bch2_keylist_u64s(keys))) { @@ -1363,6 +1487,8 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, goto split; } + btree_node_interior_verify(c, b); + bch2_btree_insert_keys_interior(as, b, iter, keys); live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; @@ -1380,14 +1506,6 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, bch2_btree_node_unlock_write(b, iter); btree_node_interior_verify(c, b); - - /* - * when called from the btree_split path the new nodes aren't added to - * the btree iterator yet, so the merge path's unlock/wait/relock dance - * won't work: - */ - bch2_foreground_maybe_merge(c, iter, b->c.level, - flags|BTREE_INSERT_NOUNLOCK); return; split: btree_split(as, b, iter, keys, flags); @@ -1396,118 +1514,73 @@ split: int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, unsigned flags) { - struct btree_trans *trans = iter->trans; struct btree *b = iter_l(iter)->b; struct btree_update *as; - struct closure cl; + unsigned l; int ret = 0; - struct btree_insert_entry *i; - - /* - * We already have a disk reservation and open buckets pinned; this - * allocation must not block: - */ - trans_for_each_update(trans, i) - if (btree_node_type_needs_gc(i->iter->btree_id)) - flags |= BTREE_INSERT_USE_RESERVE; - - closure_init_stack(&cl); - - /* Hack, because gc and splitting nodes doesn't mix yet: */ - if (!(flags & BTREE_INSERT_GC_LOCK_HELD) && - !down_read_trylock(&c->gc_lock)) { - if (flags & BTREE_INSERT_NOUNLOCK) { - trace_transaction_restart_ip(trans->ip, _THIS_IP_); - return -EINTR; - } - - bch2_trans_unlock(trans); - down_read(&c->gc_lock); - if (!bch2_trans_relock(trans)) - ret = -EINTR; - } - - /* - * XXX: figure out how far we might need to split, - * instead of locking/reserving all the way to the root: - */ - if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { - trace_trans_restart_iter_upgrade(trans->ip); - ret = -EINTR; - goto out; - } - - as = bch2_btree_update_start(trans, iter->btree_id, - btree_update_reserve_required(c, b), flags, - !(flags & BTREE_INSERT_NOUNLOCK) ? 
&cl : NULL); - if (IS_ERR(as)) { - ret = PTR_ERR(as); - if (ret == -EAGAIN) { - BUG_ON(flags & BTREE_INSERT_NOUNLOCK); - bch2_trans_unlock(trans); - ret = -EINTR; - - trace_transaction_restart_ip(trans->ip, _THIS_IP_); - } - goto out; - } + as = bch2_btree_update_start(iter, iter->level, + btree_update_reserve_required(c, b), flags); + if (IS_ERR(as)) + return PTR_ERR(as); btree_split(as, b, iter, NULL, flags); bch2_btree_update_done(as); - /* - * We haven't successfully inserted yet, so don't downgrade all the way - * back to read locks; - */ - __bch2_btree_iter_downgrade(iter, 1); -out: - if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) - up_read(&c->gc_lock); - closure_sync(&cl); + for (l = iter->level + 1; btree_iter_node(iter, l) && !ret; l++) + ret = bch2_foreground_maybe_merge(c, iter, l, flags); + return ret; } -void __bch2_foreground_maybe_merge(struct bch_fs *c, - struct btree_iter *iter, - unsigned level, - unsigned flags, - enum btree_node_sibling sib) +int __bch2_foreground_maybe_merge(struct bch_fs *c, + struct btree_iter *iter, + unsigned level, + unsigned flags, + enum btree_node_sibling sib) { struct btree_trans *trans = iter->trans; + struct btree_iter *sib_iter = NULL; struct btree_update *as; struct bkey_format_state new_s; struct bkey_format new_f; struct bkey_i delete; struct btree *b, *m, *n, *prev, *next, *parent; - struct closure cl; + struct bpos sib_pos; size_t sib_u64s; - int ret = 0; + int ret = 0, ret2 = 0; BUG_ON(!btree_node_locked(iter, level)); - - closure_init_stack(&cl); retry: + ret = bch2_btree_iter_traverse(iter); + if (ret) + goto err; + BUG_ON(!btree_node_locked(iter, level)); b = iter->l[level].b; - parent = btree_node_parent(iter, b); - if (!parent) + if ((sib == btree_prev_sib && !bpos_cmp(b->data->min_key, POS_MIN)) || + (sib == btree_next_sib && !bpos_cmp(b->data->max_key, POS_MAX))) { + b->sib_u64s[sib] = U16_MAX; goto out; + } - if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) - goto out; + sib_pos = sib == btree_prev_sib + ? 
bpos_predecessor(b->data->min_key) + : bpos_successor(b->data->max_key); - /* XXX: can't be holding read locks */ - m = bch2_btree_node_get_sibling(c, iter, b, sib); - if (IS_ERR(m)) { - ret = PTR_ERR(m); + sib_iter = bch2_trans_get_node_iter(trans, iter->btree_id, + sib_pos, U8_MAX, level, + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(sib_iter); + if (ret) goto err; - } - /* NULL means no sibling: */ - if (!m) { + m = sib_iter->l[level].b; + + if (btree_node_parent(iter, b) != + btree_node_parent(sib_iter, m)) { b->sib_u64s[sib] = U16_MAX; goto out; } @@ -1520,9 +1593,13 @@ retry: next = m; } + BUG_ON(bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key)); + bch2_bkey_format_init(&new_s); - __bch2_btree_calc_format(&new_s, b); - __bch2_btree_calc_format(&new_s, m); + bch2_bkey_format_add_pos(&new_s, prev->data->min_key); + __bch2_btree_calc_format(&new_s, prev); + __bch2_btree_calc_format(&new_s, next); + bch2_bkey_format_add_pos(&new_s, next->data->max_key); new_f = bch2_bkey_format_done(&new_s); sib_u64s = btree_node_u64s_with_format(b, &new_f) + @@ -1535,33 +1612,21 @@ retry: } sib_u64s = min(sib_u64s, btree_max_u64s(c)); + sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1); b->sib_u64s[sib] = sib_u64s; - if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) { - six_unlock_intent(&m->c.lock); + if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) goto out; - } - /* We're changing btree topology, doesn't mix with gc: */ - if (!(flags & BTREE_INSERT_GC_LOCK_HELD) && - !down_read_trylock(&c->gc_lock)) - goto err_cycle_gc_lock; - - if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { - ret = -EINTR; - goto err_unlock; - } - - as = bch2_btree_update_start(trans, iter->btree_id, + parent = btree_node_parent(iter, b); + as = bch2_btree_update_start(iter, level, btree_update_reserve_required(c, parent) + 1, flags| BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE, - !(flags & BTREE_INSERT_NOUNLOCK) ? 
&cl : NULL); - if (IS_ERR(as)) { - ret = PTR_ERR(as); - goto err_unlock; - } + BTREE_INSERT_USE_RESERVE); + ret = PTR_ERR_OR_ZERO(as); + if (ret) + goto err; trace_btree_merge(c, b); @@ -1595,6 +1660,7 @@ retry: bch2_btree_update_get_open_buckets(as, n); six_lock_increment(&b->c.lock, SIX_LOCK_intent); + six_lock_increment(&m->c.lock, SIX_LOCK_intent); bch2_btree_iter_node_drop(iter, b); bch2_btree_iter_node_drop(iter, m); @@ -1608,11 +1674,9 @@ retry: six_unlock_intent(&n->c.lock); bch2_btree_update_done(as); - - if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) - up_read(&c->gc_lock); out: bch2_btree_trans_verify_locks(trans); + bch2_trans_iter_free(trans, sib_iter); /* * Don't downgrade locks here: we're called after successful insert, @@ -1623,58 +1687,56 @@ out: * split path, and downgrading to read locks in there is potentially * confusing: */ - closure_sync(&cl); - return; - -err_cycle_gc_lock: - six_unlock_intent(&m->c.lock); - - if (flags & BTREE_INSERT_NOUNLOCK) - goto out; - - bch2_trans_unlock(trans); - - down_read(&c->gc_lock); - up_read(&c->gc_lock); - ret = -EINTR; - goto err; - -err_unlock: - six_unlock_intent(&m->c.lock); - if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) - up_read(&c->gc_lock); + return ret ?: ret2; err: - BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK)); - - if ((ret == -EAGAIN || ret == -EINTR) && - !(flags & BTREE_INSERT_NOUNLOCK)) { - bch2_trans_unlock(trans); - closure_sync(&cl); - ret = bch2_btree_iter_traverse(iter); - if (ret) - goto out; + bch2_trans_iter_put(trans, sib_iter); + sib_iter = NULL; + if (ret == -EINTR && bch2_trans_relock(trans)) goto retry; + + if (ret == -EINTR && !(flags & BTREE_INSERT_NOUNLOCK)) { + ret2 = ret; + ret = bch2_btree_iter_traverse_all(trans); + if (!ret) + goto retry; } goto out; } -static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, - struct btree *b, unsigned flags, - struct closure *cl) +/** + * bch_btree_node_rewrite - Rewrite/move a btree node + */ +int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, + __le64 seq, unsigned flags) { - struct btree *n, *parent = btree_node_parent(iter, b); + struct btree *b, *n, *parent; struct btree_update *as; + int ret; - as = bch2_btree_update_start(iter->trans, iter->btree_id, + flags |= BTREE_INSERT_NOFAIL; +retry: + ret = bch2_btree_iter_traverse(iter); + if (ret) + goto out; + + b = bch2_btree_iter_peek_node(iter); + if (!b || b->data->keys.seq != seq) + goto out; + + parent = btree_node_parent(iter, b); + as = bch2_btree_update_start(iter, b->c.level, (parent ? btree_update_reserve_required(c, parent) : 0) + 1, - flags, cl); - if (IS_ERR(as)) { + flags); + ret = PTR_ERR_OR_ZERO(as); + if (ret == -EINTR) + goto retry; + if (ret) { trace_btree_gc_rewrite_node_fail(c, b); - return PTR_ERR(as); + goto out; } bch2_btree_interior_update_will_free_node(as, b); @@ -1705,60 +1767,8 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, six_unlock_intent(&n->c.lock); bch2_btree_update_done(as); - return 0; -} - -/** - * bch_btree_node_rewrite - Rewrite/move a btree node - * - * Returns 0 on success, -EINTR or -EAGAIN on failure (i.e. 
- * btree_check_reserve() has to wait) - */ -int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, - __le64 seq, unsigned flags) -{ - struct btree_trans *trans = iter->trans; - struct closure cl; - struct btree *b; - int ret; - - flags |= BTREE_INSERT_NOFAIL; - - closure_init_stack(&cl); - - bch2_btree_iter_upgrade(iter, U8_MAX); - - if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) { - if (!down_read_trylock(&c->gc_lock)) { - bch2_trans_unlock(trans); - down_read(&c->gc_lock); - } - } - - while (1) { - ret = bch2_btree_iter_traverse(iter); - if (ret) - break; - - b = bch2_btree_iter_peek_node(iter); - if (!b || b->data->keys.seq != seq) - break; - - ret = __btree_node_rewrite(c, iter, b, flags, &cl); - if (ret != -EAGAIN && - ret != -EINTR) - break; - - bch2_trans_unlock(trans); - closure_sync(&cl); - } - +out: bch2_btree_iter_downgrade(iter); - - if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) - up_read(&c->gc_lock); - - closure_sync(&cl); return ret; } @@ -1829,74 +1839,34 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, struct btree_update *as = NULL; struct btree *new_hash = NULL; struct closure cl; - int ret; + int ret = 0; closure_init_stack(&cl); - if (!bch2_btree_iter_upgrade(iter, U8_MAX)) - return -EINTR; - - if (!down_read_trylock(&c->gc_lock)) { - bch2_trans_unlock(iter->trans); - down_read(&c->gc_lock); - - if (!bch2_trans_relock(iter->trans)) { - ret = -EINTR; - goto err; - } - } - /* * check btree_ptr_hash_val() after @b is locked by * btree_iter_traverse(): */ if (btree_ptr_hash_val(new_key) != b->hash_val) { - /* bch2_btree_reserve_get will unlock */ ret = bch2_btree_cache_cannibalize_lock(c, &cl); if (ret) { bch2_trans_unlock(iter->trans); - up_read(&c->gc_lock); closure_sync(&cl); - down_read(&c->gc_lock); - - if (!bch2_trans_relock(iter->trans)) { - ret = -EINTR; - goto err; - } + if (!bch2_trans_relock(iter->trans)) + return -EINTR; } new_hash = bch2_btree_node_mem_alloc(c); } -retry: - as = bch2_btree_update_start(iter->trans, iter->btree_id, - parent ? btree_update_reserve_required(c, parent) : 0, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_USE_ALLOC_RESERVE, - &cl); + as = bch2_btree_update_start(iter, b->c.level, + parent ? 
btree_update_reserve_required(c, parent) : 0, + BTREE_INSERT_NOFAIL); if (IS_ERR(as)) { ret = PTR_ERR(as); - if (ret == -EAGAIN) - ret = -EINTR; - - if (ret == -EINTR) { - bch2_trans_unlock(iter->trans); - up_read(&c->gc_lock); - closure_sync(&cl); - down_read(&c->gc_lock); - - if (bch2_trans_relock(iter->trans)) - goto retry; - } - goto err; } - ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(new_key)); - if (ret) - goto err_free_update; - __bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key); bch2_btree_iter_downgrade(iter); @@ -1909,12 +1879,9 @@ err: six_unlock_write(&new_hash->c.lock); six_unlock_intent(&new_hash->c.lock); } - up_read(&c->gc_lock); closure_sync(&cl); + bch2_btree_cache_cannibalize_unlock(c); return ret; -err_free_update: - bch2_btree_update_free(as); - goto err; } /* Init code: */ diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h index 7668225..f2925b0 100644 --- a/libbcachefs/btree_update_interior.h +++ b/libbcachefs/btree_update_interior.h @@ -47,8 +47,8 @@ struct btree_update { BTREE_INTERIOR_UPDATING_AS, } mode; - unsigned must_rewrite:1; unsigned nodes_written:1; + unsigned took_gc_lock:1; enum btree_id btree_id; @@ -121,8 +121,7 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, void bch2_btree_update_done(struct btree_update *); struct btree_update * -bch2_btree_update_start(struct btree_trans *, enum btree_id, unsigned, - unsigned, struct closure *); +bch2_btree_update_start(struct btree_iter *, unsigned, unsigned, unsigned); void bch2_btree_interior_update_will_free_node(struct btree_update *, struct btree *); @@ -133,10 +132,10 @@ void bch2_btree_insert_node(struct btree_update *, struct btree *, unsigned); int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned); -void __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *, - unsigned, unsigned, enum btree_node_sibling); +int __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *, + unsigned, unsigned, enum btree_node_sibling); -static inline void bch2_foreground_maybe_merge_sibling(struct bch_fs *c, +static inline int bch2_foreground_maybe_merge_sibling(struct bch_fs *c, struct btree_iter *iter, unsigned level, unsigned flags, enum btree_node_sibling sib) @@ -144,27 +143,27 @@ static inline void bch2_foreground_maybe_merge_sibling(struct bch_fs *c, struct btree *b; if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) - return; + return 0; if (!bch2_btree_node_relock(iter, level)) - return; + return 0; b = iter->l[level].b; if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) - return; + return 0; - __bch2_foreground_maybe_merge(c, iter, level, flags, sib); + return __bch2_foreground_maybe_merge(c, iter, level, flags, sib); } -static inline void bch2_foreground_maybe_merge(struct bch_fs *c, +static inline int bch2_foreground_maybe_merge(struct bch_fs *c, struct btree_iter *iter, unsigned level, unsigned flags) { - bch2_foreground_maybe_merge_sibling(c, iter, level, flags, - btree_prev_sib); - bch2_foreground_maybe_merge_sibling(c, iter, level, flags, - btree_next_sib); + return bch2_foreground_maybe_merge_sibling(c, iter, level, flags, + btree_prev_sib) ?: + bch2_foreground_maybe_merge_sibling(c, iter, level, flags, + btree_next_sib); } void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); @@ -237,6 +236,9 @@ static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, b->whiteout_u64s; ssize_t total = c->opts.btree_node_size << 6; + /* Always leave one extra u64 for 
bch2_varint_decode: */ + used++; + return total - used; } diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index 49995cd..e258cf8 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -21,6 +21,14 @@ #include #include +static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, + const struct btree_insert_entry *r) +{ + return cmp_int(l->btree_id, r->btree_id) ?: + -cmp_int(l->level, r->level) ?: + bpos_cmp(l->k->k.p, r->k->k.p); +} + static inline bool same_leaf_as_prev(struct btree_trans *trans, struct btree_insert_entry *i) { @@ -62,27 +70,24 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, EBUG_ON(btree_node_just_written(b)); EBUG_ON(bset_written(b, btree_bset_last(b))); EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); - EBUG_ON(bkey_cmp(b->data->min_key, POS_MIN) && - bkey_cmp(bkey_start_pos(&insert->k), - bkey_predecessor(b->data->min_key)) < 0); - EBUG_ON(bkey_cmp(insert->k.p, b->data->min_key) < 0); - EBUG_ON(bkey_cmp(insert->k.p, b->data->max_key) > 0); + EBUG_ON(bpos_cmp(insert->k.p, b->data->min_key) < 0); + EBUG_ON(bpos_cmp(insert->k.p, b->data->max_key) > 0); EBUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(iter->trans->c, b)); EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); k = bch2_btree_node_iter_peek_all(node_iter, b); - if (k && bkey_cmp_packed(b, k, &insert->k)) + if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) k = NULL; /* @k is the key being overwritten/deleted, if any: */ - EBUG_ON(k && bkey_whiteout(k)); + EBUG_ON(k && bkey_deleted(k)); /* Deleting, but not found? nothing to do: */ - if (bkey_whiteout(&insert->k) && !k) + if (bkey_deleted(&insert->k) && !k) return false; - if (bkey_whiteout(&insert->k)) { + if (bkey_deleted(&insert->k)) { /* Deleting: */ btree_account_key_drop(b, k); k->type = KEY_TYPE_deleted; @@ -129,7 +134,7 @@ fix_iter: return true; } -static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, +static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, unsigned i, u64 seq) { struct bch_fs *c = container_of(j, struct bch_fs, journal); @@ -140,14 +145,15 @@ static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, bch2_btree_node_write_cond(c, b, (btree_current_write(b) == w && w->journal.seq == seq)); six_unlock_read(&b->c.lock); + return 0; } -static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) +static int btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) { return __btree_node_flush(j, pin, 0, seq); } -static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) +static int btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) { return __btree_node_flush(j, pin, 1, seq); } @@ -191,7 +197,7 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, bch2_btree_add_journal_pin(c, b, trans->journal_res.seq); if (unlikely(!btree_node_dirty(b))) - set_btree_node_dirty(b); + set_btree_node_dirty(c, b); live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; u64s_added = (int) bset_u64s(t) - old_u64s; @@ -214,15 +220,23 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, /* Normal update interface: */ static inline void btree_insert_entry_checks(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *insert) + struct btree_insert_entry *i) { struct bch_fs *c = trans->c; - BUG_ON(bkey_cmp(insert->k.p, iter->pos)); - 
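The new btree_insert_entry_cmp() at the top of the btree_update_leaf.c hunk orders updates by btree ID ascending, then by level descending (so updates to higher, interior levels of the same btree sort first), then by key position; __bch2_trans_update2() further down keeps trans->updates2 sorted by exactly this ordering. A standalone version of the same three-way comparison follows, with plain integers standing in for bpos.

#include <stdio.h>

#define cmp_int(a, b)	(((a) > (b)) - ((a) < (b)))

struct demo_entry {
	unsigned		btree_id;
	unsigned		level;
	unsigned long long	pos;
};

static int demo_entry_cmp(const struct demo_entry *l,
			  const struct demo_entry *r)
{
	int c;

	c = cmp_int(l->btree_id, r->btree_id);
	if (c)
		return c;
	c = -cmp_int(l->level, r->level);	/* higher level sorts first */
	if (c)
		return c;
	return cmp_int(l->pos, r->pos);
}

int main(void)
{
	struct demo_entry a = { .btree_id = 0, .level = 1, .pos = 10 };
	struct demo_entry b = { .btree_id = 0, .level = 0, .pos = 10 };

	/* a (interior, level 1) sorts before b (leaf, level 0): */
	printf("%d\n", demo_entry_cmp(&a, &b));	/* prints -1 */
	return 0;
}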
BUG_ON(debug_check_bkeys(c) && - bch2_bkey_invalid(c, bkey_i_to_s_c(insert), - __btree_node_type(iter->level, iter->btree_id))); + if (bch2_debug_check_bkeys) { + const char *invalid = bch2_bkey_invalid(c, + bkey_i_to_s_c(i->k), i->bkey_type); + if (invalid) { + char buf[200]; + + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); + panic("invalid bkey %s on insert: %s\n", buf, invalid); + } + } + BUG_ON(!i->is_extent && bpos_cmp(i->k->k.p, i->iter->real_pos)); + BUG_ON(i->level != i->iter->level); + BUG_ON(i->btree_id != i->iter->btree_id); } static noinline int @@ -286,6 +300,11 @@ btree_key_can_insert_cached(struct btree_trans *trans, BUG_ON(iter->level); + if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && + bch2_btree_key_cache_must_wait(trans->c) && + !(trans->flags & BTREE_INSERT_JOURNAL_RECLAIM)) + return BTREE_INSERT_NEED_JOURNAL_RECLAIM; + if (u64s <= ck->u64s) return BTREE_INSERT_OK; @@ -330,19 +349,6 @@ static inline void do_btree_insert_one(struct btree_trans *trans, } } -static inline bool iter_has_trans_triggers(struct btree_iter *iter) -{ - return BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << iter->btree_id); -} - -static inline bool iter_has_nontrans_triggers(struct btree_iter *iter) -{ - return (((BTREE_NODE_TYPE_HAS_TRIGGERS & - ~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS)) | - (1U << BTREE_ID_EC)) & - (1U << iter->btree_id); -} - static noinline void bch2_btree_iter_unlock_noinline(struct btree_iter *iter) { __bch2_btree_iter_unlock(iter); @@ -370,8 +376,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, struct btree_insert_entry **stopped_at) { struct bch_fs *c = trans->c; - struct bch_fs_usage *fs_usage = NULL; struct btree_insert_entry *i; + struct btree_trans_commit_hook *h; unsigned u64s = 0; bool marking = false; int ret; @@ -389,6 +395,14 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, prefetch(&trans->c->journal.flags); + h = trans->hooks; + while (h) { + ret = h->fn(trans, h); + if (ret) + return ret; + h = h->next; + } + trans_for_each_update2(trans, i) { /* Multiple inserts might go to same leaf: */ if (!same_leaf_as_prev(trans, i)) @@ -403,13 +417,19 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, return ret; } - if (btree_node_type_needs_gc(i->iter->btree_id)) + if (btree_node_type_needs_gc(i->bkey_type)) marking = true; } if (marking) { percpu_down_read(&c->mark_lock); - fs_usage = bch2_fs_usage_scratch_get(c); + } + + /* Must be called under mark_lock: */ + if (marking && trans->fs_usage_deltas && + !bch2_replicas_delta_list_marked(c, trans->fs_usage_deltas)) { + ret = BTREE_INSERT_NEED_MARK_REPLICAS; + goto err; } /* @@ -440,29 +460,21 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, */ if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { - if (journal_seq_verify(c)) + if (bch2_journal_seq_verify) trans_for_each_update2(trans, i) i->k->k.version.lo = trans->journal_res.seq; - else if (inject_invalid_keys(c)) + else if (bch2_inject_invalid_keys) trans_for_each_update2(trans, i) i->k->k.version = MAX_VERSION; } - /* Must be called under mark_lock: */ - if (marking && trans->fs_usage_deltas && - bch2_replicas_delta_list_apply(c, fs_usage, - trans->fs_usage_deltas)) { - ret = BTREE_INSERT_NEED_MARK_REPLICAS; - goto err; - } - trans_for_each_update(trans, i) - if (iter_has_nontrans_triggers(i->iter)) + if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) bch2_mark_update(trans, i->iter, i->k, - fs_usage, i->trigger_flags); + NULL, i->trigger_flags); - if (marking) - bch2_trans_fs_usage_apply(trans, 
fs_usage); + if (marking && trans->fs_usage_deltas) + bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas); if (unlikely(c->gc_pos.phase)) bch2_trans_mark_gc(trans); @@ -471,31 +483,85 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, do_btree_insert_one(trans, i->iter, i->k); err: if (marking) { - bch2_fs_usage_scratch_put(c, fs_usage); percpu_up_read(&c->mark_lock); } return ret; } +static noinline int maybe_do_btree_merge(struct btree_trans *trans, struct btree_iter *iter) +{ + struct btree_insert_entry *i; + struct btree *b = iter_l(iter)->b; + struct bkey_s_c old; + int u64s_delta = 0; + int ret; + + /* + * Inserting directly into interior nodes is an uncommon operation with + * various weird edge cases: also, a lot of things about + * BTREE_ITER_NODES iters need to be audited + */ + if (unlikely(btree_iter_type(iter) != BTREE_ITER_KEYS)) + return 0; + + BUG_ON(iter->level); + + trans_for_each_update2(trans, i) { + if (iter_l(i->iter)->b != b) + continue; + + old = bch2_btree_iter_peek_slot(i->iter); + ret = bkey_err(old); + if (ret) + return ret; + + u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0; + u64s_delta -= !bkey_deleted(old.k) ? old.k->u64s : 0; + } + + return u64s_delta <= 0 + ? (bch2_foreground_maybe_merge(trans->c, iter, iter->level, + trans->flags & ~BTREE_INSERT_NOUNLOCK) ?: -EINTR) + : 0; +} + /* * Get journal reservation, take write locks, and attempt to do btree update(s): */ static inline int do_bch2_trans_commit(struct btree_trans *trans, struct btree_insert_entry **stopped_at) { + struct bch_fs *c = trans->c; struct btree_insert_entry *i; struct btree_iter *iter; int ret; + trans_for_each_update2(trans, i) { + struct btree *b; + + BUG_ON(!btree_node_intent_locked(i->iter, i->level)); + + if (btree_iter_type(i->iter) == BTREE_ITER_CACHED) + continue; + + b = iter_l(i->iter)->b; + if (b->sib_u64s[0] < c->btree_foreground_merge_threshold || + b->sib_u64s[1] < c->btree_foreground_merge_threshold) { + ret = maybe_do_btree_merge(trans, i->iter); + if (unlikely(ret)) + return ret; + } + } + trans_for_each_update2(trans, i) - BUG_ON(!btree_node_intent_locked(i->iter, i->iter->level)); + BUG_ON(!btree_node_intent_locked(i->iter, i->level)); - ret = bch2_journal_preres_get(&trans->c->journal, + ret = bch2_journal_preres_get(&c->journal, &trans->journal_preres, trans->journal_preres_u64s, JOURNAL_RES_GET_NONBLOCK| - ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) - ? JOURNAL_RES_GET_RECLAIM : 0)); + ((trans->flags & BTREE_INSERT_JOURNAL_RESERVED) + ? JOURNAL_RES_GET_RESERVED : 0)); if (unlikely(ret == -EAGAIN)) ret = bch2_trans_journal_preres_get_cold(trans, trans->journal_preres_u64s); @@ -504,6 +570,10 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, /* * Can't be holding any read locks when we go to take write locks: + * another thread could be holding an intent lock on the same node we + * have a read lock on, and it'll block trying to take a write lock + * (because we hold a read lock) and it could be blocking us by holding + * its own read lock (while we're trying to to take write locks). 
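maybe_do_btree_merge() above only attempts the (potentially lock-dropping) foreground merge when the pending updates cannot grow the leaf: it sums the encoded size of every key being inserted and subtracts the size of whatever each one overwrites, proceeding only if the net change is zero or negative. A small sketch of that accounting, with a simplified update record where sizes are in u64s and zero means "no key".

#include <stdbool.h>

struct demo_update {
	unsigned new_u64s;	/* encoded size of the key being inserted, 0 if deleting */
	unsigned old_u64s;	/* encoded size of the key it overwrites, 0 if none */
};

/*
 * True when the pending updates cannot grow the leaf, i.e. it is worth
 * checking whether the node can now be merged with a sibling.
 */
static bool worth_trying_merge(const struct demo_update *updates, unsigned nr)
{
	int u64s_delta = 0;
	unsigned i;

	for (i = 0; i < nr; i++) {
		u64s_delta += (int) updates[i].new_u64s;
		u64s_delta -= (int) updates[i].old_u64s;
	}

	return u64s_delta <= 0;
}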
* * note - this must be done after bch2_trans_journal_preres_get_cold() * or anything else that might call bch2_trans_relock(), since that @@ -511,20 +581,25 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, */ trans_for_each_iter(trans, iter) { if (iter->nodes_locked != iter->nodes_intent_locked) { - EBUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); - EBUG_ON(trans->iters_live & (1ULL << iter->idx)); - bch2_btree_iter_unlock_noinline(iter); + if (btree_iter_keep(trans, iter)) { + if (!bch2_btree_iter_upgrade(iter, 1)) { + trace_trans_restart_upgrade(trans->ip); + return -EINTR; + } + } else { + bch2_btree_iter_unlock_noinline(iter); + } } } if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) trans_for_each_update2(trans, i) - btree_insert_entry_checks(trans, i->iter, i->k); + btree_insert_entry_checks(trans, i); bch2_btree_trans_verify_locks(trans); trans_for_each_update2(trans, i) if (!same_leaf_as_prev(trans, i)) - bch2_btree_node_lock_for_insert(trans->c, + bch2_btree_node_lock_for_insert(c, iter_l(i->iter)->b, i->iter); ret = bch2_trans_commit_write_locked(trans, stopped_at); @@ -535,32 +610,43 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, i->iter); if (!ret && trans->journal_pin) - bch2_journal_pin_add(&trans->c->journal, trans->journal_res.seq, + bch2_journal_pin_add(&c->journal, trans->journal_res.seq, trans->journal_pin, NULL); /* * Drop journal reservation after dropping write locks, since dropping * the journal reservation may kick off a journal write: */ - bch2_journal_res_put(&trans->c->journal, &trans->journal_res); + bch2_journal_res_put(&c->journal, &trans->journal_res); if (unlikely(ret)) return ret; - if (trans->flags & BTREE_INSERT_NOUNLOCK) - trans->nounlock = true; + bch2_trans_downgrade(trans); - trans_for_each_update2(trans, i) - if (btree_iter_type(i->iter) != BTREE_ITER_CACHED && - !same_leaf_as_prev(trans, i)) - bch2_foreground_maybe_merge(trans->c, i->iter, - 0, trans->flags); + return 0; +} - trans->nounlock = false; +static int journal_reclaim_wait_done(struct bch_fs *c) +{ + int ret; - bch2_trans_downgrade(trans); + ret = bch2_journal_error(&c->journal); + if (ret) + return ret; - return 0; + ret = !bch2_btree_key_cache_must_wait(c); + if (ret) + return ret; + + if (mutex_trylock(&c->journal.reclaim_lock)) { + ret = bch2_journal_reclaim(&c->journal); + mutex_unlock(&c->journal.reclaim_lock); + } + + if (!ret) + ret = !bch2_btree_key_cache_must_wait(c); + return ret; } static noinline @@ -617,11 +703,9 @@ int bch2_trans_commit_error(struct btree_trans *trans, case BTREE_INSERT_NEED_MARK_REPLICAS: bch2_trans_unlock(trans); - trans_for_each_update(trans, i) { - ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(i->k)); - if (ret) - return ret; - } + ret = bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas); + if (ret) + return ret; if (bch2_trans_relock(trans)) return 0; @@ -632,6 +716,10 @@ int bch2_trans_commit_error(struct btree_trans *trans, case BTREE_INSERT_NEED_JOURNAL_RES: bch2_trans_unlock(trans); + if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) && + !(trans->flags & BTREE_INSERT_JOURNAL_RESERVED)) + return -EAGAIN; + ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_CHECK); if (ret) return ret; @@ -642,20 +730,21 @@ int bch2_trans_commit_error(struct btree_trans *trans, trace_trans_restart_journal_res_get(trans->ip); ret = -EINTR; break; - default: - BUG_ON(ret >= 0); - break; - } + case BTREE_INSERT_NEED_JOURNAL_RECLAIM: + bch2_trans_unlock(trans); - if (ret == -EINTR) { - int ret2 = 
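journal_reclaim_wait_done() above is a wait predicate that does more than test a condition: it bails out on a journal error, succeeds once the key cache no longer needs to wait, and otherwise tries to take the reclaim lock and run a round of journal reclaim itself before re-checking, so the committing thread helps make progress instead of only sleeping. A simplified userspace sketch of that pattern, with all names (demo_journal, must_wait, do_reclaim) being stand-ins rather than the bcachefs API:

#include <pthread.h>
#include <stdbool.h>

struct demo_journal {
	pthread_mutex_t reclaim_lock;
	int		fatal_error;	/* e.g. journal shutdown, negative errno */
	int		dirty_keys;
	int		dirty_limit;
};

static bool must_wait(struct demo_journal *j)
{
	return j->dirty_keys >= j->dirty_limit;
}

/* Stand-in for bch2_journal_reclaim(): flush some dirty state. */
static void do_reclaim(struct demo_journal *j)
{
	if (j->dirty_keys)
		j->dirty_keys--;
}

/*
 * Returns nonzero when the caller may stop waiting: negative on a fatal
 * error, positive once the condition has become true.
 */
static int reclaim_wait_done(struct demo_journal *j)
{
	if (j->fatal_error)
		return j->fatal_error;

	if (!must_wait(j))
		return 1;

	/* Help out: if nobody else is reclaiming, do a round ourselves. */
	if (!pthread_mutex_trylock(&j->reclaim_lock)) {
		do_reclaim(j);
		pthread_mutex_unlock(&j->reclaim_lock);
	}

	return !must_wait(j);
}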
bch2_btree_iter_traverse_all(trans); + wait_event(c->journal.reclaim_wait, + (ret = journal_reclaim_wait_done(c))); - if (ret2) { - trace_trans_restart_traverse(trans->ip); - return ret2; - } + if (!ret && bch2_trans_relock(trans)) + return 0; - trace_trans_restart_atomic(trans->ip); + trace_trans_restart_journal_reclaim(trans->ip); + ret = -EINTR; + break; + default: + BUG_ON(ret >= 0); + break; } return ret; @@ -680,137 +769,134 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) return 0; } -static void bch2_trans_update2(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *insert) +static void __bch2_trans_update2(struct btree_trans *trans, + struct btree_insert_entry n) { - struct btree_insert_entry *i, n = (struct btree_insert_entry) { - .iter = iter, .k = insert - }; - - btree_insert_entry_checks(trans, n.iter, n.k); - - BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); + struct btree_insert_entry *i; - EBUG_ON(trans->nr_updates2 >= trans->nr_iters); + btree_insert_entry_checks(trans, &n); - iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + EBUG_ON(trans->nr_updates2 >= BTREE_ITER_MAX); - trans_for_each_update2(trans, i) { - if (btree_iter_cmp(n.iter, i->iter) == 0) { - *i = n; - return; - } + n.iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; - if (btree_iter_cmp(n.iter, i->iter) <= 0) + trans_for_each_update2(trans, i) + if (btree_insert_entry_cmp(&n, i) <= 0) break; - } - array_insert_item(trans->updates2, trans->nr_updates2, - i - trans->updates2, n); + if (i < trans->updates2 + trans->nr_updates2 && + !btree_insert_entry_cmp(&n, i)) + *i = n; + else + array_insert_item(trans->updates2, trans->nr_updates2, + i - trans->updates2, n); +} + +static void bch2_trans_update2(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *insert) +{ + __bch2_trans_update2(trans, (struct btree_insert_entry) { + .bkey_type = __btree_node_type(iter->level, iter->btree_id), + .btree_id = iter->btree_id, + .level = iter->level, + .iter = iter, + .k = insert, + }); } static int extent_update_to_keys(struct btree_trans *trans, - struct btree_iter *orig_iter, - struct bkey_i *insert) + struct btree_insert_entry n) { - struct btree_iter *iter; int ret; - ret = bch2_extent_can_insert(trans, orig_iter, insert); + if (bkey_deleted(&n.k->k)) + return 0; + + ret = bch2_extent_can_insert(trans, n.iter, n.k); if (ret) return ret; - if (bkey_deleted(&insert->k)) - return 0; - - iter = bch2_trans_copy_iter(trans, orig_iter); - if (IS_ERR(iter)) - return PTR_ERR(iter); + n.iter = bch2_trans_get_iter(trans, n.iter->btree_id, n.k->k.p, + BTREE_ITER_INTENT| + BTREE_ITER_NOT_EXTENTS); + n.is_extent = false; - iter->flags |= BTREE_ITER_INTENT; - __bch2_btree_iter_set_pos(iter, insert->k.p, false); - bch2_trans_update2(trans, iter, insert); - bch2_trans_iter_put(trans, iter); + __bch2_trans_update2(trans, n); + bch2_trans_iter_put(trans, n.iter); return 0; } static int extent_handle_overwrites(struct btree_trans *trans, enum btree_id btree_id, - struct bpos start, struct bpos end) + struct bkey_i *insert) { - struct btree_iter *iter = NULL, *update_iter; + struct btree_iter *iter, *update_iter; + struct bpos start = bkey_start_pos(&insert->k); struct bkey_i *update; struct bkey_s_c k; int ret = 0; - iter = bch2_trans_get_iter(trans, btree_id, start, BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(iter); - if (ret) - return ret; - + iter = bch2_trans_get_iter(trans, btree_id, start, + BTREE_ITER_INTENT); k = bch2_btree_iter_peek_with_updates(iter); while (k.k && !(ret = bkey_err(k))) { - if 
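__bch2_trans_update2() above keeps the updates2 array sorted by btree_insert_entry_cmp(): it scans for the first existing entry that does not compare before the new one, overwrites it in place when the positions collide, and otherwise shifts the tail with array_insert_item(). The same insert-or-replace pattern in standalone form, with a plain integer key instead of (btree_id, level, bpos); the names are hypothetical.

#include <assert.h>
#include <string.h>
#include <stdio.h>

#define MAX_UPDATES 8

struct demo_update {
	int key;
	int val;
};

struct demo_list {
	struct demo_update entries[MAX_UPDATES];
	unsigned	   nr;
};

static void demo_list_update(struct demo_list *l, struct demo_update n)
{
	unsigned i;

	/* find the first entry that does not sort before the new one: */
	for (i = 0; i < l->nr; i++)
		if (l->entries[i].key >= n.key)
			break;

	if (i < l->nr && l->entries[i].key == n.key) {
		l->entries[i] = n;		/* same position: replace */
	} else {
		assert(l->nr < MAX_UPDATES);
		/* like array_insert_item(): shift the tail, then insert */
		memmove(&l->entries[i + 1], &l->entries[i],
			(l->nr - i) * sizeof(l->entries[0]));
		l->entries[i] = n;
		l->nr++;
	}
}

int main(void)
{
	struct demo_list l = { .nr = 0 };
	unsigned i;

	demo_list_update(&l, (struct demo_update) { .key = 5, .val = 1 });
	demo_list_update(&l, (struct demo_update) { .key = 1, .val = 2 });
	demo_list_update(&l, (struct demo_update) { .key = 5, .val = 3 });	/* replaces */

	for (i = 0; i < l.nr; i++)
		printf("key %d val %d\n", l.entries[i].key, l.entries[i].val);
	return 0;
}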
(bkey_cmp(end, bkey_start_pos(k.k)) <= 0) + if (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0) break; if (bkey_cmp(bkey_start_pos(k.k), start) < 0) { - update_iter = bch2_trans_copy_iter(trans, iter); - if ((ret = PTR_ERR_OR_ZERO(update_iter))) - goto err; - update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); if ((ret = PTR_ERR_OR_ZERO(update))) - goto err; + break; bkey_reassemble(update, k); + bch2_cut_back(start, update); - __bch2_btree_iter_set_pos(update_iter, update->k.p, false); + update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); bch2_trans_update2(trans, update_iter, update); bch2_trans_iter_put(trans, update_iter); } - if (bkey_cmp(k.k->p, end) > 0) { - update_iter = bch2_trans_copy_iter(trans, iter); - if ((ret = PTR_ERR_OR_ZERO(update_iter))) - goto err; - - update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if (bkey_cmp(k.k->p, insert->k.p) < 0 || + (!bkey_cmp(k.k->p, insert->k.p) && bkey_deleted(&insert->k))) { + update = bch2_trans_kmalloc(trans, sizeof(struct bkey)); if ((ret = PTR_ERR_OR_ZERO(update))) - goto err; + break; - bkey_reassemble(update, k); - bch2_cut_front(end, update); + bkey_init(&update->k); + update->k.p = k.k->p; - __bch2_btree_iter_set_pos(update_iter, update->k.p, false); + update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); bch2_trans_update2(trans, update_iter, update); bch2_trans_iter_put(trans, update_iter); - } else { - update_iter = bch2_trans_copy_iter(trans, iter); - if ((ret = PTR_ERR_OR_ZERO(update_iter))) - goto err; + } - update = bch2_trans_kmalloc(trans, sizeof(struct bkey)); + if (bkey_cmp(k.k->p, insert->k.p) > 0) { + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); if ((ret = PTR_ERR_OR_ZERO(update))) - goto err; + break; - update->k = *k.k; - set_bkey_val_u64s(&update->k, 0); - update->k.type = KEY_TYPE_deleted; - update->k.size = 0; + bkey_reassemble(update, k); + bch2_cut_front(insert->k.p, update); - __bch2_btree_iter_set_pos(update_iter, update->k.p, false); + update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); bch2_trans_update2(trans, update_iter, update); bch2_trans_iter_put(trans, update_iter); + break; } k = bch2_btree_iter_next_with_updates(iter); } -err: - if (!IS_ERR_OR_NULL(iter)) - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_put(trans, iter); + return ret; } @@ -819,13 +905,11 @@ int __bch2_trans_commit(struct btree_trans *trans) struct btree_insert_entry *i = NULL; struct btree_iter *iter; bool trans_trigger_run; - unsigned u64s; + unsigned u64s, reset_flags = 0; int ret = 0; - BUG_ON(trans->need_reset); - if (!trans->nr_updates) - goto out_noupdates; + goto out_reset; if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) lockdep_assert_held(&trans->c->gc_lock); @@ -839,7 +923,7 @@ int __bch2_trans_commit(struct btree_trans *trans) unlikely(!percpu_ref_tryget(&trans->c->writes))) { ret = bch2_trans_commit_get_rw_cold(trans); if (ret) - return ret; + goto out_reset; } #ifdef CONFIG_BCACHEFS_DEBUG @@ -847,7 +931,7 @@ int __bch2_trans_commit(struct btree_trans *trans) if (btree_iter_type(i->iter) != BTREE_ITER_CACHED && !(i->trigger_flags & BTREE_TRIGGER_NORUN)) bch2_btree_key_cache_verify_clean(trans, - i->iter->btree_id, i->iter->pos); + i->btree_id, i->k->k.p); #endif /* @@ -858,24 +942,7 @@ int __bch2_trans_commit(struct btree_trans *trans) trans_trigger_run = false; trans_for_each_update(trans, i) { - if 
(unlikely(i->iter->uptodate > BTREE_ITER_NEED_PEEK && - (ret = bch2_btree_iter_traverse(i->iter)))) { - trace_trans_restart_traverse(trans->ip); - goto out; - } - - /* - * We're not using bch2_btree_iter_upgrade here because - * we know trans->nounlock can't be set: - */ - if (unlikely(i->iter->locks_want < 1 && - !__bch2_btree_iter_upgrade(i->iter, 1))) { - trace_trans_restart_upgrade(trans->ip); - ret = -EINTR; - goto out; - } - - if (iter_has_trans_triggers(i->iter) && + if ((BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && !i->trans_triggers_run) { i->trans_triggers_run = true; trans_trigger_run = true; @@ -893,33 +960,34 @@ int __bch2_trans_commit(struct btree_trans *trans) /* Turn extents updates into keys: */ trans_for_each_update(trans, i) - if (i->iter->flags & BTREE_ITER_IS_EXTENTS) { - struct bpos start = bkey_start_pos(&i->k->k); - - while (i + 1 < trans->updates + trans->nr_updates && - i[0].iter->btree_id == i[1].iter->btree_id && - !bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k))) - i++; - - ret = extent_handle_overwrites(trans, i->iter->btree_id, - start, i->k->k.p); - if (ret) + if (i->is_extent) { + ret = extent_handle_overwrites(trans, i->btree_id, i->k); + if (unlikely(ret)) goto out; } trans_for_each_update(trans, i) { - if (i->iter->flags & BTREE_ITER_IS_EXTENTS) { - ret = extent_update_to_keys(trans, i->iter, i->k); - if (ret) - goto out; - } else { - bch2_trans_update2(trans, i->iter, i->k); - } + ret = i->is_extent + ? extent_update_to_keys(trans, *i) + : (__bch2_trans_update2(trans, *i), 0); + if (unlikely(ret)) + goto out; } trans_for_each_update2(trans, i) { - BUG_ON(i->iter->uptodate > BTREE_ITER_NEED_PEEK); - BUG_ON(i->iter->locks_want < 1); + ret = bch2_btree_iter_traverse(i->iter); + if (unlikely(ret)) { + trace_trans_restart_traverse(trans->ip); + goto out; + } + + if (unlikely(!bch2_btree_iter_upgrade(i->iter, i->level + 1))) { + trace_trans_restart_upgrade(trans->ip); + ret = -EINTR; + goto out; + } + + BUG_ON(!btree_node_intent_locked(i->iter, i->level)); u64s = jset_u64s(i->k->k.u64s); if (btree_iter_type(i->iter) == BTREE_ITER_CACHED && @@ -939,20 +1007,20 @@ retry: goto err; trans_for_each_iter(trans, iter) - if ((trans->iters_live & (1ULL << iter->idx)) && - (iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT)) { - if (trans->flags & BTREE_INSERT_NOUNLOCK) - bch2_btree_iter_set_pos_same_leaf(iter, iter->pos_after_commit); - else - bch2_btree_iter_set_pos(iter, iter->pos_after_commit); - } + if (btree_iter_live(trans, iter) && + (iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT)) + bch2_btree_iter_set_pos(iter, iter->pos_after_commit); out: bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) percpu_ref_put(&trans->c->writes); -out_noupdates: - bch2_trans_reset(trans, !ret ? 
TRANS_RESET_NOTRAVERSE : 0); +out_reset: + if (!ret) + reset_flags |= TRANS_RESET_NOTRAVERSE; + if (!ret && (trans->flags & BTREE_INSERT_NOUNLOCK)) + reset_flags |= TRANS_RESET_NOUNLOCK; + bch2_trans_reset(trans, reset_flags); return ret; err: @@ -967,75 +1035,111 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *k, enum btree_trigger_flags flags) { struct btree_insert_entry *i, n = (struct btree_insert_entry) { - .trigger_flags = flags, .iter = iter, .k = k + .trigger_flags = flags, + .bkey_type = __btree_node_type(iter->level, iter->btree_id), + .btree_id = iter->btree_id, + .level = iter->level, + .is_extent = (iter->flags & BTREE_ITER_IS_EXTENTS) != 0, + .iter = iter, + .k = k }; - EBUG_ON(bkey_cmp(iter->pos, - (iter->flags & BTREE_ITER_IS_EXTENTS) - ? bkey_start_pos(&k->k) - : k->k.p)); + BUG_ON(trans->nr_updates >= BTREE_ITER_MAX); + +#ifdef CONFIG_BCACHEFS_DEBUG + BUG_ON(bkey_cmp(iter->pos, + n.is_extent ? bkey_start_pos(&k->k) : k->k.p)); + + trans_for_each_update(trans, i) { + BUG_ON(bkey_cmp(i->iter->pos, + i->is_extent ? bkey_start_pos(&i->k->k) : i->k->k.p)); + + BUG_ON(i != trans->updates && + btree_insert_entry_cmp(i - 1, i) >= 0); + } +#endif iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; - if (btree_node_type_is_extents(iter->btree_id)) { + if (n.is_extent) { iter->pos_after_commit = k->k.p; iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT; } /* - * Pending updates are kept sorted: first, find position of new update: + * Pending updates are kept sorted: first, find position of new update, + * then delete/trim any updates the new update overwrites: */ - trans_for_each_update(trans, i) - if (btree_iter_cmp(iter, i->iter) <= 0) - break; + if (!n.is_extent) { + trans_for_each_update(trans, i) + if (btree_insert_entry_cmp(&n, i) <= 0) + break; - /* - * Now delete/trim any updates the new update overwrites: - */ - if (i > trans->updates && - i[-1].iter->btree_id == iter->btree_id && - bkey_cmp(iter->pos, i[-1].k->k.p) < 0) - bch2_cut_back(n.iter->pos, i[-1].k); - - while (i < trans->updates + trans->nr_updates && - iter->btree_id == i->iter->btree_id && - bkey_cmp(n.k->k.p, i->k->k.p) >= 0) - array_remove_item(trans->updates, trans->nr_updates, - i - trans->updates); - - if (i < trans->updates + trans->nr_updates && - iter->btree_id == i->iter->btree_id && - bkey_cmp(n.k->k.p, i->iter->pos) > 0) { - /* - * When we have an extent that overwrites the start of another - * update, trimming that extent will mean the iterator's - * position has to change since the iterator position has to - * match the extent's start pos - but we don't want to change - * the iterator pos if some other code is using it, so we may - * need to clone it: - */ - if (trans->iters_live & (1ULL << i->iter->idx)) { - i->iter = bch2_trans_copy_iter(trans, i->iter); - if (IS_ERR(i->iter)) { - trans->need_reset = true; - return PTR_ERR(i->iter); + if (i < trans->updates + trans->nr_updates && + !btree_insert_entry_cmp(&n, i)) + *i = n; + else + array_insert_item(trans->updates, trans->nr_updates, + i - trans->updates, n); + } else { + trans_for_each_update(trans, i) + if (btree_insert_entry_cmp(&n, i) < 0) + break; + + while (i > trans->updates && + i[-1].btree_id == n.btree_id && + bkey_cmp(bkey_start_pos(&n.k->k), + bkey_start_pos(&i[-1].k->k)) <= 0) { + --i; + array_remove_item(trans->updates, trans->nr_updates, + i - trans->updates); + } + + if (i > trans->updates && + i[-1].btree_id == n.btree_id && + bkey_cmp(bkey_start_pos(&n.k->k), i[-1].k->k.p) < 0) + 
bch2_cut_back(bkey_start_pos(&n.k->k), i[-1].k); + + if (i < trans->updates + trans->nr_updates && + i->btree_id == n.btree_id && + bkey_cmp(n.k->k.p, bkey_start_pos(&i->k->k)) > 0) { + /* We don't handle splitting extents here: */ + BUG_ON(bkey_cmp(bkey_start_pos(&n.k->k), + bkey_start_pos(&i->k->k)) > 0); + + /* + * When we have an extent that overwrites the start of another + * update, trimming that extent will mean the iterator's + * position has to change since the iterator position has to + * match the extent's start pos - but we don't want to change + * the iterator pos if some other code is using it, so we may + * need to clone it: + */ + if (btree_iter_live(trans, i->iter)) { + i->iter = bch2_trans_copy_iter(trans, i->iter); + + i->iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + bch2_trans_iter_put(trans, i->iter); } - i->iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; - bch2_trans_iter_put(trans, i->iter); + bch2_cut_front(n.k->k.p, i->k); + bch2_btree_iter_set_pos(i->iter, n.k->k.p); } - bch2_cut_front(n.k->k.p, i->k); - bch2_btree_iter_set_pos(i->iter, n.k->k.p); + array_insert_item(trans->updates, trans->nr_updates, + i - trans->updates, n); } - EBUG_ON(trans->nr_updates >= trans->nr_iters); - - array_insert_item(trans->updates, trans->nr_updates, - i - trans->updates, n); return 0; } +void bch2_trans_commit_hook(struct btree_trans *trans, + struct btree_trans_commit_hook *h) +{ + h->next = trans->hooks; + trans->hooks = h; +} + int __bch2_btree_insert(struct btree_trans *trans, enum btree_id id, struct bkey_i *k) { @@ -1044,8 +1148,6 @@ int __bch2_btree_insert(struct btree_trans *trans, iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k), BTREE_ITER_INTENT); - if (IS_ERR(iter)) - return PTR_ERR(iter); ret = bch2_btree_iter_traverse(iter) ?: bch2_trans_update(trans, iter, k, 0); @@ -1069,13 +1171,28 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, __bch2_btree_insert(&trans, id, k)); } -int bch2_btree_delete_at_range(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos end, - u64 *journal_seq) +int bch2_btree_delete_at(struct btree_trans *trans, + struct btree_iter *iter, unsigned flags) +{ + struct bkey_i k; + + bkey_init(&k.k); + k.k.p = iter->pos; + + bch2_trans_update(trans, iter, &k, 0); + return bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL|flags); +} + +int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, + struct bpos start, struct bpos end, + u64 *journal_seq) { + struct btree_iter *iter; struct bkey_s_c k; int ret = 0; + + iter = bch2_trans_get_iter(trans, id, start, BTREE_ITER_INTENT); retry: while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k)) && @@ -1086,6 +1203,10 @@ retry: bkey_init(&delete.k); + /* + * This could probably be more efficient for extents: + */ + /* * For extents, iter.pos won't necessarily be the same as * bkey_start_pos(k.k) (for non extents they always will be the @@ -1125,22 +1246,8 @@ retry: goto retry; } + bch2_trans_iter_free(trans, iter); return ret; - -} - -int bch2_btree_delete_at(struct btree_trans *trans, - struct btree_iter *iter, unsigned flags) -{ - struct bkey_i k; - - bkey_init(&k.k); - k.k.p = iter->pos; - - bch2_trans_update(trans, iter, &k, 0); - return bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE|flags); } /* @@ -1152,21 +1259,6 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, struct bpos start, struct bpos end, u64 *journal_seq) { - struct btree_trans trans; - struct btree_iter *iter; - 
int ret = 0; - - /* - * XXX: whether we need mem/more iters depends on whether this btree id - * has triggers - */ - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512); - - iter = bch2_trans_get_iter(&trans, id, start, BTREE_ITER_INTENT); - - ret = bch2_btree_delete_at_range(&trans, iter, end, journal_seq); - ret = bch2_trans_exit(&trans) ?: ret; - - BUG_ON(ret == -EINTR); - return ret; + return bch2_trans_do(c, NULL, journal_seq, 0, + bch2_btree_delete_range_trans(&trans, id, start, end, journal_seq)); } diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 82f1cc4..31f7617 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -137,13 +137,14 @@ void bch2_bucket_seq_cleanup(struct bch_fs *c) void bch2_fs_usage_initialize(struct bch_fs *c) { struct bch_fs_usage *usage; + struct bch_dev *ca; unsigned i; percpu_down_write(&c->mark_lock); usage = c->usage_base; - bch2_fs_usage_acc_to_base(c, 0); - bch2_fs_usage_acc_to_base(c, 1); + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + bch2_fs_usage_acc_to_base(c, i); for (i = 0; i < BCH_REPLICAS_MAX; i++) usage->reserved += usage->persistent_reserved[i]; @@ -155,48 +156,38 @@ void bch2_fs_usage_initialize(struct bch_fs *c) fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]); } - percpu_up_write(&c->mark_lock); -} + for_each_member_device(ca, c, i) { + struct bch_dev_usage dev = bch2_dev_usage_read(ca); -void bch2_fs_usage_scratch_put(struct bch_fs *c, struct bch_fs_usage *fs_usage) -{ - if (fs_usage == c->usage_scratch) - mutex_unlock(&c->usage_scratch_lock); - else - kfree(fs_usage); + usage->hidden += (dev.d[BCH_DATA_sb].buckets + + dev.d[BCH_DATA_journal].buckets) * + ca->mi.bucket_size; + } + + percpu_up_write(&c->mark_lock); } -struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *c) +static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca, + unsigned journal_seq, + bool gc) { - struct bch_fs_usage *ret; - unsigned bytes = fs_usage_u64s(c) * sizeof(u64); - - ret = kzalloc(bytes, GFP_NOWAIT|__GFP_NOWARN); - if (ret) - return ret; - - if (mutex_trylock(&c->usage_scratch_lock)) - goto out_pool; - - ret = kzalloc(bytes, GFP_NOFS); - if (ret) - return ret; - - mutex_lock(&c->usage_scratch_lock); -out_pool: - ret = c->usage_scratch; - memset(ret, 0, bytes); - return ret; + return this_cpu_ptr(gc + ? ca->usage_gc + : ca->usage[journal_seq & JOURNAL_BUF_MASK]); } struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) { + struct bch_fs *c = ca->fs; struct bch_dev_usage ret; + unsigned seq, i, u64s = dev_usage_u64s(); - memset(&ret, 0, sizeof(ret)); - acc_u64s_percpu((u64 *) &ret, - (u64 __percpu *) ca->usage[0], - sizeof(ret) / sizeof(u64)); + do { + seq = read_seqcount_begin(&c->usage_lock); + memcpy(&ret, ca->usage_base, u64s * sizeof(u64)); + for (i = 0; i < ARRAY_SIZE(ca->usage); i++) + acc_u64s_percpu((u64 *) &ret, (u64 __percpu *) ca->usage[i], u64s); + } while (read_seqcount_retry(&c->usage_lock, seq)); return ret; } @@ -207,13 +198,13 @@ static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, { return this_cpu_ptr(gc ? 
c->usage_gc - : c->usage[journal_seq & 1]); + : c->usage[journal_seq & JOURNAL_BUF_MASK]); } u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) { ssize_t offset = v - (u64 *) c->usage_base; - unsigned seq; + unsigned i, seq; u64 ret; BUG_ON(offset < 0 || offset >= fs_usage_u64s(c)); @@ -221,38 +212,37 @@ u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) do { seq = read_seqcount_begin(&c->usage_lock); - ret = *v + - percpu_u64_get((u64 __percpu *) c->usage[0] + offset) + - percpu_u64_get((u64 __percpu *) c->usage[1] + offset); + ret = *v; + + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + ret += percpu_u64_get((u64 __percpu *) c->usage[i] + offset); } while (read_seqcount_retry(&c->usage_lock, seq)); return ret; } -struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c) +struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c) { - struct bch_fs_usage *ret; - unsigned seq, v, u64s = fs_usage_u64s(c); -retry: - ret = kmalloc(u64s * sizeof(u64), GFP_NOFS); - if (unlikely(!ret)) - return NULL; + struct bch_fs_usage_online *ret; + unsigned seq, i, u64s; percpu_down_read(&c->mark_lock); - v = fs_usage_u64s(c); - if (unlikely(u64s != v)) { - u64s = v; + ret = kmalloc(sizeof(struct bch_fs_usage_online) + + sizeof(u64) + c->replicas.nr, GFP_NOFS); + if (unlikely(!ret)) { percpu_up_read(&c->mark_lock); - kfree(ret); - goto retry; + return NULL; } + ret->online_reserved = percpu_u64_get(c->online_reserved); + + u64s = fs_usage_u64s(c); do { seq = read_seqcount_begin(&c->usage_lock); - memcpy(ret, c->usage_base, u64s * sizeof(u64)); - acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[0], u64s); - acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[1], u64s); + memcpy(&ret->u, c->usage_base, u64s * sizeof(u64)); + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i], u64s); } while (read_seqcount_retry(&c->usage_lock, seq)); return ret; @@ -260,9 +250,10 @@ retry: void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) { - unsigned u64s = fs_usage_u64s(c); + struct bch_dev *ca; + unsigned i, u64s = fs_usage_u64s(c); - BUG_ON(idx >= 2); + BUG_ON(idx >= ARRAY_SIZE(c->usage)); preempt_disable(); write_seqcount_begin(&c->usage_lock); @@ -271,37 +262,47 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) (u64 __percpu *) c->usage[idx], u64s); percpu_memset(c->usage[idx], 0, u64s * sizeof(u64)); + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, NULL) { + u64s = dev_usage_u64s(); + + acc_u64s_percpu((u64 *) ca->usage_base, + (u64 __percpu *) ca->usage[idx], u64s); + percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64)); + } + rcu_read_unlock(); + write_seqcount_end(&c->usage_lock); preempt_enable(); } void bch2_fs_usage_to_text(struct printbuf *out, struct bch_fs *c, - struct bch_fs_usage *fs_usage) + struct bch_fs_usage_online *fs_usage) { unsigned i; pr_buf(out, "capacity:\t\t\t%llu\n", c->capacity); pr_buf(out, "hidden:\t\t\t\t%llu\n", - fs_usage->hidden); + fs_usage->u.hidden); pr_buf(out, "data:\t\t\t\t%llu\n", - fs_usage->data); + fs_usage->u.data); pr_buf(out, "cached:\t\t\t\t%llu\n", - fs_usage->cached); + fs_usage->u.cached); pr_buf(out, "reserved:\t\t\t%llu\n", - fs_usage->reserved); + fs_usage->u.reserved); pr_buf(out, "nr_inodes:\t\t\t%llu\n", - fs_usage->nr_inodes); + fs_usage->u.nr_inodes); pr_buf(out, "online reserved:\t\t%llu\n", fs_usage->online_reserved); for (i = 0; - i < ARRAY_SIZE(fs_usage->persistent_reserved); + i < ARRAY_SIZE(fs_usage->u.persistent_reserved); i++) { pr_buf(out, "%u 
replicas:\n", i + 1); pr_buf(out, "\treserved:\t\t%llu\n", - fs_usage->persistent_reserved[i]); + fs_usage->u.persistent_reserved[i]); } for (i = 0; i < c->replicas.nr; i++) { @@ -310,7 +311,7 @@ void bch2_fs_usage_to_text(struct printbuf *out, pr_buf(out, "\t"); bch2_replicas_entry_to_text(out, e); - pr_buf(out, ":\t%llu\n", fs_usage->replicas[i]); + pr_buf(out, ":\t%llu\n", fs_usage->u.replicas[i]); } } @@ -323,15 +324,15 @@ static u64 reserve_factor(u64 r) static u64 avail_factor(u64 r) { - return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1); + return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1); } -u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage *fs_usage) +u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage) { - return min(fs_usage->hidden + - fs_usage->btree + - fs_usage->data + - reserve_factor(fs_usage->reserved + + return min(fs_usage->u.hidden + + fs_usage->u.btree + + fs_usage->u.data + + reserve_factor(fs_usage->u.reserved + fs_usage->online_reserved), c->capacity); } @@ -348,7 +349,7 @@ __bch2_fs_usage_read_short(struct bch_fs *c) data = bch2_fs_usage_read_one(c, &c->usage_base->data) + bch2_fs_usage_read_one(c, &c->usage_base->btree); reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) + - bch2_fs_usage_read_one(c, &c->usage_base->online_reserved); + percpu_u64_get(c->online_reserved); ret.used = min(ret.capacity, data + reserve_factor(reserved)); ret.free = ret.capacity - ret.used; @@ -375,15 +376,12 @@ static inline int is_unavailable_bucket(struct bucket_mark m) return !is_available_bucket(m); } -static inline int is_fragmented_bucket(struct bucket_mark m, - struct bch_dev *ca) +static inline int bucket_sectors_fragmented(struct bch_dev *ca, + struct bucket_mark m) { - if (!m.owned_by_allocator && - m.data_type == BCH_DATA_user && - bucket_sectors_used(m)) - return max_t(int, 0, (int) ca->mi.bucket_size - - bucket_sectors_used(m)); - return 0; + return bucket_sectors_used(m) + ? max(0, (int) ca->mi.bucket_size - (int) bucket_sectors_used(m)) + : 0; } static inline int is_stripe_data_bucket(struct bucket_mark m) @@ -391,11 +389,6 @@ static inline int is_stripe_data_bucket(struct bucket_mark m) return m.stripe && m.data_type != BCH_DATA_parity; } -static inline int bucket_stripe_sectors(struct bucket_mark m) -{ - return is_stripe_data_bucket(m) ? m.dirty_sectors : 0; -} - static inline enum bch_data_type bucket_type(struct bucket_mark m) { return m.cached_sectors && !m.dirty_sectors @@ -410,43 +403,6 @@ static bool bucket_became_unavailable(struct bucket_mark old, !is_available_bucket(new); } -int bch2_fs_usage_apply(struct bch_fs *c, - struct bch_fs_usage *fs_usage, - struct disk_reservation *disk_res, - unsigned journal_seq) -{ - s64 added = fs_usage->data + fs_usage->reserved; - s64 should_not_have_added; - int ret = 0; - - percpu_rwsem_assert_held(&c->mark_lock); - - /* - * Not allowed to reduce sectors_available except by getting a - * reservation: - */ - should_not_have_added = added - (s64) (disk_res ? disk_res->sectors : 0); - if (WARN_ONCE(should_not_have_added > 0, - "disk usage increased by %lli more than reservation of %llu", - added, disk_res ? 
disk_res->sectors : 0)) { - atomic64_sub(should_not_have_added, &c->sectors_available); - added -= should_not_have_added; - ret = -1; - } - - if (added > 0) { - disk_res->sectors -= added; - fs_usage->online_reserved -= added; - } - - preempt_disable(); - acc_u64s((u64 *) fs_usage_ptr(c, journal_seq, false), - (u64 *) fs_usage, fs_usage_u64s(c)); - preempt_enable(); - - return ret; -} - static inline void account_bucket(struct bch_fs_usage *fs_usage, struct bch_dev_usage *dev_usage, enum bch_data_type type, @@ -455,20 +411,22 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage, if (type == BCH_DATA_sb || type == BCH_DATA_journal) fs_usage->hidden += size; - dev_usage->buckets[type] += nr; + dev_usage->d[type].buckets += nr; } static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, struct bch_fs_usage *fs_usage, struct bucket_mark old, struct bucket_mark new, - bool gc) + u64 journal_seq, bool gc) { struct bch_dev_usage *u; percpu_rwsem_assert_held(&c->mark_lock); preempt_disable(); - u = this_cpu_ptr(ca->usage[gc]); + if (!fs_usage) + fs_usage = fs_usage_ptr(c, journal_seq, gc); + u = dev_usage_ptr(ca, journal_seq, gc); if (bucket_type(old)) account_bucket(fs_usage, u, bucket_type(old), @@ -478,68 +436,35 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, account_bucket(fs_usage, u, bucket_type(new), 1, ca->mi.bucket_size); - u->buckets_alloc += - (int) new.owned_by_allocator - (int) old.owned_by_allocator; + u->buckets_ec += (int) new.stripe - (int) old.stripe; u->buckets_unavailable += is_unavailable_bucket(new) - is_unavailable_bucket(old); - u->buckets_ec += (int) new.stripe - (int) old.stripe; - u->sectors_ec += bucket_stripe_sectors(new) - - bucket_stripe_sectors(old); - - u->sectors[old.data_type] -= old.dirty_sectors; - u->sectors[new.data_type] += new.dirty_sectors; - u->sectors[BCH_DATA_cached] += + u->d[old.data_type].sectors -= old.dirty_sectors; + u->d[new.data_type].sectors += new.dirty_sectors; + u->d[BCH_DATA_cached].sectors += (int) new.cached_sectors - (int) old.cached_sectors; - u->sectors_fragmented += - is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca); + + u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old); + u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new); + preempt_enable(); if (!is_available_bucket(old) && is_available_bucket(new)) bch2_wake_allocator(ca); } -__flatten -void bch2_dev_usage_from_buckets(struct bch_fs *c) -{ - struct bch_dev *ca; - struct bucket_mark old = { .v.counter = 0 }; - struct bucket_array *buckets; - struct bucket *g; - unsigned i; - int cpu; - - c->usage_base->hidden = 0; - - for_each_member_device(ca, c, i) { - for_each_possible_cpu(cpu) - memset(per_cpu_ptr(ca->usage[0], cpu), 0, - sizeof(*ca->usage[0])); - - buckets = bucket_array(ca); - - for_each_bucket(g, buckets) - bch2_dev_usage_update(c, ca, c->usage_base, - old, g->mark, false); - } -} - -static inline int update_replicas(struct bch_fs *c, - struct bch_fs_usage *fs_usage, - struct bch_replicas_entry *r, - s64 sectors) +static inline void update_replicas(struct bch_fs *c, + struct bch_fs_usage *fs_usage, + struct bch_replicas_entry *r, + s64 sectors) { int idx = bch2_replicas_entry_idx(c, r); - if (idx < 0) - return -1; - - if (!fs_usage) - return 0; + BUG_ON(idx < 0); fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); fs_usage->replicas[idx] += sectors; - return 0; } static inline void update_cached_sectors(struct bch_fs *c, @@ -586,6 +511,7 @@ static inline void 
update_replicas_list(struct btree_trans *trans, n = (void *) d->d + d->used; n->delta = sectors; memcpy(&n->r, r, replicas_entry_bytes(r)); + bch2_replicas_entry_sort(&n->r); d->used += b; } @@ -599,43 +525,6 @@ static inline void update_cached_sectors_list(struct btree_trans *trans, update_replicas_list(trans, &r.e, sectors); } -static inline struct replicas_delta * -replicas_delta_next(struct replicas_delta *d) -{ - return (void *) d + replicas_entry_bytes(&d->r) + 8; -} - -int bch2_replicas_delta_list_apply(struct bch_fs *c, - struct bch_fs_usage *fs_usage, - struct replicas_delta_list *r) -{ - struct replicas_delta *d = r->d; - struct replicas_delta *top = (void *) r->d + r->used; - unsigned i; - - for (d = r->d; d != top; d = replicas_delta_next(d)) - if (update_replicas(c, fs_usage, &d->r, d->delta)) { - top = d; - goto unwind; - } - - if (!fs_usage) - return 0; - - fs_usage->nr_inodes += r->nr_inodes; - - for (i = 0; i < BCH_REPLICAS_MAX; i++) { - fs_usage->reserved += r->persistent_reserved[i]; - fs_usage->persistent_reserved[i] += r->persistent_reserved[i]; - } - - return 0; -unwind: - for (d = r->d; d != top; d = replicas_delta_next(d)) - update_replicas(c, fs_usage, &d->r, -d->delta); - return -1; -} - #define do_mark_fn(fn, c, pos, flags, ...) \ ({ \ int gc, ret = 0; \ @@ -649,51 +538,10 @@ unwind: ret; \ }) -static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, struct bucket_mark *ret, - bool gc) -{ - struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc); - struct bucket *g = __bucket(ca, b, gc); - struct bucket_mark old, new; - - old = bucket_cmpxchg(g, new, ({ - BUG_ON(!is_available_bucket(new)); - - new.owned_by_allocator = true; - new.data_type = 0; - new.cached_sectors = 0; - new.dirty_sectors = 0; - new.gen++; - })); - - bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); - - if (old.cached_sectors) - update_cached_sectors(c, fs_usage, ca->dev_idx, - -((s64) old.cached_sectors)); - - if (!gc) - *ret = old; - return 0; -} - -void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, struct bucket_mark *old) -{ - do_mark_fn(__bch2_invalidate_bucket, c, gc_phase(GC_PHASE_START), 0, - ca, b, old); - - if (!old->owned_by_allocator && old->cached_sectors) - trace_invalidate(ca, bucket_to_sector(ca, b), - old->cached_sectors); -} - static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, bool owned_by_allocator, bool gc) { - struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc); struct bucket *g = __bucket(ca, b, gc); struct bucket_mark old, new; @@ -701,8 +549,6 @@ static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, new.owned_by_allocator = owned_by_allocator; })); - bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); - BUG_ON(!gc && !owned_by_allocator && !old.owned_by_allocator); @@ -733,7 +579,8 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bucket_mark old_m, m; /* We don't do anything for deletions - do we?: */ - if (new.k->type != KEY_TYPE_alloc) + if (new.k->type != KEY_TYPE_alloc && + new.k->type != KEY_TYPE_alloc_v2) return 0; /* @@ -756,6 +603,7 @@ static int bch2_mark_alloc(struct bch_fs *c, m.data_type = u.data_type; m.dirty_sectors = u.dirty_sectors; m.cached_sectors = u.cached_sectors; + m.stripe = u.stripe != 0; if (journal_seq) { m.journal_seq_valid = 1; @@ -763,12 +611,14 @@ static int bch2_mark_alloc(struct bch_fs *c, } })); - bch2_dev_usage_update(c, ca, fs_usage, old_m, m, gc); + bch2_dev_usage_update(c, ca, fs_usage, old_m, m, journal_seq, gc); 
g->io_time[READ] = u.read_time; g->io_time[WRITE] = u.write_time; g->oldest_gen = u.oldest_gen; g->gen_valid = 1; + g->stripe = u.stripe; + g->stripe_redundancy = u.stripe_redundancy; /* * need to know if we're getting called from the invalidate path or @@ -826,7 +676,7 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, if (c) bch2_dev_usage_update(c, ca, fs_usage_ptr(c, 0, gc), - old, new, gc); + old, new, 0, gc); return 0; } @@ -963,11 +813,10 @@ static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k, return 0; } -static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k, +static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k, unsigned ptr_idx, struct bch_fs_usage *fs_usage, - u64 journal_seq, unsigned flags, - bool enabled) + u64 journal_seq, unsigned flags) { const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; unsigned nr_data = s->nr_blocks - s->nr_redundant; @@ -980,8 +829,13 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k, char buf[200]; int ret; - if (enabled) - g->ec_redundancy = s->nr_redundant; + if (g->stripe && g->stripe != k.k->p.offset) { + bch2_fs_inconsistent(c, + "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", + ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen, + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + return -EINVAL; + } old = bucket_cmpxchg(g, new, ({ ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type, @@ -989,23 +843,9 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k, if (ret) return ret; - if (new.stripe && enabled) - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", - ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen, - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - - if (!new.stripe && !enabled) - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "bucket %u:%zu gen %u: deleting stripe but not marked\n%s", - ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen, - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - - new.stripe = enabled; - - if ((flags & BTREE_TRIGGER_GC) && parity) { - new.data_type = enabled ? BCH_DATA_parity : 0; - new.dirty_sectors = enabled ? 
le16_to_cpu(s->sectors): 0; + if (parity) { + new.data_type = BCH_DATA_parity; + new.dirty_sectors = le16_to_cpu(s->sectors); } if (journal_seq) { @@ -1014,10 +854,10 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k, } })); - if (!enabled) - g->ec_redundancy = 0; + g->stripe = k.k->p.offset; + g->stripe_redundancy = s->nr_redundant; - bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); + bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc); return 0; } @@ -1084,7 +924,7 @@ static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k, old.v.counter, new.v.counter)) != old.v.counter); - bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); + bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc); BUG_ON(!gc && bucket_became_unavailable(old, new)); @@ -1211,6 +1051,8 @@ static int bch2_mark_stripe(struct bch_fs *c, unsigned i; int ret; + BUG_ON(gc && old_s); + if (!m || (old_s && !m->alive)) { bch_err_ratelimited(c, "error marking nonexistent stripe %zu", idx); @@ -1218,48 +1060,12 @@ static int bch2_mark_stripe(struct bch_fs *c, } if (!new_s) { - /* Deleting: */ - for (i = 0; i < old_s->nr_blocks; i++) { - ret = bucket_set_stripe(c, old, i, fs_usage, - journal_seq, flags, false); - if (ret) - return ret; - } - - if (!gc && m->on_heap) { - spin_lock(&c->ec_stripes_heap_lock); - bch2_stripes_heap_del(c, m, idx); - spin_unlock(&c->ec_stripes_heap_lock); - } - - if (gc) - update_replicas(c, fs_usage, &m->r.e, - -((s64) m->sectors * m->nr_redundant)); + spin_lock(&c->ec_stripes_heap_lock); + bch2_stripes_heap_del(c, m, idx); + spin_unlock(&c->ec_stripes_heap_lock); memset(m, 0, sizeof(*m)); } else { - BUG_ON(old_s && new_s->nr_blocks != old_s->nr_blocks); - BUG_ON(old_s && new_s->nr_redundant != old_s->nr_redundant); - - for (i = 0; i < new_s->nr_blocks; i++) { - if (!old_s || - memcmp(new_s->ptrs + i, - old_s->ptrs + i, - sizeof(struct bch_extent_ptr))) { - - if (old_s) { - bucket_set_stripe(c, old, i, fs_usage, - journal_seq, flags, false); - if (ret) - return ret; - } - ret = bucket_set_stripe(c, new, i, fs_usage, - journal_seq, flags, true); - if (ret) - return ret; - } - } - m->alive = true; m->sectors = le16_to_cpu(new_s->sectors); m->algorithm = new_s->algorithm; @@ -1271,18 +1077,12 @@ static int bch2_mark_stripe(struct bch_fs *c, m->block_sectors[i] = stripe_blockcount_get(new_s, i); m->blocks_nonempty += !!m->block_sectors[i]; - } - if (gc && old_s) - update_replicas(c, fs_usage, &m->r.e, - -((s64) m->sectors * m->nr_redundant)); + m->ptrs[i] = new_s->ptrs[i]; + } bch2_bkey_to_replicas(&m->r.e, new); - if (gc) - update_replicas(c, fs_usage, &m->r.e, - ((s64) m->sectors * m->nr_redundant)); - if (!gc) { spin_lock(&c->ec_stripes_heap_lock); bch2_stripes_heap_update(c, m, idx); @@ -1290,6 +1090,25 @@ static int bch2_mark_stripe(struct bch_fs *c, } } + if (gc) { + /* + * gc recalculates this field from stripe ptr + * references: + */ + memset(m->block_sectors, 0, sizeof(m->block_sectors)); + m->blocks_nonempty = 0; + + for (i = 0; i < new_s->nr_blocks; i++) { + ret = mark_stripe_bucket(c, new, i, fs_usage, + journal_seq, flags); + if (ret) + return ret; + } + + update_replicas(c, fs_usage, &m->r.e, + ((s64) m->sectors * m->nr_redundant)); + } + return 0; } @@ -1313,6 +1132,7 @@ static int bch2_mark_key_locked(struct bch_fs *c, switch (k.k->type) { case KEY_TYPE_alloc: + case KEY_TYPE_alloc_v2: ret = bch2_mark_alloc(c, old, new, fs_usage, journal_seq, flags); break; case KEY_TYPE_btree_ptr: @@ -1333,10 +1153,8 @@ static int 
bch2_mark_key_locked(struct bch_fs *c, ret = bch2_mark_stripe(c, old, new, fs_usage, journal_seq, flags); break; case KEY_TYPE_inode: - if (!(flags & BTREE_TRIGGER_OVERWRITE)) - fs_usage->nr_inodes++; - else - fs_usage->nr_inodes--; + fs_usage->nr_inodes += new.k->type == KEY_TYPE_inode; + fs_usage->nr_inodes -= old.k->type == KEY_TYPE_inode; break; case KEY_TYPE_reservation: { unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; @@ -1383,9 +1201,6 @@ int bch2_mark_update(struct btree_trans *trans, unsigned flags) { struct bch_fs *c = trans->c; - struct btree *b = iter_l(iter)->b; - struct btree_node_iter node_iter = iter_l(iter)->iter; - struct bkey_packed *_old; struct bkey_s_c old; struct bkey unpacked; int ret = 0; @@ -1400,10 +1215,10 @@ int bch2_mark_update(struct btree_trans *trans, old = (struct bkey_s_c) { &unpacked, NULL }; if (!btree_node_type_is_extents(iter->btree_id)) { + /* iterators should be uptodate, shouldn't get errors here: */ if (btree_iter_type(iter) != BTREE_ITER_CACHED) { - _old = bch2_btree_node_iter_peek(&node_iter, b); - if (_old) - old = bkey_disassemble(b, _old, &unpacked); + old = bch2_btree_iter_peek_slot(iter); + BUG_ON(bkey_err(old)); } else { struct bkey_cached *ck = (void *) iter->l[0].b; @@ -1425,23 +1240,24 @@ int bch2_mark_update(struct btree_trans *trans, BTREE_TRIGGER_OVERWRITE|flags); } } else { + struct btree_iter *copy; + BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, new->k.size, fs_usage, trans->journal_res.seq, BTREE_TRIGGER_INSERT|flags); - while ((_old = bch2_btree_node_iter_peek(&node_iter, b))) { - unsigned offset = 0; - s64 sectors; + copy = bch2_trans_copy_iter(trans, iter); - old = bkey_disassemble(b, _old, &unpacked); - sectors = -((s64) old.k->size); + for_each_btree_key_continue(copy, 0, old, ret) { + unsigned offset = 0; + s64 sectors = -((s64) old.k->size); flags |= BTREE_TRIGGER_OVERWRITE; if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0) - return 0; + break; switch (bch2_extent_overlap(&new->k, old.k)) { case BCH_EXTENT_OVERLAP_ALL: @@ -1474,30 +1290,22 @@ int bch2_mark_update(struct btree_trans *trans, trans->journal_res.seq, flags) ?: 1; if (ret <= 0) break; - - bch2_btree_node_iter_advance(&node_iter, b); } + bch2_trans_iter_put(trans, copy); } return ret; } -void bch2_trans_fs_usage_apply(struct btree_trans *trans, - struct bch_fs_usage *fs_usage) +static noinline __cold +void fs_usage_apply_warn(struct btree_trans *trans, + unsigned disk_res_sectors) { struct bch_fs *c = trans->c; struct btree_insert_entry *i; - static int warned_disk_usage = 0; - u64 disk_res_sectors = trans->disk_res ? 
trans->disk_res->sectors : 0; char buf[200]; - if (!bch2_fs_usage_apply(c, fs_usage, trans->disk_res, - trans->journal_res.seq) || - warned_disk_usage || - xchg(&warned_disk_usage, 1)) - return; - - bch_err(c, "disk usage increased more than %llu sectors reserved", + bch_err(c, "disk usage increased more than %u sectors reserved", disk_res_sectors); trans_for_each_update(trans, i) { @@ -1507,27 +1315,20 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, pr_err("overlapping with"); if (btree_iter_type(i->iter) != BTREE_ITER_CACHED) { - struct btree *b = iter_l(i->iter)->b; - struct btree_node_iter node_iter = iter_l(i->iter)->iter; - struct bkey_packed *_k; - - while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { - struct bkey unpacked; - struct bkey_s_c k; - - pr_info("_k %px format %u", _k, _k->format); - k = bkey_disassemble(b, _k, &unpacked); + struct btree_iter *copy = bch2_trans_copy_iter(trans, i->iter); + struct bkey_s_c k; + int ret; - if (btree_node_is_extents(b) + for_each_btree_key_continue(copy, 0, k, ret) { + if (btree_node_type_is_extents(i->iter->btree_id) ? bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) <= 0 : bkey_cmp(i->k->k.p, k.k->p)) break; bch2_bkey_val_to_text(&PBUF(buf), c, k); pr_err("%s", buf); - - bch2_btree_node_iter_advance(&node_iter, b); } + bch2_trans_iter_put(trans, copy); } else { struct bkey_cached *ck = (void *) i->iter->l[0].b; @@ -1539,6 +1340,65 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, } } +void bch2_trans_fs_usage_apply(struct btree_trans *trans, + struct replicas_delta_list *deltas) +{ + struct bch_fs *c = trans->c; + static int warned_disk_usage = 0; + bool warn = false; + unsigned disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; + struct replicas_delta *d = deltas->d; + struct replicas_delta *top = (void *) deltas->d + deltas->used; + struct bch_fs_usage *dst; + s64 added = 0, should_not_have_added; + unsigned i; + + percpu_rwsem_assert_held(&c->mark_lock); + + preempt_disable(); + dst = fs_usage_ptr(c, trans->journal_res.seq, false); + + for (d = deltas->d; d != top; d = replicas_delta_next(d)) { + switch (d->r.data_type) { + case BCH_DATA_btree: + case BCH_DATA_user: + case BCH_DATA_parity: + added += d->delta; + } + + update_replicas(c, dst, &d->r, d->delta); + } + + dst->nr_inodes += deltas->nr_inodes; + + for (i = 0; i < BCH_REPLICAS_MAX; i++) { + added += deltas->persistent_reserved[i]; + dst->reserved += deltas->persistent_reserved[i]; + dst->persistent_reserved[i] += deltas->persistent_reserved[i]; + } + + /* + * Not allowed to reduce sectors_available except by getting a + * reservation: + */ + should_not_have_added = added - (s64) disk_res_sectors; + if (unlikely(should_not_have_added > 0)) { + atomic64_sub(should_not_have_added, &c->sectors_available); + added -= should_not_have_added; + warn = true; + } + + if (added > 0) { + trans->disk_res->sectors -= added; + this_cpu_sub(*c->online_reserved, added); + } + + preempt_enable(); + + if (unlikely(warn) && !xchg(&warned_disk_usage, 1)) + fs_usage_apply_warn(trans, disk_res_sectors); +} + /* trans_mark: */ static struct btree_iter *trans_get_update(struct btree_trans *trans, @@ -1554,6 +1414,10 @@ static struct btree_iter *trans_get_update(struct btree_trans *trans, bkey_cmp(pos, i->k->k.p) < 0 : !bkey_cmp(pos, i->iter->pos))) { *k = bkey_i_to_s_c(i->k); + + /* ugly hack.. 
*/ + BUG_ON(btree_iter_live(trans, i->iter)); + trans->iters_live |= 1ULL << i->iter->idx; return i->iter; } @@ -1565,7 +1429,7 @@ static int trans_get_key(struct btree_trans *trans, struct btree_iter **iter, struct bkey_s_c *k) { - unsigned flags = btree_id != BTREE_ID_ALLOC + unsigned flags = btree_id != BTREE_ID_alloc ? BTREE_ITER_SLOTS : BTREE_ITER_CACHED; int ret; @@ -1576,9 +1440,6 @@ static int trans_get_key(struct btree_trans *trans, *iter = bch2_trans_get_iter(trans, btree_id, pos, flags|BTREE_ITER_INTENT); - if (IS_ERR(*iter)) - return PTR_ERR(*iter); - *k = __bch2_btree_iter_peek(*iter, flags); ret = bkey_err(*k); if (ret) @@ -1586,9 +1447,10 @@ static int trans_get_key(struct btree_trans *trans, return ret; } -static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter, - const struct bch_extent_ptr *ptr, - struct bkey_alloc_unpacked *u) +static struct bkey_alloc_buf * +bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter, + const struct bch_extent_ptr *ptr, + struct bkey_alloc_unpacked *u) { struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); @@ -1596,33 +1458,35 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree struct bucket *g; struct btree_iter *iter; struct bkey_s_c k; + struct bkey_alloc_buf *a; int ret; - iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k); + a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); + if (IS_ERR(a)) + return a; + + iter = trans_get_update(trans, BTREE_ID_alloc, pos, &k); if (iter) { *u = bch2_alloc_unpack(k); } else { - iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, pos, + iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, pos, BTREE_ITER_CACHED| BTREE_ITER_CACHED_NOFILL| BTREE_ITER_INTENT); - if (IS_ERR(iter)) - return PTR_ERR(iter); - ret = bch2_btree_iter_traverse(iter); if (ret) { bch2_trans_iter_put(trans, iter); - return ret; + return ERR_PTR(ret); } percpu_down_read(&c->mark_lock); g = bucket(ca, pos.offset); - *u = alloc_mem_to_key(g, READ_ONCE(g->mark)); + *u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark)); percpu_up_read(&c->mark_lock); } *_iter = iter; - return 0; + return a; } static int bch2_trans_mark_pointer(struct btree_trans *trans, @@ -1632,34 +1496,27 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter *iter; struct bkey_alloc_unpacked u; - struct bkey_i_alloc *a; + struct bkey_alloc_buf *a; int ret; - ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u); - if (ret) - return ret; + a = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u); + if (IS_ERR(a)) + return PTR_ERR(a); ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, u.gen, &u.data_type, &u.dirty_sectors, &u.cached_sectors); if (ret) goto out; - a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); - ret = PTR_ERR_OR_ZERO(a); - if (ret) - goto out; - - bkey_alloc_init(&a->k_i); - a->k.p = iter->pos; - bch2_alloc_pack(a, u); - bch2_trans_update(trans, iter, &a->k_i, 0); + bch2_alloc_pack(c, a, u); + bch2_trans_update(trans, iter, &a->k, 0); out: bch2_trans_iter_put(trans, iter); return ret; } static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, - struct bch_extent_stripe_ptr p, + struct extent_ptr_decoded p, s64 sectors, enum bch_data_type data_type) { struct bch_fs *c = trans->c; @@ -1669,14 +1526,22 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, struct bch_replicas_padded r; int ret = 0; - ret = 
trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k); + ret = trans_get_key(trans, BTREE_ID_stripes, POS(0, p.ec.idx), &iter, &k); if (ret < 0) return ret; if (k.k->type != KEY_TYPE_stripe) { bch2_fs_inconsistent(c, "pointer to nonexistent stripe %llu", - (u64) p.idx); + (u64) p.ec.idx); + ret = -EIO; + goto out; + } + + if (!bch2_ptr_matches_stripe(bkey_s_c_to_stripe(k).v, p)) { + bch2_fs_inconsistent(c, + "stripe pointer doesn't match stripe %llu", + (u64) p.ec.idx); ret = -EIO; goto out; } @@ -1687,8 +1552,8 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, goto out; bkey_reassemble(&s->k_i, k); - stripe_blockcount_set(&s->v, p.block, - stripe_blockcount_get(&s->v, p.block) + + stripe_blockcount_set(&s->v, p.ec.block, + stripe_blockcount_get(&s->v, p.ec.block) + sectors); bch2_trans_update(trans, iter, &s->k_i, 0); @@ -1739,7 +1604,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, dirty_sectors += disk_sectors; r.e.devs[r.e.nr_devs++] = p.ptr.dev; } else { - ret = bch2_trans_mark_stripe_ptr(trans, p.ec, + ret = bch2_trans_mark_stripe_ptr(trans, p, disk_sectors, data_type); if (ret) return ret; @@ -1754,59 +1619,108 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, return 0; } +static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans, + struct bkey_s_c_stripe s, + unsigned idx, bool deleting) +{ + struct bch_fs *c = trans->c; + const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; + struct bkey_alloc_buf *a; + struct btree_iter *iter; + struct bkey_alloc_unpacked u; + bool parity = idx >= s.v->nr_blocks - s.v->nr_redundant; + int ret = 0; + + a = bch2_trans_start_alloc_update(trans, &iter, ptr, &u); + if (IS_ERR(a)) + return PTR_ERR(a); + + if (parity) { + s64 sectors = le16_to_cpu(s.v->sectors); + + if (deleting) + sectors = -sectors; + + u.dirty_sectors += sectors; + u.data_type = u.dirty_sectors + ? 
BCH_DATA_parity + : 0; + } + + if (!deleting) { + if (bch2_fs_inconsistent_on(u.stripe && u.stripe != s.k->p.offset, c, + "bucket %llu:%llu gen %u: multiple stripes using same bucket (%u, %llu)", + iter->pos.inode, iter->pos.offset, u.gen, + u.stripe, s.k->p.offset)) { + ret = -EIO; + goto err; + } + + u.stripe = s.k->p.offset; + u.stripe_redundancy = s.v->nr_redundant; + } else { + u.stripe = 0; + u.stripe_redundancy = 0; + } + + bch2_alloc_pack(c, a, u); + bch2_trans_update(trans, iter, &a->k, 0); +err: + bch2_trans_iter_put(trans, iter); + return ret; +} + static int bch2_trans_mark_stripe(struct btree_trans *trans, - struct bkey_s_c k, + struct bkey_s_c old, struct bkey_s_c new, unsigned flags) { - const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; - unsigned nr_data = s->nr_blocks - s->nr_redundant; + struct bkey_s_c_stripe old_s = { NULL }; + struct bkey_s_c_stripe new_s = { NULL }; struct bch_replicas_padded r; - struct bkey_alloc_unpacked u; - struct bkey_i_alloc *a; - struct btree_iter *iter; - bool deleting = flags & BTREE_TRIGGER_OVERWRITE; - s64 sectors = le16_to_cpu(s->sectors); unsigned i; int ret = 0; - if (deleting) - sectors = -sectors; - - bch2_bkey_to_replicas(&r.e, k); - update_replicas_list(trans, &r.e, sectors * s->nr_redundant); + if (old.k->type == KEY_TYPE_stripe) + old_s = bkey_s_c_to_stripe(old); + if (new.k->type == KEY_TYPE_stripe) + new_s = bkey_s_c_to_stripe(new); /* - * The allocator code doesn't necessarily update bucket gens in the - * btree when incrementing them, right before handing out new buckets - - * we just need to persist those updates here along with the new stripe: + * If the pointers aren't changing, we don't need to do anything: */ + if (new_s.k && old_s.k && + new_s.v->nr_blocks == old_s.v->nr_blocks && + new_s.v->nr_redundant == old_s.v->nr_redundant && + !memcmp(old_s.v->ptrs, new_s.v->ptrs, + new_s.v->nr_blocks * sizeof(struct bch_extent_ptr))) + return 0; - for (i = 0; i < s->nr_blocks && !ret; i++) { - bool parity = i >= nr_data; + if (new_s.k) { + s64 sectors = le16_to_cpu(new_s.v->sectors); - ret = bch2_trans_start_alloc_update(trans, &iter, - &s->ptrs[i], &u); - if (ret) - break; + bch2_bkey_to_replicas(&r.e, new); + update_replicas_list(trans, &r.e, sectors * new_s.v->nr_redundant); - if (parity) { - u.dirty_sectors += sectors; - u.data_type = u.dirty_sectors - ? 
BCH_DATA_parity - : 0; + for (i = 0; i < new_s.v->nr_blocks; i++) { + ret = bch2_trans_mark_stripe_alloc_ref(trans, new_s, + i, false); + if (ret) + return ret; } + } - a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); - ret = PTR_ERR_OR_ZERO(a); - if (ret) - goto put_iter; - - bkey_alloc_init(&a->k_i); - a->k.p = iter->pos; - bch2_alloc_pack(a, u); - bch2_trans_update(trans, iter, &a->k_i, 0); -put_iter: - bch2_trans_iter_put(trans, iter); + if (old_s.k) { + s64 sectors = -((s64) le16_to_cpu(old_s.v->sectors)); + + bch2_bkey_to_replicas(&r.e, old); + update_replicas_list(trans, &r.e, sectors * old_s.v->nr_redundant); + + for (i = 0; i < old_s.v->nr_blocks; i++) { + ret = bch2_trans_mark_stripe_alloc_ref(trans, old_s, + i, true); + if (ret) + return ret; + } } return ret; @@ -1836,7 +1750,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, __le64 *refcount; s64 ret; - ret = trans_get_key(trans, BTREE_ID_REFLINK, + ret = trans_get_key(trans, BTREE_ID_reflink, POS(0, idx), &iter, &k); if (ret < 0) return ret; @@ -1872,8 +1786,6 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, } bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); - BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); - bch2_trans_update(trans, iter, n, 0); out: ret = sectors; @@ -1905,11 +1817,16 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans, return ret; } -int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, +int bch2_trans_mark_key(struct btree_trans *trans, + struct bkey_s_c old, + struct bkey_s_c new, unsigned offset, s64 sectors, unsigned flags) { - struct replicas_delta_list *d; struct bch_fs *c = trans->c; + struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; + struct replicas_delta_list *d; + + BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE))); switch (k.k->type) { case KEY_TYPE_btree_ptr: @@ -1925,15 +1842,18 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, return bch2_trans_mark_extent(trans, k, offset, sectors, flags, BCH_DATA_user); case KEY_TYPE_stripe: - return bch2_trans_mark_stripe(trans, k, flags); - case KEY_TYPE_inode: - d = replicas_deltas_realloc(trans, 0); + return bch2_trans_mark_stripe(trans, old, new, flags); + case KEY_TYPE_inode: { + int nr = (new.k->type == KEY_TYPE_inode) - + (old.k->type == KEY_TYPE_inode); + + if (nr) { + d = replicas_deltas_realloc(trans, 0); + d->nr_inodes += nr; + } - if (!(flags & BTREE_TRIGGER_OVERWRITE)) - d->nr_inodes++; - else - d->nr_inodes--; return 0; + } case KEY_TYPE_reservation: { unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; @@ -1957,12 +1877,10 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, int bch2_trans_mark_update(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *insert, + struct bkey_i *new, unsigned flags) { - struct btree *b = iter_l(iter)->b; - struct btree_node_iter node_iter = iter_l(iter)->iter; - struct bkey_packed *_k; + struct bkey_s_c old; int ret; if (unlikely(flags & BTREE_TRIGGER_NORUN)) @@ -1971,93 +1889,251 @@ int bch2_trans_mark_update(struct btree_trans *trans, if (!btree_node_type_needs_gc(iter->btree_id)) return 0; - ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(insert), - 0, insert->k.size, BTREE_TRIGGER_INSERT); - if (ret) - return ret; + if (!btree_node_type_is_extents(iter->btree_id)) { + /* iterators should be uptodate, shouldn't get errors here: */ + if (btree_iter_type(iter) != BTREE_ITER_CACHED) { + old = bch2_btree_iter_peek_slot(iter); + 
BUG_ON(bkey_err(old)); + } else { + struct bkey_cached *ck = (void *) iter->l[0].b; - if (btree_iter_type(iter) == BTREE_ITER_CACHED) { - struct bkey_cached *ck = (void *) iter->l[0].b; + BUG_ON(!ck->valid); + old = bkey_i_to_s_c(ck->k); + } - return bch2_trans_mark_key(trans, bkey_i_to_s_c(ck->k), - 0, 0, BTREE_TRIGGER_OVERWRITE); - } + if (old.k->type == new->k.type) { + ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0, + BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); + } else { + ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0, + BTREE_TRIGGER_INSERT|flags) ?: + bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0, + BTREE_TRIGGER_OVERWRITE|flags); + } + } else { + struct btree_iter *copy; + struct bkey _old; - while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { - struct bkey unpacked; - struct bkey_s_c k; - unsigned offset = 0; - s64 sectors = 0; - unsigned flags = BTREE_TRIGGER_OVERWRITE; + EBUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); - k = bkey_disassemble(b, _k, &unpacked); + bkey_init(&_old); + old = (struct bkey_s_c) { &_old, NULL }; - if (btree_node_is_extents(b) - ? bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0 - : bkey_cmp(insert->k.p, k.k->p)) - break; + ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), + 0, new->k.size, + BTREE_TRIGGER_INSERT); + if (ret) + return ret; - if (btree_node_is_extents(b)) { - switch (bch2_extent_overlap(&insert->k, k.k)) { + copy = bch2_trans_copy_iter(trans, iter); + + for_each_btree_key_continue(copy, 0, old, ret) { + unsigned offset = 0; + s64 sectors = -((s64) old.k->size); + + flags |= BTREE_TRIGGER_OVERWRITE; + + if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0) + break; + + switch (bch2_extent_overlap(&new->k, old.k)) { case BCH_EXTENT_OVERLAP_ALL: offset = 0; - sectors = -((s64) k.k->size); + sectors = -((s64) old.k->size); break; case BCH_EXTENT_OVERLAP_BACK: - offset = bkey_start_offset(&insert->k) - - bkey_start_offset(k.k); - sectors = bkey_start_offset(&insert->k) - - k.k->p.offset; + offset = bkey_start_offset(&new->k) - + bkey_start_offset(old.k); + sectors = bkey_start_offset(&new->k) - + old.k->p.offset; break; case BCH_EXTENT_OVERLAP_FRONT: offset = 0; - sectors = bkey_start_offset(k.k) - - insert->k.p.offset; + sectors = bkey_start_offset(old.k) - + new->k.p.offset; break; case BCH_EXTENT_OVERLAP_MIDDLE: - offset = bkey_start_offset(&insert->k) - - bkey_start_offset(k.k); - sectors = -((s64) insert->k.size); + offset = bkey_start_offset(&new->k) - + bkey_start_offset(old.k); + sectors = -((s64) new->k.size); flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; break; } BUG_ON(sectors >= 0); + + ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), + offset, sectors, flags); + if (ret) + break; } + bch2_trans_iter_put(trans, copy); + } - ret = bch2_trans_mark_key(trans, k, offset, sectors, flags); - if (ret) - return ret; + return ret; +} + +static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, + struct bch_dev *ca, size_t b, + enum bch_data_type type, + unsigned sectors) +{ + struct bch_fs *c = trans->c; + struct btree_iter *iter; + struct bkey_alloc_unpacked u; + struct bkey_alloc_buf *a; + struct bch_extent_ptr ptr = { + .dev = ca->dev_idx, + .offset = bucket_to_sector(ca, b), + }; + int ret = 0; + + a = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u); + if (IS_ERR(a)) + return PTR_ERR(a); - bch2_btree_node_iter_advance(&node_iter, b); + if (u.data_type && u.data_type != type) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %llu:%llu 
gen %u different types of data in same bucket: %s, %s\n" + "while marking %s", + iter->pos.inode, iter->pos.offset, u.gen, + bch2_data_types[u.data_type], + bch2_data_types[type], + bch2_data_types[type]); + ret = -EIO; + goto out; } - return 0; + if ((unsigned) (u.dirty_sectors + sectors) > ca->mi.bucket_size) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %llu:%llu gen %u data type %s sector count overflow: %u + %u > %u\n" + "while marking %s", + iter->pos.inode, iter->pos.offset, u.gen, + bch2_data_types[u.data_type ?: type], + u.dirty_sectors, sectors, ca->mi.bucket_size, + bch2_data_types[type]); + ret = -EIO; + goto out; + } + + if (u.data_type == type && + u.dirty_sectors == sectors) + goto out; + + u.data_type = type; + u.dirty_sectors = sectors; + + bch2_alloc_pack(c, a, u); + bch2_trans_update(trans, iter, &a->k, 0); +out: + bch2_trans_iter_put(trans, iter); + return ret; } -/* Disk reservations: */ +int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, + struct disk_reservation *res, + struct bch_dev *ca, size_t b, + enum bch_data_type type, + unsigned sectors) +{ + return __bch2_trans_do(trans, res, NULL, 0, + __bch2_trans_mark_metadata_bucket(trans, ca, b, BCH_DATA_journal, + ca->mi.bucket_size)); -static u64 bch2_recalc_sectors_available(struct bch_fs *c) +} + +static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans, + struct disk_reservation *res, + struct bch_dev *ca, + u64 start, u64 end, + enum bch_data_type type, + u64 *bucket, unsigned *bucket_sectors) { - percpu_u64_set(&c->pcpu->sectors_available, 0); + int ret; + + do { + u64 b = sector_to_bucket(ca, start); + unsigned sectors = + min_t(u64, bucket_to_sector(ca, b + 1), end) - start; + + if (b != *bucket) { + if (*bucket_sectors) { + ret = bch2_trans_mark_metadata_bucket(trans, res, ca, + *bucket, type, *bucket_sectors); + if (ret) + return ret; + } + + *bucket = b; + *bucket_sectors = 0; + } + + *bucket_sectors += sectors; + start += sectors; + } while (!ret && start < end); - return avail_factor(__bch2_fs_usage_read_short(c).free); + return 0; } -void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) +static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, + struct disk_reservation *res, + struct bch_dev *ca) { - percpu_down_read(&c->mark_lock); - this_cpu_sub(c->usage[0]->online_reserved, - res->sectors); - percpu_up_read(&c->mark_lock); + struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + u64 bucket = 0; + unsigned i, bucket_sectors = 0; + int ret; + + for (i = 0; i < layout->nr_superblocks; i++) { + u64 offset = le64_to_cpu(layout->sb_offset[i]); + + if (offset == BCH_SB_SECTOR) { + ret = bch2_trans_mark_metadata_sectors(trans, res, ca, + 0, BCH_SB_SECTOR, + BCH_DATA_sb, &bucket, &bucket_sectors); + if (ret) + return ret; + } - res->sectors = 0; + ret = bch2_trans_mark_metadata_sectors(trans, res, ca, offset, + offset + (1 << layout->sb_max_size_bits), + BCH_DATA_sb, &bucket, &bucket_sectors); + if (ret) + return ret; + } + + if (bucket_sectors) { + ret = bch2_trans_mark_metadata_bucket(trans, res, ca, + bucket, BCH_DATA_sb, bucket_sectors); + if (ret) + return ret; + } + + for (i = 0; i < ca->journal.nr; i++) { + ret = bch2_trans_mark_metadata_bucket(trans, res, ca, + ca->journal.buckets[i], + BCH_DATA_journal, ca->mi.bucket_size); + if (ret) + return ret; + } + + return 0; } +int bch2_trans_mark_dev_sb(struct bch_fs *c, + struct disk_reservation *res, + struct bch_dev *ca) +{ + return bch2_trans_do(c, res, NULL, 0, + 
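Note on the metadata-marking helpers added above: they walk a range of device sectors, group contiguous sectors by the bucket they fall in, and emit one allocation update per bucket, with the caller flushing the final, partially filled bucket. What follows is a minimal standalone sketch of that grouping pattern only; mark_bucket, mark_sector_range and bucket_size are hypothetical stand-ins, not the bcachefs helpers.

/* sketch: group a sector range by bucket, one update per bucket */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;

static void mark_bucket(u64 bucket, unsigned sectors)
{
	printf("bucket %llu: %u sectors\n",
	       (unsigned long long) bucket, sectors);
}

static void mark_sector_range(u64 start, u64 end, unsigned bucket_size)
{
	u64 cur_bucket = start / bucket_size;
	unsigned cur_sectors = 0;

	while (start < end) {
		u64 b          = start / bucket_size;
		u64 bucket_end = (b + 1) * bucket_size;
		unsigned sectors =
			(unsigned) ((bucket_end < end ? bucket_end : end) - start);

		if (b != cur_bucket) {
			if (cur_sectors)
				mark_bucket(cur_bucket, cur_sectors);
			cur_bucket  = b;
			cur_sectors = 0;
		}

		cur_sectors += sectors;
		start       += sectors;
	}

	if (cur_sectors)			/* flush the last bucket */
		mark_bucket(cur_bucket, cur_sectors);
}

int main(void)
{
	/* e.g. mark sectors 0..3000 of a device with 2048-sector buckets */
	mark_sector_range(0, 3000, 2048);
	return 0;
}

In the patch the same walk is driven once per superblock copy in the device's sb layout and once per journal bucket.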
__bch2_trans_mark_dev_sb(&trans, res, ca)); +} + +/* Disk reservations: */ + #define SECTORS_CACHE 1024 int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, - unsigned sectors, int flags) + u64 sectors, int flags) { struct bch_fs_pcpu *pcpu; u64 old, v, get; @@ -2078,7 +2154,6 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, if (get < sectors) { preempt_enable(); - percpu_up_read(&c->mark_lock); goto recalculate; } } while ((v = atomic64_cmpxchg(&c->sectors_available, @@ -2088,7 +2163,7 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, out: pcpu->sectors_available -= sectors; - this_cpu_add(c->usage[0]->online_reserved, sectors); + this_cpu_add(*c->online_reserved, sectors); res->sectors += sectors; preempt_enable(); @@ -2096,15 +2171,16 @@ out: return 0; recalculate: - percpu_down_write(&c->mark_lock); + mutex_lock(&c->sectors_available_lock); - sectors_available = bch2_recalc_sectors_available(c); + percpu_u64_set(&c->pcpu->sectors_available, 0); + sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free); if (sectors <= sectors_available || (flags & BCH_DISK_RESERVATION_NOFAIL)) { atomic64_set(&c->sectors_available, max_t(s64, 0, sectors_available - sectors)); - this_cpu_add(c->usage[0]->online_reserved, sectors); + this_cpu_add(*c->online_reserved, sectors); res->sectors += sectors; ret = 0; } else { @@ -2112,7 +2188,8 @@ recalculate: ret = -ENOSPC; } - percpu_up_write(&c->mark_lock); + mutex_unlock(&c->sectors_available_lock); + percpu_up_read(&c->mark_lock); return ret; } @@ -2141,7 +2218,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) ca->mi.bucket_size / c->opts.btree_node_size); /* XXX: these should be tunable */ size_t reserve_none = max_t(size_t, 1, nbuckets >> 9); - size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 7); + size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 6); size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12), btree_reserve * 2); bool resize = ca->buckets[0] != NULL; @@ -2158,7 +2235,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * sizeof(unsigned long), GFP_KERNEL|__GFP_ZERO)) || - !init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) || !init_fifo(&free[RESERVE_MOVINGGC], copygc_reserve, GFP_KERNEL) || !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) || @@ -2245,13 +2321,24 @@ void bch2_dev_buckets_free(struct bch_dev *ca) sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket)); - free_percpu(ca->usage[0]); + for (i = 0; i < ARRAY_SIZE(ca->usage); i++) + free_percpu(ca->usage[i]); + kfree(ca->usage_base); } int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) { - if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage))) + unsigned i; + + ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL); + if (!ca->usage_base) return -ENOMEM; + for (i = 0; i < ARRAY_SIZE(ca->usage); i++) { + ca->usage[i] = alloc_percpu(struct bch_dev_usage); + if (!ca->usage[i]) + return -ENOMEM; + } + return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);; } diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index a3873be..54dcc82 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -58,20 +58,13 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b) return __bucket(ca, b, false); } -static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw) -{ - 
return c->bucket_clock[rw].hand - g->io_time[rw]; -} - /* * bucket_gc_gen() returns the difference between the bucket's current gen and * the oldest gen of any pointer into that bucket in the btree. */ -static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b) +static inline u8 bucket_gc_gen(struct bucket *g) { - struct bucket *g = bucket(ca, b); - return g->mark.gen - g->oldest_gen; } @@ -153,18 +146,9 @@ static inline unsigned bucket_sectors_used(struct bucket_mark mark) return mark.dirty_sectors + mark.cached_sectors; } -static inline bool bucket_unused(struct bucket_mark mark) -{ - return !mark.owned_by_allocator && - !mark.data_type && - !bucket_sectors_used(mark); -} - static inline bool is_available_bucket(struct bucket_mark mark) { - return (!mark.owned_by_allocator && - !mark.dirty_sectors && - !mark.stripe); + return !mark.dirty_sectors && !mark.stripe; } static inline bool bucket_needs_journal_commit(struct bucket_mark m, @@ -178,8 +162,6 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m, struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *); -void bch2_dev_usage_from_buckets(struct bch_fs *); - static inline u64 __dev_buckets_available(struct bch_dev *ca, struct bch_dev_usage stats) { @@ -223,19 +205,21 @@ static inline unsigned fs_usage_u64s(struct bch_fs *c) READ_ONCE(c->replicas.nr); } -void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage *); -struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *); +static inline unsigned dev_usage_u64s(void) +{ + return sizeof(struct bch_dev_usage) / sizeof(u64); +} u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *); -struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *); +struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *); void bch2_fs_usage_acc_to_base(struct bch_fs *, unsigned); void bch2_fs_usage_to_text(struct printbuf *, - struct bch_fs *, struct bch_fs_usage *); + struct bch_fs *, struct bch_fs_usage_online *); -u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage *); +u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage_online *); struct bch_fs_usage_short bch2_fs_usage_read_short(struct bch_fs *); @@ -245,8 +229,6 @@ bch2_fs_usage_read_short(struct bch_fs *); void bch2_bucket_seq_cleanup(struct bch_fs *); void bch2_fs_usage_initialize(struct bch_fs *); -void bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *, - size_t, struct bucket_mark *); void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool, struct gc_pos, unsigned); void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, @@ -255,37 +237,36 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, s64, struct bch_fs_usage *, u64, unsigned); -int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, - struct disk_reservation *, unsigned); int bch2_mark_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, struct bch_fs_usage *, unsigned); -int bch2_replicas_delta_list_apply(struct bch_fs *, - struct bch_fs_usage *, - struct replicas_delta_list *); -int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, +int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned, s64, unsigned); int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter, struct bkey_i *insert, unsigned); -void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *); +void bch2_trans_fs_usage_apply(struct btree_trans *, struct 
replicas_delta_list *); -/* disk reservations: */ +int bch2_trans_mark_metadata_bucket(struct btree_trans *, + struct disk_reservation *, struct bch_dev *, + size_t, enum bch_data_type, unsigned); +int bch2_trans_mark_dev_sb(struct bch_fs *, struct disk_reservation *, + struct bch_dev *); -void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *); +/* disk reservations: */ static inline void bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) { - if (res->sectors) - __bch2_disk_reservation_put(c, res); + this_cpu_sub(*c->online_reserved, res->sectors); + res->sectors = 0; } #define BCH_DISK_RESERVATION_NOFAIL (1 << 0) int bch2_disk_reservation_add(struct bch_fs *, - struct disk_reservation *, - unsigned, int); + struct disk_reservation *, + u64, int); static inline struct disk_reservation bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas) @@ -302,8 +283,7 @@ bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas) static inline int bch2_disk_reservation_get(struct bch_fs *c, struct disk_reservation *res, - unsigned sectors, - unsigned nr_replicas, + u64 sectors, unsigned nr_replicas, int flags) { *res = bch2_disk_reservation_init(c, nr_replicas); diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h index d6057d2..588b1a7 100644 --- a/libbcachefs/buckets_types.h +++ b/libbcachefs/buckets_types.h @@ -37,11 +37,12 @@ struct bucket { const struct bucket_mark mark; }; - u16 io_time[2]; + u64 io_time[2]; u8 oldest_gen; u8 gc_gen; unsigned gen_valid:1; - u8 ec_redundancy; + u8 stripe_redundancy; + u32 stripe; }; struct bucket_array { @@ -52,26 +53,18 @@ struct bucket_array { }; struct bch_dev_usage { - u64 buckets[BCH_DATA_NR]; - u64 buckets_alloc; + u64 buckets_ec; u64 buckets_unavailable; - /* _compressed_ sectors: */ - u64 sectors[BCH_DATA_NR]; - u64 sectors_fragmented; - - u64 buckets_ec; - u64 sectors_ec; + struct { + u64 buckets; + u64 sectors; /* _compressed_ sectors: */ + u64 fragmented; + } d[BCH_DATA_NR]; }; struct bch_fs_usage { /* all fields are in units of 512 byte sectors: */ - - u64 online_reserved; - - /* fields after online_reserved are cleared/recalculated by gc: */ - u64 gc_start[0]; - u64 hidden; u64 btree; u64 data; @@ -91,6 +84,11 @@ struct bch_fs_usage { u64 replicas[]; }; +struct bch_fs_usage_online { + u64 online_reserved; + struct bch_fs_usage u; +}; + struct bch_fs_usage_short { u64 capacity; u64 used; @@ -98,22 +96,6 @@ struct bch_fs_usage_short { u64 nr_inodes; }; -struct replicas_delta { - s64 delta; - struct bch_replicas_entry r; -} __packed; - -struct replicas_delta_list { - unsigned size; - unsigned used; - - struct {} memset_start; - u64 nr_inodes; - u64 persistent_reserved[BCH_REPLICAS_MAX]; - struct {} memset_end; - struct replicas_delta d[0]; -}; - /* * A reservation for space on disk: */ diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c index 0377f90..c616014 100644 --- a/libbcachefs/chardev.c +++ b/libbcachefs/chardev.c @@ -5,6 +5,7 @@ #include "bcachefs_ioctl.h" #include "buckets.h" #include "chardev.h" +#include "journal.h" #include "move.h" #include "replicas.h" #include "super.h" @@ -340,7 +341,8 @@ static long bch2_ioctl_data(struct bch_fs *c, ctx->c = c; ctx->arg = arg; - ctx->thread = kthread_create(bch2_data_thread, ctx, "[bcachefs]"); + ctx->thread = kthread_create(bch2_data_thread, ctx, + "bch-data/%s", c->name); if (IS_ERR(ctx->thread)) { ret = PTR_ERR(ctx->thread); goto err; @@ -377,7 +379,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, { struct 
bch_ioctl_fs_usage *arg = NULL; struct bch_replicas_usage *dst_e, *dst_end; - struct bch_fs_usage *src; + struct bch_fs_usage_online *src; u32 replica_entries_bytes; unsigned i; int ret = 0; @@ -403,7 +405,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, arg->online_reserved = src->online_reserved; for (i = 0; i < BCH_REPLICAS_MAX; i++) - arg->persistent_reserved[i] = src->persistent_reserved[i]; + arg->persistent_reserved[i] = src->u.persistent_reserved[i]; dst_e = arg->replicas; dst_end = (void *) arg->replicas + replica_entries_bytes; @@ -417,7 +419,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, break; } - dst_e->sectors = src->replicas[i]; + dst_e->sectors = src->u.replicas[i]; dst_e->r = *src_e; /* recheck after setting nr_devs: */ @@ -475,11 +477,11 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; arg.available_buckets = arg.nr_buckets - src.buckets_unavailable; arg.ec_buckets = src.buckets_ec; - arg.ec_sectors = src.sectors_ec; + arg.ec_sectors = 0; for (i = 0; i < BCH_DATA_NR; i++) { - arg.buckets[i] = src.buckets[i]; - arg.sectors[i] = src.sectors[i]; + arg.buckets[i] = src.d[i].buckets; + arg.sectors[i] = src.d[i].sectors; } percpu_ref_put(&ca->ref); @@ -563,6 +565,26 @@ static long bch2_ioctl_disk_resize(struct bch_fs *c, return ret; } +static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, + struct bch_ioctl_disk_resize_journal arg) +{ + struct bch_dev *ca; + int ret; + + if ((arg.flags & ~BCH_BY_INDEX) || + arg.pad) + return -EINVAL; + + ca = bch2_device_lookup(c, arg.dev, arg.flags); + if (IS_ERR(ca)) + return PTR_ERR(ca); + + ret = bch2_set_nr_journal_buckets(c, ca, arg.nbuckets); + + percpu_ref_put(&ca->ref); + return ret; +} + #define BCH_IOCTL(_name, _argtype) \ do { \ _argtype i; \ @@ -619,6 +641,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) BCH_IOCTL(data, struct bch_ioctl_data); case BCH_IOCTL_DISK_RESIZE: BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize); + case BCH_IOCTL_DISK_RESIZE_JOURNAL: + BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal); default: return -ENOTTY; diff --git a/libbcachefs/checksum.h b/libbcachefs/checksum.h index 24dee80..728b7ef 100644 --- a/libbcachefs/checksum.h +++ b/libbcachefs/checksum.h @@ -77,11 +77,11 @@ static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type, bool data) { switch (type) { - case BCH_CSUM_OPT_NONE: + case BCH_CSUM_OPT_none: return BCH_CSUM_NONE; - case BCH_CSUM_OPT_CRC32C: + case BCH_CSUM_OPT_crc32c: return data ? BCH_CSUM_CRC32C : BCH_CSUM_CRC32C_NONZERO; - case BCH_CSUM_OPT_CRC64: + case BCH_CSUM_OPT_crc64: return data ? 
BCH_CSUM_CRC64 : BCH_CSUM_CRC64_NONZERO; default: BUG(); diff --git a/libbcachefs/clock.c b/libbcachefs/clock.c index 1d1590d..4324cfe 100644 --- a/libbcachefs/clock.c +++ b/libbcachefs/clock.c @@ -19,7 +19,7 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) spin_lock(&clock->timer_lock); - if (time_after_eq((unsigned long) atomic_long_read(&clock->now), + if (time_after_eq((unsigned long) atomic64_read(&clock->now), timer->expire)) { spin_unlock(&clock->timer_lock); timer->fn(timer); @@ -146,7 +146,7 @@ static struct io_timer *get_expired_timer(struct io_clock *clock, void __bch2_increment_clock(struct io_clock *clock, unsigned sectors) { struct io_timer *timer; - unsigned long now = atomic_long_add_return(sectors, &clock->now); + unsigned long now = atomic64_add_return(sectors, &clock->now); while ((timer = get_expired_timer(clock, now))) timer->fn(timer); @@ -158,7 +158,7 @@ void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) unsigned i; spin_lock(&clock->timer_lock); - now = atomic_long_read(&clock->now); + now = atomic64_read(&clock->now); for (i = 0; i < clock->timers.used; i++) pr_buf(out, "%ps:\t%li\n", @@ -175,7 +175,7 @@ void bch2_io_clock_exit(struct io_clock *clock) int bch2_io_clock_init(struct io_clock *clock) { - atomic_long_set(&clock->now, 0); + atomic64_set(&clock->now, 0); spin_lock_init(&clock->timer_lock); clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus(); diff --git a/libbcachefs/clock_types.h b/libbcachefs/clock_types.h index 92c740a..5fae001 100644 --- a/libbcachefs/clock_types.h +++ b/libbcachefs/clock_types.h @@ -26,7 +26,7 @@ struct io_timer { typedef HEAP(struct io_timer *) io_timer_heap; struct io_clock { - atomic_long_t now; + atomic64_t now; u16 __percpu *pcpu_buf; unsigned max_slop; diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c index b50d2b0..f63651d 100644 --- a/libbcachefs/compress.c +++ b/libbcachefs/compress.c @@ -70,7 +70,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max); - if (!IS_ENABLED(CONFIG_HIGHMEM) && + if (!PageHighMem(bio_iter_page(bio, start)) && bio_phys_contig(bio, start)) return (struct bbuf) { .b = page_address(bio_iter_page(bio, start)) + @@ -336,8 +336,19 @@ static int attempt_compress(struct bch_fs *c, ZSTD_CCtx *ctx = ZSTD_initCCtx(workspace, ZSTD_CCtxWorkspaceBound(c->zstd_params.cParams)); + /* + * ZSTD requires that when we decompress we pass in the exact + * compressed size - rounding it up to the nearest sector + * doesn't work, so we use the first 4 bytes of the buffer for + * that. + * + * Additionally, the ZSTD code seems to have a bug where it will + * write just past the end of the buffer - so subtract a fudge + * factor (7 bytes) from the dst buffer size to account for + * that. 
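The comment above is the rationale for the dst + 4 / dst_len - 4 - 7 arithmetic in the hunk that follows: the exact compressed length has to be stored out of band, because the size recorded on disk is rounded up to whole sectors. Here is a small self-contained sketch of that length-prefix convention in plain C, using hypothetical helper names rather than the kernel or zstd APIs.

/* sketch: store the exact payload length in the first 4 bytes of dst */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* returns total bytes used in dst (prefix + payload), 0 if it won't fit */
static size_t pack_with_len_prefix(void *dst, size_t dst_capacity,
				   const void *payload, size_t payload_len)
{
	uint32_t hdr = (uint32_t) payload_len;

	if (payload_len + sizeof(hdr) > dst_capacity)
		return 0;

	memcpy(dst, &hdr, sizeof(hdr));		/* 4 byte length prefix */
	memcpy((uint8_t *) dst + sizeof(hdr), payload, payload_len);
	return sizeof(hdr) + payload_len;
}

/* recover the exact payload length from the prefix at decompress time */
static uint32_t unpack_len_prefix(const void *src)
{
	uint32_t hdr;

	memcpy(&hdr, src, sizeof(hdr));
	return hdr;
}

int main(void)
{
	uint8_t buf[64];
	const char payload[] = "compressed bytes";
	size_t used = pack_with_len_prefix(buf, sizeof(buf),
					   payload, sizeof(payload));

	/* later, even if buf was padded out to a full sector: */
	printf("stored %zu bytes, payload length %u\n",
	       used, unpack_len_prefix(buf));
	return 0;
}

The extra 7-byte headroom in the patch is a separate workaround for zstd writing slightly past the reported destination size and is not modelled here.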
+ */ size_t len = ZSTD_compressCCtx(ctx, - dst + 4, dst_len - 4, + dst + 4, dst_len - 4 - 7, src, src_len, c->zstd_params); if (ZSTD_isError(len)) diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c index aa10591..acf6003 100644 --- a/libbcachefs/debug.c +++ b/libbcachefs/debug.c @@ -54,7 +54,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) v->written = 0; v->c.level = b->c.level; v->c.btree_id = b->c.btree_id; - bch2_btree_keys_init(v, &c->expensive_debug_checks); + bch2_btree_keys_init(v); if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick) <= 0) @@ -79,7 +79,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) memcpy(n_ondisk, n_sorted, btree_bytes(c)); - if (bch2_btree_node_read_done(c, v, false)) + if (bch2_btree_node_read_done(c, ca, v, false)) goto out; n_sorted = c->verify_data->data; @@ -222,7 +222,9 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, bch2_trans_init(&trans, i->c, 0, 0); - iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH); + iter = bch2_trans_get_iter(&trans, i->id, i->from, + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); k = bch2_btree_iter_peek(iter); while (k.k && !(err = bkey_err(k))) { @@ -242,6 +244,8 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, if (!i->size) break; } + bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); return err < 0 ? err : i->ret; @@ -271,7 +275,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, if (err) return err; - if (!i->size || !bkey_cmp(POS_MAX, i->from)) + if (!i->size || !bpos_cmp(POS_MAX, i->from)) return i->ret; bch2_trans_init(&trans, i->c, 0, 0); @@ -287,13 +291,15 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, * can't easily correctly restart a btree node traversal across * all nodes, meh */ - i->from = bkey_cmp(POS_MAX, b->key.k.p) - ? bkey_successor(b->key.k.p) + i->from = bpos_cmp(POS_MAX, b->key.k.p) + ? bpos_successor(b->key.k.p) : b->key.k.p; if (!i->size) break; } + bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); return err < 0 ? 
err : i->ret; @@ -352,7 +358,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, if (err) break; - bch2_btree_iter_next(iter); + bch2_btree_iter_advance(iter); i->from = iter->pos; err = flush_buf(i); diff --git a/libbcachefs/debug.h b/libbcachefs/debug.h index 56c2d1a..7ac1615 100644 --- a/libbcachefs/debug.h +++ b/libbcachefs/debug.h @@ -8,44 +8,15 @@ struct bio; struct btree; struct bch_fs; -#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name; -BCH_DEBUG_PARAMS() -#undef BCH_DEBUG_PARAM - -#define BCH_DEBUG_PARAM(name, description) \ - static inline bool name(struct bch_fs *c) \ - { return bch2_##name || c->name; } -BCH_DEBUG_PARAMS_ALWAYS() -#undef BCH_DEBUG_PARAM - #ifdef CONFIG_BCACHEFS_DEBUG - -#define BCH_DEBUG_PARAM(name, description) \ - static inline bool name(struct bch_fs *c) \ - { return bch2_##name || c->name; } -BCH_DEBUG_PARAMS_DEBUG() -#undef BCH_DEBUG_PARAM - void __bch2_btree_verify(struct bch_fs *, struct btree *); - -#define bypass_torture_test(d) ((d)->bypass_torture_test) - -#else /* DEBUG */ - -#define BCH_DEBUG_PARAM(name, description) \ - static inline bool name(struct bch_fs *c) { return false; } -BCH_DEBUG_PARAMS_DEBUG() -#undef BCH_DEBUG_PARAM - +#else static inline void __bch2_btree_verify(struct bch_fs *c, struct btree *b) {} - -#define bypass_torture_test(d) 0 - #endif static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b) { - if (verify_btree_ondisk(c)) + if (bch2_verify_btree_ondisk) __bch2_btree_verify(c, b); } diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c index f34bfda..cf4ce2e 100644 --- a/libbcachefs/dirent.c +++ b/libbcachefs/dirent.c @@ -64,7 +64,7 @@ static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) } const struct bch_hash_desc bch2_dirent_hash_desc = { - .btree_id = BTREE_ID_DIRENTS, + .btree_id = BTREE_ID_dirents, .key_type = KEY_TYPE_dirent, .hash_key = dirent_hash_key, .hash_bkey = dirent_hash_bkey, @@ -141,7 +141,7 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, int bch2_dirent_create(struct btree_trans *trans, u64 dir_inum, const struct bch_hash_info *hash_info, u8 type, const struct qstr *name, u64 dst_inum, - int flags) + u64 *dir_offset, int flags) { struct bkey_i_dirent *dirent; int ret; @@ -151,8 +151,11 @@ int bch2_dirent_create(struct btree_trans *trans, if (ret) return ret; - return bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, - dir_inum, &dirent->k_i, flags); + ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, + dir_inum, &dirent->k_i, flags); + *dir_offset = dirent->k.p.offset; + + return ret; } static void dirent_copy_target(struct bkey_i_dirent *dst, @@ -165,8 +168,8 @@ static void dirent_copy_target(struct bkey_i_dirent *dst, int bch2_dirent_rename(struct btree_trans *trans, u64 src_dir, struct bch_hash_info *src_hash, u64 dst_dir, struct bch_hash_info *dst_hash, - const struct qstr *src_name, u64 *src_inum, - const struct qstr *dst_name, u64 *dst_inum, + const struct qstr *src_name, u64 *src_inum, u64 *src_offset, + const struct qstr *dst_name, u64 *dst_inum, u64 *dst_offset, enum bch_rename_mode mode) { struct btree_iter *src_iter = NULL, *dst_iter = NULL; @@ -255,14 +258,14 @@ int bch2_dirent_rename(struct btree_trans *trans, new_dst->k.p = src_iter->pos; bch2_trans_update(trans, src_iter, &new_dst->k_i, 0); - goto out; + goto out_set_offset; } else { /* If we're overwriting, we can't insert new_dst * at a different slot because it has to * overwrite old_dst - just make sure to use a * 
whiteout when deleting src: */ - new_src->k.type = KEY_TYPE_whiteout; + new_src->k.type = KEY_TYPE_hash_whiteout; } } else { /* Check if we need a whiteout to delete src: */ @@ -272,12 +275,15 @@ int bch2_dirent_rename(struct btree_trans *trans, goto out; if (ret) - new_src->k.type = KEY_TYPE_whiteout; + new_src->k.type = KEY_TYPE_hash_whiteout; } } bch2_trans_update(trans, src_iter, &new_src->k_i, 0); bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0); +out_set_offset: + *src_offset = new_src->k.p.offset; + *dst_offset = new_dst->k.p.offset; out: bch2_trans_iter_put(trans, src_iter); bch2_trans_iter_put(trans, dst_iter); @@ -321,6 +327,7 @@ u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, k = bch2_btree_iter_peek_slot(iter); inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); + bch2_trans_iter_put(&trans, iter); out: bch2_trans_exit(&trans); return inum; @@ -332,7 +339,7 @@ int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) struct bkey_s_c k; int ret; - for_each_btree_key(trans, iter, BTREE_ID_DIRENTS, + for_each_btree_key(trans, iter, BTREE_ID_dirents, POS(dir_inum, 0), 0, k, ret) { if (k.k->p.inode > dir_inum) break; @@ -357,7 +364,7 @@ int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, + for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS(inum, ctx->pos), 0, k, ret) { if (k.k->p.inode > inum) break; @@ -379,6 +386,8 @@ int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) break; ctx->pos = dirent.k->p.offset + 1; } + bch2_trans_iter_put(&trans, iter); + ret = bch2_trans_exit(&trans) ?: ret; return ret; diff --git a/libbcachefs/dirent.h b/libbcachefs/dirent.h index 3476937..e1d8ce3 100644 --- a/libbcachefs/dirent.h +++ b/libbcachefs/dirent.h @@ -31,7 +31,7 @@ static inline unsigned dirent_val_u64s(unsigned len) int bch2_dirent_create(struct btree_trans *, u64, const struct bch_hash_info *, u8, - const struct qstr *, u64, int); + const struct qstr *, u64, u64 *, int); int bch2_dirent_delete_at(struct btree_trans *, const struct bch_hash_info *, @@ -46,8 +46,8 @@ enum bch_rename_mode { int bch2_dirent_rename(struct btree_trans *, u64, struct bch_hash_info *, u64, struct bch_hash_info *, - const struct qstr *, u64 *, - const struct qstr *, u64 *, + const struct qstr *, u64 *, u64 *, + const struct qstr *, u64 *, u64 *, enum bch_rename_mode); struct btree_iter * diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index e4a4805..f712f68 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -4,7 +4,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" -#include "bkey_on_stack.h" +#include "bkey_buf.h" #include "bset.h" #include "btree_gc.h" #include "btree_update.h" @@ -105,6 +105,9 @@ const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k) { const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + if (!bkey_cmp(k.k->p, POS_MIN)) + return "stripe at pos 0"; + if (k.k->p.inode) return "invalid stripe key"; @@ -138,44 +141,19 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, stripe_blockcount_get(s, i)); } -static int ptr_matches_stripe(struct bch_fs *c, - struct bch_stripe *v, - const struct bch_extent_ptr *ptr) +/* returns blocknr in stripe that we matched: */ +static int bkey_matches_stripe(struct bch_stripe *s, + struct bkey_s_c k) { - unsigned i; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + unsigned i, nr_data = s->nr_blocks - s->nr_redundant; - for (i = 0; i < 
v->nr_blocks - v->nr_redundant; i++) { - const struct bch_extent_ptr *ptr2 = v->ptrs + i; - - if (ptr->dev == ptr2->dev && - ptr->gen == ptr2->gen && - ptr->offset >= ptr2->offset && - ptr->offset < ptr2->offset + le16_to_cpu(v->sectors)) - return i; - } - - return -1; -} - -static int extent_matches_stripe(struct bch_fs *c, - struct bch_stripe *v, - struct bkey_s_c k) -{ - - switch (k.k->type) { - case KEY_TYPE_extent: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const struct bch_extent_ptr *ptr; - int idx; - - extent_for_each_ptr(e, ptr) { - idx = ptr_matches_stripe(c, v, ptr); - if (idx >= 0) - return idx; - } - break; - } - } + bkey_for_each_ptr(ptrs, ptr) + for (i = 0; i < nr_data; i++) + if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr, + le16_to_cpu(s->sectors))) + return i; return -1; } @@ -200,46 +178,95 @@ static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) return false; } +/* Stripe bufs: */ + +static void ec_stripe_buf_exit(struct ec_stripe_buf *buf) +{ + unsigned i; + + for (i = 0; i < buf->key.v.nr_blocks; i++) { + kvpfree(buf->data[i], buf->size << 9); + buf->data[i] = NULL; + } +} + +static int ec_stripe_buf_init(struct ec_stripe_buf *buf, + unsigned offset, unsigned size) +{ + struct bch_stripe *v = &buf->key.v; + unsigned csum_granularity = 1U << v->csum_granularity_bits; + unsigned end = offset + size; + unsigned i; + + BUG_ON(end > le16_to_cpu(v->sectors)); + + offset = round_down(offset, csum_granularity); + end = min_t(unsigned, le16_to_cpu(v->sectors), + round_up(end, csum_granularity)); + + buf->offset = offset; + buf->size = end - offset; + + memset(buf->valid, 0xFF, sizeof(buf->valid)); + + for (i = 0; i < buf->key.v.nr_blocks; i++) { + buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL); + if (!buf->data[i]) + goto err; + } + + return 0; +err: + ec_stripe_buf_exit(buf); + return -ENOMEM; +} + /* Checksumming: */ -static void ec_generate_checksums(struct ec_stripe_buf *buf) +static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf, + unsigned block, unsigned offset) { struct bch_stripe *v = &buf->key.v; unsigned csum_granularity = 1 << v->csum_granularity_bits; - unsigned csums_per_device = stripe_csums_per_device(v); - unsigned csum_bytes = bch_crc_bytes[v->csum_type]; - unsigned i, j; + unsigned end = buf->offset + buf->size; + unsigned len = min(csum_granularity, end - offset); + + BUG_ON(offset >= end); + BUG_ON(offset < buf->offset); + BUG_ON(offset & (csum_granularity - 1)); + BUG_ON(offset + len != le16_to_cpu(v->sectors) && + (len & (csum_granularity - 1))); + + return bch2_checksum(NULL, v->csum_type, + null_nonce(), + buf->data[block] + ((offset - buf->offset) << 9), + len << 9); +} - if (!csum_bytes) +static void ec_generate_checksums(struct ec_stripe_buf *buf) +{ + struct bch_stripe *v = &buf->key.v; + unsigned i, j, csums_per_device = stripe_csums_per_device(v); + + if (!v->csum_type) return; BUG_ON(buf->offset); BUG_ON(buf->size != le16_to_cpu(v->sectors)); - for (i = 0; i < v->nr_blocks; i++) { - for (j = 0; j < csums_per_device; j++) { - unsigned offset = j << v->csum_granularity_bits; - unsigned len = min(csum_granularity, buf->size - offset); - - struct bch_csum csum = - bch2_checksum(NULL, v->csum_type, - null_nonce(), - buf->data[i] + (offset << 9), - len << 9); - - memcpy(stripe_csum(v, i, j), &csum, csum_bytes); - } - } + for (i = 0; i < v->nr_blocks; i++) + for (j = 0; j < csums_per_device; j++) + stripe_csum_set(v, i, j, + ec_block_checksum(buf, i, j << v->csum_granularity_bits)); } static void 
ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) { struct bch_stripe *v = &buf->key.v; unsigned csum_granularity = 1 << v->csum_granularity_bits; - unsigned csum_bytes = bch_crc_bytes[v->csum_type]; unsigned i; - if (!csum_bytes) + if (!v->csum_type) return; for (i = 0; i < v->nr_blocks; i++) { @@ -252,21 +279,18 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) while (offset < end) { unsigned j = offset >> v->csum_granularity_bits; unsigned len = min(csum_granularity, end - offset); - struct bch_csum csum; + struct bch_csum want = stripe_csum_get(v, i, j); + struct bch_csum got = ec_block_checksum(buf, i, offset); - BUG_ON(offset & (csum_granularity - 1)); - BUG_ON(offset + len != le16_to_cpu(v->sectors) && - ((offset + len) & (csum_granularity - 1))); + if (bch2_crc_cmp(want, got)) { + char buf2[200]; - csum = bch2_checksum(NULL, v->csum_type, - null_nonce(), - buf->data[i] + ((offset - buf->offset) << 9), - len << 9); + bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&buf->key.k_i)); - if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) { - __bcache_io_error(c, - "checksum error while doing reconstruct read (%u:%u)", - i, j); + bch_err_ratelimited(c, + "stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s", + (void *) _RET_IP_, i, j, v->csum_type, + want.lo, got.lo, buf2); clear_bit(i, buf->valid); break; } @@ -287,25 +311,21 @@ static void ec_generate_ec(struct ec_stripe_buf *buf) raid_gen(nr_data, v->nr_redundant, bytes, buf->data); } -static unsigned __ec_nr_failed(struct ec_stripe_buf *buf, unsigned nr) -{ - return nr - bitmap_weight(buf->valid, nr); -} - static unsigned ec_nr_failed(struct ec_stripe_buf *buf) { - return __ec_nr_failed(buf, buf->key.v.nr_blocks); + return buf->key.v.nr_blocks - + bitmap_weight(buf->valid, buf->key.v.nr_blocks); } static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) { struct bch_stripe *v = &buf->key.v; - unsigned i, failed[EC_STRIPE_MAX], nr_failed = 0; + unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0; unsigned nr_data = v->nr_blocks - v->nr_redundant; unsigned bytes = buf->size << 9; if (ec_nr_failed(buf) > v->nr_redundant) { - __bcache_io_error(c, + bch_err_ratelimited(c, "error doing reconstruct read: unable to read enough blocks"); return -1; } @@ -323,14 +343,23 @@ static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) static void ec_block_endio(struct bio *bio) { struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio); + struct bch_stripe *v = &ec_bio->buf->key.v; + struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx]; struct bch_dev *ca = ec_bio->ca; struct closure *cl = bio->bi_private; - if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s: %s", + if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s error: %s", bio_data_dir(bio) ? "write" : "read", bch2_blk_status_to_str(bio->bi_status))) clear_bit(ec_bio->idx, ec_bio->buf->valid); + if (ptr_stale(ca, ptr)) { + bch_err_ratelimited(ca->fs, + "error %s stripe: stale pointer after io", + bio_data_dir(bio) == READ ? "reading from" : "writing to"); + clear_bit(ec_bio->idx, ec_bio->buf->valid); + } + bio_put(&ec_bio->bio); percpu_ref_put(&ca->io_ref); closure_put(cl); @@ -347,6 +376,14 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, ? BCH_DATA_user : BCH_DATA_parity; + if (ptr_stale(ca, ptr)) { + bch_err_ratelimited(c, + "error %s stripe: stale pointer", + rw == READ ? 
"reading from" : "writing to"); + clear_bit(idx, buf->valid); + return; + } + if (!bch2_dev_get_ioref(ca, rw)) { clear_bit(idx, buf->valid); return; @@ -389,92 +426,82 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, percpu_ref_put(&ca->io_ref); } -/* recovery read path: */ -int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) +static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe) { struct btree_trans trans; struct btree_iter *iter; + struct bkey_s_c k; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS(0, idx), BTREE_ITER_SLOTS); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; + if (k.k->type != KEY_TYPE_stripe) { + ret = -ENOENT; + goto err; + } + bkey_reassemble(&stripe->key.k_i, k); +err: + bch2_trans_exit(&trans); + return ret; +} + +/* recovery read path: */ +int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) +{ struct ec_stripe_buf *buf; struct closure cl; - struct bkey_s_c k; struct bch_stripe *v; - unsigned stripe_idx; - unsigned offset, end; - unsigned i, nr_data, csum_granularity; - int ret = 0, idx; + unsigned i, offset; + int ret = 0; closure_init_stack(&cl); BUG_ON(!rbio->pick.has_ec); - stripe_idx = rbio->pick.ec.idx; - buf = kzalloc(sizeof(*buf), GFP_NOIO); if (!buf) return -ENOMEM; - bch2_trans_init(&trans, c, 0, 0); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, - POS(0, stripe_idx), - BTREE_ITER_SLOTS); - k = bch2_btree_iter_peek_slot(iter); - if (bkey_err(k) || k.k->type != KEY_TYPE_stripe) { - __bcache_io_error(c, - "error doing reconstruct read: stripe not found"); + ret = get_stripe_key(c, rbio->pick.ec.idx, buf); + if (ret) { + bch_err_ratelimited(c, + "error doing reconstruct read: error %i looking up stripe", ret); kfree(buf); - return bch2_trans_exit(&trans) ?: -EIO; + return -EIO; } - bkey_reassemble(&buf->key.k_i, k); - bch2_trans_exit(&trans); - v = &buf->key.v; - nr_data = v->nr_blocks - v->nr_redundant; - - idx = ptr_matches_stripe(c, v, &rbio->pick.ptr); - BUG_ON(idx < 0); - - csum_granularity = 1U << v->csum_granularity_bits; - - offset = rbio->bio.bi_iter.bi_sector - v->ptrs[idx].offset; - end = offset + bio_sectors(&rbio->bio); - - BUG_ON(end > le16_to_cpu(v->sectors)); - - buf->offset = round_down(offset, csum_granularity); - buf->size = min_t(unsigned, le16_to_cpu(v->sectors), - round_up(end, csum_granularity)) - buf->offset; - - for (i = 0; i < v->nr_blocks; i++) { - buf->data[i] = kmalloc(buf->size << 9, GFP_NOIO); - if (!buf->data[i]) { - ret = -ENOMEM; - goto err; - } + if (!bch2_ptr_matches_stripe(v, rbio->pick)) { + bch_err_ratelimited(c, + "error doing reconstruct read: pointer doesn't match stripe"); + ret = -EIO; + goto err; } - memset(buf->valid, 0xFF, sizeof(buf->valid)); - - for (i = 0; i < v->nr_blocks; i++) { - struct bch_extent_ptr *ptr = v->ptrs + i; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset; + if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) { + bch_err_ratelimited(c, + "error doing reconstruct read: read is bigger than stripe"); + ret = -EIO; + goto err; + } - if (ptr_stale(ca, ptr)) { - __bcache_io_error(c, - "error doing reconstruct read: stale pointer"); - clear_bit(i, buf->valid); - continue; - } + ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio)); + if (ret) + goto err; + for (i = 0; i < v->nr_blocks; i++) ec_block_io(c, buf, 
REQ_OP_READ, i, &cl); - } closure_sync(&cl); if (ec_nr_failed(buf) > v->nr_redundant) { - __bcache_io_error(c, + bch_err_ratelimited(c, "error doing reconstruct read: unable to read enough blocks"); ret = -EIO; goto err; @@ -487,10 +514,9 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) goto err; memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter, - buf->data[idx] + ((offset - buf->offset) << 9)); + buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9)); err: - for (i = 0; i < v->nr_blocks; i++) - kfree(buf->data[i]); + ec_stripe_buf_exit(buf); kfree(buf); return ret; } @@ -643,8 +669,7 @@ void bch2_stripes_heap_update(struct bch_fs *c, static int ec_stripe_delete(struct bch_fs *c, size_t idx) { - //pr_info("deleting stripe %zu", idx); - return bch2_btree_delete_range(c, BTREE_ID_EC, + return bch2_btree_delete_range(c, BTREE_ID_stripes, POS(0, idx), POS(0, idx + 1), NULL); @@ -675,24 +700,25 @@ static void ec_stripe_delete_work(struct work_struct *work) /* stripe creation: */ static int ec_stripe_bkey_insert(struct bch_fs *c, - struct ec_stripe_new *s, - struct bkey_i_stripe *stripe) + struct bkey_i_stripe *stripe, + struct disk_reservation *res) { struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - struct bpos start_pos = POS(0, c->ec_stripe_hint); + struct bpos min_pos = POS(0, 1); + struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); int ret; bch2_trans_init(&trans, c, 0, 0); retry: bch2_trans_begin(&trans); - for_each_btree_key(&trans, iter, BTREE_ID_EC, start_pos, + for_each_btree_key(&trans, iter, BTREE_ID_stripes, start_pos, BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { if (start_pos.offset) { - start_pos = POS_MIN; + start_pos = min_pos; bch2_btree_iter_set_pos(iter, start_pos); continue; } @@ -717,7 +743,7 @@ found_slot: bch2_trans_update(&trans, iter, &stripe->k_i, 0); - ret = bch2_trans_commit(&trans, &s->res, NULL, + ret = bch2_trans_commit(&trans, res, NULL, BTREE_INSERT_NOFAIL); err: bch2_trans_iter_put(&trans, iter); @@ -731,6 +757,46 @@ err: return ret; } +static int ec_stripe_bkey_update(struct btree_trans *trans, + struct bkey_i_stripe *new) +{ + struct btree_iter *iter; + struct bkey_s_c k; + const struct bch_stripe *existing; + unsigned i; + int ret; + + iter = bch2_trans_get_iter(trans, BTREE_ID_stripes, + new->k.p, BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (!k.k || k.k->type != KEY_TYPE_stripe) { + bch_err(trans->c, "error updating stripe: not found"); + ret = -ENOENT; + goto err; + } + + existing = bkey_s_c_to_stripe(k).v; + + if (existing->nr_blocks != new->v.nr_blocks) { + bch_err(trans->c, "error updating stripe: nr_blocks does not match"); + ret = -EINVAL; + goto err; + } + + for (i = 0; i < new->v.nr_blocks; i++) + stripe_blockcount_set(&new->v, i, + stripe_blockcount_get(existing, i)); + + bch2_trans_update(trans, iter, &new->k_i, 0); +err: + bch2_trans_iter_put(trans, iter); + return ret; +} + static void extent_stripe_ptr_add(struct bkey_s_extent e, struct ec_stripe_buf *s, struct bch_extent_ptr *ptr, @@ -745,6 +811,7 @@ static void extent_stripe_ptr_add(struct bkey_s_extent e, *dst = (struct bch_extent_stripe_ptr) { .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, .block = block, + .redundancy = s->key.v.nr_redundant, .idx = s->key.k.p.offset, }; } @@ -757,15 +824,15 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, struct btree_iter *iter; struct bkey_s_c k; struct bkey_s_extent e; - 
struct bkey_on_stack sk; - int ret = 0, dev, idx; + struct bkey_buf sk; + int ret = 0, dev, block; - bkey_on_stack_init(&sk); + bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); /* XXX this doesn't support the reflink btree */ - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, bkey_start_pos(pos), BTREE_ITER_INTENT); @@ -775,41 +842,41 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, struct bch_extent_ptr *ptr, *ec_ptr = NULL; if (extent_has_stripe_ptr(k, s->key.k.p.offset)) { - bch2_btree_iter_next(iter); + bch2_btree_iter_advance(iter); continue; } - idx = extent_matches_stripe(c, &s->key.v, k); - if (idx < 0) { - bch2_btree_iter_next(iter); + block = bkey_matches_stripe(&s->key.v, k); + if (block < 0) { + bch2_btree_iter_advance(iter); continue; } - dev = s->key.v.ptrs[idx].dev; + dev = s->key.v.ptrs[block].dev; - bkey_on_stack_reassemble(&sk, c, k); + bch2_bkey_buf_reassemble(&sk, c, k); e = bkey_i_to_s_extent(sk.k); bch2_bkey_drop_ptrs(e.s, ptr, ptr->dev != dev); ec_ptr = (void *) bch2_bkey_has_device(e.s_c, dev); BUG_ON(!ec_ptr); - extent_stripe_ptr_add(e, s, ec_ptr, idx); + extent_stripe_ptr_add(e, s, ec_ptr, block); bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); bch2_trans_update(&trans, iter, sk.k, 0); ret = bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE); + BTREE_INSERT_NOFAIL); if (ret == -EINTR) ret = 0; if (ret) break; } + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); - bkey_on_stack_exit(&sk, c); + bch2_bkey_buf_exit(&sk, c); return ret; } @@ -823,14 +890,13 @@ static void ec_stripe_create(struct ec_stripe_new *s) struct open_bucket *ob; struct bkey_i *k; struct stripe *m; - struct bch_stripe *v = &s->stripe.key.v; + struct bch_stripe *v = &s->new_stripe.key.v; unsigned i, nr_data = v->nr_blocks - v->nr_redundant; - struct closure cl; int ret; BUG_ON(s->h->s == s); - closure_init_stack(&cl); + closure_sync(&s->iodone); if (s->err) { if (s->err != -EROFS) @@ -838,73 +904,86 @@ static void ec_stripe_create(struct ec_stripe_new *s) goto err; } + if (s->have_existing_stripe) { + ec_validate_checksums(c, &s->existing_stripe); + + if (ec_do_recov(c, &s->existing_stripe)) { + bch_err(c, "error creating stripe: error reading existing stripe"); + goto err; + } + + for (i = 0; i < nr_data; i++) + if (stripe_blockcount_get(&s->existing_stripe.key.v, i)) + swap(s->new_stripe.data[i], + s->existing_stripe.data[i]); + + ec_stripe_buf_exit(&s->existing_stripe); + } + BUG_ON(!s->allocated); if (!percpu_ref_tryget(&c->writes)) goto err; - BUG_ON(bitmap_weight(s->blocks_allocated, - s->blocks.nr) != s->blocks.nr); - - ec_generate_ec(&s->stripe); + ec_generate_ec(&s->new_stripe); - ec_generate_checksums(&s->stripe); + ec_generate_checksums(&s->new_stripe); /* write p/q: */ for (i = nr_data; i < v->nr_blocks; i++) - ec_block_io(c, &s->stripe, REQ_OP_WRITE, i, &cl); + ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone); + closure_sync(&s->iodone); - closure_sync(&cl); - - for (i = nr_data; i < v->nr_blocks; i++) - if (!test_bit(i, s->stripe.valid)) { - bch_err(c, "error creating stripe: error writing redundancy buckets"); - goto err_put_writes; - } + if (ec_nr_failed(&s->new_stripe)) { + bch_err(c, "error creating stripe: error writing redundancy buckets"); + goto err_put_writes; + } - ret = s->existing_stripe - ? 
bch2_btree_insert(c, BTREE_ID_EC, &s->stripe.key.k_i, - &s->res, NULL, BTREE_INSERT_NOFAIL) - : ec_stripe_bkey_insert(c, s, &s->stripe.key); + ret = s->have_existing_stripe + ? bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL, + ec_stripe_bkey_update(&trans, &s->new_stripe.key)) + : ec_stripe_bkey_insert(c, &s->new_stripe.key, &s->res); if (ret) { bch_err(c, "error creating stripe: error creating stripe key"); goto err_put_writes; } for_each_keylist_key(&s->keys, k) { - ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k); + ret = ec_stripe_update_ptrs(c, &s->new_stripe, &k->k); if (ret) { - bch_err(c, "error creating stripe: error updating pointers"); + bch_err(c, "error creating stripe: error %i updating pointers", ret); break; } } spin_lock(&c->ec_stripes_heap_lock); - m = genradix_ptr(&c->stripes[0], s->stripe.key.k.p.offset); -#if 0 - pr_info("created a %s stripe %llu", - s->existing_stripe ? "existing" : "new", - s->stripe.key.k.p.offset); -#endif + m = genradix_ptr(&c->stripes[0], s->new_stripe.key.k.p.offset); + BUG_ON(m->on_heap); - bch2_stripes_heap_insert(c, m, s->stripe.key.k.p.offset); + bch2_stripes_heap_insert(c, m, s->new_stripe.key.k.p.offset); spin_unlock(&c->ec_stripes_heap_lock); err_put_writes: percpu_ref_put(&c->writes); err: bch2_disk_reservation_put(c, &s->res); - open_bucket_for_each(c, &s->blocks, ob, i) { - ob->ec = NULL; - __bch2_open_bucket_put(c, ob); - } - - bch2_open_buckets_put(c, &s->parity); + for (i = 0; i < v->nr_blocks; i++) + if (s->blocks[i]) { + ob = c->open_buckets + s->blocks[i]; + + if (i < nr_data) { + ob->ec = NULL; + __bch2_open_bucket_put(c, ob); + } else { + bch2_open_bucket_put(c, ob); + } + } bch2_keylist_free(&s->keys, s->inline_keys); - for (i = 0; i < s->stripe.key.v.nr_blocks; i++) - kvpfree(s->stripe.data[i], s->stripe.size << 9); + ec_stripe_buf_exit(&s->existing_stripe); + ec_stripe_buf_exit(&s->new_stripe); + closure_debug_destroy(&s->iodone); kfree(s); } @@ -981,7 +1060,7 @@ void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) ca = bch_dev_bkey_exists(c, ob->ptr.dev); offset = ca->mi.bucket_size - ob->sectors_free; - return ob->ec->stripe.data[ob->ec_idx] + (offset << 9); + return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9); } void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp, @@ -993,8 +1072,6 @@ void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp, if (!ob) return; - //pr_info("adding backpointer at %llu:%llu", pos.inode, pos.offset); - ec = ob->ec; mutex_lock(&ec->lock); @@ -1088,7 +1165,6 @@ static void ec_stripe_key_init(struct bch_fs *c, static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) { struct ec_stripe_new *s; - unsigned i; lockdep_assert_held(&h->lock); @@ -1097,41 +1173,27 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) return -ENOMEM; mutex_init(&s->lock); + closure_init(&s->iodone, NULL); atomic_set(&s->pin, 1); s->c = c; s->h = h; s->nr_data = min_t(unsigned, h->nr_active_devs, - EC_STRIPE_MAX) - h->redundancy; + BCH_BKEY_PTRS_MAX) - h->redundancy; s->nr_parity = h->redundancy; bch2_keylist_init(&s->keys, s->inline_keys); - s->stripe.offset = 0; - s->stripe.size = h->blocksize; - memset(s->stripe.valid, 0xFF, sizeof(s->stripe.valid)); - - ec_stripe_key_init(c, &s->stripe.key, s->nr_data, + ec_stripe_key_init(c, &s->new_stripe.key, s->nr_data, s->nr_parity, h->blocksize); - for (i = 0; i < s->stripe.key.v.nr_blocks; i++) { - s->stripe.data[i] = kvpmalloc(s->stripe.size << 9, GFP_KERNEL); - if 
(!s->stripe.data[i]) - goto err; - } - h->s = s; - return 0; -err: - for (i = 0; i < s->stripe.key.v.nr_blocks; i++) - kvpfree(s->stripe.data[i], s->stripe.size << 9); - kfree(s); - return -ENOMEM; } static struct ec_stripe_head * ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, - unsigned algo, unsigned redundancy) + unsigned algo, unsigned redundancy, + bool copygc) { struct ec_stripe_head *h; struct bch_dev *ca; @@ -1147,6 +1209,7 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, h->target = target; h->algo = algo; h->redundancy = redundancy; + h->copygc = copygc; rcu_read_lock(); h->devs = target_rw_devs(c, BCH_DATA_user, target); @@ -1171,16 +1234,17 @@ void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) if (h->s && h->s->allocated && bitmap_weight(h->s->blocks_allocated, - h->s->blocks.nr) == h->s->blocks.nr) + h->s->nr_data) == h->s->nr_data) ec_stripe_set_pending(c, h); mutex_unlock(&h->lock); } struct ec_stripe_head *__bch2_ec_stripe_head_get(struct bch_fs *c, - unsigned target, - unsigned algo, - unsigned redundancy) + unsigned target, + unsigned algo, + unsigned redundancy, + bool copygc) { struct ec_stripe_head *h; @@ -1191,76 +1255,98 @@ struct ec_stripe_head *__bch2_ec_stripe_head_get(struct bch_fs *c, list_for_each_entry(h, &c->ec_stripe_head_list, list) if (h->target == target && h->algo == algo && - h->redundancy == redundancy) { + h->redundancy == redundancy && + h->copygc == copygc) { mutex_lock(&h->lock); goto found; } - h = ec_new_stripe_head_alloc(c, target, algo, redundancy); + h = ec_new_stripe_head_alloc(c, target, algo, redundancy, copygc); found: mutex_unlock(&c->ec_stripe_head_lock); return h; } -/* - * XXX: use a higher watermark for allocating open buckets here: - */ -static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h) +static enum bucket_alloc_ret +new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, + struct closure *cl) { - struct bch_devs_mask devs; + struct bch_devs_mask devs = h->devs; struct open_bucket *ob; - unsigned i, nr_have, nr_data = - min_t(unsigned, h->nr_active_devs, - EC_STRIPE_MAX) - h->redundancy; + struct open_buckets buckets; + unsigned i, j, nr_have_parity = 0, nr_have_data = 0; bool have_cache = true; - int ret = 0; - - devs = h->devs; - - for_each_set_bit(i, h->s->blocks_allocated, EC_STRIPE_MAX) { - __clear_bit(h->s->stripe.key.v.ptrs[i].dev, devs.d); - --nr_data; + enum bucket_alloc_ret ret = ALLOC_SUCCESS; + + for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) { + if (test_bit(i, h->s->blocks_gotten)) { + __clear_bit(h->s->new_stripe.key.v.ptrs[i].dev, devs.d); + if (i < h->s->nr_data) + nr_have_data++; + else + nr_have_parity++; + } } - BUG_ON(h->s->blocks.nr > nr_data); - BUG_ON(h->s->parity.nr > h->redundancy); - - open_bucket_for_each(c, &h->s->parity, ob, i) - __clear_bit(ob->ptr.dev, devs.d); - open_bucket_for_each(c, &h->s->blocks, ob, i) - __clear_bit(ob->ptr.dev, devs.d); + BUG_ON(nr_have_data > h->s->nr_data); + BUG_ON(nr_have_parity > h->s->nr_parity); percpu_down_read(&c->mark_lock); rcu_read_lock(); - if (h->s->parity.nr < h->redundancy) { - nr_have = h->s->parity.nr; - - ret = bch2_bucket_alloc_set(c, &h->s->parity, + buckets.nr = 0; + if (nr_have_parity < h->s->nr_parity) { + ret = bch2_bucket_alloc_set(c, &buckets, &h->parity_stripe, &devs, - h->redundancy, - &nr_have, + h->s->nr_parity, + &nr_have_parity, &have_cache, - RESERVE_NONE, + h->copygc + ? 
RESERVE_MOVINGGC + : RESERVE_NONE, 0, - NULL); + cl); + + open_bucket_for_each(c, &buckets, ob, i) { + j = find_next_zero_bit(h->s->blocks_gotten, + h->s->nr_data + h->s->nr_parity, + h->s->nr_data); + BUG_ON(j >= h->s->nr_data + h->s->nr_parity); + + h->s->blocks[j] = buckets.v[i]; + h->s->new_stripe.key.v.ptrs[j] = ob->ptr; + __set_bit(j, h->s->blocks_gotten); + } + if (ret) goto err; } - if (h->s->blocks.nr < nr_data) { - nr_have = h->s->blocks.nr; - - ret = bch2_bucket_alloc_set(c, &h->s->blocks, + buckets.nr = 0; + if (nr_have_data < h->s->nr_data) { + ret = bch2_bucket_alloc_set(c, &buckets, &h->block_stripe, &devs, - nr_data, - &nr_have, + h->s->nr_data, + &nr_have_data, &have_cache, - RESERVE_NONE, + h->copygc + ? RESERVE_MOVINGGC + : RESERVE_NONE, 0, - NULL); + cl); + + open_bucket_for_each(c, &buckets, ob, i) { + j = find_next_zero_bit(h->s->blocks_gotten, + h->s->nr_data, 0); + BUG_ON(j >= h->s->nr_data); + + h->s->blocks[j] = buckets.v[i]; + h->s->new_stripe.key.v.ptrs[j] = ob->ptr; + __set_bit(j, h->s->blocks_gotten); + } + if (ret) goto err; } @@ -1272,53 +1358,101 @@ err: /* XXX: doesn't obey target: */ static s64 get_existing_stripe(struct bch_fs *c, - unsigned target, - unsigned algo, - unsigned redundancy) + struct ec_stripe_head *head) { ec_stripes_heap *h = &c->ec_stripes_heap; struct stripe *m; size_t heap_idx; u64 stripe_idx; + s64 ret = -1; if (may_create_new_stripe(c)) return -1; spin_lock(&c->ec_stripes_heap_lock); for (heap_idx = 0; heap_idx < h->used; heap_idx++) { + /* No blocks worth reusing, stripe will just be deleted: */ if (!h->data[heap_idx].blocks_nonempty) continue; stripe_idx = h->data[heap_idx].idx; m = genradix_ptr(&c->stripes[0], stripe_idx); - if (m->algorithm == algo && - m->nr_redundant == redundancy && + if (m->algorithm == head->algo && + m->nr_redundant == head->redundancy && + m->sectors == head->blocksize && m->blocks_nonempty < m->nr_blocks - m->nr_redundant) { bch2_stripes_heap_del(c, m, stripe_idx); - spin_unlock(&c->ec_stripes_heap_lock); - return stripe_idx; + ret = stripe_idx; + break; } } - spin_unlock(&c->ec_stripes_heap_lock); - return -1; + return ret; } -static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe) +static int __bch2_ec_stripe_head_reuse(struct bch_fs *c, + struct ec_stripe_head *h) { - struct btree_trans trans; - struct btree_iter *iter; - struct bkey_s_c k; + unsigned i; + s64 idx; int ret; - bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, idx), BTREE_ITER_SLOTS); - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (!ret) - bkey_reassemble(&stripe->key.k_i, k); - bch2_trans_exit(&trans); + idx = get_existing_stripe(c, h); + if (idx < 0) { + bch_err(c, "failed to find an existing stripe"); + return -ENOSPC; + } + + h->s->have_existing_stripe = true; + ret = get_stripe_key(c, idx, &h->s->existing_stripe); + if (ret) { + bch2_fs_fatal_error(c, "error reading stripe key: %i", ret); + return ret; + } + + if (ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize)) { + /* + * this is a problem: we have deleted from the + * stripes heap already + */ + BUG(); + } + + BUG_ON(h->s->existing_stripe.size != h->blocksize); + BUG_ON(h->s->existing_stripe.size != h->s->existing_stripe.key.v.sectors); + + for (i = 0; i < h->s->existing_stripe.key.v.nr_blocks; i++) { + if (stripe_blockcount_get(&h->s->existing_stripe.key.v, i)) { + __set_bit(i, h->s->blocks_gotten); + __set_bit(i, h->s->blocks_allocated); + } + + ec_block_io(c, 
&h->s->existing_stripe, READ, i, &h->s->iodone); + } + + bkey_copy(&h->s->new_stripe.key.k_i, + &h->s->existing_stripe.key.k_i); + + return 0; +} + +static int __bch2_ec_stripe_head_reserve(struct bch_fs *c, + struct ec_stripe_head *h) +{ + int ret; + + ret = bch2_disk_reservation_get(c, &h->s->res, + h->blocksize, + h->s->nr_parity, 0); + + if (ret) { + /* + * This means we need to wait for copygc to + * empty out buckets from existing stripes: + */ + bch_err(c, "failed to reserve stripe"); + } return ret; } @@ -1326,86 +1460,58 @@ static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *strip struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, unsigned target, unsigned algo, - unsigned redundancy) + unsigned redundancy, + bool copygc, + struct closure *cl) { - struct closure cl; struct ec_stripe_head *h; - struct open_bucket *ob; - unsigned i, data_idx = 0; - s64 idx; int ret; + bool needs_stripe_new; - closure_init_stack(&cl); - - h = __bch2_ec_stripe_head_get(c, target, algo, redundancy); - if (!h) - return NULL; - - if (!h->s && ec_new_stripe_alloc(c, h)) { - bch2_ec_stripe_head_put(c, h); + h = __bch2_ec_stripe_head_get(c, target, algo, redundancy, copygc); + if (!h) { + bch_err(c, "no stripe head"); return NULL; } - if (!h->s->allocated) { - if (!h->s->existing_stripe && - (idx = get_existing_stripe(c, target, algo, redundancy)) >= 0) { - //pr_info("got existing stripe %llu", idx); - - h->s->existing_stripe = true; - h->s->existing_stripe_idx = idx; - if (get_stripe_key(c, idx, &h->s->stripe)) { - /* btree error */ - BUG(); - } - - for (i = 0; i < h->s->stripe.key.v.nr_blocks; i++) - if (stripe_blockcount_get(&h->s->stripe.key.v, i)) { - __set_bit(i, h->s->blocks_allocated); - ec_block_io(c, &h->s->stripe, READ, i, &cl); - } - } - - if (!h->s->existing_stripe && - !h->s->res.sectors) { - ret = bch2_disk_reservation_get(c, &h->s->res, - h->blocksize, - h->s->nr_parity, 0); - if (ret) { - /* What should we do here? */ - bch_err(c, "unable to create new stripe: %i", ret); - bch2_ec_stripe_head_put(c, h); - h = NULL; - goto out; - - } - - } - - if (new_stripe_alloc_buckets(c, h)) { - bch2_ec_stripe_head_put(c, h); - h = NULL; - goto out; + needs_stripe_new = !h->s; + if (needs_stripe_new) { + if (ec_new_stripe_alloc(c, h)) { + ret = -ENOMEM; + bch_err(c, "failed to allocate new stripe"); + goto err; } - open_bucket_for_each(c, &h->s->blocks, ob, i) { - data_idx = find_next_zero_bit(h->s->blocks_allocated, - h->s->nr_data, data_idx); - BUG_ON(data_idx >= h->s->nr_data); + if (ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize)) + BUG(); + } - h->s->stripe.key.v.ptrs[data_idx] = ob->ptr; - h->s->data_block_idx[i] = data_idx; - data_idx++; - } + /* + * Try reserve a new stripe before reusing an + * existing stripe. This will prevent unnecessary + * read amplification during write oriented workloads. 
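The reworked stripe-head path above now tries to reserve space for a brand-new stripe first and only falls back to reusing a partially-empty existing stripe, which costs extra reads because the surviving blocks have to be read back in. What follows is a minimal, self-contained sketch of that ordering only; the demo_* names are hypothetical stand-ins, not the __bch2_ec_stripe_head_reserve()/__bch2_ec_stripe_head_reuse() helpers themselves.

/*
 * Illustrative sketch only -- not bcachefs code. demo_* names are
 * hypothetical stand-ins for the reserve/reuse helpers in the hunk above.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct demo_head {
	bool reserved_new;	/* got a disk reservation for a fresh stripe */
	bool reused_existing;	/* fell back to an existing stripe */
};

static int demo_reserve_new(struct demo_head *h, bool have_space)
{
	if (!have_space)
		return -ENOSPC;
	h->reserved_new = true;
	return 0;
}

static int demo_reuse_existing(struct demo_head *h)
{
	/* reuse implies reading the stripe's live blocks back in first */
	h->reused_existing = true;
	return 0;
}

static int demo_get_stripe(struct demo_head *h, bool have_space)
{
	int ret = demo_reserve_new(h, have_space);	/* preferred path */

	if (ret)
		ret = demo_reuse_existing(h);		/* fallback path */
	return ret;
}

int main(void)
{
	struct demo_head h = { 0 };

	demo_get_stripe(&h, false);
	printf("reserved_new=%d reused_existing=%d\n",
	       h.reserved_new, h.reused_existing);
	return 0;
}

Trying the reservation first is what the in-code comment means by avoiding read amplification for write-oriented workloads; reuse stays available as a fallback when the reservation fails.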
+ */ + ret = 0; + if (!h->s->allocated && !h->s->res.sectors && !h->s->have_existing_stripe) + ret = __bch2_ec_stripe_head_reserve(c, h); + if (ret && needs_stripe_new) + ret = __bch2_ec_stripe_head_reuse(c, h); + if (ret) + goto err; - open_bucket_for_each(c, &h->s->parity, ob, i) - h->s->stripe.key.v.ptrs[h->s->nr_data + i] = ob->ptr; + if (!h->s->allocated) { + ret = new_stripe_alloc_buckets(c, h, cl); + if (ret) + goto err; - //pr_info("new stripe, blocks_allocated %lx", h->s->blocks_allocated[0]); h->s->allocated = true; } -out: - closure_sync(&cl); + return h; + +err: + bch2_ec_stripe_head_put(c, h); + return ERR_PTR(-ret); } void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) @@ -1421,12 +1527,14 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) if (!h->s) goto unlock; - open_bucket_for_each(c, &h->s->blocks, ob, i) - if (ob->ptr.dev == ca->dev_idx) - goto found; - open_bucket_for_each(c, &h->s->parity, ob, i) + for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) { + if (!h->s->blocks[i]) + continue; + + ob = c->open_buckets + h->s->blocks[i]; if (ob->ptr.dev == ca->dev_idx) goto found; + } goto unlock; found: h->s->err = -EROFS; @@ -1437,13 +1545,23 @@ unlock: mutex_unlock(&c->ec_stripe_head_lock); } +void bch2_stripes_heap_start(struct bch_fs *c) +{ + struct genradix_iter iter; + struct stripe *m; + + genradix_for_each(&c->stripes[0], iter, m) + if (m->alive) + bch2_stripes_heap_insert(c, m, iter.pos); +} + static int __bch2_stripe_write_key(struct btree_trans *trans, struct btree_iter *iter, struct stripe *m, size_t idx, struct bkey_i_stripe *new_key) { - struct bch_fs *c = trans->c; + const struct bch_stripe *v; struct bkey_s_c k; unsigned i; int ret; @@ -1458,16 +1576,17 @@ static int __bch2_stripe_write_key(struct btree_trans *trans, if (k.k->type != KEY_TYPE_stripe) return -EIO; + v = bkey_s_c_to_stripe(k).v; + for (i = 0; i < v->nr_blocks; i++) + if (m->block_sectors[i] != stripe_blockcount_get(v, i)) + goto write; + return 0; +write: bkey_reassemble(&new_key->k_i, k); - spin_lock(&c->ec_stripes_heap_lock); - for (i = 0; i < new_key->v.nr_blocks; i++) stripe_blockcount_set(&new_key->v, i, m->block_sectors[i]); - m->dirty = false; - - spin_unlock(&c->ec_stripes_heap_lock); bch2_trans_update(trans, iter, &new_key->k_i, 0); return 0; @@ -1487,11 +1606,11 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags) bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN, + iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS_MIN, BTREE_ITER_SLOTS|BTREE_ITER_INTENT); genradix_for_each(&c->stripes[0], giter, m) { - if (!m->dirty) + if (!m->alive) continue; ret = __bch2_trans_do(&trans, NULL, NULL, @@ -1516,18 +1635,11 @@ static int bch2_stripes_read_fn(struct bch_fs *c, enum btree_id id, int ret = 0; if (k.k->type == KEY_TYPE_stripe) { - struct stripe *m; - ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: bch2_mark_key(c, k, 0, 0, NULL, 0, BTREE_TRIGGER_NOATOMIC); if (ret) return ret; - - spin_lock(&c->ec_stripes_heap_lock); - m = genradix_ptr(&c->stripes[0], k.k->p.offset); - bch2_stripes_heap_insert(c, m, k.k->p.offset); - spin_unlock(&c->ec_stripes_heap_lock); } return ret; @@ -1535,7 +1647,7 @@ static int bch2_stripes_read_fn(struct bch_fs *c, enum btree_id id, int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) { - int ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_EC, + int ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_stripes, NULL, 
bch2_stripes_read_fn); if (ret) bch_err(c, "error reading stripes: %i", ret); @@ -1552,12 +1664,13 @@ int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) int ret = 0; bch2_trans_init(&trans, c, 0, 0); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, U64_MAX), 0); + iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS(0, U64_MAX), 0); k = bch2_btree_iter_prev(iter); if (!IS_ERR_OR_NULL(k.k)) idx = k.k->p.offset + 1; + + bch2_trans_iter_put(&trans, iter); ret = bch2_trans_exit(&trans); if (ret) return ret; @@ -1586,7 +1699,7 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) size_t i; spin_lock(&c->ec_stripes_heap_lock); - for (i = 0; i < min(h->used, 20UL); i++) { + for (i = 0; i < min_t(size_t, h->used, 20); i++) { m = genradix_ptr(&c->stripes[0], h->data[i].idx); pr_buf(out, "%zu %u/%u+%u\n", h->data[i].idx, @@ -1608,19 +1721,17 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) h->target, h->algo, h->redundancy); if (h->s) - pr_buf(out, "\tpending: blocks %u allocated %u\n", - h->s->blocks.nr, + pr_buf(out, "\tpending: blocks %u+%u allocated %u\n", + h->s->nr_data, h->s->nr_parity, bitmap_weight(h->s->blocks_allocated, - h->s->blocks.nr)); + h->s->nr_data)); } mutex_unlock(&c->ec_stripe_head_lock); mutex_lock(&c->ec_stripe_new_lock); list_for_each_entry(s, &c->ec_stripe_new_list, list) { - pr_buf(out, "\tin flight: blocks %u allocated %u pin %u\n", - s->blocks.nr, - bitmap_weight(s->blocks_allocated, - s->blocks.nr), + pr_buf(out, "\tin flight: blocks %u+%u pin %u\n", + s->nr_data, s->nr_parity, atomic_read(&s->pin)); } mutex_unlock(&c->ec_stripe_new_lock); diff --git a/libbcachefs/ec.h b/libbcachefs/ec.h index 15f751f..744e51e 100644 --- a/libbcachefs/ec.h +++ b/libbcachefs/ec.h @@ -60,9 +60,66 @@ static inline unsigned stripe_val_u64s(const struct bch_stripe *s) } static inline void *stripe_csum(struct bch_stripe *s, - unsigned dev, unsigned csum_idx) + unsigned block, unsigned csum_idx) { - return (void *) s + stripe_csum_offset(s, dev, csum_idx); + EBUG_ON(block >= s->nr_blocks); + EBUG_ON(csum_idx >= stripe_csums_per_device(s)); + + return (void *) s + stripe_csum_offset(s, block, csum_idx); +} + +static inline struct bch_csum stripe_csum_get(struct bch_stripe *s, + unsigned block, unsigned csum_idx) +{ + struct bch_csum csum = { 0 }; + + memcpy(&csum, stripe_csum(s, block, csum_idx), bch_crc_bytes[s->csum_type]); + return csum; +} + +static inline void stripe_csum_set(struct bch_stripe *s, + unsigned block, unsigned csum_idx, + struct bch_csum csum) +{ + memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]); +} + +static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe_ptr, + const struct bch_extent_ptr *data_ptr, + unsigned sectors) +{ + return data_ptr->dev == stripe_ptr->dev && + data_ptr->gen == stripe_ptr->gen && + data_ptr->offset >= stripe_ptr->offset && + data_ptr->offset < stripe_ptr->offset + sectors; +} + +static inline bool bch2_ptr_matches_stripe(const struct bch_stripe *s, + struct extent_ptr_decoded p) +{ + unsigned nr_data = s->nr_blocks - s->nr_redundant; + + BUG_ON(!p.has_ec); + + if (p.ec.block >= nr_data) + return false; + + return __bch2_ptr_matches_stripe(&s->ptrs[p.ec.block], &p.ptr, + le16_to_cpu(s->sectors)); +} + +static inline bool bch2_ptr_matches_stripe_m(const struct stripe *m, + struct extent_ptr_decoded p) +{ + unsigned nr_data = m->nr_blocks - m->nr_redundant; + + BUG_ON(!p.has_ec); + + if (p.ec.block >= nr_data) + return false; + + return 
__bch2_ptr_matches_stripe(&m->ptrs[p.ec.block], &p.ptr, + m->sectors); } struct bch_read_bio; @@ -71,9 +128,9 @@ struct ec_stripe_buf { /* might not be buffering the entire stripe: */ unsigned offset; unsigned size; - unsigned long valid[BITS_TO_LONGS(EC_STRIPE_MAX)]; + unsigned long valid[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; - void *data[EC_STRIPE_MAX]; + void *data[BCH_BKEY_PTRS_MAX]; union { struct bkey_i_stripe key; @@ -88,6 +145,7 @@ struct ec_stripe_new { struct ec_stripe_head *h; struct mutex lock; struct list_head list; + struct closure iodone; /* counts in flight writes, stripe is created when pin == 0 */ atomic_t pin; @@ -98,20 +156,18 @@ struct ec_stripe_new { u8 nr_parity; bool allocated; bool pending; - bool existing_stripe; - u64 existing_stripe_idx; - - unsigned long blocks_allocated[BITS_TO_LONGS(EC_STRIPE_MAX)]; + bool have_existing_stripe; - struct open_buckets blocks; - u8 data_block_idx[EC_STRIPE_MAX]; - struct open_buckets parity; + unsigned long blocks_gotten[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; + unsigned long blocks_allocated[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; + open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX]; struct disk_reservation res; struct keylist keys; u64 inline_keys[BKEY_U64s * 8]; - struct ec_stripe_buf stripe; + struct ec_stripe_buf new_stripe; + struct ec_stripe_buf existing_stripe; }; struct ec_stripe_head { @@ -121,6 +177,7 @@ struct ec_stripe_head { unsigned target; unsigned algo; unsigned redundancy; + bool copygc; struct bch_devs_mask devs; unsigned nr_active_devs; @@ -145,8 +202,8 @@ void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *); -struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, unsigned, - unsigned, unsigned); +struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, + unsigned, unsigned, unsigned, bool, struct closure *); void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t); void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t); @@ -156,6 +213,8 @@ void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); void bch2_ec_flush_new_stripes(struct bch_fs *); +void bch2_stripes_heap_start(struct bch_fs *); + struct journal_keys; int bch2_stripes_read(struct bch_fs *, struct journal_keys *); int bch2_stripes_write(struct bch_fs *, unsigned); diff --git a/libbcachefs/ec_types.h b/libbcachefs/ec_types.h index e4d633f..3fc3122 100644 --- a/libbcachefs/ec_types.h +++ b/libbcachefs/ec_types.h @@ -4,11 +4,9 @@ #include -#define EC_STRIPE_MAX 16 - struct bch_replicas_padded { struct bch_replicas_entry e; - u8 pad[EC_STRIPE_MAX]; + u8 pad[BCH_BKEY_PTRS_MAX]; }; struct stripe { @@ -20,11 +18,11 @@ struct stripe { u8 nr_blocks; u8 nr_redundant; - unsigned alive:1; - unsigned dirty:1; + unsigned alive:1; /* does a corresponding key exist in stripes btree? 
*/ unsigned on_heap:1; u8 blocks_nonempty; - u16 block_sectors[EC_STRIPE_MAX]; + u16 block_sectors[BCH_BKEY_PTRS_MAX]; + struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX]; struct bch_replicas_padded r; }; diff --git a/libbcachefs/error.c b/libbcachefs/error.c index cd46706..a8ee1db 100644 --- a/libbcachefs/error.c +++ b/libbcachefs/error.c @@ -11,13 +11,13 @@ bool bch2_inconsistent_error(struct bch_fs *c) set_bit(BCH_FS_ERROR, &c->flags); switch (c->opts.errors) { - case BCH_ON_ERROR_CONTINUE: + case BCH_ON_ERROR_continue: return false; - case BCH_ON_ERROR_RO: + case BCH_ON_ERROR_ro: if (bch2_fs_emergency_read_only(c)) bch_err(c, "emergency read only"); return true; - case BCH_ON_ERROR_PANIC: + case BCH_ON_ERROR_panic: panic(bch2_fmt(c, "panic after error")); return true; default: @@ -38,10 +38,10 @@ void bch2_io_error_work(struct work_struct *work) bool dev; down_write(&c->state_lock); - dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_RO, + dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_ro, BCH_FORCE_IF_DEGRADED); if (dev - ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_RO, + ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro, BCH_FORCE_IF_DEGRADED) : bch2_fs_emergency_read_only(c)) bch_err(ca, diff --git a/libbcachefs/error.h b/libbcachefs/error.h index 94b5331..0e49fd7 100644 --- a/libbcachefs/error.h +++ b/libbcachefs/error.h @@ -181,12 +181,18 @@ void bch2_io_error(struct bch_dev *); /* Logs message and handles the error: */ #define bch2_dev_io_error(ca, fmt, ...) \ do { \ - printk_ratelimited(KERN_ERR bch2_fmt((ca)->fs, \ - "IO error on %s for " fmt), \ + printk_ratelimited(KERN_ERR "bcachefs (%s): " fmt, \ (ca)->name, ##__VA_ARGS__); \ bch2_io_error(ca); \ } while (0) +#define bch2_dev_inum_io_error(ca, _inum, _offset, fmt, ...) \ +do { \ + printk_ratelimited(KERN_ERR "bcachefs (%s inum %llu offset %llu): " fmt,\ + (ca)->name, (_inum), (_offset), ##__VA_ARGS__); \ + bch2_io_error(ca); \ +} while (0) + #define bch2_dev_io_err_on(cond, ca, ...) \ ({ \ bool _ret = (cond); \ @@ -196,16 +202,13 @@ do { \ _ret; \ }) -/* kill? */ - -#define __bcache_io_error(c, fmt, ...) \ - printk_ratelimited(KERN_ERR bch2_fmt(c, \ - "IO error: " fmt), ##__VA_ARGS__) - -#define bcache_io_error(c, bio, fmt, ...) \ -do { \ - __bcache_io_error(c, fmt, ##__VA_ARGS__); \ - (bio)->bi_status = BLK_STS_IOERR; \ -} while (0) +#define bch2_dev_inum_io_err_on(cond, ca, _inum, _offset, ...) 
\ +({ \ + bool _ret = (cond); \ + \ + if (_ret) \ + bch2_dev_inum_io_error(ca, _inum, _offset, __VA_ARGS__);\ + _ret; \ +}) #endif /* _BCACHEFS_ERROR_H */ diff --git a/libbcachefs/extent_update.c b/libbcachefs/extent_update.c index fd011df..bb4b2b4 100644 --- a/libbcachefs/extent_update.c +++ b/libbcachefs/extent_update.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "bkey_on_stack.h" #include "btree_update.h" #include "btree_update_interior.h" #include "buckets.h" @@ -63,7 +62,7 @@ static int count_iters_for_insert(struct btree_trans *trans, struct bkey_s_c r_k; for_each_btree_key(trans, iter, - BTREE_ID_REFLINK, POS(0, idx + offset), + BTREE_ID_reflink, POS(0, idx + offset), BTREE_ITER_SLOTS, r_k, ret2) { if (bkey_cmp(bkey_start_pos(r_k.k), POS(0, idx + sectors)) >= 0) @@ -100,24 +99,12 @@ int bch2_extent_atomic_end(struct btree_iter *iter, struct bpos *end) { struct btree_trans *trans = iter->trans; - struct btree *b; - struct btree_node_iter node_iter; - struct bkey_packed *_k; - unsigned nr_iters = 0; + struct btree_iter *copy; + struct bkey_s_c k; + unsigned nr_iters = 0; int ret; - ret = bch2_btree_iter_traverse(iter); - if (ret) - return ret; - - b = iter->l[0].b; - node_iter = iter->l[0].iter; - - BUG_ON(bkey_cmp(b->data->min_key, POS_MIN) && - bkey_cmp(bkey_start_pos(&insert->k), - bkey_predecessor(b->data->min_key)) < 0); - - *end = bpos_min(insert->k.p, b->key.k.p); + *end = insert->k.p; /* extent_update_to_keys(): */ nr_iters += 1; @@ -127,9 +114,9 @@ int bch2_extent_atomic_end(struct btree_iter *iter, if (ret < 0) return ret; - while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { - struct bkey unpacked; - struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); + copy = bch2_trans_copy_iter(trans, iter); + + for_each_btree_key_continue(copy, 0, k, ret) { unsigned offset = 0; if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0) @@ -156,10 +143,9 @@ int bch2_extent_atomic_end(struct btree_iter *iter, &nr_iters, EXTENT_ITERS_MAX); if (ret) break; - - bch2_btree_node_iter_advance(&node_iter, b); } + bch2_trans_iter_put(trans, copy); return ret < 0 ? 
ret : 0; } @@ -193,18 +179,13 @@ bch2_extent_can_insert(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *insert) { - struct btree_iter_level *l = &iter->l[0]; - struct btree_node_iter node_iter = l->iter; - struct bkey_packed *_k; struct bkey_s_c k; - struct bkey unpacked; - int sectors; - - _k = bch2_btree_node_iter_peek(&node_iter, l->b); - if (!_k) - return BTREE_INSERT_OK; + int ret, sectors; - k = bkey_disassemble(l->b, _k, &unpacked); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + return ret; /* Check if we're splitting a compressed extent: */ diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 88297b3..b07d395 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -89,7 +89,7 @@ static inline bool ptr_better(struct bch_fs *c, return bch2_rand_range(l1 + l2) > l1; } - if (force_reconstruct_read(c)) + if (bch2_force_reconstruct_read) return p1.idx > p2.idx; return p1.idx < p2.idx; @@ -137,7 +137,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, !bch2_dev_is_readable(ca)) p.idx++; - if (force_reconstruct_read(c) && + if (bch2_force_reconstruct_read && !p.idx && p.has_ec) p.idx++; @@ -158,56 +158,33 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) { - if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) + if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) return "value too big"; return bch2_bkey_ptrs_invalid(c, k); } -void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) +void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const struct bch_extent_ptr *ptr; - const char *err; - char buf[160]; - struct bucket_mark mark; - struct bch_dev *ca; - - if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) - return; - - if (!percpu_down_read_trylock(&c->mark_lock)) - return; + bch2_bkey_ptrs_to_text(out, c, k); +} - bkey_for_each_ptr(ptrs, ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); +const char *bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - mark = ptr_bucket_mark(ca, ptr); + if (bkey_val_bytes(k.k) <= sizeof(*bp.v)) + return "value too small"; - err = "stale"; - if (gen_after(mark.gen, ptr->gen)) - goto err; + if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) + return "value too big"; - err = "inconsistent"; - if (mark.data_type != BCH_DATA_btree || - mark.dirty_sectors < c->opts.btree_node_size) - goto err; - } -out: - percpu_up_read(&c->mark_lock); - return; -err: - bch2_fs_inconsistent(c, "%s btree pointer %s: bucket %zi gen %i mark %08x", - err, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), - PTR_BUCKET_NR(ca, ptr), - mark.gen, (unsigned) mark.v.counter); - goto out; -} + if (c->sb.version < bcachefs_metadata_version_snapshot && + bp.v->min_key.snapshot) + return "invalid min_key.snapshot"; -void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - bch2_bkey_ptrs_to_text(out, c, k); + return bch2_bkey_ptrs_invalid(c, k); } void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, @@ -215,9 +192,8 @@ void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - pr_buf(out, "seq %llx sectors %u written %u min_key ", + pr_buf(out, "seq %llx written %u min_key ", le64_to_cpu(bp.v->seq), - 
le16_to_cpu(bp.v->sectors), le16_to_cpu(bp.v->sectors_written)); bch2_bpos_to_text(out, bp.v->min_key); @@ -237,8 +213,8 @@ void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, btree_node_type_is_extents(btree_id) && bkey_cmp(bp.v->min_key, POS_MIN)) bp.v->min_key = write - ? bkey_predecessor(bp.v->min_key) - : bkey_successor(bp.v->min_key); + ? bpos_nosnap_predecessor(bp.v->min_key) + : bpos_nosnap_successor(bp.v->min_key); } /* KEY_TYPE_extent: */ @@ -248,49 +224,6 @@ const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) return bch2_bkey_ptrs_invalid(c, k); } -void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - char buf[160]; - - if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) || - !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) - return; - - if (!percpu_down_read_trylock(&c->mark_lock)) - return; - - extent_for_each_ptr_decode(e, p, entry) { - struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr); - unsigned stale = gen_after(mark.gen, p.ptr.gen); - unsigned disk_sectors = ptr_disk_sectors(p); - unsigned mark_sectors = p.ptr.cached - ? mark.cached_sectors - : mark.dirty_sectors; - - bch2_fs_inconsistent_on(stale && !p.ptr.cached, c, - "stale dirty pointer (ptr gen %u bucket %u", - p.ptr.gen, mark.gen); - - bch2_fs_inconsistent_on(stale > 96, c, - "key too stale: %i", stale); - - bch2_fs_inconsistent_on(!stale && - (mark.data_type != BCH_DATA_user || - mark_sectors < disk_sectors), c, - "extent pointer not marked: %s:\n" - "type %u sectors %u < %u", - (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf), - mark.data_type, - mark_sectors, disk_sectors); - } - - percpu_up_read(&c->mark_lock); -} - void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { @@ -665,7 +598,7 @@ bool bch2_bkey_is_incompressible(struct bkey_s_c k) } bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, - unsigned nr_replicas) + unsigned nr_replicas, bool compressed) { struct btree_trans trans; struct btree_iter *iter; @@ -678,21 +611,45 @@ bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos, + for_each_btree_key(&trans, iter, BTREE_ID_extents, pos, BTREE_ITER_SLOTS, k, err) { if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) break; - if (nr_replicas > bch2_bkey_nr_ptrs_fully_allocated(k)) { + if (nr_replicas > bch2_bkey_replicas(c, k) || + (!compressed && bch2_bkey_sectors_compressed(k))) { ret = false; break; } } + bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); return ret; } +unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p = { 0 }; + unsigned replicas = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.ptr.cached) + continue; + + if (p.has_ec) + replicas += p.ec.redundancy; + + replicas++; + + } + + return replicas; +} + static unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded p) { @@ -704,19 +661,12 @@ static unsigned bch2_extent_ptr_durability(struct bch_fs *c, ca = bch_dev_bkey_exists(c, p.ptr.dev); - if (ca->mi.state != BCH_MEMBER_STATE_FAILED) + if (ca->mi.state != BCH_MEMBER_STATE_failed) durability = max_t(unsigned, 
durability, ca->mi.durability); - if (p.has_ec) { - struct stripe *s = - genradix_ptr(&c->stripes[0], p.ec.idx); + if (p.has_ec) + durability += p.ec.redundancy; - if (WARN_ON(!s)) - goto out; - - durability += s->nr_redundant; - } -out: return durability; } @@ -764,6 +714,15 @@ void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k, } } +void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry) +{ + union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); + union bch_extent_entry *next = extent_entry_next(entry); + + memmove_u64s(entry, next, (u64 *) end - (u64 *) next); + k->k.u64s -= extent_entry_u64s(entry); +} + void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr ptr) { @@ -949,9 +908,9 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) /* will only happen if all pointers were cached: */ if (!bch2_bkey_nr_ptrs(k.s_c)) - k.k->type = KEY_TYPE_discard; + k.k->type = KEY_TYPE_deleted; - return bkey_whiteout(k.k); + return bkey_deleted(k.k); } void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, @@ -1046,16 +1005,17 @@ static const char *extent_ptr_invalid(const struct bch_fs *c, const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + struct bch_devs_list devs; const union bch_extent_entry *entry; struct bch_extent_crc_unpacked crc; unsigned size_ondisk = k.k->size; const char *reason; unsigned nonce = UINT_MAX; + unsigned i; - if (k.k->type == KEY_TYPE_btree_ptr) + if (k.k->type == KEY_TYPE_btree_ptr || + k.k->type == KEY_TYPE_btree_ptr_v2) size_ondisk = c->opts.btree_node_size; - if (k.k->type == KEY_TYPE_btree_ptr_v2) - size_ondisk = le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors); bkey_extent_entry_for_each(ptrs, entry) { if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) @@ -1101,6 +1061,12 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) } } + devs = bch2_bkey_devs(k); + bubble_sort(devs.devs, devs.nr, u8_cmp); + for (i = 0; i + 1 < devs.nr; i++) + if (devs.devs[i] == devs.devs[i + 1]) + return "multiple ptrs to same device"; + return NULL; } @@ -1235,7 +1201,7 @@ int bch2_cut_back_s(struct bpos where, struct bkey_s k) len = where.offset - bkey_start_offset(k.k); - k.k->p = where; + k.k->p.offset = where.offset; k.k->size = len; if (!len) { diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index 74c7bb8..ccee43a 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -368,10 +368,10 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, /* KEY_TYPE_btree_ptr: */ const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_btree_ptr_debugcheck(struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +const char *bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, @@ -379,14 +379,12 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, #define bch2_bkey_ops_btree_ptr (struct bkey_ops) { \ .key_invalid = bch2_btree_ptr_invalid, \ - .key_debugcheck = bch2_btree_ptr_debugcheck, \ .val_to_text = bch2_btree_ptr_to_text, \ .swab = bch2_ptr_swab, \ } #define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) { \ - .key_invalid = bch2_btree_ptr_invalid, \ - .key_debugcheck = bch2_btree_ptr_debugcheck, \ + 
.key_invalid = bch2_btree_ptr_v2_invalid, \ .val_to_text = bch2_btree_ptr_v2_to_text, \ .swab = bch2_ptr_swab, \ .compat = bch2_btree_ptr_v2_compat, \ @@ -395,14 +393,12 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, /* KEY_TYPE_extent: */ const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); -void bch2_extent_debugcheck(struct bch_fs *, struct bkey_s_c); void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); enum merge_result bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s); #define bch2_bkey_ops_extent (struct bkey_ops) { \ .key_invalid = bch2_extent_invalid, \ - .key_debugcheck = bch2_extent_debugcheck, \ .val_to_text = bch2_extent_to_text, \ .swab = bch2_ptr_swab, \ .key_normalize = bch2_extent_normalize, \ @@ -538,12 +534,15 @@ unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); bool bch2_bkey_is_incompressible(struct bkey_s_c); unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); -bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned); +bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned, bool); + +unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c); unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s, unsigned, unsigned); +void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *); void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); void bch2_extent_ptr_decoded_append(struct bkey_i *, struct extent_ptr_decoded *); @@ -583,6 +582,24 @@ void bch2_ptr_swab(struct bkey_s); /* Generic extent code: */ +enum bch_extent_overlap { + BCH_EXTENT_OVERLAP_ALL = 0, + BCH_EXTENT_OVERLAP_BACK = 1, + BCH_EXTENT_OVERLAP_FRONT = 2, + BCH_EXTENT_OVERLAP_MIDDLE = 3, +}; + +/* Returns how k overlaps with m */ +static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k, + const struct bkey *m) +{ + int cmp1 = bkey_cmp(k->p, m->p) < 0; + int cmp2 = bkey_cmp(bkey_start_pos(k), + bkey_start_pos(m)) > 0; + + return (cmp1 << 1) + cmp2; +} + int bch2_cut_front_s(struct bpos, struct bkey_s); int bch2_cut_back_s(struct bpos, struct bkey_s); diff --git a/libbcachefs/fs-common.c b/libbcachefs/fs-common.c index 878419d..281a613 100644 --- a/libbcachefs/fs-common.c +++ b/libbcachefs/fs-common.c @@ -20,8 +20,10 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, { struct bch_fs *c = trans->c; struct btree_iter *dir_iter = NULL; + struct btree_iter *inode_iter = NULL; struct bch_hash_info hash = bch2_hash_info_init(c, new_inode); - u64 now = bch2_current_time(trans->c); + u64 now = bch2_current_time(c); + u64 dir_offset = 0; int ret; dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); @@ -34,9 +36,8 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, if (!name) new_inode->bi_flags |= BCH_INODE_UNLINKED; - ret = bch2_inode_create(trans, new_inode, - BLOCKDEV_INODE_MAX, 0, - &c->unused_inode_hint); + inode_iter = bch2_inode_create(trans, new_inode, U32_MAX); + ret = PTR_ERR_OR_ZERO(inode_iter); if (ret) goto err; @@ -68,11 +69,24 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, ret = bch2_dirent_create(trans, dir_inum, &dir_hash, mode_to_type(new_inode->bi_mode), name, new_inode->bi_inum, + &dir_offset, BCH_HASH_SET_MUST_CREATE); if (ret) goto err; } + + if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { + 
new_inode->bi_dir = dir_u->bi_inum; + new_inode->bi_dir_offset = dir_offset; + } + + /* XXX use bch2_btree_iter_set_snapshot() */ + inode_iter->snapshot = U32_MAX; + bch2_btree_iter_set_pos(inode_iter, SPOS(0, new_inode->bi_inum, U32_MAX)); + + ret = bch2_inode_write(trans, inode_iter, new_inode); err: + bch2_trans_iter_put(trans, inode_iter); bch2_trans_iter_put(trans, dir_iter); return ret; } @@ -81,9 +95,11 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, u64 inum, struct bch_inode_unpacked *dir_u, struct bch_inode_unpacked *inode_u, const struct qstr *name) { + struct bch_fs *c = trans->c; struct btree_iter *dir_iter = NULL, *inode_iter = NULL; struct bch_hash_info dir_hash; - u64 now = bch2_current_time(trans->c); + u64 now = bch2_current_time(c); + u64 dir_offset = 0; int ret; inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); @@ -94,6 +110,8 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, inode_u->bi_ctime = now; bch2_inode_nlink_inc(inode_u); + inode_u->bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED; + dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0); ret = PTR_ERR_OR_ZERO(dir_iter); if (ret) @@ -101,12 +119,21 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, dir_u->bi_mtime = dir_u->bi_ctime = now; - dir_hash = bch2_hash_info_init(trans->c, dir_u); + dir_hash = bch2_hash_info_init(c, dir_u); - ret = bch2_dirent_create(trans, dir_inum, &dir_hash, - mode_to_type(inode_u->bi_mode), - name, inum, BCH_HASH_SET_MUST_CREATE) ?: - bch2_inode_write(trans, dir_iter, dir_u) ?: + ret = bch2_dirent_create(trans, dir_inum, &dir_hash, + mode_to_type(inode_u->bi_mode), + name, inum, &dir_offset, + BCH_HASH_SET_MUST_CREATE); + if (ret) + goto err; + + if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { + inode_u->bi_dir = dir_inum; + inode_u->bi_dir_offset = dir_offset; + } + + ret = bch2_inode_write(trans, dir_iter, dir_u) ?: bch2_inode_write(trans, inode_iter, inode_u); err: bch2_trans_iter_put(trans, dir_iter); @@ -119,10 +146,11 @@ int bch2_unlink_trans(struct btree_trans *trans, struct bch_inode_unpacked *inode_u, const struct qstr *name) { + struct bch_fs *c = trans->c; struct btree_iter *dir_iter = NULL, *dirent_iter = NULL, *inode_iter = NULL; struct bch_hash_info dir_hash; - u64 inum, now = bch2_current_time(trans->c); + u64 inum, now = bch2_current_time(c); struct bkey_s_c k; int ret; @@ -131,7 +159,7 @@ int bch2_unlink_trans(struct btree_trans *trans, if (ret) goto err; - dir_hash = bch2_hash_info_init(trans->c, dir_u); + dir_hash = bch2_hash_info_init(c, dir_u); dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash, name, BTREE_ITER_INTENT); @@ -197,10 +225,12 @@ int bch2_rename_trans(struct btree_trans *trans, const struct qstr *dst_name, enum bch_rename_mode mode) { + struct bch_fs *c = trans->c; struct btree_iter *src_dir_iter = NULL, *dst_dir_iter = NULL; struct btree_iter *src_inode_iter = NULL, *dst_inode_iter = NULL; struct bch_hash_info src_hash, dst_hash; - u64 src_inode, dst_inode, now = bch2_current_time(trans->c); + u64 src_inode, src_offset, dst_inode, dst_offset; + u64 now = bch2_current_time(c); int ret; src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir, @@ -209,7 +239,7 @@ int bch2_rename_trans(struct btree_trans *trans, if (ret) goto err; - src_hash = bch2_hash_info_init(trans->c, src_dir_u); + src_hash = bch2_hash_info_init(c, src_dir_u); if (dst_dir != src_dir) { dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir, @@ -218,7 +248,7 @@ int 
bch2_rename_trans(struct btree_trans *trans, if (ret) goto err; - dst_hash = bch2_hash_info_init(trans->c, dst_dir_u); + dst_hash = bch2_hash_info_init(c, dst_dir_u); } else { dst_dir_u = src_dir_u; dst_hash = src_hash; @@ -227,8 +257,8 @@ int bch2_rename_trans(struct btree_trans *trans, ret = bch2_dirent_rename(trans, src_dir, &src_hash, dst_dir, &dst_hash, - src_name, &src_inode, - dst_name, &dst_inode, + src_name, &src_inode, &src_offset, + dst_name, &dst_inode, &dst_offset, mode); if (ret) goto err; @@ -247,6 +277,16 @@ int bch2_rename_trans(struct btree_trans *trans, goto err; } + if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { + src_inode_u->bi_dir = dst_dir_u->bi_inum; + src_inode_u->bi_dir_offset = dst_offset; + + if (mode == BCH_RENAME_EXCHANGE) { + dst_inode_u->bi_dir = src_dir_u->bi_inum; + dst_inode_u->bi_dir_offset = src_offset; + } + } + if (mode == BCH_RENAME_OVERWRITE) { if (S_ISDIR(src_inode_u->bi_mode) != S_ISDIR(dst_inode_u->bi_mode)) { diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 3aed2ca..1a94e7f 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -3,7 +3,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" -#include "bkey_on_stack.h" +#include "bkey_buf.h" #include "btree_update.h" #include "buckets.h" #include "clock.h" @@ -35,6 +35,22 @@ #include #include +static inline struct address_space *faults_disabled_mapping(void) +{ + return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL); +} + +static inline void set_fdm_dropped_locks(void) +{ + current->faults_disabled_mapping = + (void *) (((unsigned long) current->faults_disabled_mapping)|1); +} + +static inline bool fdm_dropped_locks(void) +{ + return ((unsigned long) current->faults_disabled_mapping) & 1; +} + struct quota_res { u64 sectors; }; @@ -68,6 +84,7 @@ struct dio_read { struct closure cl; struct kiocb *req; long ret; + bool should_dirty; struct bch_read_bio rbio; }; @@ -265,28 +282,13 @@ static inline struct bch_page_state *bch2_page_state(struct page *page) /* for newly allocated pages: */ static void __bch2_page_state_release(struct page *page) { - struct bch_page_state *s = __bch2_page_state(page); - - if (!s) - return; - - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); - kfree(s); + kfree(detach_page_private(page)); } static void bch2_page_state_release(struct page *page) { - struct bch_page_state *s = bch2_page_state(page); - - if (!s) - return; - - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); - kfree(s); + EBUG_ON(!PageLocked(page)); + __bch2_page_state_release(page); } /* for newly allocated pages: */ @@ -300,13 +302,7 @@ static struct bch_page_state *__bch2_page_state_create(struct page *page, return NULL; spin_lock_init(&s->lock); - /* - * migrate_page_move_mapping() assumes that pages with private data - * have their count elevated by 1. 
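The page-cache hunks above swap the open-coded get_page()/set_page_private()/SetPagePrivate() sequences and their teardown for attach_page_private()/detach_page_private(). The following is only a userspace analogue of that ownership pattern, assuming nothing beyond what the removed comment states (private data pins the page with one extra reference); demo_page and the demo_* helpers are invented for illustration, not the kernel API.

/*
 * Userspace analogue, not the kernel helpers themselves: a "page" here is
 * just a refcount plus a private pointer, to show the ownership transfer.
 */
#include <assert.h>
#include <stddef.h>

struct demo_page {
	int refcount;
	void *private;
};

static void demo_attach_private(struct demo_page *page, void *data)
{
	page->refcount++;	/* holding private data pins the page */
	page->private = data;
}

static void *demo_detach_private(struct demo_page *page)
{
	void *data = page->private;

	if (!data)
		return NULL;
	page->private = NULL;
	page->refcount--;	/* drop the pin taken by attach */
	return data;
}

int main(void)
{
	struct demo_page page = { .refcount = 1 };
	int state = 42;

	demo_attach_private(&page, &state);
	assert(page.refcount == 2);
	assert(demo_detach_private(&page) == &state);
	assert(page.refcount == 1);
	return 0;
}

Pairing attach and detach in one place is what lets __bch2_page_state_release() shrink to kfree(detach_page_private(page)) in the hunk above.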
- */ - get_page(page); - set_page_private(page, (unsigned long) s); - SetPagePrivate(page); + attach_page_private(page, s); return s; } @@ -514,10 +510,35 @@ static void bch2_set_page_dirty(struct bch_fs *c, vm_fault_t bch2_page_fault(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; + struct address_space *mapping = file->f_mapping; + struct address_space *fdm = faults_disabled_mapping(); struct bch_inode_info *inode = file_bch_inode(file); int ret; + if (fdm == mapping) + return VM_FAULT_SIGBUS; + + /* Lock ordering: */ + if (fdm > mapping) { + struct bch_inode_info *fdm_host = to_bch_ei(fdm->host); + + if (bch2_pagecache_add_tryget(&inode->ei_pagecache_lock)) + goto got_lock; + + bch2_pagecache_block_put(&fdm_host->ei_pagecache_lock); + + bch2_pagecache_add_get(&inode->ei_pagecache_lock); + bch2_pagecache_add_put(&inode->ei_pagecache_lock); + + bch2_pagecache_block_get(&fdm_host->ei_pagecache_lock); + + /* Signal that lock has been dropped: */ + set_fdm_dropped_locks(); + return VM_FAULT_SIGBUS; + } + bch2_pagecache_add_get(&inode->ei_pagecache_lock); +got_lock: ret = filemap_fault(vmf); bch2_pagecache_add_put(&inode->ei_pagecache_lock); @@ -608,14 +629,8 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage, if (ret != MIGRATEPAGE_SUCCESS) return ret; - if (PagePrivate(page)) { - ClearPagePrivate(page); - get_page(newpage); - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); - put_page(page); - SetPagePrivate(newpage); - } + if (PagePrivate(page)) + attach_page_private(newpage, detach_page_private(page)); if (mode != MIGRATE_SYNC_NO_COPY) migrate_page_copy(newpage, page); @@ -647,41 +662,33 @@ static void bch2_readpages_end_io(struct bio *bio) bio_put(bio); } -static inline void page_state_init_for_read(struct page *page) -{ - SetPagePrivate(page); - page->private = 0; -} - struct readpages_iter { struct address_space *mapping; struct page **pages; unsigned nr_pages; - unsigned nr_added; unsigned idx; pgoff_t offset; }; static int readpages_iter_init(struct readpages_iter *iter, - struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) + struct readahead_control *ractl) { + unsigned i, nr_pages = readahead_count(ractl); + memset(iter, 0, sizeof(*iter)); - iter->mapping = mapping; - iter->offset = list_last_entry(pages, struct page, lru)->index; + iter->mapping = ractl->mapping; + iter->offset = readahead_index(ractl); + iter->nr_pages = nr_pages; iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS); if (!iter->pages) return -ENOMEM; - while (!list_empty(pages)) { - struct page *page = list_last_entry(pages, struct page, lru); - - __bch2_page_state_create(page, __GFP_NOFAIL); - - iter->pages[iter->nr_pages++] = page; - list_del(&page->lru); + nr_pages = __readahead_batch(ractl, iter->pages, nr_pages); + for (i = 0; i < nr_pages; i++) { + __bch2_page_state_create(iter->pages[i], __GFP_NOFAIL); + put_page(iter->pages[i]); } return 0; @@ -689,41 +696,9 @@ static int readpages_iter_init(struct readpages_iter *iter, static inline struct page *readpage_iter_next(struct readpages_iter *iter) { - struct page *page; - unsigned i; - int ret; - - BUG_ON(iter->idx > iter->nr_added); - BUG_ON(iter->nr_added > iter->nr_pages); - - if (iter->idx < iter->nr_added) - goto out; - - while (1) { - if (iter->idx == iter->nr_pages) - return NULL; - - ret = add_to_page_cache_lru_vec(iter->mapping, - iter->pages + iter->nr_added, - iter->nr_pages - iter->nr_added, - iter->offset + iter->nr_added, - GFP_NOFS); - 
if (ret > 0) - break; - - page = iter->pages[iter->nr_added]; - iter->idx++; - iter->nr_added++; - - __bch2_page_state_release(page); - put_page(page); - } - - iter->nr_added += ret; + if (iter->idx >= iter->nr_pages) + return NULL; - for (i = iter->idx; i < iter->nr_added; i++) - put_page(iter->pages[i]); -out: EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx); return iter->pages[iter->idx]; @@ -817,7 +792,7 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, struct readpages_iter *readpages_iter) { struct bch_fs *c = trans->c; - struct bkey_on_stack sk; + struct bkey_buf sk; int flags = BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE; int ret = 0; @@ -825,11 +800,12 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, rbio->c = c; rbio->start_time = local_clock(); - bkey_on_stack_init(&sk); + bch2_bkey_buf_init(&sk); retry: while (1) { struct bkey_s_c k; unsigned bytes, sectors, offset_into_extent; + enum btree_id data_btree = BTREE_ID_extents; bch2_btree_iter_set_pos(iter, POS(inum, rbio->bio.bi_iter.bi_sector)); @@ -843,9 +819,9 @@ retry: bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; - bkey_on_stack_reassemble(&sk, c, k); + bch2_bkey_buf_reassemble(&sk, c, k); - ret = bch2_read_indirect_extent(trans, + ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent, &sk); if (ret) break; @@ -869,7 +845,8 @@ retry: if (bkey_extent_is_allocation(k.k)) bch2_add_page_sectors(&rbio->bio, k); - bch2_read_extent(trans, rbio, k, offset_into_extent, flags); + bch2_read_extent(trans, rbio, iter->pos, + data_btree, k, offset_into_extent, flags); if (flags & BCH_READ_LAST_FRAGMENT) break; @@ -882,17 +859,18 @@ retry: goto retry; if (ret) { - bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); + bch_err_inum_ratelimited(c, inum, + "read error %i from btree lookup", ret); + rbio->bio.bi_status = BLK_STS_IOERR; bio_endio(&rbio->bio); } - bkey_on_stack_exit(&sk, c); + bch2_bkey_buf_exit(&sk, c); } -int bch2_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +void bch2_readahead(struct readahead_control *ractl) { - struct bch_inode_info *inode = to_bch_ei(mapping->host); + struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_io_opts opts = io_opts(c, &inode->ei_inode); struct btree_trans trans; @@ -901,12 +879,11 @@ int bch2_readpages(struct file *file, struct address_space *mapping, struct readpages_iter readpages_iter; int ret; - ret = readpages_iter_init(&readpages_iter, mapping, pages, nr_pages); + ret = readpages_iter_init(&readpages_iter, ractl); BUG_ON(ret); bch2_trans_init(&trans, c, 0, 0); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, + iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN, BTREE_ITER_SLOTS); bch2_pagecache_add_get(&inode->ei_pagecache_lock); @@ -934,10 +911,9 @@ int bch2_readpages(struct file *file, struct address_space *mapping, bch2_pagecache_add_put(&inode->ei_pagecache_lock); + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); kfree(readpages_iter.pages); - - return 0; } static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, @@ -954,11 +930,12 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, + iter = 
bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN, BTREE_ITER_SLOTS); bchfs_read(&trans, iter, rbio, inum, NULL); + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); } @@ -1042,6 +1019,8 @@ static void bch2_writepage_io_done(struct closure *cl) unsigned i; if (io->op.error) { + set_bit(EI_INODE_ERROR, &io->inode->ei_flags); + bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s; @@ -1644,12 +1623,22 @@ again: /* O_DIRECT reads */ +static void bio_check_or_release(struct bio *bio, bool check_dirty) +{ + if (check_dirty) { + bio_check_pages_dirty(bio); + } else { + bio_release_pages(bio, false); + bio_put(bio); + } +} + static void bch2_dio_read_complete(struct closure *cl) { struct dio_read *dio = container_of(cl, struct dio_read, cl); dio->req->ki_complete(dio->req, dio->ret, 0); - bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */ + bio_check_or_release(&dio->rbio.bio, dio->should_dirty); } static void bch2_direct_IO_read_endio(struct bio *bio) @@ -1664,8 +1653,11 @@ static void bch2_direct_IO_read_endio(struct bio *bio) static void bch2_direct_IO_read_split_endio(struct bio *bio) { + struct dio_read *dio = bio->bi_private; + bool should_dirty = dio->should_dirty; + bch2_direct_IO_read_endio(bio); - bio_check_pages_dirty(bio); /* transfers ownership */ + bio_check_or_release(bio, should_dirty); } static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) @@ -1719,6 +1711,12 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) dio->req = req; dio->ret = ret; + /* + * This is one of the sketchier things I've encountered: we have to skip + * the dirtying of requests that are internal from the kernel (i.e. from + * loopback), because we'll deadlock on page_lock. + */ + dio->should_dirty = iter_is_iovec(iter); goto start; while (iter->count) { @@ -1740,7 +1738,9 @@ start: } offset += bio->bi_iter.bi_size; - bio_set_pages_dirty(bio); + + if (dio->should_dirty) + bio_set_pages_dirty(bio); if (iter->count) closure_get(&dio->cl); @@ -1754,7 +1754,7 @@ start: closure_sync(&dio->cl); closure_debug_destroy(&dio->cl); ret = dio->ret; - bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */ + bio_check_or_release(&dio->rbio.bio, dio->should_dirty); return ret; } else { return -EIOCBQUEUED; @@ -1812,14 +1812,16 @@ static long bch2_dio_write_loop(struct dio_write *dio) struct bio *bio = &dio->op.wbio.bio; struct bvec_iter_all iter; struct bio_vec *bv; - unsigned unaligned; - bool sync = dio->sync; + unsigned unaligned, iter_count; + bool sync = dio->sync, dropped_locks; long ret; if (dio->loop) goto loop; while (1) { + iter_count = dio->iter.count; + if (kthread) kthread_use_mm(dio->mm); BUG_ON(current->faults_disabled_mapping); @@ -1827,13 +1829,34 @@ static long bch2_dio_write_loop(struct dio_write *dio) ret = bio_iov_iter_get_pages(bio, &dio->iter); + dropped_locks = fdm_dropped_locks(); + current->faults_disabled_mapping = NULL; if (kthread) kthread_unuse_mm(dio->mm); + /* + * If the fault handler returned an error but also signalled + * that it dropped & retook ei_pagecache_lock, we just need to + * re-shoot down the page cache and retry: + */ + if (dropped_locks && ret) + ret = 0; + if (unlikely(ret < 0)) goto err; + if (unlikely(dropped_locks)) { + ret = write_invalidate_inode_pages_range(mapping, + req->ki_pos, + req->ki_pos + iter_count - 1); + if (unlikely(ret)) + goto err; + + if (!bio->bi_iter.bi_size) + continue; + } + unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1); bio->bi_iter.bi_size -= 
unaligned; iov_iter_revert(&dio->iter, unaligned); @@ -1865,7 +1888,9 @@ static long bch2_dio_write_loop(struct dio_write *dio) dio->op.opts.data_replicas, 0); if (unlikely(ret) && !bch2_check_range_allocated(c, dio->op.pos, - bio_sectors(bio), dio->op.opts.data_replicas)) + bio_sectors(bio), + dio->op.opts.data_replicas, + dio->op.opts.compression != 0)) goto err; task_io_account_write(bio->bi_iter.bi_size); @@ -1908,7 +1933,13 @@ loop: bio_for_each_segment_all(bv, bio, iter) put_page(bv->bv_page); - if (!dio->iter.count || dio->op.error) + + if (dio->op.error) { + set_bit(EI_INODE_ERROR, &inode->ei_flags); + break; + } + + if (!dio->iter.count) break; bio_reset(bio); @@ -2104,7 +2135,7 @@ static inline int range_has_data(struct bch_fs *c, bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, start, 0, k, ret) { + for_each_btree_key(&trans, iter, BTREE_ID_extents, start, 0, k, ret) { if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) break; @@ -2113,6 +2144,7 @@ static inline int range_has_data(struct bch_fs *c, break; } } + bch2_trans_iter_put(&trans, iter); return bch2_trans_exit(&trans) ?: ret; } @@ -2282,6 +2314,7 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) bch2_trans_init(&trans, c, 0, 0); iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, 0); ret = PTR_ERR_OR_ZERO(iter); + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); if (ret) @@ -2296,7 +2329,8 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) if (ret) goto err; - BUG_ON(inode->v.i_size < inode_u.bi_size); + WARN_ON(!test_bit(EI_INODE_ERROR, &inode->ei_flags) && + inode->v.i_size < inode_u.bi_size); if (iattr->ia_size > inode->v.i_size) { ret = bch2_extend(inode, &inode_u, iattr); @@ -2409,19 +2443,16 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; - struct bkey_on_stack copy; + struct bkey_buf copy; struct btree_trans trans; - struct btree_iter *src, *dst; + struct btree_iter *src, *dst, *del; loff_t shift, new_size; u64 src_start; - int ret; + int ret = 0; if ((offset | len) & (block_bytes(c) - 1)) return -EINVAL; - bkey_on_stack_init(©); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256); - /* * We need i_mutex to keep the page cache consistent with the extents * btree, and the btree consistent with i_size - we don't need outside @@ -2477,15 +2508,15 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, goto err; } - src = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + bch2_bkey_buf_init(©); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256); + src = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS(inode->v.i_ino, src_start >> 9), BTREE_ITER_INTENT); - BUG_ON(IS_ERR_OR_NULL(src)); - dst = bch2_trans_copy_iter(&trans, src); - BUG_ON(IS_ERR_OR_NULL(dst)); + del = bch2_trans_copy_iter(&trans, src); - while (1) { + while (ret == 0 || ret == -EINTR) { struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); struct bkey_i delete; @@ -2499,18 +2530,16 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, ? 
bch2_btree_iter_peek_prev(src) : bch2_btree_iter_peek(src); if ((ret = bkey_err(k))) - goto bkey_err; + continue; if (!k.k || k.k->p.inode != inode->v.i_ino) break; - BUG_ON(bkey_cmp(src->pos, bkey_start_pos(k.k))); - if (insert && bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0) break; reassemble: - bkey_on_stack_reassemble(©, c, k); + bch2_bkey_buf_reassemble(©, c, k); if (insert && bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) @@ -2521,7 +2550,7 @@ reassemble: ret = bch2_extent_atomic_end(dst, copy.k, &atomic_end); if (ret) - goto bkey_err; + continue; if (bkey_cmp(atomic_end, copy.k->k.p)) { if (insert) { @@ -2537,6 +2566,7 @@ reassemble: delete.k.p = copy.k->k.p; delete.k.size = copy.k->k.size; delete.k.p.offset -= shift >> 9; + bch2_btree_iter_set_pos(del, bkey_start_pos(&delete.k)); next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p; @@ -2557,26 +2587,24 @@ reassemble: BUG_ON(ret); } - bch2_btree_iter_set_pos(src, bkey_start_pos(&delete.k)); - - ret = bch2_trans_update(&trans, src, &delete, trigger_flags) ?: + ret = bch2_trans_update(&trans, del, &delete, trigger_flags) ?: bch2_trans_update(&trans, dst, copy.k, trigger_flags) ?: bch2_trans_commit(&trans, &disk_res, &inode->ei_journal_seq, BTREE_INSERT_NOFAIL); bch2_disk_reservation_put(c, &disk_res); -bkey_err: + if (!ret) bch2_btree_iter_set_pos(src, next_pos); - - if (ret == -EINTR) - ret = 0; - if (ret) - goto err; - - bch2_trans_cond_resched(&trans); } - bch2_trans_unlock(&trans); + bch2_trans_iter_put(&trans, del); + bch2_trans_iter_put(&trans, dst); + bch2_trans_iter_put(&trans, src); + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(©, c); + + if (ret) + goto err; if (!insert) { i_size_write(&inode->v, new_size); @@ -2586,8 +2614,6 @@ bkey_err: mutex_unlock(&inode->ei_update_lock); } err: - bch2_trans_exit(&trans); - bkey_on_stack_exit(©, c); bch2_pagecache_block_put(&inode->ei_pagecache_lock); inode_unlock(&inode->v); return ret; @@ -2637,12 +2663,12 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode, truncate_pagecache_range(&inode->v, offset, end - 1); } - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS(inode->v.i_ino, block_start >> 9), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); end_pos = POS(inode->v.i_ino, block_end >> 9); - while (bkey_cmp(iter->pos, end_pos) < 0) { + while (!ret && bkey_cmp(iter->pos, end_pos) < 0) { s64 i_sectors_delta = 0; struct disk_reservation disk_res = { 0 }; struct quota_res quota_res = { 0 }; @@ -2706,9 +2732,11 @@ bkey_err: bch2_disk_reservation_put(c, &disk_res); if (ret == -EINTR) ret = 0; - if (ret) - goto err; } + bch2_trans_iter_put(&trans, iter); + + if (ret) + goto err; /* * Do we need to extend the file? 
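The fcollapse/finsert and fallocate loops above are restructured around the same idea: -EINTR from a transaction commit means "restart", so the loop clears it and retries instead of jumping to an error label. Below is a small sketch of just that control-flow shape; demo_transaction_op() is a made-up stand-in for a bch2_trans_commit()-style call that can ask to be restarted.

/*
 * Sketch of the retry shape only; demo_transaction_op() stands in for a
 * transactional operation that may return -EINTR to request a restart.
 */
#include <errno.h>
#include <stdio.h>

static int demo_transaction_op(int *attempts)
{
	/* pretend the first attempt raced and asked to be restarted */
	return (*attempts)++ ? 0 : -EINTR;
}

static int demo_loop(int nr_items)
{
	int ret = 0, attempts = 0, done = 0;

	while (!ret && done < nr_items) {
		ret = demo_transaction_op(&attempts);
		if (ret == -EINTR) {
			ret = 0;	/* restart requested: just retry */
			continue;
		}
		if (!ret)
			done++;
	}
	return ret;	/* only real errors escape the loop */
}

int main(void)
{
	printf("ret=%d\n", demo_loop(3));
	return 0;
}

Real errors still break out of the loop and are handled once, after the iterator has been put, which matches the new placement of bch2_trans_iter_put() and the single error exit in the hunks above.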
@@ -2730,6 +2758,7 @@ bkey_err: ret = PTR_ERR_OR_ZERO(inode_iter); } while (ret == -EINTR); + bch2_trans_iter_put(&trans, inode_iter); bch2_trans_unlock(&trans); if (ret) @@ -2834,9 +2863,6 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, u64 aligned_len; loff_t ret = 0; - if (!c->opts.reflink) - return -EOPNOTSUPP; - if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) return -EINVAL; @@ -2970,7 +2996,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, + for_each_btree_key(&trans, iter, BTREE_ID_extents, POS(inode->v.i_ino, offset >> 9), 0, k, ret) { if (k.k->p.inode != inode->v.i_ino) { break; @@ -2980,6 +3006,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) } else if (k.k->p.offset >> 9 > isize) break; } + bch2_trans_iter_put(&trans, iter); ret = bch2_trans_exit(&trans) ?: ret; if (ret) @@ -3017,8 +3044,8 @@ static loff_t page_hole_offset(struct address_space *mapping, loff_t offset) int pg_offset; loff_t ret = -1; - page = find_lock_entry(mapping, index); - if (!page || xa_is_value(page)) + page = find_lock_page(mapping, index); + if (!page) return offset; pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1)); @@ -3065,7 +3092,7 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, + for_each_btree_key(&trans, iter, BTREE_ID_extents, POS(inode->v.i_ino, offset >> 9), BTREE_ITER_SLOTS, k, ret) { if (k.k->p.inode != inode->v.i_ino) { @@ -3083,6 +3110,7 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) offset = max(offset, bkey_start_offset(k.k) << 9); } } + bch2_trans_iter_put(&trans, iter); ret = bch2_trans_exit(&trans) ?: ret; if (ret) diff --git a/libbcachefs/fs-io.h b/libbcachefs/fs-io.h index 7063556..2537a3d 100644 --- a/libbcachefs/fs-io.h +++ b/libbcachefs/fs-io.h @@ -19,8 +19,7 @@ int bch2_writepage(struct page *, struct writeback_control *); int bch2_readpage(struct file *, struct page *); int bch2_writepages(struct address_space *, struct writeback_control *); -int bch2_readpages(struct file *, struct address_space *, - struct list_head *, unsigned); +void bch2_readahead(struct readahead_control *); int bch2_write_begin(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page **, void **); diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c index 0873d2f..eb87163 100644 --- a/libbcachefs/fs-ioctl.c +++ b/libbcachefs/fs-ioctl.c @@ -183,6 +183,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, struct bch_inode_info *src, const char __user *name) { + struct bch_hash_info hash = bch2_hash_info_init(c, &src->ei_inode); struct bch_inode_info *dst; struct inode *vinode = NULL; char *kname = NULL; @@ -202,8 +203,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, qstr.name = kname; ret = -ENOENT; - inum = bch2_dirent_lookup(c, src->v.i_ino, - &src->ei_str_hash, + inum = bch2_dirent_lookup(c, src->v.i_ino, &hash, &qstr); if (!inum) goto err1; diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 1d66aca..8034d48 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -3,7 +3,7 @@ #include "bcachefs.h" #include "acl.h" -#include "bkey_on_stack.h" +#include "bkey_buf.h" #include "btree_update.h" #include "buckets.h" #include "chardev.h" @@ -42,6 +42,11 @@ static void journal_seq_copy(struct bch_fs *c, struct bch_inode_info *dst, u64 journal_seq) { + /* + * atomic64_cmpxchg has a 
fallback for archs that don't support it, + * cmpxchg does not: + */ + atomic64_t *dst_seq = (void *) &dst->ei_journal_seq; u64 old, v = READ_ONCE(dst->ei_journal_seq); do { @@ -49,7 +54,7 @@ static void journal_seq_copy(struct bch_fs *c, if (old >= journal_seq) break; - } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old); + } while ((v = atomic64_cmpxchg(dst_seq, old, journal_seq)) != old); bch2_journal_set_has_inum(&c->journal, dst->v.i_ino, journal_seq); } @@ -86,6 +91,11 @@ void bch2_pagecache_add_put(struct pagecache_lock *lock) __pagecache_lock_put(lock, 1); } +bool bch2_pagecache_add_tryget(struct pagecache_lock *lock) +{ + return __pagecache_lock_tryget(lock, 1); +} + void bch2_pagecache_add_get(struct pagecache_lock *lock) { __pagecache_lock_get(lock, 1); @@ -225,6 +235,13 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) return &inode->v; } +static int inum_test(struct inode *inode, void *p) +{ + unsigned long *ino = p; + + return *ino == inode->i_ino; +} + static struct bch_inode_info * __bch2_create(struct bch_inode_info *dir, struct dentry *dentry, umode_t mode, dev_t rdev, bool tmpfile) @@ -259,7 +276,8 @@ __bch2_create(struct bch_inode_info *dir, struct dentry *dentry, if (!tmpfile) mutex_lock(&dir->ei_update_lock); - bch2_trans_init(&trans, c, 8, 1024); + bch2_trans_init(&trans, c, 8, + 2048 + (!tmpfile ? dentry->d_name.len : 0)); retry: bch2_trans_begin(&trans); @@ -304,8 +322,12 @@ err_before_quota: * thread pulling the inode in and modifying it: */ - old = to_bch_ei(insert_inode_locked2(&inode->v)); - if (unlikely(old)) { + inode->v.i_state |= I_CREATING; + old = to_bch_ei(inode_insert5(&inode->v, inode->v.i_ino, + inum_test, NULL, &inode->v.i_ino)); + BUG_ON(!old); + + if (unlikely(old != inode)) { /* * We raced, another process pulled the new inode into cache * before us: @@ -346,11 +368,11 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, { struct bch_fs *c = vdir->i_sb->s_fs_info; struct bch_inode_info *dir = to_bch_ei(vdir); + struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); struct inode *vinode = NULL; u64 inum; - inum = bch2_dirent_lookup(c, dir->v.i_ino, - &dir->ei_str_hash, + inum = bch2_dirent_lookup(c, dir->v.i_ino, &hash, &dentry->d_name); if (inum) @@ -390,16 +412,12 @@ static int __bch2_link(struct bch_fs *c, mutex_lock(&inode->ei_update_lock); bch2_trans_init(&trans, c, 4, 1024); - do { - bch2_trans_begin(&trans); - ret = bch2_link_trans(&trans, + ret = __bch2_trans_do(&trans, NULL, &inode->ei_journal_seq, + BTREE_INSERT_NOUNLOCK, + bch2_link_trans(&trans, dir->v.i_ino, inode->v.i_ino, &dir_u, &inode_u, - &dentry->d_name) ?: - bch2_trans_commit(&trans, NULL, - &inode->ei_journal_seq, - BTREE_INSERT_NOUNLOCK); - } while (ret == -EINTR); + &dentry->d_name)); if (likely(!ret)) { BUG_ON(inode_u.bi_inum != inode->v.i_ino); @@ -446,17 +464,12 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); bch2_trans_init(&trans, c, 4, 1024); - do { - bch2_trans_begin(&trans); - - ret = bch2_unlink_trans(&trans, + ret = __bch2_trans_do(&trans, NULL, &dir->ei_journal_seq, + BTREE_INSERT_NOUNLOCK| + BTREE_INSERT_NOFAIL, + bch2_unlink_trans(&trans, dir->v.i_ino, &dir_u, - &inode_u, &dentry->d_name) ?: - bch2_trans_commit(&trans, NULL, - &dir->ei_journal_seq, - BTREE_INSERT_NOUNLOCK| - BTREE_INSERT_NOFAIL); - } while (ret == -EINTR); + &inode_u, &dentry->d_name)); if (likely(!ret)) { BUG_ON(inode_u.bi_inum != inode->v.i_ino); @@ 
-570,21 +583,16 @@ static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry, goto err; } -retry: - bch2_trans_begin(&trans); - ret = bch2_rename_trans(&trans, - src_dir->v.i_ino, &src_dir_u, - dst_dir->v.i_ino, &dst_dir_u, - &src_inode_u, - &dst_inode_u, - &src_dentry->d_name, - &dst_dentry->d_name, - mode) ?: - bch2_trans_commit(&trans, NULL, - &journal_seq, - BTREE_INSERT_NOUNLOCK); - if (ret == -EINTR) - goto retry; + ret = __bch2_trans_do(&trans, NULL, &journal_seq, + BTREE_INSERT_NOUNLOCK, + bch2_rename_trans(&trans, + src_dir->v.i_ino, &src_dir_u, + dst_dir->v.i_ino, &dst_dir_u, + &src_inode_u, + &dst_inode_u, + &src_dentry->d_name, + &dst_dentry->d_name, + mode)); if (unlikely(ret)) goto err; @@ -706,7 +714,7 @@ retry: bch2_setattr_copy(inode, &inode_u, attr); if (attr->ia_valid & ATTR_MODE) { - ret = bch2_acl_chmod(&trans, inode, inode_u.bi_mode, &acl); + ret = bch2_acl_chmod(&trans, &inode_u, inode_u.bi_mode, &acl); if (ret) goto btree_err; } @@ -717,6 +725,8 @@ retry: BTREE_INSERT_NOUNLOCK| BTREE_INSERT_NOFAIL); btree_err: + bch2_trans_iter_put(&trans, inode_iter); + if (ret == -EINTR) goto retry; if (unlikely(ret)) @@ -807,7 +817,7 @@ static int bch2_fill_extent(struct bch_fs *c, struct fiemap_extent_info *info, struct bkey_s_c k, unsigned flags) { - if (bkey_extent_is_data(k.k)) { + if (bkey_extent_is_direct_data(k.k)) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; @@ -838,6 +848,12 @@ static int bch2_fill_extent(struct bch_fs *c, } return 0; + } else if (bkey_extent_is_inline_data(k.k)) { + return fiemap_fill_next_extent(info, + bkey_start_offset(k.k) << 9, + 0, k.k->size << 9, + flags| + FIEMAP_EXTENT_DATA_INLINE); } else if (k.k->type == KEY_TYPE_reservation) { return fiemap_fill_next_extent(info, bkey_start_offset(k.k) << 9, @@ -858,7 +874,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - struct bkey_on_stack cur, prev; + struct bkey_buf cur, prev; struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); unsigned offset_into_extent, sectors; bool have_extent = false; @@ -871,19 +887,21 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, if (start + len < start) return -EINVAL; - bkey_on_stack_init(&cur); - bkey_on_stack_init(&prev); + bch2_bkey_buf_init(&cur); + bch2_bkey_buf_init(&prev); bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS(ei->v.i_ino, start >> 9), 0); retry: while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k)) && bkey_cmp(iter->pos, end) < 0) { + enum btree_id data_btree = BTREE_ID_extents; + if (!bkey_extent_is_data(k.k) && k.k->type != KEY_TYPE_reservation) { - bch2_btree_iter_next(iter); + bch2_btree_iter_advance(iter); continue; } @@ -891,24 +909,22 @@ retry: bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; - bkey_on_stack_realloc(&cur, c, k.k->u64s); - bkey_on_stack_realloc(&prev, c, k.k->u64s); - bkey_reassemble(cur.k, k); + bch2_bkey_buf_reassemble(&cur, c, k); - ret = bch2_read_indirect_extent(&trans, + ret = bch2_read_indirect_extent(&trans, &data_btree, &offset_into_extent, &cur); if (ret) break; k = bkey_i_to_s_c(cur.k); + bch2_bkey_buf_realloc(&prev, c, k.k->u64s); sectors = min(sectors, k.k->size - offset_into_extent); - if (offset_into_extent) - bch2_cut_front(POS(k.k->p.inode, - bkey_start_offset(k.k) + 
- offset_into_extent), - cur.k); + bch2_cut_front(POS(k.k->p.inode, + bkey_start_offset(k.k) + + offset_into_extent), + cur.k); bch2_key_resize(&cur.k->k, sectors); cur.k->k.p = iter->pos; cur.k->k.p.offset += cur.k->k.size; @@ -923,10 +939,8 @@ retry: bkey_copy(prev.k, cur.k); have_extent = true; - if (k.k->type == KEY_TYPE_reflink_v) - bch2_btree_iter_set_pos(iter, k.k->p); - else - bch2_btree_iter_next(iter); + bch2_btree_iter_set_pos(iter, + POS(iter->pos.inode, iter->pos.offset + sectors)); } if (ret == -EINTR) @@ -936,9 +950,10 @@ retry: ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), FIEMAP_EXTENT_LAST); + bch2_trans_iter_put(&trans, iter); ret = bch2_trans_exit(&trans) ?: ret; - bkey_on_stack_exit(&cur, c); - bkey_on_stack_exit(&prev, c); + bch2_bkey_buf_exit(&cur, c); + bch2_bkey_buf_exit(&prev, c); return ret < 0 ? ret : 0; } @@ -983,10 +998,10 @@ static const struct file_operations bch_file_operations = { .open = generic_file_open, .fsync = bch2_fsync, .splice_read = generic_file_splice_read, - /* - * Broken, on v5.3: +#if 0 + /* Busted: */ .splice_write = iter_file_splice_write, - */ +#endif .fallocate = bch2_fallocate_dispatch, .unlocked_ioctl = bch2_fs_file_ioctl, #ifdef CONFIG_COMPAT @@ -1062,7 +1077,7 @@ static const struct address_space_operations bch_address_space_operations = { .writepage = bch2_writepage, .readpage = bch2_readpage, .writepages = bch2_writepages, - .readpages = bch2_readpages, + .readahead = bch2_readahead, .set_page_dirty = __set_page_dirty_nobuffers, .write_begin = bch2_write_begin, .write_end = bch2_write_end, @@ -1127,9 +1142,9 @@ static void bch2_vfs_inode_init(struct bch_fs *c, inode->v.i_generation = bi->bi_generation; inode->v.i_size = bi->bi_size; + inode->ei_flags = 0; inode->ei_journal_seq = 0; inode->ei_quota_reserved = 0; - inode->ei_str_hash = bch2_hash_info_init(c, bi); inode->ei_qid = bch_qid(bi); inode->v.i_mapping->a_ops = &bch_address_space_operations; @@ -1228,7 +1243,7 @@ static void bch2_evict_inode(struct inode *vinode) KEY_TYPE_QUOTA_WARN); bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, KEY_TYPE_QUOTA_WARN); - bch2_inode_rm(c, inode->v.i_ino); + bch2_inode_rm(c, inode->v.i_ino, true); } } @@ -1238,6 +1253,11 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) struct bch_fs *c = sb->s_fs_info; struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c); unsigned shift = sb->s_blocksize_bits - 9; + /* + * this assumes inodes take up 64 bytes, which is a decent average + * number: + */ + u64 avail_inodes = ((usage.capacity - usage.used) << 3); u64 fsid; buf->f_type = BCACHEFS_STATFS_MAGIC; @@ -1245,8 +1265,9 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = usage.capacity >> shift; buf->f_bfree = (usage.capacity - usage.used) >> shift; buf->f_bavail = buf->f_bfree; - buf->f_files = 0; - buf->f_ffree = 0; + + buf->f_files = usage.nr_inodes + avail_inodes; + buf->f_ffree = avail_inodes; fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h index eda903a..2d82ed7 100644 --- a/libbcachefs/fs.h +++ b/libbcachefs/fs.h @@ -26,12 +26,14 @@ static inline void pagecache_lock_init(struct pagecache_lock *lock) } void bch2_pagecache_add_put(struct pagecache_lock *); +bool bch2_pagecache_add_tryget(struct pagecache_lock *); void bch2_pagecache_add_get(struct pagecache_lock *); void bch2_pagecache_block_put(struct pagecache_lock *); void bch2_pagecache_block_get(struct pagecache_lock *); 
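A note on the bch2_statfs change above: the new ((usage.capacity - usage.used) << 3) estimate follows directly from the in-code comment that a packed inode averages about 64 bytes, assuming capacity and used are counted in 512-byte sectors (which the `shift = sb->s_blocksize_bits - 9` conversion suggests). A minimal sketch of that arithmetic, with a purely illustrative helper name:

	#include <stdint.h>

	/*
	 * Sketch only: free 512-byte sectors -> rough number of inodes that
	 * could still be created, assuming ~64 bytes per packed inode as the
	 * patch's comment states.
	 */
	static inline uint64_t estimate_avail_inodes(uint64_t free_sectors)
	{
		uint64_t free_bytes = free_sectors * 512;	/* sectors to bytes */

		return free_bytes / 64;				/* == free_sectors << 3 */
	}
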
struct bch_inode_info { struct inode v; + unsigned long ei_flags; struct mutex ei_update_lock; u64 ei_journal_seq; @@ -43,12 +45,16 @@ struct bch_inode_info { struct mutex ei_quota_lock; struct bch_qid ei_qid; - struct bch_hash_info ei_str_hash; - /* copy of inode in btree: */ struct bch_inode_unpacked ei_inode; }; +/* + * Set if we've gotten a btree error for this inode, and thus the vfs inode and + * btree inode may be inconsistent: + */ +#define EI_INODE_ERROR 0 + #define to_bch_ei(_inode) \ container_of_or_null(_inode, struct bch_inode_info, v) diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index 5a6df3d..acf128f 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "bkey_on_stack.h" +#include "bkey_buf.h" #include "btree_update.h" #include "dirent.h" #include "error.h" @@ -24,7 +24,7 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) u64 sectors = 0; int ret; - for_each_btree_key(trans, iter, BTREE_ID_EXTENTS, + for_each_btree_key(trans, iter, BTREE_ID_extents, POS(inum, 0), 0, k, ret) { if (k.k->p.inode != inum) break; @@ -58,7 +58,7 @@ static int __remove_dirent(struct btree_trans *trans, buf[name.len] = '\0'; name.name = buf; - ret = bch2_inode_find_by_inum_trans(trans, dir_inum, &dir_inode); + ret = __bch2_inode_find_by_inum_trans(trans, dir_inum, &dir_inode, 0); if (ret && ret != -EINTR) bch_err(c, "remove_dirent: err %i looking up directory inode", ret); if (ret) @@ -126,8 +126,8 @@ static int walk_inode(struct btree_trans *trans, struct inode_walker *w, u64 inum) { if (inum != w->cur_inum) { - int ret = bch2_inode_find_by_inum_trans(trans, inum, - &w->inode); + int ret = __bch2_inode_find_by_inum_trans(trans, inum, + &w->inode, 0); if (ret && ret != -ENOENT) return ret; @@ -193,7 +193,7 @@ static int hash_redo_key(const struct bch_hash_desc desc, bch2_trans_update(trans, k_iter, &delete, 0); return bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode, - tmp, BCH_HASH_SET_MUST_CREATE); + tmp, 0); } static int fsck_hash_delete_at(struct btree_trans *trans, @@ -230,7 +230,6 @@ static int hash_check_duplicates(struct btree_trans *trans, return 0; iter = bch2_trans_copy_iter(trans, h->chain); - BUG_ON(IS_ERR(iter)); for_each_btree_key_continue(iter, 0, k2, ret) { if (bkey_cmp(k2.k->p, k.k->p) >= 0) @@ -258,17 +257,15 @@ static void hash_set_chain_start(struct btree_trans *trans, struct hash_check *h, struct btree_iter *k_iter, struct bkey_s_c k) { - bool hole = (k.k->type != KEY_TYPE_whiteout && + bool hole = (k.k->type != KEY_TYPE_hash_whiteout && k.k->type != desc.key_type); if (hole || k.k->p.offset > h->chain_end + 1) hash_stop_chain(trans, h); if (!hole) { - if (!h->chain) { + if (!h->chain) h->chain = bch2_trans_copy_iter(trans, k_iter); - BUG_ON(IS_ERR(h->chain)); - } h->chain_end = k.k->p.offset; } @@ -322,7 +319,7 @@ static int hash_check_key(struct btree_trans *trans, bch_err(c, "hash_redo_key err %i", ret); return ret; } - return 1; + return -EINTR; } ret = hash_check_duplicates(trans, desc, h, k_iter, k); @@ -399,7 +396,7 @@ err_redo: if (fsck_err(c, "cannot fix dirent by removing trailing garbage %s (%zu)\n" "hash table key at wrong offset: btree %u, offset %llu, " "hashed to %llu chain starts at %llu\n%s", - buf, strlen(buf), BTREE_ID_DIRENTS, + buf, strlen(buf), BTREE_ID_dirents, k->k->p.offset, hash, h->chain->pos.offset, (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { @@ -416,18 +413,10 @@ err_redo: goto err; } -static int 
bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size) -{ - return bch2_btree_delete_range(c, BTREE_ID_EXTENTS, - POS(inode_nr, round_up(new_size, block_bytes(c)) >> 9), - POS(inode_nr + 1, 0), NULL); -} - -static int bch2_fix_overlapping_extent(struct btree_trans *trans, - struct btree_iter *iter, +static int fix_overlapping_extent(struct btree_trans *trans, struct bkey_s_c k, struct bpos cut_at) { - struct btree_iter *u_iter; + struct btree_iter *iter; struct bkey_i *u; int ret; @@ -439,24 +428,24 @@ static int bch2_fix_overlapping_extent(struct btree_trans *trans, bkey_reassemble(u, k); bch2_cut_front(cut_at, u); - u_iter = bch2_trans_copy_iter(trans, iter); - ret = PTR_ERR_OR_ZERO(u_iter); - if (ret) - return ret; /* - * We don't want to go through the - * extent_handle_overwrites path: + * We don't want to go through the extent_handle_overwrites path: + * + * XXX: this is going to screw up disk accounting, extent triggers + * assume things about extent overwrites - we should be running the + * triggers manually here */ - __bch2_btree_iter_set_pos(u_iter, u->k.p, false); + iter = bch2_trans_get_iter(trans, BTREE_ID_extents, u->k.p, + BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS); - /* - * XXX: this is going to leave disk space - * accounting slightly wrong - */ - ret = bch2_trans_update(trans, u_iter, u, 0); - bch2_trans_iter_put(trans, u_iter); - return ret; + BUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); + bch2_trans_update(trans, iter, u, BTREE_TRIGGER_NORUN); + bch2_trans_iter_put(trans, iter); + + return bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); } /* @@ -470,21 +459,45 @@ static int check_extents(struct bch_fs *c) struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - struct bkey_on_stack prev; - u64 i_sectors; + struct bkey_buf prev; + u64 i_sectors = 0; int ret = 0; - bkey_on_stack_init(&prev); + bch2_bkey_buf_init(&prev); prev.k->k = KEY(0, 0, 0); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); bch_verbose(c, "checking extents"); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS(BCACHEFS_ROOT_INO, 0), BTREE_ITER_INTENT); retry: - for_each_btree_key_continue(iter, 0, k, ret) { + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k))) { + if (w.have_inode && + w.cur_inum != k.k->p.inode && + !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) && + fsck_err_on(w.inode.bi_sectors != i_sectors, c, + "inode %llu has incorrect i_sectors: got %llu, should be %llu", + w.inode.bi_inum, + w.inode.bi_sectors, i_sectors)) { + struct btree_iter *inode_iter = + bch2_trans_get_iter(&trans, BTREE_ID_inodes, + POS(0, w.cur_inum), + BTREE_ITER_INTENT); + + w.inode.bi_sectors = i_sectors; + + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + bch2_inode_write(&trans, inode_iter, &w.inode)); + bch2_trans_iter_put(&trans, inode_iter); + if (ret) + break; + } + if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { char buf1[200]; char buf2[200]; @@ -492,86 +505,55 @@ retry: bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k)); bch2_bkey_val_to_text(&PBUF(buf2), c, k); - if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) { - ret = __bch2_trans_do(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - bch2_fix_overlapping_extent(&trans, - iter, k, prev.k->k.p)); - if (ret) - goto err; - } + if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) + return fix_overlapping_extent(&trans, k, 
prev.k->k.p) ?: -EINTR; } - bkey_on_stack_reassemble(&prev, c, k); ret = walk_inode(&trans, &w, k.k->p.inode); if (ret) break; + if (w.first_this_inode) + i_sectors = 0; + if (fsck_err_on(!w.have_inode, c, - "extent type %u for missing inode %llu", - k.k->type, k.k->p.inode) || + "extent type %u for missing inode %llu", + k.k->type, k.k->p.inode) || fsck_err_on(w.have_inode && - !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c, - "extent type %u for non regular file, inode %llu mode %o", - k.k->type, k.k->p.inode, w.inode.bi_mode)) { - bch2_trans_unlock(&trans); - - ret = bch2_inode_truncate(c, k.k->p.inode, 0); - if (ret) - goto err; - continue; + !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c, + "extent type %u for non regular file, inode %llu mode %o", + k.k->type, k.k->p.inode, w.inode.bi_mode)) { + bch2_fs_lazy_rw(c); + return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents, + POS(k.k->p.inode, 0), + POS(k.k->p.inode, U64_MAX), + NULL) ?: -EINTR; } - if (fsck_err_on(w.first_this_inode && - w.have_inode && - !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) && - w.inode.bi_sectors != - (i_sectors = bch2_count_inode_sectors(&trans, w.cur_inum)), - c, "inode %llu has incorrect i_sectors: got %llu, should be %llu", - w.inode.bi_inum, - w.inode.bi_sectors, i_sectors)) { - struct bkey_inode_buf p; - - w.inode.bi_sectors = i_sectors; - - bch2_trans_unlock(&trans); - - bch2_inode_pack(&p, &w.inode); - - ret = bch2_btree_insert(c, BTREE_ID_INODES, - &p.inode.k_i, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); - if (ret) { - bch_err(c, "error in fsck: error %i updating inode", ret); - goto err; - } - - /* revalidate iterator: */ - k = bch2_btree_iter_peek(iter); + if (fsck_err_on(w.have_inode && + !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && + k.k->type != KEY_TYPE_reservation && + k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c, + "extent type %u offset %llu past end of inode %llu, i_size %llu", + k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) { + bch2_fs_lazy_rw(c); + return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents, + POS(k.k->p.inode, round_up(w.inode.bi_size, block_bytes(c))), + POS(k.k->p.inode, U64_MAX), + NULL) ?: -EINTR; } - if (fsck_err_on(w.have_inode && - !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && - k.k->type != KEY_TYPE_reservation && - k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c, - "extent type %u offset %llu past end of inode %llu, i_size %llu", - k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) { - bch2_trans_unlock(&trans); + if (bkey_extent_is_allocation(k.k)) + i_sectors += k.k->size; + bch2_bkey_buf_reassemble(&prev, c, k); - ret = bch2_inode_truncate(c, k.k->p.inode, - w.inode.bi_size); - if (ret) - goto err; - continue; - } + bch2_btree_iter_advance(iter); } -err: fsck_err: if (ret == -EINTR) goto retry; - bkey_on_stack_exit(&prev, c); + bch2_trans_iter_put(&trans, iter); + bch2_bkey_buf_exit(&prev, c); return bch2_trans_exit(&trans) ?: ret; } @@ -597,10 +579,11 @@ static int check_dirents(struct bch_fs *c) hash_check_init(&h); - iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, + iter = bch2_trans_get_iter(&trans, BTREE_ID_dirents, POS(BCACHEFS_ROOT_INO, 0), 0); retry: - for_each_btree_key_continue(iter, 0, k, ret) { + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k))) { struct bkey_s_c_dirent d; struct bch_inode_unpacked target; bool have_target; @@ -675,7 +658,7 @@ retry: continue; } - ret = bch2_inode_find_by_inum_trans(&trans, 
d_inum, &target); + ret = __bch2_inode_find_by_inum_trans(&trans, d_inum, &target, 0); if (ret && ret != -ENOENT) break; @@ -692,6 +675,39 @@ retry: continue; } + if (!target.bi_nlink && + !(target.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) && + (target.bi_dir != k.k->p.inode || + target.bi_dir_offset != k.k->p.offset) && + (fsck_err_on(c->sb.version >= bcachefs_metadata_version_inode_backpointers, c, + "inode %llu has wrong backpointer:\n" + "got %llu:%llu\n" + "should be %llu:%llu", + d_inum, + target.bi_dir, + target.bi_dir_offset, + k.k->p.inode, + k.k->p.offset) || + c->opts.version_upgrade)) { + struct bkey_inode_buf p; + + target.bi_dir = k.k->p.inode; + target.bi_dir_offset = k.k->p.offset; + bch2_trans_unlock(&trans); + + bch2_inode_pack(c, &p, &target); + + ret = bch2_btree_insert(c, BTREE_ID_inodes, + &p.inode.k_i, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); + if (ret) { + bch_err(c, "error in fsck: error %i updating inode", ret); + goto err; + } + continue; + } + if (fsck_err_on(have_target && d.v->d_type != mode_to_type(target.bi_mode), c, @@ -719,6 +735,8 @@ retry: goto err; } + + bch2_btree_iter_advance(iter); } hash_stop_chain(&trans, &h); @@ -727,6 +745,8 @@ fsck_err: if (ret == -EINTR) goto retry; + bch2_trans_iter_put(&trans, h.chain); + bch2_trans_iter_put(&trans, iter); return bch2_trans_exit(&trans) ?: ret; } @@ -749,10 +769,11 @@ static int check_xattrs(struct bch_fs *c) bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, + iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS(BCACHEFS_ROOT_INO, 0), 0); retry: - for_each_btree_key_continue(iter, 0, k, ret) { + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k))) { ret = walk_inode(&trans, &w, k.k->p.inode); if (ret) break; @@ -762,7 +783,7 @@ retry: k.k->p.inode)) { ret = bch2_btree_delete_at(&trans, iter, 0); if (ret) - goto err; + break; continue; } @@ -772,12 +793,16 @@ retry: ret = hash_check_key(&trans, bch2_xattr_hash_desc, &h, iter, k); if (ret) - goto fsck_err; + break; + + bch2_btree_iter_advance(iter); } -err: fsck_err: if (ret == -EINTR) goto retry; + + bch2_trans_iter_put(&trans, h.chain); + bch2_trans_iter_put(&trans, iter); return bch2_trans_exit(&trans) ?: ret; } @@ -789,7 +814,9 @@ static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode) bch_verbose(c, "checking root directory"); - ret = bch2_inode_find_by_inum(c, BCACHEFS_ROOT_INO, root_inode); + ret = bch2_trans_do(c, NULL, NULL, 0, + __bch2_inode_find_by_inum_trans(&trans, BCACHEFS_ROOT_INO, + root_inode, 0)); if (ret && ret != -ENOENT) return ret; @@ -808,9 +835,9 @@ create_root: 0, NULL); root_inode->bi_inum = BCACHEFS_ROOT_INO; - bch2_inode_pack(&packed, root_inode); + bch2_inode_pack(c, &packed, root_inode); - return bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, + return bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW); @@ -836,7 +863,8 @@ static int check_lostfound(struct bch_fs *c, goto create_lostfound; } - ret = bch2_inode_find_by_inum(c, inum, lostfound_inode); + ret = bch2_trans_do(c, NULL, NULL, 0, + __bch2_inode_find_by_inum_trans(&trans, inum, lostfound_inode, 0)); if (ret && ret != -ENOENT) return ret; @@ -866,36 +894,22 @@ create_lostfound: return ret; } -struct inode_bitmap { - unsigned long *bits; - size_t size; -}; +typedef GENRADIX(unsigned long) inode_bitmap; -static inline bool inode_bitmap_test(struct inode_bitmap *b, size_t nr) +static inline bool 
inode_bitmap_test(inode_bitmap *b, size_t nr) { - return nr < b->size ? test_bit(nr, b->bits) : false; + unsigned long *w = genradix_ptr(b, nr / BITS_PER_LONG); + return w ? test_bit(nr & (BITS_PER_LONG - 1), w) : false; } -static inline int inode_bitmap_set(struct inode_bitmap *b, size_t nr) +static inline int inode_bitmap_set(inode_bitmap *b, size_t nr) { - if (nr >= b->size) { - size_t new_size = max_t(size_t, max_t(size_t, - PAGE_SIZE * 8, - b->size * 2), - nr + 1); - void *n; - - new_size = roundup_pow_of_two(new_size); - n = krealloc(b->bits, new_size / 8, GFP_KERNEL|__GFP_ZERO); - if (!n) { - return -ENOMEM; - } + unsigned long *w = genradix_ptr_alloc(b, nr / BITS_PER_LONG, GFP_KERNEL); - b->bits = n; - b->size = new_size; - } + if (!w) + return -ENOMEM; - __set_bit(nr, b->bits); + *w |= 1UL << (nr & (BITS_PER_LONG - 1)); return 0; } @@ -934,7 +948,7 @@ noinline_for_stack static int check_directory_structure(struct bch_fs *c, struct bch_inode_unpacked *lostfound_inode) { - struct inode_bitmap dirs_done = { NULL, 0 }; + inode_bitmap dirs_done; struct pathbuf path = { 0, 0, NULL }; struct pathbuf_entry *e; struct btree_trans trans; @@ -951,6 +965,7 @@ static int check_directory_structure(struct bch_fs *c, /* DFS: */ restart_dfs: + genradix_init(&dirs_done); had_unreachable = false; ret = inode_bitmap_set(&dirs_done, BCACHEFS_ROOT_INO); @@ -970,7 +985,7 @@ next: if (e->offset == U64_MAX) goto up; - for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, + for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS(e->inum, e->offset + 1), 0, k, ret) { if (k.k->p.inode != e->inum) break; @@ -1023,7 +1038,7 @@ up: path.nr--; } - iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS_MIN, 0); + iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS_MIN, 0); retry: for_each_btree_key_continue(iter, 0, k, ret) { if (k.k->type != KEY_TYPE_inode) @@ -1057,7 +1072,7 @@ retry: if (had_unreachable) { bch_info(c, "reattached unreachable directories, restarting pass to check for loops"); - kfree(dirs_done.bits); + genradix_free(&dirs_done); kfree(path.entries); memset(&dirs_done, 0, sizeof(dirs_done)); memset(&path, 0, sizeof(path)); @@ -1066,7 +1081,7 @@ retry: err: fsck_err: ret = bch2_trans_exit(&trans) ?: ret; - kfree(dirs_done.bits); + genradix_free(&dirs_done); kfree(path.entries); return ret; } @@ -1087,6 +1102,11 @@ static void inc_link(struct bch_fs *c, nlink_table *links, if (inum < range_start || inum >= *range_end) return; + if (inum - range_start >= SIZE_MAX / sizeof(struct nlink)) { + *range_end = inum; + return; + } + link = genradix_ptr_alloc(links, inum - range_start, GFP_KERNEL); if (!link) { bch_verbose(c, "allocation failed during fsck - will need another pass"); @@ -1115,7 +1135,7 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, inc_link(c, links, range_start, range_end, BCACHEFS_ROOT_INO, false); - for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, 0, k, ret) { + for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN, 0, k, ret) { switch (k.k->type) { case KEY_TYPE_dirent: d = bkey_s_c_to_dirent(k); @@ -1133,6 +1153,8 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, bch2_trans_cond_resched(&trans); } + bch2_trans_iter_put(&trans, iter); + ret = bch2_trans_exit(&trans) ?: ret; if (ret) bch_err(c, "error in fsck: btree error %i while walking dirents", ret); @@ -1267,7 +1289,7 @@ static int check_inode(struct btree_trans *trans, bch2_fs_lazy_rw(c); - ret = bch2_inode_rm(c, u.bi_inum); + ret = bch2_inode_rm(c, u.bi_inum, 
false); if (ret) bch_err(c, "error in fsck: error %i while deleting inode", ret); return ret; @@ -1285,8 +1307,10 @@ static int check_inode(struct btree_trans *trans, * XXX: need to truncate partial blocks too here - or ideally * just switch units to bytes and that issue goes away */ - - ret = bch2_inode_truncate(c, u.bi_inum, u.bi_size); + ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, + POS(u.bi_inum, round_up(u.bi_size, block_bytes(c))), + POS(u.bi_inum, U64_MAX), + NULL); if (ret) { bch_err(c, "error in fsck: error %i truncating inode", ret); return ret; @@ -1323,10 +1347,21 @@ static int check_inode(struct btree_trans *trans, do_update = true; } + if (!S_ISDIR(u.bi_mode) && + u.bi_nlink && + !(u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) && + (fsck_err_on(c->sb.version >= bcachefs_metadata_version_inode_backpointers, c, + "inode missing BCH_INODE_BACKPTR_UNTRUSTED flags") || + c->opts.version_upgrade)) { + u.bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED; + do_update = true; + } + if (do_update) { struct bkey_inode_buf p; - bch2_inode_pack(&p, &u); + bch2_inode_pack(c, &p, &u); + p.inode.k.p = iter->pos; ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| @@ -1356,28 +1391,30 @@ static int bch2_gc_walk_inodes(struct bch_fs *c, bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, + iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS(0, range_start), 0); nlinks_iter = genradix_iter_init(links, 0); while ((k = bch2_btree_iter_peek(iter)).k && - !(ret2 = bkey_err(k))) { + !(ret2 = bkey_err(k)) && + iter->pos.offset < range_end) { peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); if (!link && (!k.k || iter->pos.offset >= range_end)) break; nlinks_pos = range_start + nlinks_iter.pos; - if (iter->pos.offset > nlinks_pos) { + + if (link && nlinks_pos < iter->pos.offset) { /* Should have been caught by dirents pass: */ - need_fsck_err_on(link && link->count, c, + need_fsck_err_on(link->count, c, "missing inode %llu (nlink %u)", nlinks_pos, link->count); genradix_iter_advance(&nlinks_iter, links); goto peek_nlinks; } - if (iter->pos.offset < nlinks_pos || !link) + if (!link || nlinks_pos > iter->pos.offset) link = &zero_links; if (k.k && k.k->type == KEY_TYPE_inode) { @@ -1396,10 +1433,11 @@ peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); if (nlinks_pos == iter->pos.offset) genradix_iter_advance(&nlinks_iter, links); - bch2_btree_iter_next(iter); + bch2_btree_iter_advance(iter); bch2_trans_cond_resched(&trans); } fsck_err: + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); if (ret2) @@ -1480,7 +1518,7 @@ int bch2_fsck_walk_inodes_only(struct bch_fs *c) bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, 0, k, ret) { + for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, 0, k, ret) { if (k.k->type != KEY_TYPE_inode) continue; @@ -1491,11 +1529,12 @@ int bch2_fsck_walk_inodes_only(struct bch_fs *c) BCH_INODE_I_SECTORS_DIRTY| BCH_INODE_UNLINKED)) { ret = check_inode(&trans, NULL, iter, inode, NULL); - BUG_ON(ret == -EINTR); if (ret) break; } } + bch2_trans_iter_put(&trans, iter); + BUG_ON(ret == -EINTR); return bch2_trans_exit(&trans) ?: ret; diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index 7d20f08..d4c3283 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -1,12 +1,14 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "btree_key_cache.h" #include "bkey_methods.h" #include "btree_update.h" 
#include "error.h" #include "extents.h" #include "inode.h" #include "str_hash.h" +#include "varint.h" #include @@ -88,22 +90,17 @@ static int inode_decode_field(const u8 *in, const u8 *end, return bytes; } -void bch2_inode_pack(struct bkey_inode_buf *packed, - const struct bch_inode_unpacked *inode) +static noinline void bch2_inode_pack_v1(struct bkey_inode_buf *packed, + const struct bch_inode_unpacked *inode) { - u8 *out = packed->inode.v.fields; + struct bkey_i_inode *k = &packed->inode; + u8 *out = k->v.fields; u8 *end = (void *) &packed[1]; u8 *last_nonzero_field = out; unsigned nr_fields = 0, last_nonzero_fieldnr = 0; unsigned bytes; - bkey_inode_init(&packed->inode.k_i); - packed->inode.k.p.offset = inode->bi_inum; - packed->inode.v.bi_hash_seed = inode->bi_hash_seed; - packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags); - packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); - -#define x(_name, _bits) \ +#define x(_name, _bits) \ out += inode_encode_field(out, end, 0, inode->_name); \ nr_fields++; \ \ @@ -122,7 +119,69 @@ void bch2_inode_pack(struct bkey_inode_buf *packed, set_bkey_val_bytes(&packed->inode.k, bytes); memset_u64s_tail(&packed->inode.v, 0, bytes); - SET_INODE_NR_FIELDS(&packed->inode.v, nr_fields); + SET_INODE_NR_FIELDS(&k->v, nr_fields); +} + +static void bch2_inode_pack_v2(struct bkey_inode_buf *packed, + const struct bch_inode_unpacked *inode) +{ + struct bkey_i_inode *k = &packed->inode; + u8 *out = k->v.fields; + u8 *end = (void *) &packed[1]; + u8 *last_nonzero_field = out; + unsigned nr_fields = 0, last_nonzero_fieldnr = 0; + unsigned bytes; + int ret; + +#define x(_name, _bits) \ + nr_fields++; \ + \ + if (inode->_name) { \ + ret = bch2_varint_encode(out, inode->_name); \ + out += ret; \ + \ + if (_bits > 64) \ + *out++ = 0; \ + \ + last_nonzero_field = out; \ + last_nonzero_fieldnr = nr_fields; \ + } else { \ + *out++ = 0; \ + \ + if (_bits > 64) \ + *out++ = 0; \ + } + + BCH_INODE_FIELDS() +#undef x + BUG_ON(out > end); + + out = last_nonzero_field; + nr_fields = last_nonzero_fieldnr; + + bytes = out - (u8 *) &packed->inode.v; + set_bkey_val_bytes(&packed->inode.k, bytes); + memset_u64s_tail(&packed->inode.v, 0, bytes); + + SET_INODE_NR_FIELDS(&k->v, nr_fields); +} + +void bch2_inode_pack(struct bch_fs *c, + struct bkey_inode_buf *packed, + const struct bch_inode_unpacked *inode) +{ + bkey_inode_init(&packed->inode.k_i); + packed->inode.k.p.offset = inode->bi_inum; + packed->inode.v.bi_hash_seed = inode->bi_hash_seed; + packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags); + packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); + + if (c->sb.features & (1ULL << BCH_FEATURE_new_varint)) { + SET_INODE_NEW_VARINT(&packed->inode.v, true); + bch2_inode_pack_v2(packed, inode); + } else { + bch2_inode_pack_v1(packed, inode); + } if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { struct bch_inode_unpacked unpacked; @@ -134,26 +193,23 @@ void bch2_inode_pack(struct bkey_inode_buf *packed, BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed); BUG_ON(unpacked.bi_mode != inode->bi_mode); -#define x(_name, _bits) BUG_ON(unpacked._name != inode->_name); +#define x(_name, _bits) if (unpacked._name != inode->_name) \ + panic("unpacked %llu should be %llu", \ + (u64) unpacked._name, (u64) inode->_name); BCH_INODE_FIELDS() #undef x } } -int bch2_inode_unpack(struct bkey_s_c_inode inode, - struct bch_inode_unpacked *unpacked) +static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, + struct bch_inode_unpacked *unpacked) { const u8 *in = inode.v->fields; - const 
u8 *end = (void *) inode.v + bkey_val_bytes(inode.k); + const u8 *end = bkey_val_end(inode); u64 field[2]; unsigned fieldnr = 0, field_bits; int ret; - unpacked->bi_inum = inode.k->p.offset; - unpacked->bi_hash_seed = inode.v->bi_hash_seed; - unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); - unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); - #define x(_name, _bits) \ if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \ memset(&unpacked->_name, 0, \ @@ -176,6 +232,62 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode, #undef x /* XXX: signal if there were more fields than expected? */ + return 0; +} + +static int bch2_inode_unpack_v2(struct bkey_s_c_inode inode, + struct bch_inode_unpacked *unpacked) +{ + const u8 *in = inode.v->fields; + const u8 *end = bkey_val_end(inode); + unsigned fieldnr = 0; + int ret; + u64 v[2]; + +#define x(_name, _bits) \ + if (fieldnr < INODE_NR_FIELDS(inode.v)) { \ + ret = bch2_varint_decode(in, end, &v[0]); \ + if (ret < 0) \ + return ret; \ + in += ret; \ + \ + if (_bits > 64) { \ + ret = bch2_varint_decode(in, end, &v[1]); \ + if (ret < 0) \ + return ret; \ + in += ret; \ + } else { \ + v[1] = 0; \ + } \ + } else { \ + v[0] = v[1] = 0; \ + } \ + \ + unpacked->_name = v[0]; \ + if (v[1] || v[0] != unpacked->_name) \ + return -1; \ + fieldnr++; + + BCH_INODE_FIELDS() +#undef x + + /* XXX: signal if there were more fields than expected? */ + return 0; +} + +int bch2_inode_unpack(struct bkey_s_c_inode inode, + struct bch_inode_unpacked *unpacked) +{ + unpacked->bi_inum = inode.k->p.offset; + unpacked->bi_hash_seed = inode.v->bi_hash_seed; + unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); + unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); + + if (INODE_NEW_VARINT(inode.v)) { + return bch2_inode_unpack_v2(inode, unpacked); + } else { + return bch2_inode_unpack_v1(inode, unpacked); + } return 0; } @@ -188,12 +300,9 @@ struct btree_iter *bch2_inode_peek(struct btree_trans *trans, struct bkey_s_c k; int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(0, inum), - BTREE_ITER_SLOTS|flags); - if (IS_ERR(iter)) - return iter; - - k = bch2_btree_iter_peek_slot(iter); + iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, inum), + BTREE_ITER_CACHED|flags); + k = bch2_btree_iter_peek_cached(iter); ret = bkey_err(k); if (ret) goto err; @@ -222,7 +331,8 @@ int bch2_inode_write(struct btree_trans *trans, if (IS_ERR(inode_p)) return PTR_ERR(inode_p); - bch2_inode_pack(inode_p, inode); + bch2_inode_pack(trans->c, inode_p, inode); + inode_p->inode.k.p.snapshot = iter->snapshot; bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); return 0; } @@ -271,6 +381,8 @@ void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, return; } + pr_buf(out, "mode: %o ", unpacked.bi_mode); + #define x(_name, _bits) \ pr_buf(out, #_name ": %llu ", (u64) unpacked._name); BCH_INODE_FIELDS() @@ -358,72 +470,113 @@ static inline u32 bkey_generation(struct bkey_s_c k) } } -int bch2_inode_create(struct btree_trans *trans, - struct bch_inode_unpacked *inode_u, - u64 min, u64 max, u64 *hint) +struct btree_iter *bch2_inode_create(struct btree_trans *trans, + struct bch_inode_unpacked *inode_u, + u32 snapshot) { - struct bkey_inode_buf *inode_p; + struct bch_fs *c = trans->c; struct btree_iter *iter = NULL; struct bkey_s_c k; - u64 start; + u64 min, max, start, pos, *hint; int ret; - if (!max) - max = ULLONG_MAX; + u64 cpu = raw_smp_processor_id(); + unsigned bits = (c->opts.inodes_32bit + ? 
31 : 63) - c->inode_shard_bits; + + min = (cpu << bits); + max = (cpu << bits) | ~(ULLONG_MAX << bits); - if (trans->c->opts.inodes_32bit) - max = min_t(u64, max, U32_MAX); + min = max_t(u64, min, BLOCKDEV_INODE_MAX); + hint = c->unused_inode_hints + cpu; start = READ_ONCE(*hint); if (start >= max || start < min) start = min; - inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); - if (IS_ERR(inode_p)) - return PTR_ERR(inode_p); + pos = start; + iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, pos), + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_INTENT); again: - for_each_btree_key(trans, iter, BTREE_ID_INODES, POS(0, start), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - if (bkey_cmp(iter->pos, POS(0, max)) > 0) - break; + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k)) && + bkey_cmp(k.k->p, POS(0, max)) < 0) { + while (pos < iter->pos.offset) { + if (!bch2_btree_key_cache_find(c, BTREE_ID_inodes, POS(0, pos))) + goto found_slot; + + pos++; + } - if (k.k->type != KEY_TYPE_inode) - goto found_slot; + if (k.k->p.snapshot == snapshot && + k.k->type != KEY_TYPE_inode && + !bch2_btree_key_cache_find(c, BTREE_ID_inodes, SPOS(0, pos, snapshot))) { + bch2_btree_iter_next(iter); + continue; + } + + /* + * We don't need to iterate over keys in every snapshot once + * we've found just one: + */ + pos = iter->pos.offset + 1; + bch2_btree_iter_set_pos(iter, POS(0, pos)); } - bch2_trans_iter_put(trans, iter); + while (!ret && pos < max) { + if (!bch2_btree_key_cache_find(c, BTREE_ID_inodes, POS(0, pos))) + goto found_slot; - if (ret) - return ret; + pos++; + } - if (start != min) { - /* Retry from start */ - start = min; - goto again; + if (!ret && start == min) + ret = -ENOSPC; + + if (ret) { + bch2_trans_iter_put(trans, iter); + return ERR_PTR(ret); } - return -ENOSPC; + /* Retry from start */ + pos = start = min; + bch2_btree_iter_set_pos(iter, POS(0, pos)); + goto again; found_slot: + bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot)); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) { + bch2_trans_iter_put(trans, iter); + return ERR_PTR(ret); + } + + /* We may have raced while the iterator wasn't pointing at pos: */ + if (k.k->type == KEY_TYPE_inode || + bch2_btree_key_cache_find(c, BTREE_ID_inodes, k.k->p)) + goto again; + *hint = k.k->p.offset; inode_u->bi_inum = k.k->p.offset; inode_u->bi_generation = bkey_generation(k); - - bch2_inode_pack(inode_p, inode_u); - bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); - bch2_trans_iter_put(trans, iter); - return 0; + return iter; } -int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) +int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter *iter = NULL; struct bkey_i_inode_generation delete; struct bpos start = POS(inode_nr, 0); struct bpos end = POS(inode_nr + 1, 0); + struct bch_inode_unpacked inode_u; + struct bkey_s_c k; int ret; + bch2_trans_init(&trans, c, 0, 0); + /* * If this was a directory, there shouldn't be any real dirents left - * but there could be whiteouts (from hash collisions) that we should @@ -432,79 +585,71 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) * XXX: the dirent could ideally would delete whiteouts when they're no * longer needed */ - ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS, - start, end, NULL) ?: - bch2_btree_delete_range(c, BTREE_ID_XATTRS, - start, end, NULL) ?: - bch2_btree_delete_range(c, BTREE_ID_DIRENTS, - start, end, NULL); + ret = 
bch2_btree_delete_range_trans(&trans, BTREE_ID_extents, + start, end, NULL) ?: + bch2_btree_delete_range_trans(&trans, BTREE_ID_xattrs, + start, end, NULL) ?: + bch2_btree_delete_range_trans(&trans, BTREE_ID_dirents, + start, end, NULL); if (ret) - return ret; - - bch2_trans_init(&trans, c, 0, 0); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - do { - struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); - u32 bi_generation = 0; - - ret = bkey_err(k); - if (ret) - break; + goto err; +retry: + bch2_trans_begin(&trans); + + if (cached) { + iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS(0, inode_nr), + BTREE_ITER_CACHED|BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_cached(iter); + } else { + iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS(0, inode_nr), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(iter); + } - bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c, - "inode %llu not found when deleting", - inode_nr); + ret = bkey_err(k); + if (ret) + goto err; - switch (k.k->type) { - case KEY_TYPE_inode: { - struct bch_inode_unpacked inode_u; + if (k.k->type != KEY_TYPE_inode) { + bch2_fs_inconsistent(trans.c, + "inode %llu not found when deleting", + inode_nr); + ret = -EIO; + goto err; + } - if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u)) - bi_generation = inode_u.bi_generation + 1; - break; - } - case KEY_TYPE_inode_generation: { - struct bkey_s_c_inode_generation g = - bkey_s_c_to_inode_generation(k); - bi_generation = le32_to_cpu(g.v->bi_generation); - break; - } - } + bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u); - if (!bi_generation) { - bkey_init(&delete.k); - delete.k.p.offset = inode_nr; - } else { - bkey_inode_generation_init(&delete.k_i); - delete.k.p.offset = inode_nr; - delete.v.bi_generation = cpu_to_le32(bi_generation); - } + bkey_inode_generation_init(&delete.k_i); + delete.k.p = iter->pos; + delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); - bch2_trans_update(&trans, iter, &delete.k_i, 0); + bch2_trans_update(&trans, iter, &delete.k_i, 0); - ret = bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL); - } while (ret == -EINTR); + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL); +err: + bch2_trans_iter_put(&trans, iter); + if (ret == -EINTR) + goto retry; bch2_trans_exit(&trans); return ret; } -int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, - struct bch_inode_unpacked *inode) +int __bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, + struct bch_inode_unpacked *inode, + unsigned flags) { struct btree_iter *iter; struct bkey_s_c k; int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, - POS(0, inode_nr), BTREE_ITER_SLOTS); - if (IS_ERR(iter)) - return PTR_ERR(iter); - - k = bch2_btree_iter_peek_slot(iter); + iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, + POS(0, inode_nr), flags); + k = (flags & BTREE_ITER_TYPE) == BTREE_ITER_CACHED + ? 
bch2_btree_iter_peek_cached(iter) + : bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) goto err; @@ -517,38 +662,17 @@ err: return ret; } +int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, + struct bch_inode_unpacked *inode) +{ + return __bch2_inode_find_by_inum_trans(trans, inode_nr, + inode, BTREE_ITER_CACHED); + +} + int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, struct bch_inode_unpacked *inode) { return bch2_trans_do(c, NULL, NULL, 0, bch2_inode_find_by_inum_trans(&trans, inode_nr, inode)); } - -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_inode_pack_test(void) -{ - struct bch_inode_unpacked *u, test_inodes[] = { - { - .bi_atime = U64_MAX, - .bi_ctime = U64_MAX, - .bi_mtime = U64_MAX, - .bi_otime = U64_MAX, - .bi_size = U64_MAX, - .bi_sectors = U64_MAX, - .bi_uid = U32_MAX, - .bi_gid = U32_MAX, - .bi_nlink = U32_MAX, - .bi_generation = U32_MAX, - .bi_dev = U32_MAX, - }, - }; - - for (u = test_inodes; - u < test_inodes + ARRAY_SIZE(test_inodes); - u++) { - struct bkey_inode_buf p; - - bch2_inode_pack(&p, u); - } -} -#endif diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h index bb759a4..23c322d 100644 --- a/libbcachefs/inode.h +++ b/libbcachefs/inode.h @@ -24,6 +24,14 @@ void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, .val_to_text = bch2_inode_generation_to_text, \ } +#if 0 +typedef struct { + u64 lo; + u32 hi; +} __packed __aligned(4) u96; +#endif +typedef u64 u96; + struct bch_inode_unpacked { u64 bi_inum; __le64 bi_hash_seed; @@ -43,7 +51,8 @@ struct bkey_inode_buf { #undef x } __attribute__((packed, aligned(8))); -void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *); +void bch2_inode_pack(struct bch_fs *, struct bkey_inode_buf *, + const struct bch_inode_unpacked *); int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); struct btree_iter *bch2_inode_peek(struct btree_trans *, @@ -60,12 +69,13 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, uid_t, gid_t, umode_t, dev_t, struct bch_inode_unpacked *); -int bch2_inode_create(struct btree_trans *, - struct bch_inode_unpacked *, - u64, u64, u64 *); +struct btree_iter *bch2_inode_create(struct btree_trans *, + struct bch_inode_unpacked *, u32); -int bch2_inode_rm(struct bch_fs *, u64); +int bch2_inode_rm(struct bch_fs *, u64, bool); +int __bch2_inode_find_by_inum_trans(struct btree_trans *, u64, + struct bch_inode_unpacked *, unsigned); int bch2_inode_find_by_inum_trans(struct btree_trans *, u64, struct bch_inode_unpacked *); int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *); @@ -168,10 +178,4 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, } } -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_inode_pack_test(void); -#else -static inline void bch2_inode_pack_test(void) {} -#endif - #endif /* _BCACHEFS_INODE_H */ diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 8add8cc..36b10cb 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -9,7 +9,7 @@ #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" -#include "bkey_on_stack.h" +#include "bkey_buf.h" #include "bset.h" #include "btree_update.h" #include "buckets.h" @@ -171,7 +171,7 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, while (size) { struct page *page = __bio_alloc_page_pool(c, &using_mempool); - unsigned len = min(PAGE_SIZE, size); + unsigned len = min_t(size_t, PAGE_SIZE, size); BUG_ON(!bio_add_page(bio, page, len, 0)); size -= len; @@ 
-183,39 +183,47 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, /* Extent update path: */ -static int sum_sector_overwrites(struct btree_trans *trans, - struct btree_iter *extent_iter, - struct bkey_i *new, - bool may_allocate, - bool *maybe_extending, - s64 *delta) +int bch2_sum_sector_overwrites(struct btree_trans *trans, + struct btree_iter *extent_iter, + struct bkey_i *new, + bool *maybe_extending, + bool *should_check_enospc, + s64 *i_sectors_delta, + s64 *disk_sectors_delta) { + struct bch_fs *c = trans->c; struct btree_iter *iter; struct bkey_s_c old; + unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new)); + bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new)); int ret = 0; - *maybe_extending = true; - *delta = 0; + *maybe_extending = true; + *should_check_enospc = false; + *i_sectors_delta = 0; + *disk_sectors_delta = 0; iter = bch2_trans_copy_iter(trans, extent_iter); - if (IS_ERR(iter)) - return PTR_ERR(iter); for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) { - if (!may_allocate && - bch2_bkey_nr_ptrs_fully_allocated(old) < - bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new))) { - ret = -ENOSPC; - break; - } + s64 sectors = min(new->k.p.offset, old.k->p.offset) - + max(bkey_start_offset(&new->k), + bkey_start_offset(old.k)); - *delta += (min(new->k.p.offset, - old.k->p.offset) - - max(bkey_start_offset(&new->k), - bkey_start_offset(old.k))) * + *i_sectors_delta += sectors * (bkey_extent_is_allocation(&new->k) - bkey_extent_is_allocation(old.k)); + *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new)); + *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot + ? sectors * bch2_bkey_nr_ptrs_fully_allocated(old) + : 0; + + if (!*should_check_enospc && + (new_replicas > bch2_bkey_replicas(c, old) || + (!new_compressed && bch2_bkey_sectors_compressed(old)))) + *should_check_enospc = true; + if (bkey_cmp(old.k->p, new->k.p) >= 0) { /* * Check if there's already data above where we're @@ -249,29 +257,41 @@ int bch2_extent_update(struct btree_trans *trans, struct disk_reservation *disk_res, u64 *journal_seq, u64 new_i_size, - s64 *i_sectors_delta) + s64 *i_sectors_delta_total) { /* this must live until after bch2_trans_commit(): */ struct bkey_inode_buf inode_p; - bool extending = false; - s64 delta = 0; + bool extending = false, should_check_enospc; + s64 i_sectors_delta = 0, disk_sectors_delta = 0; int ret; ret = bch2_extent_trim_atomic(k, iter); if (ret) return ret; - ret = sum_sector_overwrites(trans, iter, k, - disk_res && disk_res->sectors != 0, - &extending, &delta); + ret = bch2_sum_sector_overwrites(trans, iter, k, + &extending, + &should_check_enospc, + &i_sectors_delta, + &disk_sectors_delta); if (ret) return ret; + if (disk_res && + disk_sectors_delta > (s64) disk_res->sectors) { + ret = bch2_disk_reservation_add(trans->c, disk_res, + disk_sectors_delta - disk_res->sectors, + !should_check_enospc + ? BCH_DISK_RESERVATION_NOFAIL : 0); + if (ret) + return ret; + } + new_i_size = extending ? 
min(k->k.p.offset << 9, new_i_size) : 0; - if (delta || new_i_size) { + if (i_sectors_delta || new_i_size) { struct btree_iter *inode_iter; struct bch_inode_unpacked inode_u; @@ -298,10 +318,13 @@ int bch2_extent_update(struct btree_trans *trans, else new_i_size = 0; - inode_u.bi_sectors += delta; + inode_u.bi_sectors += i_sectors_delta; + + if (i_sectors_delta || new_i_size) { + bch2_inode_pack(trans->c, &inode_p, &inode_u); + + inode_p.inode.k.p.snapshot = iter->snapshot; - if (delta || new_i_size) { - bch2_inode_pack(&inode_p, &inode_u); bch2_trans_update(trans, inode_iter, &inode_p.inode.k_i, 0); } @@ -313,12 +336,13 @@ int bch2_extent_update(struct btree_trans *trans, ret = bch2_trans_commit(trans, disk_res, journal_seq, BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE); - if (!ret && i_sectors_delta) - *i_sectors_delta += delta; + BTREE_INSERT_NOFAIL); + if (ret) + return ret; - return ret; + if (i_sectors_delta_total) + *i_sectors_delta_total += i_sectors_delta; + return 0; } int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, @@ -378,12 +402,14 @@ int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end, int ret = 0; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS(inum, start), BTREE_ITER_INTENT); ret = bch2_fpunch_at(&trans, iter, POS(inum, end), journal_seq, i_sectors_delta); + + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); if (ret == -EINTR) @@ -395,17 +421,17 @@ int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end, int bch2_write_index_default(struct bch_write_op *op) { struct bch_fs *c = op->c; - struct bkey_on_stack sk; + struct bkey_buf sk; struct keylist *keys = &op->insert_keys; struct bkey_i *k = bch2_keylist_front(keys); struct btree_trans trans; struct btree_iter *iter; int ret; - bkey_on_stack_init(&sk); + bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, bkey_start_pos(&k->k), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); @@ -414,7 +440,9 @@ int bch2_write_index_default(struct bch_write_op *op) k = bch2_keylist_front(keys); - bkey_on_stack_realloc(&sk, c, k->k.u64s); + k->k.p.snapshot = iter->snapshot; + + bch2_bkey_buf_realloc(&sk, c, k->k.u64s); bkey_copy(sk.k, k); bch2_cut_front(iter->pos, sk.k); @@ -430,8 +458,9 @@ int bch2_write_index_default(struct bch_write_op *op) bch2_keylist_pop_front(keys); } while (!bch2_keylist_empty(keys)); + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); - bkey_on_stack_exit(&sk, c); + bch2_bkey_buf_exit(&sk, c); return ret; } @@ -479,9 +508,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, n->submit_time = local_clock(); n->bio.bi_iter.bi_sector = ptr->offset; - if (!journal_flushes_device(ca)) - n->bio.bi_opf |= REQ_FUA; - if (likely(n->have_ioref)) { this_cpu_add(ca->io_done->sectors[WRITE][type], bio_sectors(&n->bio)); @@ -578,7 +604,8 @@ static void __bch2_write_index(struct bch_write_op *op) op->written += sectors_start - keylist_sectors(keys); if (ret) { - __bcache_io_error(c, "btree IO error %i", ret); + bch_err_inum_ratelimited(c, op->pos.inode, + "write error %i from btree update", ret); op->error = ret; } } @@ -623,7 +650,10 @@ static void bch2_write_endio(struct bio *bio) struct bch_fs *c = wbio->c; struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); - if 
(bch2_dev_io_err_on(bio->bi_status, ca, "data write: %s", + if (bch2_dev_inum_io_err_on(bio->bi_status, ca, + op->pos.inode, + op->pos.offset - bio_sectors(bio), /* XXX definitely wrong */ + "data write error: %s", bch2_blk_status_to_str(bio->bi_status))) set_bit(wbio->dev, op->failed.d); @@ -1281,15 +1311,14 @@ void bch2_write(struct closure *cl) wbio_init(bio)->put_bio = false; if (bio_sectors(bio) & (c->opts.block_size - 1)) { - __bcache_io_error(c, "misaligned write"); + bch_err_inum_ratelimited(c, op->pos.inode, + "misaligned write"); op->error = -EIO; goto err; } if (c->opts.nochanges || !percpu_ref_tryget(&c->writes)) { - if (!(op->flags & BCH_WRITE_FROM_INTERNAL)) - __bcache_io_error(c, "read only"); op->error = -EROFS; goto err; } @@ -1518,8 +1547,8 @@ static struct promote_op *promote_alloc(struct bch_fs *c, promote = __promote_alloc(c, k.k->type == KEY_TYPE_reflink_v - ? BTREE_ID_REFLINK - : BTREE_ID_EXTENTS, + ? BTREE_ID_reflink + : BTREE_ID_extents, k, pos, pick, opts, sectors, rbio); if (!promote) return NULL; @@ -1605,18 +1634,18 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio { struct btree_trans trans; struct btree_iter *iter; - struct bkey_on_stack sk; + struct bkey_buf sk; struct bkey_s_c k; int ret; flags &= ~BCH_READ_LAST_FRAGMENT; flags |= BCH_READ_MUST_CLONE; - bkey_on_stack_init(&sk); + bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - rbio->pos, BTREE_ITER_SLOTS); + iter = bch2_trans_get_iter(&trans, rbio->data_btree, + rbio->read_pos, BTREE_ITER_SLOTS); retry: rbio->bio.bi_status = 0; @@ -1624,109 +1653,38 @@ retry: if (bkey_err(k)) goto err; - bkey_on_stack_reassemble(&sk, c, k); + bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); bch2_trans_unlock(&trans); if (!bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, - rbio->pos.offset - + rbio->data_pos.offset - rbio->pick.crc.offset)) { /* extent we wanted to read no longer exists: */ rbio->hole = true; goto out; } - ret = __bch2_read_extent(&trans, rbio, bvec_iter, k, 0, failed, flags); + ret = __bch2_read_extent(&trans, rbio, bvec_iter, + rbio->read_pos, + rbio->data_btree, + k, 0, failed, flags); if (ret == READ_RETRY) goto retry; if (ret) goto err; out: bch2_rbio_done(rbio); + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); - bkey_on_stack_exit(&sk, c); + bch2_bkey_buf_exit(&sk, c); return; err: rbio->bio.bi_status = BLK_STS_IOERR; goto out; } -static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, u64 inode, - struct bch_io_failures *failed, unsigned flags) -{ - struct btree_trans trans; - struct btree_iter *iter; - struct bkey_on_stack sk; - struct bkey_s_c k; - int ret; - - flags &= ~BCH_READ_LAST_FRAGMENT; - flags |= BCH_READ_MUST_CLONE; - - bkey_on_stack_init(&sk); - bch2_trans_init(&trans, c, 0, 0); -retry: - bch2_trans_begin(&trans); - - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, - POS(inode, bvec_iter.bi_sector), - BTREE_ITER_SLOTS, k, ret) { - unsigned bytes, sectors, offset_into_extent; - - bkey_on_stack_reassemble(&sk, c, k); - - offset_into_extent = iter->pos.offset - - bkey_start_offset(k.k); - sectors = k.k->size - offset_into_extent; - - ret = bch2_read_indirect_extent(&trans, - &offset_into_extent, &sk); - if (ret) - break; - - k = bkey_i_to_s_c(sk.k); - - sectors = min(sectors, k.k->size - offset_into_extent); - - bch2_trans_unlock(&trans); - - bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; - 
swap(bvec_iter.bi_size, bytes); - - ret = __bch2_read_extent(&trans, rbio, bvec_iter, k, - offset_into_extent, failed, flags); - switch (ret) { - case READ_RETRY: - goto retry; - case READ_ERR: - goto err; - }; - - if (bytes == bvec_iter.bi_size) - goto out; - - swap(bvec_iter.bi_size, bytes); - bio_advance_iter(&rbio->bio, &bvec_iter, bytes); - } - - if (ret == -EINTR) - goto retry; - /* - * If we get here, it better have been because there was an error - * reading a btree node - */ - BUG_ON(!ret); - __bcache_io_error(c, "btree IO error: %i", ret); -err: - rbio->bio.bi_status = BLK_STS_IOERR; -out: - bch2_trans_exit(&trans); - bkey_on_stack_exit(&sk, c); - bch2_rbio_done(rbio); -} - static void bch2_rbio_retry(struct work_struct *work) { struct bch_read_bio *rbio = @@ -1734,7 +1692,7 @@ static void bch2_rbio_retry(struct work_struct *work) struct bch_fs *c = rbio->c; struct bvec_iter iter = rbio->bvec_iter; unsigned flags = rbio->flags; - u64 inode = rbio->pos.inode; + u64 inode = rbio->read_pos.inode; struct bch_io_failures failed = { .nr = 0 }; trace_read_retry(&rbio->bio); @@ -1749,10 +1707,14 @@ static void bch2_rbio_retry(struct work_struct *work) flags |= BCH_READ_IN_RETRY; flags &= ~BCH_READ_MAY_PROMOTE; - if (flags & BCH_READ_NODECODE) + if (flags & BCH_READ_NODECODE) { bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags); - else - bch2_read_retry(c, rbio, iter, inode, &failed, flags); + } else { + flags &= ~BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_MUST_CLONE; + + __bch2_read(c, rbio, iter, inode, &failed, flags); + } } static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, @@ -1778,7 +1740,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, struct bch_read_bio *rbio) { struct bch_fs *c = rbio->c; - u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset; + u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset; struct bch_extent_crc_unpacked new_crc; struct btree_iter *iter = NULL; struct bkey_i *new; @@ -1788,26 +1750,12 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, if (crc_is_compressed(rbio->pick.crc)) return 0; - iter = bch2_trans_get_iter(trans, BTREE_ID_EXTENTS, rbio->pos, + iter = bch2_trans_get_iter(trans, rbio->data_btree, rbio->data_pos, BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - if ((ret = PTR_ERR_OR_ZERO(iter))) - goto out; - k = bch2_btree_iter_peek_slot(iter); if ((ret = bkey_err(k))) goto out; - /* - * going to be temporarily appending another checksum entry: - */ - new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + - BKEY_EXTENT_U64s_MAX * 8); - if ((ret = PTR_ERR_OR_ZERO(new))) - goto out; - - bkey_reassemble(new, k); - k = bkey_i_to_s_c(new); - if (bversion_cmp(k.k->version, rbio->version) || !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) goto out; @@ -1826,6 +1774,16 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, goto out; } + /* + * going to be temporarily appending another checksum entry: + */ + new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + + sizeof(struct bch_extent_crc128)); + if ((ret = PTR_ERR_OR_ZERO(new))) + goto out; + + bkey_reassemble(new, k); + if (!bch2_bkey_narrow_crcs(new, new_crc)) goto out; @@ -1925,17 +1883,15 @@ csum_err: return; } - bch2_dev_io_error(ca, - "data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)", - rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector, + bch2_dev_inum_io_error(ca, rbio->read_pos.inode, (u64) rbio->bvec_iter.bi_sector, + "data checksum error: expected %0llx:%0llx got %0llx:%0llx 
(type %u)", rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, csum.hi, csum.lo, crc.csum_type); bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); return; decompression_err: - __bcache_io_error(c, "decompression error, inode %llu offset %llu", - rbio->pos.inode, - (u64) rbio->bvec_iter.bi_sector); + bch_err_inum_ratelimited(c, rbio->read_pos.inode, + "decompression error"); bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); return; } @@ -1957,7 +1913,10 @@ static void bch2_read_endio(struct bio *bio) if (!rbio->split) rbio->bio.bi_end_io = rbio->end_io; - if (bch2_dev_io_err_on(bio->bi_status, ca, "data read; %s", + if (bch2_dev_inum_io_err_on(bio->bi_status, ca, + rbio->read_pos.inode, + rbio->read_pos.offset, + "data read error: %s", bch2_blk_status_to_str(bio->bi_status))) { bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); return; @@ -1987,7 +1946,7 @@ static void bch2_read_endio(struct bio *bio) int __bch2_read_indirect_extent(struct btree_trans *trans, unsigned *offset_into_extent, - struct bkey_on_stack *orig_k) + struct bkey_buf *orig_k) { struct btree_iter *iter; struct bkey_s_c k; @@ -1997,13 +1956,9 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + *offset_into_extent; - iter = bch2_trans_get_iter(trans, BTREE_ID_REFLINK, + iter = bch2_trans_get_iter(trans, BTREE_ID_reflink, POS(0, reflink_offset), BTREE_ITER_SLOTS); - ret = PTR_ERR_OR_ZERO(iter); - if (ret) - return ret; - k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) @@ -2011,21 +1966,22 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, if (k.k->type != KEY_TYPE_reflink_v && k.k->type != KEY_TYPE_indirect_inline_data) { - __bcache_io_error(trans->c, + bch_err_inum_ratelimited(trans->c, orig_k->k->k.p.inode, "pointer to nonexistent indirect extent"); ret = -EIO; goto err; } *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); - bkey_on_stack_reassemble(orig_k, trans->c, k); + bch2_bkey_buf_reassemble(orig_k, trans->c, k); err: bch2_trans_iter_put(trans, iter); return ret; } int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, - struct bvec_iter iter, struct bkey_s_c k, + struct bvec_iter iter, struct bpos read_pos, + enum btree_id data_btree, struct bkey_s_c k, unsigned offset_into_extent, struct bch_io_failures *failed, unsigned flags) { @@ -2035,7 +1991,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, struct bch_dev *ca; struct promote_op *promote = NULL; bool bounce = false, read_full = false, narrow_crcs = false; - struct bpos pos = bkey_start_pos(k.k); + struct bpos data_pos = bkey_start_pos(k.k); int pick_ret; if (bkey_extent_is_inline_data(k.k)) { @@ -2057,7 +2013,8 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, goto hole; if (pick_ret < 0) { - __bcache_io_error(c, "no device to read from"); + bch_err_inum_ratelimited(c, k.k->p.inode, + "no device to read from"); goto err; } @@ -2110,7 +2067,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, pick.crc.offset || offset_into_extent)); - pos.offset += offset_into_extent; + data_pos.offset += offset_into_extent; pick.ptr.offset += pick.crc.offset + offset_into_extent; offset_into_extent = 0; @@ -2182,7 +2139,9 @@ get_bio: /* XXX: only initialize this if needed */ rbio->devs_have = bch2_bkey_devs(k); rbio->pick = pick; - rbio->pos = pos; + rbio->read_pos = read_pos; + rbio->data_btree = data_btree; + rbio->data_pos = data_pos; 
rbio->version = k.k->version; rbio->promote = promote; INIT_WORK(&rbio->work, NULL); @@ -2196,7 +2155,11 @@ get_bio: bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); - if (pick.ptr.cached) + /* + * If it's being moved internally, we don't want to flag it as a cache + * hit: + */ + if (pick.ptr.cached && !(flags & BCH_READ_NODECODE)) bch2_bucket_io_time_reset(trans, pick.ptr.dev, PTR_BUCKET_NR(ca, &pick.ptr), READ); @@ -2207,7 +2170,8 @@ get_bio: if (!rbio->pick.idx) { if (!rbio->have_ioref) { - __bcache_io_error(c, "no device to read from"); + bch_err_inum_ratelimited(c, k.k->p.inode, + "no device to read from"); bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); goto out; } @@ -2247,6 +2211,9 @@ out: ret = READ_RETRY; } + if (!ret) + goto out_read_done; + return ret; } @@ -2273,53 +2240,48 @@ out_read_done: return 0; } -void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) +void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, u64 inode, + struct bch_io_failures *failed, unsigned flags) { struct btree_trans trans; struct btree_iter *iter; - struct bkey_on_stack sk; + struct bkey_buf sk; struct bkey_s_c k; - unsigned flags = BCH_READ_RETRY_IF_STALE| - BCH_READ_MAY_PROMOTE| - BCH_READ_USER_MAPPED; int ret; - BUG_ON(rbio->_state); BUG_ON(flags & BCH_READ_NODECODE); - BUG_ON(flags & BCH_READ_IN_RETRY); - rbio->c = c; - rbio->start_time = local_clock(); - - bkey_on_stack_init(&sk); + bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, 0, 0); retry: bch2_trans_begin(&trans); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - POS(inode, rbio->bio.bi_iter.bi_sector), + iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, + POS(inode, bvec_iter.bi_sector), BTREE_ITER_SLOTS); while (1) { unsigned bytes, sectors, offset_into_extent; + enum btree_id data_btree = BTREE_ID_extents; bch2_btree_iter_set_pos(iter, - POS(inode, rbio->bio.bi_iter.bi_sector)); + POS(inode, bvec_iter.bi_sector)); k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) - goto err; + break; offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; - bkey_on_stack_reassemble(&sk, c, k); + bch2_bkey_buf_reassemble(&sk, c, k); - ret = bch2_read_indirect_extent(&trans, + ret = bch2_read_indirect_extent(&trans, &data_btree, &offset_into_extent, &sk); if (ret) - goto err; + break; k = bkey_i_to_s_c(sk.k); @@ -2335,31 +2297,37 @@ retry: */ bch2_trans_unlock(&trans); - bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; - swap(rbio->bio.bi_iter.bi_size, bytes); + bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; + swap(bvec_iter.bi_size, bytes); - if (rbio->bio.bi_iter.bi_size == bytes) + if (bvec_iter.bi_size == bytes) flags |= BCH_READ_LAST_FRAGMENT; - bch2_read_extent(&trans, rbio, k, offset_into_extent, flags); + ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter->pos, + data_btree, k, + offset_into_extent, failed, flags); + if (ret) + break; if (flags & BCH_READ_LAST_FRAGMENT) break; - swap(rbio->bio.bi_iter.bi_size, bytes); - bio_advance(&rbio->bio, bytes); + swap(bvec_iter.bi_size, bytes); + bio_advance_iter(&rbio->bio, &bvec_iter, bytes); } -out: - bch2_trans_exit(&trans); - bkey_on_stack_exit(&sk, c); - return; -err: - if (ret == -EINTR) + bch2_trans_iter_put(&trans, iter); + + if (ret == -EINTR || ret == READ_RETRY || ret == READ_RETRY_AVOID) goto retry; - bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret); - bch2_rbio_done(rbio); - goto out; + if (ret) { + 
bch_err_inum_ratelimited(c, inode, + "read error %i from btree lookup", ret); + rbio->bio.bi_status = BLK_STS_IOERR; + bch2_rbio_done(rbio); + } + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); } void bch2_fs_io_exit(struct bch_fs *c) diff --git a/libbcachefs/io.h b/libbcachefs/io.h index e6aac59..2ac03c0 100644 --- a/libbcachefs/io.h +++ b/libbcachefs/io.h @@ -3,7 +3,7 @@ #define _BCACHEFS_IO_H #include "checksum.h" -#include "bkey_on_stack.h" +#include "bkey_buf.h" #include "io_types.h" #define to_wbio(_bio) \ @@ -60,6 +60,8 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) : op->c->wq; } +int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, + struct bkey_i *, bool *, bool *, s64 *, s64 *); int bch2_extent_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, struct disk_reservation *, u64 *, u64, s64 *); @@ -112,15 +114,18 @@ struct cache_promote_op; struct extent_ptr_decoded; int __bch2_read_indirect_extent(struct btree_trans *, unsigned *, - struct bkey_on_stack *); + struct bkey_buf *); static inline int bch2_read_indirect_extent(struct btree_trans *trans, + enum btree_id *data_btree, unsigned *offset_into_extent, - struct bkey_on_stack *k) + struct bkey_buf *k) { - return k->k->k.type == KEY_TYPE_reflink_p - ? __bch2_read_indirect_extent(trans, offset_into_extent, k) - : 0; + if (k->k->k.type != KEY_TYPE_reflink_p) + return 0; + + *data_btree = BTREE_ID_reflink; + return __bch2_read_indirect_extent(trans, offset_into_extent, k); } enum bch_read_flags { @@ -137,20 +142,37 @@ enum bch_read_flags { }; int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, - struct bvec_iter, struct bkey_s_c, unsigned, + struct bvec_iter, struct bpos, enum btree_id, + struct bkey_s_c, unsigned, struct bch_io_failures *, unsigned); static inline void bch2_read_extent(struct btree_trans *trans, - struct bch_read_bio *rbio, - struct bkey_s_c k, - unsigned offset_into_extent, - unsigned flags) + struct bch_read_bio *rbio, struct bpos read_pos, + enum btree_id data_btree, struct bkey_s_c k, + unsigned offset_into_extent, unsigned flags) { - __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, k, - offset_into_extent, NULL, flags); + __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, + data_btree, k, offset_into_extent, NULL, flags); } -void bch2_read(struct bch_fs *, struct bch_read_bio *, u64); +void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, + u64, struct bch_io_failures *, unsigned flags); + +static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + u64 inode) +{ + struct bch_io_failures failed = { .nr = 0 }; + + BUG_ON(rbio->_state); + + rbio->c = c; + rbio->start_time = local_clock(); + + __bch2_read(c, rbio, rbio->bio.bi_iter, inode, &failed, + BCH_READ_RETRY_IF_STALE| + BCH_READ_MAY_PROMOTE| + BCH_READ_USER_MAPPED); +} static inline struct bch_read_bio *rbio_init(struct bio *bio, struct bch_io_opts opts) diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h index b23727d..e7aca7c 100644 --- a/libbcachefs/io_types.h +++ b/libbcachefs/io_types.h @@ -58,8 +58,18 @@ struct bch_read_bio { struct bch_devs_list devs_have; struct extent_ptr_decoded pick; - /* start pos of data we read (may not be pos of data we want) */ - struct bpos pos; + + /* + * pos we read from - different from data_pos for indirect extents: + */ + struct bpos read_pos; + + /* + * start pos of data we read (may not be pos of data we want) - for + * promote, narrow extents paths: + */ + enum 
btree_id data_btree; + struct bpos data_pos; struct bversion version; struct promote_op *promote; diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index b8b7199..b901be5 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -9,7 +9,9 @@ #include "alloc_foreground.h" #include "bkey_methods.h" #include "btree_gc.h" +#include "btree_update.h" #include "buckets.h" +#include "error.h" #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" @@ -18,7 +20,19 @@ #include -static inline struct journal_buf *journal_seq_to_buf(struct journal *, u64); +static u64 last_unwritten_seq(struct journal *j) +{ + union journal_res_state s = READ_ONCE(j->reservations); + + lockdep_assert_held(&j->lock); + + return journal_cur_seq(j) - ((s.idx - s.unwritten_idx) & JOURNAL_BUF_MASK); +} + +static inline bool journal_seq_unwritten(struct journal *j, u64 seq) +{ + return seq >= last_unwritten_seq(j); +} static bool __journal_entry_is_open(union journal_res_state state) { @@ -30,27 +44,50 @@ static bool journal_entry_is_open(struct journal *j) return __journal_entry_is_open(j->reservations); } -static void journal_pin_new_entry(struct journal *j, int count) +static inline struct journal_buf * +journal_seq_to_buf(struct journal *j, u64 seq) { - struct journal_entry_pin_list *p; + struct journal_buf *buf = NULL; - /* - * The fifo_push() needs to happen at the same time as j->seq is - * incremented for journal_last_seq() to be calculated correctly - */ - atomic64_inc(&j->seq); - p = fifo_push_ref(&j->pin); + EBUG_ON(seq > journal_cur_seq(j)); + EBUG_ON(seq == journal_cur_seq(j) && + j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL); + + if (journal_seq_unwritten(j, seq)) { + buf = j->buf + (seq & JOURNAL_BUF_MASK); + EBUG_ON(le64_to_cpu(buf->data->seq) != seq); + } + return buf; +} +static void journal_pin_list_init(struct journal_entry_pin_list *p, int count) +{ INIT_LIST_HEAD(&p->list); + INIT_LIST_HEAD(&p->key_cache_list); INIT_LIST_HEAD(&p->flushed); atomic_set(&p->count, count); p->devs.nr = 0; } +static void journal_pin_new_entry(struct journal *j) +{ + /* + * The fifo_push() needs to happen at the same time as j->seq is + * incremented for journal_last_seq() to be calculated correctly + */ + atomic64_inc(&j->seq); + journal_pin_list_init(fifo_push_ref(&j->pin), 1); +} + static void bch2_journal_buf_init(struct journal *j) { struct journal_buf *buf = journal_cur_buf(j); + bkey_extent_init(&buf->key); + buf->noflush = false; + buf->must_flush = false; + buf->separate_flush = false; + memset(buf->has_inode, 0, sizeof(buf->has_inode)); memset(buf->data, 0, sizeof(*buf->data)); @@ -72,26 +109,23 @@ void bch2_journal_halt(struct journal *j) } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); + j->err_seq = journal_cur_seq(j); journal_wake(j); closure_wake_up(&journal_cur_buf(j)->wait); } /* journal entry close/open: */ -void __bch2_journal_buf_put(struct journal *j, bool need_write_just_set) +void __bch2_journal_buf_put(struct journal *j) { - if (!need_write_just_set && - test_bit(JOURNAL_NEED_WRITE, &j->flags)) - bch2_time_stats_update(j->delay_time, - j->need_write_time); - - clear_bit(JOURNAL_NEED_WRITE, &j->flags); - closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL); } /* * Returns true if journal entry is now closed: + * + * We don't close a journal_buf until the next journal_buf is finished writing, + * and can be opened again - this also initializes the next journal_buf: */ static bool __journal_entry_close(struct 
journal *j) { @@ -99,7 +133,6 @@ static bool __journal_entry_close(struct journal *j) struct journal_buf *buf = journal_cur_buf(j); union journal_res_state old, new; u64 v = atomic64_read(&j->reservations.counter); - bool set_need_write = false; unsigned sectors; lockdep_assert_held(&j->lock); @@ -118,20 +151,19 @@ static bool __journal_entry_close(struct journal *j) if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) { set_bit(JOURNAL_NEED_WRITE, &j->flags); j->need_write_time = local_clock(); - set_need_write = true; } - if (new.prev_buf_unwritten) - return false; - new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL; new.idx++; - new.prev_buf_unwritten = 1; + + if (new.idx == new.unwritten_idx) + return false; BUG_ON(journal_state_count(new, new.idx)); } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); + /* Close out old buffer: */ buf->data->u64s = cpu_to_le32(old.cur_entry_offset); sectors = vstruct_blocks_plus(buf->data, c->block_bits, @@ -139,8 +171,6 @@ static bool __journal_entry_close(struct journal *j) BUG_ON(sectors > buf->sectors); buf->sectors = sectors; - bkey_extent_init(&buf->key); - /* * We have to set last_seq here, _before_ opening a new journal entry: * @@ -162,29 +192,45 @@ static bool __journal_entry_close(struct journal *j) */ buf->data->last_seq = cpu_to_le64(journal_last_seq(j)); - if (journal_entry_empty(buf->data)) - clear_bit(JOURNAL_NOT_EMPTY, &j->flags); - else - set_bit(JOURNAL_NOT_EMPTY, &j->flags); + __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq)); - journal_pin_new_entry(j, 1); + /* Initialize new buffer: */ + journal_pin_new_entry(j); bch2_journal_buf_init(j); cancel_delayed_work(&j->write_work); + clear_bit(JOURNAL_NEED_WRITE, &j->flags); bch2_journal_space_available(j); - bch2_journal_buf_put(j, old.idx, set_need_write); + bch2_journal_buf_put(j, old.idx); return true; } +static bool journal_entry_want_write(struct journal *j) +{ + union journal_res_state s = READ_ONCE(j->reservations); + bool ret = false; + + /* + * Don't close it yet if we already have a write in flight, but do set + * NEED_WRITE: + */ + if (s.idx != s.unwritten_idx) + set_bit(JOURNAL_NEED_WRITE, &j->flags); + else + ret = __journal_entry_close(j); + + return ret; +} + static bool journal_entry_close(struct journal *j) { bool ret; spin_lock(&j->lock); - ret = __journal_entry_close(j); + ret = journal_entry_want_write(j); spin_unlock(&j->lock); return ret; @@ -202,16 +248,19 @@ static bool journal_entry_close(struct journal *j) */ static int journal_entry_open(struct journal *j) { + struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *buf = journal_cur_buf(j); union journal_res_state old, new; int u64s; u64 v; + BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); + lockdep_assert_held(&j->lock); BUG_ON(journal_entry_is_open(j)); if (j->blocked) - return -EAGAIN; + return cur_entry_blocked; if (j->cur_entry_error) return j->cur_entry_error; @@ -227,7 +276,7 @@ static int journal_entry_open(struct journal *j) u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); if (u64s <= le32_to_cpu(buf->data->u64s)) - return -ENOSPC; + return cur_entry_journal_full; /* * Must be set before marking the journal entry as open: @@ -239,7 +288,7 @@ static int journal_entry_open(struct journal *j) old.v = new.v = v; if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) - return -EROFS; + return cur_entry_insufficient_devices; /* Handle any already added entries */ new.cur_entry_offset = le32_to_cpu(buf->data->u64s); @@ -263,8 +312,8 @@ static int 
journal_entry_open(struct journal *j) static bool journal_quiesced(struct journal *j) { - union journal_res_state state = READ_ONCE(j->reservations); - bool ret = !state.prev_buf_unwritten && !__journal_entry_is_open(state); + union journal_res_state s = READ_ONCE(j->reservations); + bool ret = s.idx == s.unwritten_idx && !__journal_entry_is_open(s); if (!ret) journal_entry_close(j); @@ -291,17 +340,29 @@ static void journal_write_work(struct work_struct *work) u64 bch2_inode_journal_seq(struct journal *j, u64 inode) { size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8)); - u64 seq = 0; + union journal_res_state s; + unsigned i; + u64 seq; - if (!test_bit(h, j->buf[0].has_inode) && - !test_bit(h, j->buf[1].has_inode)) - return 0; spin_lock(&j->lock); - if (test_bit(h, journal_cur_buf(j)->has_inode)) - seq = journal_cur_seq(j); - else if (test_bit(h, journal_prev_buf(j)->has_inode)) - seq = journal_cur_seq(j) - 1; + seq = journal_cur_seq(j); + s = READ_ONCE(j->reservations); + i = s.idx; + + while (1) { + if (test_bit(h, j->buf[i].has_inode)) + goto out; + + if (i == s.unwritten_idx) + break; + + i = (i - 1) & JOURNAL_BUF_MASK; + seq--; + } + + seq = 0; +out: spin_unlock(&j->lock); return seq; @@ -352,7 +413,7 @@ retry: * Don't want to close current journal entry, just need to * invoke reclaim: */ - ret = -ENOSPC; + ret = cur_entry_journal_full; goto unlock; } @@ -375,14 +436,16 @@ retry: * there's still a previous one in flight: */ trace_journal_entry_full(c); - ret = -EAGAIN; + ret = cur_entry_blocked; } else { ret = journal_entry_open(j); } unlock: - if ((ret == -EAGAIN || ret == -ENOSPC) && - !j->res_get_blocked_start) + if ((ret && ret != cur_entry_insufficient_devices) && + !j->res_get_blocked_start) { j->res_get_blocked_start = local_clock() ?: 1; + trace_journal_full(c); + } can_discard = j->can_discard; spin_unlock(&j->lock); @@ -390,32 +453,46 @@ unlock: if (!ret) goto retry; - if (ret == -ENOSPC) { - WARN_ONCE(!can_discard && (flags & JOURNAL_RES_GET_RESERVED), - "JOURNAL_RES_GET_RESERVED set but journal full"); - - /* - * Journal is full - can't rely on reclaim from work item due to - * freezing: - */ - trace_journal_full(c); + if ((ret == cur_entry_journal_full || + ret == cur_entry_journal_pin_full) && + !can_discard && + j->reservations.idx == j->reservations.unwritten_idx && + (flags & JOURNAL_RES_GET_RESERVED)) { + char *journal_debug_buf = kmalloc(4096, GFP_ATOMIC); + + bch_err(c, "Journal stuck!"); + if (journal_debug_buf) { + bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j); + bch_err(c, "%s", journal_debug_buf); + + bch2_journal_pins_to_text(&_PBUF(journal_debug_buf, 4096), j); + bch_err(c, "Journal pins:\n%s", journal_debug_buf); + kfree(journal_debug_buf); + } - if (!(flags & JOURNAL_RES_GET_NONBLOCK)) { - if (can_discard) { - bch2_journal_do_discards(j); - goto retry; - } + bch2_fatal_error(c); + dump_stack(); + } - if (mutex_trylock(&j->reclaim_lock)) { - bch2_journal_reclaim(j); - mutex_unlock(&j->reclaim_lock); - } + /* + * Journal is full - can't rely on reclaim from work item due to + * freezing: + */ + if ((ret == cur_entry_journal_full || + ret == cur_entry_journal_pin_full) && + !(flags & JOURNAL_RES_GET_NONBLOCK)) { + if (can_discard) { + bch2_journal_do_discards(j); + goto retry; } - ret = -EAGAIN; + if (mutex_trylock(&j->reclaim_lock)) { + bch2_journal_reclaim(j); + mutex_unlock(&j->reclaim_lock); + } } - return ret; + return ret == cur_entry_insufficient_devices ? 
-EROFS : -EAGAIN; } /* @@ -446,10 +523,12 @@ static bool journal_preres_available(struct journal *j, unsigned new_u64s, unsigned flags) { - bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags); + bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags, true); - if (!ret) - bch2_journal_reclaim_work(&j->reclaim_work.work); + if (!ret && mutex_trylock(&j->reclaim_lock)) { + bch2_journal_reclaim(j); + mutex_unlock(&j->reclaim_lock); + } return ret; } @@ -503,168 +582,82 @@ out: /* journal flushing: */ -u64 bch2_journal_last_unwritten_seq(struct journal *j) -{ - u64 seq; - - spin_lock(&j->lock); - seq = journal_cur_seq(j); - if (j->reservations.prev_buf_unwritten) - seq--; - spin_unlock(&j->lock); - - return seq; -} - /** - * bch2_journal_open_seq_async - try to open a new journal entry if @seq isn't - * open yet, or wait if we cannot + * bch2_journal_flush_seq_async - wait for a journal entry to be written * - * used by the btree interior update machinery, when it needs to write a new - * btree root - every journal entry contains the roots of all the btrees, so it - * doesn't need to bother with getting a journal reservation + * like bch2_journal_wait_on_seq, except that it triggers a write immediately if + * necessary */ -int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl) +int bch2_journal_flush_seq_async(struct journal *j, u64 seq, + struct closure *parent) { - struct bch_fs *c = container_of(j, struct bch_fs, journal); - int ret; + struct journal_buf *buf; + int ret = 0; - spin_lock(&j->lock); + if (seq <= j->flushed_seq_ondisk) + return 1; - /* - * Can't try to open more than one sequence number ahead: - */ - BUG_ON(journal_cur_seq(j) < seq && !journal_entry_is_open(j)); - - if (journal_cur_seq(j) > seq || - journal_entry_is_open(j)) { - spin_unlock(&j->lock); - return 0; - } + spin_lock(&j->lock); - if (journal_cur_seq(j) < seq && - !__journal_entry_close(j)) { - /* haven't finished writing out the previous one: */ - trace_journal_entry_full(c); - ret = -EAGAIN; - } else { - BUG_ON(journal_cur_seq(j) != seq); + BUG_ON(seq > journal_cur_seq(j)); - ret = journal_entry_open(j); + /* Recheck under lock: */ + if (j->err_seq && seq >= j->err_seq) { + ret = -EIO; + goto out; } - if ((ret == -EAGAIN || ret == -ENOSPC) && - !j->res_get_blocked_start) - j->res_get_blocked_start = local_clock() ?: 1; - - if (ret == -EAGAIN || ret == -ENOSPC) - closure_wait(&j->async_wait, cl); - - spin_unlock(&j->lock); - - if (ret == -ENOSPC) { - trace_journal_full(c); - bch2_journal_reclaim_work(&j->reclaim_work.work); - ret = -EAGAIN; + if (seq <= j->flushed_seq_ondisk) { + ret = 1; + goto out; } - return ret; -} - -static int journal_seq_error(struct journal *j, u64 seq) -{ - union journal_res_state state = READ_ONCE(j->reservations); + /* if seq was written, but not flushed - flush a newer one instead */ + seq = max(seq, last_unwritten_seq(j)); - if (seq == journal_cur_seq(j)) - return bch2_journal_error(j); - - if (seq + 1 == journal_cur_seq(j) && - !state.prev_buf_unwritten && - seq > j->seq_ondisk) - return -EIO; - - return 0; -} - -static inline struct journal_buf * -journal_seq_to_buf(struct journal *j, u64 seq) -{ - /* seq should be for a journal entry that has been opened: */ - BUG_ON(seq > journal_cur_seq(j)); - BUG_ON(seq == journal_cur_seq(j) && - j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL); +recheck_need_open: + if (seq == journal_cur_seq(j) && !journal_entry_is_open(j)) { + struct journal_res res = { 0 }; - if (seq == 
journal_cur_seq(j)) - return journal_cur_buf(j); - if (seq + 1 == journal_cur_seq(j) && - j->reservations.prev_buf_unwritten) - return journal_prev_buf(j); - return NULL; -} + spin_unlock(&j->lock); -/** - * bch2_journal_wait_on_seq - wait for a journal entry to be written - * - * does _not_ cause @seq to be written immediately - if there is no other - * activity to cause the relevant journal entry to be filled up or flushed it - * can wait for an arbitrary amount of time (up to @j->write_delay_ms, which is - * configurable). - */ -void bch2_journal_wait_on_seq(struct journal *j, u64 seq, - struct closure *parent) -{ - struct journal_buf *buf; + ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); + if (ret) + return ret; - spin_lock(&j->lock); + seq = res.seq; + buf = j->buf + (seq & JOURNAL_BUF_MASK); + buf->must_flush = true; + set_bit(JOURNAL_NEED_WRITE, &j->flags); - if ((buf = journal_seq_to_buf(j, seq))) { - if (!closure_wait(&buf->wait, parent)) + if (parent && !closure_wait(&buf->wait, parent)) BUG(); - if (seq == journal_cur_seq(j)) { - smp_mb(); - if (bch2_journal_error(j)) - closure_wake_up(&buf->wait); - } - } + bch2_journal_res_put(j, &res); - spin_unlock(&j->lock); -} - -/** - * bch2_journal_flush_seq_async - wait for a journal entry to be written - * - * like bch2_journal_wait_on_seq, except that it triggers a write immediately if - * necessary - */ -void bch2_journal_flush_seq_async(struct journal *j, u64 seq, - struct closure *parent) -{ - struct journal_buf *buf; - - spin_lock(&j->lock); - - if (parent && - (buf = journal_seq_to_buf(j, seq))) - if (!closure_wait(&buf->wait, parent)) - BUG(); - - if (seq == journal_cur_seq(j)) - __journal_entry_close(j); - spin_unlock(&j->lock); -} + spin_lock(&j->lock); + goto want_write; + } -static int journal_seq_flushed(struct journal *j, u64 seq) -{ - int ret; + /* + * if write was kicked off without a flush, flush the next sequence + * number instead + */ + buf = journal_seq_to_buf(j, seq); + if (buf->noflush) { + seq++; + goto recheck_need_open; + } - spin_lock(&j->lock); - ret = seq <= j->seq_ondisk ? 1 : journal_seq_error(j, seq); + buf->must_flush = true; + if (parent && !closure_wait(&buf->wait, parent)) + BUG(); +want_write: if (seq == journal_cur_seq(j)) - __journal_entry_close(j); + journal_entry_want_write(j); +out: spin_unlock(&j->lock); - return ret; } @@ -673,28 +666,14 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq) u64 start_time = local_clock(); int ret, ret2; - ret = wait_event_killable(j->wait, (ret2 = journal_seq_flushed(j, seq))); + ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL))); - bch2_time_stats_update(j->flush_seq_time, start_time); + if (!ret) + bch2_time_stats_update(j->flush_seq_time, start_time); return ret ?: ret2 < 0 ? 
ret2 : 0; } -/** - * bch2_journal_meta_async - force a journal entry to be written - */ -void bch2_journal_meta_async(struct journal *j, struct closure *parent) -{ - struct journal_res res; - - memset(&res, 0, sizeof(res)); - - bch2_journal_res_get(j, &res, jset_u64s(0), 0); - bch2_journal_res_put(j, &res); - - bch2_journal_flush_seq_async(j, res.seq, parent); -} - int bch2_journal_meta(struct journal *j) { struct journal_res res; @@ -790,16 +769,19 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (nr <= ja->nr) return 0; - ret = -ENOMEM; new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL); new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL); - if (!new_buckets || !new_bucket_seq) + if (!new_buckets || !new_bucket_seq) { + ret = -ENOMEM; goto err; + } journal_buckets = bch2_sb_resize_journal(&ca->disk_sb, - nr + sizeof(*journal_buckets) / sizeof(u64)); - if (!journal_buckets) + nr + sizeof(*journal_buckets) / sizeof(u64)); + if (!journal_buckets) { + ret = -ENOSPC; goto err; + } /* * We may be called from the device add path, before the new device has @@ -828,8 +810,10 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, goto err; } } else { - ob = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, + rcu_read_lock(); + ob = bch2_bucket_alloc(c, ca, RESERVE_NONE, false, cl); + rcu_read_unlock(); if (IS_ERR(ob)) { ret = cl ? -EAGAIN : -ENOSPC; goto err; @@ -843,6 +827,12 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, spin_lock(&c->journal.lock); } + /* + * XXX + * For resize at runtime, we should be writing the new + * superblock before inserting into the journal array + */ + pos = ja->nr ? (ja->cur_idx + 1) % ja->nr : 0; __array_insert_item(ja->buckets, ja->nr, pos); __array_insert_item(ja->bucket_seq, ja->nr, pos); @@ -862,22 +852,32 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (pos <= ja->cur_idx) ja->cur_idx = (ja->cur_idx + 1) % ja->nr; - bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_journal, - ca->mi.bucket_size, - gc_phase(GC_PHASE_SB), - 0); + if (!c || new_fs) + bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_journal, + ca->mi.bucket_size, + gc_phase(GC_PHASE_SB), + 0); if (c) { spin_unlock(&c->journal.lock); percpu_up_read(&c->mark_lock); } + if (c && !new_fs) + ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_trans_mark_metadata_bucket(&trans, NULL, ca, + bucket, BCH_DATA_journal, + ca->mi.bucket_size)); + if (!new_fs) bch2_open_bucket_put(c, ob); - } - ret = 0; + if (ret) + goto err; + } err: + bch2_sb_resize_journal(&ca->disk_sb, + ja->nr + sizeof(*journal_buckets) / sizeof(u64)); kfree(new_bucket_seq); kfree(new_buckets); @@ -938,14 +938,17 @@ int bch2_dev_journal_alloc(struct bch_dev *ca) if (dynamic_fault("bcachefs:add:journal_alloc")) return -ENOMEM; + /* 1/128th of the device by default: */ + nr = ca->mi.nbuckets >> 7; + /* - * clamp journal size to 1024 buckets or 512MB (in sectors), whichever + * clamp journal size to 8192 buckets or 8GB (in sectors), whichever * is smaller: */ - nr = clamp_t(unsigned, ca->mi.nbuckets >> 8, + nr = clamp_t(unsigned, nr, BCH_JOURNAL_BUCKETS_MIN, - min(1 << 10, - (1 << 20) / ca->mi.bucket_size)); + min(1 << 13, + (1 << 24) / ca->mi.bucket_size)); return __bch2_set_nr_journal_buckets(ca, nr, true, NULL); } @@ -955,15 +958,18 @@ int bch2_dev_journal_alloc(struct bch_dev *ca) static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) { union journal_res_state state; - struct journal_buf *w; - 
bool ret; + bool ret = false; + unsigned i; spin_lock(&j->lock); state = READ_ONCE(j->reservations); - w = j->buf + !state.idx; + i = state.idx; - ret = state.prev_buf_unwritten && - bch2_bkey_has_device(bkey_i_to_s_c(&w->key), dev_idx); + while (i != state.unwritten_idx) { + i = (i - 1) & JOURNAL_BUF_MASK; + if (bch2_bkey_has_device(bkey_i_to_s_c(&j->buf[i].key), dev_idx)) + ret = true; + } spin_unlock(&j->lock); return ret; @@ -980,17 +986,21 @@ void bch2_fs_journal_stop(struct journal *j) wait_event(j->wait, journal_entry_close(j)); - /* do we need to write another journal entry? */ - if (test_bit(JOURNAL_NOT_EMPTY, &j->flags)) - bch2_journal_meta(j); + /* + * Always write a new journal entry, to make sure the clock hands are up + * to date (and match the superblock) + */ + bch2_journal_meta(j); journal_quiesce(j); BUG_ON(!bch2_journal_error(j) && - test_bit(JOURNAL_NOT_EMPTY, &j->flags)); + test_bit(JOURNAL_REPLAY_DONE, &j->flags) && + (journal_entry_is_open(j) || + j->last_empty_seq + 1 != journal_cur_seq(j))); cancel_delayed_work_sync(&j->write_work); - cancel_delayed_work_sync(&j->reclaim_work); + bch2_journal_reclaim_stop(j); } int bch2_fs_journal_start(struct journal *j, u64 cur_seq, @@ -1023,28 +1033,34 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, j->pin.back = cur_seq; atomic64_set(&j->seq, cur_seq - 1); - fifo_for_each_entry_ptr(p, &j->pin, seq) { - INIT_LIST_HEAD(&p->list); - INIT_LIST_HEAD(&p->flushed); - atomic_set(&p->count, 1); - p->devs.nr = 0; - } + fifo_for_each_entry_ptr(p, &j->pin, seq) + journal_pin_list_init(p, 1); list_for_each_entry(i, journal_entries, list) { + unsigned ptr; + seq = le64_to_cpu(i->j.seq); BUG_ON(seq >= cur_seq); if (seq < last_seq) continue; - journal_seq_pin(j, seq)->devs = i->devs; + p = journal_seq_pin(j, seq); + + p->devs.nr = 0; + for (ptr = 0; ptr < i->nr_ptrs; ptr++) + bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev); } spin_lock(&j->lock); set_bit(JOURNAL_STARTED, &j->flags); + j->last_flush_write = jiffies; + + journal_pin_new_entry(j); + + j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j); - journal_pin_new_entry(j, 1); bch2_journal_buf_init(j); c->last_bucket_seq_cleanup = journal_cur_seq(j); @@ -1098,8 +1114,10 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) void bch2_fs_journal_exit(struct journal *j) { - kvpfree(j->buf[1].data, j->buf[1].buf_size); - kvpfree(j->buf[0].data, j->buf[0].buf_size); + unsigned i; + + for (i = 0; i < ARRAY_SIZE(j->buf); i++) + kvpfree(j->buf[i].data, j->buf[i].buf_size); free_fifo(&j->pin); } @@ -1107,6 +1125,7 @@ int bch2_fs_journal_init(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); static struct lock_class_key res_key; + unsigned i; int ret = 0; pr_verbose_init(c->opts, ""); @@ -1115,33 +1134,34 @@ int bch2_fs_journal_init(struct journal *j) spin_lock_init(&j->err_lock); init_waitqueue_head(&j->wait); INIT_DELAYED_WORK(&j->write_work, journal_write_work); - INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work); + init_waitqueue_head(&j->reclaim_wait); init_waitqueue_head(&j->pin_flush_wait); mutex_init(&j->reclaim_lock); mutex_init(&j->discard_lock); lockdep_init_map(&j->res_map, "journal res", &res_key, 0); - j->buf[0].buf_size = JOURNAL_ENTRY_SIZE_MIN; - j->buf[1].buf_size = JOURNAL_ENTRY_SIZE_MIN; j->write_delay_ms = 1000; j->reclaim_delay_ms = 100; - /* Btree roots: */ - j->entry_u64s_reserved += - BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX); - atomic64_set(&j->reservations.counter, 
((union journal_res_state) { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); - if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || - !(j->buf[0].data = kvpmalloc(j->buf[0].buf_size, GFP_KERNEL)) || - !(j->buf[1].data = kvpmalloc(j->buf[1].buf_size, GFP_KERNEL))) { + if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) { ret = -ENOMEM; goto out; } + for (i = 0; i < ARRAY_SIZE(j->buf); i++) { + j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN; + j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL); + if (!j->buf[i].data) { + ret = -ENOMEM; + goto out; + } + } + j->pin.front = j->pin.back = 1; out: pr_verbose_init(c->opts, "ret %i", ret); @@ -1150,15 +1170,14 @@ out: /* debug: */ -void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) +void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); union journal_res_state s; struct bch_dev *ca; - unsigned iter; + unsigned i; rcu_read_lock(); - spin_lock(&j->lock); s = READ_ONCE(j->reservations); pr_buf(out, @@ -1166,16 +1185,30 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) "seq:\t\t\t%llu\n" "last_seq:\t\t%llu\n" "last_seq_ondisk:\t%llu\n" + "flushed_seq_ondisk:\t%llu\n" "prereserved:\t\t%u/%u\n" + "each entry reserved:\t%u\n" + "nr flush writes:\t%llu\n" + "nr noflush writes:\t%llu\n" + "nr direct reclaim:\t%llu\n" + "nr background reclaim:\t%llu\n" "current entry sectors:\t%u\n" + "current entry error:\t%u\n" "current entry:\t\t", fifo_used(&j->pin), journal_cur_seq(j), journal_last_seq(j), j->last_seq_ondisk, + j->flushed_seq_ondisk, j->prereserved.reserved, j->prereserved.remaining, - j->cur_entry_sectors); + j->entry_u64s_reserved, + j->nr_flush_writes, + j->nr_noflush_writes, + j->nr_direct_reclaim, + j->nr_background_reclaim, + j->cur_entry_sectors, + j->cur_entry_error); switch (s.cur_entry_offset) { case JOURNAL_ENTRY_ERROR_VAL: @@ -1192,16 +1225,16 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) } pr_buf(out, - "current entry refs:\t%u\n" - "prev entry unwritten:\t", - journal_state_count(s, s.idx)); - - if (s.prev_buf_unwritten) - pr_buf(out, "yes, ref %u sectors %u\n", - journal_state_count(s, !s.idx), - journal_prev_buf(j)->sectors); - else - pr_buf(out, "no\n"); + "current entry:\t\tidx %u refcount %u\n", + s.idx, journal_state_count(s, s.idx)); + + i = s.idx; + while (i != s.unwritten_idx) { + i = (i - 1) & JOURNAL_BUF_MASK; + + pr_buf(out, "unwritten entry:\tidx %u refcount %u sectors %u\n", + i, journal_state_count(s, i), j->buf[i].sectors); + } pr_buf(out, "need write:\t\t%i\n" @@ -1209,22 +1242,40 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) test_bit(JOURNAL_NEED_WRITE, &j->flags), test_bit(JOURNAL_REPLAY_DONE, &j->flags)); - for_each_member_device_rcu(ca, c, iter, + pr_buf(out, "space:\n"); + pr_buf(out, "\tdiscarded\t%u:%u\n", + j->space[journal_space_discarded].next_entry, + j->space[journal_space_discarded].total); + pr_buf(out, "\tclean ondisk\t%u:%u\n", + j->space[journal_space_clean_ondisk].next_entry, + j->space[journal_space_clean_ondisk].total); + pr_buf(out, "\tclean\t\t%u:%u\n", + j->space[journal_space_clean].next_entry, + j->space[journal_space_clean].total); + pr_buf(out, "\ttotal\t\t%u:%u\n", + j->space[journal_space_total].next_entry, + j->space[journal_space_total].total); + + for_each_member_device_rcu(ca, c, i, &c->rw_devs[BCH_DATA_journal]) { struct journal_device *ja = &ca->journal; + if (!test_bit(ca->dev_idx, 
c->rw_devs[BCH_DATA_journal].d)) + continue; + if (!ja->nr) continue; pr_buf(out, "dev %u:\n" "\tnr\t\t%u\n" + "\tbucket size\t%u\n" "\tavailable\t%u:%u\n" - "\tdiscard_idx\t\t%u\n" - "\tdirty_idx_ondisk\t%u (seq %llu)\n" - "\tdirty_idx\t\t%u (seq %llu)\n" + "\tdiscard_idx\t%u\n" + "\tdirty_ondisk\t%u (seq %llu)\n" + "\tdirty_idx\t%u (seq %llu)\n" "\tcur_idx\t\t%u (seq %llu)\n", - iter, ja->nr, + i, ja->nr, ca->mi.bucket_size, bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free, ja->discard_idx, @@ -1233,10 +1284,16 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) ja->cur_idx, ja->bucket_seq[ja->cur_idx]); } - spin_unlock(&j->lock); rcu_read_unlock(); } +void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) +{ + spin_lock(&j->lock); + __bch2_journal_debug_to_text(out, j); + spin_unlock(&j->lock); +} + void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) { struct journal_entry_pin_list *pin_list; diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index f60bc96..cc49712 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -127,11 +127,6 @@ static inline struct journal_buf *journal_cur_buf(struct journal *j) return j->buf + j->reservations.idx; } -static inline struct journal_buf *journal_prev_buf(struct journal *j) -{ - return j->buf + !j->reservations.idx; -} - /* Sequence number of oldest dirty journal entry */ static inline u64 journal_last_seq(struct journal *j) @@ -141,7 +136,7 @@ static inline u64 journal_last_seq(struct journal *j) static inline u64 journal_cur_seq(struct journal *j) { - BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); + EBUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); return j->pin.back - 1; } @@ -151,13 +146,21 @@ void bch2_journal_set_has_inum(struct journal *, u64, u64); static inline int journal_state_count(union journal_res_state s, int idx) { - return idx == 0 ? 
s.buf0_count : s.buf1_count; + switch (idx) { + case 0: return s.buf0_count; + case 1: return s.buf1_count; + case 2: return s.buf2_count; + case 3: return s.buf3_count; + } + BUG(); } static inline void journal_state_inc(union journal_res_state *s) { s->buf0_count += s->idx == 0; s->buf1_count += s->idx == 1; + s->buf2_count += s->idx == 2; + s->buf3_count += s->idx == 3; } static inline void bch2_journal_set_has_inode(struct journal *j, @@ -210,11 +213,13 @@ static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type enum btree_id id, unsigned level, const void *data, unsigned u64s) { - memset(entry, 0, sizeof(*entry)); entry->u64s = cpu_to_le16(u64s); - entry->type = type; entry->btree_id = id; entry->level = level; + entry->type = type; + entry->pad[0] = 0; + entry->pad[1] = 0; + entry->pad[2] = 0; memcpy_u64s_small(entry->_data, data, u64s); return jset_u64s(u64s); @@ -255,21 +260,24 @@ static inline bool journal_entry_empty(struct jset *j) return true; } -void __bch2_journal_buf_put(struct journal *, bool); +void __bch2_journal_buf_put(struct journal *); -static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, - bool need_write_just_set) +static inline void bch2_journal_buf_put(struct journal *j, unsigned idx) { union journal_res_state s; s.v = atomic64_sub_return(((union journal_res_state) { .buf0_count = idx == 0, .buf1_count = idx == 1, + .buf2_count = idx == 2, + .buf3_count = idx == 3, }).v, &j->reservations.counter); - if (!journal_state_count(s, idx)) { - EBUG_ON(s.idx == idx || !s.prev_buf_unwritten); - __bch2_journal_buf_put(j, need_write_just_set); - } + + EBUG_ON(((s.idx - idx) & 3) > + ((s.idx - s.unwritten_idx) & 3)); + + if (!journal_state_count(s, idx) && idx == s.unwritten_idx) + __bch2_journal_buf_put(j); } /* @@ -289,7 +297,7 @@ static inline void bch2_journal_res_put(struct journal *j, BCH_JSET_ENTRY_btree_keys, 0, 0, NULL, 0); - bch2_journal_buf_put(j, res->idx, false); + bch2_journal_buf_put(j, res->idx); res->ref = 0; } @@ -300,7 +308,6 @@ int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, #define JOURNAL_RES_GET_NONBLOCK (1 << 0) #define JOURNAL_RES_GET_CHECK (1 << 1) #define JOURNAL_RES_GET_RESERVED (1 << 2) -#define JOURNAL_RES_GET_RECLAIM (1 << 3) static inline int journal_res_get_fast(struct journal *j, struct journal_res *res, @@ -325,11 +332,18 @@ static inline int journal_res_get_fast(struct journal *j, !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) return 0; - if (flags & JOURNAL_RES_GET_CHECK) - return 1; - new.cur_entry_offset += res->u64s; journal_state_inc(&new); + + /* + * If the refcount would overflow, we have to wait: + * XXX - tracepoint this: + */ + if (!journal_state_count(new, new.idx)) + return 0; + + if (flags & JOURNAL_RES_GET_CHECK) + return 1; } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); @@ -371,7 +385,7 @@ out: static inline bool journal_check_may_get_unreserved(struct journal *j) { union journal_preres_state s = READ_ONCE(j->prereserved); - bool ret = s.reserved <= s.remaining && + bool ret = s.reserved < s.remaining && fifo_free(&j->pin) > 8; lockdep_assert_held(&j->lock); @@ -397,7 +411,12 @@ static inline void bch2_journal_preres_put(struct journal *j, s.v = atomic64_sub_return(s.v, &j->prereserved.counter); res->u64s = 0; - closure_wake_up(&j->preres_wait); + + if (unlikely(s.waiting)) { + clear_bit(ilog2((((union journal_preres_state) { .waiting = 1 }).v)), + (unsigned long *) &j->prereserved.v); + 
closure_wake_up(&j->preres_wait); + } if (s.reserved <= s.remaining && !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { @@ -413,32 +432,32 @@ int __bch2_journal_preres_get(struct journal *, static inline int bch2_journal_preres_get_fast(struct journal *j, struct journal_preres *res, unsigned new_u64s, - unsigned flags) + unsigned flags, + bool set_waiting) { int d = new_u64s - res->u64s; union journal_preres_state old, new; u64 v = atomic64_read(&j->prereserved.counter); + int ret; do { old.v = new.v = v; - - new.reserved += d; - - /* - * If we're being called from the journal reclaim path, we have - * to unconditionally give out the pre-reservation, there's - * nothing else sensible we can do - otherwise we'd recurse back - * into the reclaim path and deadlock: - */ - - if (!(flags & JOURNAL_RES_GET_RECLAIM) && - new.reserved > new.remaining) + ret = 0; + + if ((flags & JOURNAL_RES_GET_RESERVED) || + new.reserved + d < new.remaining) { + new.reserved += d; + ret = 1; + } else if (set_waiting && !new.waiting) + new.waiting = true; + else return 0; } while ((v = atomic64_cmpxchg(&j->prereserved.counter, old.v, new.v)) != old.v); - res->u64s += d; - return 1; + if (ret) + res->u64s += d; + return ret; } static inline int bch2_journal_preres_get(struct journal *j, @@ -449,7 +468,7 @@ static inline int bch2_journal_preres_get(struct journal *j, if (new_u64s <= res->u64s) return 0; - if (bch2_journal_preres_get_fast(j, res, new_u64s, flags)) + if (bch2_journal_preres_get_fast(j, res, new_u64s, flags, false)) return 0; if (flags & JOURNAL_RES_GET_NONBLOCK) @@ -464,13 +483,8 @@ void bch2_journal_entry_res_resize(struct journal *, struct journal_entry_res *, unsigned); -u64 bch2_journal_last_unwritten_seq(struct journal *); -int bch2_journal_open_seq_async(struct journal *, u64, struct closure *); - -void bch2_journal_wait_on_seq(struct journal *, u64, struct closure *); -void bch2_journal_flush_seq_async(struct journal *, u64, struct closure *); +int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *); void bch2_journal_flush_async(struct journal *, struct closure *); -void bch2_journal_meta_async(struct journal *, struct closure *); int bch2_journal_flush_seq(struct journal *, u64); int bch2_journal_flush(struct journal *); @@ -486,11 +500,6 @@ static inline int bch2_journal_error(struct journal *j) struct bch_dev; -static inline bool journal_flushes_device(struct bch_dev *ca) -{ - return true; -} - static inline void bch2_journal_set_replay_done(struct journal *j) { BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); @@ -500,6 +509,7 @@ static inline void bch2_journal_set_replay_done(struct journal *j) void bch2_journal_unblock(struct journal *); void bch2_journal_block(struct journal *); +void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); void bch2_journal_debug_to_text(struct printbuf *, struct journal *); void bch2_journal_pins_to_text(struct printbuf *, struct journal *); diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index bd0e6b3..c7fa03c 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -5,15 +5,33 @@ #include "btree_update_interior.h" #include "buckets.h" #include "checksum.h" +#include "disk_groups.h" #include "error.h" #include "io.h" #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" +#include "journal_seq_blacklist.h" #include "replicas.h" #include +static void __journal_replay_free(struct journal_replay *i) +{ + list_del(&i->list); + kvpfree(i, offsetof(struct journal_replay, j) + + 
vstruct_bytes(&i->j)); + +} + +static void journal_replay_free(struct bch_fs *c, struct journal_replay *i) +{ + i->ignore = true; + + if (!c->opts.read_entire_journal) + __journal_replay_free(i); +} + struct journal_list { struct closure cl; struct mutex lock; @@ -29,35 +47,37 @@ struct journal_list { * be replayed: */ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, + struct bch_extent_ptr entry_ptr, struct journal_list *jlist, struct jset *j, bool bad) { - struct journal_replay *i, *pos; - struct bch_devs_list devs = { .nr = 0 }; + struct journal_replay *i, *pos, *dup = NULL; + struct bch_extent_ptr *ptr; struct list_head *where; size_t bytes = vstruct_bytes(j); - __le64 last_seq; - int ret; - - last_seq = !list_empty(jlist->head) - ? list_last_entry(jlist->head, struct journal_replay, - list)->j.last_seq - : 0; + u64 last_seq = 0; + int ret = JOURNAL_ENTRY_ADD_OK; - if (!c->opts.read_entire_journal) { - /* Is this entry older than the range we need? */ - if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) { - ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE; - goto out; + list_for_each_entry_reverse(i, jlist->head, list) { + if (!JSET_NO_FLUSH(&i->j)) { + last_seq = le64_to_cpu(i->j.last_seq); + break; } + } - /* Drop entries we don't need anymore */ + /* Is this entry older than the range we need? */ + if (!c->opts.read_entire_journal && + le64_to_cpu(j->seq) < last_seq) { + ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE; + goto out; + } + + /* Drop entries we don't need anymore */ + if (!JSET_NO_FLUSH(j)) { list_for_each_entry_safe(i, pos, jlist->head, list) { if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq)) break; - list_del(&i->list); - kvpfree(i, offsetof(struct journal_replay, j) + - vstruct_bytes(&i->j)); + journal_replay_free(c, i); } } @@ -70,30 +90,31 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, where = jlist->head; add: - i = where->next != jlist->head + dup = where->next != jlist->head ? container_of(where->next, struct journal_replay, list) : NULL; + if (dup && le64_to_cpu(j->seq) != le64_to_cpu(dup->j.seq)) + dup = NULL; + /* * Duplicate journal entries? 
If so we want the one that didn't have a * checksum error: */ - if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) { - if (i->bad) { - devs = i->devs; - list_del(&i->list); - kvpfree(i, offsetof(struct journal_replay, j) + - vstruct_bytes(&i->j)); + if (dup) { + if (dup->bad) { + /* we'll replace @dup: */ } else if (bad) { + i = dup; goto found; } else { - fsck_err_on(bytes != vstruct_bytes(&i->j) || - memcmp(j, &i->j, bytes), c, + fsck_err_on(bytes != vstruct_bytes(&dup->j) || + memcmp(j, &dup->j, bytes), c, "found duplicate but non identical journal entries (seq %llu)", le64_to_cpu(j->seq)); + i = dup; goto found; } - } i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); @@ -102,16 +123,34 @@ add: goto out; } - list_add(&i->list, where); - i->devs = devs; - i->bad = bad; + i->nr_ptrs = 0; + i->bad = bad; + i->ignore = false; memcpy(&i->j, j, bytes); + + if (dup) { + i->nr_ptrs = dup->nr_ptrs; + memcpy(i->ptrs, dup->ptrs, sizeof(dup->ptrs)); + __journal_replay_free(dup); + } + + list_add(&i->list, where); found: - if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx)) - bch2_dev_list_add_dev(&i->devs, ca->dev_idx); - else - fsck_err_on(1, c, "duplicate journal entries on same device"); - ret = JOURNAL_ENTRY_ADD_OK; + for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) { + if (ptr->dev == ca->dev_idx) { + bch_err(c, "duplicate journal entry %llu on same device", + le64_to_cpu(i->j.seq)); + goto out; + } + } + + if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) { + bch_err(c, "found too many copies of journal entry %llu", + le64_to_cpu(i->j.seq)); + goto out; + } + + i->ptrs[i->nr_ptrs++] = entry_ptr; out: fsck_err: return ret; @@ -161,46 +200,54 @@ static void journal_entry_null_range(void *start, void *end) #define journal_entry_err_on(cond, c, msg, ...) \ ((cond) ? 
journal_entry_err(c, msg, ##__VA_ARGS__) : false) -static int journal_validate_key(struct bch_fs *c, struct jset *jset, +#define FSCK_DELETED_KEY 5 + +static int journal_validate_key(struct bch_fs *c, const char *where, struct jset_entry *entry, unsigned level, enum btree_id btree_id, - struct bkey_i *k, - const char *type, int write) + struct bkey_i *k, const char *type, + unsigned version, int big_endian, int write) { void *next = vstruct_next(entry); const char *invalid; - unsigned version = le32_to_cpu(jset->version); int ret = 0; if (journal_entry_err_on(!k->k.u64s, c, - "invalid %s in journal: k->u64s 0", type)) { + "invalid %s in %s entry offset %zi/%u: k->u64s 0", + type, where, + (u64 *) k - entry->_data, + le16_to_cpu(entry->u64s))) { entry->u64s = cpu_to_le16((u64 *) k - entry->_data); journal_entry_null_range(vstruct_next(entry), next); - return 0; + return FSCK_DELETED_KEY; } if (journal_entry_err_on((void *) bkey_next(k) > (void *) vstruct_next(entry), c, - "invalid %s in journal: extends past end of journal entry", - type)) { + "invalid %s in %s entry offset %zi/%u: extends past end of journal entry", + type, where, + (u64 *) k - entry->_data, + le16_to_cpu(entry->u64s))) { entry->u64s = cpu_to_le16((u64 *) k - entry->_data); journal_entry_null_range(vstruct_next(entry), next); - return 0; + return FSCK_DELETED_KEY; } if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c, - "invalid %s in journal: bad format %u", - type, k->k.format)) { - le16_add_cpu(&entry->u64s, -k->k.u64s); + "invalid %s in %s entry offset %zi/%u: bad format %u", + type, where, + (u64 *) k - entry->_data, + le16_to_cpu(entry->u64s), + k->k.format)) { + le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); memmove(k, bkey_next(k), next - (void *) bkey_next(k)); journal_entry_null_range(vstruct_next(entry), next); - return 0; + return FSCK_DELETED_KEY; } if (!write) - bch2_bkey_compat(level, btree_id, version, - JSET_BIG_ENDIAN(jset), write, - NULL, bkey_to_packed(k)); + bch2_bkey_compat(level, btree_id, version, big_endian, + write, NULL, bkey_to_packed(k)); invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k), __btree_node_type(level, btree_id)); @@ -208,46 +255,50 @@ static int journal_validate_key(struct bch_fs *c, struct jset *jset, char buf[160]; bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k)); - mustfix_fsck_err(c, "invalid %s in journal: %s\n%s", - type, invalid, buf); + mustfix_fsck_err(c, "invalid %s in %s entry offset %zi/%u: %s\n%s", + type, where, + (u64 *) k - entry->_data, + le16_to_cpu(entry->u64s), + invalid, buf); - le16_add_cpu(&entry->u64s, -k->k.u64s); + le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); memmove(k, bkey_next(k), next - (void *) bkey_next(k)); journal_entry_null_range(vstruct_next(entry), next); - return 0; + return FSCK_DELETED_KEY; } if (write) - bch2_bkey_compat(level, btree_id, version, - JSET_BIG_ENDIAN(jset), write, - NULL, bkey_to_packed(k)); + bch2_bkey_compat(level, btree_id, version, big_endian, + write, NULL, bkey_to_packed(k)); fsck_err: return ret; } static int journal_entry_validate_btree_keys(struct bch_fs *c, - struct jset *jset, + const char *where, struct jset_entry *entry, - int write) + unsigned version, int big_endian, int write) { - struct bkey_i *k; + struct bkey_i *k = entry->start; - vstruct_for_each(entry, k) { - int ret = journal_validate_key(c, jset, entry, + while (k != vstruct_last(entry)) { + int ret = journal_validate_key(c, where, entry, entry->level, entry->btree_id, - k, "key", write); - if (ret) - return ret; + k, "key", version, 
big_endian, write); + if (ret == FSCK_DELETED_KEY) + continue; + + k = bkey_next(k); } return 0; } static int journal_entry_validate_btree_root(struct bch_fs *c, - struct jset *jset, + const char *where, struct jset_entry *entry, - int write) + unsigned version, int big_endian, int write) { struct bkey_i *k = entry->start; int ret = 0; @@ -266,25 +317,25 @@ static int journal_entry_validate_btree_root(struct bch_fs *c, return 0; } - return journal_validate_key(c, jset, entry, 1, entry->btree_id, k, - "btree root", write); + return journal_validate_key(c, where, entry, 1, entry->btree_id, k, + "btree root", version, big_endian, write); fsck_err: return ret; } static int journal_entry_validate_prio_ptrs(struct bch_fs *c, - struct jset *jset, + const char *where, struct jset_entry *entry, - int write) + unsigned version, int big_endian, int write) { /* obsolete, don't care: */ return 0; } static int journal_entry_validate_blacklist(struct bch_fs *c, - struct jset *jset, + const char *where, struct jset_entry *entry, - int write) + unsigned version, int big_endian, int write) { int ret = 0; @@ -297,9 +348,9 @@ fsck_err: } static int journal_entry_validate_blacklist_v2(struct bch_fs *c, - struct jset *jset, + const char *where, struct jset_entry *entry, - int write) + unsigned version, int big_endian, int write) { struct jset_entry_blacklist_v2 *bl_entry; int ret = 0; @@ -323,9 +374,9 @@ fsck_err: } static int journal_entry_validate_usage(struct bch_fs *c, - struct jset *jset, + const char *where, struct jset_entry *entry, - int write) + unsigned version, int big_endian, int write) { struct jset_entry_usage *u = container_of(entry, struct jset_entry_usage, entry); @@ -344,9 +395,9 @@ fsck_err: } static int journal_entry_validate_data_usage(struct bch_fs *c, - struct jset *jset, + const char *where, struct jset_entry *entry, - int write) + unsigned version, int big_endian, int write) { struct jset_entry_data_usage *u = container_of(entry, struct jset_entry_data_usage, entry); @@ -365,9 +416,72 @@ fsck_err: return ret; } +static int journal_entry_validate_clock(struct bch_fs *c, + const char *where, + struct jset_entry *entry, + unsigned version, int big_endian, int write) +{ + struct jset_entry_clock *clock = + container_of(entry, struct jset_entry_clock, entry); + unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); + int ret = 0; + + if (journal_entry_err_on(bytes != sizeof(*clock), + c, "invalid journal entry clock: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + + if (journal_entry_err_on(clock->rw > 1, + c, "invalid journal entry clock: bad rw")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + +fsck_err: + return ret; +} + +static int journal_entry_validate_dev_usage(struct bch_fs *c, + const char *where, + struct jset_entry *entry, + unsigned version, int big_endian, int write) +{ + struct jset_entry_dev_usage *u = + container_of(entry, struct jset_entry_dev_usage, entry); + unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); + unsigned expected = sizeof(*u) + sizeof(u->d[0]) * 7; /* Current value of BCH_DATA_NR */ + unsigned dev; + int ret = 0; + + if (journal_entry_err_on(bytes < expected, + c, "invalid journal entry dev usage: bad size (%u < %u)", + bytes, expected)) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + + dev = le32_to_cpu(u->dev); + + if (journal_entry_err_on(!bch2_dev_exists2(c, dev), + c, "invalid journal entry dev usage: bad dev")) { + 
journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + + if (journal_entry_err_on(u->pad, + c, "invalid journal entry dev usage: bad pad")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + +fsck_err: + return ret; +} + struct jset_entry_ops { - int (*validate)(struct bch_fs *, struct jset *, - struct jset_entry *, int); + int (*validate)(struct bch_fs *, const char *, + struct jset_entry *, unsigned, int, int); }; static const struct jset_entry_ops bch2_jset_entry_ops[] = { @@ -379,22 +493,29 @@ static const struct jset_entry_ops bch2_jset_entry_ops[] = { #undef x }; -static int journal_entry_validate(struct bch_fs *c, struct jset *jset, - struct jset_entry *entry, int write) +int bch2_journal_entry_validate(struct bch_fs *c, const char *where, + struct jset_entry *entry, + unsigned version, int big_endian, int write) { return entry->type < BCH_JSET_ENTRY_NR - ? bch2_jset_entry_ops[entry->type].validate(c, jset, - entry, write) + ? bch2_jset_entry_ops[entry->type].validate(c, where, entry, + version, big_endian, write) : 0; } static int jset_validate_entries(struct bch_fs *c, struct jset *jset, int write) { + char buf[100]; struct jset_entry *entry; int ret = 0; vstruct_for_each(jset, entry) { + scnprintf(buf, sizeof(buf), "jset %llu entry offset %zi/%u", + le64_to_cpu(jset->seq), + (u64 *) entry - jset->_data, + le32_to_cpu(jset->u64s)); + if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset), c, "journal entry extends past end of jset")) { @@ -402,7 +523,9 @@ static int jset_validate_entries(struct bch_fs *c, struct jset *jset, break; } - ret = journal_entry_validate(c, jset, entry, write); + ret = bch2_journal_entry_validate(c, buf, entry, + le32_to_cpu(jset->version), + JSET_BIG_ENDIAN(jset), write); if (ret) break; } @@ -430,52 +553,70 @@ static int jset_validate(struct bch_fs *c, version < bcachefs_metadata_version_min) || version >= bcachefs_metadata_version_max, c, "%s sector %llu seq %llu: unknown journal entry version %u", - ca->name, sector, le64_to_cpu(jset->seq), + ca ? ca->name : c->name, + sector, le64_to_cpu(jset->seq), version)) { - /* XXX: note we might have missing journal entries */ - return JOURNAL_ENTRY_BAD; + /* don't try to continue: */ + return EINVAL; } + if (bytes > (sectors_read << 9) && + sectors_read < bucket_sectors_left) + return JOURNAL_ENTRY_REREAD; + if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c, "%s sector %llu seq %llu: journal entry too big (%zu bytes)", - ca->name, sector, le64_to_cpu(jset->seq), bytes)) { - /* XXX: note we might have missing journal entries */ - return JOURNAL_ENTRY_BAD; + ca ? ca->name : c->name, + sector, le64_to_cpu(jset->seq), bytes)) { + ret = JOURNAL_ENTRY_BAD; + le32_add_cpu(&jset->u64s, + -((bytes - (bucket_sectors_left << 9)) / 8)); } - if (bytes > sectors_read << 9) - return JOURNAL_ENTRY_REREAD; - - if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c, + if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c, "%s sector %llu seq %llu: journal entry with unknown csum type %llu", - ca->name, sector, le64_to_cpu(jset->seq), - JSET_CSUM_TYPE(jset))) - return JOURNAL_ENTRY_BAD; + ca ? 
ca->name : c->name, + sector, le64_to_cpu(jset->seq), + JSET_CSUM_TYPE(jset))) { + ret = JOURNAL_ENTRY_BAD; + goto csum_done; + } + + if (write) + goto csum_done; csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c, "%s sector %llu seq %llu: journal checksum bad", - ca->name, sector, le64_to_cpu(jset->seq))) { - /* XXX: retry IO, when we start retrying checksum errors */ - /* XXX: note we might have missing journal entries */ - return JOURNAL_ENTRY_BAD; - } + ca ? ca->name : c->name, + sector, le64_to_cpu(jset->seq))) + ret = JOURNAL_ENTRY_BAD; bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset->encrypted_start, vstruct_end(jset) - (void *) jset->encrypted_start); - - if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, - "invalid journal entry: last_seq > seq")) { +csum_done: + /* last_seq is ignored when JSET_NO_FLUSH is true */ + if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && + le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, + "invalid journal entry: last_seq > seq (%llu > %llu)", + le64_to_cpu(jset->last_seq), + le64_to_cpu(jset->seq))) { jset->last_seq = jset->seq; return JOURNAL_ENTRY_BAD; } - - return 0; fsck_err: return ret; } +static int jset_validate_for_write(struct bch_fs *c, struct jset *jset) +{ + unsigned sectors = vstruct_sectors(jset, c->block_bits); + + return jset_validate(c, NULL, jset, 0, sectors, sectors, WRITE) ?: + jset_validate_entries(c, jset, WRITE); +} + struct journal_read_buf { void *data; size_t size; @@ -536,10 +677,17 @@ reread: bio_put(bio); if (bch2_dev_io_err_on(ret, ca, - "journal read from sector %llu", + "journal read error: sector %llu", offset) || - bch2_meta_read_fault("journal")) - return -EIO; + bch2_meta_read_fault("journal")) { + /* + * We don't error out of the recovery process + * here, since the relevant journal entry may be + * found on a different device, and missing or + * no journal entries will be handled later + */ + return 0; + } j = buf->data; } @@ -589,7 +737,10 @@ reread: ja->bucket_seq[bucket] = le64_to_cpu(j->seq); mutex_lock(&jlist->lock); - ret = journal_entry_add(c, ca, jlist, j, ret != 0); + ret = journal_entry_add(c, ca, (struct bch_extent_ptr) { + .dev = ca->dev_idx, + .offset = offset, + }, jlist, j, ret != 0); mutex_unlock(&jlist->lock); switch (ret) { @@ -677,14 +828,35 @@ err: goto out; } -int bch2_journal_read(struct bch_fs *c, struct list_head *list) +static void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + struct journal_replay *j) +{ + unsigned i; + + for (i = 0; i < j->nr_ptrs; i++) { + struct bch_dev *ca = c->devs[j->ptrs[i].dev]; + u64 offset; + + div64_u64_rem(j->ptrs[i].offset, ca->mi.bucket_size, &offset); + + if (i) + pr_buf(out, " "); + pr_buf(out, "%u:%llu (offset %llu)", + j->ptrs[i].dev, + (u64) j->ptrs[i].offset, offset); + } +} + +int bch2_journal_read(struct bch_fs *c, struct list_head *list, + u64 *blacklist_seq, u64 *start_seq) { struct journal_list jlist; - struct journal_replay *i; + struct journal_replay *i, *t; struct bch_dev *ca; unsigned iter; size_t keys = 0, entries = 0; bool degraded = false; + u64 seq, last_seq = 0; int ret = 0; closure_init_stack(&jlist.cl); @@ -697,8 +869,8 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) continue; - if ((ca->mi.state == BCH_MEMBER_STATE_RW || - ca->mi.state == BCH_MEMBER_STATE_RO) && + if ((ca->mi.state == BCH_MEMBER_STATE_rw || 
+ ca->mi.state == BCH_MEMBER_STATE_ro) && percpu_ref_tryget(&ca->io_ref)) closure_call(&ca->journal.read, bch2_journal_read_device, @@ -713,23 +885,129 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) if (jlist.ret) return jlist.ret; + if (list_empty(list)) { + bch_info(c, "journal read done, but no entries found"); + return 0; + } + + i = list_last_entry(list, struct journal_replay, list); + *start_seq = le64_to_cpu(i->j.seq) + 1; + + /* + * Find most recent flush entry, and ignore newer non flush entries - + * those entries will be blacklisted: + */ + list_for_each_entry_safe_reverse(i, t, list, list) { + if (i->ignore) + continue; + + if (!JSET_NO_FLUSH(&i->j)) { + last_seq = le64_to_cpu(i->j.last_seq); + *blacklist_seq = le64_to_cpu(i->j.seq) + 1; + break; + } + + journal_replay_free(c, i); + } + + if (!last_seq) { + fsck_err(c, "journal read done, but no entries found after dropping non-flushes"); + return -1; + } + + /* Drop blacklisted entries and entries older than last_seq: */ + list_for_each_entry_safe(i, t, list, list) { + if (i->ignore) + continue; + + seq = le64_to_cpu(i->j.seq); + if (seq < last_seq) { + journal_replay_free(c, i); + continue; + } + + if (bch2_journal_seq_is_blacklisted(c, seq, true)) { + fsck_err_on(!JSET_NO_FLUSH(&i->j), c, + "found blacklisted journal entry %llu", seq); + + journal_replay_free(c, i); + } + } + + /* Check for missing entries: */ + seq = last_seq; + list_for_each_entry(i, list, list) { + if (i->ignore) + continue; + + BUG_ON(seq > le64_to_cpu(i->j.seq)); + + while (seq < le64_to_cpu(i->j.seq)) { + u64 missing_start, missing_end; + char buf1[200], buf2[200]; + + while (seq < le64_to_cpu(i->j.seq) && + bch2_journal_seq_is_blacklisted(c, seq, false)) + seq++; + + if (seq == le64_to_cpu(i->j.seq)) + break; + + missing_start = seq; + + while (seq < le64_to_cpu(i->j.seq) && + !bch2_journal_seq_is_blacklisted(c, seq, false)) + seq++; + + if (i->list.prev != list) { + struct printbuf out = PBUF(buf1); + struct journal_replay *p = list_prev_entry(i, list); + + bch2_journal_ptrs_to_text(&out, c, p); + pr_buf(&out, " size %llu", vstruct_sectors(&p->j, c->block_bits)); + } else + sprintf(buf1, "(none)"); + bch2_journal_ptrs_to_text(&PBUF(buf2), c, i); + + missing_end = seq - 1; + fsck_err(c, "journal entries %llu-%llu missing! 
(replaying %llu-%llu)\n" + " prev at %s\n" + " next at %s", + missing_start, missing_end, + last_seq, *blacklist_seq - 1, + buf1, buf2); + } + + seq++; + } + list_for_each_entry(i, list, list) { struct jset_entry *entry; struct bkey_i *k, *_n; - struct bch_replicas_padded replicas; + struct bch_replicas_padded replicas = { + .e.data_type = BCH_DATA_journal, + .e.nr_required = 1, + }; + unsigned ptr; char buf[80]; + if (i->ignore) + continue; + ret = jset_validate_entries(c, &i->j, READ); if (ret) goto fsck_err; + for (ptr = 0; ptr < i->nr_ptrs; ptr++) + replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev; + + bch2_replicas_entry_sort(&replicas.e); + /* * If we're mounting in degraded mode - if we didn't read all * the devices - this is wrong: */ - bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, i->devs); - if (!degraded && (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c, @@ -746,12 +1024,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) entries++; } - if (!list_empty(list)) { - i = list_last_entry(list, struct journal_replay, list); + bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu", + keys, entries, *start_seq); - bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu", - keys, entries, le64_to_cpu(i->j.seq)); - } + if (*start_seq != *blacklist_seq) + bch_info(c, "dropped unflushed entries %llu-%llu", + *blacklist_seq, *start_seq - 1); fsck_err: return ret; } @@ -785,7 +1063,7 @@ static void __journal_write_alloc(struct journal *j, * it: */ if (!ca->mi.durability || - ca->mi.state != BCH_MEMBER_STATE_RW || + ca->mi.state != BCH_MEMBER_STATE_rw || !ja->nr || bch2_bkey_has_device(bkey_i_to_s_c(&w->key), ca->dev_idx) || @@ -820,16 +1098,20 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, unsigned sectors) { struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_devs_mask devs; struct journal_device *ja; struct bch_dev *ca; struct dev_alloc_list devs_sorted; + unsigned target = c->opts.metadata_target ?: + c->opts.foreground_target; unsigned i, replicas = 0, replicas_want = READ_ONCE(c->opts.metadata_replicas); rcu_read_lock(); +retry: + devs = target_rw_devs(c, BCH_DATA_journal, target); - devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, - &c->rw_devs[BCH_DATA_journal]); + devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); @@ -861,9 +1143,17 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); + + if (replicas < replicas_want && target) { + /* Retry from all devices: */ + target = 0; + goto retry; + } done: rcu_read_unlock(); + BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); + return replicas >= c->opts.metadata_replicas_required ? 
0 : -EROFS; } @@ -924,41 +1214,61 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) return; memcpy(new_buf, buf->data, buf->buf_size); - kvpfree(buf->data, buf->buf_size); - buf->data = new_buf; - buf->buf_size = new_size; + + spin_lock(&j->lock); + swap(buf->data, new_buf); + swap(buf->buf_size, new_size); + spin_unlock(&j->lock); + + kvpfree(new_buf, new_size); +} + +static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) +{ + return j->buf + j->reservations.unwritten_idx; } static void journal_write_done(struct closure *cl) { struct journal *j = container_of(cl, struct journal, io); struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *w = journal_prev_buf(j); + struct journal_buf *w = journal_last_unwritten_buf(j); struct bch_devs_list devs = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); struct bch_replicas_padded replicas; - u64 seq = le64_to_cpu(w->data->seq); - u64 last_seq = le64_to_cpu(w->data->last_seq); + union journal_res_state old, new; + u64 v, seq, last_seq; + int err = 0; bch2_time_stats_update(j->write_time, j->write_start_time); if (!devs.nr) { bch_err(c, "unable to write journal to sufficient devices"); - goto err; + err = -EIO; + } else { + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs); + if (bch2_mark_replicas(c, &replicas.e)) + err = -EIO; } - bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs); - - if (bch2_mark_replicas(c, &replicas.e)) - goto err; + if (err) + bch2_fatal_error(c); spin_lock(&j->lock); + seq = le64_to_cpu(w->data->seq); + last_seq = le64_to_cpu(w->data->last_seq); + if (seq >= j->pin.front) journal_seq_pin(j, seq)->devs = devs; j->seq_ondisk = seq; - j->last_seq_ondisk = last_seq; - bch2_journal_space_available(j); + if (err && (!j->err_seq || seq < j->err_seq)) + j->err_seq = seq; + + if (!JSET_NO_FLUSH(w->data)) { + j->flushed_seq_ondisk = seq; + j->last_seq_ondisk = last_seq; + } /* * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard @@ -967,14 +1277,21 @@ static void journal_write_done(struct closure *cl) * Must come before signaling write completion, for * bch2_fs_journal_stop(): */ - mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0); -out: + journal_reclaim_kick(&c->journal); + /* also must come before signalling write completion: */ closure_debug_destroy(cl); - BUG_ON(!j->reservations.prev_buf_unwritten); - atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v, - &j->reservations.counter); + v = atomic64_read(&j->reservations.counter); + do { + old.v = new.v = v; + BUG_ON(new.idx == new.unwritten_idx); + + new.unwritten_idx++; + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); + + bch2_journal_space_available(j); closure_wake_up(&w->wait); journal_wake(j); @@ -982,11 +1299,10 @@ out: if (test_bit(JOURNAL_NEED_WRITE, &j->flags)) mod_delayed_work(system_freezable_wq, &j->write_work, 0); spin_unlock(&j->lock); - return; -err: - bch2_fatal_error(c); - spin_lock(&j->lock); - goto out; + + if (new.unwritten_idx != new.idx && + !journal_state_count(new, new.unwritten_idx)) + closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL); } static void journal_write_endio(struct bio *bio) @@ -994,10 +1310,10 @@ static void journal_write_endio(struct bio *bio) struct bch_dev *ca = bio->bi_private; struct journal *j = &ca->fs->journal; - if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write: %s", + if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write error: 
%s", bch2_blk_status_to_str(bio->bi_status)) || bch2_meta_write_fault("journal")) { - struct journal_buf *w = journal_prev_buf(j); + struct journal_buf *w = journal_last_unwritten_buf(j); unsigned long flags; spin_lock_irqsave(&j->err_lock, flags); @@ -1009,27 +1325,93 @@ static void journal_write_endio(struct bio *bio) percpu_ref_put(&ca->io_ref); } +static void do_journal_write(struct closure *cl) +{ + struct journal *j = container_of(cl, struct journal, io); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + struct journal_buf *w = journal_last_unwritten_buf(j); + struct bch_extent_ptr *ptr; + struct bio *bio; + unsigned sectors = vstruct_sectors(w->data, c->block_bits); + + extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { + ca = bch_dev_bkey_exists(c, ptr->dev); + if (!percpu_ref_tryget(&ca->io_ref)) { + /* XXX: fix this */ + bch_err(c, "missing device for journal write\n"); + continue; + } + + this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], + sectors); + + bio = ca->journal.bio; + bio_reset(bio); + bio_set_dev(bio, ca->disk_sb.bdev); + bio->bi_iter.bi_sector = ptr->offset; + bio->bi_end_io = journal_write_endio; + bio->bi_private = ca; + bio->bi_opf = REQ_OP_WRITE|REQ_SYNC|REQ_META; + + BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); + ca->prev_journal_sector = bio->bi_iter.bi_sector; + + if (!JSET_NO_FLUSH(w->data)) + bio->bi_opf |= REQ_FUA; + if (!JSET_NO_FLUSH(w->data) && !w->separate_flush) + bio->bi_opf |= REQ_PREFLUSH; + + bch2_bio_map(bio, w->data, sectors << 9); + + trace_journal_write(bio); + closure_bio_submit(bio, cl); + + ca->journal.bucket_seq[ca->journal.cur_idx] = + le64_to_cpu(w->data->seq); + } + + continue_at(cl, journal_write_done, system_highpri_wq); + return; +} + void bch2_journal_write(struct closure *cl) { struct journal *j = container_of(cl, struct journal, io); struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; - struct journal_buf *w = journal_prev_buf(j); + struct journal_buf *w = journal_last_unwritten_buf(j); struct jset_entry *start, *end; struct jset *jset; struct bio *bio; - struct bch_extent_ptr *ptr; + char *journal_debug_buf = NULL; bool validate_before_checksum = false; - unsigned i, sectors, bytes, u64s; + unsigned i, sectors, bytes, u64s, nr_rw_members = 0; int ret; - bch2_journal_pin_put(j, le64_to_cpu(w->data->seq)); + BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); journal_buf_realloc(j, w); jset = w->data; j->write_start_time = local_clock(); + spin_lock(&j->lock); + if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) && + !w->must_flush && + (jiffies - j->last_flush_write) < msecs_to_jiffies(j->write_delay_ms) && + test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) { + w->noflush = true; + SET_JSET_NO_FLUSH(jset, true); + jset->last_seq = 0; + + j->nr_noflush_writes++; + } else { + j->last_flush_write = jiffies; + j->nr_flush_writes++; + } + spin_unlock(&j->lock); + /* * New btree roots are set by journalling them; when the journal entry * gets written we have to propagate them to c->btree_roots @@ -1046,8 +1428,8 @@ void bch2_journal_write(struct closure *cl) end = bch2_btree_roots_to_journal_entries(c, jset->start, end); - end = bch2_journal_super_entries_add_common(c, end, - le64_to_cpu(jset->seq)); + bch2_journal_super_entries_add_common(c, &end, + le64_to_cpu(jset->seq)); u64s = (u64 *) end - (u64 *) start; BUG_ON(u64s > j->entry_u64s_reserved); @@ -1056,10 +1438,7 @@ void bch2_journal_write(struct closure *cl) journal_write_compact(jset); - 
jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); - jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); jset->magic = cpu_to_le64(jset_magic(c)); - jset->version = c->sb.version < bcachefs_metadata_version_new_versioning ? cpu_to_le32(BCH_JSET_VERSION_OLD) : cpu_to_le32(c->sb.version); @@ -1067,14 +1446,17 @@ void bch2_journal_write(struct closure *cl) SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); + if (journal_entry_empty(jset)) + j->last_empty_seq = le64_to_cpu(jset->seq); + if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) validate_before_checksum = true; - if (le32_to_cpu(jset->version) < bcachefs_metadata_version_max) + if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current) validate_before_checksum = true; if (validate_before_checksum && - jset_validate_entries(c, jset, WRITE)) + jset_validate_for_write(c, jset)) goto err; bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), @@ -1085,7 +1467,7 @@ void bch2_journal_write(struct closure *cl) journal_nonce(jset), jset); if (!validate_before_checksum && - jset_validate_entries(c, jset, WRITE)) + jset_validate_for_write(c, jset)) goto err; sectors = vstruct_sectors(jset, c->block_bits); @@ -1104,6 +1486,12 @@ retry_alloc: goto retry_alloc; } + if (ret) { + journal_debug_buf = kmalloc(4096, GFP_ATOMIC); + if (journal_debug_buf) + __bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j); + } + /* * write is allocated, no longer need to account for it in * bch2_journal_space_available(): @@ -1118,7 +1506,9 @@ retry_alloc: spin_unlock(&j->lock); if (ret) { - bch_err(c, "Unable to allocate journal write"); + bch_err(c, "Unable to allocate journal write:\n%s", + journal_debug_buf); + kfree(journal_debug_buf); bch2_fatal_error(c); continue_at(cl, journal_write_done, system_highpri_wq); return; @@ -1131,36 +1521,14 @@ retry_alloc: if (c->opts.nochanges) goto no_io; - extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); - if (!percpu_ref_tryget(&ca->io_ref)) { - /* XXX: fix this */ - bch_err(c, "missing device for journal write\n"); - continue; - } - - this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], - sectors); - - bio = ca->journal.bio; - bio_reset(bio); - bio_set_dev(bio, ca->disk_sb.bdev); - bio->bi_iter.bi_sector = ptr->offset; - bio->bi_end_io = journal_write_endio; - bio->bi_private = ca; - bio_set_op_attrs(bio, REQ_OP_WRITE, - REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA); - bch2_bio_map(bio, jset, sectors << 9); - - trace_journal_write(bio); - closure_bio_submit(bio, cl); + for_each_rw_member(ca, c, i) + nr_rw_members++; - ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq); - } + if (nr_rw_members > 1) + w->separate_flush = true; - for_each_rw_member(ca, c, i) - if (journal_flushes_device(ca) && - !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) { + if (!JSET_NO_FLUSH(jset) && w->separate_flush) { + for_each_rw_member(ca, c, i) { percpu_ref_get(&ca->io_ref); bio = ca->journal.bio; @@ -1171,7 +1539,12 @@ retry_alloc: bio->bi_private = ca; closure_bio_submit(bio, cl); } + } + + bch2_bucket_seq_cleanup(c); + continue_at(cl, do_journal_write, system_highpri_wq); + return; no_io: bch2_bucket_seq_cleanup(c); diff --git a/libbcachefs/journal_io.h b/libbcachefs/journal_io.h index 6958ee0..f34281a 100644 --- a/libbcachefs/journal_io.h +++ b/libbcachefs/journal_io.h @@ -8,9 +8,12 @@ */ struct journal_replay { struct list_head list; - struct bch_devs_list devs; + struct 
bch_extent_ptr ptrs[BCH_REPLICAS_MAX]; + unsigned nr_ptrs; + /* checksum error, but we may want to try using it anyways: */ bool bad; + bool ignore; /* must be last: */ struct jset j; }; @@ -37,7 +40,10 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \ vstruct_for_each_safe(entry, k, _n) -int bch2_journal_read(struct bch_fs *, struct list_head *); +int bch2_journal_entry_validate(struct bch_fs *, const char *, struct jset_entry *, + unsigned, int, int); + +int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *); void bch2_journal_write(struct closure *); diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index 5759198..7be6c65 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -1,12 +1,18 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "btree_key_cache.h" +#include "error.h" #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" #include "replicas.h" #include "super.h" +#include +#include +#include + /* Free space calculations: */ static unsigned journal_space_from(struct journal_device *ja, @@ -53,82 +59,108 @@ static void journal_set_remaining(struct journal *j, unsigned u64s_remaining) old.v, new.v)) != old.v); } -static struct journal_space { - unsigned next_entry; - unsigned remaining; -} __journal_space_available(struct journal *j, unsigned nr_devs_want, - enum journal_space_from from) +static inline unsigned get_unwritten_sectors(struct journal *j, unsigned *idx) { - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_dev *ca; - unsigned sectors_next_entry = UINT_MAX; - unsigned sectors_total = UINT_MAX; - unsigned i, nr_devs = 0; - unsigned unwritten_sectors = j->reservations.prev_buf_unwritten - ? 
journal_prev_buf(j)->sectors - : 0; + unsigned sectors = 0; - rcu_read_lock(); - for_each_member_device_rcu(ca, c, i, - &c->rw_devs[BCH_DATA_journal]) { - struct journal_device *ja = &ca->journal; - unsigned buckets_this_device, sectors_this_device; + while (!sectors && *idx != j->reservations.idx) { + sectors = j->buf[*idx].sectors; - if (!ja->nr) - continue; + *idx = (*idx + 1) & JOURNAL_BUF_MASK; + } - buckets_this_device = bch2_journal_dev_buckets_available(j, ja, from); - sectors_this_device = ja->sectors_free; + return sectors; +} - /* - * We that we don't allocate the space for a journal entry - * until we write it out - thus, account for it here: - */ - if (unwritten_sectors >= sectors_this_device) { - if (!buckets_this_device) - continue; +static struct journal_space +journal_dev_space_available(struct journal *j, struct bch_dev *ca, + enum journal_space_from from) +{ + struct journal_device *ja = &ca->journal; + unsigned sectors, buckets, unwritten, idx = j->reservations.unwritten_idx; - buckets_this_device--; - sectors_this_device = ca->mi.bucket_size; - } + if (from == journal_space_total) + return (struct journal_space) { + .next_entry = ca->mi.bucket_size, + .total = ca->mi.bucket_size * ja->nr, + }; - sectors_this_device -= unwritten_sectors; + buckets = bch2_journal_dev_buckets_available(j, ja, from); + sectors = ja->sectors_free; - if (sectors_this_device < ca->mi.bucket_size && - buckets_this_device) { - buckets_this_device--; - sectors_this_device = ca->mi.bucket_size; + /* + * We that we don't allocate the space for a journal entry + * until we write it out - thus, account for it here: + */ + while ((unwritten = get_unwritten_sectors(j, &idx))) { + if (unwritten >= sectors) { + if (!buckets) { + sectors = 0; + break; + } + + buckets--; + sectors = ca->mi.bucket_size; } - if (!sectors_this_device) + sectors -= unwritten; + } + + if (sectors < ca->mi.bucket_size && buckets) { + buckets--; + sectors = ca->mi.bucket_size; + } + + return (struct journal_space) { + .next_entry = sectors, + .total = sectors + buckets * ca->mi.bucket_size, + }; +} + +static struct journal_space __journal_space_available(struct journal *j, unsigned nr_devs_want, + enum journal_space_from from) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + unsigned i, pos, nr_devs = 0; + struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX]; + + BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space)); + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, + &c->rw_devs[BCH_DATA_journal]) { + if (!ca->journal.nr) continue; - sectors_next_entry = min(sectors_next_entry, - sectors_this_device); + space = journal_dev_space_available(j, ca, from); + if (!space.next_entry) + continue; - sectors_total = min(sectors_total, - buckets_this_device * ca->mi.bucket_size + - sectors_this_device); + for (pos = 0; pos < nr_devs; pos++) + if (space.total > dev_space[pos].total) + break; - nr_devs++; + array_insert_item(dev_space, nr_devs, pos, space); } rcu_read_unlock(); if (nr_devs < nr_devs_want) return (struct journal_space) { 0, 0 }; - return (struct journal_space) { - .next_entry = sectors_next_entry, - .remaining = max_t(int, 0, sectors_total - sectors_next_entry), - }; + /* + * We sorted largest to smallest, and we want the smallest out of the + * @nr_devs_want largest devices: + */ + return dev_space[nr_devs_want - 1]; } void bch2_journal_space_available(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; - struct journal_space 
discarded, clean_ondisk, clean; - unsigned overhead, u64s_remaining = 0; + unsigned clean, clean_ondisk, total; + s64 u64s_remaining = 0; unsigned max_entry_size = min(j->buf[0].buf_size >> 9, j->buf[1].buf_size >> 9); unsigned i, nr_online = 0, nr_devs_want; @@ -164,31 +196,53 @@ void bch2_journal_space_available(struct journal *j) j->can_discard = can_discard; if (nr_online < c->opts.metadata_replicas_required) { - ret = -EROFS; - goto out; - } - - if (!fifo_free(&j->pin)) { - ret = -ENOSPC; + ret = cur_entry_insufficient_devices; goto out; } nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas); - discarded = __journal_space_available(j, nr_devs_want, journal_space_discarded); - clean_ondisk = __journal_space_available(j, nr_devs_want, journal_space_clean_ondisk); - clean = __journal_space_available(j, nr_devs_want, journal_space_clean); + for (i = 0; i < journal_space_nr; i++) + j->space[i] = __journal_space_available(j, nr_devs_want, i); + + clean_ondisk = j->space[journal_space_clean_ondisk].total; + clean = j->space[journal_space_clean].total; + total = j->space[journal_space_total].total; - if (!discarded.next_entry) - ret = -ENOSPC; + if (!clean_ondisk && + j->reservations.idx == + j->reservations.unwritten_idx) { + char *buf = kmalloc(4096, GFP_ATOMIC); - overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) * - journal_entry_overhead(j); - u64s_remaining = clean.remaining << 6; - u64s_remaining = max_t(int, 0, u64s_remaining - overhead); + bch_err(c, "journal stuck"); + if (buf) { + __bch2_journal_debug_to_text(&_PBUF(buf, 4096), j); + pr_err("\n%s", buf); + kfree(buf); + } + + bch2_fatal_error(c); + ret = cur_entry_journal_stuck; + } else if (!j->space[journal_space_discarded].next_entry) + ret = cur_entry_journal_full; + else if (!fifo_free(&j->pin)) + ret = cur_entry_journal_pin_full; + + if ((j->space[journal_space_clean_ondisk].next_entry < + j->space[journal_space_clean_ondisk].total) && + (clean - clean_ondisk <= total / 8) && + (clean_ondisk * 2 > clean )) + set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); + else + clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); + + u64s_remaining = (u64) clean << 6; + u64s_remaining -= (u64) total << 3; + u64s_remaining = max(0LL, u64s_remaining); u64s_remaining /= 4; + u64s_remaining = min_t(u64, u64s_remaining, U32_MAX); out: - j->cur_entry_sectors = !ret ? discarded.next_entry : 0; + j->cur_entry_sectors = !ret ? 
j->space[journal_space_discarded].next_entry : 0; j->cur_entry_error = ret; journal_set_remaining(j, u64s_remaining); journal_check_may_get_unreserved(j); @@ -263,6 +317,7 @@ static void bch2_journal_reclaim_fast(struct journal *j) while (!fifo_empty(&j->pin) && !atomic_read(&fifo_peek_front(&j->pin).count)) { BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list)); + BUG_ON(!list_empty(&fifo_peek_front(&j->pin).flushed)); BUG_ON(!fifo_pop(&j->pin, temp)); popped = true; } @@ -271,6 +326,14 @@ static void bch2_journal_reclaim_fast(struct journal *j) bch2_journal_space_available(j); } +void __bch2_journal_pin_put(struct journal *j, u64 seq) +{ + struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); + + if (atomic_dec_and_test(&pin_list->count)) + bch2_journal_reclaim_fast(j); +} + void bch2_journal_pin_put(struct journal *j, u64 seq) { struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); @@ -290,6 +353,9 @@ static inline void __journal_pin_drop(struct journal *j, if (!journal_pin_active(pin)) return; + if (j->flush_in_progress == pin) + j->flush_in_progress_dropped = true; + pin_list = journal_seq_pin(j, pin->seq); pin->seq = 0; list_del_init(&pin->list); @@ -314,60 +380,39 @@ void bch2_journal_pin_drop(struct journal *j, spin_unlock(&j->lock); } -static void bch2_journal_pin_add_locked(struct journal *j, u64 seq, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) +void bch2_journal_pin_set(struct journal *j, u64 seq, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) { - struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); - - __journal_pin_drop(j, pin); - - BUG_ON(!atomic_read(&pin_list->count) && seq == journal_last_seq(j)); - - atomic_inc(&pin_list->count); - pin->seq = seq; - pin->flush = flush_fn; - - list_add(&pin->list, flush_fn ? 
&pin_list->list : &pin_list->flushed); -} + struct journal_entry_pin_list *pin_list; -void __bch2_journal_pin_add(struct journal *j, u64 seq, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) -{ spin_lock(&j->lock); - bch2_journal_pin_add_locked(j, seq, pin, flush_fn); - spin_unlock(&j->lock); - /* - * If the journal is currently full, we might want to call flush_fn - * immediately: - */ - journal_wake(j); -} - -void bch2_journal_pin_update(struct journal *j, u64 seq, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) -{ - if (journal_pin_active(pin) && pin->seq < seq) + if (seq < journal_last_seq(j)) { + /* + * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on + * the src pin - with the pin dropped, the entry to pin might no + * longer to exist, but that means there's no longer anything to + * copy and we can bail out here: + */ + spin_unlock(&j->lock); return; + } - spin_lock(&j->lock); + pin_list = journal_seq_pin(j, seq); - if (pin->seq != seq) { - bch2_journal_pin_add_locked(j, seq, pin, flush_fn); - } else { - struct journal_entry_pin_list *pin_list = - journal_seq_pin(j, seq); + __journal_pin_drop(j, pin); - /* - * If the pin is already pinning the right sequence number, it - * still might've already been flushed: - */ - list_move(&pin->list, &pin_list->list); - } + atomic_inc(&pin_list->count); + pin->seq = seq; + pin->flush = flush_fn; + if (flush_fn == bch2_btree_key_cache_journal_flush) + list_add(&pin->list, &pin_list->key_cache_list); + else if (flush_fn) + list_add(&pin->list, &pin_list->list); + else + list_add(&pin->list, &pin_list->flushed); spin_unlock(&j->lock); /* @@ -377,20 +422,6 @@ void bch2_journal_pin_update(struct journal *j, u64 seq, journal_wake(j); } -void bch2_journal_pin_copy(struct journal *j, - struct journal_entry_pin *dst, - struct journal_entry_pin *src, - journal_pin_flush_fn flush_fn) -{ - spin_lock(&j->lock); - - if (journal_pin_active(src) && - (!journal_pin_active(dst) || src->seq < dst->seq)) - bch2_journal_pin_add_locked(j, src->seq, dst, flush_fn); - - spin_unlock(&j->lock); -} - /** * bch2_journal_pin_flush: ensure journal pin callback is no longer running */ @@ -411,88 +442,106 @@ void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin) */ static struct journal_entry_pin * -journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq) +journal_get_next_pin(struct journal *j, + bool get_any, + bool get_key_cache, + u64 max_seq, u64 *seq) { struct journal_entry_pin_list *pin_list; struct journal_entry_pin *ret = NULL; - if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)) - return NULL; - - spin_lock(&j->lock); - - fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) - if (*seq > max_seq || - (ret = list_first_entry_or_null(&pin_list->list, - struct journal_entry_pin, list))) + fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) { + if (*seq > max_seq && !get_any && !get_key_cache) break; - if (ret) { - list_move(&ret->list, &pin_list->flushed); - BUG_ON(j->flush_in_progress); - j->flush_in_progress = ret; - j->last_flushed = jiffies; - } + if (*seq <= max_seq || get_any) { + ret = list_first_entry_or_null(&pin_list->list, + struct journal_entry_pin, list); + if (ret) + return ret; + } - spin_unlock(&j->lock); + if (*seq <= max_seq || get_any || get_key_cache) { + ret = list_first_entry_or_null(&pin_list->key_cache_list, + struct journal_entry_pin, list); + if (ret) + return ret; + } + } - return ret; + return NULL; } /* returns true if we did work */ -static bool journal_flush_pins(struct 
journal *j, u64 seq_to_flush, - unsigned min_nr) +static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush, + unsigned min_any, + unsigned min_key_cache) { struct journal_entry_pin *pin; - bool ret = false; + size_t nr_flushed = 0; + journal_pin_flush_fn flush_fn; u64 seq; + int err; + + if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)) + return 0; lockdep_assert_held(&j->reclaim_lock); - while ((pin = journal_get_next_pin(j, min_nr - ? U64_MAX : seq_to_flush, &seq))) { - if (min_nr) - min_nr--; + while (1) { + cond_resched(); + + j->last_flushed = jiffies; + + spin_lock(&j->lock); + pin = journal_get_next_pin(j, + min_any != 0, + min_key_cache != 0, + seq_to_flush, &seq); + if (pin) { + BUG_ON(j->flush_in_progress); + j->flush_in_progress = pin; + j->flush_in_progress_dropped = false; + flush_fn = pin->flush; + } + spin_unlock(&j->lock); + + if (!pin) + break; - pin->flush(j, pin, seq); + if (min_key_cache && pin->flush == bch2_btree_key_cache_journal_flush) + min_key_cache--; - BUG_ON(j->flush_in_progress != pin); + if (min_any) + min_any--; + + err = flush_fn(j, pin, seq); + + spin_lock(&j->lock); + /* Pin might have been dropped or rearmed: */ + if (likely(!err && !j->flush_in_progress_dropped)) + list_move(&pin->list, &journal_seq_pin(j, seq)->flushed); j->flush_in_progress = NULL; + j->flush_in_progress_dropped = false; + spin_unlock(&j->lock); + wake_up(&j->pin_flush_wait); - ret = true; + + if (err) + break; + + nr_flushed++; } - return ret; + return nr_flushed; } -/** - * bch2_journal_reclaim - free up journal buckets - * - * Background journal reclaim writes out btree nodes. It should be run - * early enough so that we never completely run out of journal buckets. - * - * High watermarks for triggering background reclaim: - * - FIFO has fewer than 512 entries left - * - fewer than 25% journal buckets free - * - * Background reclaim runs until low watermarks are reached: - * - FIFO has more than 1024 entries left - * - more than 50% journal buckets free - * - * As long as a reclaim can complete in the time it takes to fill up - * 512 journal entries or 25% of all journal buckets, then - * journal_next_bucket() should not stall. - */ -void bch2_journal_reclaim(struct journal *j) +static u64 journal_seq_to_flush(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; - unsigned iter, min_nr = 0; u64 seq_to_flush = 0; - - lockdep_assert_held(&j->reclaim_lock); - - bch2_journal_do_discards(j); + unsigned iter; spin_lock(&j->lock); @@ -524,34 +573,174 @@ void bch2_journal_reclaim(struct journal *j) (j->pin.size >> 1)); spin_unlock(&j->lock); + return seq_to_flush; +} + +/** + * bch2_journal_reclaim - free up journal buckets + * + * Background journal reclaim writes out btree nodes. It should be run + * early enough so that we never completely run out of journal buckets. + * + * High watermarks for triggering background reclaim: + * - FIFO has fewer than 512 entries left + * - fewer than 25% journal buckets free + * + * Background reclaim runs until low watermarks are reached: + * - FIFO has more than 1024 entries left + * - more than 50% journal buckets free + * + * As long as a reclaim can complete in the time it takes to fill up + * 512 journal entries or 25% of all journal buckets, then + * journal_next_bucket() should not stall. 
+ */ +static int __bch2_journal_reclaim(struct journal *j, bool direct) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + bool kthread = (current->flags & PF_KTHREAD) != 0; + u64 seq_to_flush; + size_t min_nr, nr_flushed; + unsigned flags; + int ret = 0; + /* - * If it's been longer than j->reclaim_delay_ms since we last flushed, - * make sure to flush at least one journal pin: + * We can't invoke memory reclaim while holding the reclaim_lock - + * journal reclaim is required to make progress for memory reclaim + * (cleaning the caches), so we can't get stuck in memory reclaim while + * we're holding the reclaim lock: */ - if (time_after(jiffies, j->last_flushed + - msecs_to_jiffies(j->reclaim_delay_ms))) - min_nr = 1; + lockdep_assert_held(&j->reclaim_lock); + flags = memalloc_noreclaim_save(); + + do { + if (kthread && kthread_should_stop()) + break; + + if (bch2_journal_error(j)) { + ret = -EIO; + break; + } + + bch2_journal_do_discards(j); + + seq_to_flush = journal_seq_to_flush(j); + min_nr = 0; + + /* + * If it's been longer than j->reclaim_delay_ms since we last flushed, + * make sure to flush at least one journal pin: + */ + if (time_after(jiffies, j->last_flushed + + msecs_to_jiffies(j->reclaim_delay_ms))) + min_nr = 1; + + if (j->prereserved.reserved * 2 > j->prereserved.remaining) + min_nr = 1; + + if (fifo_free(&j->pin) <= 32) + min_nr = 1; + + trace_journal_reclaim_start(c, + min_nr, + j->prereserved.reserved, + j->prereserved.remaining, + atomic_read(&c->btree_cache.dirty), + c->btree_cache.used, + atomic_long_read(&c->btree_key_cache.nr_dirty), + atomic_long_read(&c->btree_key_cache.nr_keys)); + + nr_flushed = journal_flush_pins(j, seq_to_flush, + min_nr, + min(bch2_nr_btree_keys_need_flush(c), 128UL)); + + if (direct) + j->nr_direct_reclaim += nr_flushed; + else + j->nr_background_reclaim += nr_flushed; + trace_journal_reclaim_finish(c, nr_flushed); + + if (nr_flushed) + wake_up(&j->reclaim_wait); + } while (min_nr && nr_flushed && !direct); + + memalloc_noreclaim_restore(flags); - if (j->prereserved.reserved * 2 > j->prereserved.remaining) { - seq_to_flush = max(seq_to_flush, journal_last_seq(j)); - min_nr = 1; + return ret; +} + +int bch2_journal_reclaim(struct journal *j) +{ + return __bch2_journal_reclaim(j, true); +} + +static int bch2_journal_reclaim_thread(void *arg) +{ + struct journal *j = arg; + unsigned long next; + int ret = 0; + + set_freezable(); + + kthread_wait_freezable(test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)); + + while (!ret && !kthread_should_stop()) { + j->reclaim_kicked = false; + + mutex_lock(&j->reclaim_lock); + ret = __bch2_journal_reclaim(j, false); + mutex_unlock(&j->reclaim_lock); + + next = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms); + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + if (kthread_should_stop()) + break; + if (j->reclaim_kicked) + break; + if (time_after_eq(jiffies, next)) + break; + schedule_timeout(next - jiffies); + try_to_freeze(); + + } + __set_current_state(TASK_RUNNING); } - journal_flush_pins(j, seq_to_flush, min_nr); + return 0; +} - if (!bch2_journal_error(j)) - queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, - msecs_to_jiffies(j->reclaim_delay_ms)); +void bch2_journal_reclaim_stop(struct journal *j) +{ + struct task_struct *p = j->reclaim_thread; + + j->reclaim_thread = NULL; + + if (p) { + kthread_stop(p); + put_task_struct(p); + } } -void bch2_journal_reclaim_work(struct work_struct *work) +int bch2_journal_reclaim_start(struct journal *j) { - struct journal *j = 
container_of(to_delayed_work(work), - struct journal, reclaim_work); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct task_struct *p; - mutex_lock(&j->reclaim_lock); - bch2_journal_reclaim(j); - mutex_unlock(&j->reclaim_lock); + if (j->reclaim_thread) + return 0; + + p = kthread_create(bch2_journal_reclaim_thread, j, + "bch-reclaim/%s", c->name); + if (IS_ERR(p)) { + bch_err(c, "error creating journal reclaim thread: %li", PTR_ERR(p)); + return PTR_ERR(p); + } + + get_task_struct(p); + j->reclaim_thread = p; + wake_up_process(p); + return 0; } static int journal_flush_done(struct journal *j, u64 seq_to_flush, @@ -565,7 +754,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, mutex_lock(&j->reclaim_lock); - *did_work = journal_flush_pins(j, seq_to_flush, 0); + *did_work = journal_flush_pins(j, seq_to_flush, 0, 0) != 0; spin_lock(&j->lock); /* diff --git a/libbcachefs/journal_reclaim.h b/libbcachefs/journal_reclaim.h index 8128907..adf1f5c 100644 --- a/libbcachefs/journal_reclaim.h +++ b/libbcachefs/journal_reclaim.h @@ -4,11 +4,16 @@ #define JOURNAL_PIN (32 * 1024) -enum journal_space_from { - journal_space_discarded, - journal_space_clean_ondisk, - journal_space_clean, -}; +static inline void journal_reclaim_kick(struct journal *j) +{ + struct task_struct *p = READ_ONCE(j->reclaim_thread); + + if (p && !j->reclaim_kicked) { + j->reclaim_kicked = true; + if (p) + wake_up_process(p); + } +} unsigned bch2_journal_dev_buckets_available(struct journal *, struct journal_device *, @@ -28,34 +33,48 @@ journal_seq_pin(struct journal *j, u64 seq) return &j->pin.data[seq & j->pin.mask]; } +void __bch2_journal_pin_put(struct journal *, u64); void bch2_journal_pin_put(struct journal *, u64); void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); -void __bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *, - journal_pin_flush_fn); +void bch2_journal_pin_set(struct journal *, u64, struct journal_entry_pin *, + journal_pin_flush_fn); static inline void bch2_journal_pin_add(struct journal *j, u64 seq, struct journal_entry_pin *pin, journal_pin_flush_fn flush_fn) { if (unlikely(!journal_pin_active(pin) || pin->seq > seq)) - __bch2_journal_pin_add(j, seq, pin, flush_fn); + bch2_journal_pin_set(j, seq, pin, flush_fn); } -void bch2_journal_pin_update(struct journal *, u64, - struct journal_entry_pin *, - journal_pin_flush_fn); +static inline void bch2_journal_pin_copy(struct journal *j, + struct journal_entry_pin *dst, + struct journal_entry_pin *src, + journal_pin_flush_fn flush_fn) +{ + /* Guard against racing with journal_pin_drop(src): */ + u64 seq = READ_ONCE(src->seq); -void bch2_journal_pin_copy(struct journal *, - struct journal_entry_pin *, - struct journal_entry_pin *, - journal_pin_flush_fn); + if (seq) + bch2_journal_pin_add(j, seq, dst, flush_fn); +} + +static inline void bch2_journal_pin_update(struct journal *j, u64 seq, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) +{ + if (unlikely(!journal_pin_active(pin) || pin->seq < seq)) + bch2_journal_pin_set(j, seq, pin, flush_fn); +} void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *); void bch2_journal_do_discards(struct journal *); -void bch2_journal_reclaim(struct journal *); -void bch2_journal_reclaim_work(struct work_struct *); +int bch2_journal_reclaim(struct journal *); + +void bch2_journal_reclaim_stop(struct journal *); +int bch2_journal_reclaim_start(struct journal *); bool bch2_journal_flush_pins(struct journal *, u64); 
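The journal_reclaim.h hunk above swaps the old delayed-work reclaim entry points (bch2_journal_reclaim_work and the void-returning bch2_journal_reclaim) for a dedicated reclaim kthread plus a lightweight kicker. A minimal sketch of how the reworked API is driven, assuming the bcachefs kernel context shown in this patch; the example_* wrappers are hypothetical and error paths are elided:

static int example_fs_start(struct bch_fs *c)		/* hypothetical caller */
{
	struct journal *j = &c->journal;
	int ret;

	/* Spawn the background "bch-reclaim/<fs>" kthread: */
	ret = bch2_journal_reclaim_start(j);
	if (ret)
		return ret;

	/*
	 * Paths that notice the journal filling up no longer queue
	 * delayed work; they set reclaim_kicked and wake the thread:
	 */
	journal_reclaim_kick(j);
	return 0;
}

static void example_direct_reclaim(struct bch_fs *c)	/* hypothetical caller */
{
	struct journal *j = &c->journal;

	/* Direct (synchronous) reclaim still exists, but runs under reclaim_lock: */
	mutex_lock(&j->reclaim_lock);
	bch2_journal_reclaim(j);
	mutex_unlock(&j->reclaim_lock);
}

static void example_fs_stop(struct bch_fs *c)		/* hypothetical caller */
{
	/* Stop and reap the kthread before tearing down the journal: */
	bch2_journal_reclaim_stop(&c->journal);
}

Note that journal_reclaim_kick() reads j->reclaim_thread with READ_ONCE(), so it is safe to call before the thread has been started or after it has been stopped; in that case the kick is simply a no-op.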
diff --git a/libbcachefs/journal_seq_blacklist.c b/libbcachefs/journal_seq_blacklist.c index d0f1bbf..e1b63f3 100644 --- a/libbcachefs/journal_seq_blacklist.c +++ b/libbcachefs/journal_seq_blacklist.c @@ -118,7 +118,7 @@ out_write_sb: out: mutex_unlock(&c->sb_lock); - return ret; + return ret ?: bch2_blacklist_table_initialize(c); } static int journal_seq_blacklist_table_cmp(const void *_l, @@ -164,8 +164,6 @@ int bch2_blacklist_table_initialize(struct bch_fs *c) struct journal_seq_blacklist_table *t; unsigned i, nr = blacklist_nr_entries(bl); - BUG_ON(c->journal_seq_blacklist_table); - if (!bl) return 0; @@ -187,6 +185,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c) journal_seq_blacklist_table_cmp, NULL); + kfree(c->journal_seq_blacklist_table); c->journal_seq_blacklist_table = t; return 0; } diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index 154b51b..c24bc4a 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -9,16 +9,18 @@ #include "super_types.h" #include "fifo.h" -struct journal_res; +#define JOURNAL_BUF_BITS 2 +#define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS) +#define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1) /* - * We put two of these in struct journal; we used them for writes to the - * journal that are being staged or in flight. + * We put JOURNAL_BUF_NR of these in struct journal; we used them for writes to + * the journal that are being staged or in flight. */ struct journal_buf { struct jset *data; - BKEY_PADDED(key); + __BKEY_PADDED(key, BCH_REPLICAS_MAX); struct closure_waitlist wait; @@ -27,6 +29,9 @@ struct journal_buf { unsigned disk_sectors; /* maximum size entry could have been, if buf_size was bigger */ unsigned u64s_reserved; + bool noflush; /* write has already been kicked off, and was noflush */ + bool must_flush; /* something wants a flush */ + bool separate_flush; /* bloom filter: */ unsigned long has_inode[1024 / sizeof(unsigned long)]; }; @@ -38,6 +43,7 @@ struct journal_buf { struct journal_entry_pin_list { struct list_head list; + struct list_head key_cache_list; struct list_head flushed; atomic_t count; struct bch_devs_list devs; @@ -45,7 +51,7 @@ struct journal_entry_pin_list { struct journal; struct journal_entry_pin; -typedef void (*journal_pin_flush_fn)(struct journal *j, +typedef int (*journal_pin_flush_fn)(struct journal *j, struct journal_entry_pin *, u64); struct journal_entry_pin { @@ -81,10 +87,12 @@ union journal_res_state { struct { u64 cur_entry_offset:20, - idx:1, - prev_buf_unwritten:1, - buf0_count:21, - buf1_count:21; + idx:2, + unwritten_idx:2, + buf0_count:10, + buf1_count:10, + buf2_count:10, + buf3_count:10; }; }; @@ -98,8 +106,9 @@ union journal_preres_state { }; struct { - u32 reserved; - u32 remaining; + u64 waiting:1, + reserved:31, + remaining:32; }; }; @@ -116,6 +125,20 @@ union journal_preres_state { #define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1) #define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX) +struct journal_space { + /* Units of 512 bytes sectors: */ + unsigned next_entry; /* How big the next journal entry can be */ + unsigned total; +}; + +enum journal_space_from { + journal_space_discarded, + journal_space_clean_ondisk, + journal_space_clean, + journal_space_total, + journal_space_nr, +}; + /* * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP, * either because something's waiting on the write to complete or because it's @@ -127,8 +150,8 @@ enum { JOURNAL_STARTED, JOURNAL_RECLAIM_STARTED, JOURNAL_NEED_WRITE, - 
JOURNAL_NOT_EMPTY, JOURNAL_MAY_GET_UNRESERVED, + JOURNAL_MAY_SKIP_FLUSH, }; /* Embedded in struct bch_fs */ @@ -147,7 +170,14 @@ struct journal { * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if * insufficient devices: */ - int cur_entry_error; + enum { + cur_entry_ok, + cur_entry_blocked, + cur_entry_journal_full, + cur_entry_journal_pin_full, + cur_entry_journal_stuck, + cur_entry_insufficient_devices, + } cur_entry_error; union journal_preres_state prereserved; @@ -160,7 +190,7 @@ struct journal { * Two journal entries -- one is currently open for new entries, the * other is possibly being written out. */ - struct journal_buf buf[2]; + struct journal_buf buf[JOURNAL_BUF_NR]; spinlock_t lock; @@ -180,7 +210,10 @@ struct journal { /* seq, last_seq from the most recent journal entry successfully written */ u64 seq_ondisk; + u64 flushed_seq_ondisk; u64 last_seq_ondisk; + u64 err_seq; + u64 last_empty_seq; /* * FIFO of journal entries whose btree updates have not yet been @@ -203,16 +236,24 @@ struct journal { struct journal_entry_pin_list *data; } pin; + struct journal_space space[journal_space_nr]; + u64 replay_journal_seq; u64 replay_journal_seq_end; struct write_point wp; spinlock_t err_lock; - struct delayed_work reclaim_work; struct mutex reclaim_lock; + wait_queue_head_t reclaim_wait; + struct task_struct *reclaim_thread; + bool reclaim_kicked; + u64 nr_direct_reclaim; + u64 nr_background_reclaim; + unsigned long last_flushed; struct journal_entry_pin *flush_in_progress; + bool flush_in_progress_dropped; wait_queue_head_t pin_flush_wait; /* protects advancing ja->discard_idx: */ @@ -221,11 +262,15 @@ struct journal { unsigned write_delay_ms; unsigned reclaim_delay_ms; + unsigned long last_flush_write; u64 res_get_blocked_start; u64 need_write_time; u64 write_start_time; + u64 nr_flush_writes; + u64 nr_noflush_writes; + struct time_stats *write_time; struct time_stats *delay_time; struct time_stats *blocked_time; diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c index 96c8690..ef69a19 100644 --- a/libbcachefs/migrate.c +++ b/libbcachefs/migrate.c @@ -4,7 +4,7 @@ */ #include "bcachefs.h" -#include "bkey_on_stack.h" +#include "bkey_buf.h" #include "btree_update.h" #include "btree_update_interior.h" #include "buckets.h" @@ -41,10 +41,10 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - struct bkey_on_stack sk; + struct bkey_buf sk; int ret = 0; - bkey_on_stack_init(&sk); + bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, @@ -53,11 +53,11 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k))) { if (!bch2_bkey_has_device(k, dev_idx)) { - bch2_btree_iter_next(iter); + bch2_btree_iter_advance(iter); continue; } - bkey_on_stack_reassemble(&sk, c, k); + bch2_bkey_buf_reassemble(&sk, c, k); ret = drop_dev_ptrs(c, bkey_i_to_s(sk.k), dev_idx, flags, false); @@ -88,9 +88,10 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags if (ret) break; } + bch2_trans_iter_put(&trans, iter); ret = bch2_trans_exit(&trans) ?: ret; - bkey_on_stack_exit(&sk, c); + bch2_bkey_buf_exit(&sk, c); BUG_ON(ret == -EINTR); @@ -99,8 +100,8 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) 
{ - return __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_EXTENTS) ?: - __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_REFLINK); + return __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_extents) ?: + __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_reflink); } static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) @@ -109,6 +110,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) struct btree_iter *iter; struct closure cl; struct btree *b; + struct bkey_buf k; unsigned id; int ret; @@ -116,38 +118,42 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) if (flags & BCH_FORCE_IF_METADATA_LOST) return -EINVAL; + bch2_bkey_buf_init(&k); bch2_trans_init(&trans, c, 0, 0); closure_init_stack(&cl); for (id = 0; id < BTREE_ID_NR; id++) { for_each_btree_node(&trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH, b) { - __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; retry: if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key), dev_idx)) continue; - bkey_copy(&tmp.k, &b->key); + bch2_bkey_buf_copy(&k, c, &b->key); - ret = drop_dev_ptrs(c, bkey_i_to_s(&tmp.k), + ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), dev_idx, flags, true); if (ret) { bch_err(c, "Cannot drop device without losing data"); - goto err; + break; } - ret = bch2_btree_node_update_key(c, iter, b, &tmp.k); + ret = bch2_btree_node_update_key(c, iter, b, k.k); if (ret == -EINTR) { b = bch2_btree_iter_peek_node(iter); + ret = 0; goto retry; } if (ret) { bch_err(c, "Error updating btree node key: %i", ret); - goto err; + break; } } bch2_trans_iter_free(&trans, iter); + + if (ret) + goto err; } /* flush relevant btree updates */ @@ -157,6 +163,7 @@ retry: ret = 0; err: ret = bch2_trans_exit(&trans) ?: ret; + bch2_bkey_buf_exit(&k, c); BUG_ON(ret == -EINTR); diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 6633d21..5b10849 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -2,7 +2,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" -#include "bkey_on_stack.h" +#include "bkey_buf.h" #include "btree_gc.h" #include "btree_update.h" #include "btree_update_interior.h" @@ -61,8 +61,13 @@ static int bch2_migrate_index_update(struct bch_write_op *op) struct migrate_write *m = container_of(op, struct migrate_write, op); struct keylist *keys = &op->insert_keys; + struct bkey_buf _new, _insert; int ret = 0; + bch2_bkey_buf_init(&_new); + bch2_bkey_buf_init(&_insert); + bch2_bkey_buf_realloc(&_insert, c, U8_MAX); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); iter = bch2_trans_get_iter(&trans, m->btree_id, @@ -73,21 +78,18 @@ static int bch2_migrate_index_update(struct bch_write_op *op) struct bkey_s_c k; struct bkey_i *insert; struct bkey_i_extent *new; - BKEY_PADDED(k) _new, _insert; const union bch_extent_entry *entry; struct extent_ptr_decoded p; bool did_work = false; - int nr; + bool extending = false, should_check_enospc; + s64 i_sectors_delta = 0, disk_sectors_delta = 0; bch2_trans_reset(&trans, 0); k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); - if (ret) { - if (ret == -EINTR) - continue; - break; - } + if (ret) + goto err; new = bkey_i_to_extent(bch2_keylist_front(keys)); @@ -95,11 +97,11 @@ static int bch2_migrate_index_update(struct bch_write_op *op) !bch2_bkey_matches_ptr(c, k, m->ptr, m->offset)) goto nomatch; - bkey_reassemble(&_insert.k, k); - insert = &_insert.k; + bkey_reassemble(_insert.k, k); + insert = _insert.k; - bkey_copy(&_new.k, bch2_keylist_front(keys)); - new = bkey_i_to_extent(&_new.k); + 
bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys)); + new = bkey_i_to_extent(_new.k); bch2_cut_front(iter->pos, &new->k_i); bch2_cut_front(iter->pos, insert); @@ -144,23 +146,21 @@ static int bch2_migrate_index_update(struct bch_write_op *op) op->opts.background_target, op->opts.data_replicas); - /* - * If we're not fully overwriting @k, and it's compressed, we - * need a reservation for all the pointers in @insert - */ - nr = bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(insert)) - - m->nr_ptrs_reserved; + ret = bch2_sum_sector_overwrites(&trans, iter, insert, + &extending, + &should_check_enospc, + &i_sectors_delta, + &disk_sectors_delta); + if (ret) + goto err; - if (insert->k.size < k.k->size && - bch2_bkey_sectors_compressed(k) && - nr > 0) { + if (disk_sectors_delta > (s64) op->res.sectors) { ret = bch2_disk_reservation_add(c, &op->res, - keylist_sectors(keys) * nr, 0); + disk_sectors_delta - op->res.sectors, + !should_check_enospc + ? BCH_DISK_RESERVATION_NOFAIL : 0); if (ret) goto out; - - m->nr_ptrs_reserved += nr; - goto next; } bch2_trans_update(&trans, iter, insert, 0); @@ -168,8 +168,8 @@ static int bch2_migrate_index_update(struct bch_write_op *op) ret = bch2_trans_commit(&trans, &op->res, op_journal_seq(op), BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| m->data_opts.btree_insert_flags); +err: if (!ret) atomic_long_inc(&c->extent_migrate_done); if (ret == -EINTR) @@ -196,7 +196,10 @@ nomatch: goto next; } out: + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&_insert, c); + bch2_bkey_buf_exit(&_new, c); BUG_ON(ret == -EINTR); return ret; } @@ -207,9 +210,9 @@ void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio) BUG_ON(!m->op.wbio.bio.bi_vcnt); m->ptr = rbio->pick.ptr; - m->offset = rbio->pos.offset - rbio->pick.crc.offset; + m->offset = rbio->data_pos.offset - rbio->pick.crc.offset; m->op.devs_have = rbio->devs_have; - m->op.pos = rbio->pos; + m->op.pos = rbio->data_pos; m->op.version = rbio->version; m->op.crc = rbio->pick.crc; m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; @@ -491,7 +494,9 @@ static int bch2_move_extent(struct btree_trans *trans, * ctxt when doing wakeup */ closure_get(&ctxt->cl); - bch2_read_extent(trans, &io->rbio, k, 0, + bch2_read_extent(trans, &io->rbio, + bkey_start_pos(k.k), + btree_id, k, 0, BCH_READ_NODECODE| BCH_READ_LAST_FRAGMENT); return 0; @@ -504,6 +509,32 @@ err: return ret; } +static int lookup_inode(struct btree_trans *trans, struct bpos pos, + struct bch_inode_unpacked *inode) +{ + struct btree_iter *iter; + struct bkey_s_c k; + int ret; + + iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, pos, + BTREE_ITER_ALL_SNAPSHOTS); + k = bch2_btree_iter_peek(iter); + ret = bkey_err(k); + if (ret) + goto err; + + ret = k.k->type == KEY_TYPE_inode ? 
0 : -EIO; + if (ret) + goto err; + + ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); + if (ret) + goto err; +err: + bch2_trans_iter_put(trans, iter); + return ret; +} + static int __bch2_move_data(struct bch_fs *c, struct moving_context *ctxt, struct bch_ratelimit *rate, @@ -516,7 +547,7 @@ static int __bch2_move_data(struct bch_fs *c, { bool kthread = (current->flags & PF_KTHREAD) != 0; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); - struct bkey_on_stack sk; + struct bkey_buf sk; struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; @@ -525,12 +556,12 @@ static int __bch2_move_data(struct bch_fs *c, u64 delay, cur_inum = U64_MAX; int ret = 0, ret2; - bkey_on_stack_init(&sk); + bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, 0, 0); stats->data_type = BCH_DATA_user; stats->btree_id = btree_id; - stats->pos = POS_MIN; + stats->pos = start; iter = bch2_trans_get_iter(&trans, btree_id, start, BTREE_ITER_PREFETCH); @@ -561,7 +592,7 @@ static int __bch2_move_data(struct bch_fs *c, try_to_freeze(); } } while (delay); -peek: + k = bch2_btree_iter_peek(iter); stats->pos = iter->pos; @@ -577,18 +608,22 @@ peek: if (!bkey_extent_is_direct_data(k.k)) goto next_nondata; - if (btree_id == BTREE_ID_EXTENTS && + if (btree_id == BTREE_ID_extents && cur_inum != k.k->p.inode) { struct bch_inode_unpacked inode; - /* don't hold btree locks while looking up inode: */ - bch2_trans_unlock(&trans); - io_opts = bch2_opts_to_inode_opts(c->opts); - if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode)) + + ret = lookup_inode(&trans, + SPOS(0, k.k->p.inode, k.k->p.snapshot), + &inode); + if (ret == -EINTR) + continue; + + if (!ret) bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode)); + cur_inum = k.k->p.inode; - goto peek; } switch ((data_cmd = pred(c, arg, k, &io_opts, &data_opts))) { @@ -605,13 +640,19 @@ peek: } /* unlock before doing IO: */ - bkey_on_stack_reassemble(&sk, c, k); + bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); bch2_trans_unlock(&trans); ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k, data_cmd, data_opts); if (ret2) { + if (ret2 == -EINTR) { + bch2_trans_reset(&trans, 0); + bch2_trans_cond_resched(&trans); + continue; + } + if (ret2 == -ENOMEM) { /* memory allocation failure, wait for some IO to finish */ bch2_move_ctxt_wait_for_io(ctxt); @@ -628,25 +669,28 @@ next: atomic64_add(k.k->size * bch2_bkey_nr_ptrs_allocated(k), &stats->sectors_seen); next_nondata: - bch2_btree_iter_next(iter); + bch2_btree_iter_advance(iter); bch2_trans_cond_resched(&trans); } out: + + bch2_trans_iter_put(&trans, iter); ret = bch2_trans_exit(&trans) ?: ret; - bkey_on_stack_exit(&sk, c); + bch2_bkey_buf_exit(&sk, c); return ret; } int bch2_move_data(struct bch_fs *c, + enum btree_id start_btree_id, struct bpos start_pos, + enum btree_id end_btree_id, struct bpos end_pos, struct bch_ratelimit *rate, struct write_point_specifier wp, - struct bpos start, - struct bpos end, move_pred_fn pred, void *arg, struct bch_move_stats *stats) { struct moving_context ctxt = { .stats = stats }; + enum btree_id id; int ret; closure_init_stack(&ctxt.cl); @@ -655,10 +699,23 @@ int bch2_move_data(struct bch_fs *c, stats->data_type = BCH_DATA_user; - ret = __bch2_move_data(c, &ctxt, rate, wp, start, end, - pred, arg, stats, BTREE_ID_EXTENTS) ?: - __bch2_move_data(c, &ctxt, rate, wp, start, end, - pred, arg, stats, BTREE_ID_REFLINK); + for (id = start_btree_id; + id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1); + id++) { + stats->btree_id = id; + + if (id != 
BTREE_ID_extents && + id != BTREE_ID_reflink) + continue; + + ret = __bch2_move_data(c, &ctxt, rate, wp, + id == start_btree_id ? start_pos : POS_MIN, + id == end_btree_id ? end_pos : POS_MAX, + pred, arg, stats, id); + if (ret) + break; + } + move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads)); closure_sync(&ctxt.cl); @@ -672,16 +729,22 @@ int bch2_move_data(struct bch_fs *c, return ret; } +typedef enum data_cmd (*move_btree_pred)(struct bch_fs *, void *, + struct btree *, struct bch_io_opts *, + struct data_opts *); + static int bch2_move_btree(struct bch_fs *c, - move_pred_fn pred, - void *arg, + enum btree_id start_btree_id, struct bpos start_pos, + enum btree_id end_btree_id, struct bpos end_pos, + move_btree_pred pred, void *arg, struct bch_move_stats *stats) { + bool kthread = (current->flags & PF_KTHREAD) != 0; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); struct btree_trans trans; struct btree_iter *iter; struct btree *b; - unsigned id; + enum btree_id id; struct data_opts data_opts; enum data_cmd cmd; int ret = 0; @@ -690,16 +753,24 @@ static int bch2_move_btree(struct bch_fs *c, stats->data_type = BCH_DATA_btree; - for (id = 0; id < BTREE_ID_NR; id++) { + for (id = start_btree_id; + id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1); + id++) { stats->btree_id = id; - for_each_btree_node(&trans, iter, id, POS_MIN, + for_each_btree_node(&trans, iter, id, + id == start_btree_id ? start_pos : POS_MIN, BTREE_ITER_PREFETCH, b) { + if (kthread && kthread_should_stop()) + goto out; + + if ((cmp_int(id, end_btree_id) ?: + bkey_cmp(b->key.k.p, end_pos)) > 0) + break; + stats->pos = iter->pos; - switch ((cmd = pred(c, arg, - bkey_i_to_s_c(&b->key), - &io_opts, &data_opts))) { + switch ((cmd = pred(c, arg, b, &io_opts, &data_opts))) { case DATA_SKIP: goto next; case DATA_SCRUB: @@ -719,9 +790,12 @@ next: ret = bch2_trans_iter_free(&trans, iter) ?: ret; } - +out: bch2_trans_exit(&trans); + if (ret) + bch_err(c, "error %i in bch2_move_btree", ret); + return ret; } @@ -778,6 +852,83 @@ static enum data_cmd migrate_pred(struct bch_fs *c, void *arg, return DATA_REWRITE; } +static enum data_cmd rereplicate_btree_pred(struct bch_fs *c, void *arg, + struct btree *b, + struct bch_io_opts *io_opts, + struct data_opts *data_opts) +{ + return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); +} + +static enum data_cmd migrate_btree_pred(struct bch_fs *c, void *arg, + struct btree *b, + struct bch_io_opts *io_opts, + struct data_opts *data_opts) +{ + return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); +} + +static bool bformat_needs_redo(struct bkey_format *f) +{ + unsigned i; + + for (i = 0; i < f->nr_fields; i++) { + unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; + u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1)); + u64 field_offset = le64_to_cpu(f->field_offset[i]); + + if (f->bits_per_field[i] > unpacked_bits) + return true; + + if ((f->bits_per_field[i] == unpacked_bits) && field_offset) + return true; + + if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) & + unpacked_mask) < + field_offset) + return true; + } + + return false; +} + +static enum data_cmd rewrite_old_nodes_pred(struct bch_fs *c, void *arg, + struct btree *b, + struct bch_io_opts *io_opts, + struct data_opts *data_opts) +{ + if (b->version_ondisk != c->sb.version || + btree_node_need_rewrite(b) || + bformat_needs_redo(&b->format)) { + data_opts->target = 0; + data_opts->nr_replicas = 1; + data_opts->btree_insert_flags = 0; + return 
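
A standalone restatement of the bformat_needs_redo() test introduced above, reduced to a single field. The helper name and the parameter split are illustrative only, not part of the patch; it assumes 1 <= unpacked_bits <= 64 and packed_bits <= 64. A field needs rewriting when the packed encoding (field_offset plus a packed_bits-wide value) can produce values that do not fit in the unpacked, in-memory width.

	#include <stdbool.h>
	#include <stdint.h>

	static bool packed_field_can_overflow(unsigned packed_bits,
					      unsigned unpacked_bits,
					      uint64_t field_offset)
	{
		/* mask of unpacked_bits ones; built this way so unpacked_bits == 64 works */
		uint64_t unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));

		/* packed encoding is wider than the in-memory field */
		if (packed_bits > unpacked_bits)
			return true;

		/* with no offset, decoded values cannot exceed the packed width */
		if (!field_offset)
			return false;

		/* same width plus a nonzero offset can always wrap */
		if (packed_bits == unpacked_bits)
			return true;

		/* largest decodable value wraps past the unpacked mask */
		return ((field_offset + ((1ULL << packed_bits) - 1)) & unpacked_mask) <
			field_offset;
	}

For example, with packed_bits = 32, unpacked_bits = 32 and a nonzero field_offset, decoded values can exceed 2^32 - 1, so the node must be rewritten; rewrite_old_nodes_pred() above turns that into a DATA_REWRITE decision.
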
DATA_REWRITE; + } + + return DATA_SKIP; +} + +int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) +{ + int ret; + + ret = bch2_move_btree(c, + 0, POS_MIN, + BTREE_ID_NR, POS_MAX, + rewrite_old_nodes_pred, c, stats); + if (!ret) { + mutex_lock(&c->sb_lock); + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_extents_above_btree_updates_done; + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_bformat_overflow_done; + c->disk_sb.sb->version_min = c->disk_sb.sb->version; + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } + + return ret; +} + int bch2_data_job(struct bch_fs *c, struct bch_move_stats *stats, struct bch_ioctl_data op) @@ -789,17 +940,20 @@ int bch2_data_job(struct bch_fs *c, stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, -1); - ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret; + ret = bch2_move_btree(c, + op.start_btree, op.start_pos, + op.end_btree, op.end_pos, + rereplicate_btree_pred, c, stats) ?: ret; closure_wait_event(&c->btree_interior_update_wait, !bch2_btree_interior_updates_nr_pending(c)); ret = bch2_replicas_gc2(c) ?: ret; - ret = bch2_move_data(c, NULL, - writepoint_hashed((unsigned long) current), - op.start, - op.end, + ret = bch2_move_data(c, + op.start_btree, op.start_pos, + op.end_btree, op.end_pos, + NULL, writepoint_hashed((unsigned long) current), rereplicate_pred, c, stats) ?: ret; ret = bch2_replicas_gc2(c) ?: ret; break; @@ -810,16 +964,22 @@ int bch2_data_job(struct bch_fs *c, stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); - ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret; + ret = bch2_move_btree(c, + op.start_btree, op.start_pos, + op.end_btree, op.end_pos, + migrate_btree_pred, &op, stats) ?: ret; ret = bch2_replicas_gc2(c) ?: ret; - ret = bch2_move_data(c, NULL, - writepoint_hashed((unsigned long) current), - op.start, - op.end, + ret = bch2_move_data(c, + op.start_btree, op.start_pos, + op.end_btree, op.end_pos, + NULL, writepoint_hashed((unsigned long) current), migrate_pred, &op, stats) ?: ret; ret = bch2_replicas_gc2(c) ?: ret; break; + case BCH_DATA_OP_REWRITE_OLD_NODES: + ret = bch2_scan_old_btree_nodes(c, stats); + break; default: ret = -EINVAL; } diff --git a/libbcachefs/move.h b/libbcachefs/move.h index b04bc66..5076153 100644 --- a/libbcachefs/move.h +++ b/libbcachefs/move.h @@ -52,9 +52,13 @@ typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c, struct bch_io_opts *, struct data_opts *); -int bch2_move_data(struct bch_fs *, struct bch_ratelimit *, +int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *); + +int bch2_move_data(struct bch_fs *, + enum btree_id, struct bpos, + enum btree_id, struct bpos, + struct bch_ratelimit *, struct write_point_specifier, - struct bpos, struct bpos, move_pred_fn, void *, struct bch_move_stats *); diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index ddfda1e..03668e4 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -61,7 +61,7 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, copygc_heap *h = &c->copygc_heap; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; - struct extent_ptr_decoded p; + struct extent_ptr_decoded p = { 0 }; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); @@ -92,11 +92,8 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, data_opts->btree_insert_flags = 
BTREE_INSERT_USE_RESERVE; data_opts->rewrite_dev = p.ptr.dev; - if (p.has_ec) { - struct stripe *m = genradix_ptr(&c->stripes[0], p.ec.idx); - - data_opts->nr_replicas += m->nr_redundant; - } + if (p.has_ec) + data_opts->nr_replicas += p.ec.redundancy; return DATA_REWRITE; } @@ -179,12 +176,12 @@ static int bch2_copygc(struct bch_fs *c) bucket_sectors_used(m) >= ca->mi.bucket_size) continue; - WARN_ON(m.stripe && !g->ec_redundancy); + WARN_ON(m.stripe && !g->stripe_redundancy); e = (struct copygc_heap_entry) { .dev = dev_idx, .gen = m.gen, - .replicas = 1 + g->ec_redundancy, + .replicas = 1 + g->stripe_redundancy, .fragmentation = bucket_sectors_used(m) * (1U << 15) / ca->mi.bucket_size, .sectors = bucket_sectors_used(m), @@ -200,6 +197,11 @@ static int bch2_copygc(struct bch_fs *c) return -1; } + /* + * Our btree node allocations also come out of RESERVE_MOVINGGC: + */ + sectors_to_move = (sectors_to_move * 3) / 4; + for (i = h->data; i < h->data + h->used; i++) sectors_to_move += i->sectors * i->replicas; @@ -217,9 +219,11 @@ static int bch2_copygc(struct bch_fs *c) sizeof(h->data[0]), bucket_offset_cmp, NULL); - ret = bch2_move_data(c, &c->copygc_pd.rate, + ret = bch2_move_data(c, + 0, POS_MIN, + BTREE_ID_NR, POS_MAX, + &c->copygc_pd.rate, writepoint_ptr(&c->copygc_write_point), - POS_MIN, POS_MAX, copygc_pred, NULL, &move_stats); @@ -286,7 +290,7 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) fragmented_allowed += ((__dev_buckets_available(ca, usage) * ca->mi.bucket_size) >> 1); - fragmented += usage.sectors_fragmented; + fragmented += usage.d[BCH_DATA_user].fragmented; } return max_t(s64, 0, fragmented_allowed - fragmented); @@ -296,7 +300,7 @@ static int bch2_copygc_thread(void *arg) { struct bch_fs *c = arg; struct io_clock *clock = &c->io_clock[WRITE]; - unsigned long last, wait; + u64 last, wait; set_freezable(); @@ -304,7 +308,7 @@ static int bch2_copygc_thread(void *arg) if (kthread_wait_freezable(c->copy_gc_enabled)) break; - last = atomic_long_read(&clock->now); + last = atomic64_read(&clock->now); wait = bch2_copygc_wait_amount(c); if (wait > clock->max_slop) { @@ -345,9 +349,11 @@ int bch2_copygc_start(struct bch_fs *c) if (bch2_fs_init_fault("copygc_start")) return -ENOMEM; - t = kthread_create(bch2_copygc_thread, c, "bch_copygc"); - if (IS_ERR(t)) + t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name); + if (IS_ERR(t)) { + bch_err(c, "error creating copygc thread: %li", PTR_ERR(t)); return PTR_ERR(t); + } get_task_struct(t); diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c index 97a36ac..0cfbb56 100644 --- a/libbcachefs/opts.c +++ b/libbcachefs/opts.c @@ -9,72 +9,59 @@ #include "super-io.h" #include "util.h" +#define x(t, n) #t, + const char * const bch2_error_actions[] = { - "continue", - "remount-ro", - "panic", + BCH_ERROR_ACTIONS() NULL }; const char * const bch2_sb_features[] = { -#define x(f, n) #f, BCH_SB_FEATURES() -#undef x + NULL +}; + +const char * const bch2_sb_compat[] = { + BCH_SB_COMPAT() + NULL +}; + +const char * const bch2_btree_ids[] = { + BCH_BTREE_IDS() NULL }; const char * const bch2_csum_opts[] = { - "none", - "crc32c", - "crc64", + BCH_CSUM_OPTS() NULL }; const char * const bch2_compression_opts[] = { -#define x(t, n) #t, BCH_COMPRESSION_OPTS() -#undef x NULL }; const char * const bch2_str_hash_types[] = { - "crc32c", - "crc64", - "siphash", + BCH_STR_HASH_OPTS() NULL }; const char * const bch2_data_types[] = { -#define x(t, n) #t, BCH_DATA_TYPES() -#undef x NULL }; const char * const bch2_cache_replacement_policies[] 
= { - "lru", - "fifo", - "random", + BCH_CACHE_REPLACEMENT_POLICIES() NULL }; -/* Default is -1; we skip past it for struct cached_dev's cache mode */ -const char * const bch2_cache_modes[] = { - "default", - "writethrough", - "writeback", - "writearound", - "none", +const char * const bch2_member_states[] = { + BCH_MEMBER_STATES() NULL }; -const char * const bch2_dev_state[] = { - "readwrite", - "readonly", - "failed", - "spare", - NULL -}; +#undef x void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) { diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index 710a7ee..001e865 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -10,13 +10,14 @@ extern const char * const bch2_error_actions[]; extern const char * const bch2_sb_features[]; +extern const char * const bch2_sb_compat[]; +extern const char * const bch2_btree_ids[]; extern const char * const bch2_csum_opts[]; extern const char * const bch2_compression_opts[]; extern const char * const bch2_str_hash_types[]; extern const char * const bch2_data_types[]; extern const char * const bch2_cache_replacement_policies[]; -extern const char * const bch2_cache_modes[]; -extern const char * const bch2_dev_state[]; +extern const char * const bch2_member_states[]; /* * Mount options; we also store defaults in the superblock. @@ -89,7 +90,7 @@ enum opt_type { x(errors, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_error_actions), \ - BCH_SB_ERROR_ACTION, BCH_ON_ERROR_RO, \ + BCH_SB_ERROR_ACTION, BCH_ON_ERROR_ro, \ NULL, "Action to take on filesystem error") \ x(metadata_replicas, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ @@ -114,12 +115,12 @@ enum opt_type { x(metadata_checksum, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_csum_opts), \ - BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_CRC32C, \ + BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ NULL, NULL) \ x(data_checksum, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ OPT_STR(bch2_csum_opts), \ - BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_CRC32C, \ + BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ NULL, NULL) \ x(compression, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ @@ -134,8 +135,13 @@ enum opt_type { x(str_hash, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_str_hash_types), \ - BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_SIPHASH, \ + BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \ NULL, "Hash function for directory entries and xattrs")\ + x(metadata_target, u16, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_METADATA_TARGET, 0, \ + "(target)", "Device or disk group for metadata writes") \ x(foreground_target, u16, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ OPT_FN(bch2_opt_target), \ @@ -207,16 +213,16 @@ enum opt_type { OPT_BOOL(), \ BCH_SB_PRJQUOTA, false, \ NULL, "Enable project quotas") \ - x(reflink, u8, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_SB_REFLINK, true, \ - NULL, "Enable reflink support") \ x(degraded, u8, \ OPT_MOUNT, \ OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Allow mounting in degraded mode") \ + x(very_degraded, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Allow mounting in when data will be missing") \ x(discard, u8, \ OPT_MOUNT|OPT_DEVICE, \ OPT_BOOL(), \ diff --git a/libbcachefs/quota.c b/libbcachefs/quota.c index d3032a4..8e27251 100644 --- a/libbcachefs/quota.c +++ b/libbcachefs/quota.c @@ -363,7 +363,7 @@ static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, 
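
An aside on the opts.c/opts.h changes above: the hand-written string arrays (error actions, checksum types, str_hash types, member states, ...) are now generated from the same x-macro lists that define the values, so the names and the enums cannot drift apart. The sketch below is self-contained and uses made-up names (COLOURS, colour_names); it only demonstrates the pattern, not the bcachefs macros.

	#include <stdio.h>

	#define COLOURS()	\
		x(red)		\
		x(green)	\
		x(blue)

	enum colour {
	#define x(n)	COLOUR_##n,
		COLOURS()
	#undef x
		COLOUR_NR
	};

	static const char * const colour_names[] = {
	#define x(n)	#n,
		COLOURS()
	#undef x
		NULL
	};

	int main(void)
	{
		/* the enum value indexes directly into the generated name table */
		printf("%s\n", colour_names[COLOUR_green]);	/* prints "green" */
		return 0;
	}

The trailing NULL sentinel matches the convention the existing bch2_* string tables already use for option parsing.
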
iter, BTREE_ID_QUOTAS, POS(type, 0), + for_each_btree_key(&trans, iter, BTREE_ID_quotas, POS(type, 0), BTREE_ITER_PREFETCH, k, ret) { if (k.k->p.inode != type) break; @@ -435,7 +435,7 @@ int bch2_fs_quota_read(struct bch_fs *c) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { switch (k.k->type) { case KEY_TYPE_inode: @@ -526,7 +526,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) if (c->opts.usrquota) return -EINVAL; - ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, + ret = bch2_btree_delete_range(c, BTREE_ID_quotas, POS(QTYP_USR, 0), POS(QTYP_USR + 1, 0), NULL); @@ -538,7 +538,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) if (c->opts.grpquota) return -EINVAL; - ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, + ret = bch2_btree_delete_range(c, BTREE_ID_quotas, POS(QTYP_GRP, 0), POS(QTYP_GRP + 1, 0), NULL); @@ -550,7 +550,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) if (c->opts.prjquota) return -EINVAL; - ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, + ret = bch2_btree_delete_range(c, BTREE_ID_quotas, POS(QTYP_PRJ, 0), POS(QTYP_PRJ + 1, 0), NULL); @@ -718,7 +718,7 @@ static int bch2_set_quota_trans(struct btree_trans *trans, struct bkey_s_c k; int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_QUOTAS, new_quota->k.p, + iter = bch2_trans_get_iter(trans, BTREE_ID_quotas, new_quota->k.p, BTREE_ITER_SLOTS|BTREE_ITER_INTENT); k = bch2_btree_iter_peek_slot(iter); @@ -746,7 +746,6 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, struct qc_dqblk *qdq) { struct bch_fs *c = sb->s_fs_info; - struct btree_trans trans; struct bkey_i_quota new_quota; int ret; @@ -756,14 +755,10 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, bkey_quota_init(&new_quota.k_i); new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); - bch2_trans_init(&trans, c, 0, 0); - ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOUNLOCK, bch2_set_quota_trans(&trans, &new_quota, qdq)) ?: __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i)); - bch2_trans_exit(&trans); - return ret; } diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c index 44d2651..a0dbf41 100644 --- a/libbcachefs/rebalance.c +++ b/libbcachefs/rebalance.c @@ -169,12 +169,12 @@ static int bch2_rebalance_thread(void *arg) unsigned long start, prev_start; unsigned long prev_run_time, prev_run_cputime; unsigned long cputime, prev_cputime; - unsigned long io_start; + u64 io_start; long throttle; set_freezable(); - io_start = atomic_long_read(&clock->now); + io_start = atomic64_read(&clock->now); p = rebalance_work(c); prev_start = jiffies; prev_cputime = curr_cputime(); @@ -210,7 +210,7 @@ static int bch2_rebalance_thread(void *arg) (20 - w.dev_most_full_percent), 50); - if (atomic_long_read(&clock->now) + clock->max_slop < + if (atomic64_read(&clock->now) + clock->max_slop < r->throttled_until_iotime) { r->throttled_until_cputime = start + throttle; r->state = REBALANCE_THROTTLED; @@ -229,7 +229,7 @@ static int bch2_rebalance_thread(void *arg) max(p.dev_most_full_percent, 1U) / max(w.dev_most_full_percent, 1U)); - io_start = atomic_long_read(&clock->now); + io_start = atomic64_read(&clock->now); p = w; prev_start = start; prev_cputime = cputime; @@ -239,10 +239,11 @@ static int bch2_rebalance_thread(void *arg) rebalance_work_reset(c); bch2_move_data(c, + 0, POS_MIN, + BTREE_ID_NR, 
POS_MAX, /* ratelimiting disabled for now */ NULL, /* &r->pd.rate, */ writepoint_ptr(&c->rebalance_write_point), - POS_MIN, POS_MAX, rebalance_pred, NULL, &r->move_stats); } @@ -274,16 +275,16 @@ void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c) case REBALANCE_THROTTLED: bch2_hprint(&PBUF(h1), (r->throttled_until_iotime - - atomic_long_read(&c->io_clock[WRITE].now)) << 9); + atomic64_read(&c->io_clock[WRITE].now)) << 9); pr_buf(out, "throttled for %lu sec or %s io\n", (r->throttled_until_cputime - jiffies) / HZ, h1); break; case REBALANCE_RUNNING: - pr_buf(out, "running\n"); - pr_buf(out, "pos %llu:%llu\n", - r->move_stats.pos.inode, - r->move_stats.pos.offset); + pr_buf(out, "running\n" + "pos "); + bch2_bpos_to_text(out, r->move_stats.pos); + pr_buf(out, "\n"); break; } } @@ -311,12 +312,17 @@ int bch2_rebalance_start(struct bch_fs *c) { struct task_struct *p; + if (c->rebalance.thread) + return 0; + if (c->opts.nochanges) return 0; - p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance"); - if (IS_ERR(p)) + p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name); + if (IS_ERR(p)) { + bch_err(c, "error creating rebalance thread: %li", PTR_ERR(p)); return PTR_ERR(p); + } get_task_struct(p); rcu_assign_pointer(c->rebalance.thread, p); diff --git a/libbcachefs/rebalance_types.h b/libbcachefs/rebalance_types.h index 192c6be..2f62a64 100644 --- a/libbcachefs/rebalance_types.h +++ b/libbcachefs/rebalance_types.h @@ -17,7 +17,7 @@ struct bch_fs_rebalance { atomic64_t work_unknown_dev; enum rebalance_state state; - unsigned long throttled_until_iotime; + u64 throttled_until_iotime; unsigned long throttled_until_cputime; struct bch_move_stats move_stats; diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 32fed6b..86593e9 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey_buf.h" #include "alloc_background.h" #include "btree_gc.h" #include "btree_update.h" @@ -15,6 +16,7 @@ #include "journal_io.h" #include "journal_reclaim.h" #include "journal_seq_blacklist.h" +#include "move.h" #include "quota.h" #include "recovery.h" #include "replicas.h" @@ -31,7 +33,7 @@ static void drop_alloc_keys(struct journal_keys *keys) size_t src, dst; for (src = 0, dst = 0; src < keys->nr; src++) - if (keys->d[src].btree_id != BTREE_ID_ALLOC) + if (keys->d[src].btree_id != BTREE_ID_alloc) keys->d[dst++] = keys->d[src]; keys->nr = dst; @@ -39,78 +41,174 @@ static void drop_alloc_keys(struct journal_keys *keys) /* iterate over keys read from the journal: */ -static struct journal_key *journal_key_search(struct journal_keys *journal_keys, - enum btree_id id, unsigned level, - struct bpos pos) +static int __journal_key_cmp(enum btree_id l_btree_id, + unsigned l_level, + struct bpos l_pos, + struct journal_key *r) +{ + return (cmp_int(l_btree_id, r->btree_id) ?: + cmp_int(l_level, r->level) ?: + bpos_cmp(l_pos, r->k->k.p)); +} + +static int journal_key_cmp(struct journal_key *l, struct journal_key *r) +{ + return (cmp_int(l->btree_id, r->btree_id) ?: + cmp_int(l->level, r->level) ?: + bpos_cmp(l->k->k.p, r->k->k.p)); +} + +static size_t journal_key_search(struct journal_keys *journal_keys, + enum btree_id id, unsigned level, + struct bpos pos) { size_t l = 0, r = journal_keys->nr, m; while (l < r) { m = l + ((r - l) >> 1); - if ((cmp_int(id, journal_keys->d[m].btree_id) ?: - cmp_int(level, journal_keys->d[m].level) ?: - bkey_cmp(pos, journal_keys->d[m].k->k.p)) 
> 0) + if (__journal_key_cmp(id, level, pos, &journal_keys->d[m]) > 0) l = m + 1; else r = m; } BUG_ON(l < journal_keys->nr && - (cmp_int(id, journal_keys->d[l].btree_id) ?: - cmp_int(level, journal_keys->d[l].level) ?: - bkey_cmp(pos, journal_keys->d[l].k->k.p)) > 0); + __journal_key_cmp(id, level, pos, &journal_keys->d[l]) > 0); BUG_ON(l && - (cmp_int(id, journal_keys->d[l - 1].btree_id) ?: - cmp_int(level, journal_keys->d[l - 1].level) ?: - bkey_cmp(pos, journal_keys->d[l - 1].k->k.p)) <= 0); + __journal_key_cmp(id, level, pos, &journal_keys->d[l - 1]) <= 0); + + return l; +} + +static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsigned idx) +{ + struct bkey_i *n = iter->keys->d[idx].k; + struct btree_and_journal_iter *biter = + container_of(iter, struct btree_and_journal_iter, journal); + + if (iter->idx > idx || + (iter->idx == idx && + biter->last && + bpos_cmp(n->k.p, biter->unpacked.p) <= 0)) + iter->idx++; +} + +int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, + unsigned level, struct bkey_i *k) +{ + struct journal_key n = { + .btree_id = id, + .level = level, + .k = k, + .allocated = true + }; + struct journal_keys *keys = &c->journal_keys; + struct journal_iter *iter; + unsigned idx = journal_key_search(keys, id, level, k->k.p); + + if (idx < keys->nr && + journal_key_cmp(&n, &keys->d[idx]) == 0) { + if (keys->d[idx].allocated) + kfree(keys->d[idx].k); + keys->d[idx] = n; + return 0; + } + + if (keys->nr == keys->size) { + struct journal_keys new_keys = { + .nr = keys->nr, + .size = keys->size * 2, + .journal_seq_base = keys->journal_seq_base, + }; + + new_keys.d = kvmalloc(sizeof(new_keys.d[0]) * new_keys.size, GFP_KERNEL); + if (!new_keys.d) { + bch_err(c, "%s: error allocating new key array (size %zu)", + __func__, new_keys.size); + return -ENOMEM; + } + + memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr); + kvfree(keys->d); + *keys = new_keys; + } + + array_insert_item(keys->d, keys->nr, idx, n); - return l < journal_keys->nr ? journal_keys->d + l : NULL; + list_for_each_entry(iter, &c->journal_iters, list) + journal_iter_fix(c, iter, idx); + + return 0; +} + +int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, + unsigned level, struct bpos pos) +{ + struct bkey_i *whiteout = + kmalloc(sizeof(struct bkey), GFP_KERNEL); + int ret; + + if (!whiteout) { + bch_err(c, "%s: error allocating new key", __func__); + return -ENOMEM; + } + + bkey_init(&whiteout->k); + whiteout->k.p = pos; + + ret = bch2_journal_key_insert(c, id, level, whiteout); + if (ret) + kfree(whiteout); + return ret; } static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter) { - if (iter->k && - iter->k < iter->keys->d + iter->keys->nr && - iter->k->btree_id == iter->btree_id && - iter->k->level == iter->level) - return iter->k->k; + struct journal_key *k = iter->idx - iter->keys->nr + ? 
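
An aside on bch2_journal_key_insert() above: the journal keys now live in a sorted array, journal_key_search() is a lower-bound binary search returning an index, and inserts grow the array geometrically and shift the tail up. The sketch below mirrors that shape on a plain int array; all names and the int payload are illustrative, and it omits two things the patch does do: replacing an existing key at the same position in place, and fixing up live iterators after the insert.

	#include <stdlib.h>
	#include <string.h>

	struct int_set {
		int	*d;
		size_t	nr;
		size_t	size;
	};

	/* first index whose element is >= v (lower bound) */
	static size_t int_set_search(struct int_set *s, int v)
	{
		size_t l = 0, r = s->nr;

		while (l < r) {
			size_t m = l + ((r - l) >> 1);

			if (s->d[m] < v)
				l = m + 1;
			else
				r = m;
		}
		return l;
	}

	static int int_set_insert(struct int_set *s, int v)
	{
		size_t idx = int_set_search(s, v);

		if (idx < s->nr && s->d[idx] == v)
			return 0;	/* already present; the patch would replace it here */

		if (s->nr == s->size) {
			size_t new_size = s->size ? s->size * 2 : 8;
			int *d = realloc(s->d, new_size * sizeof(*d));

			if (!d)
				return -1;
			s->d	= d;
			s->size	= new_size;
		}

		memmove(&s->d[idx + 1], &s->d[idx], (s->nr - idx) * sizeof(*s->d));
		s->d[idx] = v;
		s->nr++;
		return 0;
	}
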
iter->keys->d + iter->idx : NULL; + + if (k && + k->btree_id == iter->btree_id && + k->level == iter->level) + return k->k; - iter->k = NULL; + iter->idx = iter->keys->nr; return NULL; } static void bch2_journal_iter_advance(struct journal_iter *iter) { - if (iter->k) - iter->k++; + if (iter->idx < iter->keys->nr) + iter->idx++; } -static void bch2_journal_iter_init(struct journal_iter *iter, - struct journal_keys *journal_keys, +static void bch2_journal_iter_exit(struct journal_iter *iter) +{ + list_del(&iter->list); +} + +static void bch2_journal_iter_init(struct bch_fs *c, + struct journal_iter *iter, enum btree_id id, unsigned level, struct bpos pos) { iter->btree_id = id; iter->level = level; - iter->keys = journal_keys; - iter->k = journal_key_search(journal_keys, id, level, pos); + iter->keys = &c->journal_keys; + iter->idx = journal_key_search(&c->journal_keys, id, level, pos); + list_add(&iter->list, &c->journal_iters); } static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) { - return iter->btree - ? bch2_btree_iter_peek(iter->btree) - : bch2_btree_node_iter_peek_unpack(&iter->node_iter, - iter->b, &iter->unpacked); + return bch2_btree_node_iter_peek_unpack(&iter->node_iter, + iter->b, &iter->unpacked); } static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) { - if (iter->btree) - bch2_btree_iter_next(iter->btree); - else - bch2_btree_node_iter_advance(&iter->node_iter, iter->b); + bch2_btree_node_iter_advance(&iter->node_iter, iter->b); } void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) @@ -140,7 +238,7 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter * bkey_i_to_s_c(bch2_journal_iter_peek(&iter->journal)); if (btree_k.k && journal_k.k) { - int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p); + int cmp = bpos_cmp(btree_k.k->p, journal_k.k->p); if (!cmp) bch2_journal_iter_advance_btree(iter); @@ -158,8 +256,8 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter * ret = iter->last == journal ? 
journal_k : btree_k; if (iter->b && - bkey_cmp(ret.k->p, iter->b->data->max_key) > 0) { - iter->journal.k = NULL; + bpos_cmp(ret.k->p, iter->b->data->max_key) > 0) { + iter->journal.idx = iter->journal.keys->nr; iter->last = none; return bkey_s_c_null; } @@ -180,31 +278,50 @@ struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter * return bch2_btree_and_journal_iter_peek(iter); } -void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *iter, - struct btree_trans *trans, - struct journal_keys *journal_keys, - enum btree_id id, struct bpos pos) +void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) { - memset(iter, 0, sizeof(*iter)); - - iter->btree = bch2_trans_get_iter(trans, id, pos, 0); - bch2_journal_iter_init(&iter->journal, journal_keys, id, 0, pos); + bch2_journal_iter_exit(&iter->journal); } void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, - struct journal_keys *journal_keys, + struct bch_fs *c, struct btree *b) { memset(iter, 0, sizeof(*iter)); iter->b = b; bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b); - bch2_journal_iter_init(&iter->journal, journal_keys, + bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, b->data->min_key); } /* Walk btree, overlaying keys from the journal: */ +static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b, + struct btree_and_journal_iter iter) +{ + unsigned i = 0, nr = b->c.level > 1 ? 2 : 16; + struct bkey_s_c k; + struct bkey_buf tmp; + + BUG_ON(!b->c.level); + + bch2_bkey_buf_init(&tmp); + + while (i < nr && + (k = bch2_btree_and_journal_iter_peek(&iter)).k) { + bch2_bkey_buf_reassemble(&tmp, c, k); + + bch2_btree_node_prefetch(c, NULL, tmp.k, + b->c.btree_id, b->c.level - 1); + + bch2_btree_and_journal_iter_advance(&iter); + i++; + } + + bch2_bkey_buf_exit(&tmp, c); +} + static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b, struct journal_keys *journal_keys, enum btree_id btree_id, @@ -213,9 +330,12 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b { struct btree_and_journal_iter iter; struct bkey_s_c k; + struct bkey_buf tmp; + struct btree *child; int ret = 0; - bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); + bch2_bkey_buf_init(&tmp); + bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { ret = key_fn(c, btree_id, b->c.level, k); @@ -223,34 +343,34 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b break; if (b->c.level) { - struct btree *child; - BKEY_PADDED(k) tmp; - - bkey_reassemble(&tmp.k, k); - k = bkey_i_to_s_c(&tmp.k); + bch2_bkey_buf_reassemble(&tmp, c, k); bch2_btree_and_journal_iter_advance(&iter); - if (b->c.level > 0) { - child = bch2_btree_node_get_noiter(c, &tmp.k, - b->c.btree_id, b->c.level - 1); - ret = PTR_ERR_OR_ZERO(child); - if (ret) - break; + child = bch2_btree_node_get_noiter(c, tmp.k, + b->c.btree_id, b->c.level - 1, + false); - ret = (node_fn ? node_fn(c, b) : 0) ?: - bch2_btree_and_journal_walk_recurse(c, child, - journal_keys, btree_id, node_fn, key_fn); - six_unlock_read(&child->c.lock); + ret = PTR_ERR_OR_ZERO(child); + if (ret) + break; - if (ret) - break; - } + btree_and_journal_iter_prefetch(c, b, iter); + + ret = (node_fn ? 
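
An aside on bch2_btree_and_journal_iter_peek() above: it merges two sorted key streams, keys read from the btree node and keys from the journal, by position, and when both streams have a key at the same position the journal version wins and the btree key is skipped. A small standalone sketch of that merge rule on sorted int arrays; every name here is illustrative and not part of the patch.

	#include <stddef.h>

	struct merge_iter {
		const int *a, *b;	/* a: "btree" keys, b: "journal" keys */
		size_t a_nr, b_nr;
		size_t a_idx, b_idx;
	};

	/* returns the next key in order, or -1 when both streams are exhausted */
	static int merge_iter_next(struct merge_iter *it)
	{
		int have_a = it->a_idx < it->a_nr;
		int have_b = it->b_idx < it->b_nr;

		if (!have_a && !have_b)
			return -1;

		if (have_a && have_b && it->a[it->a_idx] == it->b[it->b_idx])
			it->a_idx++;	/* same position: journal version wins */

		if (it->a_idx < it->a_nr &&
		    (it->b_idx >= it->b_nr || it->a[it->a_idx] < it->b[it->b_idx]))
			return it->a[it->a_idx++];

		return it->b[it->b_idx++];
	}

This is also why bch2_journal_key_insert() above has to fix up open iterators: inserting into the journal side while such a merge is in flight would otherwise skip or repeat keys.
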
node_fn(c, b) : 0) ?: + bch2_btree_and_journal_walk_recurse(c, child, + journal_keys, btree_id, node_fn, key_fn); + six_unlock_read(&child->c.lock); + + if (ret) + break; } else { bch2_btree_and_journal_iter_advance(&iter); } } + bch2_btree_and_journal_iter_exit(&iter); + bch2_bkey_buf_exit(&tmp, c); return ret; } @@ -299,13 +419,19 @@ static int journal_sort_key_cmp(const void *_l, const void *_r) return cmp_int(l->btree_id, r->btree_id) ?: cmp_int(l->level, r->level) ?: - bkey_cmp(l->k->k.p, r->k->k.p) ?: + bpos_cmp(l->k->k.p, r->k->k.p) ?: cmp_int(l->journal_seq, r->journal_seq) ?: cmp_int(l->journal_offset, r->journal_offset); } void bch2_journal_keys_free(struct journal_keys *keys) { + struct journal_key *i; + + for (i = keys->d; i < keys->d + keys->nr; i++) + if (i->allocated) + kfree(i->k); + kvfree(keys->d); keys->d = NULL; keys->nr = 0; @@ -313,7 +439,7 @@ void bch2_journal_keys_free(struct journal_keys *keys) static struct journal_keys journal_keys_sort(struct list_head *journal_entries) { - struct journal_replay *p; + struct journal_replay *i; struct jset_entry *entry; struct bkey_i *k, *_n; struct journal_keys keys = { NULL }; @@ -323,35 +449,37 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) if (list_empty(journal_entries)) return keys; - keys.journal_seq_base = - le64_to_cpu(list_last_entry(journal_entries, - struct journal_replay, list)->j.last_seq); - - list_for_each_entry(p, journal_entries, list) { - if (le64_to_cpu(p->j.seq) < keys.journal_seq_base) + list_for_each_entry(i, journal_entries, list) { + if (i->ignore) continue; - for_each_jset_key(k, _n, entry, &p->j) + if (!keys.journal_seq_base) + keys.journal_seq_base = le64_to_cpu(i->j.seq); + + for_each_jset_key(k, _n, entry, &i->j) nr_keys++; } + keys.size = roundup_pow_of_two(nr_keys); - keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL); + keys.d = kvmalloc(sizeof(keys.d[0]) * keys.size, GFP_KERNEL); if (!keys.d) goto err; - list_for_each_entry(p, journal_entries, list) { - if (le64_to_cpu(p->j.seq) < keys.journal_seq_base) + list_for_each_entry(i, journal_entries, list) { + if (i->ignore) continue; - for_each_jset_key(k, _n, entry, &p->j) + BUG_ON(le64_to_cpu(i->j.seq) - keys.journal_seq_base > U32_MAX); + + for_each_jset_key(k, _n, entry, &i->j) keys.d[keys.nr++] = (struct journal_key) { .btree_id = entry->btree_id, .level = entry->level, .k = k, - .journal_seq = le64_to_cpu(p->j.seq) - + .journal_seq = le64_to_cpu(i->j.seq) - keys.journal_seq_base, - .journal_offset = k->_data - p->j._data, + .journal_offset = k->_data - i->j._data, }; } @@ -362,7 +490,7 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) while (src + 1 < keys.d + keys.nr && src[0].btree_id == src[1].btree_id && src[0].level == src[1].level && - !bkey_cmp(src[0].k->k.p, src[1].k->k.p)) + !bpos_cmp(src[0].k->k.p, src[1].k->k.p)) src++; *dst++ = *src++; @@ -384,111 +512,6 @@ static void replay_now_at(struct journal *j, u64 seq) bch2_journal_pin_put(j, j->replay_journal_seq++); } -static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id, - struct bkey_i *k) -{ - struct btree_trans trans; - struct btree_iter *iter, *split_iter; - /* - * We might cause compressed extents to be split, so we need to pass in - * a disk_reservation: - */ - struct disk_reservation disk_res = - bch2_disk_reservation_init(c, 0); - struct bkey_i *split; - struct bpos atomic_end; - /* - * Some extents aren't equivalent - w.r.t. 
what the triggers do - * - if they're split: - */ - bool remark_if_split = bch2_bkey_sectors_compressed(bkey_i_to_s_c(k)) || - k->k.type == KEY_TYPE_reflink_p; - bool remark = false; - int ret; - - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -retry: - bch2_trans_begin(&trans); - - iter = bch2_trans_get_iter(&trans, btree_id, - bkey_start_pos(&k->k), - BTREE_ITER_INTENT); - - do { - ret = bch2_btree_iter_traverse(iter); - if (ret) - goto err; - - atomic_end = bpos_min(k->k.p, iter->l[0].b->key.k.p); - - split = bch2_trans_kmalloc(&trans, bkey_bytes(&k->k)); - ret = PTR_ERR_OR_ZERO(split); - if (ret) - goto err; - - if (!remark && - remark_if_split && - bkey_cmp(atomic_end, k->k.p) < 0) { - ret = bch2_disk_reservation_add(c, &disk_res, - k->k.size * - bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(k)), - BCH_DISK_RESERVATION_NOFAIL); - BUG_ON(ret); - - remark = true; - } - - bkey_copy(split, k); - bch2_cut_front(iter->pos, split); - bch2_cut_back(atomic_end, split); - - split_iter = bch2_trans_copy_iter(&trans, iter); - ret = PTR_ERR_OR_ZERO(split_iter); - if (ret) - goto err; - - /* - * It's important that we don't go through the - * extent_handle_overwrites() and extent_update_to_keys() path - * here: journal replay is supposed to treat extents like - * regular keys - */ - __bch2_btree_iter_set_pos(split_iter, split->k.p, false); - bch2_trans_update(&trans, split_iter, split, - BTREE_TRIGGER_NORUN); - - bch2_btree_iter_set_pos(iter, split->k.p); - - if (remark) { - ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(split), - 0, split->k.size, - BTREE_TRIGGER_INSERT); - if (ret) - goto err; - } - } while (bkey_cmp(iter->pos, k->k.p) < 0); - - if (remark) { - ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(k), - 0, -((s64) k->k.size), - BTREE_TRIGGER_OVERWRITE); - if (ret) - goto err; - } - - ret = bch2_trans_commit(&trans, &disk_res, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_JOURNAL_REPLAY); -err: - if (ret == -EINTR) - goto retry; - - bch2_disk_reservation_put(c, &disk_res); - - return bch2_trans_exit(&trans) ?: ret; -} - static int __bch2_journal_replay_key(struct btree_trans *trans, enum btree_id id, unsigned level, struct bkey_i *k) @@ -499,8 +522,6 @@ static int __bch2_journal_replay_key(struct btree_trans *trans, iter = bch2_trans_get_node_iter(trans, id, k->k.p, BTREE_MAX_DEPTH, level, BTREE_ITER_INTENT); - if (IS_ERR(iter)) - return PTR_ERR(iter); /* * iter->flags & BTREE_ITER_IS_EXTENTS triggers the update path to run @@ -508,7 +529,7 @@ static int __bch2_journal_replay_key(struct btree_trans *trans, * want that here, journal replay is supposed to treat extents like * regular keys: */ - __bch2_btree_iter_set_pos(iter, k->k.p, false); + BUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); ret = bch2_btree_iter_traverse(iter) ?: bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); @@ -516,14 +537,16 @@ static int __bch2_journal_replay_key(struct btree_trans *trans, return ret; } -static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id id, - unsigned level, struct bkey_i *k) +static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k) { - return bch2_trans_do(c, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_JOURNAL_REPLAY, - __bch2_journal_replay_key(&trans, id, level, k)); + unsigned commit_flags = BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW; + + if (!k->allocated) + commit_flags |= BTREE_INSERT_JOURNAL_REPLAY; + + return bch2_trans_do(c, NULL, NULL, commit_flags, + __bch2_journal_replay_key(&trans, k->btree_id, 
k->level, k->k)); } static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k) @@ -531,12 +554,11 @@ static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k) struct btree_iter *iter; int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, k->k.p, + iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, k->k.p, BTREE_ITER_CACHED| BTREE_ITER_CACHED_NOFILL| BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(iter) ?: - bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); + ret = bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); bch2_trans_iter_put(trans, iter); return ret; } @@ -559,7 +581,7 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) return cmp_int(r->level, l->level) ?: cmp_int(l->journal_seq, r->journal_seq) ?: cmp_int(l->btree_id, r->btree_id) ?: - bkey_cmp(l->k->k.p, r->k->k.p); + bpos_cmp(l->k->k.p, r->k->k.p); } static int bch2_journal_replay(struct bch_fs *c, @@ -584,7 +606,7 @@ static int bch2_journal_replay(struct bch_fs *c, for_each_journal_key(keys, i) { cond_resched(); - if (!i->level && i->btree_id == BTREE_ID_ALLOC) { + if (!i->level && i->btree_id == BTREE_ID_alloc) { j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; ret = bch2_alloc_replay_key(c, i->k); if (ret) @@ -600,7 +622,7 @@ static int bch2_journal_replay(struct bch_fs *c, if (i->level) { j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; - ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k); + ret = bch2_journal_replay_key(c, i); if (ret) goto err; } @@ -613,6 +635,7 @@ static int bch2_journal_replay(struct bch_fs *c, */ set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); set_bit(JOURNAL_RECLAIM_STARTED, &j->flags); + journal_reclaim_kick(j); j->replay_journal_seq = seq; @@ -622,14 +645,12 @@ static int bch2_journal_replay(struct bch_fs *c, for_each_journal_key(keys, i) { cond_resched(); - if (i->level || i->btree_id == BTREE_ID_ALLOC) + if (i->level || i->btree_id == BTREE_ID_alloc) continue; replay_now_at(j, keys.journal_seq_base + i->journal_seq); - ret = i->k->k.size - ? bch2_extent_replay_key(c, i->btree_id, i->k) - : bch2_journal_replay_key(c, i->btree_id, i->level, i->k); + ret = bch2_journal_replay_key(c, i); if (ret) goto err; } @@ -641,47 +662,8 @@ static int bch2_journal_replay(struct bch_fs *c, bch2_journal_flush_all_pins(j); return bch2_journal_error(j); err: - bch_err(c, "journal replay: error %d while replaying key", ret); - return ret; -} - -static bool journal_empty(struct list_head *journal) -{ - return list_empty(journal) || - journal_entry_empty(&list_last_entry(journal, - struct journal_replay, list)->j); -} - -static int -verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c, - struct list_head *journal) -{ - struct journal_replay *i = - list_last_entry(journal, struct journal_replay, list); - u64 start_seq = le64_to_cpu(i->j.last_seq); - u64 end_seq = le64_to_cpu(i->j.seq); - u64 seq = start_seq; - int ret = 0; - - list_for_each_entry(i, journal, list) { - if (le64_to_cpu(i->j.seq) < start_seq) - continue; - - fsck_err_on(seq != le64_to_cpu(i->j.seq), c, - "journal entries %llu-%llu missing! 
(replaying %llu-%llu)", - seq, le64_to_cpu(i->j.seq) - 1, - start_seq, end_seq); - - seq = le64_to_cpu(i->j.seq); - - fsck_err_on(bch2_journal_seq_is_blacklisted(c, seq, false), c, - "found blacklisted journal entry %llu", seq); - - do { - seq++; - } while (bch2_journal_seq_is_blacklisted(c, seq, false)); - } -fsck_err: + bch_err(c, "journal replay: error %d while replaying key at btree %s level %u", + ret, bch2_btree_ids[i->btree_id], i->level); return ret; } @@ -738,10 +720,31 @@ static int journal_replay_entry_early(struct bch_fs *c, case BCH_JSET_ENTRY_data_usage: { struct jset_entry_data_usage *u = container_of(entry, struct jset_entry_data_usage, entry); + ret = bch2_replicas_set_usage(c, &u->r, le64_to_cpu(u->v)); break; } + case BCH_JSET_ENTRY_dev_usage: { + struct jset_entry_dev_usage *u = + container_of(entry, struct jset_entry_dev_usage, entry); + struct bch_dev *ca = bch_dev_bkey_exists(c, u->dev); + unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); + unsigned nr_types = (bytes - sizeof(struct jset_entry_dev_usage)) / + sizeof(struct jset_entry_dev_usage_type); + unsigned i; + + ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec); + ca->usage_base->buckets_unavailable = le64_to_cpu(u->buckets_unavailable); + + for (i = 0; i < nr_types; i++) { + ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets); + ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors); + ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented); + } + + break; + } case BCH_JSET_ENTRY_blacklist: { struct jset_entry_blacklist *bl_entry = container_of(entry, struct jset_entry_blacklist, entry); @@ -760,6 +763,12 @@ static int journal_replay_entry_early(struct bch_fs *c, le64_to_cpu(bl_entry->end) + 1); break; } + case BCH_JSET_ENTRY_clock: { + struct jset_entry_clock *clock = + container_of(entry, struct jset_entry_clock, entry); + + atomic64_set(&c->io_clock[clock->rw].now, clock->time); + } } return ret; @@ -769,13 +778,11 @@ static int journal_replay_early(struct bch_fs *c, struct bch_sb_field_clean *clean, struct list_head *journal) { + struct journal_replay *i; struct jset_entry *entry; int ret; if (clean) { - c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock); - c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock); - for (entry = clean->start; entry != vstruct_end(&clean->field); entry = vstruct_next(entry)) { @@ -784,18 +791,16 @@ static int journal_replay_early(struct bch_fs *c, return ret; } } else { - struct journal_replay *i = - list_last_entry(journal, struct journal_replay, list); - - c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock); - c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock); + list_for_each_entry(i, journal, list) { + if (i->ignore) + continue; - list_for_each_entry(i, journal, list) vstruct_for_each(&i->j, entry) { ret = journal_replay_entry_early(c, entry); if (ret) return ret; } + } } bch2_fs_usage_initialize(c); @@ -844,9 +849,6 @@ static int verify_superblock_clean(struct bch_fs *c, struct bch_sb_field_clean *clean = *cleanp; int ret = 0; - if (!c->sb.clean || !j) - return 0; - if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", le64_to_cpu(clean->journal_seq), @@ -856,13 +858,6 @@ static int verify_superblock_clean(struct bch_fs *c, return 0; } - mustfix_fsck_err_on(j->read_clock != clean->read_clock, c, - "superblock read clock %u doesn't match journal %u after clean shutdown", - clean->read_clock, j->read_clock); - 
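
An aside on the new BCH_JSET_ENTRY_dev_usage handling above: the number of per-data-type counters is not stored explicitly, it is recovered from the entry's size (total bytes, minus the fixed header, divided by the size of one trailing element). The same computation on a made-up header/element pair; the struct and function names below are illustrative, not the on-disk bcachefs layout.

	#include <stddef.h>
	#include <stdint.h>

	struct usage_hdr {
		uint64_t	buckets_ec;
		uint64_t	buckets_unavailable;
		/* followed by nr_types trailing struct usage_type entries */
	};

	struct usage_type {
		uint64_t	buckets;
		uint64_t	sectors;
		uint64_t	fragmented;
	};

	static size_t usage_nr_types(size_t entry_bytes)
	{
		if (entry_bytes < sizeof(struct usage_hdr))
			return 0;

		return (entry_bytes - sizeof(struct usage_hdr)) /
			sizeof(struct usage_type);
	}

Deriving the count from the entry size keeps the journal format forward compatible: older entries with fewer data types simply yield a smaller nr_types.
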
mustfix_fsck_err_on(j->write_clock != clean->write_clock, c, - "superblock write clock %u doesn't match journal %u after clean shutdown", - clean->write_clock, j->write_clock); - for (i = 0; i < BTREE_ID_NR; i++) { char buf1[200], buf2[200]; struct bkey_i *k1, *k2; @@ -913,9 +908,11 @@ static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) return ERR_PTR(-ENOMEM); } - if (le16_to_cpu(c->disk_sb.sb->version) < - bcachefs_metadata_version_bkey_renumber) - bch2_sb_clean_renumber(clean, READ); + ret = bch2_sb_clean_validate(c, clean, READ); + if (ret) { + mutex_unlock(&c->sb_lock); + return ERR_PTR(ret); + } mutex_unlock(&c->sb_lock); @@ -936,29 +933,29 @@ static int read_btree_roots(struct bch_fs *c) if (!r->alive) continue; - if (i == BTREE_ID_ALLOC && + if (i == BTREE_ID_alloc && c->opts.reconstruct_alloc) { - c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); continue; } if (r->error) { - __fsck_err(c, i == BTREE_ID_ALLOC + __fsck_err(c, i == BTREE_ID_alloc ? FSCK_CAN_IGNORE : 0, "invalid btree root %s", bch2_btree_ids[i]); - if (i == BTREE_ID_ALLOC) - c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); + if (i == BTREE_ID_alloc) + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); } ret = bch2_btree_root_read(c, i, &r->key, r->level); if (ret) { - __fsck_err(c, i == BTREE_ID_ALLOC + __fsck_err(c, i == BTREE_ID_alloc ? FSCK_CAN_IGNORE : 0, "error reading btree root %s", bch2_btree_ids[i]); - if (i == BTREE_ID_ALLOC) - c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); + if (i == BTREE_ID_alloc) + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); } } @@ -973,8 +970,9 @@ int bch2_fs_recovery(struct bch_fs *c) { const char *err = "cannot allocate memory"; struct bch_sb_field_clean *clean = NULL; - u64 journal_seq; - bool write_sb = false, need_write_alloc = false; + struct jset *last_journal_entry = NULL; + u64 blacklist_seq, journal_seq; + bool write_sb = false; int ret; if (c->sb.clean) @@ -987,30 +985,70 @@ int bch2_fs_recovery(struct bch_fs *c) bch_info(c, "recovering from clean shutdown, journal seq %llu", le64_to_cpu(clean->journal_seq)); + if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) { + bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported"); + ret = -EINVAL; + goto err; + } + + if (!c->sb.clean && + !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) { + bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix"); + ret = -EINVAL; + goto err; + } + + if (!(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) { + bch_err(c, "filesystem may have incompatible bkey formats; run fsck from the compat branch to fix"); + ret = -EINVAL; + goto err; + + } + + if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) { + bch_info(c, "alloc_v2 feature bit not set, fsck required"); + c->opts.fsck = true; + c->opts.fix_errors = FSCK_OPT_YES; + } + if (!c->replicas.entries || c->opts.rebuild_replicas) { bch_info(c, "building replicas info"); set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); } + ret = bch2_blacklist_table_initialize(c); + if (ret) { + bch_err(c, "error initializing blacklist table"); + goto err; + } + if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { - struct jset *j; + struct journal_replay *i; - ret = bch2_journal_read(c, &c->journal_entries); + ret = bch2_journal_read(c, &c->journal_entries, + &blacklist_seq, &journal_seq); if (ret) goto err; - if (mustfix_fsck_err_on(c->sb.clean 
&& !journal_empty(&c->journal_entries), c, + list_for_each_entry_reverse(i, &c->journal_entries, list) + if (!i->ignore) { + last_journal_entry = &i->j; + break; + } + + if (mustfix_fsck_err_on(c->sb.clean && + last_journal_entry && + !journal_entry_empty(last_journal_entry), c, "filesystem marked clean but journal not empty")) { - c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); c->sb.clean = false; } - if (!c->sb.clean && list_empty(&c->journal_entries)) { - bch_err(c, "no journal entries found"); - ret = BCH_FSCK_REPAIR_IMPOSSIBLE; - goto err; + if (!last_journal_entry) { + fsck_err_on(!c->sb.clean, c, "no journal entries found"); + goto use_clean; } c->journal_keys = journal_keys_sort(&c->journal_entries); @@ -1019,27 +1057,25 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - j = &list_last_entry(&c->journal_entries, - struct journal_replay, list)->j; - - ret = verify_superblock_clean(c, &clean, j); - if (ret) - goto err; - - journal_seq = le64_to_cpu(j->seq) + 1; + if (c->sb.clean && last_journal_entry) { + ret = verify_superblock_clean(c, &clean, + last_journal_entry); + if (ret) + goto err; + } } else { - journal_seq = le64_to_cpu(clean->journal_seq) + 1; - } +use_clean: + if (!clean) { + bch_err(c, "no superblock clean section found"); + ret = BCH_FSCK_REPAIR_IMPOSSIBLE; + goto err; - if (!c->sb.clean && - !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) { - bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix"); - ret = -EINVAL; - goto err; + } + blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1; } if (c->opts.reconstruct_alloc) { - c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); drop_alloc_keys(&c->journal_keys); } @@ -1047,30 +1083,23 @@ int bch2_fs_recovery(struct bch_fs *c) if (ret) goto err; - if (!c->sb.clean) { + /* + * After an unclean shutdown, skip the next few journal sequence + * numbers as they may have been referenced by btree writes that + * happened before their corresponding journal writes - those btree + * writes need to be ignored, by skipping and blacklisting the next few + * journal sequence numbers: + */ + if (!c->sb.clean) + journal_seq += 8; + + if (blacklist_seq != journal_seq) { ret = bch2_journal_seq_blacklist_add(c, - journal_seq, - journal_seq + 4); + blacklist_seq, journal_seq); if (ret) { bch_err(c, "error creating new journal seq blacklist entry"); goto err; } - - journal_seq += 4; - - /* - * The superblock needs to be written before we do any btree - * node writes: it will be in the read_write() path - */ - } - - ret = bch2_blacklist_table_initialize(c); - - if (!list_empty(&c->journal_entries)) { - ret = verify_journal_entries_not_blacklisted_or_missing(c, - &c->journal_entries); - if (ret) - goto err; } ret = bch2_fs_journal_start(&c->journal, journal_seq, @@ -1098,36 +1127,20 @@ int bch2_fs_recovery(struct bch_fs *c) set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); - if ((c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) && - !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA))) { - /* - * interior btree node updates aren't consistent with the - * journal; after an unclean shutdown we have to walk all - * pointers to metadata: - */ - bch_info(c, "starting metadata mark and sweep"); - err = "error in mark and sweep"; - ret = bch2_gc(c, &c->journal_keys, true, true); - if (ret < 0) - goto err; - if 
(ret) - need_write_alloc = true; - bch_verbose(c, "mark and sweep done"); - } - if (c->opts.fsck || - !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) || + !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)) || + !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_metadata)) || test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { bch_info(c, "starting mark and sweep"); err = "error in mark and sweep"; - ret = bch2_gc(c, &c->journal_keys, true, false); - if (ret < 0) - goto err; + ret = bch2_gc(c, true); if (ret) - need_write_alloc = true; + goto err; bch_verbose(c, "mark and sweep done"); } + bch2_stripes_heap_start(c); + clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); @@ -1148,7 +1161,8 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; bch_verbose(c, "journal replay done"); - if (need_write_alloc && !c->opts.nochanges) { + if (test_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags) && + !c->opts.nochanges) { /* * note that even when filesystem was clean there might be work * to do here, if we ran gc (because of fsck) which recalculated @@ -1163,8 +1177,6 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } bch_verbose(c, "alloc write done"); - - set_bit(BCH_FS_ALLOC_WRITTEN, &c->flags); } if (!c->sb.clean) { @@ -1203,18 +1215,30 @@ int bch2_fs_recovery(struct bch_fs *c) bch_verbose(c, "quotas done"); } + if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || + !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) { + struct bch_move_stats stats = { 0 }; + + bch_info(c, "scanning for old btree nodes"); + ret = bch2_fs_read_write(c); + if (ret) + goto err; + + ret = bch2_scan_old_btree_nodes(c, &stats); + if (ret) + goto err; + bch_info(c, "scanning for old btree nodes done"); + } + mutex_lock(&c->sb_lock); if (c->opts.version_upgrade) { - if (c->sb.version < bcachefs_metadata_version_new_versioning) - c->disk_sb.sb->version_min = - le16_to_cpu(bcachefs_metadata_version_min); c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; write_sb = true; } if (!test_bit(BCH_FS_ERROR, &c->flags)) { - c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_info; write_sb = true; } @@ -1265,17 +1289,17 @@ int bch2_fs_initialize(struct bch_fs *c) bch_notice(c, "initializing new filesystem"); mutex_lock(&c->sb_lock); - for_each_online_member(ca, c, i) - bch2_mark_dev_superblock(c, ca, 0); - mutex_unlock(&c->sb_lock); + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_extents_above_btree_updates_done; + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_bformat_overflow_done; - mutex_lock(&c->sb_lock); - c->disk_sb.sb->version = c->disk_sb.sb->version_min = - le16_to_cpu(bcachefs_metadata_version_current); - c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; - c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; + if (c->opts.version_upgrade) { + c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); + c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; + bch2_write_super(c); + } - bch2_write_super(c); + for_each_online_member(ca, c, i) + bch2_mark_dev_superblock(c, ca, 0); mutex_unlock(&c->sb_lock); set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); @@ -1320,10 +1344,11 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); root_inode.bi_inum = BCACHEFS_ROOT_INO; - bch2_inode_pack(&packed_inode, &root_inode); + bch2_inode_pack(c, 
&packed_inode, &root_inode); + packed_inode.inode.k.p.snapshot = U32_MAX; err = "error creating root directory"; - ret = bch2_btree_insert(c, BTREE_ID_INODES, + ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, NULL, 0); if (ret) @@ -1338,8 +1363,10 @@ int bch2_fs_initialize(struct bch_fs *c) &lostfound, 0, 0, S_IFDIR|0700, 0, NULL, NULL)); - if (ret) + if (ret) { + bch_err(c, "error creating lost+found"); goto err; + } if (enabled_qtypes(c)) { ret = bch2_fs_quota_read(c); diff --git a/libbcachefs/recovery.h b/libbcachefs/recovery.h index a66827c..fa91851 100644 --- a/libbcachefs/recovery.h +++ b/libbcachefs/recovery.h @@ -6,10 +6,11 @@ for (i = (keys).d; i < (keys).d + (keys).nr; (i)++) struct journal_iter { + struct list_head list; enum btree_id btree_id; unsigned level; + size_t idx; struct journal_keys *keys; - struct journal_key *k; }; /* @@ -17,8 +18,6 @@ struct journal_iter { */ struct btree_and_journal_iter { - struct btree_iter *btree; - struct btree *b; struct btree_node_iter node_iter; struct bkey unpacked; @@ -32,16 +31,18 @@ struct btree_and_journal_iter { } last; }; +int bch2_journal_key_insert(struct bch_fs *, enum btree_id, + unsigned, struct bkey_i *); +int bch2_journal_key_delete(struct bch_fs *, enum btree_id, + unsigned, struct bpos); + void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *); -void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *, - struct btree_trans *, - struct journal_keys *, - enum btree_id, struct bpos); +void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, - struct journal_keys *, + struct bch_fs *, struct btree *); typedef int (*btree_walk_node_fn)(struct bch_fs *c, struct btree *b); diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c index 8abcbfb..0978ad9 100644 --- a/libbcachefs/reflink.c +++ b/libbcachefs/reflink.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "bkey_on_stack.h" +#include "bkey_buf.h" #include "btree_update.h" #include "extents.h" #include "inode.h" @@ -119,7 +119,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, if (orig->k.type == KEY_TYPE_inline_data) bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data); - for_each_btree_key(trans, reflink_iter, BTREE_ID_REFLINK, + for_each_btree_key(trans, reflink_iter, BTREE_ID_reflink, POS(0, c->reflink_hint), BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) { if (reflink_iter->pos.inode) { @@ -157,8 +157,10 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, bch2_trans_update(trans, reflink_iter, r_v, 0); r_p = bch2_trans_kmalloc(trans, sizeof(*r_p)); - if (IS_ERR(r_p)) - return PTR_ERR(r_p); + if (IS_ERR(r_p)) { + ret = PTR_ERR(r_p); + goto err; + } orig->k.type = KEY_TYPE_reflink_p; r_p = bkey_i_to_reflink_p(orig); @@ -198,16 +200,12 @@ s64 bch2_remap_range(struct bch_fs *c, struct btree_trans trans; struct btree_iter *dst_iter, *src_iter; struct bkey_s_c src_k; - BKEY_PADDED(k) new_dst; - struct bkey_on_stack new_src; + struct bkey_buf new_dst, new_src; struct bpos dst_end = dst_start, src_end = src_start; struct bpos dst_want, src_want; u64 src_done, dst_done; int ret = 0, ret2 = 0; - if (!c->opts.reflink) - return -EOPNOTSUPP; - if (!percpu_ref_tryget(&c->writes)) return -EROFS; @@ 
-216,28 +214,27 @@ s64 bch2_remap_range(struct bch_fs *c, dst_end.offset += remap_sectors; src_end.offset += remap_sectors; - bkey_on_stack_init(&new_src); + bch2_bkey_buf_init(&new_dst); + bch2_bkey_buf_init(&new_src); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); - src_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start, + src_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, src_start, BTREE_ITER_INTENT); - dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, dst_start, + dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, dst_start, BTREE_ITER_INTENT); - while (1) { + while (ret == 0 || ret == -EINTR) { bch2_trans_begin(&trans); - trans.mem_top = 0; - if (fatal_signal_pending(current)) { ret = -EINTR; - goto err; + break; } src_k = get_next_src(src_iter, src_end); ret = bkey_err(src_k); if (ret) - goto btree_err; + continue; src_done = bpos_min(src_iter->pos, src_end).offset - src_start.offset; @@ -246,8 +243,6 @@ s64 bch2_remap_range(struct bch_fs *c, if (bkey_cmp(dst_iter->pos, dst_want) < 0) { ret = bch2_fpunch_at(&trans, dst_iter, dst_want, journal_seq, i_sectors_delta); - if (ret) - goto btree_err; continue; } @@ -257,7 +252,7 @@ s64 bch2_remap_range(struct bch_fs *c, break; if (src_k.k->type != KEY_TYPE_reflink_p) { - bkey_on_stack_reassemble(&new_src, c, src_k); + bch2_bkey_buf_reassemble(&new_src, c, src_k); src_k = bkey_i_to_s_c(new_src.k); bch2_cut_front(src_iter->pos, new_src.k); @@ -266,7 +261,7 @@ s64 bch2_remap_range(struct bch_fs *c, ret = bch2_make_extent_indirect(&trans, src_iter, new_src.k); if (ret) - goto btree_err; + continue; BUG_ON(src_k.k->type != KEY_TYPE_reflink_p); } @@ -275,7 +270,7 @@ s64 bch2_remap_range(struct bch_fs *c, struct bkey_s_c_reflink_p src_p = bkey_s_c_to_reflink_p(src_k); struct bkey_i_reflink_p *dst_p = - bkey_reflink_p_init(&new_dst.k); + bkey_reflink_p_init(new_dst.k); u64 offset = le64_to_cpu(src_p.v->idx) + (src_iter->pos.offset - @@ -286,29 +281,25 @@ s64 bch2_remap_range(struct bch_fs *c, BUG(); } - new_dst.k.k.p = dst_iter->pos; - bch2_key_resize(&new_dst.k.k, + new_dst.k->k.p = dst_iter->pos; + bch2_key_resize(&new_dst.k->k, min(src_k.k->p.offset - src_iter->pos.offset, dst_end.offset - dst_iter->pos.offset)); - ret = bch2_extent_update(&trans, dst_iter, &new_dst.k, + ret = bch2_extent_update(&trans, dst_iter, new_dst.k, NULL, journal_seq, new_i_size, i_sectors_delta); if (ret) - goto btree_err; + continue; dst_done = dst_iter->pos.offset - dst_start.offset; src_want = POS(src_start.inode, src_start.offset + dst_done); bch2_btree_iter_set_pos(src_iter, src_want); -btree_err: - if (ret == -EINTR) - ret = 0; - if (ret) - goto err; } + bch2_trans_iter_put(&trans, dst_iter); + bch2_trans_iter_put(&trans, src_iter); - BUG_ON(bkey_cmp(dst_iter->pos, dst_end)); -err: + BUG_ON(!ret && bkey_cmp(dst_iter->pos, dst_end)); BUG_ON(bkey_cmp(dst_iter->pos, dst_end) > 0); dst_done = dst_iter->pos.offset - dst_start.offset; @@ -330,10 +321,13 @@ err: ret2 = bch2_inode_write(&trans, inode_iter, &inode_u) ?: bch2_trans_commit(&trans, NULL, journal_seq, 0); } + + bch2_trans_iter_put(&trans, inode_iter); } while (ret2 == -EINTR); ret = bch2_trans_exit(&trans) ?: ret; - bkey_on_stack_exit(&new_src, c); + bch2_bkey_buf_exit(&new_src, c); + bch2_bkey_buf_exit(&new_dst, c); percpu_ref_put(&c->writes); diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c index 91518c0..1e29717 100644 --- a/libbcachefs/replicas.c +++ b/libbcachefs/replicas.c @@ -11,11 +11,6 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, /* 
Replicas tracking - in memory: */ -static inline int u8_cmp(u8 l, u8 r) -{ - return cmp_int(l, r); -} - static void verify_replicas_entry(struct bch_replicas_entry *e) { #ifdef CONFIG_BCACHEFS_DEBUG @@ -31,7 +26,7 @@ static void verify_replicas_entry(struct bch_replicas_entry *e) #endif } -static void replicas_entry_sort(struct bch_replicas_entry *e) +void bch2_replicas_entry_sort(struct bch_replicas_entry *e) { bubble_sort(e->devs, e->nr_devs, u8_cmp); } @@ -127,7 +122,7 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e, break; } - replicas_entry_sort(e); + bch2_replicas_entry_sort(e); } void bch2_devlist_to_replicas(struct bch_replicas_entry *e, @@ -147,7 +142,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry *e, for (i = 0; i < devs.nr; i++) e->devs[e->nr_devs++] = devs.devs[i]; - replicas_entry_sort(e); + bch2_replicas_entry_sort(e); } static struct bch_replicas_cpu @@ -164,7 +159,7 @@ cpu_replicas_add_entry(struct bch_replicas_cpu *old, BUG_ON(!new_entry->data_type); verify_replicas_entry(new_entry); - new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO); + new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL); if (!new.entries) return new; @@ -202,7 +197,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, int bch2_replicas_entry_idx(struct bch_fs *c, struct bch_replicas_entry *search) { - replicas_entry_sort(search); + bch2_replicas_entry_sort(search); return __replicas_entry_idx(&c->replicas, search); } @@ -275,53 +270,57 @@ static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p, static int replicas_table_update(struct bch_fs *c, struct bch_replicas_cpu *new_r) { - struct bch_fs_usage __percpu *new_usage[2] = { NULL, NULL }; - struct bch_fs_usage *new_scratch = NULL; + struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR]; + struct bch_fs_usage_online *new_scratch = NULL; struct bch_fs_usage __percpu *new_gc = NULL; struct bch_fs_usage *new_base = NULL; - unsigned bytes = sizeof(struct bch_fs_usage) + + unsigned i, bytes = sizeof(struct bch_fs_usage) + + sizeof(u64) * new_r->nr; + unsigned scratch_bytes = sizeof(struct bch_fs_usage_online) + sizeof(u64) * new_r->nr; - int ret = -ENOMEM; - - if (!(new_base = kzalloc(bytes, GFP_NOIO)) || - !(new_usage[0] = __alloc_percpu_gfp(bytes, sizeof(u64), - GFP_NOIO)) || - !(new_usage[1] = __alloc_percpu_gfp(bytes, sizeof(u64), - GFP_NOIO)) || - !(new_scratch = kmalloc(bytes, GFP_NOIO)) || + int ret = 0; + + memset(new_usage, 0, sizeof(new_usage)); + + for (i = 0; i < ARRAY_SIZE(new_usage); i++) + if (!(new_usage[i] = __alloc_percpu_gfp(bytes, + sizeof(u64), GFP_KERNEL))) + goto err; + + if (!(new_base = kzalloc(bytes, GFP_KERNEL)) || + !(new_scratch = kmalloc(scratch_bytes, GFP_KERNEL)) || (c->usage_gc && - !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO)))) { - bch_err(c, "error updating replicas table: memory allocation failure"); + !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL)))) goto err; - } + for (i = 0; i < ARRAY_SIZE(new_usage); i++) + if (c->usage[i]) + __replicas_table_update_pcpu(new_usage[i], new_r, + c->usage[i], &c->replicas); if (c->usage_base) __replicas_table_update(new_base, new_r, c->usage_base, &c->replicas); - if (c->usage[0]) - __replicas_table_update_pcpu(new_usage[0], new_r, - c->usage[0], &c->replicas); - if (c->usage[1]) - __replicas_table_update_pcpu(new_usage[1], new_r, - c->usage[1], &c->replicas); if (c->usage_gc) __replicas_table_update_pcpu(new_gc, new_r, c->usage_gc, &c->replicas); + for (i = 0; i < 
ARRAY_SIZE(new_usage); i++) + swap(c->usage[i], new_usage[i]); swap(c->usage_base, new_base); - swap(c->usage[0], new_usage[0]); - swap(c->usage[1], new_usage[1]); swap(c->usage_scratch, new_scratch); swap(c->usage_gc, new_gc); swap(c->replicas, *new_r); - ret = 0; -err: +out: free_percpu(new_gc); kfree(new_scratch); free_percpu(new_usage[1]); free_percpu(new_usage[0]); kfree(new_base); return ret; +err: + bch_err(c, "error updating replicas table: memory allocation failure"); + ret = -ENOMEM; + goto out; } static unsigned reserve_journal_replicas(struct bch_fs *c, @@ -465,6 +464,36 @@ static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k, return 0; } +/* replicas delta list: */ + +bool bch2_replicas_delta_list_marked(struct bch_fs *c, + struct replicas_delta_list *r) +{ + struct replicas_delta *d = r->d; + struct replicas_delta *top = (void *) r->d + r->used; + + percpu_rwsem_assert_held(&c->mark_lock); + + for (d = r->d; d != top; d = replicas_delta_next(d)) + if (bch2_replicas_entry_idx(c, &d->r) < 0) + return false; + return true; +} + +int bch2_replicas_delta_list_mark(struct bch_fs *c, + struct replicas_delta_list *r) +{ + struct replicas_delta *d = r->d; + struct replicas_delta *top = (void *) r->d + r->used; + int ret = 0; + + for (d = r->d; !ret && d != top; d = replicas_delta_next(d)) + ret = bch2_mark_replicas(c, &d->r); + return ret; +} + +/* bkey replicas: */ + bool bch2_bkey_replicas_marked(struct bch_fs *c, struct bkey_s_c k) { @@ -476,6 +505,11 @@ int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) return __bch2_mark_bkey_replicas(c, k, false); } +/* + * Old replicas_gc mechanism: only used for journal replicas entries now, should + * die at some point: + */ + int bch2_replicas_gc_end(struct bch_fs *c, int ret) { unsigned i; @@ -496,9 +530,7 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) struct bch_replicas_cpu n; if (!__replicas_has_entry(&c->replicas_gc, e) && - (c->usage_base->replicas[i] || - percpu_u64_get(&c->usage[0]->replicas[i]) || - percpu_u64_get(&c->usage[1]->replicas[i]))) { + bch2_fs_usage_read_one(c, &c->usage_base->replicas[i])) { n = cpu_replicas_add_entry(&c->replicas_gc, e); if (!n.entries) { ret = -ENOSPC; @@ -553,7 +585,7 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) c->replicas_gc.entries = kcalloc(c->replicas_gc.nr, c->replicas_gc.entry_size, - GFP_NOIO); + GFP_KERNEL); if (!c->replicas_gc.entries) { mutex_unlock(&c->sb_lock); bch_err(c, "error allocating c->replicas_gc"); @@ -571,6 +603,8 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) return 0; } +/* New much simpler mechanism for clearing out unneeded replicas entries: */ + int bch2_replicas_gc2(struct bch_fs *c) { struct bch_replicas_cpu new = { 0 }; @@ -605,7 +639,9 @@ retry: if (e->data_type == BCH_DATA_journal || c->usage_base->replicas[i] || percpu_u64_get(&c->usage[0]->replicas[i]) || - percpu_u64_get(&c->usage[1]->replicas[i])) + percpu_u64_get(&c->usage[1]->replicas[i]) || + percpu_u64_get(&c->usage[2]->replicas[i]) || + percpu_u64_get(&c->usage[3]->replicas[i])) memcpy(cpu_replicas_entry(&new, new.nr++), e, new.entry_size); } @@ -674,7 +710,7 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, nr++; } - cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO); + cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); if (!cpu_r->entries) return -ENOMEM; @@ -684,7 +720,7 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, for_each_replicas_entry(sb_r, e) { dst = 
cpu_replicas_entry(cpu_r, idx++); memcpy(dst, e, replicas_entry_bytes(e)); - replicas_entry_sort(dst); + bch2_replicas_entry_sort(dst); } return 0; @@ -706,7 +742,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, entry_size += sizeof(struct bch_replicas_entry) - sizeof(struct bch_replicas_entry_v0); - cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO); + cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); if (!cpu_r->entries) return -ENOMEM; @@ -721,7 +757,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, dst->nr_devs = e->nr_devs; dst->nr_required = 1; memcpy(dst->devs, e->devs, e->nr_devs); - replicas_entry_sort(dst); + bch2_replicas_entry_sort(dst); } return 0; @@ -961,92 +997,53 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { /* Query replicas: */ -struct replicas_status __bch2_replicas_status(struct bch_fs *c, - struct bch_devs_mask online_devs) +bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, + unsigned flags, bool print) { - struct bch_sb_field_members *mi; struct bch_replicas_entry *e; - unsigned i, nr_online, nr_offline; - struct replicas_status ret; - - memset(&ret, 0, sizeof(ret)); - - for (i = 0; i < ARRAY_SIZE(ret.replicas); i++) - ret.replicas[i].redundancy = INT_MAX; - - mi = bch2_sb_get_members(c->disk_sb.sb); + bool ret = true; percpu_down_read(&c->mark_lock); - for_each_cpu_replicas_entry(&c->replicas, e) { - if (e->data_type >= ARRAY_SIZE(ret.replicas)) - panic("e %p data_type %u\n", e, e->data_type); - - nr_online = nr_offline = 0; + unsigned i, nr_online = 0, nr_failed = 0, dflags = 0; + bool metadata = e->data_type < BCH_DATA_user; for (i = 0; i < e->nr_devs; i++) { - BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi, - e->devs[i])); + struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]); - if (test_bit(e->devs[i], online_devs.d)) - nr_online++; - else - nr_offline++; + nr_online += test_bit(e->devs[i], devs.d); + nr_failed += ca->mi.state == BCH_MEMBER_STATE_failed; } - ret.replicas[e->data_type].redundancy = - min(ret.replicas[e->data_type].redundancy, - (int) nr_online - (int) e->nr_required); - - ret.replicas[e->data_type].nr_offline = - max(ret.replicas[e->data_type].nr_offline, - nr_offline); - } - - percpu_up_read(&c->mark_lock); - - for (i = 0; i < ARRAY_SIZE(ret.replicas); i++) - if (ret.replicas[i].redundancy == INT_MAX) - ret.replicas[i].redundancy = 0; + if (nr_failed == e->nr_devs) + continue; - return ret; -} + if (nr_online < e->nr_required) + dflags |= metadata + ? BCH_FORCE_IF_METADATA_LOST + : BCH_FORCE_IF_DATA_LOST; -struct replicas_status bch2_replicas_status(struct bch_fs *c) -{ - return __bch2_replicas_status(c, bch2_online_devs(c)); -} + if (nr_online < e->nr_devs) + dflags |= metadata + ? 
BCH_FORCE_IF_METADATA_DEGRADED + : BCH_FORCE_IF_DATA_DEGRADED; -static bool have_enough_devs(struct replicas_status s, - enum bch_data_type type, - bool force_if_degraded, - bool force_if_lost) -{ - return (!s.replicas[type].nr_offline || force_if_degraded) && - (s.replicas[type].redundancy >= 0 || force_if_lost); -} + if (dflags & ~flags) { + if (print) { + char buf[100]; -bool bch2_have_enough_devs(struct replicas_status s, unsigned flags) -{ - return (have_enough_devs(s, BCH_DATA_journal, - flags & BCH_FORCE_IF_METADATA_DEGRADED, - flags & BCH_FORCE_IF_METADATA_LOST) && - have_enough_devs(s, BCH_DATA_btree, - flags & BCH_FORCE_IF_METADATA_DEGRADED, - flags & BCH_FORCE_IF_METADATA_LOST) && - have_enough_devs(s, BCH_DATA_user, - flags & BCH_FORCE_IF_DATA_DEGRADED, - flags & BCH_FORCE_IF_DATA_LOST)); -} + bch2_replicas_entry_to_text(&PBUF(buf), e); + bch_err(c, "insufficient devices online (%u) for replicas entry %s", + nr_online, buf); + } + ret = false; + break; + } -int bch2_replicas_online(struct bch_fs *c, bool meta) -{ - struct replicas_status s = bch2_replicas_status(c); + } + percpu_up_read(&c->mark_lock); - return (meta - ? min(s.replicas[BCH_DATA_journal].redundancy, - s.replicas[BCH_DATA_btree].redundancy) - : s.replicas[BCH_DATA_user].redundancy) + 1; + return ret; } unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) @@ -1068,8 +1065,9 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) int bch2_fs_replicas_init(struct bch_fs *c) { - c->journal.entry_u64s_reserved += - reserve_journal_replicas(c, &c->replicas); + bch2_journal_entry_res_resize(&c->journal, + &c->replicas_journal_res, + reserve_journal_replicas(c, &c->replicas)); return replicas_table_update(c, &c->replicas); } diff --git a/libbcachefs/replicas.h b/libbcachefs/replicas.h index 8b95164..c77e873 100644 --- a/libbcachefs/replicas.h +++ b/libbcachefs/replicas.h @@ -5,6 +5,7 @@ #include "eytzinger.h" #include "replicas_types.h" +void bch2_replicas_entry_sort(struct bch_replicas_entry *); void bch2_replicas_entry_to_text(struct printbuf *, struct bch_replicas_entry *); void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); @@ -25,6 +26,31 @@ bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *); int bch2_mark_replicas(struct bch_fs *, struct bch_replicas_entry *); +struct replicas_delta { + s64 delta; + struct bch_replicas_entry r; +} __packed; + +struct replicas_delta_list { + unsigned size; + unsigned used; + + struct {} memset_start; + u64 nr_inodes; + u64 persistent_reserved[BCH_REPLICAS_MAX]; + struct {} memset_end; + struct replicas_delta d[0]; +}; + +static inline struct replicas_delta * +replicas_delta_next(struct replicas_delta *d) +{ + return (void *) d + replicas_entry_bytes(&d->r) + 8; +} + +bool bch2_replicas_delta_list_marked(struct bch_fs *, struct replicas_delta_list *); +int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *); + void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c); bool bch2_bkey_replicas_marked(struct bch_fs *, struct bkey_s_c); int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c); @@ -38,19 +64,9 @@ static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e, e->devs[0] = dev; } -struct replicas_status { - struct { - int redundancy; - unsigned nr_offline; - } replicas[BCH_DATA_NR]; -}; - -struct replicas_status __bch2_replicas_status(struct bch_fs *, - struct bch_devs_mask); -struct replicas_status bch2_replicas_status(struct bch_fs *); -bool 
bch2_have_enough_devs(struct replicas_status, unsigned); +bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask, + unsigned, bool); -int bch2_replicas_online(struct bch_fs *, bool); unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); int bch2_replicas_gc_end(struct bch_fs *, int); diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h index dea9b72..9f0bd44 100644 --- a/libbcachefs/str_hash.h +++ b/libbcachefs/str_hash.h @@ -18,11 +18,11 @@ static inline enum bch_str_hash_type bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) { switch (opt) { - case BCH_STR_HASH_OPT_CRC32C: + case BCH_STR_HASH_OPT_crc32c: return BCH_STR_HASH_CRC32C; - case BCH_STR_HASH_OPT_CRC64: + case BCH_STR_HASH_OPT_crc64: return BCH_STR_HASH_CRC64; - case BCH_STR_HASH_OPT_SIPHASH: + case BCH_STR_HASH_OPT_siphash: return c->sb.features & (1ULL << BCH_FEATURE_new_siphash) ? BCH_STR_HASH_SIPHASH : BCH_STR_HASH_SIPHASH_OLD; @@ -156,7 +156,7 @@ bch2_hash_lookup(struct btree_trans *trans, if (k.k->type == desc.key_type) { if (!desc.cmp_key(k, key)) return iter; - } else if (k.k->type == KEY_TYPE_whiteout) { + } else if (k.k->type == KEY_TYPE_hash_whiteout) { ; } else { /* hole, not found */ @@ -205,14 +205,12 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, int ret; iter = bch2_trans_copy_iter(trans, start); - if (IS_ERR(iter)) - return PTR_ERR(iter); bch2_btree_iter_next_slot(iter); for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) { if (k.k->type != desc.key_type && - k.k->type != KEY_TYPE_whiteout) + k.k->type != KEY_TYPE_hash_whiteout) break; if (k.k->type == desc.key_type && @@ -253,13 +251,10 @@ int bch2_hash_set(struct btree_trans *trans, } if (!slot && - !(flags & BCH_HASH_SET_MUST_REPLACE)) { + !(flags & BCH_HASH_SET_MUST_REPLACE)) slot = bch2_trans_copy_iter(trans, iter); - if (IS_ERR(slot)) - return PTR_ERR(slot); - } - if (k.k->type != KEY_TYPE_whiteout) + if (k.k->type != KEY_TYPE_hash_whiteout) goto not_found; } @@ -308,7 +303,7 @@ int bch2_hash_delete_at(struct btree_trans *trans, bkey_init(&delete->k); delete->k.p = iter->pos; - delete->k.type = ret ? KEY_TYPE_whiteout : KEY_TYPE_deleted; + delete->k.type = ret ? 
KEY_TYPE_hash_whiteout : KEY_TYPE_deleted; bch2_trans_update(trans, iter, delete, 0); return 0; diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index cee6cc9..1793697 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -9,6 +9,7 @@ #include "error.h" #include "io.h" #include "journal.h" +#include "journal_io.h" #include "journal_seq_blacklist.h" #include "replicas.h" #include "quota.h" @@ -276,19 +277,19 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) return "Bad number of member devices"; if (!BCH_SB_META_REPLICAS_WANT(sb) || - BCH_SB_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX) + BCH_SB_META_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX) return "Invalid number of metadata replicas"; if (!BCH_SB_META_REPLICAS_REQ(sb) || - BCH_SB_META_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX) + BCH_SB_META_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX) return "Invalid number of metadata replicas"; if (!BCH_SB_DATA_REPLICAS_WANT(sb) || - BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX) + BCH_SB_DATA_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX) return "Invalid number of data replicas"; if (!BCH_SB_DATA_REPLICAS_REQ(sb) || - BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX) + BCH_SB_DATA_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX) return "Invalid number of data replicas"; if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR) @@ -361,6 +362,7 @@ static void bch2_sb_update(struct bch_fs *c) c->sb.uuid = src->uuid; c->sb.user_uuid = src->user_uuid; c->sb.version = le16_to_cpu(src->version); + c->sb.version_min = le16_to_cpu(src->version_min); c->sb.nr_devices = src->nr_devices; c->sb.clean = BCH_SB_CLEAN(src); c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); @@ -375,7 +377,6 @@ static void bch2_sb_update(struct bch_fs *c) ca->mi = bch2_mi_to_cpu(mi->members + i); } -/* doesn't copy member info */ static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) { struct bch_sb_field *src_f, *dst_f; @@ -614,9 +615,6 @@ got_super: bdev_logical_block_size(sb->bdev)) goto err; - if (sb->mode & FMODE_WRITE) - bdev_get_queue(sb->bdev)->backing_dev_info->capabilities - |= BDI_CAP_STABLE_WRITES; ret = 0; sb->have_layout = true; out: @@ -636,7 +634,7 @@ static void write_super_endio(struct bio *bio) /* XXX: return errors directly */ - if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write: %s", + if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write error: %s", bch2_blk_status_to_str(bio->bi_status))) ca->sb_write_error = 1; @@ -712,6 +710,8 @@ int bch2_write_super(struct bch_fs *c) if (test_bit(BCH_FS_ERROR, &c->flags)) SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1); + SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN); + for_each_online_member(ca, c, i) bch2_sb_from_fs(c, ca); @@ -770,15 +770,13 @@ int bch2_write_super(struct bch_fs *c) nr_wrote = dev_mask_nr(&sb_written); can_mount_with_written = - bch2_have_enough_devs(__bch2_replicas_status(c, sb_written), - BCH_FORCE_IF_DEGRADED); + bch2_have_enough_devs(c, sb_written, BCH_FORCE_IF_DEGRADED, false); for (i = 0; i < ARRAY_SIZE(sb_written.d); i++) sb_written.d[i] = ~sb_written.d[i]; can_mount_without_written = - bch2_have_enough_devs(__bch2_replicas_status(c, sb_written), - BCH_FORCE_IF_DEGRADED); + bch2_have_enough_devs(c, sb_written, BCH_FORCE_IF_DEGRADED, false); /* * If we would be able to mount _without_ the devices we successfully @@ -789,6 +787,7 @@ int bch2_write_super(struct bch_fs *c) * mount with the devices we did successfully write to: */ if (bch2_fs_fatal_err_on(!nr_wrote || + !can_mount_with_written || 
(can_mount_without_written && !can_mount_with_written), c, "Unable to write superblock to sufficient devices")) @@ -936,14 +935,23 @@ static const struct bch_sb_field_ops bch_sb_field_ops_crypt = { /* BCH_SB_FIELD_clean: */ -void bch2_sb_clean_renumber(struct bch_sb_field_clean *clean, int write) +int bch2_sb_clean_validate(struct bch_fs *c, struct bch_sb_field_clean *clean, int write) { struct jset_entry *entry; + int ret; for (entry = clean->start; entry < (struct jset_entry *) vstruct_end(&clean->field); - entry = vstruct_next(entry)) - bch2_bkey_renumber(BKEY_TYPE_BTREE, bkey_to_packed(entry->start), write); + entry = vstruct_next(entry)) { + ret = bch2_journal_entry_validate(c, "superblock", entry, + le16_to_cpu(c->disk_sb.sb->version), + BCH_SB_BIG_ENDIAN(c->disk_sb.sb), + write); + if (ret) + return ret; + } + + return 0; } int bch2_fs_mark_dirty(struct bch_fs *c) @@ -957,104 +965,118 @@ int bch2_fs_mark_dirty(struct bch_fs *c) mutex_lock(&c->sb_lock); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite; - c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates; - c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_updates_journalled; + c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALWAYS; ret = bch2_write_super(c); mutex_unlock(&c->sb_lock); return ret; } -static void -entry_init_u64s(struct jset_entry *entry, unsigned u64s) +static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) { - memset(entry, 0, u64s * sizeof(u64)); + struct jset_entry *entry = *end; + unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); + memset(entry, 0, u64s * sizeof(u64)); /* * The u64s field counts from the start of data, ignoring the shared * fields. */ entry->u64s = u64s - 1; -} -static void -entry_init_size(struct jset_entry *entry, size_t size) -{ - unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); - entry_init_u64s(entry, u64s); + *end = vstruct_next(*end); + return entry; } -struct jset_entry * -bch2_journal_super_entries_add_common(struct bch_fs *c, - struct jset_entry *entry, - u64 journal_seq) +void bch2_journal_super_entries_add_common(struct bch_fs *c, + struct jset_entry **end, + u64 journal_seq) { - unsigned i; + struct bch_dev *ca; + unsigned i, dev; - percpu_down_write(&c->mark_lock); + percpu_down_read(&c->mark_lock); if (!journal_seq) { - bch2_fs_usage_acc_to_base(c, 0); - bch2_fs_usage_acc_to_base(c, 1); + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + bch2_fs_usage_acc_to_base(c, i); } else { - bch2_fs_usage_acc_to_base(c, journal_seq & 1); + bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK); } { struct jset_entry_usage *u = - container_of(entry, struct jset_entry_usage, entry); + container_of(jset_entry_init(end, sizeof(*u)), + struct jset_entry_usage, entry); - entry_init_size(entry, sizeof(*u)); u->entry.type = BCH_JSET_ENTRY_usage; u->entry.btree_id = FS_USAGE_INODES; u->v = cpu_to_le64(c->usage_base->nr_inodes); - - entry = vstruct_next(entry); } { struct jset_entry_usage *u = - container_of(entry, struct jset_entry_usage, entry); + container_of(jset_entry_init(end, sizeof(*u)), + struct jset_entry_usage, entry); - entry_init_size(entry, sizeof(*u)); u->entry.type = BCH_JSET_ENTRY_usage; u->entry.btree_id = FS_USAGE_KEY_VERSION; u->v = cpu_to_le64(atomic64_read(&c->key_version)); - - entry = vstruct_next(entry); } for (i = 0; i < BCH_REPLICAS_MAX; i++) { struct jset_entry_usage *u = - container_of(entry, struct jset_entry_usage, entry); + 
container_of(jset_entry_init(end, sizeof(*u)), + struct jset_entry_usage, entry); - entry_init_size(entry, sizeof(*u)); u->entry.type = BCH_JSET_ENTRY_usage; u->entry.btree_id = FS_USAGE_RESERVED; u->entry.level = i; u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); - - entry = vstruct_next(entry); } for (i = 0; i < c->replicas.nr; i++) { struct bch_replicas_entry *e = cpu_replicas_entry(&c->replicas, i); struct jset_entry_data_usage *u = - container_of(entry, struct jset_entry_data_usage, entry); + container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs), + struct jset_entry_data_usage, entry); - entry_init_size(entry, sizeof(*u) + e->nr_devs); u->entry.type = BCH_JSET_ENTRY_data_usage; u->v = cpu_to_le64(c->usage_base->replicas[i]); memcpy(&u->r, e, replicas_entry_bytes(e)); + } - entry = vstruct_next(entry); + for_each_member_device(ca, c, dev) { + unsigned b = sizeof(struct jset_entry_dev_usage) + + sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR; + struct jset_entry_dev_usage *u = + container_of(jset_entry_init(end, b), + struct jset_entry_dev_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_dev_usage; + u->dev = cpu_to_le32(dev); + u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec); + u->buckets_unavailable = cpu_to_le64(ca->usage_base->buckets_unavailable); + + for (i = 0; i < BCH_DATA_NR; i++) { + u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets); + u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors); + u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented); + } } - percpu_up_write(&c->mark_lock); + percpu_up_read(&c->mark_lock); - return entry; + for (i = 0; i < 2; i++) { + struct jset_entry_clock *clock = + container_of(jset_entry_init(end, sizeof(*clock)), + struct jset_entry_clock, entry); + + clock->entry.type = BCH_JSET_ENTRY_clock; + clock->rw = i; + clock->time = atomic64_read(&c->io_clock[i].now); + } } void bch2_fs_mark_clean(struct bch_fs *c) @@ -1062,6 +1084,7 @@ void bch2_fs_mark_clean(struct bch_fs *c) struct bch_sb_field_clean *sb_clean; struct jset_entry *entry; unsigned u64s; + int ret; mutex_lock(&c->sb_lock); if (BCH_SB_CLEAN(c->disk_sb.sb)) @@ -1069,8 +1092,8 @@ void bch2_fs_mark_clean(struct bch_fs *c) SET_BCH_SB_CLEAN(c->disk_sb.sb, true); - c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; - c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA; + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_info; + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_metadata; c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates); c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_btree_updates_journalled); @@ -1083,24 +1106,28 @@ void bch2_fs_mark_clean(struct bch_fs *c) } sb_clean->flags = 0; - sb_clean->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); - sb_clean->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); sb_clean->journal_seq = cpu_to_le64(journal_cur_seq(&c->journal) - 1); /* Trying to catch outstanding bug: */ BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); entry = sb_clean->start; - entry = bch2_journal_super_entries_add_common(c, entry, 0); + bch2_journal_super_entries_add_common(c, &entry, 0); entry = bch2_btree_roots_to_journal_entries(c, entry, entry); BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); memset(entry, 0, vstruct_end(&sb_clean->field) - (void *) entry); - if (le16_to_cpu(c->disk_sb.sb->version) < - bcachefs_metadata_version_bkey_renumber) - bch2_sb_clean_renumber(sb_clean, WRITE); + /* + * this should be in 
the write path, and we should be validating every + * superblock section: + */ + ret = bch2_sb_clean_validate(c, sb_clean, WRITE); + if (ret) { + bch_err(c, "error writing marking filesystem clean: validate error"); + goto out; + } bch2_write_super(c); out: diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h index 7a06815..b64ac2f 100644 --- a/libbcachefs/super-io.h +++ b/libbcachefs/super-io.h @@ -122,11 +122,10 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) /* BCH_SB_FIELD_clean: */ -struct jset_entry * -bch2_journal_super_entries_add_common(struct bch_fs *, - struct jset_entry *, u64); +void bch2_journal_super_entries_add_common(struct bch_fs *, + struct jset_entry **, u64); -void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int); +int bch2_sb_clean_validate(struct bch_fs *, struct bch_sb_field_clean *, int); int bch2_fs_mark_dirty(struct bch_fs *); void bch2_fs_mark_clean(struct bch_fs *); diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 015bbd9..670e9cd 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -49,7 +49,6 @@ #include #include #include -#include #include #include #include @@ -149,6 +148,23 @@ struct bch_fs *bch2_uuid_to_fs(uuid_le uuid) return c; } +static void bch2_dev_usage_journal_reserve(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i, nr = 0, u64s = + ((sizeof(struct jset_entry_dev_usage) + + sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR)) / + sizeof(u64); + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, NULL) + nr++; + rcu_read_unlock(); + + bch2_journal_entry_res_resize(&c->journal, + &c->dev_usage_journal_res, u64s * nr); +} + /* Filesystem RO/RW: */ /* @@ -175,9 +191,6 @@ static void __bch2_fs_read_only(struct bch_fs *c) bch2_copygc_stop(c); bch2_gc_thread_stop(c); - bch2_io_timer_del(&c->io_clock[READ], &c->bucket_clock[READ].rescale); - bch2_io_timer_del(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale); - /* * Flush journal before stopping allocators, because flushing journal * blacklist entries involves allocating new btree nodes: @@ -236,10 +249,7 @@ nowrote_alloc: * the journal kicks off btree writes via reclaim - wait for in flight * writes after stopping journal: */ - if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) - bch2_btree_flush_all_writes(c); - else - bch2_btree_verify_flushed(c); + bch2_btree_flush_all_writes(c); /* * After stopping journal: @@ -259,7 +269,7 @@ static void bch2_writes_disabled(struct percpu_ref *writes) void bch2_fs_read_only(struct bch_fs *c) { if (!test_bit(BCH_FS_RW, &c->flags)) { - cancel_delayed_work_sync(&c->journal.reclaim_work); + BUG_ON(c->journal.reclaim_thread); return; } @@ -386,6 +396,8 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) (!early || c->opts.read_only))) return -EROFS; + bch_info(c, "going read-write"); + ret = bch2_fs_mark_dirty(c); if (ret) goto err; @@ -403,9 +415,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - bch2_io_timer_add(&c->io_clock[READ], &c->bucket_clock[READ].rescale); - bch2_io_timer_add(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale); - for_each_rw_member(ca, c, i) { ret = bch2_dev_allocator_start(ca); if (ret) { @@ -417,6 +426,15 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); + for_each_rw_member(ca, c, i) + bch2_wake_allocator(ca); + + ret = bch2_journal_reclaim_start(&c->journal); + if (ret) { + bch_err(c, "error starting journal 
reclaim: %i", ret); + return ret; + } + if (!early) { ret = bch2_fs_read_write_late(c); if (ret) @@ -425,9 +443,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) percpu_ref_reinit(&c->writes); set_bit(BCH_FS_RW, &c->flags); - - queue_delayed_work(c->journal_reclaim_wq, - &c->journal.reclaim_work, 0); return 0; err: __bch2_fs_read_only(c); @@ -451,6 +466,7 @@ int bch2_fs_read_write_early(struct bch_fs *c) static void __bch2_fs_free(struct bch_fs *c) { unsigned i; + int cpu; for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_exit(&c->times[i]); @@ -472,9 +488,16 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_journal_entries_free(&c->journal_entries); percpu_free_rwsem(&c->mark_lock); kfree(c->usage_scratch); - free_percpu(c->usage[1]); - free_percpu(c->usage[0]); + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + free_percpu(c->usage[i]); kfree(c->usage_base); + + if (c->btree_iters_bufs) + for_each_possible_cpu(cpu) + kfree(per_cpu_ptr(c->btree_iters_bufs, cpu)->iter); + + free_percpu(c->online_reserved); + free_percpu(c->btree_iters_bufs); free_percpu(c->pcpu); mempool_exit(&c->large_bkey_pool); mempool_exit(&c->btree_bounce_pool); @@ -485,10 +508,9 @@ static void __bch2_fs_free(struct bch_fs *c) kfree(c->replicas_gc.entries); kfree(rcu_dereference_protected(c->disk_groups, 1)); kfree(c->journal_seq_blacklist_table); + kfree(c->unused_inode_hints); free_heap(&c->copygc_heap); - if (c->journal_reclaim_wq) - destroy_workqueue(c->journal_reclaim_wq); if (c->copygc_wq) destroy_workqueue(c->copygc_wq); if (c->wq) @@ -679,6 +701,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_blacklist_entries_gc); INIT_LIST_HEAD(&c->journal_entries); + INIT_LIST_HEAD(&c->journal_iters); INIT_LIST_HEAD(&c->fsck_errors); mutex_init(&c->fsck_error_lock); @@ -708,6 +731,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_btree_cache_init_early(&c->btree_cache); + mutex_init(&c->sectors_available_lock); + if (percpu_init_rwsem(&c->mark_lock)) goto err; @@ -736,12 +761,12 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) (btree_blocks(c) + 1) * 2 * sizeof(struct sort_iter_set); + c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus())); + if (!(c->wq = alloc_workqueue("bcachefs", WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || - !(c->copygc_wq = alloc_workqueue("bcache_copygc", + !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || - !(c->journal_reclaim_wq = alloc_workqueue("bcache_journal", - WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || percpu_ref_init(&c->writes, bch2_writes_disabled, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || @@ -750,9 +775,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) offsetof(struct btree_write_bio, wbio.bio)), BIOSET_NEED_BVECS) || !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || + !(c->btree_iters_bufs = alloc_percpu(struct btree_iter_buf)) || + !(c->online_reserved = alloc_percpu(u64)) || mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || + !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, + sizeof(u64), GFP_KERNEL)) || bch2_io_clock_init(&c->io_clock[READ]) || bch2_io_clock_init(&c->io_clock[WRITE]) || bch2_fs_journal_init(&c->journal) || @@ -774,6 +803,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, 
struct bch_opts opts) bch2_dev_alloc(c, i)) goto err; + bch2_journal_entry_res_resize(&c->journal, + &c->btree_root_journal_res, + BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX)); + bch2_dev_usage_journal_reserve(c); + bch2_journal_entry_res_resize(&c->journal, + &c->clock_journal_res, + (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2); + mutex_lock(&bch_fs_list_lock); err = bch2_fs_online(c); mutex_unlock(&bch_fs_list_lock); @@ -971,6 +1008,8 @@ static void bch2_dev_release(struct kobject *kobj) static void bch2_dev_free(struct bch_dev *ca) { + bch2_dev_allocator_stop(ca); + cancel_work_sync(&ca->io_error_work); if (ca->kobj.state_in_sysfs && @@ -1139,6 +1178,14 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) if (!ca) goto err; + ca->fs = c; + + if (ca->mi.state == BCH_MEMBER_STATE_rw && + bch2_dev_allocator_start(ca)) { + bch2_dev_free(ca); + goto err; + } + bch2_dev_attach(c, ca, dev_idx); out: pr_verbose_init(c->opts, "ret %i", ret); @@ -1209,13 +1256,6 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) if (ret) return ret; - if (test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags) && - !percpu_u64_get(&ca->usage[0]->buckets[BCH_DATA_sb])) { - mutex_lock(&c->sb_lock); - bch2_mark_dev_superblock(ca->fs, ca, 0); - mutex_unlock(&c->sb_lock); - } - bch2_dev_sysfs_online(c, ca); if (c->sb.nr_devices == 1) @@ -1241,23 +1281,22 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, enum bch_member_state new_state, int flags) { struct bch_devs_mask new_online_devs; - struct replicas_status s; struct bch_dev *ca2; int i, nr_rw = 0, required; lockdep_assert_held(&c->state_lock); switch (new_state) { - case BCH_MEMBER_STATE_RW: + case BCH_MEMBER_STATE_rw: return true; - case BCH_MEMBER_STATE_RO: - if (ca->mi.state != BCH_MEMBER_STATE_RW) + case BCH_MEMBER_STATE_ro: + if (ca->mi.state != BCH_MEMBER_STATE_rw) return true; /* do we have enough devices to write to? */ for_each_member_device(ca2, c, i) if (ca2 != ca) - nr_rw += ca2->mi.state == BCH_MEMBER_STATE_RW; + nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw; required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED) ? c->opts.metadata_replicas @@ -1267,19 +1306,17 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, : c->opts.data_replicas_required); return nr_rw >= required; - case BCH_MEMBER_STATE_FAILED: - case BCH_MEMBER_STATE_SPARE: - if (ca->mi.state != BCH_MEMBER_STATE_RW && - ca->mi.state != BCH_MEMBER_STATE_RO) + case BCH_MEMBER_STATE_failed: + case BCH_MEMBER_STATE_spare: + if (ca->mi.state != BCH_MEMBER_STATE_rw && + ca->mi.state != BCH_MEMBER_STATE_ro) return true; /* do we have enough devices to read from? */ new_online_devs = bch2_online_devs(c); __clear_bit(ca->dev_idx, new_online_devs.d); - s = __bch2_replicas_status(c, new_online_devs); - - return bch2_have_enough_devs(s, flags); + return bch2_have_enough_devs(c, new_online_devs, flags, false); default: BUG(); } @@ -1287,14 +1324,18 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, static bool bch2_fs_may_start(struct bch_fs *c) { - struct replicas_status s; struct bch_sb_field_members *mi; struct bch_dev *ca; - unsigned i, flags = c->opts.degraded - ? 
BCH_FORCE_IF_DEGRADED - : 0; + unsigned i, flags = 0; + + if (c->opts.very_degraded) + flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST; + + if (c->opts.degraded) + flags |= BCH_FORCE_IF_DEGRADED; - if (!c->opts.degraded) { + if (!c->opts.degraded && + !c->opts.very_degraded) { mutex_lock(&c->sb_lock); mi = bch2_sb_get_members(c->disk_sb.sb); @@ -1305,8 +1346,8 @@ static bool bch2_fs_may_start(struct bch_fs *c) ca = bch_dev_locked(c, i); if (!bch2_dev_is_online(ca) && - (ca->mi.state == BCH_MEMBER_STATE_RW || - ca->mi.state == BCH_MEMBER_STATE_RO)) { + (ca->mi.state == BCH_MEMBER_STATE_rw || + ca->mi.state == BCH_MEMBER_STATE_ro)) { mutex_unlock(&c->sb_lock); return false; } @@ -1314,9 +1355,7 @@ static bool bch2_fs_may_start(struct bch_fs *c) mutex_unlock(&c->sb_lock); } - s = bch2_replicas_status(c); - - return bch2_have_enough_devs(s, flags); + return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true); } static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) @@ -1341,7 +1380,7 @@ static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) { lockdep_assert_held(&c->state_lock); - BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW); + BUG_ON(ca->mi.state != BCH_MEMBER_STATE_rw); bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); @@ -1364,10 +1403,10 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, if (!bch2_dev_state_allowed(c, ca, new_state, flags)) return -EINVAL; - if (new_state != BCH_MEMBER_STATE_RW) + if (new_state != BCH_MEMBER_STATE_rw) __bch2_dev_read_only(c, ca); - bch_notice(ca, "%s", bch2_dev_state[new_state]); + bch_notice(ca, "%s", bch2_member_states[new_state]); mutex_lock(&c->sb_lock); mi = bch2_sb_get_members(c->disk_sb.sb); @@ -1375,7 +1414,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, bch2_write_super(c); mutex_unlock(&c->sb_lock); - if (new_state == BCH_MEMBER_STATE_RW && + if (new_state == BCH_MEMBER_STATE_rw && __bch2_dev_read_write(c, ca)) ret = -ENOMEM; @@ -1408,7 +1447,7 @@ int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) for (i = 0; i < ca->mi.nbuckets; i++) { ret = bch2_btree_key_cache_flush(&trans, - BTREE_ID_ALLOC, POS(ca->dev_idx, i)); + BTREE_ID_alloc, POS(ca->dev_idx, i)); if (ret) break; } @@ -1417,7 +1456,7 @@ int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) if (ret) return ret; - return bch2_btree_delete_range(c, BTREE_ID_ALLOC, + return bch2_btree_delete_range(c, BTREE_ID_alloc, POS(ca->dev_idx, 0), POS(ca->dev_idx + 1, 0), NULL); @@ -1437,7 +1476,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) */ percpu_ref_put(&ca->ref); - if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { + if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { bch_err(ca, "Cannot remove without losing data"); goto err; } @@ -1517,28 +1556,17 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) mutex_unlock(&c->sb_lock); up_write(&c->state_lock); + + bch2_dev_usage_journal_reserve(c); return 0; err: - if (ca->mi.state == BCH_MEMBER_STATE_RW && + if (ca->mi.state == BCH_MEMBER_STATE_rw && !percpu_ref_is_zero(&ca->io_ref)) __bch2_dev_read_write(c, ca); up_write(&c->state_lock); return ret; } -static void dev_usage_clear(struct bch_dev *ca) -{ - struct bucket_array *buckets; - - percpu_memset(ca->usage[0], 0, sizeof(*ca->usage[0])); - - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - - memset(buckets->b, 0, sizeof(buckets->b[0]) * buckets->nbuckets); - up_read(&ca->bucket_lock); -} - /* 
Add new device to running filesystem: */ int bch2_dev_add(struct bch_fs *c, const char *path) { @@ -1589,15 +1617,13 @@ int bch2_dev_add(struct bch_fs *c, const char *path) * allocate the journal, reset all the marks, then remark after we * attach... */ - bch2_mark_dev_superblock(ca->fs, ca, 0); + bch2_mark_dev_superblock(NULL, ca, 0); err = "journal alloc failed"; ret = bch2_dev_journal_alloc(ca); if (ret) goto err; - dev_usage_clear(ca); - down_write(&c->state_lock); mutex_lock(&c->sb_lock); @@ -1648,17 +1674,17 @@ have_slot: ca->disk_sb.sb->dev_idx = dev_idx; bch2_dev_attach(c, ca, dev_idx); - bch2_mark_dev_superblock(c, ca, 0); - bch2_write_super(c); mutex_unlock(&c->sb_lock); - err = "alloc write failed"; - ret = bch2_dev_alloc_write(c, ca, 0); + bch2_dev_usage_journal_reserve(c); + + err = "error marking superblock"; + ret = bch2_trans_mark_dev_sb(c, NULL, ca); if (ret) - goto err; + goto err_late; - if (ca->mi.state == BCH_MEMBER_STATE_RW) { + if (ca->mi.state == BCH_MEMBER_STATE_rw) { err = __bch2_dev_read_write(c, ca); if (err) goto err_late; @@ -1677,6 +1703,7 @@ err: bch_err(c, "Unable to add device: %s", err); return ret; err_late: + up_write(&c->state_lock); bch_err(c, "Error going rw after adding device: %s", err); return -EINVAL; } @@ -1712,7 +1739,13 @@ int bch2_dev_online(struct bch_fs *c, const char *path) } ca = bch_dev_locked(c, dev_idx); - if (ca->mi.state == BCH_MEMBER_STATE_RW) { + + if (bch2_trans_mark_dev_sb(c, NULL, ca)) { + err = "bch2_trans_mark_dev_sb() error"; + goto err; + } + + if (ca->mi.state == BCH_MEMBER_STATE_rw) { err = __bch2_dev_read_write(c, ca); if (err) goto err; @@ -1746,7 +1779,7 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) return 0; } - if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { + if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { bch_err(ca, "Cannot offline required disk"); up_write(&c->state_lock); return -EINVAL; @@ -2005,6 +2038,7 @@ static void bcachefs_exit(void) bch2_debug_exit(); bch2_vfs_exit(); bch2_chardev_exit(); + bch2_btree_key_cache_exit(); if (bcachefs_kset) kset_unregister(bcachefs_kset); } @@ -2012,9 +2046,9 @@ static void bcachefs_exit(void) static int __init bcachefs_init(void) { bch2_bkey_pack_test(); - bch2_inode_pack_test(); if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) || + bch2_btree_key_cache_init() || bch2_chardev_init() || bch2_vfs_init() || bch2_debug_init()) diff --git a/libbcachefs/super.h b/libbcachefs/super.h index 02c81f3..bef2790 100644 --- a/libbcachefs/super.h +++ b/libbcachefs/super.h @@ -34,7 +34,7 @@ static inline bool bch2_dev_is_online(struct bch_dev *ca) static inline bool bch2_dev_is_readable(struct bch_dev *ca) { return bch2_dev_is_online(ca) && - ca->mi.state != BCH_MEMBER_STATE_FAILED; + ca->mi.state != BCH_MEMBER_STATE_failed; } static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) @@ -42,8 +42,8 @@ static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) if (!percpu_ref_tryget(&ca->io_ref)) return false; - if (ca->mi.state == BCH_MEMBER_STATE_RW || - (ca->mi.state == BCH_MEMBER_STATE_RO && rw == READ)) + if (ca->mi.state == BCH_MEMBER_STATE_rw || + (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)) return true; percpu_ref_put(&ca->io_ref); @@ -158,11 +158,11 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, __for_each_online_member(ca, c, iter, ~0) #define for_each_rw_member(ca, c, iter) \ - __for_each_online_member(ca, c, iter, 1 << 
BCH_MEMBER_STATE_RW) + __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw) #define for_each_readable_member(ca, c, iter) \ __for_each_online_member(ca, c, iter, \ - (1 << BCH_MEMBER_STATE_RW)|(1 << BCH_MEMBER_STATE_RO)) + (1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro)) /* * If a key exists that references a device, the device won't be going away and diff --git a/libbcachefs/super_types.h b/libbcachefs/super_types.h index 20406eb..069973a 100644 --- a/libbcachefs/super_types.h +++ b/libbcachefs/super_types.h @@ -20,7 +20,7 @@ struct bch_devs_mask { struct bch_devs_list { u8 nr; - u8 devs[BCH_REPLICAS_MAX + 1]; + u8 devs[BCH_BKEY_PTRS_MAX]; }; struct bch_member_cpu { diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 0cb29f4..2d00897 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -153,6 +153,8 @@ read_attribute(io_latency_stats_read); read_attribute(io_latency_stats_write); read_attribute(congested); +read_attribute(btree_avg_write_size); + read_attribute(bucket_quantiles_last_read); read_attribute(bucket_quantiles_last_write); read_attribute(bucket_quantiles_fragmentation); @@ -165,6 +167,7 @@ read_attribute(journal_debug); read_attribute(journal_pins); read_attribute(btree_updates); read_attribute(dirty_btree_nodes); +read_attribute(btree_cache); read_attribute(btree_key_cache); read_attribute(btree_transactions); read_attribute(stripes_heap); @@ -198,9 +201,6 @@ read_attribute(new_stripes); rw_attribute(pd_controllers_update_seconds); -read_attribute(meta_replicas_have); -read_attribute(data_replicas_have); - read_attribute(io_timers_read); read_attribute(io_timers_write); @@ -208,12 +208,6 @@ read_attribute(io_timers_write); write_attribute(perf_test); #endif /* CONFIG_BCACHEFS_TESTS */ -#define BCH_DEBUG_PARAM(name, description) \ - rw_attribute(name); - - BCH_DEBUG_PARAMS() -#undef BCH_DEBUG_PARAM - #define x(_name) \ static struct attribute sysfs_time_stat_##_name = \ { .name = #_name, .mode = S_IRUGO }; @@ -238,9 +232,17 @@ static size_t bch2_btree_cache_size(struct bch_fs *c) return ret; } +static size_t bch2_btree_avg_write_size(struct bch_fs *c) +{ + u64 nr = atomic64_read(&c->btree_writes_nr); + u64 sectors = atomic64_read(&c->btree_writes_sectors); + + return nr ? 
div64_u64(sectors, nr) : 0; +} + static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) { - struct bch_fs_usage *fs_usage = bch2_fs_usage_read(c); + struct bch_fs_usage_online *fs_usage = bch2_fs_usage_read(c); if (!fs_usage) return -ENOMEM; @@ -269,7 +271,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, 0, k, ret) + for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, 0, k, ret) if (k.k->type == KEY_TYPE_extent) { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const union bch_extent_entry *entry; @@ -326,6 +328,7 @@ SHOW(bch2_fs) sysfs_print(block_size, block_bytes(c)); sysfs_print(btree_node_size, btree_bytes(c)); sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); + sysfs_hprint(btree_avg_write_size, bch2_btree_avg_write_size(c)); sysfs_print(read_realloc_races, atomic_long_read(&c->read_realloc_races)); @@ -352,9 +355,6 @@ SHOW(bch2_fs) sysfs_print(promote_whole_extents, c->promote_whole_extents); - sysfs_printf(meta_replicas_have, "%i", bch2_replicas_online(c, true)); - sysfs_printf(data_replicas_have, "%i", bch2_replicas_online(c, false)); - /* Debugging: */ if (attr == &sysfs_alloc_debug) @@ -380,6 +380,11 @@ SHOW(bch2_fs) return out.pos - buf; } + if (attr == &sysfs_btree_cache) { + bch2_btree_cache_to_text(&out, c); + return out.pos - buf; + } + if (attr == &sysfs_btree_key_cache) { bch2_btree_key_cache_to_text(&out, &c->btree_key_cache); return out.pos - buf; @@ -414,10 +419,6 @@ SHOW(bch2_fs) return out.pos - buf; } -#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name); - BCH_DEBUG_PARAMS() -#undef BCH_DEBUG_PARAM - return 0; } @@ -462,17 +463,13 @@ STORE(bch2_fs) /* Debugging: */ -#define BCH_DEBUG_PARAM(name, description) sysfs_strtoul(name, c->name); - BCH_DEBUG_PARAMS() -#undef BCH_DEBUG_PARAM - if (!test_bit(BCH_FS_STARTED, &c->flags)) return -EPERM; /* Debugging: */ if (attr == &sysfs_trigger_journal_flush) - bch2_journal_meta_async(&c->journal, NULL); + bch2_journal_meta(&c->journal); if (attr == &sysfs_trigger_btree_coalesce) bch2_coalesce(c); @@ -483,7 +480,7 @@ STORE(bch2_fs) */ #if 0 down_read(&c->state_lock); - bch2_gc(c, NULL, false, false); + bch2_gc(c, false, false); up_read(&c->state_lock); #else bch2_gc_gens(c); @@ -511,10 +508,11 @@ STORE(bch2_fs) if (threads_str && !(ret = kstrtouint(threads_str, 10, &threads)) && !(ret = bch2_strtoull_h(nr_str, &nr))) - bch2_btree_perf_test(c, test, nr, threads); - else - size = ret; + ret = bch2_btree_perf_test(c, test, nr, threads); kfree(tmp); + + if (ret) + size = ret; } #endif return size; @@ -526,9 +524,7 @@ struct attribute *bch2_fs_files[] = { &sysfs_block_size, &sysfs_btree_node_size, &sysfs_btree_cache_size, - - &sysfs_meta_replicas_have, - &sysfs_data_replicas_have, + &sysfs_btree_avg_write_size, &sysfs_journal_write_delay_ms, &sysfs_journal_reclaim_delay_ms, @@ -564,6 +560,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_journal_pins, &sysfs_btree_updates, &sysfs_dirty_btree_nodes, + &sysfs_btree_cache, &sysfs_btree_key_cache, &sysfs_btree_transactions, &sysfs_stripes_heap, @@ -590,11 +587,6 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_io_timers_write, &sysfs_internal_uuid, - -#define BCH_DEBUG_PARAM(name, description) &sysfs_##name, - BCH_DEBUG_PARAMS() -#undef BCH_DEBUG_PARAM - NULL }; @@ -716,7 +708,7 @@ static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca, { int rw = (private ? 
1 : 0); - return bucket_last_io(c, bucket(ca, b), rw); + return atomic64_read(&c->io_clock[rw].now) - bucket(ca, b)->io_time[rw]; } static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca, @@ -729,7 +721,7 @@ static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca, static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca, size_t b, void *private) { - return bucket_gc_gen(ca, b); + return bucket_gc_gen(bucket(ca, b)); } static int unsigned_cmp(const void *_l, const void *_r) @@ -808,63 +800,40 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) nr[c->open_buckets[i].type]++; pr_buf(out, - "free_inc: %zu/%zu\n" - "free[RESERVE_BTREE]: %zu/%zu\n" - "free[RESERVE_MOVINGGC]: %zu/%zu\n" - "free[RESERVE_NONE]: %zu/%zu\n" - "buckets:\n" - " capacity: %llu\n" - " alloc: %llu\n" - " sb: %llu\n" - " journal: %llu\n" - " meta: %llu\n" - " user: %llu\n" - " cached: %llu\n" - " erasure coded: %llu\n" - " available: %lli\n" - "sectors:\n" - " sb: %llu\n" - " journal: %llu\n" - " meta: %llu\n" - " user: %llu\n" - " cached: %llu\n" - " erasure coded: %llu\n" - " fragmented: %llu\n" - " copygc threshold: %llu\n" - "freelist_wait: %s\n" - "open buckets: %u/%u (reserved %u)\n" - "open_buckets_wait: %s\n" - "open_buckets_btree: %u\n" - "open_buckets_user: %u\n" - "btree reserve cache: %u\n", - fifo_used(&ca->free_inc), ca->free_inc.size, - fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size, - fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, - fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, - ca->mi.nbuckets - ca->mi.first_bucket, - stats.buckets_alloc, - stats.buckets[BCH_DATA_sb], - stats.buckets[BCH_DATA_journal], - stats.buckets[BCH_DATA_btree], - stats.buckets[BCH_DATA_user], - stats.buckets[BCH_DATA_cached], - stats.buckets_ec, - __dev_buckets_available(ca, stats), - stats.sectors[BCH_DATA_sb], - stats.sectors[BCH_DATA_journal], - stats.sectors[BCH_DATA_btree], - stats.sectors[BCH_DATA_user], - stats.sectors[BCH_DATA_cached], - stats.sectors_ec, - stats.sectors_fragmented, - c->copygc_threshold, - c->freelist_wait.list.first ? "waiting" : "empty", - c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, - BTREE_NODE_OPEN_BUCKET_RESERVE, - c->open_buckets_wait.list.first ? "waiting" : "empty", - nr[BCH_DATA_btree], - nr[BCH_DATA_user], - c->btree_reserve_cache_nr); + "\t\t buckets\t sectors fragmented\n" + "capacity%16llu\n", + ca->mi.nbuckets - ca->mi.first_bucket); + + for (i = 1; i < BCH_DATA_NR; i++) + pr_buf(out, "%-8s%16llu%16llu%16llu\n", + bch2_data_types[i], stats.d[i].buckets, + stats.d[i].sectors, stats.d[i].fragmented); + + pr_buf(out, + "ec\t%16llu\n" + "available%15llu\n" + "\n" + "free_inc\t\t%zu/%zu\n" + "free[RESERVE_MOVINGGC]\t%zu/%zu\n" + "free[RESERVE_NONE]\t%zu/%zu\n" + "freelist_wait\t\t%s\n" + "open buckets\t\t%u/%u (reserved %u)\n" + "open_buckets_wait\t%s\n" + "open_buckets_btree\t%u\n" + "open_buckets_user\t%u\n" + "btree reserve cache\t%u\n", + stats.buckets_ec, + __dev_buckets_available(ca, stats), + fifo_used(&ca->free_inc), ca->free_inc.size, + fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, + fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, + c->freelist_wait.list.first ? "waiting" : "empty", + c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, + BTREE_NODE_OPEN_BUCKET_RESERVE, + c->open_buckets_wait.list.first ? 
"waiting" : "empty", + nr[BCH_DATA_btree], + nr[BCH_DATA_user], + c->btree_reserve_cache_nr); } static const char * const bch2_rw[] = { @@ -930,7 +899,7 @@ SHOW(bch2_dev) } if (attr == &sysfs_state_rw) { - bch2_string_opt_to_text(&out, bch2_dev_state, + bch2_string_opt_to_text(&out, bch2_member_states, ca->mi.state); pr_buf(&out, "\n"); return out.pos - buf; diff --git a/libbcachefs/tests.c b/libbcachefs/tests.c index 4dcace6..7507b6b 100644 --- a/libbcachefs/tests.c +++ b/libbcachefs/tests.c @@ -13,12 +13,12 @@ static void delete_test_keys(struct bch_fs *c) { int ret; - ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS, + ret = bch2_btree_delete_range(c, BTREE_ID_extents, POS(0, 0), POS(0, U64_MAX), NULL); BUG_ON(ret); - ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS, + ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, POS(0, 0), POS(0, U64_MAX), NULL); BUG_ON(ret); @@ -26,7 +26,7 @@ static void delete_test_keys(struct bch_fs *c) /* unit tests */ -static void test_delete(struct bch_fs *c, u64 nr) +static int test_delete(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; @@ -37,28 +37,42 @@ static void test_delete(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, k.k.p, + iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, k.k.p, BTREE_ITER_INTENT); ret = bch2_btree_iter_traverse(iter); - BUG_ON(ret); + if (ret) { + bch_err(c, "lookup error in test_delete: %i", ret); + goto err; + } ret = __bch2_trans_do(&trans, NULL, NULL, 0, bch2_trans_update(&trans, iter, &k.k_i, 0)); - BUG_ON(ret); + if (ret) { + bch_err(c, "update error in test_delete: %i", ret); + goto err; + } pr_info("deleting once"); ret = bch2_btree_delete_at(&trans, iter, 0); - BUG_ON(ret); + if (ret) { + bch_err(c, "delete error (first) in test_delete: %i", ret); + goto err; + } pr_info("deleting twice"); ret = bch2_btree_delete_at(&trans, iter, 0); - BUG_ON(ret); - + if (ret) { + bch_err(c, "delete error (second) in test_delete: %i", ret); + goto err; + } +err: + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); + return ret; } -static void test_delete_written(struct bch_fs *c, u64 nr) +static int test_delete_written(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; @@ -69,31 +83,42 @@ static void test_delete_written(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, k.k.p, + iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, k.k.p, BTREE_ITER_INTENT); ret = bch2_btree_iter_traverse(iter); - BUG_ON(ret); + if (ret) { + bch_err(c, "lookup error in test_delete_written: %i", ret); + goto err; + } ret = __bch2_trans_do(&trans, NULL, NULL, 0, bch2_trans_update(&trans, iter, &k.k_i, 0)); - BUG_ON(ret); + if (ret) { + bch_err(c, "update error in test_delete_written: %i", ret); + goto err; + } bch2_journal_flush_all_pins(&c->journal); ret = bch2_btree_delete_at(&trans, iter, 0); - BUG_ON(ret); - + if (ret) { + bch_err(c, "delete error in test_delete_written: %i", ret); + goto err; + } +err: + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); + return ret; } -static void test_iterate(struct bch_fs *c, u64 nr) +static int test_iterate(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter *iter = NULL; struct bkey_s_c k; u64 i; - int ret; + int ret = 0; bch2_trans_init(&trans, c, 0, 0); @@ -107,16 +132,19 @@ static void test_iterate(struct bch_fs *c, u64 nr) bkey_cookie_init(&k.k_i); 
k.k.p.offset = i; - ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i, + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, NULL, NULL, 0); - BUG_ON(ret); + if (ret) { + bch_err(c, "insert error in test_iterate: %i", ret); + goto err; + } } pr_info("iterating forwards"); i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, + for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, 0, k, ret) { if (k.k->p.inode) break; @@ -132,17 +160,19 @@ static void test_iterate(struct bch_fs *c, u64 nr) BUG_ON(k.k->p.offset != --i); BUG_ON(i); - +err: + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); + return ret; } -static void test_iterate_extents(struct bch_fs *c, u64 nr) +static int test_iterate_extents(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter *iter = NULL; struct bkey_s_c k; u64 i; - int ret; + int ret = 0; bch2_trans_init(&trans, c, 0, 0); @@ -157,16 +187,19 @@ static void test_iterate_extents(struct bch_fs *c, u64 nr) k.k.p.offset = i + 8; k.k.size = 8; - ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, + ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, NULL, 0); - BUG_ON(ret); + if (ret) { + bch_err(c, "insert error in test_iterate_extents: %i", ret); + goto err; + } } pr_info("iterating forwards"); i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, + for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, 0, k, ret) { BUG_ON(bkey_start_offset(k.k) != i); i = k.k->p.offset; @@ -182,17 +215,19 @@ static void test_iterate_extents(struct bch_fs *c, u64 nr) } BUG_ON(i); - +err: + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); + return ret; } -static void test_iterate_slots(struct bch_fs *c, u64 nr) +static int test_iterate_slots(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; u64 i; - int ret; + int ret = 0; bch2_trans_init(&trans, c, 0, 0); @@ -206,16 +241,19 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr) bkey_cookie_init(&k.k_i); k.k.p.offset = i * 2; - ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i, + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, NULL, NULL, 0); - BUG_ON(ret); + if (ret) { + bch_err(c, "insert error in test_iterate_slots: %i", ret); + goto err; + } } pr_info("iterating forwards"); i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, 0, k, ret) { if (k.k->p.inode) break; @@ -223,7 +261,7 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr) BUG_ON(k.k->p.offset != i); i += 2; } - bch2_trans_iter_free(&trans, iter); + bch2_trans_iter_put(&trans, iter); BUG_ON(i != nr * 2); @@ -231,7 +269,7 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, BTREE_ITER_SLOTS, k, ret) { BUG_ON(k.k->p.offset != i); BUG_ON(bkey_deleted(k.k) != (i & 1)); @@ -240,17 +278,19 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr) if (i == nr * 2) break; } - + bch2_trans_iter_put(&trans, iter); +err: bch2_trans_exit(&trans); + return ret; } -static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) +static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; u64 i; - int ret; + int ret = 0; bch2_trans_init(&trans, c, 0, 0); @@ -265,22 +305,25 @@ static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) 
k.k.p.offset = i + 16; k.k.size = 8; - ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, + ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, NULL, 0); - BUG_ON(ret); + if (ret) { + bch_err(c, "insert error in test_iterate_slots_extents: %i", ret); + goto err; + } } pr_info("iterating forwards"); i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, 0, k, ret) { BUG_ON(bkey_start_offset(k.k) != i + 8); BUG_ON(k.k->size != 8); i += 16; } - bch2_trans_iter_free(&trans, iter); + bch2_trans_iter_put(&trans, iter); BUG_ON(i != nr); @@ -288,7 +331,7 @@ static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, BTREE_ITER_SLOTS, k, ret) { BUG_ON(bkey_deleted(k.k) != !(i % 16)); @@ -299,15 +342,17 @@ static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) if (i == nr) break; } - + bch2_trans_iter_put(&trans, iter); +err: bch2_trans_exit(&trans); + return 0; } /* * XXX: we really want to make sure we've got a btree with depth > 0 for these * tests */ -static void test_peek_end(struct bch_fs *c, u64 nr) +static int test_peek_end(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; @@ -315,7 +360,7 @@ static void test_peek_end(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS_MIN, 0); + iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0); k = bch2_btree_iter_peek(iter); BUG_ON(k.k); @@ -323,10 +368,13 @@ static void test_peek_end(struct bch_fs *c, u64 nr) k = bch2_btree_iter_peek(iter); BUG_ON(k.k); + bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); + return 0; } -static void test_peek_end_extents(struct bch_fs *c, u64 nr) +static int test_peek_end_extents(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; @@ -334,7 +382,7 @@ static void test_peek_end_extents(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, 0); + iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN, 0); k = bch2_btree_iter_peek(iter); BUG_ON(k.k); @@ -342,15 +390,18 @@ static void test_peek_end_extents(struct bch_fs *c, u64 nr) k = bch2_btree_iter_peek(iter); BUG_ON(k.k); + bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); + return 0; } /* extent unit tests */ u64 test_version; -static void insert_test_extent(struct bch_fs *c, - u64 start, u64 end) +static int insert_test_extent(struct bch_fs *c, + u64 start, u64 end) { struct bkey_i_cookie k; int ret; @@ -362,44 +413,49 @@ static void insert_test_extent(struct bch_fs *c, k.k_i.k.size = end - start; k.k_i.k.version.lo = test_version++; - ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, + ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, NULL, 0); - BUG_ON(ret); + if (ret) + bch_err(c, "insert error in insert_test_extent: %i", ret); + return ret; } -static void __test_extent_overwrite(struct bch_fs *c, +static int __test_extent_overwrite(struct bch_fs *c, u64 e1_start, u64 e1_end, u64 e2_start, u64 e2_end) { - insert_test_extent(c, e1_start, e1_end); - insert_test_extent(c, e2_start, e2_end); + int ret; + + ret = insert_test_extent(c, e1_start, e1_end) ?: + insert_test_extent(c, e2_start, e2_end); delete_test_keys(c); + return ret; } -static void test_extent_overwrite_front(struct bch_fs *c, u64 nr) 
+static int test_extent_overwrite_front(struct bch_fs *c, u64 nr) { - __test_extent_overwrite(c, 0, 64, 0, 32); - __test_extent_overwrite(c, 8, 64, 0, 32); + return __test_extent_overwrite(c, 0, 64, 0, 32) ?: + __test_extent_overwrite(c, 8, 64, 0, 32); } -static void test_extent_overwrite_back(struct bch_fs *c, u64 nr) +static int test_extent_overwrite_back(struct bch_fs *c, u64 nr) { - __test_extent_overwrite(c, 0, 64, 32, 64); - __test_extent_overwrite(c, 0, 64, 32, 72); + return __test_extent_overwrite(c, 0, 64, 32, 64) ?: + __test_extent_overwrite(c, 0, 64, 32, 72); } -static void test_extent_overwrite_middle(struct bch_fs *c, u64 nr) +static int test_extent_overwrite_middle(struct bch_fs *c, u64 nr) { - __test_extent_overwrite(c, 0, 64, 32, 40); + return __test_extent_overwrite(c, 0, 64, 32, 40); } -static void test_extent_overwrite_all(struct bch_fs *c, u64 nr) +static int test_extent_overwrite_all(struct bch_fs *c, u64 nr) { - __test_extent_overwrite(c, 32, 64, 0, 64); - __test_extent_overwrite(c, 32, 64, 0, 128); - __test_extent_overwrite(c, 32, 64, 32, 64); - __test_extent_overwrite(c, 32, 64, 32, 128); + return __test_extent_overwrite(c, 32, 64, 0, 64) ?: + __test_extent_overwrite(c, 32, 64, 0, 128) ?: + __test_extent_overwrite(c, 32, 64, 32, 64) ?: + __test_extent_overwrite(c, 32, 64, 32, 128); } /* perf tests */ @@ -415,11 +471,11 @@ static u64 test_rand(void) return v; } -static void rand_insert(struct bch_fs *c, u64 nr) +static int rand_insert(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct bkey_i_cookie k; - int ret; + int ret = 0; u64 i; bch2_trans_init(&trans, c, 0, 0); @@ -427,51 +483,67 @@ static void rand_insert(struct bch_fs *c, u64 nr) for (i = 0; i < nr; i++) { bkey_cookie_init(&k.k_i); k.k.p.offset = test_rand(); + k.k.p.snapshot = U32_MAX; ret = __bch2_trans_do(&trans, NULL, NULL, 0, - __bch2_btree_insert(&trans, BTREE_ID_XATTRS, &k.k_i)); - - BUG_ON(ret); + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i)); + if (ret) { + bch_err(c, "error in rand_insert: %i", ret); + break; + } } bch2_trans_exit(&trans); + return ret; } -static void rand_lookup(struct bch_fs *c, u64 nr) +static int rand_lookup(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; + int ret = 0; u64 i; bch2_trans_init(&trans, c, 0, 0); + iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0); for (i = 0; i < nr; i++) { - iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, - POS(0, test_rand()), 0); + bch2_btree_iter_set_pos(iter, POS(0, test_rand())); k = bch2_btree_iter_peek(iter); - bch2_trans_iter_free(&trans, iter); + ret = bkey_err(k); + if (ret) { + bch_err(c, "error in rand_lookup: %i", ret); + break; + } } + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); + return ret; } -static void rand_mixed(struct bch_fs *c, u64 nr) +static int rand_mixed(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - int ret; + int ret = 0; u64 i; bch2_trans_init(&trans, c, 0, 0); + iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0); for (i = 0; i < nr; i++) { - iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, - POS(0, test_rand()), 0); + bch2_btree_iter_set_pos(iter, POS(0, test_rand())); k = bch2_btree_iter_peek(iter); + ret = bkey_err(k); + if (ret) { + bch_err(c, "lookup error in rand_mixed: %i", ret); + break; + } if (!(i & 3) && k.k) { struct bkey_i_cookie k; @@ -481,14 +553,16 @@ static void rand_mixed(struct bch_fs *c, u64 nr) ret = __bch2_trans_do(&trans, NULL, 
NULL, 0, bch2_trans_update(&trans, iter, &k.k_i, 0)); - - BUG_ON(ret); + if (ret) { + bch_err(c, "update error in rand_mixed: %i", ret); + break; + } } - - bch2_trans_iter_free(&trans, iter); } + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); + return ret; } static int __do_delete(struct btree_trans *trans, struct bpos pos) @@ -498,17 +572,16 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) struct bkey_s_c k; int ret = 0; - iter = bch2_trans_get_iter(trans, BTREE_ID_XATTRS, pos, + iter = bch2_trans_get_iter(trans, BTREE_ID_xattrs, pos, BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(iter); - if (ret) - goto err; - k = bch2_btree_iter_peek(iter); ret = bkey_err(k); if (ret) goto err; + if (!k.k) + goto err; + bkey_init(&delete.k); delete.k.p = k.k->p; @@ -518,10 +591,10 @@ err: return ret; } -static void rand_delete(struct bch_fs *c, u64 nr) +static int rand_delete(struct bch_fs *c, u64 nr) { struct btree_trans trans; - int ret; + int ret = 0; u64 i; bch2_trans_init(&trans, c, 0, 0); @@ -531,64 +604,76 @@ static void rand_delete(struct bch_fs *c, u64 nr) ret = __bch2_trans_do(&trans, NULL, NULL, 0, __do_delete(&trans, pos)); - BUG_ON(ret); + if (ret) { + bch_err(c, "error in rand_delete: %i", ret); + break; + } } bch2_trans_exit(&trans); + return ret; } -static void seq_insert(struct bch_fs *c, u64 nr) +static int seq_insert(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; struct bkey_i_cookie insert; - int ret; + int ret = 0; u64 i = 0; bkey_cookie_init(&insert.k_i); bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { insert.k.p = iter->pos; ret = __bch2_trans_do(&trans, NULL, NULL, 0, bch2_trans_update(&trans, iter, &insert.k_i, 0)); - - BUG_ON(ret); + if (ret) { + bch_err(c, "error in seq_insert: %i", ret); + break; + } if (++i == nr) break; } + bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); + return ret; } -static void seq_lookup(struct bch_fs *c, u64 nr) +static int seq_lookup(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - int ret; + int ret = 0; bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, 0, k, ret) + for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, 0, k, ret) ; + bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); + return ret; } -static void seq_overwrite(struct bch_fs *c, u64 nr) +static int seq_overwrite(struct bch_fs *c, u64 nr) { struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; - int ret; + int ret = 0; bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, BTREE_ITER_INTENT, k, ret) { struct bkey_i_cookie u; @@ -596,23 +681,30 @@ static void seq_overwrite(struct bch_fs *c, u64 nr) ret = __bch2_trans_do(&trans, NULL, NULL, 0, bch2_trans_update(&trans, iter, &u.k_i, 0)); - - BUG_ON(ret); + if (ret) { + bch_err(c, "error in seq_overwrite: %i", ret); + break; + } } + bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); + return ret; } -static void seq_delete(struct bch_fs *c, u64 nr) +static int seq_delete(struct bch_fs *c, u64 nr) { int ret; - ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS, + ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, POS(0, 0), POS(0, U64_MAX), NULL); - 
BUG_ON(ret); + if (ret) + bch_err(c, "error in seq_delete: %i", ret); + return ret; } -typedef void (*perf_test_fn)(struct bch_fs *, u64); +typedef int (*perf_test_fn)(struct bch_fs *, u64); struct test_job { struct bch_fs *c; @@ -628,11 +720,13 @@ struct test_job { u64 start; u64 finish; + int ret; }; static int btree_perf_test_thread(void *data) { struct test_job *j = data; + int ret; if (atomic_dec_and_test(&j->ready)) { wake_up(&j->ready_wait); @@ -641,7 +735,9 @@ static int btree_perf_test_thread(void *data) wait_event(j->ready_wait, !atomic_read(&j->ready)); } - j->fn(j->c, j->nr / j->nr_threads); + ret = j->fn(j->c, j->nr / j->nr_threads); + if (ret) + j->ret = ret; if (atomic_dec_and_test(&j->done)) { j->finish = sched_clock(); @@ -651,8 +747,8 @@ static int btree_perf_test_thread(void *data) return 0; } -void bch2_btree_perf_test(struct bch_fs *c, const char *testname, - u64 nr, unsigned nr_threads) +int bch2_btree_perf_test(struct bch_fs *c, const char *testname, + u64 nr, unsigned nr_threads) { struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads }; char name_buf[20], nr_buf[20], per_sec_buf[20]; @@ -695,7 +791,7 @@ void bch2_btree_perf_test(struct bch_fs *c, const char *testname, if (!j.fn) { pr_err("unknown test %s", testname); - return; + return -EINVAL; } //pr_info("running test %s:", testname); @@ -720,6 +816,7 @@ void bch2_btree_perf_test(struct bch_fs *c, const char *testname, time / NSEC_PER_SEC, time * nr_threads / nr, per_sec_buf); + return j.ret; } #endif /* CONFIG_BCACHEFS_TESTS */ diff --git a/libbcachefs/tests.h b/libbcachefs/tests.h index 551d076..c73b18a 100644 --- a/libbcachefs/tests.h +++ b/libbcachefs/tests.h @@ -6,7 +6,7 @@ struct bch_fs; #ifdef CONFIG_BCACHEFS_TESTS -void bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned); +int bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned); #else diff --git a/libbcachefs/util.c b/libbcachefs/util.c index fd4044a..2709163 100644 --- a/libbcachefs/util.c +++ b/libbcachefs/util.c @@ -520,7 +520,7 @@ int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) { while (size) { struct page *page = alloc_page(gfp_mask); - unsigned len = min(PAGE_SIZE, size); + unsigned len = min_t(size_t, PAGE_SIZE, size); if (!page) return -ENOMEM; diff --git a/libbcachefs/util.h b/libbcachefs/util.h index f48c638..c69b05d 100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -37,17 +37,6 @@ struct closure; #define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0) #define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0) -#define memcpy(dst, src, len) \ -({ \ - void *_dst = (dst); \ - const void *_src = (src); \ - size_t _len = (len); \ - \ - BUG_ON(!((void *) (_dst) >= (void *) (_src) + (_len) || \ - (void *) (_dst) + (_len) <= (void *) (_src))); \ - memcpy(_dst, _src, _len); \ -}) - #else /* DEBUG */ #define EBUG_ON(cond) @@ -758,4 +747,9 @@ u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned); #define cmp_int(l, r) ((l > r) - (l < r)) +static inline int u8_cmp(u8 l, u8 r) +{ + return cmp_int(l, r); +} + #endif /* _BCACHEFS_UTIL_H */ diff --git a/libbcachefs/varint.c b/libbcachefs/varint.c new file mode 100644 index 0000000..a3d252c --- /dev/null +++ b/libbcachefs/varint.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +#include "varint.h" + +int bch2_varint_encode(u8 *out, u64 v) +{ + unsigned bits = fls64(v|1); + unsigned bytes = DIV_ROUND_UP(bits, 7); + + if (likely(bytes < 9)) { + v <<= bytes; + v |= ~(~0 << (bytes - 1)); + } 
else { + *out++ = 255; + bytes = 9; + } + + put_unaligned_le64(v, out); + return bytes; +} + +int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out) +{ + u64 v = get_unaligned_le64(in); + unsigned bytes = ffz(v & 255) + 1; + + if (unlikely(in + bytes > end)) + return -1; + + if (likely(bytes < 9)) { + v >>= bytes; + v &= ~(~0ULL << (7 * bytes)); + } else { + v = get_unaligned_le64(++in); + } + + *out = v; + return bytes; +} diff --git a/libbcachefs/varint.h b/libbcachefs/varint.h new file mode 100644 index 0000000..8daf813 --- /dev/null +++ b/libbcachefs/varint.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_VARINT_H +#define _BCACHEFS_VARINT_H + +int bch2_varint_encode(u8 *, u64); +int bch2_varint_decode(const u8 *, const u8 *, u64 *); + +#endif /* _BCACHEFS_VARINT_H */ diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c index 21f64cb..858aa87 100644 --- a/libbcachefs/xattr.c +++ b/libbcachefs/xattr.c @@ -61,7 +61,7 @@ static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) } const struct bch_hash_desc bch2_xattr_hash_desc = { - .btree_id = BTREE_ID_XATTRS, + .btree_id = BTREE_ID_xattrs, .key_type = KEY_TYPE_xattr, .hash_key = xattr_hash_key, .hash_bkey = xattr_hash_bkey, @@ -121,6 +121,7 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, const char *name, void *buffer, size_t size, int type) { + struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c_xattr xattr; @@ -128,16 +129,13 @@ int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, bch2_trans_init(&trans, c, 0, 0); - iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, - &inode->ei_str_hash, inode->v.i_ino, + iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, &hash, + inode->v.i_ino, &X_SEARCH(type, name, strlen(name)), 0); - if (IS_ERR(iter)) { - bch2_trans_exit(&trans); - BUG_ON(PTR_ERR(iter) == -EINTR); - - return PTR_ERR(iter) == -ENOENT ? -ENODATA : PTR_ERR(iter); - } + ret = PTR_ERR_OR_ZERO(iter); + if (ret) + goto err; xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); ret = le16_to_cpu(xattr.v->x_val_len); @@ -147,9 +145,12 @@ int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, else memcpy(buffer, xattr_val(xattr.v), ret); } - + bch2_trans_iter_put(&trans, iter); +err: bch2_trans_exit(&trans); - return ret; + + BUG_ON(ret == -EINTR); + return ret == -ENOENT ? 
-ENODATA : ret; } int bch2_xattr_set(struct btree_trans *trans, u64 inum, @@ -239,7 +240,7 @@ static int bch2_xattr_emit(struct dentry *dentry, } static int bch2_xattr_list_bcachefs(struct bch_fs *c, - struct bch_inode_info *inode, + struct bch_inode_unpacked *inode, struct xattr_buf *buf, bool all) { @@ -249,12 +250,12 @@ static int bch2_xattr_list_bcachefs(struct bch_fs *c, u64 v; for (id = 0; id < Inode_opt_nr; id++) { - v = bch2_inode_opt_get(&inode->ei_inode, id); + v = bch2_inode_opt_get(inode, id); if (!v) continue; if (!all && - !(inode->ei_inode.bi_fields_set & (1 << id))) + !(inode->bi_fields_set & (1 << id))) continue; ret = __bch2_xattr_emit(prefix, bch2_inode_opts[id], @@ -279,7 +280,7 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, + for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS(inum, 0), 0, k, ret) { BUG_ON(k.k->p.inode < inum); @@ -293,16 +294,18 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) if (ret) break; } + bch2_trans_iter_put(&trans, iter); + ret = bch2_trans_exit(&trans) ?: ret; if (ret) return ret; - ret = bch2_xattr_list_bcachefs(c, inode, &buf, false); + ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false); if (ret) return ret; - ret = bch2_xattr_list_bcachefs(c, inode, &buf, true); + ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true); if (ret) return ret; @@ -326,10 +329,10 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler, { struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); return bch2_trans_do(c, NULL, &inode->ei_journal_seq, 0, - bch2_xattr_set(&trans, inode->v.i_ino, - &inode->ei_str_hash, + bch2_xattr_set(&trans, inode->v.i_ino, &hash, name, value, size, handler->flags, flags)); } diff --git a/linux/generic-radix-tree.c b/linux/generic-radix-tree.c index 4f43d0b..7857017 100644 --- a/linux/generic-radix-tree.c +++ b/linux/generic-radix-tree.c @@ -147,6 +147,10 @@ void *__genradix_iter_peek(struct genradix_iter *iter, struct genradix_root *r; struct genradix_node *n; unsigned level, i; + + if (iter->offset == SIZE_MAX) + return NULL; + restart: r = READ_ONCE(radix->root); if (!r) @@ -165,10 +169,17 @@ restart: (GENRADIX_ARY - 1); while (!n->children[i]) { + size_t objs_per_ptr = genradix_depth_size(level); + + if (iter->offset + objs_per_ptr < iter->offset) { + iter->offset = SIZE_MAX; + iter->pos = SIZE_MAX; + return NULL; + } + i++; - iter->offset = round_down(iter->offset + - genradix_depth_size(level), - genradix_depth_size(level)); + iter->offset = round_down(iter->offset + objs_per_ptr, + objs_per_ptr); iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page; if (i == GENRADIX_ARY) diff --git a/linux/kthread.c b/linux/kthread.c index 65e824b..41bfca2 100644 --- a/linux/kthread.c +++ b/linux/kthread.c @@ -80,7 +80,7 @@ struct task_struct *kthread_create(int (*thread_fn)(void *data), ret = pthread_create(&p->thread, &attr, kthread_start_fn, p); if (ret) - die("pthread_create error %s", strerror(ret)); + return ERR_PTR(-ret); pthread_setname_np(p->thread, p->comm); return p; } diff --git a/linux/rhashtable.c b/linux/rhashtable.c index 351eac7..ba2196f 100644 --- a/linux/rhashtable.c +++ b/linux/rhashtable.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Resizable, Scalable, Concurrent Hash Table * @@ -8,27 +9,29 
@@ * Code partially derived from nft_hash * Rewritten with rehash code from br_multicast plus single list * pointer as suggested by Josh Triplett - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include -#include #include #include #include +#include #include #include #include +#include #include #include #include +#include #define HASH_DEFAULT_SIZE 64UL #define HASH_MIN_SIZE 4U -#define BUCKET_LOCKS_PER_CPU 32UL + +union nested_table { + union nested_table __rcu *table; + struct rhash_lock_head __rcu *bucket; +}; static u32 head_hashfn(struct rhashtable *ht, const struct bucket_table *tbl, @@ -37,40 +40,75 @@ static u32 head_hashfn(struct rhashtable *ht, return rht_head_hashfn(ht, tbl, he, ht->p); } -static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl, - gfp_t gfp) -{ - unsigned int i, size; - unsigned int nr_pcpus = num_possible_cpus(); +#ifdef CONFIG_PROVE_LOCKING +#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT)) - nr_pcpus = min_t(unsigned int, nr_pcpus, 64UL); - size = roundup_pow_of_two(nr_pcpus * ht->p.locks_mul); +int lockdep_rht_mutex_is_held(struct rhashtable *ht) +{ + return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1; +} +EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held); - /* Never allocate more than 0.5 locks per bucket */ - size = min_t(unsigned int, size, tbl->size >> 1); +int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash) +{ + if (!debug_locks) + return 1; + if (unlikely(tbl->nest)) + return 1; + return bit_spin_is_locked(0, (unsigned long *)&tbl->buckets[hash]); +} +EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held); +#else +#define ASSERT_RHT_MUTEX(HT) +#endif - if (sizeof(spinlock_t) != 0) { - tbl->locks = NULL; - if (gfp != GFP_KERNEL) - gfp |= __GFP_NOWARN | __GFP_NORETRY; +static inline union nested_table *nested_table_top( + const struct bucket_table *tbl) +{ + /* The top-level bucket entry does not need RCU protection + * because it's set at the same time as tbl->nest. 
+ */ + return (void *)rcu_dereference_protected(tbl->buckets[0], 1); +} - if (!tbl->locks) - tbl->locks = kmalloc_array(size, sizeof(spinlock_t), - gfp); - if (!tbl->locks) - return -ENOMEM; - for (i = 0; i < size; i++) - spin_lock_init(&tbl->locks[i]); +static void nested_table_free(union nested_table *ntbl, unsigned int size) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + const unsigned int len = 1 << shift; + unsigned int i; + + ntbl = rcu_dereference_protected(ntbl->table, 1); + if (!ntbl) + return; + + if (size > len) { + size >>= shift; + for (i = 0; i < len; i++) + nested_table_free(ntbl + i, size); } - tbl->locks_mask = size - 1; - return 0; + kfree(ntbl); +} + +static void nested_bucket_table_free(const struct bucket_table *tbl) +{ + unsigned int size = tbl->size >> tbl->nest; + unsigned int len = 1 << tbl->nest; + union nested_table *ntbl; + unsigned int i; + + ntbl = nested_table_top(tbl); + + for (i = 0; i < len; i++) + nested_table_free(ntbl + i, size); + + kfree(ntbl); } static void bucket_table_free(struct bucket_table *tbl) { - if (tbl) - kvfree(tbl->locks); + if (tbl->nest) + nested_bucket_table_free(tbl); kvfree(tbl); } @@ -80,6 +118,59 @@ static void bucket_table_free_rcu(struct rcu_head *head) bucket_table_free(container_of(head, struct bucket_table, rcu)); } +static union nested_table *nested_table_alloc(struct rhashtable *ht, + union nested_table __rcu **prev, + bool leaf) +{ + union nested_table *ntbl; + int i; + + ntbl = rcu_dereference(*prev); + if (ntbl) + return ntbl; + + ntbl = kzalloc(PAGE_SIZE, GFP_ATOMIC); + + if (ntbl && leaf) { + for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0]); i++) + INIT_RHT_NULLS_HEAD(ntbl[i].bucket); + } + + if (cmpxchg((union nested_table **)prev, NULL, ntbl) == NULL) + return ntbl; + /* Raced with another thread. 
*/ + kfree(ntbl); + return rcu_dereference(*prev); +} + +static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht, + size_t nbuckets, + gfp_t gfp) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + struct bucket_table *tbl; + size_t size; + + if (nbuckets < (1 << (shift + 1))) + return NULL; + + size = sizeof(*tbl) + sizeof(tbl->buckets[0]); + + tbl = kzalloc(size, gfp); + if (!tbl) + return NULL; + + if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets, + false)) { + kfree(tbl); + return NULL; + } + + tbl->nest = (ilog2(nbuckets) - 1) % shift + 1; + + return tbl; +} + static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, size_t nbuckets, gfp_t gfp) @@ -88,28 +179,27 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, size_t size; int i; - size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]); - if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER) || - gfp != GFP_KERNEL) - tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY); - if (tbl == NULL && gfp == GFP_KERNEL) - tbl = vzalloc(size); - if (tbl == NULL) - return NULL; + tbl = kvzalloc(struct_size(tbl, buckets, nbuckets), gfp); - tbl->size = nbuckets; + size = nbuckets; - if (alloc_bucket_locks(ht, tbl, gfp) < 0) { - bucket_table_free(tbl); - return NULL; + if (tbl == NULL && (gfp & ~__GFP_NOFAIL) != GFP_KERNEL) { + tbl = nested_bucket_table_alloc(ht, nbuckets, gfp); + nbuckets = 0; } + if (tbl == NULL) + return NULL; + + tbl->size = size; + + rcu_head_init(&tbl->rcu); INIT_LIST_HEAD(&tbl->walkers); - get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd)); + tbl->hash_rnd = get_random_u32(); for (i = 0; i < nbuckets; i++) - INIT_RHT_NULLS_HEAD(tbl->buckets[i], ht, i); + INIT_RHT_NULLS_HEAD(tbl->buckets[i]); return tbl; } @@ -127,18 +217,24 @@ static struct bucket_table *rhashtable_last_table(struct rhashtable *ht, return new_tbl; } -static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash) +static int rhashtable_rehash_one(struct rhashtable *ht, + struct rhash_lock_head __rcu **bkt, + unsigned int old_hash) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); - struct bucket_table *new_tbl = rhashtable_last_table(ht, - rht_dereference_rcu(old_tbl->future_tbl, ht)); - struct rhash_head __rcu **pprev = &old_tbl->buckets[old_hash]; - int err = -ENOENT; + struct bucket_table *new_tbl = rhashtable_last_table(ht, old_tbl); + int err = -EAGAIN; struct rhash_head *head, *next, *entry; - spinlock_t *new_bucket_lock; + struct rhash_head __rcu **pprev = NULL; unsigned int new_hash; - rht_for_each(entry, old_tbl, old_hash) { + if (new_tbl->nest) + goto out; + + err = -ENOENT; + + rht_for_each_from(entry, rht_ptr(bkt, old_tbl, old_hash), + old_tbl, old_hash) { err = 0; next = rht_dereference_bucket(entry->next, old_tbl, old_hash); @@ -153,57 +249,58 @@ static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash) new_hash = head_hashfn(ht, new_tbl, entry); - new_bucket_lock = rht_bucket_lock(new_tbl, new_hash); + rht_lock(new_tbl, &new_tbl->buckets[new_hash]); - spin_lock_nested(new_bucket_lock, SINGLE_DEPTH_NESTING); - head = rht_dereference_bucket(new_tbl->buckets[new_hash], - new_tbl, new_hash); + head = rht_ptr(new_tbl->buckets + new_hash, new_tbl, new_hash); RCU_INIT_POINTER(entry->next, head); - rcu_assign_pointer(new_tbl->buckets[new_hash], entry); - spin_unlock(new_bucket_lock); + rht_assign_unlock(new_tbl, &new_tbl->buckets[new_hash], entry); - rcu_assign_pointer(*pprev, next); + if (pprev) + 
rcu_assign_pointer(*pprev, next); + else + /* Need to preserved the bit lock. */ + rht_assign_locked(bkt, next); out: return err; } -static void rhashtable_rehash_chain(struct rhashtable *ht, +static int rhashtable_rehash_chain(struct rhashtable *ht, unsigned int old_hash) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); - spinlock_t *old_bucket_lock; + struct rhash_lock_head __rcu **bkt = rht_bucket_var(old_tbl, old_hash); + int err; - old_bucket_lock = rht_bucket_lock(old_tbl, old_hash); + if (!bkt) + return 0; + rht_lock(old_tbl, bkt); - spin_lock_bh(old_bucket_lock); - while (!rhashtable_rehash_one(ht, old_hash)) + while (!(err = rhashtable_rehash_one(ht, bkt, old_hash))) ; - old_tbl->rehash++; - spin_unlock_bh(old_bucket_lock); + + if (err == -ENOENT) + err = 0; + rht_unlock(old_tbl, bkt); + + return err; } static int rhashtable_rehash_attach(struct rhashtable *ht, struct bucket_table *old_tbl, struct bucket_table *new_tbl) { - /* Protect future_tbl using the first bucket lock. */ - spin_lock_bh(old_tbl->locks); - - /* Did somebody beat us to it? */ - if (rcu_access_pointer(old_tbl->future_tbl)) { - spin_unlock_bh(old_tbl->locks); - return -EEXIST; - } - /* Make insertions go into the new, empty table right away. Deletions * and lookups will be attempted in both tables until we synchronize. + * As cmpxchg() provides strong barriers, we do not need + * rcu_assign_pointer(). */ - rcu_assign_pointer(old_tbl->future_tbl, new_tbl); - spin_unlock_bh(old_tbl->locks); + if (cmpxchg((struct bucket_table **)&old_tbl->future_tbl, NULL, + new_tbl) != NULL) + return -EEXIST; return 0; } @@ -214,13 +311,18 @@ static int rhashtable_rehash_table(struct rhashtable *ht) struct bucket_table *new_tbl; struct rhashtable_walker *walker; unsigned int old_hash; + int err; new_tbl = rht_dereference(old_tbl->future_tbl, ht); if (!new_tbl) return 0; - for (old_hash = 0; old_hash < old_tbl->size; old_hash++) - rhashtable_rehash_chain(ht, old_hash); + for (old_hash = 0; old_hash < old_tbl->size; old_hash++) { + err = rhashtable_rehash_chain(ht, old_hash); + if (err) + return err; + cond_resched(); + } /* Publish the new table pointer. */ rcu_assign_pointer(ht->tbl, new_tbl); @@ -228,25 +330,30 @@ static int rhashtable_rehash_table(struct rhashtable *ht) spin_lock(&ht->lock); list_for_each_entry(walker, &old_tbl->walkers, list) walker->tbl = NULL; - spin_unlock(&ht->lock); /* Wait for readers. All new readers will see the new * table, and thus no references to the old table will * remain. + * We do this inside the locked region so that + * rhashtable_walk_stop() can use rcu_head_after_call_rcu() + * to check if it should not re-link the table. */ call_rcu(&old_tbl->rcu, bucket_table_free_rcu); + spin_unlock(&ht->lock); return rht_dereference(new_tbl->future_tbl, ht) ? 
-EAGAIN : 0; } -static int rhashtable_expand(struct rhashtable *ht) +static int rhashtable_rehash_alloc(struct rhashtable *ht, + struct bucket_table *old_tbl, + unsigned int size) { - struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); + struct bucket_table *new_tbl; int err; - old_tbl = rhashtable_last_table(ht, old_tbl); + ASSERT_RHT_MUTEX(ht); - new_tbl = bucket_table_alloc(ht, old_tbl->size * 2, GFP_KERNEL); + new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL); if (new_tbl == NULL) return -ENOMEM; @@ -257,12 +364,27 @@ static int rhashtable_expand(struct rhashtable *ht) return err; } +/** + * rhashtable_shrink - Shrink hash table while allowing concurrent lookups + * @ht: the hash table to shrink + * + * This function shrinks the hash table to fit, i.e., the smallest + * size would not cause it to expand right away automatically. + * + * The caller must ensure that no concurrent resizing occurs by holding + * ht->mutex. + * + * The caller must ensure that no concurrent table mutations take place. + * It is however valid to have concurrent lookups if they are RCU protected. + * + * It is valid to have concurrent insertions and deletions protected by per + * bucket locks or concurrent RCU protected lookups and traversals. + */ static int rhashtable_shrink(struct rhashtable *ht) { - struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); + struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); unsigned int nelems = atomic_read(&ht->nelems); unsigned int size = 0; - int err; if (nelems) size = roundup_pow_of_two(nelems * 3 / 2); @@ -275,15 +397,7 @@ static int rhashtable_shrink(struct rhashtable *ht) if (rht_dereference(old_tbl->future_tbl, ht)) return -EEXIST; - new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL); - if (new_tbl == NULL) - return -ENOMEM; - - err = rhashtable_rehash_attach(ht, old_tbl, new_tbl); - if (err) - bucket_table_free(new_tbl); - - return err; + return rhashtable_rehash_alloc(ht, old_tbl, size); } static void rht_deferred_worker(struct work_struct *work) @@ -299,11 +413,18 @@ static void rht_deferred_worker(struct work_struct *work) tbl = rhashtable_last_table(ht, tbl); if (rht_grow_above_75(ht, tbl)) - rhashtable_expand(ht); + err = rhashtable_rehash_alloc(ht, tbl, tbl->size * 2); else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl)) - rhashtable_shrink(ht); + err = rhashtable_shrink(ht); + else if (tbl->nest) + err = rhashtable_rehash_alloc(ht, tbl, tbl->size); + + if (!err || err == -EEXIST) { + int nerr; - err = rhashtable_rehash_table(ht); + nerr = rhashtable_rehash_table(ht); + err = err ?: nerr; + } mutex_unlock(&ht->mutex); @@ -311,22 +432,8 @@ static void rht_deferred_worker(struct work_struct *work) schedule_work(&ht->run_work); } -static bool rhashtable_check_elasticity(struct rhashtable *ht, - struct bucket_table *tbl, - unsigned int hash) -{ - unsigned int elasticity = ht->elasticity; - struct rhash_head *head; - - rht_for_each(head, tbl, hash) - if (!--elasticity) - return true; - - return false; -} - -int rhashtable_insert_rehash(struct rhashtable *ht, - struct bucket_table *tbl) +static int rhashtable_insert_rehash(struct rhashtable *ht, + struct bucket_table *tbl) { struct bucket_table *old_tbl; struct bucket_table *new_tbl; @@ -347,7 +454,7 @@ int rhashtable_insert_rehash(struct rhashtable *ht, err = -ENOMEM; - new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC); + new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC | __GFP_NOWARN); if (new_tbl == NULL) goto fail; @@ -363,7 +470,7 @@ int 
rhashtable_insert_rehash(struct rhashtable *ht, fail: /* Do not fail the insert if someone else did a rehash. */ - if (likely(rcu_dereference_raw(tbl->future_tbl))) + if (likely(rcu_access_pointer(tbl->future_tbl))) return 0; /* Schedule async rehash to retry allocation in process context. */ @@ -373,57 +480,485 @@ fail: return err; } -struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht, - const void *key, - struct rhash_head *obj, - struct bucket_table *tbl) +static void *rhashtable_lookup_one(struct rhashtable *ht, + struct rhash_lock_head __rcu **bkt, + struct bucket_table *tbl, unsigned int hash, + const void *key, struct rhash_head *obj) { + struct rhashtable_compare_arg arg = { + .ht = ht, + .key = key, + }; + struct rhash_head __rcu **pprev = NULL; struct rhash_head *head; - unsigned int hash; - int err; + int elasticity; + + elasticity = RHT_ELASTICITY; + rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) { + struct rhlist_head *list; + struct rhlist_head *plist; + + elasticity--; + if (!key || + (ht->p.obj_cmpfn ? + ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) : + rhashtable_compare(&arg, rht_obj(ht, head)))) { + pprev = &head->next; + continue; + } - tbl = rhashtable_last_table(ht, tbl); - hash = head_hashfn(ht, tbl, obj); - spin_lock_nested(rht_bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING); + if (!ht->rhlist) + return rht_obj(ht, head); - err = -EEXIST; - if (key && rhashtable_lookup_fast(ht, key, ht->p)) - goto exit; + list = container_of(obj, struct rhlist_head, rhead); + plist = container_of(head, struct rhlist_head, rhead); - err = -E2BIG; - if (unlikely(rht_grow_above_max(ht, tbl))) - goto exit; + RCU_INIT_POINTER(list->next, plist); + head = rht_dereference_bucket(head->next, tbl, hash); + RCU_INIT_POINTER(list->rhead.next, head); + if (pprev) + rcu_assign_pointer(*pprev, obj); + else + /* Need to preserve the bit lock */ + rht_assign_locked(bkt, obj); + + return NULL; + } + + if (elasticity <= 0) + return ERR_PTR(-EAGAIN); + + return ERR_PTR(-ENOENT); +} + +static struct bucket_table *rhashtable_insert_one( + struct rhashtable *ht, struct rhash_lock_head __rcu **bkt, + struct bucket_table *tbl, unsigned int hash, struct rhash_head *obj, + void *data) +{ + struct bucket_table *new_tbl; + struct rhash_head *head; + + if (!IS_ERR_OR_NULL(data)) + return ERR_PTR(-EEXIST); + + if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT) + return ERR_CAST(data); + + new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (new_tbl) + return new_tbl; + + if (PTR_ERR(data) != -ENOENT) + return ERR_CAST(data); - err = -EAGAIN; - if (rhashtable_check_elasticity(ht, tbl, hash) || - rht_grow_above_100(ht, tbl)) - goto exit; + if (unlikely(rht_grow_above_max(ht, tbl))) + return ERR_PTR(-E2BIG); - err = 0; + if (unlikely(rht_grow_above_100(ht, tbl))) + return ERR_PTR(-EAGAIN); - head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); + head = rht_ptr(bkt, tbl, hash); RCU_INIT_POINTER(obj->next, head); + if (ht->rhlist) { + struct rhlist_head *list; - rcu_assign_pointer(tbl->buckets[hash], obj); + list = container_of(obj, struct rhlist_head, rhead); + RCU_INIT_POINTER(list->next, NULL); + } + + /* bkt is always the head of the list, so it holds + * the lock, which we need to preserve + */ + rht_assign_locked(bkt, obj); atomic_inc(&ht->nelems); + if (rht_grow_above_75(ht, tbl)) + schedule_work(&ht->run_work); + + return NULL; +} + +static void *rhashtable_try_insert(struct rhashtable *ht, const void *key, + struct rhash_head *obj) +{ + struct bucket_table *new_tbl; 
+ struct bucket_table *tbl; + struct rhash_lock_head __rcu **bkt; + unsigned int hash; + void *data; + + new_tbl = rcu_dereference(ht->tbl); + + do { + tbl = new_tbl; + hash = rht_head_hashfn(ht, tbl, obj, ht->p); + if (rcu_access_pointer(tbl->future_tbl)) + /* Failure is OK */ + bkt = rht_bucket_var(tbl, hash); + else + bkt = rht_bucket_insert(ht, tbl, hash); + if (bkt == NULL) { + new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); + data = ERR_PTR(-EAGAIN); + } else { + rht_lock(tbl, bkt); + data = rhashtable_lookup_one(ht, bkt, tbl, + hash, key, obj); + new_tbl = rhashtable_insert_one(ht, bkt, tbl, + hash, obj, data); + if (PTR_ERR(new_tbl) != -EEXIST) + data = ERR_CAST(new_tbl); + + rht_unlock(tbl, bkt); + } + } while (!IS_ERR_OR_NULL(new_tbl)); + + if (PTR_ERR(data) == -EAGAIN) + data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?: + -EAGAIN); + + return data; +} + +void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, + struct rhash_head *obj) +{ + void *data; + + do { + rcu_read_lock(); + data = rhashtable_try_insert(ht, key, obj); + rcu_read_unlock(); + } while (PTR_ERR(data) == -EAGAIN); -exit: - spin_unlock(rht_bucket_lock(tbl, hash)); + return data; +} +EXPORT_SYMBOL_GPL(rhashtable_insert_slow); - if (err == 0) +/** + * rhashtable_walk_enter - Initialise an iterator + * @ht: Table to walk over + * @iter: Hash table Iterator + * + * This function prepares a hash table walk. + * + * Note that if you restart a walk after rhashtable_walk_stop you + * may see the same object twice. Also, you may miss objects if + * there are removals in between rhashtable_walk_stop and the next + * call to rhashtable_walk_start. + * + * For a completely stable walk you should construct your own data + * structure outside the hash table. + * + * This function may be called from any process context, including + * non-preemptable context, but cannot be called from softirq or + * hardirq context. + * + * You must call rhashtable_walk_exit after this function returns. + */ +void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter) +{ + iter->ht = ht; + iter->p = NULL; + iter->slot = 0; + iter->skip = 0; + iter->end_of_table = 0; + + spin_lock(&ht->lock); + iter->walker.tbl = + rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock)); + list_add(&iter->walker.list, &iter->walker.tbl->walkers); + spin_unlock(&ht->lock); +} +EXPORT_SYMBOL_GPL(rhashtable_walk_enter); + +/** + * rhashtable_walk_exit - Free an iterator + * @iter: Hash table Iterator + * + * This function frees resources allocated by rhashtable_walk_enter. + */ +void rhashtable_walk_exit(struct rhashtable_iter *iter) +{ + spin_lock(&iter->ht->lock); + if (iter->walker.tbl) + list_del(&iter->walker.list); + spin_unlock(&iter->ht->lock); +} +EXPORT_SYMBOL_GPL(rhashtable_walk_exit); + +/** + * rhashtable_walk_start_check - Start a hash table walk + * @iter: Hash table iterator + * + * Start a hash table walk at the current iterator position. Note that we take + * the RCU lock in all cases including when we return an error. So you must + * always call rhashtable_walk_stop to clean up. + * + * Returns zero if successful. + * + * Returns -EAGAIN if resize event occured. Note that the iterator + * will rewind back to the beginning and you may use it immediately + * by calling rhashtable_walk_next. + * + * rhashtable_walk_start is defined as an inline variant that returns + * void. This is preferred in cases where the caller would ignore + * resize events and always continue. 
+ */ +int rhashtable_walk_start_check(struct rhashtable_iter *iter) + __acquires(RCU) +{ + struct rhashtable *ht = iter->ht; + bool rhlist = ht->rhlist; + + rcu_read_lock(); + + spin_lock(&ht->lock); + if (iter->walker.tbl) + list_del(&iter->walker.list); + spin_unlock(&ht->lock); + + if (iter->end_of_table) + return 0; + if (!iter->walker.tbl) { + iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht); + iter->slot = 0; + iter->skip = 0; + return -EAGAIN; + } + + if (iter->p && !rhlist) { + /* + * We need to validate that 'p' is still in the table, and + * if so, update 'skip' + */ + struct rhash_head *p; + int skip = 0; + rht_for_each_rcu(p, iter->walker.tbl, iter->slot) { + skip++; + if (p == iter->p) { + iter->skip = skip; + goto found; + } + } + iter->p = NULL; + } else if (iter->p && rhlist) { + /* Need to validate that 'list' is still in the table, and + * if so, update 'skip' and 'p'. + */ + struct rhash_head *p; + struct rhlist_head *list; + int skip = 0; + rht_for_each_rcu(p, iter->walker.tbl, iter->slot) { + for (list = container_of(p, struct rhlist_head, rhead); + list; + list = rcu_dereference(list->next)) { + skip++; + if (list == iter->list) { + iter->p = p; + iter->skip = skip; + goto found; + } + } + } + iter->p = NULL; + } +found: + return 0; +} +EXPORT_SYMBOL_GPL(rhashtable_walk_start_check); + +/** + * __rhashtable_walk_find_next - Find the next element in a table (or the first + * one in case of a new walk). + * + * @iter: Hash table iterator + * + * Returns the found object or NULL when the end of the table is reached. + * + * Returns -EAGAIN if resize event occurred. + */ +static void *__rhashtable_walk_find_next(struct rhashtable_iter *iter) +{ + struct bucket_table *tbl = iter->walker.tbl; + struct rhlist_head *list = iter->list; + struct rhashtable *ht = iter->ht; + struct rhash_head *p = iter->p; + bool rhlist = ht->rhlist; + + if (!tbl) return NULL; - else if (err == -EAGAIN) - return tbl; + + for (; iter->slot < tbl->size; iter->slot++) { + int skip = iter->skip; + + rht_for_each_rcu(p, tbl, iter->slot) { + if (rhlist) { + list = container_of(p, struct rhlist_head, + rhead); + do { + if (!skip) + goto next; + skip--; + list = rcu_dereference(list->next); + } while (list); + + continue; + } + if (!skip) + break; + skip--; + } + +next: + if (!rht_is_a_nulls(p)) { + iter->skip++; + iter->p = p; + iter->list = list; + return rht_obj(ht, rhlist ? &list->rhead : p); + } + + iter->skip = 0; + } + + iter->p = NULL; + + /* Ensure we see any new tables. */ + smp_rmb(); + + iter->walker.tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (iter->walker.tbl) { + iter->slot = 0; + iter->skip = 0; + return ERR_PTR(-EAGAIN); + } else { + iter->end_of_table = true; + } + + return NULL; +} + +/** + * rhashtable_walk_next - Return the next object and advance the iterator + * @iter: Hash table iterator + * + * Note that you must call rhashtable_walk_stop when you are finished + * with the walk. + * + * Returns the next object or NULL when the end of the table is reached. + * + * Returns -EAGAIN if resize event occurred. Note that the iterator + * will rewind back to the beginning and you may continue to use it. 
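 *
 * A minimal walk sketch (illustrative, not from the upstream kernel-doc;
 * "struct test_obj", "ht" and process() stand in for the caller's own
 * table, element type and per-element work, and rhashtable_walk_start()
 * is the void inline wrapper around rhashtable_walk_start_check()
 * mentioned above):
 *
 *	struct rhashtable_iter iter;
 *	struct test_obj *obj;
 *
 *	rhashtable_walk_enter(&ht, &iter);
 *	rhashtable_walk_start(&iter);
 *
 *	while ((obj = rhashtable_walk_next(&iter)) != NULL) {
 *		if (IS_ERR(obj)) {
 *			if (PTR_ERR(obj) == -EAGAIN)
 *				continue;
 *			break;
 *		}
 *		process(obj);
 *	}
 *
 *	rhashtable_walk_stop(&iter);
 *	rhashtable_walk_exit(&iter);
 *
 * where -EAGAIN simply means a resize rewound the iterator and the walk
 * keeps going; duplicates are possible across stop/start pairs, as
 * noted for rhashtable_walk_enter().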
+ */ +void *rhashtable_walk_next(struct rhashtable_iter *iter) +{ + struct rhlist_head *list = iter->list; + struct rhashtable *ht = iter->ht; + struct rhash_head *p = iter->p; + bool rhlist = ht->rhlist; + + if (p) { + if (!rhlist || !(list = rcu_dereference(list->next))) { + p = rcu_dereference(p->next); + list = container_of(p, struct rhlist_head, rhead); + } + if (!rht_is_a_nulls(p)) { + iter->skip++; + iter->p = p; + iter->list = list; + return rht_obj(ht, rhlist ? &list->rhead : p); + } + + /* At the end of this slot, switch to next one and then find + * next entry from that point. + */ + iter->skip = 0; + iter->slot++; + } + + return __rhashtable_walk_find_next(iter); +} +EXPORT_SYMBOL_GPL(rhashtable_walk_next); + +/** + * rhashtable_walk_peek - Return the next object but don't advance the iterator + * @iter: Hash table iterator + * + * Returns the next object or NULL when the end of the table is reached. + * + * Returns -EAGAIN if resize event occurred. Note that the iterator + * will rewind back to the beginning and you may continue to use it. + */ +void *rhashtable_walk_peek(struct rhashtable_iter *iter) +{ + struct rhlist_head *list = iter->list; + struct rhashtable *ht = iter->ht; + struct rhash_head *p = iter->p; + + if (p) + return rht_obj(ht, ht->rhlist ? &list->rhead : p); + + /* No object found in current iter, find next one in the table. */ + + if (iter->skip) { + /* A nonzero skip value points to the next entry in the table + * beyond that last one that was found. Decrement skip so + * we find the current value. __rhashtable_walk_find_next + * will restore the original value of skip assuming that + * the table hasn't changed. + */ + iter->skip--; + } + + return __rhashtable_walk_find_next(iter); +} +EXPORT_SYMBOL_GPL(rhashtable_walk_peek); + +/** + * rhashtable_walk_stop - Finish a hash table walk + * @iter: Hash table iterator + * + * Finish a hash table walk. Does not reset the iterator to the start of the + * hash table. + */ +void rhashtable_walk_stop(struct rhashtable_iter *iter) + __releases(RCU) +{ + struct rhashtable *ht; + struct bucket_table *tbl = iter->walker.tbl; + + if (!tbl) + goto out; + + ht = iter->ht; + + spin_lock(&ht->lock); + if (rcu_head_after_call_rcu(&tbl->rcu, bucket_table_free_rcu)) + /* This bucket table is being freed, don't re-link it. */ + iter->walker.tbl = NULL; else - return ERR_PTR(err); + list_add(&iter->walker.list, &tbl->walkers); + spin_unlock(&ht->lock); + +out: + rcu_read_unlock(); } +EXPORT_SYMBOL_GPL(rhashtable_walk_stop); static size_t rounded_hashtable_size(const struct rhashtable_params *params) { - return max(roundup_pow_of_two(params->nelem_hint * 4 / 3), - (unsigned long)params->min_size); + size_t retsize; + + if (params->nelem_hint) + retsize = max(roundup_pow_of_two(params->nelem_hint * 4 / 3), + (unsigned long)params->min_size); + else + retsize = max(HASH_DEFAULT_SIZE, + (unsigned long)params->min_size); + + return retsize; } static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed) @@ -431,21 +966,58 @@ static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed) return jhash2(key, length, seed); } +/** + * rhashtable_init - initialize a new hash table + * @ht: hash table to be initialized + * @params: configuration parameters + * + * Initializes a new hash table based on the provided configuration + * parameters. 
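 *
 * A minimal lifecycle sketch (illustrative, not from the upstream
 * kernel-doc), assuming the rhashtable_insert_fast(),
 * rhashtable_lookup_fast() and rhashtable_remove_fast() inlines from
 * <linux/rhashtable.h> and the struct test_obj/params of Configuration
 * Example 1 below, with error handling and allocation of obj elided:
 *
 *	struct rhashtable ht;
 *	struct test_obj *obj, *found;
 *	int key = 42;
 *	int err;
 *
 *	err = rhashtable_init(&ht, &params);
 *	err = rhashtable_insert_fast(&ht, &obj->node, params);
 *	found = rhashtable_lookup_fast(&ht, &key, params);
 *	err = rhashtable_remove_fast(&ht, &obj->node, params);
 *	rhashtable_free_and_destroy(&ht, NULL, NULL);
 *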
A table can be configured either with a variable or + * fixed length key: + * + * Configuration Example 1: Fixed length keys + * struct test_obj { + * int key; + * void * my_member; + * struct rhash_head node; + * }; + * + * struct rhashtable_params params = { + * .head_offset = offsetof(struct test_obj, node), + * .key_offset = offsetof(struct test_obj, key), + * .key_len = sizeof(int), + * .hashfn = jhash, + * }; + * + * Configuration Example 2: Variable length keys + * struct test_obj { + * [...] + * struct rhash_head node; + * }; + * + * u32 my_hash_fn(const void *data, u32 len, u32 seed) + * { + * struct test_obj *obj = data; + * + * return [... hash ...]; + * } + * + * struct rhashtable_params params = { + * .head_offset = offsetof(struct test_obj, node), + * .hashfn = jhash, + * .obj_hashfn = my_hash_fn, + * }; + */ int rhashtable_init(struct rhashtable *ht, const struct rhashtable_params *params) { struct bucket_table *tbl; size_t size; - size = HASH_DEFAULT_SIZE; - if ((!params->key_len && !params->obj_hashfn) || (params->obj_hashfn && !params->obj_cmpfn)) return -EINVAL; - if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT)) - return -EINVAL; - memset(ht, 0, sizeof(*ht)); mutex_init(&ht->mutex); spin_lock_init(&ht->lock); @@ -454,39 +1026,18 @@ int rhashtable_init(struct rhashtable *ht, if (params->min_size) ht->p.min_size = roundup_pow_of_two(params->min_size); - if (params->max_size) - ht->p.max_size = rounddown_pow_of_two(params->max_size); + /* Cap total entries at 2^31 to avoid nelems overflow. */ + ht->max_elems = 1u << 31; - if (params->insecure_max_entries) - ht->p.insecure_max_entries = - rounddown_pow_of_two(params->insecure_max_entries); - else - ht->p.insecure_max_entries = ht->p.max_size * 2; - - ht->p.min_size = max(ht->p.min_size, HASH_MIN_SIZE); + if (params->max_size) { + ht->p.max_size = rounddown_pow_of_two(params->max_size); + if (ht->p.max_size < ht->max_elems / 2) + ht->max_elems = ht->p.max_size * 2; + } - if (params->nelem_hint) - size = rounded_hashtable_size(&ht->p); - - /* The maximum (not average) chain length grows with the - * size of the hash table, at a rate of (log N)/(log log N). - * The value of 16 is selected so that even if the hash - * table grew to 2^32 you would not expect the maximum - * chain length to exceed it unless we are under attack - * (or extremely unlucky). - * - * As this limit is only to detect attacks, we don't need - * to set it to a lower value as you'd need the chain - * length to vastly exceed 16 to have any real effect - * on the system. - */ - if (!params->insecure_elasticity) - ht->elasticity = 16; + ht->p.min_size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE); - if (params->locks_mul) - ht->p.locks_mul = roundup_pow_of_two(params->locks_mul); - else - ht->p.locks_mul = BUCKET_LOCKS_PER_CPU; + size = rounded_hashtable_size(&ht->p); ht->key_len = ht->p.key_len; if (!params->hashfn) { @@ -498,9 +1049,16 @@ int rhashtable_init(struct rhashtable *ht, } } + /* + * This is api initialization and thus we need to guarantee the + * initial rhashtable allocation. Upon failure, retry with the + * smallest possible size with __GFP_NOFAIL semantics. 
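 *
 * As a worked example of the sizing (illustrative numbers, default
 * min_size): with nelem_hint = 1000, rounded_hashtable_size() requests
 * roundup_pow_of_two(1000 * 4 / 3) = roundup_pow_of_two(1333) = 2048
 * buckets; if that allocation fails, the retry below drops to
 * max_t(u16, ht->p.min_size, HASH_MIN_SIZE) buckets, and __GFP_NOFAIL
 * guarantees the second attempt cannot return NULL.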
+ */ tbl = bucket_table_alloc(ht, size, GFP_KERNEL); - if (tbl == NULL) - return -ENOMEM; + if (unlikely(tbl == NULL)) { + size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE); + tbl = bucket_table_alloc(ht, size, GFP_KERNEL | __GFP_NOFAIL); + } atomic_set(&ht->nelems, 0); @@ -510,15 +1068,170 @@ int rhashtable_init(struct rhashtable *ht, return 0; } +EXPORT_SYMBOL_GPL(rhashtable_init); -void rhashtable_destroy(struct rhashtable *ht) +/** + * rhltable_init - initialize a new hash list table + * @hlt: hash list table to be initialized + * @params: configuration parameters + * + * Initializes a new hash list table. + * + * See documentation for rhashtable_init. + */ +int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params) { - struct bucket_table *tbl; + int err; + + err = rhashtable_init(&hlt->ht, params); + hlt->ht.rhlist = true; + return err; +} +EXPORT_SYMBOL_GPL(rhltable_init); + +static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj, + void (*free_fn)(void *ptr, void *arg), + void *arg) +{ + struct rhlist_head *list; + + if (!ht->rhlist) { + free_fn(rht_obj(ht, obj), arg); + return; + } + + list = container_of(obj, struct rhlist_head, rhead); + do { + obj = &list->rhead; + list = rht_dereference(list->next, ht); + free_fn(rht_obj(ht, obj), arg); + } while (list); +} + +/** + * rhashtable_free_and_destroy - free elements and destroy hash table + * @ht: the hash table to destroy + * @free_fn: callback to release resources of element + * @arg: pointer passed to free_fn + * + * Stops an eventual async resize. If defined, invokes free_fn for each + * element to releasal resources. Please note that RCU protected + * readers may still be accessing the elements. Releasing of resources + * must occur in a compatible manner. Then frees the bucket array. + * + * This function will eventually sleep to wait for an async resize + * to complete. The caller is responsible that no further write operations + * occurs in parallel. + */ +void rhashtable_free_and_destroy(struct rhashtable *ht, + void (*free_fn)(void *ptr, void *arg), + void *arg) +{ + struct bucket_table *tbl, *next_tbl; + unsigned int i; cancel_work_sync(&ht->run_work); mutex_lock(&ht->mutex); tbl = rht_dereference(ht->tbl, ht); +restart: + if (free_fn) { + for (i = 0; i < tbl->size; i++) { + struct rhash_head *pos, *next; + + cond_resched(); + for (pos = rht_ptr_exclusive(rht_bucket(tbl, i)), + next = !rht_is_a_nulls(pos) ? + rht_dereference(pos->next, ht) : NULL; + !rht_is_a_nulls(pos); + pos = next, + next = !rht_is_a_nulls(pos) ? 
+ rht_dereference(pos->next, ht) : NULL) + rhashtable_free_one(ht, pos, free_fn, arg); + } + } + + next_tbl = rht_dereference(tbl->future_tbl, ht); bucket_table_free(tbl); + if (next_tbl) { + tbl = next_tbl; + goto restart; + } mutex_unlock(&ht->mutex); } +EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy); + +void rhashtable_destroy(struct rhashtable *ht) +{ + return rhashtable_free_and_destroy(ht, NULL, NULL); +} +EXPORT_SYMBOL_GPL(rhashtable_destroy); + +struct rhash_lock_head __rcu **__rht_bucket_nested( + const struct bucket_table *tbl, unsigned int hash) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + unsigned int index = hash & ((1 << tbl->nest) - 1); + unsigned int size = tbl->size >> tbl->nest; + unsigned int subhash = hash; + union nested_table *ntbl; + + ntbl = nested_table_top(tbl); + ntbl = rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash); + subhash >>= tbl->nest; + + while (ntbl && size > (1 << shift)) { + index = subhash & ((1 << shift) - 1); + ntbl = rht_dereference_bucket_rcu(ntbl[index].table, + tbl, hash); + size >>= shift; + subhash >>= shift; + } + + if (!ntbl) + return NULL; + + return &ntbl[subhash].bucket; + +} +EXPORT_SYMBOL_GPL(__rht_bucket_nested); + +struct rhash_lock_head __rcu **rht_bucket_nested( + const struct bucket_table *tbl, unsigned int hash) +{ + static struct rhash_lock_head __rcu *rhnull; + + if (!rhnull) + INIT_RHT_NULLS_HEAD(rhnull); + return __rht_bucket_nested(tbl, hash) ?: &rhnull; +} +EXPORT_SYMBOL_GPL(rht_bucket_nested); + +struct rhash_lock_head __rcu **rht_bucket_nested_insert( + struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + unsigned int index = hash & ((1 << tbl->nest) - 1); + unsigned int size = tbl->size >> tbl->nest; + union nested_table *ntbl; + + ntbl = nested_table_top(tbl); + hash >>= tbl->nest; + ntbl = nested_table_alloc(ht, &ntbl[index].table, + size <= (1 << shift)); + + while (ntbl && size > (1 << shift)) { + index = hash & ((1 << shift) - 1); + size >>= shift; + hash >>= shift; + ntbl = nested_table_alloc(ht, &ntbl[index].table, + size <= (1 << shift)); + } + + if (!ntbl) + return NULL; + + return &ntbl[hash].bucket; + +} +EXPORT_SYMBOL_GPL(rht_bucket_nested_insert); diff --git a/linux/sched.c b/linux/sched.c index d926e20..1c7198d 100644 --- a/linux/sched.c +++ b/linux/sched.c @@ -106,8 +106,6 @@ static void sched_init(void) { struct task_struct *p = malloc(sizeof(*p)); - mlockall(MCL_CURRENT|MCL_FUTURE); - memset(p, 0, sizeof(*p)); p->state = TASK_RUNNING; diff --git a/linux/shrinker.c b/linux/shrinker.c index 7926be0..f6c979a 100644 --- a/linux/shrinker.c +++ b/linux/shrinker.c @@ -28,7 +28,6 @@ void unregister_shrinker(struct shrinker *shrinker) struct meminfo { u64 total; u64 available; - }; static u64 parse_meminfo_line(const char *line) @@ -50,7 +49,7 @@ static struct meminfo read_meminfo(void) f = fopen("/proc/meminfo", "r"); if (!f) - die("error opening /proc/meminfo: %m"); + return ret; while ((len = getline(&line, &n, f)) != -1) { if ((v = strcmp_prefix(line, "MemTotal:"))) @@ -77,10 +76,18 @@ void run_shrinkers(void) return; info = read_meminfo(); - want_shrink = (info.total >> 2) - info.available; - if (want_shrink <= 0) - return; + if (info.total && info.available) { + want_shrink = (info.total >> 2) - info.available; + + if (want_shrink <= 0) + return; + } else { + /* If we weren't able to read /proc/meminfo, we must be pretty + * low: */ + + want_shrink = 8 << 20; + } mutex_lock(&shrinker_lock); 
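	/*
	 * Worked example of the fallback above (illustrative, assuming
	 * want_shrink counts bytes like the parsed meminfo fields):
	 * 8 << 20 = 8,388,608, i.e. the shrinkers are asked to give back
	 * roughly 8 MiB whenever /proc/meminfo could not be read, rather
	 * than dying as the old code did.
	 */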
list_for_each_entry(shrinker, &shrinker_list, list) { diff --git a/linux/six.c b/linux/six.c index 49d46ed..fca1208 100644 --- a/linux/six.c +++ b/linux/six.c @@ -2,11 +2,13 @@ #include #include +#include #include #include #include #include #include +#include #ifdef DEBUG #define EBUG_ON(cond) BUG_ON(cond) @@ -41,7 +43,7 @@ struct six_lock_vals { #define LOCK_VALS { \ [SIX_LOCK_read] = { \ .lock_val = __SIX_VAL(read_lock, 1), \ - .lock_fail = __SIX_LOCK_HELD_write, \ + .lock_fail = __SIX_LOCK_HELD_write + __SIX_VAL(write_locking, 1),\ .unlock_val = -__SIX_VAL(read_lock, 1), \ .held_mask = __SIX_LOCK_HELD_read, \ .unlock_wakeup = SIX_LOCK_write, \ @@ -76,36 +78,196 @@ static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type, } } +static inline unsigned pcpu_read_count(struct six_lock *lock) +{ + unsigned read_count = 0; + int cpu; + + for_each_possible_cpu(cpu) + read_count += *per_cpu_ptr(lock->readers, cpu); + return read_count; +} + +struct six_lock_waiter { + struct list_head list; + struct task_struct *task; +}; + +/* This is probably up there with the more evil things I've done */ +#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l)) + +static inline void six_lock_wakeup(struct six_lock *lock, + union six_lock_state state, + unsigned waitlist_id) +{ + if (waitlist_id == SIX_LOCK_write) { + if (state.write_locking && !state.read_lock) { + struct task_struct *p = READ_ONCE(lock->owner); + if (p) + wake_up_process(p); + } + } else { + struct list_head *wait_list = &lock->wait_list[waitlist_id]; + struct six_lock_waiter *w, *next; + + if (!(state.waiters & (1 << waitlist_id))) + return; + + clear_bit(waitlist_bitnr(waitlist_id), + (unsigned long *) &lock->state.v); + + raw_spin_lock(&lock->wait_lock); + + list_for_each_entry_safe(w, next, wait_list, list) { + list_del_init(&w->list); + + if (wake_up_process(w->task) && + waitlist_id != SIX_LOCK_read) { + if (!list_empty(wait_list)) + set_bit(waitlist_bitnr(waitlist_id), + (unsigned long *) &lock->state.v); + break; + } + } + + raw_spin_unlock(&lock->wait_lock); + } +} + static __always_inline bool do_six_trylock_type(struct six_lock *lock, - enum six_lock_type type) + enum six_lock_type type, + bool try) { const struct six_lock_vals l[] = LOCK_VALS; - union six_lock_state old; - u64 v = READ_ONCE(lock->state.v); + union six_lock_state old, new; + bool ret; + u64 v; EBUG_ON(type == SIX_LOCK_write && lock->owner != current); + EBUG_ON(type == SIX_LOCK_write && (lock->state.seq & 1)); - do { - old.v = v; + EBUG_ON(type == SIX_LOCK_write && (try != !(lock->state.write_locking))); - EBUG_ON(type == SIX_LOCK_write && - ((old.v & __SIX_LOCK_HELD_write) || - !(old.v & __SIX_LOCK_HELD_intent))); + /* + * Percpu reader mode: + * + * The basic idea behind this algorithm is that you can implement a lock + * between two threads without any atomics, just memory barriers: + * + * For two threads you'll need two variables, one variable for "thread a + * has the lock" and another for "thread b has the lock". + * + * To take the lock, a thread sets its variable indicating that it holds + * the lock, then issues a full memory barrier, then reads from the + * other thread's variable to check if the other thread thinks it has + * the lock. If we raced, we backoff and retry/sleep. 
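 *
 * A minimal two-thread sketch of that idea (illustrative only;
 * "a_locked"/"b_locked" are hypothetical flags, one per thread, and
 * thread A runs):
 *
 *	WRITE_ONCE(a_locked, 1);
 *	smp_mb();
 *	if (!READ_ONCE(b_locked)) {
 *		... got the lock, critical section ...
 *	} else {
 *		WRITE_ONCE(a_locked, 0);
 *		... back off, then retry or sleep ...
 *	}
 *
 * with thread B doing the same thing with the roles swapped.  Here the
 * reader side's "flag" is its slot in the lock->readers percpu counter
 * and the writer side's "flag" is the write_locking bit in lock->state,
 * which is why both paths issue a full barrier before checking the
 * other side.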
+ */ - if (old.v & l[type].lock_fail) - return false; - } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, - old.v, - old.v + l[type].lock_val)) != old.v); + if (type == SIX_LOCK_read && lock->readers) { +retry: + preempt_disable(); + this_cpu_inc(*lock->readers); /* signal that we own lock */ - six_set_owner(lock, type, old); - return true; + smp_mb(); + + old.v = READ_ONCE(lock->state.v); + ret = !(old.v & l[type].lock_fail); + + this_cpu_sub(*lock->readers, !ret); + preempt_enable(); + + /* + * If we failed because a writer was trying to take the + * lock, issue a wakeup because we might have caused a + * spurious trylock failure: + */ + if (old.write_locking) { + struct task_struct *p = READ_ONCE(lock->owner); + + if (p) + wake_up_process(p); + } + + /* + * If we failed from the lock path and the waiting bit wasn't + * set, set it: + */ + if (!try && !ret) { + v = old.v; + + do { + new.v = old.v = v; + + if (!(old.v & l[type].lock_fail)) + goto retry; + + if (new.waiters & (1 << type)) + break; + + new.waiters |= 1 << type; + } while ((v = atomic64_cmpxchg(&lock->state.counter, + old.v, new.v)) != old.v); + } + } else if (type == SIX_LOCK_write && lock->readers) { + if (try) { + atomic64_add(__SIX_VAL(write_locking, 1), + &lock->state.counter); + smp_mb__after_atomic(); + } + + ret = !pcpu_read_count(lock); + + /* + * On success, we increment lock->seq; also we clear + * write_locking unless we failed from the lock path: + */ + v = 0; + if (ret) + v += __SIX_VAL(seq, 1); + if (ret || try) + v -= __SIX_VAL(write_locking, 1); + + if (try && !ret) { + old.v = atomic64_add_return(v, &lock->state.counter); + six_lock_wakeup(lock, old, SIX_LOCK_read); + } else { + atomic64_add(v, &lock->state.counter); + } + } else { + v = READ_ONCE(lock->state.v); + do { + new.v = old.v = v; + + if (!(old.v & l[type].lock_fail)) { + new.v += l[type].lock_val; + + if (type == SIX_LOCK_write) + new.write_locking = 0; + } else if (!try && type != SIX_LOCK_write && + !(new.waiters & (1 << type))) + new.waiters |= 1 << type; + else + break; /* waiting bit already set */ + } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, + old.v, new.v)) != old.v); + + ret = !(old.v & l[type].lock_fail); + + EBUG_ON(ret && !(lock->state.v & l[type].held_mask)); + } + + if (ret) + six_set_owner(lock, type, old); + + EBUG_ON(type == SIX_LOCK_write && (try || ret) && (lock->state.write_locking)); + + return ret; } __always_inline __flatten static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type) { - if (!do_six_trylock_type(lock, type)) + if (!do_six_trylock_type(lock, type, true)) return false; if (type != SIX_LOCK_write) @@ -119,8 +281,43 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, { const struct six_lock_vals l[] = LOCK_VALS; union six_lock_state old; - u64 v = READ_ONCE(lock->state.v); + u64 v; + + EBUG_ON(type == SIX_LOCK_write); + + if (type == SIX_LOCK_read && + lock->readers) { + bool ret; + preempt_disable(); + this_cpu_inc(*lock->readers); + + smp_mb(); + + old.v = READ_ONCE(lock->state.v); + ret = !(old.v & l[type].lock_fail) && old.seq == seq; + + this_cpu_sub(*lock->readers, !ret); + preempt_enable(); + + /* + * Similar to the lock path, we may have caused a spurious write + * lock fail and need to issue a wakeup: + */ + if (old.write_locking) { + struct task_struct *p = READ_ONCE(lock->owner); + + if (p) + wake_up_process(p); + } + + if (ret) + six_acquire(&lock->dep_map, 1); + + return ret; + } + + v = READ_ONCE(lock->state.v); do { old.v 
= v; @@ -136,14 +333,6 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, return true; } -struct six_lock_waiter { - struct list_head list; - struct task_struct *task; -}; - -/* This is probably up there with the more evil things I've done */ -#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l)) - #ifdef CONFIG_LOCK_SPIN_ON_OWNER static inline int six_can_spin_on_owner(struct six_lock *lock) @@ -218,7 +407,7 @@ static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type if (owner && !six_spin_on_owner(lock, owner)) break; - if (do_six_trylock_type(lock, type)) { + if (do_six_trylock_type(lock, type, false)) { osq_unlock(&lock->osq); preempt_enable(); return true; @@ -270,18 +459,22 @@ noinline static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type, six_lock_should_sleep_fn should_sleep_fn, void *p) { - const struct six_lock_vals l[] = LOCK_VALS; - union six_lock_state old, new; + union six_lock_state old; struct six_lock_waiter wait; int ret = 0; - u64 v; + + if (type == SIX_LOCK_write) { + EBUG_ON(lock->state.write_locking); + atomic64_add(__SIX_VAL(write_locking, 1), &lock->state.counter); + smp_mb__after_atomic(); + } ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; if (ret) - return ret; + goto out_before_sleep; if (six_optimistic_spin(lock, type)) - return 0; + goto out_before_sleep; lock_contended(&lock->dep_map, _RET_IP_); @@ -298,32 +491,16 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty raw_spin_unlock(&lock->wait_lock); } - ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; - if (ret) + if (do_six_trylock_type(lock, type, false)) break; - v = READ_ONCE(lock->state.v); - do { - new.v = old.v = v; - - if (!(old.v & l[type].lock_fail)) - new.v += l[type].lock_val; - else if (!(new.waiters & (1 << type))) - new.waiters |= 1 << type; - else - break; /* waiting bit already set */ - } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, - old.v, new.v)) != old.v); - - if (!(old.v & l[type].lock_fail)) + ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; + if (ret) break; schedule(); } - if (!ret) - six_set_owner(lock, type, old); - __set_current_state(TASK_RUNNING); if (!list_empty_careful(&wait.list)) { @@ -331,6 +508,12 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty list_del_init(&wait.list); raw_spin_unlock(&lock->wait_lock); } +out_before_sleep: + if (ret && type == SIX_LOCK_write) { + old.v = atomic64_sub_return(__SIX_VAL(write_locking, 1), + &lock->state.counter); + six_lock_wakeup(lock, old, SIX_LOCK_read); + } return ret; } @@ -344,7 +527,7 @@ static int __six_lock_type(struct six_lock *lock, enum six_lock_type type, if (type != SIX_LOCK_write) six_acquire(&lock->dep_map, 0); - ret = do_six_trylock_type(lock, type) ? 0 + ret = do_six_trylock_type(lock, type, true) ? 
0 : __six_lock_type_slowpath(lock, type, should_sleep_fn, p); if (ret && type != SIX_LOCK_write) @@ -355,54 +538,12 @@ static int __six_lock_type(struct six_lock *lock, enum six_lock_type type, return ret; } -static inline void six_lock_wakeup(struct six_lock *lock, - union six_lock_state state, - unsigned waitlist_id) -{ - struct list_head *wait_list = &lock->wait_list[waitlist_id]; - struct six_lock_waiter *w, *next; - - if (waitlist_id == SIX_LOCK_write && state.read_lock) - return; - - if (!(state.waiters & (1 << waitlist_id))) - return; - - clear_bit(waitlist_bitnr(waitlist_id), - (unsigned long *) &lock->state.v); - - if (waitlist_id == SIX_LOCK_write) { - struct task_struct *p = READ_ONCE(lock->owner); - - if (p) - wake_up_process(p); - return; - } - - raw_spin_lock(&lock->wait_lock); - - list_for_each_entry_safe(w, next, wait_list, list) { - list_del_init(&w->list); - - if (wake_up_process(w->task) && - waitlist_id != SIX_LOCK_read) { - if (!list_empty(wait_list)) - set_bit(waitlist_bitnr(waitlist_id), - (unsigned long *) &lock->state.v); - break; - } - } - - raw_spin_unlock(&lock->wait_lock); -} - __always_inline __flatten static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type) { const struct six_lock_vals l[] = LOCK_VALS; union six_lock_state state; - EBUG_ON(!(lock->state.v & l[type].held_mask)); EBUG_ON(type == SIX_LOCK_write && !(lock->state.v & __SIX_LOCK_HELD_intent)); @@ -420,8 +561,18 @@ static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type) lock->owner = NULL; } - state.v = atomic64_add_return_release(l[type].unlock_val, - &lock->state.counter); + if (type == SIX_LOCK_read && + lock->readers) { + smp_mb(); /* unlock barrier */ + this_cpu_dec(*lock->readers); + smp_mb(); /* between unlocking and checking for waiters */ + state.v = READ_ONCE(lock->state.v); + } else { + EBUG_ON(!(lock->state.v & l[type].held_mask)); + state.v = atomic64_add_return_release(l[type].unlock_val, + &lock->state.counter); + } + six_lock_wakeup(lock, state, l[type].unlock_wakeup); } @@ -467,26 +618,28 @@ EXPORT_SYMBOL_GPL(six_lock_downgrade); bool six_lock_tryupgrade(struct six_lock *lock) { - const struct six_lock_vals l[] = LOCK_VALS; union six_lock_state old, new; u64 v = READ_ONCE(lock->state.v); do { new.v = old.v = v; - EBUG_ON(!(old.v & l[SIX_LOCK_read].held_mask)); - - new.v += l[SIX_LOCK_read].unlock_val; - - if (new.v & l[SIX_LOCK_intent].lock_fail) + if (new.intent_lock) return false; - new.v += l[SIX_LOCK_intent].lock_val; + if (!lock->readers) { + EBUG_ON(!new.read_lock); + new.read_lock--; + } + + new.intent_lock = 1; } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, old.v, new.v)) != old.v); + if (lock->readers) + this_cpu_dec(*lock->readers); + six_set_owner(lock, SIX_LOCK_intent, old); - six_lock_wakeup(lock, new, l[SIX_LOCK_read].unlock_wakeup); return true; } @@ -518,16 +671,22 @@ void six_lock_increment(struct six_lock *lock, enum six_lock_type type) { const struct six_lock_vals l[] = LOCK_VALS; - EBUG_ON(type == SIX_LOCK_write); six_acquire(&lock->dep_map, 0); /* XXX: assert already locked, and that we don't overflow: */ switch (type) { case SIX_LOCK_read: - atomic64_add(l[type].lock_val, &lock->state.counter); + if (lock->readers) { + this_cpu_inc(*lock->readers); + } else { + EBUG_ON(!lock->state.read_lock && + !lock->state.intent_lock); + atomic64_add(l[type].lock_val, &lock->state.counter); + } break; case SIX_LOCK_intent: + EBUG_ON(!lock->state.intent_lock); lock->intent_lock_recurse++; break; case SIX_LOCK_write: @@ 
-551,3 +710,50 @@ void six_lock_wakeup_all(struct six_lock *lock) raw_spin_unlock(&lock->wait_lock); } EXPORT_SYMBOL_GPL(six_lock_wakeup_all); + +struct free_pcpu_rcu { + struct rcu_head rcu; + void __percpu *p; +}; + +static void free_pcpu_rcu_fn(struct rcu_head *_rcu) +{ + struct free_pcpu_rcu *rcu = + container_of(_rcu, struct free_pcpu_rcu, rcu); + + free_percpu(rcu->p); + kfree(rcu); +} + +void six_lock_pcpu_free_rcu(struct six_lock *lock) +{ + struct free_pcpu_rcu *rcu = kzalloc(sizeof(*rcu), GFP_KERNEL); + + if (!rcu) + return; + + rcu->p = lock->readers; + lock->readers = NULL; + + call_rcu(&rcu->rcu, free_pcpu_rcu_fn); +} +EXPORT_SYMBOL_GPL(six_lock_pcpu_free_rcu); + +void six_lock_pcpu_free(struct six_lock *lock) +{ + BUG_ON(lock->readers && pcpu_read_count(lock)); + BUG_ON(lock->state.read_lock); + + free_percpu(lock->readers); + lock->readers = NULL; +} +EXPORT_SYMBOL_GPL(six_lock_pcpu_free); + +void six_lock_pcpu_alloc(struct six_lock *lock) +{ +#ifdef __KERNEL__ + if (!lock->readers) + lock->readers = alloc_percpu(unsigned); +#endif +} +EXPORT_SYMBOL_GPL(six_lock_pcpu_alloc); diff --git a/tools-util.c b/tools-util.c index 88e923c..361419a 100644 --- a/tools-util.c +++ b/tools-util.c @@ -663,3 +663,22 @@ int dev_mounted(char *dev) return 1; return 2; } + +struct bpos bpos_parse(char *buf) +{ + char *s = buf, *field; + u64 inode_v = 0, offset_v = 0; + + if (!(field = strsep(&s, ":")) || + kstrtoull(field, 10, &inode_v)) + die("invalid bpos %s", buf); + + if ((field = strsep(&s, ":")) && + kstrtoull(field, 10, &offset_v)) + die("invalid bpos %s", buf); + + if (s) + die("invalid bpos %s", buf); + + return (struct bpos) { .inode = inode_v, .offset = offset_v }; +} diff --git a/tools-util.h b/tools-util.h index d6814bc..01898e2 100644 --- a/tools-util.h +++ b/tools-util.h @@ -172,4 +172,6 @@ do { \ _ret; \ }) +struct bpos bpos_parse(char *); + #endif /* _TOOLS_UTIL_H */ -- 2.39.2
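A short sketch of the input forms accepted by the new bpos_parse() helper (illustrative values; a missing offset defaults to 0 and anything after a second ':' is rejected):

	struct bpos a = bpos_parse("4096:128");	/* .inode = 4096, .offset = 128 */
	struct bpos b = bpos_parse("4096");	/* .inode = 4096, .offset = 0 */
	/* "4096:128:1" or a non-numeric field would die("invalid bpos ...") */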