CFLAGS+=-std=gnu89 -O2 -g -MMD -Wall \
-Wno-pointer-sign \
-Wno-zero-length-bounds \
+ -Wno-stringop-overflow \
-fno-strict-aliasing \
-fno-delete-null-pointer-checks \
-I. -Iinclude -Iraid \
git add linux/six.c
cp $(LINUX_DIR)/include/linux/six.h include/linux/
git add include/linux/six.h
+ cp $(LINUX_DIR)/include/linux/list_nulls.h include/linux/
+ git add include/linux/list_nulls.h
+ cp $(LINUX_DIR)/include/linux/poison.h include/linux/
+ git add include/linux/poison.h
$(RM) libbcachefs/*.mod.c
git -C $(LINUX_DIR) rev-parse HEAD | tee .bcachefs_revision
git add .bcachefs_revision
"\n"
"Commands for managing filesystem data:\n"
" data rereplicate Rereplicate degraded data\n"
+ " data job Kick off low level data jobs\n"
"\n"
"Encryption:\n"
" unlock Unlock an encrypted filesystem prior to running/mounting\n"
return cmd_device_set_state(argc, argv);
if (!strcmp(cmd, "resize"))
return cmd_device_resize(argc, argv);
+ if (!strcmp(cmd, "resize-journal"))
+ return cmd_device_resize_journal(argc, argv);
usage();
return 0;
if (!strcmp(cmd, "rereplicate"))
return cmd_data_rereplicate(argc, argv);
+ if (!strcmp(cmd, "job"))
+ return cmd_data_job(argc, argv);
usage();
return 0;
#include <sys/ioctl.h>
#include "libbcachefs/bcachefs_ioctl.h"
+#include "libbcachefs/btree_cache.h"
#include "cmds.h"
#include "libbcachefs.h"
die("too many arguments");
return bchu_data(bcache_fs_open(fs_path), (struct bch_ioctl_data) {
- .op = BCH_DATA_OP_REREPLICATE,
- .start = POS_MIN,
- .end = POS_MAX,
+ .op = BCH_DATA_OP_REREPLICATE,
+ .start_btree = 0,
+ .start_pos = POS_MIN,
+ .end_btree = BTREE_ID_NR,
+ .end_pos = POS_MAX,
});
}
+
+static void data_job_usage(void)
+{
+ puts("bcachefs data job\n"
+ "Usage: bcachefs data job [job} filesystem\n"
+ "\n"
+ "Kick off a data job and report progress\n"
+ "\n"
+ "job: one of scrub, rereplicate, migrate, or rewrite_old_nodes\n"
+ "\n"
+ "Options:\n"
+ " -b btree btree to operate on\n"
+ " -s inode:offset start position\n"
+ " -e inode:offset end position\n"
+ " -h, --help display this help and exit\n"
+ "Report bugs to <linux-bcache@vger.kernel.org>");
+ exit(EXIT_SUCCESS);
+}
+
+const char * const data_jobs[] = {
+ "scrub",
+ "rereplicate",
+ "migrate",
+ "rewrite_old_nodes",
+ NULL
+};
+
+int cmd_data_job(int argc, char *argv[])
+{
+ struct bch_ioctl_data op = {
+ .start_btree = 0,
+ .start_pos = POS_MIN,
+ .end_btree = BTREE_ID_NR,
+ .end_pos = POS_MAX,
+ };
+ int opt;
+
+ while ((opt = getopt(argc, argv, "b:s:e:h")) != -1)
+ switch (opt) {
+ case 'b':
+ op.start_btree = read_string_list_or_die(optarg,
+ bch2_btree_ids, "btree id");
+ op.end_btree = op.start_btree;
+ break;
+ case 's':
+ op.start_pos = bpos_parse(optarg);
+ break;
+ case 'e':
+ op.end_pos = bpos_parse(optarg);
+ break;
+ case 'h':
+ data_job_usage();
+ }
+ args_shift(optind);
+
+ char *job = arg_pop();
+ if (!job)
+ die("please specify which type of job");
+
+ op.op = read_string_list_or_die(job, data_jobs, "bad job type");
+
+ if (op.op == BCH_DATA_OP_SCRUB)
+ die("scrub not implemented yet");
+
+ char *fs_path = arg_pop();
+ if (!fs_path)
+ fs_path = ".";
+
+ if (argc)
+ die("too many arguments");
+
+ return bchu_data(bcache_fs_open(fs_path), op);
+}
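+
+/*
+ * Example invocation (illustrative only; device, mount point and btree name
+ * are assumptions):
+ *
+ *   bcachefs data job -b extents rereplicate /mnt
+ *
+ * kicks off a rereplicate pass restricted to the extents btree of the
+ * filesystem mounted at /mnt.
+ */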
opt_set(opts, nochanges, true);
opt_set(opts, norecovery, true);
opt_set(opts, degraded, true);
- opt_set(opts, errors, BCH_ON_ERROR_CONTINUE);
+ opt_set(opts, errors, BCH_ON_ERROR_continue);
opt_set(opts, fix_errors, FSCK_OPT_YES);
while ((opt = getopt(argc, argv, "o:fvh")) != -1)
sectors = vstruct_sectors(bne, c->block_bits);
}
- fprintf(stdout, " offset %u journal seq %llu\n",
- offset, le64_to_cpu(i->journal_seq));
+ fprintf(stdout, " offset %u version %u, journal seq %llu\n",
+ offset,
+ le16_to_cpu(i->version),
+ le64_to_cpu(i->journal_seq));
offset += sectors;
- for (k = i->start;
- k != vstruct_last(i);
- k = bkey_next_skip_noops(k, vstruct_last(i))) {
+ for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) {
struct bkey u;
char buf[4096];
bch2_trans_exit(&trans);
}
-static struct bpos parse_pos(char *buf)
-{
- char *s = buf, *field;
- u64 inode_v = 0, offset_v = 0;
-
- if (!(field = strsep(&s, ":")) ||
- kstrtoull(field, 10, &inode_v))
- die("invalid bpos %s", buf);
-
- if ((field = strsep(&s, ":")) &&
- kstrtoull(field, 10, &offset_v))
- die("invalid bpos %s", buf);
-
- if (s)
- die("invalid bpos %s", buf);
-
- return (struct bpos) { .inode = inode_v, .offset = offset_v };
-}
-
static void list_keys_usage(void)
{
puts("bcachefs list - list filesystem metadata to stdout\n"
opt_set(opts, nochanges, true);
opt_set(opts, norecovery, true);
opt_set(opts, degraded, true);
- opt_set(opts, errors, BCH_ON_ERROR_CONTINUE);
+ opt_set(opts, errors, BCH_ON_ERROR_continue);
while ((opt = getopt(argc, argv, "b:s:e:i:m:fvh")) != -1)
switch (opt) {
btree_id_end = btree_id_start + 1;
break;
case 's':
- start = parse_pos(optarg);
+ start = bpos_parse(optarg);
break;
case 'e':
- end = parse_pos(optarg);
+ end = bpos_parse(optarg);
break;
case 'i':
if (kstrtoull(optarg, 10, &inum))
opt_set(opts, nochanges, true);
opt_set(opts, norecovery, true);
opt_set(opts, degraded, true);
- opt_set(opts, errors, BCH_ON_ERROR_CONTINUE);
+ opt_set(opts, errors, BCH_ON_ERROR_continue);
opt_set(opts, fix_errors, FSCK_OPT_YES);
opt_set(opts, keep_journal, true);
printf("journal entry %8llu\n"
" version %8u\n"
" last seq %8llu\n"
- " read clock %8u\n"
- " write clock %8u\n"
,
le64_to_cpu(p->j.seq),
- le32_to_cpu(p->j.seq),
- le64_to_cpu(p->j.last_seq),
- le16_to_cpu(p->j.read_clock),
- le16_to_cpu(p->j.write_clock));
+ le32_to_cpu(p->j.version),
+ le64_to_cpu(p->j.last_seq));
for_each_jset_key(k, _n, entry, &p->j) {
char buf[200];
#include <sys/types.h>
#include <unistd.h>
+#include "libbcachefs/bcachefs.h"
#include "libbcachefs/bcachefs_ioctl.h"
+#include "libbcachefs/journal.h"
#include "libbcachefs/super-io.h"
#include "cmds.h"
#include "libbcachefs.h"
{
puts("bcachefs device_remove - remove a device from a filesystem\n"
"Usage:\n"
- " bcachefs device remove device\n"
- " bcachefs device remove --by-id path devid\n"
+ " bcachefs device remove <device>|<devid> <path>\n"
"\n"
"Options:\n"
- " -i, --by-id Remove device by device id\n"
" -f, --force Force removal, even if some data\n"
" couldn't be migrated\n"
" -F, --force-metadata Force removal, even if some metadata\n"
};
struct bchfs_handle fs;
bool by_id = false;
- int opt, flags = BCH_FORCE_IF_DEGRADED;
- unsigned dev_idx;
+ int opt, flags = BCH_FORCE_IF_DEGRADED, dev_idx;
while ((opt = getopt_long(argc, argv, "fh", longopts, NULL)) != -1)
switch (opt) {
- case 'i':
- by_id = true;
- break;
case 'f':
flags |= BCH_FORCE_IF_DATA_LOST;
break;
}
args_shift(optind);
- if (by_id) {
- char *path = arg_pop();
- if (!path)
- die("Please supply filesystem to remove device from");
+ char *dev_str = arg_pop();
+ if (!dev_str)
+ die("Please supply a device");
- dev_idx = (intptr_t) arg_pop();
- if (!dev_idx)
- die("Please supply device id");
+ char *end;
+ dev_idx = strtoul(dev_str, &end, 10);
+ if (*dev_str && !*end)
+ by_id = true;
- fs = bcache_fs_open(path);
+ char *fs_path = arg_pop();
+ if (fs_path) {
+ fs = bcache_fs_open(fs_path);
+
+ if (!by_id) {
+ dev_idx = bchu_dev_path_to_idx(fs, dev_str);
+ if (dev_idx < 0)
+ die("%s does not seem to be a member of %s",
+ dev_str, fs_path);
+ }
+ } else if (!by_id) {
+ fs = bchu_fs_open_by_dev(dev_str, &dev_idx);
} else {
- char *dev = arg_pop();
- if (!dev)
- die("Please supply a device to remove");
-
- fs = bchu_fs_open_by_dev(dev, &dev_idx);
+ die("Filesystem path required when specifying device by id");
}
- if (argc)
- die("too many arguments");
-
bchu_disk_remove(fs, dev_idx, flags);
return 0;
}
if (argc)
die("too many arguments");
- unsigned dev_idx;
+ int dev_idx;
struct bchfs_handle fs = bchu_fs_open_by_dev(dev, &dev_idx);
bchu_disk_online(fs, dev);
return 0;
if (argc)
die("too many arguments");
- unsigned dev_idx;
+ int dev_idx;
struct bchfs_handle fs = bchu_fs_open_by_dev(dev, &dev_idx);
bchu_disk_offline(fs, dev_idx, flags);
return 0;
if (argc)
die("too many arguments");
- unsigned dev_idx;
+ int dev_idx;
struct bchfs_handle fs = bchu_fs_open_by_dev(dev_path, &dev_idx);
struct bch_ioctl_dev_usage u = bchu_dev_usage(fs, dev_idx);
- if (u.state == BCH_MEMBER_STATE_RW) {
+ if (u.state == BCH_MEMBER_STATE_rw) {
printf("Setting %s readonly\n", dev_path);
- bchu_disk_set_state(fs, dev_idx, BCH_MEMBER_STATE_RO, 0);
+ bchu_disk_set_state(fs, dev_idx, BCH_MEMBER_STATE_ro, 0);
}
return bchu_data(fs, (struct bch_ioctl_data) {
.op = BCH_DATA_OP_MIGRATE,
- .start = POS_MIN,
- .end = POS_MAX,
+ .start_btree = 0,
+ .start_pos = POS_MIN,
+ .end_btree = BTREE_ID_NR,
+ .end_pos = POS_MAX,
.migrate.dev = dev_idx,
});
}
static void device_set_state_usage(void)
{
puts("bcachefs device set-state\n"
- "Usage: bcachefs device set-state device new-state\n"
+ "Usage: bcachefs device set-state <new-state> <device>|<devid> <path>\n"
+ "\n"
+ "<new-state>: one of rw, ro, failed or spare\n"
+ "<path>: path to mounted filesystem, optional unless specifying device by id\n"
"\n"
"Options:\n"
" -f, --force Force, if data redundancy will be degraded\n"
{ "help", 0, NULL, 'h' },
{ NULL }
};
- int opt, flags = 0;
+ struct bchfs_handle fs;
+ bool by_id = false;
+ int opt, flags = 0, dev_idx;
bool offline = false;
while ((opt = getopt_long(argc, argv, "foh", longopts, NULL)) != -1)
}
args_shift(optind);
- char *dev_path = arg_pop();
- if (!dev_path)
- die("Please supply a device");
-
char *new_state_str = arg_pop();
if (!new_state_str)
die("Please supply a device state");
unsigned new_state = read_string_list_or_die(new_state_str,
- bch2_dev_state, "device state");
+ bch2_member_states, "device state");
- if (!offline) {
- unsigned dev_idx;
- struct bchfs_handle fs = bchu_fs_open_by_dev(dev_path, &dev_idx);
+ char *dev_str = arg_pop();
+ if (!dev_str)
+ die("Please supply a device");
- bchu_disk_set_state(fs, dev_idx, new_state, flags);
+ char *end;
+ dev_idx = strtoul(dev_str, &end, 10);
+ if (*dev_str && !*end)
+ by_id = true;
- bcache_fs_close(fs);
- } else {
+ if (offline) {
struct bch_opts opts = bch2_opts_empty();
struct bch_sb_handle sb = { NULL };
- int ret = bch2_read_super(dev_path, &opts, &sb);
+ if (by_id)
+ die("Cannot specify offline device by id");
+
+ int ret = bch2_read_super(dev_str, &opts, &sb);
if (ret)
- die("error opening %s: %s", dev_path, strerror(-ret));
+ die("error opening %s: %s", dev_str, strerror(-ret));
struct bch_member *m = bch2_sb_get_members(sb.sb)->members + sb.sb->dev_idx;
bch2_super_write(sb.bdev->bd_fd, sb.sb);
bch2_free_super(&sb);
+ return 0;
+ }
+
+ char *fs_path = arg_pop();
+ if (fs_path) {
+ fs = bcache_fs_open(fs_path);
+
+ if (!by_id) {
+ dev_idx = bchu_dev_path_to_idx(fs, dev_str);
+ if (dev_idx < 0)
+ die("%s does not seem to be a member of %s",
+ dev_str, fs_path);
+ }
+ } else if (!by_id) {
+ fs = bchu_fs_open_by_dev(dev_str, &dev_idx);
+ } else {
+ die("Filesystem path required when specifying device by id");
}
+ bchu_disk_set_state(fs, dev_idx, new_state, flags);
+
return 0;
}
}
return 0;
}
+
+static void device_resize_journal_usage(void)
+{
+ puts("bcachefs device resize-journal \n"
+ "Usage: bcachefs device resize-journal device [ size ]\n"
+ "\n"
+ "Options:\n"
+ " -h, --help display this help and exit\n"
+ "Report bugs to <linux-bcache@vger.kernel.org>");
+ exit(EXIT_SUCCESS);
+}
+
+int cmd_device_resize_journal(int argc, char *argv[])
+{
+ static const struct option longopts[] = {
+ { "help", 0, NULL, 'h' },
+ { NULL }
+ };
+ u64 size;
+ int opt;
+
+ while ((opt = getopt_long(argc, argv, "h", longopts, NULL)) != -1)
+ switch (opt) {
+ case 'h':
+ device_resize_journal_usage();
+ }
+ args_shift(optind);
+
+ char *dev = arg_pop();
+ if (!dev)
+ die("Please supply a device");
+
+ int dev_fd = xopen(dev, O_RDONLY);
+
+ char *size_arg = arg_pop();
+ if (!size_arg)
+ size = get_size(dev, dev_fd);
+ else if (bch2_strtoull_h(size_arg, &size))
+ die("invalid size");
+
+ size >>= 9;
+
+ if (argc)
+ die("Too many arguments");
+
+ struct stat dev_stat = xfstat(dev_fd);
+
+ struct mntent *mount = dev_to_mount(dev);
+ if (mount) {
+ if (!S_ISBLK(dev_stat.st_mode))
+ die("%s is mounted but isn't a block device?!", dev);
+
+ struct bchfs_handle fs = bcache_fs_open(mount->mnt_dir);
+
+ unsigned idx = bchu_disk_get_idx(fs, dev_stat.st_rdev);
+
+ struct bch_sb *sb = bchu_read_super(fs, -1);
+ if (idx >= sb->nr_devices)
+ die("error reading superblock: dev idx >= sb->nr_devices");
+
+ struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
+ if (!mi)
+ die("error reading superblock: no member info");
+
+ /* could also just read this out of sysfs... meh */
+ struct bch_member *m = mi->members + idx;
+
+ u64 nbuckets = size / le16_to_cpu(m->bucket_size);
+
+ printf("resizing journal on %s to %llu buckets\n", dev, nbuckets);
+ bchu_disk_resize_journal(fs, idx, nbuckets);
+ } else {
+ printf("%s is offline - starting:\n", dev);
+
+ struct bch_fs *c = bch2_fs_open(&dev, 1, bch2_opts_empty());
+ if (IS_ERR(c))
+ die("error opening %s: %s", dev, strerror(-PTR_ERR(c)));
+
+ struct bch_dev *ca, *resize = NULL;
+ unsigned i;
+
+ for_each_online_member(ca, c, i) {
+ if (resize)
+ die("confused: more than one online device?");
+ resize = ca;
+ percpu_ref_get(&resize->io_ref);
+ }
+
+ u64 nbuckets = size / le16_to_cpu(resize->mi.bucket_size);
+
+ printf("resizing journal on %s to %llu buckets\n", dev, nbuckets);
+ int ret = bch2_set_nr_journal_buckets(c, resize, nbuckets);
+ if (ret)
+ fprintf(stderr, "resize error: %s\n", strerror(-ret));
+
+ percpu_ref_put(&resize->io_ref);
+ bch2_fs_stop(c);
+ }
+ return 0;
+}
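+
+/*
+ * Example (illustrative only; device name and bucket size are assumptions):
+ * on a mounted member device with 512 KiB buckets,
+ *
+ *   bcachefs device resize-journal /dev/sdb 2G
+ *
+ * converts 2 GiB to 4194304 sectors and resizes the journal to
+ * 4194304 / 1024 = 4096 buckets.
+ */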
x('L', label, required_argument) \
x('U', uuid, required_argument) \
x(0, fs_size, required_argument) \
+x(0, superblock_size, required_argument) \
x(0, bucket_size, required_argument) \
x('g', group, required_argument) \
x(0, discard, no_argument) \
x(0, data_allowed, required_argument) \
x(0, durability, required_argument) \
+x(0, version, required_argument) \
+x(0, no_initialize, no_argument) \
x('f', force, no_argument) \
x('q', quiet, no_argument) \
x('h', help, no_argument)
" --no_passphrase Don't encrypt master encryption key\n"
" -L, --label=label\n"
" -U, --uuid=uuid\n"
+ " --superblock_size=size\n"
"\n"
"Device specific options:");
darray(char *) device_paths;
struct format_opts opts = format_opts_default();
struct dev_opts dev_opts = dev_opts_default(), *dev;
- bool force = false, no_passphrase = false, quiet = false;
+ bool force = false, no_passphrase = false, quiet = false, initialize = true;
unsigned v;
int opt;
dev_opts.size >>= 9;
break;
+ case O_superblock_size:
+ if (bch2_strtouint_h(optarg, &opts.superblock_size))
+ die("invalid filesystem size");
+
+ opts.superblock_size >>= 9;
+ break;
case O_bucket_size:
dev_opts.bucket_size =
hatoi_validate(optarg, "bucket size");
dev_opts.durability > BCH_REPLICAS_MAX)
die("invalid durability");
break;
+ case O_version:
+ if (kstrtouint(optarg, 10, &opts.version))
+ die("invalid version");
+ break;
+ case O_no_initialize:
+ initialize = false;
+ break;
case O_no_opt:
darray_append(device_paths, optarg);
dev_opts.path = optarg;
if (darray_empty(devices))
die("Please supply a device");
- if (opts.encrypted && !no_passphrase)
+ if (opts.encrypted && !no_passphrase) {
opts.passphrase = read_passphrase_twice("Enter passphrase: ");
+ initialize = false;
+ }
darray_foreach(dev, devices)
dev->fd = open_for_format(dev->path, force);
darray_free(devices);
- if (!opts.passphrase) {
+ if (initialize) {
/*
* Start the filesystem once, to allocate the journal and create
* the root directory:
u64 frag = max((s64) buckets * bucket_size - (s64) sectors, 0LL);
printf_pad(20, " %s:", type);
- printf("%12s%12llu%12s\n",
+ printf(" %15s %15llu %15s\n",
pr_units(sectors, units),
buckets,
pr_units(frag, units));
printf("\n");
printf_pad(20, "%s (device %u):", d->label ?: "(no label)", d->idx);
- printf("%24s%12s\n", d->dev ?: "(device not found)", bch2_dev_state[u.state]);
+ printf("%30s%16s\n", d->dev ?: "(device not found)", bch2_member_states[u.state]);
- printf("%-20s%12s%12s%12s\n",
+ printf("%-20s%16s%16s%16s\n",
"", "data", "buckets", "fragmented");
- for (i = BCH_DATA_sb; i < BCH_DATA_NR; i++) {
+ for (i = BCH_DATA_sb; i < BCH_DATA_NR; i++)
print_dev_usage_type(bch2_data_types[i],
u.bucket_size,
u.buckets[i],
u.sectors[i],
units);
- }
print_dev_usage_type("erasure coded",
u.bucket_size,
units);
printf_pad(20, " available:");
- printf("%12s%12llu\n",
+ printf(" %15s %15llu\n",
pr_units(u.available_buckets * u.bucket_size, units),
u.available_buckets);
printf_pad(20, " capacity:");
- printf("%12s%12llu\n",
+ printf(" %15s %15llu\n",
pr_units(u.nr_buckets * u.bucket_size, units),
u.nr_buckets);
}
exit(8);
}
- if (test_bit(BCH_FS_ERRORS_FIXED, &c->flags))
+ if (test_bit(BCH_FS_ERRORS_FIXED, &c->flags)) {
+ fprintf(stderr, "%s: errors fixed\n", c->name);
ret |= 1;
- if (test_bit(BCH_FS_ERROR, &c->flags))
+ }
+ if (test_bit(BCH_FS_ERROR, &c->flags)) {
+ fprintf(stderr, "%s: still has errors\n", c->name);
ret |= 4;
+ }
bch2_fs_stop(c);
return ret;
struct bkey_inode_buf packed;
int ret;
- bch2_inode_pack(&packed, inode);
- ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i,
+ bch2_inode_pack(c, &packed, inode);
+ ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i,
NULL, NULL, 0);
if (ret)
die("error updating inode: %s", strerror(-ret));
while (length) {
struct bkey_i_extent *e;
- BKEY_PADDED(k) k;
+ __BKEY_PADDED(k, BKEY_EXTENT_VAL_U64s_MAX) k;
u64 b = sector_to_bucket(ca, physical);
struct disk_reservation res;
unsigned sectors;
bch2_mark_bkey_replicas(c, extent_i_to_s_c(e).s_c);
- ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &e->k_i,
+ ret = bch2_btree_insert(c, BTREE_ID_extents, &e->k_i,
&res, NULL, 0);
if (ret)
die("btree insert error %s", strerror(-ret));
bch2_alloc_write(c, false);
}
-static void find_superblock_space(ranges extents, struct dev_opts *dev)
+static void find_superblock_space(ranges extents,
+ struct format_opts opts,
+ struct dev_opts *dev)
{
struct range *i;
u64 end = round_down(i->end,
dev->bucket_size << 9);
- if (start + (128 << 10) <= end) {
+ /* Need space for two superblocks: */
+ if (start + (opts.superblock_size << 9) * 2 <= end) {
dev->sb_offset = start >> 9;
- dev->sb_end = dev->sb_offset + 256;
+ dev->sb_end = dev->sb_offset + opts.superblock_size * 2;
return;
}
}
get_size(dev.path, dev.fd) / 5,
&bcachefs_inum, stat.st_dev, force);
- find_superblock_space(extents, &dev);
+ find_superblock_space(extents, format_opts, &dev);
struct bch_sb *sb = bch2_format(fs_opt_strs,
fs_opts,format_opts, &dev, 1);
int cmd_device_evacuate(int argc, char *argv[]);
int cmd_device_set_state(int argc, char *argv[]);
int cmd_device_resize(int argc, char *argv[]);
+int cmd_device_resize_journal(int argc, char *argv[]);
int cmd_data_rereplicate(int argc, char *argv[]);
+int cmd_data_job(int argc, char *argv[]);
int cmd_unlock(int argc, char *argv[]);
int cmd_set_passphrase(int argc, char *argv[]);
+bcachefs-tools (0.1+git20210404.ce906d66-1) UNRELEASED; urgency=medium
+
+ * New upstream snapshot
+ * Update standards version to 4.5.1
+
+ Currently unreleased due to test failures.
+
+ -- Jonathan Carter <jcc@debian.org> Tue, 06 Apr 2021 15:11:27 +0200
+
bcachefs-tools (0.1+git20201025.742dbbdb-1) unstable; urgency=medium
* New upstream snapshot
Maintainer: Jonathan Carter <jcc@debian.org>
Section: utils
Priority: optional
-Standards-Version: 4.5.0
+Standards-Version: 4.5.1
Rules-Requires-Root: no
Build-Depends: debhelper-compat (= 13),
pkg-config,
-bcachefs-tools_0.1+git20201025.742dbbdb-1_source.buildinfo utils optional
+bcachefs-tools_0.1+git20210404.ce906d66-1_source.buildinfo utils optional
return (old & mask) != 0;
}
+static inline bool test_and_clear_bit(long nr, volatile unsigned long *addr)
+{
+ unsigned long mask = BIT_MASK(nr);
+ unsigned long *p = ((unsigned long *) addr) + BIT_WORD(nr);
+ unsigned long old;
+
+ old = __atomic_fetch_and(p, ~mask, __ATOMIC_RELAXED);
+
+ return (old & mask) != 0;
+}
+
static inline void clear_bit_unlock(long nr, volatile unsigned long *addr)
{
unsigned long mask = BIT_MASK(nr);
#define cpu_present(cpu) ((cpu) == 0)
#define cpu_active(cpu) ((cpu) == 0)
+#define raw_smp_processor_id() 0U
+
#define for_each_cpu(cpu, mask) \
for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
#define for_each_cpu_not(cpu, mask) \
static inline void __genradix_iter_advance(struct genradix_iter *iter,
size_t obj_size)
{
+ size_t new_offset = iter->offset + obj_size;
+
+ if (new_offset < iter->offset) {
+ iter->offset = SIZE_MAX;
+ iter->pos = SIZE_MAX;
+ return;
+ }
+
iter->offset += obj_size;
if (!is_power_of_2(obj_size) &&
#define POISON_FREE 0x6b
+static inline void dump_stack(void) {}
+
#endif
#define list_for_each_entry(p, h, m) cds_list_for_each_entry(p, h, m)
#define list_for_each_entry_reverse(p, h, m) cds_list_for_each_entry_reverse(p, h, m)
#define list_for_each_entry_safe(p, n, h, m) cds_list_for_each_entry_safe(p, n, h, m)
-#define list_for_each_entry_safe_reverse(p, n, h, m) cds_list_for_each_entry_safe_reverse(p, n, h, m)
static inline int list_empty_careful(const struct list_head *head)
{
#define list_first_entry_or_null(ptr, type, member) \
(!list_empty(ptr) ? list_first_entry(ptr, type, member) : NULL)
+#define list_prev_entry(pos, member) \
+ list_entry((pos)->member.prev, typeof(*(pos)), member)
+
+#define list_for_each_entry_safe_reverse(pos, n, head, member) \
+ for (pos = list_last_entry(head, typeof(*pos), member), \
+ n = list_prev_entry(pos, member); \
+ &pos->member != (head); \
+ pos = n, n = list_prev_entry(n, member))
+
/* hlists: */
#include <urcu/hlist.h>
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_LIST_NULLS_H
+#define _LINUX_LIST_NULLS_H
+
+#include <linux/poison.h>
+#include <linux/const.h>
+
+/*
+ * Special version of lists, where end of list is not a NULL pointer,
+ * but a 'nulls' marker, which can have many different values.
+ * (up to 2^31 different values guaranteed on all platforms)
+ *
+ * In the standard hlist, termination of a list is the NULL pointer.
+ * In this special 'nulls' variant, we use the fact that objects stored in
+ * a list are aligned on a word (4 or 8 bytes alignment).
+ * We therefore use the least significant bit of 'ptr' :
+ * Set to 1 : This is a 'nulls' end-of-list marker (ptr >> 1)
+ * Set to 0 : This is a pointer to some object (ptr)
+ */
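+
+/*
+ * Illustrative example (not part of the original header): a head initialized
+ * with INIT_HLIST_NULLS_HEAD(&head, 7) stores NULLS_MARKER(7), i.e.
+ * (7UL << 1) | 1, in ->first; is_a_nulls() sees bit 0 set and
+ * get_nulls_value() recovers 7.
+ */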
+
+struct hlist_nulls_head {
+ struct hlist_nulls_node *first;
+};
+
+struct hlist_nulls_node {
+ struct hlist_nulls_node *next, **pprev;
+};
+#define NULLS_MARKER(value) (1UL | (((long)value) << 1))
+#define INIT_HLIST_NULLS_HEAD(ptr, nulls) \
+ ((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls))
+
+#define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member)
+
+#define hlist_nulls_entry_safe(ptr, type, member) \
+ ({ typeof(ptr) ____ptr = (ptr); \
+ !is_a_nulls(____ptr) ? hlist_nulls_entry(____ptr, type, member) : NULL; \
+ })
+/**
+ * is_a_nulls - Test if a ptr is a nulls marker
+ * @ptr: ptr to be tested
+ *
+ */
+static inline int is_a_nulls(const struct hlist_nulls_node *ptr)
+{
+ return ((unsigned long)ptr & 1);
+}
+
+/**
+ * get_nulls_value - Get the 'nulls' value of the end of chain
+ * @ptr: end of chain
+ *
+ * Should be called only if is_a_nulls(ptr);
+ */
+static inline unsigned long get_nulls_value(const struct hlist_nulls_node *ptr)
+{
+ return ((unsigned long)ptr) >> 1;
+}
+
+/**
+ * hlist_nulls_unhashed - Has node been removed and reinitialized?
+ * @h: Node to be checked
+ *
+ * Note that not all removal functions will leave a node in unhashed state.
+ * For example, hlist_del_init_rcu() leaves the node in unhashed state,
+ * but hlist_nulls_del() does not.
+ */
+static inline int hlist_nulls_unhashed(const struct hlist_nulls_node *h)
+{
+ return !h->pprev;
+}
+
+/**
+ * hlist_nulls_unhashed_lockless - Has node been removed and reinitialized?
+ * @h: Node to be checked
+ *
+ * Note that not all removal functions will leave a node in unhashed state.
+ * For example, hlist_del_init_rcu() leaves the node in unhashed state,
+ * but hlist_nulls_del() does not. Unlike hlist_nulls_unhashed(), this
+ * function may be used locklessly.
+ */
+static inline int hlist_nulls_unhashed_lockless(const struct hlist_nulls_node *h)
+{
+ return !READ_ONCE(h->pprev);
+}
+
+static inline int hlist_nulls_empty(const struct hlist_nulls_head *h)
+{
+ return is_a_nulls(READ_ONCE(h->first));
+}
+
+static inline void hlist_nulls_add_head(struct hlist_nulls_node *n,
+ struct hlist_nulls_head *h)
+{
+ struct hlist_nulls_node *first = h->first;
+
+ n->next = first;
+ WRITE_ONCE(n->pprev, &h->first);
+ h->first = n;
+ if (!is_a_nulls(first))
+ WRITE_ONCE(first->pprev, &n->next);
+}
+
+static inline void __hlist_nulls_del(struct hlist_nulls_node *n)
+{
+ struct hlist_nulls_node *next = n->next;
+ struct hlist_nulls_node **pprev = n->pprev;
+
+ WRITE_ONCE(*pprev, next);
+ if (!is_a_nulls(next))
+ WRITE_ONCE(next->pprev, pprev);
+}
+
+static inline void hlist_nulls_del(struct hlist_nulls_node *n)
+{
+ __hlist_nulls_del(n);
+ WRITE_ONCE(n->pprev, LIST_POISON2);
+}
+
+/**
+ * hlist_nulls_for_each_entry - iterate over list of given type
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct hlist_node to use as a loop cursor.
+ * @head: the head for your list.
+ * @member: the name of the hlist_node within the struct.
+ *
+ */
+#define hlist_nulls_for_each_entry(tpos, pos, head, member) \
+ for (pos = (head)->first; \
+ (!is_a_nulls(pos)) && \
+ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \
+ pos = pos->next)
+
+/**
+ * hlist_nulls_for_each_entry_from - iterate over a hlist continuing from current point
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct hlist_node to use as a loop cursor.
+ * @member: the name of the hlist_node within the struct.
+ *
+ */
+#define hlist_nulls_for_each_entry_from(tpos, pos, member) \
+ for (; (!is_a_nulls(pos)) && \
+ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \
+ pos = pos->next)
+
+#endif
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+#ifndef __LINUX_OVERFLOW_H
+#define __LINUX_OVERFLOW_H
+
+#include <linux/compiler.h>
+#include <linux/limits.h>
+
+/*
+ * In the fallback code below, we need to compute the minimum and
+ * maximum values representable in a given type. These macros may also
+ * be useful elsewhere, so we provide them outside the
+ * COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW block.
+ *
+ * It would seem more obvious to do something like
+ *
+ * #define type_min(T) (T)(is_signed_type(T) ? (T)1 << (8*sizeof(T)-1) : 0)
+ * #define type_max(T) (T)(is_signed_type(T) ? ((T)1 << (8*sizeof(T)-1)) - 1 : ~(T)0)
+ *
+ * Unfortunately, the middle expressions, strictly speaking, have
+ * undefined behaviour, and at least some versions of gcc warn about
+ * the type_max expression (but not if -fsanitize=undefined is in
+ * effect; in that case, the warning is deferred to runtime...).
+ *
+ * The slightly excessive casting in type_min is to make sure the
+ * macros also produce sensible values for the exotic type _Bool. [The
+ * overflow checkers only almost work for _Bool, but that's
+ * a-feature-not-a-bug, since people shouldn't be doing arithmetic on
+ * _Bools. Besides, the gcc builtins don't allow _Bool* as third
+ * argument.]
+ *
+ * Idea stolen from
+ * https://mail-index.netbsd.org/tech-misc/2007/02/05/0000.html -
+ * credit to Christian Biere.
+ */
+#define is_signed_type(type) (((type)(-1)) < (type)1)
+#define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type)))
+#define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T)))
+#define type_min(T) ((T)((T)-type_max(T)-(T)1))
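+
+/*
+ * Worked examples (illustrative, not from the original header):
+ * type_max(u8) == 255, type_min(u8) == 0;
+ * type_max(s8) == 127, type_min(s8) == -128.
+ */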
+
+/*
+ * Avoids triggering -Wtype-limits compilation warning,
+ * while using unsigned data types to check a < 0.
+ */
+#define is_non_negative(a) ((a) > 0 || (a) == 0)
+#define is_negative(a) (!(is_non_negative(a)))
+
+/*
+ * Allows for effectively applying __must_check to a macro so we can have
+ * both the type-agnostic benefits of the macros while also being able to
+ * enforce that the return value is, in fact, checked.
+ */
+static inline bool __must_check __must_check_overflow(bool overflow)
+{
+ return unlikely(overflow);
+}
+
+#ifdef COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW
+/*
+ * For simplicity and code hygiene, the fallback code below insists on
+ * a, b and *d having the same type (similar to the min() and max()
+ * macros), whereas gcc's type-generic overflow checkers accept
+ * different types. Hence we don't just make check_add_overflow an
+ * alias for __builtin_add_overflow, but add type checks similar to
+ * below.
+ */
+#define check_add_overflow(a, b, d) __must_check_overflow(({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ __builtin_add_overflow(__a, __b, __d); \
+}))
+
+#define check_sub_overflow(a, b, d) __must_check_overflow(({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ __builtin_sub_overflow(__a, __b, __d); \
+}))
+
+#define check_mul_overflow(a, b, d) __must_check_overflow(({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ __builtin_mul_overflow(__a, __b, __d); \
+}))
+
+#else
+
+
+/* Checking for unsigned overflow is relatively easy without causing UB. */
+#define __unsigned_add_overflow(a, b, d) ({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ *__d = __a + __b; \
+ *__d < __a; \
+})
+#define __unsigned_sub_overflow(a, b, d) ({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ *__d = __a - __b; \
+ __a < __b; \
+})
+/*
+ * If one of a or b is a compile-time constant, this avoids a division.
+ */
+#define __unsigned_mul_overflow(a, b, d) ({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ *__d = __a * __b; \
+ __builtin_constant_p(__b) ? \
+ __b > 0 && __a > type_max(typeof(__a)) / __b : \
+ __a > 0 && __b > type_max(typeof(__b)) / __a; \
+})
+
+/*
+ * For signed types, detecting overflow is much harder, especially if
+ * we want to avoid UB. But the interface of these macros is such that
+ * we must provide a result in *d, and in fact we must produce the
+ * result promised by gcc's builtins, which is simply the possibly
+ * wrapped-around value. Fortunately, we can just formally do the
+ * operations in the widest relevant unsigned type (u64) and then
+ * truncate the result - gcc is smart enough to generate the same code
+ * with and without the (u64) casts.
+ */
+
+/*
+ * Adding two signed integers can overflow only if they have the same
+ * sign, and overflow has happened iff the result has the opposite
+ * sign.
+ */
+#define __signed_add_overflow(a, b, d) ({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ *__d = (u64)__a + (u64)__b; \
+ (((~(__a ^ __b)) & (*__d ^ __a)) \
+ & type_min(typeof(__a))) != 0; \
+})
+
+/*
+ * Subtraction is similar, except that overflow can now happen only
+ * when the signs are opposite. In this case, overflow has happened if
+ * the result has the opposite sign of a.
+ */
+#define __signed_sub_overflow(a, b, d) ({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ *__d = (u64)__a - (u64)__b; \
+ ((((__a ^ __b)) & (*__d ^ __a)) \
+ & type_min(typeof(__a))) != 0; \
+})
+
+/*
+ * Signed multiplication is rather hard. gcc always follows C99, so
+ * division is truncated towards 0. This means that we can write the
+ * overflow check like this:
+ *
+ * (a > 0 && (b > MAX/a || b < MIN/a)) ||
+ * (a < -1 && (b > MIN/a || b < MAX/a)) ||
+ * (a == -1 && b == MIN)
+ *
+ * The redundant casts of -1 are to silence an annoying -Wtype-limits
+ * (included in -Wextra) warning: When the type is u8 or u16, the
+ * __b_c_e in check_mul_overflow obviously selects
+ * __unsigned_mul_overflow, but unfortunately gcc still parses this
+ * code and warns about the limited range of __b.
+ */
+
+#define __signed_mul_overflow(a, b, d) ({ \
+ typeof(a) __a = (a); \
+ typeof(b) __b = (b); \
+ typeof(d) __d = (d); \
+ typeof(a) __tmax = type_max(typeof(a)); \
+ typeof(a) __tmin = type_min(typeof(a)); \
+ (void) (&__a == &__b); \
+ (void) (&__a == __d); \
+ *__d = (u64)__a * (u64)__b; \
+ (__b > 0 && (__a > __tmax/__b || __a < __tmin/__b)) || \
+ (__b < (typeof(__b))-1 && (__a > __tmin/__b || __a < __tmax/__b)) || \
+ (__b == (typeof(__b))-1 && __a == __tmin); \
+})
+
+
+#define check_add_overflow(a, b, d) __must_check_overflow( \
+ __builtin_choose_expr(is_signed_type(typeof(a)), \
+ __signed_add_overflow(a, b, d), \
+ __unsigned_add_overflow(a, b, d)))
+
+#define check_sub_overflow(a, b, d) __must_check_overflow( \
+ __builtin_choose_expr(is_signed_type(typeof(a)), \
+ __signed_sub_overflow(a, b, d), \
+ __unsigned_sub_overflow(a, b, d)))
+
+#define check_mul_overflow(a, b, d) __must_check_overflow( \
+ __builtin_choose_expr(is_signed_type(typeof(a)), \
+ __signed_mul_overflow(a, b, d), \
+ __unsigned_mul_overflow(a, b, d)))
+
+#endif /* COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW */
+
+/** check_shl_overflow() - Calculate a left-shifted value and check overflow
+ *
+ * @a: Value to be shifted
+ * @s: How many bits left to shift
+ * @d: Pointer to where to store the result
+ *
+ * Computes *@d = (@a << @s)
+ *
+ * Returns true if '*d' cannot hold the result or when 'a << s' doesn't
+ * make sense. Example conditions:
+ * - 'a << s' causes bits to be lost when stored in *d.
+ * - 's' is garbage (e.g. negative) or so large that the result of
+ * 'a << s' is guaranteed to be 0.
+ * - 'a' is negative.
+ * - 'a << s' sets the sign bit, if any, in '*d'.
+ *
+ * '*d' will hold the results of the attempted shift, but is not
+ * considered "safe for use" if false is returned.
+ */
+#define check_shl_overflow(a, s, d) __must_check_overflow(({ \
+ typeof(a) _a = a; \
+ typeof(s) _s = s; \
+ typeof(d) _d = d; \
+ u64 _a_full = _a; \
+ unsigned int _to_shift = \
+ is_non_negative(_s) && _s < 8 * sizeof(*d) ? _s : 0; \
+ *_d = (_a_full << _to_shift); \
+ (_to_shift != _s || is_negative(*_d) || is_negative(_a) || \
+ (*_d >> _to_shift) != _a); \
+}))
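+
+/*
+ * Illustrative example (not part of the original header): with u16 a = 0x8000
+ * and u16 d, check_shl_overflow(a, 1, &d) returns true, since the shifted
+ * value no longer fits in 16 bits; d is left holding the truncated result 0.
+ */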
+
+/**
+ * array_size() - Calculate size of 2-dimensional array.
+ *
+ * @a: dimension one
+ * @b: dimension two
+ *
+ * Calculates size of 2-dimensional array: @a * @b.
+ *
+ * Returns: number of bytes needed to represent the array or SIZE_MAX on
+ * overflow.
+ */
+static inline __must_check size_t array_size(size_t a, size_t b)
+{
+ size_t bytes;
+
+ if (check_mul_overflow(a, b, &bytes))
+ return SIZE_MAX;
+
+ return bytes;
+}
+
+/**
+ * array3_size() - Calculate size of 3-dimensional array.
+ *
+ * @a: dimension one
+ * @b: dimension two
+ * @c: dimension three
+ *
+ * Calculates size of 3-dimensional array: @a * @b * @c.
+ *
+ * Returns: number of bytes needed to represent the array or SIZE_MAX on
+ * overflow.
+ */
+static inline __must_check size_t array3_size(size_t a, size_t b, size_t c)
+{
+ size_t bytes;
+
+ if (check_mul_overflow(a, b, &bytes))
+ return SIZE_MAX;
+ if (check_mul_overflow(bytes, c, &bytes))
+ return SIZE_MAX;
+
+ return bytes;
+}
+
+/*
+ * Compute a*b+c, returning SIZE_MAX on overflow. Internal helper for
+ * struct_size() below.
+ */
+static inline __must_check size_t __ab_c_size(size_t a, size_t b, size_t c)
+{
+ size_t bytes;
+
+ if (check_mul_overflow(a, b, &bytes))
+ return SIZE_MAX;
+ if (check_add_overflow(bytes, c, &bytes))
+ return SIZE_MAX;
+
+ return bytes;
+}
+
+/**
+ * struct_size() - Calculate size of structure with trailing array.
+ * @p: Pointer to the structure.
+ * @member: Name of the array member.
+ * @count: Number of elements in the array.
+ *
+ * Calculates size of memory needed for structure @p followed by an
+ * array of @count number of @member elements.
+ *
+ * Return: number of bytes needed or SIZE_MAX on overflow.
+ */
+#define struct_size(p, member, count) \
+ __ab_c_size(count, \
+ sizeof(*(p)->member) + __must_be_array((p)->member),\
+ sizeof(*(p)))
+
+/**
+ * flex_array_size() - Calculate size of a flexible array member
+ * within an enclosing structure.
+ *
+ * @p: Pointer to the structure.
+ * @member: Name of the flexible array member.
+ * @count: Number of elements in the array.
+ *
+ * Calculates size of a flexible array of @count number of @member
+ * elements, at the end of structure @p.
+ *
+ * Return: number of bytes needed or SIZE_MAX on overflow.
+ */
+#define flex_array_size(p, member, count) \
+ array_size(count, \
+ sizeof(*(p)->member) + __must_be_array((p)->member))
+
+#endif /* __LINUX_OVERFLOW_H */
#define kmap_atomic(page) page_address(page)
#define kunmap_atomic(addr) do {} while (0)
+#define PageHighMem(page) false
+
static const char zero_page[PAGE_SIZE];
#define ZERO_PAGE(o) ((struct page *) &zero_page[0])
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_POISON_H
+#define _LINUX_POISON_H
+
+/********** include/linux/list.h **********/
+
+/*
+ * Architectures might want to move the poison pointer offset
+ * into some well-recognized area such as 0xdead000000000000,
+ * that is also not mappable by user-space exploits:
+ */
+#ifdef CONFIG_ILLEGAL_POINTER_VALUE
+# define POISON_POINTER_DELTA _AC(CONFIG_ILLEGAL_POINTER_VALUE, UL)
+#else
+# define POISON_POINTER_DELTA 0
+#endif
+
+/*
+ * These are non-NULL pointers that will result in page faults
+ * under normal circumstances, used to verify that nobody uses
+ * non-initialized list entries.
+ */
+#define LIST_POISON1 ((void *) 0x100 + POISON_POINTER_DELTA)
+#define LIST_POISON2 ((void *) 0x122 + POISON_POINTER_DELTA)
+
+/********** include/linux/timer.h **********/
+#define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA)
+
+/********** mm/page_poison.c **********/
+#ifdef CONFIG_PAGE_POISONING_ZERO
+#define PAGE_POISON 0x00
+#else
+#define PAGE_POISON 0xaa
+#endif
+
+/********** mm/page_alloc.c ************/
+
+#define TAIL_MAPPING ((void *) 0x400 + POISON_POINTER_DELTA)
+
+/********** mm/slab.c **********/
+/*
+ * Magic nums for obj red zoning.
+ * Placed in the first word before and the first word after an obj.
+ */
+#define RED_INACTIVE 0x09F911029D74E35BULL /* when obj is inactive */
+#define RED_ACTIVE 0xD84156C5635688C0ULL /* when obj is active */
+
+#define SLUB_RED_INACTIVE 0xbb
+#define SLUB_RED_ACTIVE 0xcc
+
+/* ...and for poisoning */
+#define POISON_INUSE 0x5a /* for use-uninitialised poisoning */
+#define POISON_FREE 0x6b /* for use-after-free poisoning */
+#define POISON_END 0xa5 /* end-byte of poisoning */
+
+/********** arch/$ARCH/mm/init.c **********/
+#define POISON_FREE_INITMEM 0xcc
+
+/********** arch/ia64/hp/common/sba_iommu.c **********/
+/*
+ * arch/ia64/hp/common/sba_iommu.c uses a 16-byte poison string with a
+ * value of "SBAIOMMU POISON\0" for spill-over poisoning.
+ */
+
+/********** fs/jbd/journal.c **********/
+#define JBD_POISON_FREE 0x5b
+#define JBD2_POISON_FREE 0x5c
+
+/********** drivers/base/dmapool.c **********/
+#define POOL_POISON_FREED 0xa7 /* !inuse */
+#define POOL_POISON_ALLOCATED 0xa9 /* !initted */
+
+/********** drivers/atm/ **********/
+#define ATM_POISON_FREE 0x12
+#define ATM_POISON 0xdeadbeef
+
+/********** kernel/mutexes **********/
+#define MUTEX_DEBUG_INIT 0x11
+#define MUTEX_DEBUG_FREE 0x22
+#define MUTEX_POISON_WW_CTX ((void *) 0x500 + POISON_POINTER_DELTA)
+
+/********** security/ **********/
+#define KEY_DESTROY 0xbd
+
+#endif
get_random_type(int);
get_random_type(long);
+get_random_type(u32);
get_random_type(u64);
#endif /* _LINUX_RANDOM_H */
#define RCU_INIT_POINTER(p, v) WRITE_ONCE(p, v)
+/* Has the specified rcu_head structure been handed to call_rcu()? */
+
+/**
+ * rcu_head_init - Initialize rcu_head for rcu_head_after_call_rcu()
+ * @rhp: The rcu_head structure to initialize.
+ *
+ * If you intend to invoke rcu_head_after_call_rcu() to test whether a
+ * given rcu_head structure has already been passed to call_rcu(), then
+ * you must also invoke this rcu_head_init() function on it just after
+ * allocating that structure. Calls to this function must not race with
+ * calls to call_rcu(), rcu_head_after_call_rcu(), or callback invocation.
+ */
+static inline void rcu_head_init(struct rcu_head *rhp)
+{
+ rhp->func = (void *)~0L;
+}
+
+static inline bool
+rcu_head_after_call_rcu(struct rcu_head *rhp,
+ void (*f)(struct rcu_head *head))
+{
+ void (*func)(struct rcu_head *head) = READ_ONCE(rhp->func);
+
+ if (func == f)
+ return true;
+ return false;
+}
+
#endif /* __TOOLS_LINUX_RCUPDATE_H */
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Resizable, Scalable, Concurrent Hash Table
+ *
+ * Simple structures that might be needed in include
+ * files.
+ */
+
+#ifndef _LINUX_RHASHTABLE_TYPES_H
+#define _LINUX_RHASHTABLE_TYPES_H
+
+#include <linux/atomic.h>
+#include <linux/compiler.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+
+struct rhash_head {
+ struct rhash_head __rcu *next;
+};
+
+struct rhlist_head {
+ struct rhash_head rhead;
+ struct rhlist_head __rcu *next;
+};
+
+struct bucket_table;
+
+/**
+ * struct rhashtable_compare_arg - Key for the function rhashtable_compare
+ * @ht: Hash table
+ * @key: Key to compare against
+ */
+struct rhashtable_compare_arg {
+ struct rhashtable *ht;
+ const void *key;
+};
+
+typedef u32 (*rht_hashfn_t)(const void *data, u32 len, u32 seed);
+typedef u32 (*rht_obj_hashfn_t)(const void *data, u32 len, u32 seed);
+typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg,
+ const void *obj);
+
+/**
+ * struct rhashtable_params - Hash table construction parameters
+ * @nelem_hint: Hint on number of elements, should be 75% of desired size
+ * @key_len: Length of key
+ * @key_offset: Offset of key in struct to be hashed
+ * @head_offset: Offset of rhash_head in struct to be hashed
+ * @max_size: Maximum size while expanding
+ * @min_size: Minimum size while shrinking
+ * @automatic_shrinking: Enable automatic shrinking of tables
+ * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash)
+ * @obj_hashfn: Function to hash object
+ * @obj_cmpfn: Function to compare key with object
+ */
+struct rhashtable_params {
+ u16 nelem_hint;
+ u16 key_len;
+ u16 key_offset;
+ u16 head_offset;
+ unsigned int max_size;
+ u16 min_size;
+ bool automatic_shrinking;
+ rht_hashfn_t hashfn;
+ rht_obj_hashfn_t obj_hashfn;
+ rht_obj_cmpfn_t obj_cmpfn;
+};
+
+/**
+ * struct rhashtable - Hash table handle
+ * @tbl: Bucket table
+ * @key_len: Key length for hashfn
+ * @max_elems: Maximum number of elements in table
+ * @p: Configuration parameters
+ * @rhlist: True if this is an rhltable
+ * @run_work: Deferred worker to expand/shrink asynchronously
+ * @mutex: Mutex to protect current/future table swapping
+ * @lock: Spin lock to protect walker list
+ * @nelems: Number of elements in table
+ */
+struct rhashtable {
+ struct bucket_table __rcu *tbl;
+ unsigned int key_len;
+ unsigned int max_elems;
+ struct rhashtable_params p;
+ bool rhlist;
+ struct work_struct run_work;
+ struct mutex mutex;
+ spinlock_t lock;
+ atomic_t nelems;
+};
+
+/**
+ * struct rhltable - Hash table with duplicate objects in a list
+ * @ht: Underlying rhtable
+ */
+struct rhltable {
+ struct rhashtable ht;
+};
+
+/**
+ * struct rhashtable_walker - Hash table walker
+ * @list: List entry on list of walkers
+ * @tbl: The table that we were walking over
+ */
+struct rhashtable_walker {
+ struct list_head list;
+ struct bucket_table *tbl;
+};
+
+/**
+ * struct rhashtable_iter - Hash table iterator
+ * @ht: Table to iterate through
+ * @p: Current pointer
+ * @list: Current hash list pointer
+ * @walker: Associated rhashtable walker
+ * @slot: Current slot
+ * @skip: Number of entries to skip in slot
+ */
+struct rhashtable_iter {
+ struct rhashtable *ht;
+ struct rhash_head *p;
+ struct rhlist_head *list;
+ struct rhashtable_walker walker;
+ unsigned int slot;
+ unsigned int skip;
+ bool end_of_table;
+};
+
+int rhashtable_init(struct rhashtable *ht,
+ const struct rhashtable_params *params);
+int rhltable_init(struct rhltable *hlt,
+ const struct rhashtable_params *params);
+
+#endif /* _LINUX_RHASHTABLE_TYPES_H */
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Resizable, Scalable, Concurrent Hash Table
*
- * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au>
+ * Copyright (c) 2015-2016 Herbert Xu <herbert@gondor.apana.org.au>
* Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch>
* Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
*
#ifndef _LINUX_RHASHTABLE_H
#define _LINUX_RHASHTABLE_H
-#include <linux/atomic.h>
-#include <linux/cache.h>
-#include <linux/compiler.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/jhash.h>
-#include <linux/workqueue.h>
-#include <linux/mutex.h>
-#include <linux/spinlock.h>
+#include <linux/list_nulls.h>
#include <linux/rcupdate.h>
+#include <linux/workqueue.h>
+#include <linux/rculist.h>
+#include <linux/bit_spinlock.h>
-#define RHT_BASE_BITS 4
-#define RHT_HASH_BITS 27
-#define RHT_BASE_SHIFT RHT_HASH_BITS
-#define RHT_HASH_RESERVED_SPACE (RHT_BASE_BITS + 1)
+#define BIT(nr) (1UL << (nr))
-struct rhash_head {
- struct rhash_head __rcu *next;
-};
+#include <linux/rhashtable-types.h>
+/*
+ * Objects in an rhashtable have an embedded struct rhash_head
+ * which is linked into a hash chain from the hash table - or one
+ * of two or more hash tables when the rhashtable is being resized.
+ * The end of the chain is marked with a special nulls marks which has
+ * the least significant bit set but otherwise stores the address of
+ * the hash bucket. This allows us to be sure we've found the end
+ * of the right list.
+ * The value stored in the hash bucket has BIT(0) used as a lock bit.
+ * This bit must be atomically set before any changes are made to
+ * the chain. To avoid dereferencing this pointer without clearing
+ * the bit first, we use an opaque 'struct rhash_lock_head *' for the
+ * pointer stored in the bucket. This struct needs to be defined so
+ * that rcu_dereference() works on it, but it has no content so a
+ * cast is needed for it to be useful. This ensures it isn't
+ * used by mistake without clearing the lock bit first.
+ */
+struct rhash_lock_head {};
+/* Maximum chain length before rehash
+ *
+ * The maximum (not average) chain length grows with the size of the hash
+ * table, at a rate of (log N)/(log log N).
+ *
+ * The value of 16 is selected so that even if the hash table grew to
+ * 2^32 you would not expect the maximum chain length to exceed it
+ * unless we are under attack (or extremely unlucky).
+ *
+ * As this limit is only to detect attacks, we don't need to set it to a
+ * lower value as you'd need the chain length to vastly exceed 16 to have
+ * any real effect on the system.
+ */
+#define RHT_ELASTICITY 16u
+
+/**
+ * struct bucket_table - Table of hash buckets
+ * @size: Number of hash buckets
+ * @nest: Number of bits of first-level nested table.
+ * @rehash: Current bucket being rehashed
+ * @hash_rnd: Random seed to fold into hash
+ * @walkers: List of active walkers
+ * @rcu: RCU structure for freeing the table
+ * @future_tbl: Table under construction during rehashing
+ * @ntbl: Nested table used when out of memory.
+ * @buckets: size * hash buckets
+ */
struct bucket_table {
unsigned int size;
- unsigned int rehash;
+ unsigned int nest;
u32 hash_rnd;
- unsigned int locks_mask;
- spinlock_t *locks;
struct list_head walkers;
struct rcu_head rcu;
struct bucket_table __rcu *future_tbl;
- struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp;
-};
-
-struct rhashtable_compare_arg {
- struct rhashtable *ht;
- const void *key;
+ struct rhash_lock_head __rcu *buckets[] ____cacheline_aligned_in_smp;
};
-typedef u32 (*rht_hashfn_t)(const void *data, u32 len, u32 seed);
-typedef u32 (*rht_obj_hashfn_t)(const void *data, u32 len, u32 seed);
-typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg,
- const void *obj);
-
-struct rhashtable_params {
- size_t nelem_hint;
- size_t key_len;
- size_t key_offset;
- size_t head_offset;
- unsigned int insecure_max_entries;
- unsigned int max_size;
- unsigned int min_size;
- u32 nulls_base;
- bool insecure_elasticity;
- bool automatic_shrinking;
- size_t locks_mul;
- rht_hashfn_t hashfn;
- rht_obj_hashfn_t obj_hashfn;
- rht_obj_cmpfn_t obj_cmpfn;
-};
-
-struct rhashtable {
- struct bucket_table __rcu *tbl;
- atomic_t nelems;
- unsigned int key_len;
- unsigned int elasticity;
- struct rhashtable_params p;
- struct work_struct run_work;
- struct mutex mutex;
- spinlock_t lock;
-};
-
-struct rhashtable_walker {
- struct list_head list;
- struct bucket_table *tbl;
-};
-
-#define NULLS_MARKER(value) (1UL | (((long)value) << 1))
-
-static inline unsigned long rht_marker(const struct rhashtable *ht, u32 hash)
-{
- return NULLS_MARKER(ht->p.nulls_base + hash);
-}
-
-#define INIT_RHT_NULLS_HEAD(ptr, ht, hash) \
- ((ptr) = (typeof(ptr)) rht_marker(ht, hash))
+/*
+ * NULLS_MARKER() expects a hash value with the low
+ * bits most likely to be significant, and it discards
+ * the msb.
+ * We give it an address, in which the bottom bit is
+ * always 0, and the msb might be significant.
+ * So we shift the address down one bit to align with
+ * expectations and avoid losing a significant bit.
+ *
+ * We never store the NULLS_MARKER in the hash table
+ * itself as we need the lsb for locking.
+ * Instead we store a NULL.
+ */
+#define RHT_NULLS_MARKER(ptr) \
+ ((void *)NULLS_MARKER(((unsigned long) (ptr)) >> 1))
+#define INIT_RHT_NULLS_HEAD(ptr) \
+ ((ptr) = NULL)
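+
+/*
+ * Illustrative note (not from the original header): an empty bucket stores
+ * NULL; __rht_ptr() then substitutes RHT_NULLS_MARKER(bkt), whose low bit
+ * is set, so rht_is_a_nulls() terminates the chain walk.
+ */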
static inline bool rht_is_a_nulls(const struct rhash_head *ptr)
{
static inline unsigned int rht_bucket_index(const struct bucket_table *tbl,
unsigned int hash)
{
- return (hash >> RHT_HASH_RESERVED_SPACE) & (tbl->size - 1);
+ return hash & (tbl->size - 1);
}
-static inline unsigned int rht_key_hashfn(
- struct rhashtable *ht, const struct bucket_table *tbl,
- const void *key, const struct rhashtable_params params)
+static inline unsigned int rht_key_get_hash(struct rhashtable *ht,
+ const void *key, const struct rhashtable_params params,
+ unsigned int hash_rnd)
{
unsigned int hash;
/* params must be equal to ht->p if it isn't constant. */
if (!__builtin_constant_p(params.key_len))
- hash = ht->p.hashfn(key, ht->key_len, tbl->hash_rnd);
+ hash = ht->p.hashfn(key, ht->key_len, hash_rnd);
else if (params.key_len) {
unsigned int key_len = params.key_len;
if (params.hashfn)
- hash = params.hashfn(key, key_len, tbl->hash_rnd);
+ hash = params.hashfn(key, key_len, hash_rnd);
else if (key_len & (sizeof(u32) - 1))
- hash = jhash(key, key_len, tbl->hash_rnd);
+ hash = jhash(key, key_len, hash_rnd);
else
- hash = jhash2(key, key_len / sizeof(u32),
- tbl->hash_rnd);
+ hash = jhash2(key, key_len / sizeof(u32), hash_rnd);
} else {
unsigned int key_len = ht->p.key_len;
if (params.hashfn)
- hash = params.hashfn(key, key_len, tbl->hash_rnd);
+ hash = params.hashfn(key, key_len, hash_rnd);
else
- hash = jhash(key, key_len, tbl->hash_rnd);
+ hash = jhash(key, key_len, hash_rnd);
}
+ return hash;
+}
+
+static inline unsigned int rht_key_hashfn(
+ struct rhashtable *ht, const struct bucket_table *tbl,
+ const void *key, const struct rhashtable_params params)
+{
+ unsigned int hash = rht_key_get_hash(ht, key, params, tbl->hash_rnd);
+
return rht_bucket_index(tbl, hash);
}
rht_key_hashfn(ht, tbl, ptr + params.key_offset, params);
}
+/**
+ * rht_grow_above_75 - returns true if nelems > 0.75 * table-size
+ * @ht: hash table
+ * @tbl: current table
+ */
static inline bool rht_grow_above_75(const struct rhashtable *ht,
const struct bucket_table *tbl)
{
(!ht->p.max_size || tbl->size < ht->p.max_size);
}
+/**
+ * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size
+ * @ht: hash table
+ * @tbl: current table
+ */
static inline bool rht_shrink_below_30(const struct rhashtable *ht,
const struct bucket_table *tbl)
{
tbl->size > ht->p.min_size;
}
+/**
+ * rht_grow_above_100 - returns true if nelems > table-size
+ * @ht: hash table
+ * @tbl: current table
+ */
static inline bool rht_grow_above_100(const struct rhashtable *ht,
const struct bucket_table *tbl)
{
(!ht->p.max_size || tbl->size < ht->p.max_size);
}
+/**
+ * rht_grow_above_max - returns true if table is above maximum
+ * @ht: hash table
+ * @tbl: current table
+ */
static inline bool rht_grow_above_max(const struct rhashtable *ht,
const struct bucket_table *tbl)
{
- return ht->p.insecure_max_entries &&
- atomic_read(&ht->nelems) >= ht->p.insecure_max_entries;
+ return atomic_read(&ht->nelems) >= ht->max_elems;
}
-static inline spinlock_t *rht_bucket_lock(const struct bucket_table *tbl,
- unsigned int hash)
+#ifdef CONFIG_PROVE_LOCKING
+int lockdep_rht_mutex_is_held(struct rhashtable *ht);
+int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash);
+#else
+static inline int lockdep_rht_mutex_is_held(struct rhashtable *ht)
{
- return &tbl->locks[hash & tbl->locks_mask];
+ return 1;
}
-int rhashtable_insert_rehash(struct rhashtable *, struct bucket_table *);
-struct bucket_table *rhashtable_insert_slow(struct rhashtable *,
- const void *,
- struct rhash_head *,
- struct bucket_table *);
+static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl,
+ u32 hash)
+{
+ return 1;
+}
+#endif /* CONFIG_PROVE_LOCKING */
+
+void *rhashtable_insert_slow(struct rhashtable *ht, const void *key,
+ struct rhash_head *obj);
-int rhashtable_init(struct rhashtable *, const struct rhashtable_params *);
-void rhashtable_destroy(struct rhashtable *);
+void rhashtable_walk_enter(struct rhashtable *ht,
+ struct rhashtable_iter *iter);
+void rhashtable_walk_exit(struct rhashtable_iter *iter);
+int rhashtable_walk_start_check(struct rhashtable_iter *iter) __acquires(RCU);
-#define rht_dereference(p, ht) rcu_dereference(p)
-#define rht_dereference_rcu(p, ht) rcu_dereference(p)
-#define rht_dereference_bucket(p, tbl, hash) rcu_dereference(p)
-#define rht_dereference_bucket_rcu(p, tbl, hash) rcu_dereference(p)
+static inline void rhashtable_walk_start(struct rhashtable_iter *iter)
+{
+ (void)rhashtable_walk_start_check(iter);
+}
+
+void *rhashtable_walk_next(struct rhashtable_iter *iter);
+void *rhashtable_walk_peek(struct rhashtable_iter *iter);
+void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU);
+
+void rhashtable_free_and_destroy(struct rhashtable *ht,
+ void (*free_fn)(void *ptr, void *arg),
+ void *arg);
+void rhashtable_destroy(struct rhashtable *ht);
+
+struct rhash_lock_head __rcu **rht_bucket_nested(
+ const struct bucket_table *tbl, unsigned int hash);
+struct rhash_lock_head __rcu **__rht_bucket_nested(
+ const struct bucket_table *tbl, unsigned int hash);
+struct rhash_lock_head __rcu **rht_bucket_nested_insert(
+ struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash);
+
+#define rht_dereference(p, ht) \
+ rcu_dereference(p)
+
+#define rht_dereference_rcu(p, ht) \
+ rcu_dereference(p)
+
+#define rht_dereference_bucket(p, tbl, hash) \
+ rcu_dereference(p)
+
+#define rht_dereference_bucket_rcu(p, tbl, hash) \
+ rcu_dereference(p)
#define rht_entry(tpos, pos, member) \
({ tpos = container_of(pos, typeof(*tpos), member); 1; })
-#define rht_for_each_continue(pos, head, tbl, hash) \
- for (pos = rht_dereference_bucket(head, tbl, hash); \
- !rht_is_a_nulls(pos); \
+static inline struct rhash_lock_head __rcu *const *rht_bucket(
+ const struct bucket_table *tbl, unsigned int hash)
+{
+ return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) :
+ &tbl->buckets[hash];
+}
+
+static inline struct rhash_lock_head __rcu **rht_bucket_var(
+ struct bucket_table *tbl, unsigned int hash)
+{
+ return unlikely(tbl->nest) ? __rht_bucket_nested(tbl, hash) :
+ &tbl->buckets[hash];
+}
+
+static inline struct rhash_lock_head __rcu **rht_bucket_insert(
+ struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash)
+{
+ return unlikely(tbl->nest) ? rht_bucket_nested_insert(ht, tbl, hash) :
+ &tbl->buckets[hash];
+}
+
+/*
+ * We lock a bucket by setting BIT(0) in the pointer - this is always
+ * zero in real pointers. The NULLS mark is never stored in the bucket,
+ * rather we store NULL if the bucket is empty.
+ * bit_spin_locks do not handle contention well, but the whole point
+ * of the hashtable design is to achieve minimum per-bucket contention.
+ * A nested hash table might not have a bucket pointer. In that case
+ * we cannot get a lock. For remove and replace the bucket cannot be
+ * interesting and doesn't need locking.
+ * For insert we allocate the bucket if this is the last bucket_table,
+ * and then take the lock.
+ * Sometimes we unlock a bucket by writing a new pointer there. In that
+ * case we don't need to unlock, but we do need to reset state such as
+ * local_bh. For that we have rht_assign_unlock(). As rcu_assign_pointer()
+ * provides the same release semantics that bit_spin_unlock() provides,
+ * this is safe.
+ * When we write to a bucket without unlocking, we use rht_assign_locked().
+ */
+
+static inline void rht_lock(struct bucket_table *tbl,
+ struct rhash_lock_head __rcu **bkt)
+{
+ bit_spin_lock(0, (unsigned long *)bkt);
+}
+
+static inline void rht_lock_nested(struct bucket_table *tbl,
+ struct rhash_lock_head __rcu **bucket,
+ unsigned int subclass)
+{
+ bit_spin_lock(0, (unsigned long *)bucket);
+}
+
+static inline void rht_unlock(struct bucket_table *tbl,
+ struct rhash_lock_head __rcu **bkt)
+{
+ bit_spin_unlock(0, (unsigned long *)bkt);
+}
+
+static inline struct rhash_head *__rht_ptr(
+ struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt)
+{
+ return (struct rhash_head *)
+ ((unsigned long)p & ~BIT(0) ?:
+ (unsigned long)RHT_NULLS_MARKER(bkt));
+}
+
+/*
+ * Where 'bkt' is a bucket and might be locked:
+ * rht_ptr_rcu() dereferences that pointer and clears the lock bit.
+ * rht_ptr() dereferences in a context where the bucket is locked.
+ * rht_ptr_exclusive() dereferences in a context where exclusive
+ * access is guaranteed, such as when destroying the table.
+ */
+static inline struct rhash_head *rht_ptr_rcu(
+ struct rhash_lock_head __rcu *const *bkt)
+{
+ return __rht_ptr(rcu_dereference(*bkt), bkt);
+}
+
+static inline struct rhash_head *rht_ptr(
+ struct rhash_lock_head __rcu *const *bkt,
+ struct bucket_table *tbl,
+ unsigned int hash)
+{
+ return __rht_ptr(rht_dereference_bucket(*bkt, tbl, hash), bkt);
+}
+
+static inline struct rhash_head *rht_ptr_exclusive(
+ struct rhash_lock_head __rcu *const *bkt)
+{
+ return __rht_ptr(rcu_dereference(*bkt), bkt);
+}
+
+static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt,
+ struct rhash_head *obj)
+{
+ if (rht_is_a_nulls(obj))
+ obj = NULL;
+ rcu_assign_pointer(*bkt, (void *)((unsigned long)obj | BIT(0)));
+}
+
+static inline void rht_assign_unlock(struct bucket_table *tbl,
+ struct rhash_lock_head __rcu **bkt,
+ struct rhash_head *obj)
+{
+ if (rht_is_a_nulls(obj))
+ obj = NULL;
+ rcu_assign_pointer(*bkt, (void *)obj);
+ preempt_enable();
+ __release(bitlock);
+}
+
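For reference, a minimal sketch of the locked-update pattern these helpers support (illustrative only, not part of the patch; ht, tbl, hash and obj are assumed to come from an insert path like the one further down):

	bkt = rht_bucket_insert(ht, tbl, hash);
	if (bkt) {
		rht_lock(tbl, bkt);
		/* link the new entry in front of the current chain head */
		RCU_INIT_POINTER(obj->next, rht_ptr(bkt, tbl, hash));
		/* publish obj and drop the bit lock in one release store */
		rht_assign_unlock(tbl, bkt, obj);
	}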
+/**
+ * rht_for_each_from - iterate over hash chain from given head
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @head: the &struct rhash_head to start from
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ */
+#define rht_for_each_from(pos, head, tbl, hash) \
+ for (pos = head; \
+ !rht_is_a_nulls(pos); \
pos = rht_dereference_bucket((pos)->next, tbl, hash))
+/**
+ * rht_for_each - iterate over hash chain
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ */
#define rht_for_each(pos, tbl, hash) \
- rht_for_each_continue(pos, (tbl)->buckets[hash], tbl, hash)
+ rht_for_each_from(pos, rht_ptr(rht_bucket(tbl, hash), tbl, hash), \
+ tbl, hash)
+
+/**
+ * rht_for_each_entry_from - iterate over hash chain from given head
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @head: the &struct rhash_head to start from
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ * @member: name of the &struct rhash_head within the hashable struct.
+ */
+#define rht_for_each_entry_from(tpos, pos, head, tbl, hash, member) \
+ for (pos = head; \
+ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \
+ pos = rht_dereference_bucket((pos)->next, tbl, hash))
-#define rht_for_each_rcu_continue(pos, head, tbl, hash) \
+/**
+ * rht_for_each_entry - iterate over hash chain of given type
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ * @member: name of the &struct rhash_head within the hashable struct.
+ */
+#define rht_for_each_entry(tpos, pos, tbl, hash, member) \
+ rht_for_each_entry_from(tpos, pos, \
+ rht_ptr(rht_bucket(tbl, hash), tbl, hash), \
+ tbl, hash, member)
+
+/**
+ * rht_for_each_entry_safe - safely iterate over hash chain of given type
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @next: the &struct rhash_head to use as next in loop cursor.
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ * @member: name of the &struct rhash_head within the hashable struct.
+ *
+ * This hash chain list-traversal primitive allows for the looped code to
+ * remove the loop cursor from the list.
+ */
+#define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member) \
+ for (pos = rht_ptr(rht_bucket(tbl, hash), tbl, hash), \
+ next = !rht_is_a_nulls(pos) ? \
+ rht_dereference_bucket(pos->next, tbl, hash) : NULL; \
+ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \
+ pos = next, \
+ next = !rht_is_a_nulls(pos) ? \
+ rht_dereference_bucket(pos->next, tbl, hash) : NULL)
+
+/**
+ * rht_for_each_rcu_from - iterate over rcu hash chain from given head
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @head: the &struct rhash_head to start from
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ *
+ * This hash chain list-traversal primitive may safely run concurrently with
+ * the _rcu mutation primitives such as rhashtable_insert() as long as the
+ * traversal is guarded by rcu_read_lock().
+ */
+#define rht_for_each_rcu_from(pos, head, tbl, hash) \
for (({barrier(); }), \
- pos = rht_dereference_bucket_rcu(head, tbl, hash); \
+ pos = head; \
!rht_is_a_nulls(pos); \
pos = rcu_dereference_raw(pos->next))
-#define rht_for_each_rcu(pos, tbl, hash) \
- rht_for_each_rcu_continue(pos, (tbl)->buckets[hash], tbl, hash)
+/**
+ * rht_for_each_rcu - iterate over rcu hash chain
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ *
+ * This hash chain list-traversal primitive may safely run concurrently with
+ * the _rcu mutation primitives such as rhashtable_insert() as long as the
+ * traversal is guarded by rcu_read_lock().
+ */
+#define rht_for_each_rcu(pos, tbl, hash) \
+ for (({barrier(); }), \
+ pos = rht_ptr_rcu(rht_bucket(tbl, hash)); \
+ !rht_is_a_nulls(pos); \
+ pos = rcu_dereference_raw(pos->next))
-#define rht_for_each_entry_rcu_continue(tpos, pos, head, tbl, hash, member) \
+/**
+ * rht_for_each_entry_rcu_from - iterate over rcu hash chain from given head
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @head: the &struct rhash_head to start from
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ * @member: name of the &struct rhash_head within the hashable struct.
+ *
+ * This hash chain list-traversal primitive may safely run concurrently with
+ * the _rcu mutation primitives such as rhashtable_insert() as long as the
+ * traversal is guarded by rcu_read_lock().
+ */
+#define rht_for_each_entry_rcu_from(tpos, pos, head, tbl, hash, member) \
for (({barrier(); }), \
- pos = rht_dereference_bucket_rcu(head, tbl, hash); \
+ pos = head; \
(!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \
pos = rht_dereference_bucket_rcu(pos->next, tbl, hash))
-#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \
- rht_for_each_entry_rcu_continue(tpos, pos, (tbl)->buckets[hash],\
- tbl, hash, member)
+/**
+ * rht_for_each_entry_rcu - iterate over rcu hash chain of given type
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct rhash_head to use as a loop cursor.
+ * @tbl: the &struct bucket_table
+ * @hash: the hash value / bucket index
+ * @member: name of the &struct rhash_head within the hashable struct.
+ *
+ * This hash chain list-traversal primitive may safely run concurrently with
+ * the _rcu mutation primitives such as rhashtable_insert() as long as the
+ * traversal is guarded by rcu_read_lock().
+ */
+#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \
+ rht_for_each_entry_rcu_from(tpos, pos, \
+ rht_ptr_rcu(rht_bucket(tbl, hash)), \
+ tbl, hash, member)
+
+/**
+ * rhl_for_each_rcu - iterate over rcu hash table list
+ * @pos: the &struct rlist_head to use as a loop cursor.
+ * @list: the head of the list
+ *
+ * This hash chain list-traversal primitive should be used on the
+ * list returned by rhltable_lookup.
+ */
+#define rhl_for_each_rcu(pos, list) \
+ for (pos = list; pos; pos = rcu_dereference_raw(pos->next))
+
+/**
+ * rhl_for_each_entry_rcu - iterate over rcu hash table list of given type
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct rlist_head to use as a loop cursor.
+ * @list: the head of the list
+ * @member: name of the &struct rlist_head within the hashable struct.
+ *
+ * This hash chain list-traversal primitive should be used on the
+ * list returned by rhltable_lookup.
+ */
+#define rhl_for_each_entry_rcu(tpos, pos, list, member) \
+ for (pos = list; pos && rht_entry(tpos, pos, member); \
+ pos = rcu_dereference_raw(pos->next))
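For reference, an illustrative sketch of how rhltable_lookup() and rhl_for_each_entry_rcu() are typically combined; struct my_obj, its rhlist_head member node, hlt, key and my_params are hypothetical names:

	struct rhlist_head *list, *pos;
	struct my_obj *obj;

	rcu_read_lock();
	list = rhltable_lookup(&hlt, &key, my_params);
	rhl_for_each_entry_rcu(obj, pos, list, node)
		use_object(obj);	/* hypothetical consumer */
	rcu_read_unlock();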
static inline int rhashtable_compare(struct rhashtable_compare_arg *arg,
const void *obj)
return memcmp(ptr + ht->p.key_offset, arg->key, ht->p.key_len);
}
-static inline void *rhashtable_lookup_fast(
+/* Internal function, do not use. */
+static inline struct rhash_head *__rhashtable_lookup(
struct rhashtable *ht, const void *key,
const struct rhashtable_params params)
{
.ht = ht,
.key = key,
};
- const struct bucket_table *tbl;
+ struct rhash_lock_head __rcu *const *bkt;
+ struct bucket_table *tbl;
struct rhash_head *he;
unsigned int hash;
- rcu_read_lock();
-
tbl = rht_dereference_rcu(ht->tbl, ht);
restart:
hash = rht_key_hashfn(ht, tbl, key, params);
- rht_for_each_rcu(he, tbl, hash) {
- if (params.obj_cmpfn ?
- params.obj_cmpfn(&arg, rht_obj(ht, he)) :
- rhashtable_compare(&arg, rht_obj(ht, he)))
- continue;
- rcu_read_unlock();
- return rht_obj(ht, he);
- }
+ bkt = rht_bucket(tbl, hash);
+ do {
+ rht_for_each_rcu_from(he, rht_ptr_rcu(bkt), tbl, hash) {
+ if (params.obj_cmpfn ?
+ params.obj_cmpfn(&arg, rht_obj(ht, he)) :
+ rhashtable_compare(&arg, rht_obj(ht, he)))
+ continue;
+ return he;
+ }
+ /* An object might have been moved to a different hash chain,
+ * while we walk along it - better check and retry.
+ */
+ } while (he != RHT_NULLS_MARKER(bkt));
/* Ensure we see any new tables. */
smp_rmb();
tbl = rht_dereference_rcu(tbl->future_tbl, ht);
if (unlikely(tbl))
goto restart;
- rcu_read_unlock();
return NULL;
}
-static inline int __rhashtable_insert_fast(
- struct rhashtable *ht, const void *key, struct rhash_head *obj,
+/**
+ * rhashtable_lookup - search hash table
+ * @ht: hash table
+ * @key: the pointer to the key
+ * @params: hash table parameters
+ *
+ * Computes the hash value for the key and traverses the bucket chain looking
+ * for an entry with an identical key. The first matching entry is returned.
+ *
+ * This must only be called under the RCU read lock.
+ *
+ * Returns the first entry on which the compare function returned true.
+ */
+static inline void *rhashtable_lookup(
+ struct rhashtable *ht, const void *key,
+ const struct rhashtable_params params)
+{
+ struct rhash_head *he = __rhashtable_lookup(ht, key, params);
+
+ return he ? rht_obj(ht, he) : NULL;
+}
+
+/**
+ * rhashtable_lookup_fast - search hash table, without RCU read lock
+ * @ht: hash table
+ * @key: the pointer to the key
+ * @params: hash table parameters
+ *
+ * Computes the hash value for the key and traverses the bucket chain looking
+ * for an entry with an identical key. The first matching entry is returned.
+ *
+ * Only use this function when you have other mechanisms guaranteeing
+ * that the object won't go away after the RCU read lock is released.
+ *
+ * Returns the first entry on which the compare function returned true.
+ */
+static inline void *rhashtable_lookup_fast(
+ struct rhashtable *ht, const void *key,
+ const struct rhashtable_params params)
+{
+ void *obj;
+
+ rcu_read_lock();
+ obj = rhashtable_lookup(ht, key, params);
+ rcu_read_unlock();
+
+ return obj;
+}
+
+/**
+ * rhltable_lookup - search hash list table
+ * @hlt: hash table
+ * @key: the pointer to the key
+ * @params: hash table parameters
+ *
+ * Computes the hash value for the key and traverses the bucket chain looking
+ * for an entry with an identical key. All matching entries are returned
+ * in a list.
+ *
+ * This must only be called under the RCU read lock.
+ *
+ * Returns the list of entries that match the given key.
+ */
+static inline struct rhlist_head *rhltable_lookup(
+ struct rhltable *hlt, const void *key,
const struct rhashtable_params params)
+{
+ struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params);
+
+ return he ? container_of(he, struct rhlist_head, rhead) : NULL;
+}
+
+/* Internal function, please use rhashtable_insert_fast() instead. This
+ * function returns the existing element already in the hash table if there is
+ * a clash, otherwise it returns an error via ERR_PTR().
+ */
+static inline void *__rhashtable_insert_fast(
+ struct rhashtable *ht, const void *key, struct rhash_head *obj,
+ const struct rhashtable_params params, bool rhlist)
{
struct rhashtable_compare_arg arg = {
.ht = ht,
.key = key,
};
- struct bucket_table *tbl, *new_tbl;
+ struct rhash_lock_head __rcu **bkt;
+ struct rhash_head __rcu **pprev;
+ struct bucket_table *tbl;
struct rhash_head *head;
- spinlock_t *lock;
- unsigned int elasticity;
unsigned int hash;
- int err;
+ int elasticity;
+ void *data;
-restart:
rcu_read_lock();
tbl = rht_dereference_rcu(ht->tbl, ht);
+ hash = rht_head_hashfn(ht, tbl, obj, params);
+ elasticity = RHT_ELASTICITY;
+ bkt = rht_bucket_insert(ht, tbl, hash);
+ data = ERR_PTR(-ENOMEM);
+ if (!bkt)
+ goto out;
+ pprev = NULL;
+ rht_lock(tbl, bkt);
- /* All insertions must grab the oldest table containing
- * the hashed bucket that is yet to be rehashed.
- */
- for (;;) {
- hash = rht_head_hashfn(ht, tbl, obj, params);
- lock = rht_bucket_lock(tbl, hash);
- spin_lock_bh(lock);
+ if (unlikely(rcu_access_pointer(tbl->future_tbl))) {
+slow_path:
+ rht_unlock(tbl, bkt);
+ rcu_read_unlock();
+ return rhashtable_insert_slow(ht, key, obj);
+ }
- if (tbl->rehash <= hash)
- break;
+ rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) {
+ struct rhlist_head *plist;
+ struct rhlist_head *list;
- spin_unlock_bh(lock);
- tbl = rht_dereference_rcu(tbl->future_tbl, ht);
- }
+ elasticity--;
+ if (!key ||
+ (params.obj_cmpfn ?
+ params.obj_cmpfn(&arg, rht_obj(ht, head)) :
+ rhashtable_compare(&arg, rht_obj(ht, head)))) {
+ pprev = &head->next;
+ continue;
+ }
- new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
- if (unlikely(new_tbl)) {
- tbl = rhashtable_insert_slow(ht, key, obj, new_tbl);
- if (!IS_ERR_OR_NULL(tbl))
- goto slow_path;
+ data = rht_obj(ht, head);
- err = PTR_ERR(tbl);
- goto out;
- }
+ if (!rhlist)
+ goto out_unlock;
- err = -E2BIG;
- if (unlikely(rht_grow_above_max(ht, tbl)))
- goto out;
- if (unlikely(rht_grow_above_100(ht, tbl))) {
-slow_path:
- spin_unlock_bh(lock);
- err = rhashtable_insert_rehash(ht, tbl);
- rcu_read_unlock();
- if (err)
- return err;
+ list = container_of(obj, struct rhlist_head, rhead);
+ plist = container_of(head, struct rhlist_head, rhead);
- goto restart;
+ RCU_INIT_POINTER(list->next, plist);
+ head = rht_dereference_bucket(head->next, tbl, hash);
+ RCU_INIT_POINTER(list->rhead.next, head);
+ if (pprev) {
+ rcu_assign_pointer(*pprev, obj);
+ rht_unlock(tbl, bkt);
+ } else
+ rht_assign_unlock(tbl, bkt, obj);
+ data = NULL;
+ goto out;
}
- err = -EEXIST;
- elasticity = ht->elasticity;
- rht_for_each(head, tbl, hash) {
- if (key &&
- unlikely(!(params.obj_cmpfn ?
- params.obj_cmpfn(&arg, rht_obj(ht, head)) :
- rhashtable_compare(&arg, rht_obj(ht, head)))))
- goto out;
- if (!--elasticity)
- goto slow_path;
- }
+ if (elasticity <= 0)
+ goto slow_path;
+
+ data = ERR_PTR(-E2BIG);
+ if (unlikely(rht_grow_above_max(ht, tbl)))
+ goto out_unlock;
- err = 0;
+ if (unlikely(rht_grow_above_100(ht, tbl)))
+ goto slow_path;
- head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash);
+ /* Inserting at head of list makes unlocking free. */
+ head = rht_ptr(bkt, tbl, hash);
RCU_INIT_POINTER(obj->next, head);
+ if (rhlist) {
+ struct rhlist_head *list;
- rcu_assign_pointer(tbl->buckets[hash], obj);
+ list = container_of(obj, struct rhlist_head, rhead);
+ RCU_INIT_POINTER(list->next, NULL);
+ }
atomic_inc(&ht->nelems);
+ rht_assign_unlock(tbl, bkt, obj);
+
if (rht_grow_above_75(ht, tbl))
schedule_work(&ht->run_work);
+ data = NULL;
out:
- spin_unlock_bh(lock);
rcu_read_unlock();
- return err;
+ return data;
+
+out_unlock:
+ rht_unlock(tbl, bkt);
+ goto out;
}
+/**
+ * rhashtable_insert_fast - insert object into hash table
+ * @ht: hash table
+ * @obj: pointer to hash head inside object
+ * @params: hash table parameters
+ *
+ * Will take the per bucket bitlock to protect against mutual mutations
+ * on the same bucket. Multiple insertions may occur in parallel unless
+ * they map to the same bucket.
+ *
+ * It is safe to call this function from atomic context.
+ *
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
+ */
+static inline int rhashtable_insert_fast(
+ struct rhashtable *ht, struct rhash_head *obj,
+ const struct rhashtable_params params)
+{
+ void *ret;
+
+ ret = __rhashtable_insert_fast(ht, NULL, obj, params, false);
+ if (IS_ERR(ret))
+ return PTR_ERR(ret);
+
+ return ret == NULL ? 0 : -EEXIST;
+}
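For reference, an illustrative sketch of the fixed-key usage this interface is aimed at; struct my_obj, ht, obj, key and my_params are hypothetical names, and rhashtable_init() is assumed to be declared elsewhere in this header:

	struct my_obj {
		u32			key;
		struct rhash_head	node;
	};

	static const struct rhashtable_params my_params = {
		.key_len	= sizeof(u32),
		.key_offset	= offsetof(struct my_obj, key),
		.head_offset	= offsetof(struct my_obj, node),
	};

	ret = rhashtable_init(&ht, &my_params);
	ret = rhashtable_insert_fast(&ht, &obj->node, my_params);
	obj = rhashtable_lookup_fast(&ht, &key, my_params);
	ret = rhashtable_remove_fast(&ht, &obj->node, my_params);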
+
+/**
+ * rhltable_insert_key - insert object into hash list table
+ * @hlt: hash list table
+ * @key: the pointer to the key
+ * @list: pointer to hash list head inside object
+ * @params: hash table parameters
+ *
+ * Will take the per bucket bitlock to protect against mutual mutations
+ * on the same bucket. Multiple insertions may occur in parallel unless
+ * they map to the same bucket.
+ *
+ * It is safe to call this function from atomic context.
+ *
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
+ */
+static inline int rhltable_insert_key(
+ struct rhltable *hlt, const void *key, struct rhlist_head *list,
+ const struct rhashtable_params params)
+{
+ return PTR_ERR(__rhashtable_insert_fast(&hlt->ht, key, &list->rhead,
+ params, true));
+}
+
+/**
+ * rhltable_insert - insert object into hash list table
+ * @hlt: hash list table
+ * @list: pointer to hash list head inside object
+ * @params: hash table parameters
+ *
+ * Will take the per bucket bitlock to protect against mutual mutations
+ * on the same bucket. Multiple insertions may occur in parallel unless
+ * they map to the same bucket.
+ *
+ * It is safe to call this function from atomic context.
+ *
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
+ */
+static inline int rhltable_insert(
+ struct rhltable *hlt, struct rhlist_head *list,
+ const struct rhashtable_params params)
+{
+ const char *key = rht_obj(&hlt->ht, &list->rhead);
+
+ key += params.key_offset;
+
+ return rhltable_insert_key(hlt, key, list, params);
+}
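A corresponding sketch for the list-table variant, which allows duplicate keys; here the object embeds a struct rhlist_head (list_node) and head_offset in the params points at it, with rhltable_init() assumed to be declared elsewhere:

	ret = rhltable_init(&hlt, &my_params);
	ret = rhltable_insert(&hlt, &obj->list_node, my_params);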
+
+/**
+ * rhashtable_lookup_insert_fast - lookup and insert object into hash table
+ * @ht: hash table
+ * @obj: pointer to hash head inside object
+ * @params: hash table parameters
+ *
+ * This lookup function may only be used for fixed key hash table (key_len
+ * parameter set). It will BUG() if used inappropriately.
+ *
+ * It is safe to call this function from atomic context.
+ *
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
+ */
static inline int rhashtable_lookup_insert_fast(
struct rhashtable *ht, struct rhash_head *obj,
const struct rhashtable_params params)
{
const char *key = rht_obj(ht, obj);
+ void *ret;
BUG_ON(ht->p.obj_hashfn);
- return __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj,
- params);
+ ret = __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params,
+ false);
+ if (IS_ERR(ret))
+ return PTR_ERR(ret);
+
+ return ret == NULL ? 0 : -EEXIST;
}
-static inline int __rhashtable_remove_fast(
+/**
+ * rhashtable_lookup_get_insert_fast - lookup and insert object into hash table
+ * @ht: hash table
+ * @obj: pointer to hash head inside object
+ * @params: hash table parameters
+ *
+ * Just like rhashtable_lookup_insert_fast(), but this function returns the
+ * object if it exists, NULL if it does not and the insertion was successful,
+ * and an ERR_PTR otherwise.
+ */
+static inline void *rhashtable_lookup_get_insert_fast(
+ struct rhashtable *ht, struct rhash_head *obj,
+ const struct rhashtable_params params)
+{
+ const char *key = rht_obj(ht, obj);
+
+ BUG_ON(ht->p.obj_hashfn);
+
+ return __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params,
+ false);
+}
+
+/**
+ * rhashtable_lookup_insert_key - search and insert object to hash table
+ * with explicit key
+ * @ht: hash table
+ * @key: key
+ * @obj: pointer to hash head inside object
+ * @params: hash table parameters
+ *
+ * Lookups may occur in parallel with hashtable mutations and resizing.
+ *
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
+ *
+ * Returns zero on success.
+ */
+static inline int rhashtable_lookup_insert_key(
+ struct rhashtable *ht, const void *key, struct rhash_head *obj,
+ const struct rhashtable_params params)
+{
+ void *ret;
+
+ BUG_ON(!ht->p.obj_hashfn || !key);
+
+ ret = __rhashtable_insert_fast(ht, key, obj, params, false);
+ if (IS_ERR(ret))
+ return PTR_ERR(ret);
+
+ return ret == NULL ? 0 : -EEXIST;
+}
+
+/**
+ * rhashtable_lookup_get_insert_key - lookup and insert object into hash table
+ * @ht: hash table
+ * @key: key
+ * @obj: pointer to hash head inside object
+ * @params: hash table parameters
+ *
+ * Just like rhashtable_lookup_insert_key(), but this function returns the
+ * object if it exists, NULL if it does not and the insertion was successful,
+ * and an ERR_PTR otherwise.
+ */
+static inline void *rhashtable_lookup_get_insert_key(
+ struct rhashtable *ht, const void *key, struct rhash_head *obj,
+ const struct rhashtable_params params)
+{
+ BUG_ON(!ht->p.obj_hashfn || !key);
+
+ return __rhashtable_insert_fast(ht, key, obj, params, false);
+}
+
+/* Internal function, please use rhashtable_remove_fast() instead */
+static inline int __rhashtable_remove_fast_one(
struct rhashtable *ht, struct bucket_table *tbl,
- struct rhash_head *obj, const struct rhashtable_params params)
+ struct rhash_head *obj, const struct rhashtable_params params,
+ bool rhlist)
{
+ struct rhash_lock_head __rcu **bkt;
struct rhash_head __rcu **pprev;
struct rhash_head *he;
- spinlock_t * lock;
unsigned int hash;
int err = -ENOENT;
hash = rht_head_hashfn(ht, tbl, obj, params);
- lock = rht_bucket_lock(tbl, hash);
+ bkt = rht_bucket_var(tbl, hash);
+ if (!bkt)
+ return -ENOENT;
+ pprev = NULL;
+ rht_lock(tbl, bkt);
- spin_lock_bh(lock);
+ rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) {
+ struct rhlist_head *list;
+
+ list = container_of(he, struct rhlist_head, rhead);
- pprev = &tbl->buckets[hash];
- rht_for_each(he, tbl, hash) {
if (he != obj) {
+ struct rhlist_head __rcu **lpprev;
+
pprev = &he->next;
- continue;
+
+ if (!rhlist)
+ continue;
+
+ do {
+ lpprev = &list->next;
+ list = rht_dereference_bucket(list->next,
+ tbl, hash);
+ } while (list && obj != &list->rhead);
+
+ if (!list)
+ continue;
+
+ list = rht_dereference_bucket(list->next, tbl, hash);
+ RCU_INIT_POINTER(*lpprev, list);
+ err = 0;
+ break;
}
- rcu_assign_pointer(*pprev, obj->next);
+ obj = rht_dereference_bucket(obj->next, tbl, hash);
+ err = 1;
+
+ if (rhlist) {
+ list = rht_dereference_bucket(list->next, tbl, hash);
+ if (list) {
+ RCU_INIT_POINTER(list->rhead.next, obj);
+ obj = &list->rhead;
+ err = 0;
+ }
+ }
+
+ if (pprev) {
+ rcu_assign_pointer(*pprev, obj);
+ rht_unlock(tbl, bkt);
+ } else {
+ rht_assign_unlock(tbl, bkt, obj);
+ }
+ goto unlocked;
+ }
+
+ rht_unlock(tbl, bkt);
+unlocked:
+ if (err > 0) {
+ atomic_dec(&ht->nelems);
+ if (unlikely(ht->p.automatic_shrinking &&
+ rht_shrink_below_30(ht, tbl)))
+ schedule_work(&ht->run_work);
err = 0;
- break;
}
- spin_unlock_bh(lock);
+ return err;
+}
+
+/* Internal function, please use rhashtable_remove_fast() instead */
+static inline int __rhashtable_remove_fast(
+ struct rhashtable *ht, struct rhash_head *obj,
+ const struct rhashtable_params params, bool rhlist)
+{
+ struct bucket_table *tbl;
+ int err;
+
+ rcu_read_lock();
+
+ tbl = rht_dereference_rcu(ht->tbl, ht);
+
+ /* Because we have already taken (and released) the bucket
+ * lock in old_tbl, if we find that future_tbl is not yet
+ * visible then that guarantees the entry to still be in
+ * the old tbl if it exists.
+ */
+ while ((err = __rhashtable_remove_fast_one(ht, tbl, obj, params,
+ rhlist)) &&
+ (tbl = rht_dereference_rcu(tbl->future_tbl, ht)))
+ ;
+
+ rcu_read_unlock();
return err;
}
+/**
+ * rhashtable_remove_fast - remove object from hash table
+ * @ht: hash table
+ * @obj: pointer to hash head inside object
+ * @params: hash table parameters
+ *
+ * Since the hash chain is singly linked, the removal operation needs to
+ * walk the bucket chain upon removal. The removal operation is thus
+ * considerably slow if the hash table is not correctly sized.
+ *
+ * Will automatically shrink the table if permitted when residency drops
+ * below 30%.
+ *
+ * Returns zero on success, -ENOENT if the entry could not be found.
+ */
static inline int rhashtable_remove_fast(
struct rhashtable *ht, struct rhash_head *obj,
const struct rhashtable_params params)
+{
+ return __rhashtable_remove_fast(ht, obj, params, false);
+}
+
+/**
+ * rhltable_remove - remove object from hash list table
+ * @hlt: hash list table
+ * @list: pointer to hash list head inside object
+ * @params: hash table parameters
+ *
+ * Since the hash chain is singly linked, the removal operation needs to
+ * walk the bucket chain upon removal. The removal operation is thus
+ * considerably slow if the hash table is not correctly sized.
+ *
+ * Will automatically shrink the table if permitted when residency drops
+ * below 30%
+ *
+ * Returns zero on success, -ENOENT if the entry could not be found.
+ */
+static inline int rhltable_remove(
+ struct rhltable *hlt, struct rhlist_head *list,
+ const struct rhashtable_params params)
+{
+ return __rhashtable_remove_fast(&hlt->ht, &list->rhead, params, true);
+}
+
+/* Internal function, please use rhashtable_replace_fast() instead */
+static inline int __rhashtable_replace_fast(
+ struct rhashtable *ht, struct bucket_table *tbl,
+ struct rhash_head *obj_old, struct rhash_head *obj_new,
+ const struct rhashtable_params params)
+{
+ struct rhash_lock_head __rcu **bkt;
+ struct rhash_head __rcu **pprev;
+ struct rhash_head *he;
+ unsigned int hash;
+ int err = -ENOENT;
+
+ /* Minimally, the old and new objects must have the same hash
+ * (which should mean identifiers are the same).
+ */
+ hash = rht_head_hashfn(ht, tbl, obj_old, params);
+ if (hash != rht_head_hashfn(ht, tbl, obj_new, params))
+ return -EINVAL;
+
+ bkt = rht_bucket_var(tbl, hash);
+ if (!bkt)
+ return -ENOENT;
+
+ pprev = NULL;
+ rht_lock(tbl, bkt);
+
+ rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) {
+ if (he != obj_old) {
+ pprev = &he->next;
+ continue;
+ }
+
+ rcu_assign_pointer(obj_new->next, obj_old->next);
+ if (pprev) {
+ rcu_assign_pointer(*pprev, obj_new);
+ rht_unlock(tbl, bkt);
+ } else {
+ rht_assign_unlock(tbl, bkt, obj_new);
+ }
+ err = 0;
+ goto unlocked;
+ }
+
+ rht_unlock(tbl, bkt);
+
+unlocked:
+ return err;
+}
+
+/**
+ * rhashtable_replace_fast - replace an object in hash table
+ * @ht: hash table
+ * @obj_old: pointer to hash head inside object being replaced
+ * @obj_new: pointer to hash head inside object which is new
+ * @params: hash table parameters
+ *
+ * Replacing an object doesn't affect the number of elements in the hash table
+ * or bucket, so we don't need to worry about shrinking or expanding the
+ * table here.
+ *
+ * Returns zero on success, -ENOENT if the entry could not be found,
+ * -EINVAL if hash is not the same for the old and new objects.
+ */
+static inline int rhashtable_replace_fast(
+ struct rhashtable *ht, struct rhash_head *obj_old,
+ struct rhash_head *obj_new,
+ const struct rhashtable_params params)
{
struct bucket_table *tbl;
int err;
* visible then that guarantees the entry to still be in
* the old tbl if it exists.
*/
- while ((err = __rhashtable_remove_fast(ht, tbl, obj, params)) &&
+ while ((err = __rhashtable_replace_fast(ht, tbl, obj_old,
+ obj_new, params)) &&
(tbl = rht_dereference_rcu(tbl->future_tbl, ht)))
;
- if (err)
- goto out;
-
- atomic_dec(&ht->nelems);
- if (unlikely(ht->p.automatic_shrinking &&
- rht_shrink_below_30(ht, tbl)))
- schedule_work(&ht->run_work);
-
-out:
rcu_read_unlock();
return err;
}
+/**
+ * rhltable_walk_enter - Initialise an iterator
+ * @hlt: Table to walk over
+ * @iter: Hash table Iterator
+ *
+ * This function prepares a hash table walk.
+ *
+ * Note that if you restart a walk after rhashtable_walk_stop you
+ * may see the same object twice. Also, you may miss objects if
+ * there are removals in between rhashtable_walk_stop and the next
+ * call to rhashtable_walk_start.
+ *
+ * For a completely stable walk you should construct your own data
+ * structure outside the hash table.
+ *
+ * This function may be called from any process context, including
+ * non-preemptable context, but cannot be called from softirq or
+ * hardirq context.
+ *
+ * You must call rhashtable_walk_exit after this function returns.
+ */
+static inline void rhltable_walk_enter(struct rhltable *hlt,
+ struct rhashtable_iter *iter)
+{
+ return rhashtable_walk_enter(&hlt->ht, iter);
+}
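For reference, the usual pattern for the walk iterator declared above (illustrative only; rhashtable_walk_next() returns an ERR_PTR(-EAGAIN) entry when the table was resized mid-walk, in which case some objects may be seen twice):

	struct rhashtable_iter iter;
	struct my_obj *obj;

	rhashtable_walk_enter(&ht, &iter);
	rhashtable_walk_start(&iter);
	while ((obj = rhashtable_walk_next(&iter)) != NULL) {
		if (IS_ERR(obj))
			continue;	/* -EAGAIN: resize raced with the walk */
		use_object(obj);
	}
	rhashtable_walk_stop(&iter);
	rhashtable_walk_exit(&iter);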
+
+/**
+ * rhltable_free_and_destroy - free elements and destroy hash list table
+ * @hlt: the hash list table to destroy
+ * @free_fn: callback to release resources of element
+ * @arg: pointer passed to free_fn
+ *
+ * See documentation for rhashtable_free_and_destroy.
+ */
+static inline void rhltable_free_and_destroy(struct rhltable *hlt,
+ void (*free_fn)(void *ptr,
+ void *arg),
+ void *arg)
+{
+ return rhashtable_free_and_destroy(&hlt->ht, free_fn, arg);
+}
+
+static inline void rhltable_destroy(struct rhltable *hlt)
+{
+ return rhltable_free_and_destroy(hlt, NULL, NULL);
+}
+
#endif /* _LINUX_RHASHTABLE_H */
#ifndef _LINUX_SCHED_MM_H
#define _LINUX_SCHED_MM_H
-#define PF_MEMALLOC_NOFS 0
+#define PF_MEMALLOC 0x00000800 /* Allocating memory */
+#define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */
static inline unsigned int memalloc_nofs_save(void)
{
current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags;
}
+static inline unsigned int memalloc_noreclaim_save(void)
+{
+ unsigned int flags = current->flags & PF_MEMALLOC;
+ current->flags |= PF_MEMALLOC;
+ return flags;
+}
+
+static inline void memalloc_noreclaim_restore(unsigned int flags)
+{
+ current->flags = (current->flags & ~PF_MEMALLOC) | flags;
+}
+
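Typical scoped use of the helpers above, mirroring the kernel pattern (illustrative only):

	unsigned int flags = memalloc_noreclaim_save();

	/* allocations in this scope are treated as PF_MEMALLOC */
	p = kmalloc(size, GFP_KERNEL);

	memalloc_noreclaim_restore(flags);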
#endif /* _LINUX_SCHED_MM_H */
};
struct {
- unsigned read_lock:28;
+ unsigned read_lock:27;
+ unsigned write_locking:1;
unsigned intent_lock:1;
unsigned waiters:3;
/*
unsigned intent_lock_recurse;
struct task_struct *owner;
struct optimistic_spin_queue osq;
+ unsigned __percpu *readers;
raw_spinlock_t wait_lock;
struct list_head wait_list[2];
void six_lock_wakeup_all(struct six_lock *);
+void six_lock_pcpu_free_rcu(struct six_lock *);
+void six_lock_pcpu_free(struct six_lock *);
+void six_lock_pcpu_alloc(struct six_lock *);
+
#endif /* _LINUX_SIX_H */
#define kzalloc(size, flags) kmalloc(size, flags|__GFP_ZERO)
#define kmalloc_array(n, size, flags) \
((size) != 0 && (n) > SIZE_MAX / (size) \
- ? NULL : kmalloc(n * size, flags))
+ ? NULL : kmalloc((n) * (size), flags))
#define kcalloc(n, size, flags) kmalloc_array(n, size, flags|__GFP_ZERO)
#define kzfree(p) free(p)
#define kvmalloc(size, flags) kmalloc(size, flags)
+#define kvzalloc(size, flags) kzalloc(size, flags)
#define kvfree(p) kfree(p)
static inline struct page *alloc_pages(gfp_t flags, unsigned int order)
return p;
}
+struct kmem_cache {
+ size_t obj_size;
+};
+
+static inline void *kmem_cache_alloc(struct kmem_cache *c, gfp_t gfp)
+{
+ return kmalloc(c->obj_size, gfp);
+}
+
+static inline void kmem_cache_free(struct kmem_cache *c, void *p)
+{
+ kfree(p);
+}
+
+static inline void kmem_cache_destroy(struct kmem_cache *p)
+{
+ kfree(p);
+}
+
+static inline struct kmem_cache *kmem_cache_create(size_t obj_size)
+{
+ struct kmem_cache *p = kmalloc(sizeof(*p), GFP_KERNEL);
+ if (!p)
+ return NULL;
+
+ p->obj_size = obj_size;
+ return p;
+}
+
+#define KMEM_CACHE(_struct, _flags) kmem_cache_create(sizeof(struct _struct))
+
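With the shim above, kernel-style slab code compiles unchanged in userspace, each cache simply forwarding to kmalloc()/kfree(); an illustrative sketch (struct my_struct is a hypothetical name):

	struct kmem_cache *cache = KMEM_CACHE(my_struct, 0);
	struct my_struct *p = kmem_cache_alloc(cache, GFP_KERNEL);

	kmem_cache_free(cache, p);
	kmem_cache_destroy(cache);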
#endif /* __TOOLS_LINUX_SLAB_H */
--- /dev/null
+#ifndef __TOOLS_LINUX_SRCU_H
+#define __TOOLS_LINUX_SRCU_H
+
+struct srcu_struct {
+};
+
+static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx) {}
+
+static inline int srcu_read_lock(struct srcu_struct *ssp)
+{
+ return 0;
+}
+
+static inline bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
+{
+ return false;
+}
+
+static inline unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp)
+{
+ return 0;
+}
+
+static inline void cleanup_srcu_struct(struct srcu_struct *ssp) {}
+
+static inline int init_srcu_struct(struct srcu_struct *ssp)
+{
+ return 0;
+}
+
+#endif /* __TOOLS_LINUX_SRCU_H */
#define __SANE_USERSPACE_TYPES__ /* For PPC64, to get LL64 types */
#include <asm/types.h>
+#include <linux/cache.h>
+
#define BITS_PER_LONG __BITS_PER_LONG
struct page;
#define __GFP_IO 0
#define __GFP_NOWARN 0
#define __GFP_NORETRY 0
+#define __GFP_NOFAIL 0
#define __GFP_ZERO 1
#define PAGE_ALLOC_COSTLY_ORDER 6
{
void *p;
+ size = round_up(size, PAGE_SIZE);
+
run_shrinkers();
p = aligned_alloc(PAGE_SIZE, size);
} while (0)
#define wait_event_killable(wq, condition) ({wait_event(wq, condition); 0; })
+#define wait_event_interruptible(wq, condition) ({wait_event(wq, condition); 0; })
#define __wait_event_timeout(wq, condition, timeout) \
___wait_event(wq, ___wait_cond_timeout(condition), \
TP_ARGS(bio)
);
+TRACE_EVENT(journal_reclaim_start,
+ TP_PROTO(struct bch_fs *c, u64 min_nr,
+ u64 prereserved, u64 prereserved_total,
+ u64 btree_cache_dirty, u64 btree_cache_total,
+ u64 btree_key_cache_dirty, u64 btree_key_cache_total),
+ TP_ARGS(c, min_nr, prereserved, prereserved_total,
+ btree_cache_dirty, btree_cache_total,
+ btree_key_cache_dirty, btree_key_cache_total),
+
+ TP_STRUCT__entry(
+ __array(char, uuid, 16 )
+ __field(u64, min_nr )
+ __field(u64, prereserved )
+ __field(u64, prereserved_total )
+ __field(u64, btree_cache_dirty )
+ __field(u64, btree_cache_total )
+ __field(u64, btree_key_cache_dirty )
+ __field(u64, btree_key_cache_total )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
+ __entry->min_nr = min_nr;
+ __entry->prereserved = prereserved;
+ __entry->prereserved_total = prereserved_total;
+ __entry->btree_cache_dirty = btree_cache_dirty;
+ __entry->btree_cache_total = btree_cache_total;
+ __entry->btree_key_cache_dirty = btree_key_cache_dirty;
+ __entry->btree_key_cache_total = btree_key_cache_total;
+ ),
+
+ TP_printk("%pU min %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu",
+ __entry->uuid,
+ __entry->min_nr,
+ __entry->prereserved,
+ __entry->prereserved_total,
+ __entry->btree_cache_dirty,
+ __entry->btree_cache_total,
+ __entry->btree_key_cache_dirty,
+ __entry->btree_key_cache_total)
+);
+
+TRACE_EVENT(journal_reclaim_finish,
+ TP_PROTO(struct bch_fs *c, u64 nr_flushed),
+ TP_ARGS(c, nr_flushed),
+
+ TP_STRUCT__entry(
+ __array(char, uuid, 16 )
+ __field(u64, nr_flushed )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
+ __entry->nr_flushed = nr_flushed;
+ ),
+
+ TP_printk("%pU flushed %llu", __entry->uuid, __entry->nr_flushed)
+);
+
/* bset.c: */
DEFINE_EVENT(bpos, bkey_pack_pos_fail,
__entry->ip = ip;
),
- TP_printk("%pF %pF", (void *) __entry->caller, (void *) __entry->ip)
+ TP_printk("%ps %pS", (void *) __entry->caller, (void *) __entry->ip)
);
DECLARE_EVENT_CLASS(transaction_restart,
__entry->ip = ip;
),
- TP_printk("%pf", (void *) __entry->ip)
+ TP_printk("%ps", (void *) __entry->ip)
);
DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused,
TP_ARGS(ip)
);
-DEFINE_EVENT(transaction_restart, trans_restart_would_deadlock,
- TP_PROTO(unsigned long ip),
- TP_ARGS(ip)
+TRACE_EVENT(trans_restart_would_deadlock,
+ TP_PROTO(unsigned long trans_ip,
+ unsigned long caller_ip,
+ unsigned reason,
+ enum btree_id have_btree_id,
+ unsigned have_iter_type,
+ enum btree_id want_btree_id,
+ unsigned want_iter_type),
+ TP_ARGS(trans_ip, caller_ip, reason,
+ have_btree_id, have_iter_type,
+ want_btree_id, want_iter_type),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, trans_ip )
+ __field(unsigned long, caller_ip )
+ __field(u8, reason )
+ __field(u8, have_btree_id )
+ __field(u8, have_iter_type )
+ __field(u8, want_btree_id )
+ __field(u8, want_iter_type )
+ ),
+
+ TP_fast_assign(
+ __entry->trans_ip = trans_ip;
+ __entry->caller_ip = caller_ip;
+ __entry->reason = reason;
+ __entry->have_btree_id = have_btree_id;
+ __entry->have_iter_type = have_iter_type;
+ __entry->want_btree_id = want_btree_id;
+ __entry->want_iter_type = want_iter_type;
+ ),
+
+ TP_printk("%ps %pS because %u have %u:%u want %u:%u",
+ (void *) __entry->trans_ip,
+ (void *) __entry->caller_ip,
+ __entry->reason,
+ __entry->have_btree_id,
+ __entry->have_iter_type,
+ __entry->want_btree_id,
+ __entry->want_iter_type)
);
TRACE_EVENT(trans_restart_iters_realloced,
__entry->nr = nr;
),
- TP_printk("%pf nr %u", (void *) __entry->ip, __entry->nr)
+ TP_printk("%ps nr %u", (void *) __entry->ip, __entry->nr)
);
TRACE_EVENT(trans_restart_mem_realloced,
__entry->bytes = bytes;
),
- TP_printk("%pf bytes %lu", (void *) __entry->ip, __entry->bytes)
+ TP_printk("%ps bytes %lu", (void *) __entry->ip, __entry->bytes)
);
DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get,
TP_ARGS(ip)
);
+DEFINE_EVENT(transaction_restart, trans_restart_journal_reclaim,
+ TP_PROTO(unsigned long ip),
+ TP_ARGS(ip)
+);
+
DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas,
TP_PROTO(unsigned long ip),
TP_ARGS(ip)
TP_ARGS(ip)
);
-DEFINE_EVENT(transaction_restart, trans_restart_atomic,
- TP_PROTO(unsigned long ip),
- TP_ARGS(ip)
-);
-
DECLARE_EVENT_CLASS(node_lock_fail,
TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq),
TP_ARGS(level, iter_seq, node, node_seq),
return BCH_MIN_NR_NBUCKETS * bucket_size;
}
-static void init_layout(struct bch_sb_layout *l, unsigned block_size,
+static void init_layout(struct bch_sb_layout *l,
+ unsigned block_size,
+ unsigned sb_size,
u64 start, u64 end)
{
- unsigned sb_size;
- u64 backup; /* offset of 2nd sb */
+ unsigned i;
memset(l, 0, sizeof(*l));
- if (start != BCH_SB_SECTOR)
- start = round_up(start, block_size);
- end = round_down(end, block_size);
-
- if (start >= end)
- die("insufficient space for superblocks");
-
- /*
- * Create two superblocks in the allowed range: reserve a maximum of 64k
- */
- sb_size = min_t(u64, 128, end - start / 2);
-
- backup = start + sb_size;
- backup = round_up(backup, block_size);
-
- backup = min(backup, end);
-
- sb_size = min(end - backup, backup- start);
- sb_size = rounddown_pow_of_two(sb_size);
-
- if (sb_size < 8)
- die("insufficient space for superblocks");
-
l->magic = BCACHE_MAGIC;
l->layout_type = 0;
l->nr_superblocks = 2;
l->sb_max_size_bits = ilog2(sb_size);
- l->sb_offset[0] = cpu_to_le64(start);
- l->sb_offset[1] = cpu_to_le64(backup);
+
+ /* Create two superblocks in the allowed range: */
+ for (i = 0; i < l->nr_superblocks; i++) {
+ if (start != BCH_SB_SECTOR)
+ start = round_up(start, block_size);
+
+ l->sb_offset[i] = cpu_to_le64(start);
+ start += sb_size;
+ }
+
+ if (start >= end)
+ die("insufficient space for superblocks");
}
void bch2_pick_bucket_size(struct bch_opts opts, struct dev_opts *dev)
{
- if (!dev->sb_offset) {
- dev->sb_offset = BCH_SB_SECTOR;
- dev->sb_end = BCH_SB_SECTOR + 256;
- }
-
if (!dev->size)
dev->size = get_size(dev->path, dev->fd) >> 9;
if (bch2_sb_realloc(&sb, 0))
die("insufficient memory");
- sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current);
- sb.sb->version_min = le16_to_cpu(bcachefs_metadata_version_current);
+ sb.sb->version = le16_to_cpu(opts.version);
+ sb.sb->version_min = le16_to_cpu(opts.version);
sb.sb->magic = BCACHE_MAGIC;
sb.sb->block_size = cpu_to_le16(fs_opts.block_size);
sb.sb->user_uuid = opts.uuid;
sb.sb->nr_devices = nr_devs;
+ if (opts.version == bcachefs_metadata_version_current)
+ sb.sb->features[0] |= BCH_SB_FEATURES_ALL;
+
uuid_generate(sb.sb->uuid.b);
if (opts.label)
m->first_bucket = 0;
m->bucket_size = cpu_to_le16(i->bucket_size);
- SET_BCH_MEMBER_REPLACEMENT(m, CACHE_REPLACEMENT_LRU);
+ SET_BCH_MEMBER_REPLACEMENT(m, BCH_CACHE_REPLACEMENT_lru);
SET_BCH_MEMBER_DISCARD(m, i->discard);
SET_BCH_MEMBER_DATA_ALLOWED(m, i->data_allowed);
SET_BCH_MEMBER_DURABILITY(m, i->durability + 1);
parse_target(&sb, devs, nr_devs, fs_opt_strs.background_target));
SET_BCH_SB_PROMOTE_TARGET(sb.sb,
parse_target(&sb, devs, nr_devs, fs_opt_strs.promote_target));
+ SET_BCH_SB_METADATA_TARGET(sb.sb,
+ parse_target(&sb, devs, nr_devs, fs_opt_strs.metadata_target));
/* Crypt: */
if (opts.encrypted) {
for (i = devs; i < devs + nr_devs; i++) {
sb.sb->dev_idx = i - devs;
+ if (!i->sb_offset) {
+ i->sb_offset = BCH_SB_SECTOR;
+ i->sb_end = i->size;
+ }
+
init_layout(&sb.sb->layout, fs_opts.block_size,
+ opts.superblock_size,
i->sb_offset, i->sb_end);
if (i->sb_offset == BCH_SB_SECTOR) {
time_str,
BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR
- ? bch2_dev_state[BCH_MEMBER_STATE(m)]
+ ? bch2_member_states[BCH_MEMBER_STATE(m)]
: "unknown",
group,
data_allowed_str,
data_has_str,
- BCH_MEMBER_REPLACEMENT(m) < CACHE_REPLACEMENT_NR
+ BCH_MEMBER_REPLACEMENT(m) < BCH_CACHE_REPLACEMENT_NR
? bch2_cache_replacement_policies[BCH_MEMBER_REPLACEMENT(m)]
: "unknown",
static void bch2_sb_print_clean(struct bch_sb *sb, struct bch_sb_field *f,
enum units units)
{
+ struct bch_sb_field_clean *clean = field_to_type(f, clean);
+
+ printf(" flags: %x", le32_to_cpu(clean->flags));
+ printf(" journal seq: %llx", le64_to_cpu(clean->journal_seq));
}
static void bch2_sb_print_journal_seq_blacklist(struct bch_sb *sb, struct bch_sb_field *f,
{
struct bch_sb_field_members *mi;
char user_uuid_str[40], internal_uuid_str[40];
- char features_str[200];
+ char features_str[500];
+ char compat_features_str[500];
char fields_have_str[200];
char label[BCH_SB_LABEL_SIZE + 1];
char time_str[64];
char foreground_str[64];
char background_str[64];
char promote_str[64];
+ char metadata_str[64];
struct bch_sb_field *f;
u64 fields_have = 0;
unsigned nr_devices = 0;
bch2_sb_get_target(sb, promote_str, sizeof(promote_str),
BCH_SB_PROMOTE_TARGET(sb));
+ bch2_sb_get_target(sb, metadata_str, sizeof(metadata_str),
+ BCH_SB_METADATA_TARGET(sb));
+
bch2_flags_to_text(&PBUF(features_str),
bch2_sb_features,
le64_to_cpu(sb->features[0]));
+ bch2_flags_to_text(&PBUF(compat_features_str),
+ bch2_sb_compat,
+ le64_to_cpu(sb->compat[0]));
+
vstruct_for_each(sb, f)
fields_have |= 1 << le32_to_cpu(f->type);
bch2_flags_to_text(&PBUF(fields_have_str),
printf("External UUID: %s\n"
"Internal UUID: %s\n"
+ "Device index: %u\n"
"Label: %s\n"
- "Version: %llu\n"
+ "Version: %u\n"
+ "Oldest version on disk: %u\n"
"Created: %s\n"
"Squence number: %llu\n"
"Block_size: %s\n"
"Error action: %s\n"
"Clean: %llu\n"
"Features: %s\n"
+ "Compat features: %s\n"
"Metadata replicas: %llu\n"
"Data replicas: %llu\n"
"Foreground write target: %s\n"
"Background write target: %s\n"
"Promote target: %s\n"
+ "Metadata target: %s\n"
"String hash type: %s (%llu)\n"
"32 bit inodes: %llu\n"
"Superblock size: %llu\n",
user_uuid_str,
internal_uuid_str,
+ sb->dev_idx,
label,
- le64_to_cpu(sb->version),
+ le16_to_cpu(sb->version),
+ le16_to_cpu(sb->version_min),
time_str,
le64_to_cpu(sb->seq),
pr_units(le16_to_cpu(sb->block_size), units),
pr_units(BCH_SB_BTREE_NODE_SIZE(sb), units),
- BCH_SB_ERROR_ACTION(sb) < BCH_NR_ERROR_ACTIONS
+ BCH_SB_ERROR_ACTION(sb) < BCH_ON_ERROR_NR
? bch2_error_actions[BCH_SB_ERROR_ACTION(sb)]
: "unknown",
BCH_SB_CLEAN(sb),
features_str,
+ compat_features_str,
BCH_SB_META_REPLICAS_WANT(sb),
BCH_SB_DATA_REPLICAS_WANT(sb),
foreground_str,
background_str,
promote_str,
+ metadata_str,
BCH_SB_STR_HASH_TYPE(sb) < BCH_STR_HASH_NR
? bch2_str_hash_types[BCH_SB_STR_HASH_TYPE(sb)]
* Given a path to a block device, open the filesystem it belongs to; also
* return the device's idx:
*/
-struct bchfs_handle bchu_fs_open_by_dev(const char *path, unsigned *idx)
+struct bchfs_handle bchu_fs_open_by_dev(const char *path, int *idx)
{
char buf[1024], *uuid_str;
return bcache_fs_open(uuid_str);
}
+int bchu_dev_path_to_idx(struct bchfs_handle fs, const char *dev_path)
+{
+ int idx;
+ struct bchfs_handle fs2 = bchu_fs_open_by_dev(dev_path, &idx);
+
+ if (memcmp(&fs.uuid, &fs2.uuid, sizeof(fs.uuid)))
+ idx = -1;
+ bcache_fs_close(fs2);
+ return idx;
+}
+
int bchu_data(struct bchfs_handle fs, struct bch_ioctl_data cmd)
{
int progress_fd = xioctl(fs.ioctl_fd, BCH_IOCTL_DATA, &cmd);
/* option parsing */
+#define SUPERBLOCK_SIZE_DEFAULT 2048 /* 1 MB */
+
struct bch_opt_strs {
union {
char *by_id[bch2_opts_nr];
struct format_opts {
char *label;
uuid_le uuid;
-
+ unsigned version;
+ unsigned superblock_size;
unsigned encoded_extent_max;
-
bool encrypted;
char *passphrase;
};
static inline struct format_opts format_opts_default()
{
return (struct format_opts) {
+ .version = bcachefs_metadata_version_current,
+ .superblock_size = SUPERBLOCK_SIZE_DEFAULT,
.encoded_extent_max = 128,
};
}
void bcache_fs_close(struct bchfs_handle);
struct bchfs_handle bcache_fs_open(const char *);
-struct bchfs_handle bchu_fs_open_by_dev(const char *, unsigned *);
+struct bchfs_handle bchu_fs_open_by_dev(const char *, int *);
+int bchu_dev_path_to_idx(struct bchfs_handle, const char *);
static inline void bchu_disk_add(struct bchfs_handle fs, char *dev)
{
xioctl(fs.ioctl_fd, BCH_IOCTL_DISK_RESIZE, &i);
}
+static inline void bchu_disk_resize_journal(struct bchfs_handle fs,
+ unsigned idx,
+ u64 nbuckets)
+{
+ struct bch_ioctl_disk_resize i = {
+ .flags = BCH_BY_INDEX,
+ .dev = idx,
+ .nbuckets = nbuckets,
+ };
+
+ xioctl(fs.ioctl_fd, BCH_IOCTL_DISK_RESIZE_JOURNAL, &i);
+}
+
int bchu_data(struct bchfs_handle, struct bch_ioctl_data);
struct dev_name {
{
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c_xattr xattr;
bch2_trans_begin(&trans);
iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc,
- &inode->ei_str_hash, inode->v.i_ino,
+ &hash, inode->v.i_ino,
&X_SEARCH(acl_to_xattr_type(type), "", 0),
0);
if (IS_ERR(iter)) {
}
xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));
-
acl = bch2_acl_from_disk(xattr_val(xattr.v),
le16_to_cpu(xattr.v->x_val_len));
if (!IS_ERR(acl))
set_cached_acl(&inode->v, type, acl);
+ bch2_trans_iter_put(&trans, iter);
out:
bch2_trans_exit(&trans);
return acl;
struct btree_trans trans;
struct btree_iter *inode_iter;
struct bch_inode_unpacked inode_u;
+ struct bch_hash_info hash_info;
struct posix_acl *acl;
umode_t mode;
int ret;
if (type == ACL_TYPE_ACCESS) {
ret = posix_acl_update_mode(&inode->v, &mode, &acl);
if (ret)
- goto err;
+ goto btree_err;
}
- ret = bch2_set_acl_trans(&trans, &inode_u,
- &inode->ei_str_hash,
- acl, type);
+ hash_info = bch2_hash_info_init(c, &inode_u);
+
+ ret = bch2_set_acl_trans(&trans, &inode_u, &hash_info, acl, type);
if (ret)
goto btree_err;
&inode->ei_journal_seq,
BTREE_INSERT_NOUNLOCK);
btree_err:
+ bch2_trans_iter_put(&trans, inode_iter);
+
if (ret == -EINTR)
goto retry;
if (unlikely(ret))
}
int bch2_acl_chmod(struct btree_trans *trans,
- struct bch_inode_info *inode,
+ struct bch_inode_unpacked *inode,
umode_t mode,
struct posix_acl **new_acl)
{
+ struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode);
struct btree_iter *iter;
struct bkey_s_c_xattr xattr;
struct bkey_i_xattr *new;
struct posix_acl *acl;
- int ret = 0;
+ int ret;
iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc,
- &inode->ei_str_hash, inode->v.i_ino,
+ &hash_info, inode->bi_inum,
&X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0),
BTREE_ITER_INTENT);
- if (IS_ERR(iter))
- return PTR_ERR(iter) != -ENOENT ? PTR_ERR(iter) : 0;
+ ret = PTR_ERR_OR_ZERO(iter);
+ if (ret)
+ return ret == -ENOENT ? 0 : ret;
xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));
-
acl = bch2_acl_from_disk(xattr_val(xattr.v),
le16_to_cpu(xattr.v->x_val_len));
- if (IS_ERR_OR_NULL(acl))
- return PTR_ERR(acl);
+ ret = PTR_ERR_OR_ZERO(acl);
+ if (ret || !acl)
+ goto err;
ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode);
if (ret)
*new_acl = acl;
acl = NULL;
err:
+ bch2_trans_iter_put(trans, iter);
kfree(acl);
return ret;
}
const struct bch_hash_info *,
struct posix_acl *, int);
int bch2_set_acl(struct inode *, struct posix_acl *, int);
-int bch2_acl_chmod(struct btree_trans *, struct bch_inode_info *,
+int bch2_acl_chmod(struct btree_trans *, struct bch_inode_unpacked *,
umode_t, struct posix_acl **);
#else
}
static inline int bch2_acl_chmod(struct btree_trans *trans,
- struct bch_inode_info *inode,
+ struct bch_inode_unpacked *inode,
umode_t mode,
struct posix_acl **new_acl)
{
#include "ec.h"
#include "error.h"
#include "recovery.h"
+#include "varint.h"
#include <linux/kthread.h>
#include <linux/math64.h>
#include <linux/sort.h>
#include <trace/events/bcachefs.h>
-static const char * const bch2_alloc_field_names[] = {
-#define x(name, bytes) #name,
- BCH_ALLOC_FIELDS()
+static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
+#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
+ BCH_ALLOC_FIELDS_V1()
#undef x
- NULL
};
-static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int);
-
/* Ratelimiting/PD controllers */
static void pd_controllers_update(struct work_struct *work)
* reclaimed by copy GC
*/
fragmented += max_t(s64, 0, (bucket_to_sector(ca,
- stats.buckets[BCH_DATA_user] +
- stats.buckets[BCH_DATA_cached]) -
- (stats.sectors[BCH_DATA_user] +
- stats.sectors[BCH_DATA_cached])) << 9);
+ stats.d[BCH_DATA_user].buckets +
+ stats.d[BCH_DATA_cached].buckets) -
+ (stats.d[BCH_DATA_user].sectors +
+ stats.d[BCH_DATA_cached].sectors)) << 9);
}
bch2_pd_controller_update(&c->copygc_pd, free, fragmented, -1);
/* Persistent alloc info: */
-static inline u64 get_alloc_field(const struct bch_alloc *a,
- const void **p, unsigned field)
+static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
+ const void **p, unsigned field)
{
- unsigned bytes = BCH_ALLOC_FIELD_BYTES[field];
+ unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
u64 v;
if (!(a->fields & (1 << field)))
return v;
}
-static inline void put_alloc_field(struct bkey_i_alloc *a, void **p,
- unsigned field, u64 v)
+static inline void alloc_field_v1_put(struct bkey_i_alloc *a, void **p,
+ unsigned field, u64 v)
{
- unsigned bytes = BCH_ALLOC_FIELD_BYTES[field];
+ unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
if (!v)
return;
*p += bytes;
}
-struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
+static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out,
+ struct bkey_s_c k)
{
- struct bkey_alloc_unpacked ret = { .gen = 0 };
+ const struct bch_alloc *in = bkey_s_c_to_alloc(k).v;
+ const void *d = in->data;
+ unsigned idx = 0;
- if (k.k->type == KEY_TYPE_alloc) {
- const struct bch_alloc *a = bkey_s_c_to_alloc(k).v;
- const void *d = a->data;
- unsigned idx = 0;
+ out->gen = in->gen;
- ret.gen = a->gen;
+#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++);
+ BCH_ALLOC_FIELDS_V1()
+#undef x
+}
-#define x(_name, _bits) ret._name = get_alloc_field(a, &d, idx++);
- BCH_ALLOC_FIELDS()
+static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k);
+ const u8 *in = a.v->data;
+ const u8 *end = bkey_val_end(a);
+ unsigned fieldnr = 0;
+ int ret;
+ u64 v;
+
+ out->gen = a.v->gen;
+ out->oldest_gen = a.v->oldest_gen;
+ out->data_type = a.v->data_type;
+
+#define x(_name, _bits) \
+ if (fieldnr < a.v->nr_fields) { \
+ ret = bch2_varint_decode(in, end, &v); \
+ if (ret < 0) \
+ return ret; \
+ in += ret; \
+ } else { \
+ v = 0; \
+ } \
+ out->_name = v; \
+ if (v != out->_name) \
+ return -1; \
+ fieldnr++;
+
+ BCH_ALLOC_FIELDS_V2()
#undef x
- }
- return ret;
+ return 0;
}
-void bch2_alloc_pack(struct bkey_i_alloc *dst,
- const struct bkey_alloc_unpacked src)
+static void bch2_alloc_pack_v2(struct bkey_alloc_buf *dst,
+ const struct bkey_alloc_unpacked src)
{
- unsigned idx = 0;
- void *d = dst->v.data;
+ struct bkey_i_alloc_v2 *a = bkey_alloc_v2_init(&dst->k);
+ unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
+ u8 *out = a->v.data;
+ u8 *end = (void *) &dst[1];
+ u8 *last_nonzero_field = out;
unsigned bytes;
- dst->v.fields = 0;
- dst->v.gen = src.gen;
+ a->k.p = POS(src.dev, src.bucket);
+ a->v.gen = src.gen;
+ a->v.oldest_gen = src.oldest_gen;
+ a->v.data_type = src.data_type;
+
+#define x(_name, _bits) \
+ nr_fields++; \
+ \
+ if (src._name) { \
+ out += bch2_varint_encode(out, src._name); \
+ \
+ last_nonzero_field = out; \
+ last_nonzero_fieldnr = nr_fields; \
+ } else { \
+ *out++ = 0; \
+ }
-#define x(_name, _bits) put_alloc_field(dst, &d, idx++, src._name);
- BCH_ALLOC_FIELDS()
+ BCH_ALLOC_FIELDS_V2()
#undef x
+ BUG_ON(out > end);
+
+ out = last_nonzero_field;
+ a->v.nr_fields = last_nonzero_fieldnr;
+
+ bytes = (u8 *) out - (u8 *) &a->v;
+ set_bkey_val_bytes(&a->k, bytes);
+ memset_u64s_tail(&a->v, 0, bytes);
+}
+
+struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
+{
+ struct bkey_alloc_unpacked ret = {
+ .dev = k.k->p.inode,
+ .bucket = k.k->p.offset,
+ .gen = 0,
+ };
+
+ if (k.k->type == KEY_TYPE_alloc_v2)
+ bch2_alloc_unpack_v2(&ret, k);
+ else if (k.k->type == KEY_TYPE_alloc)
+ bch2_alloc_unpack_v1(&ret, k);
+
+ return ret;
+}
- bytes = (void *) d - (void *) &dst->v;
- set_bkey_val_bytes(&dst->k, bytes);
- memset_u64s_tail(&dst->v, 0, bytes);
+void bch2_alloc_pack(struct bch_fs *c,
+ struct bkey_alloc_buf *dst,
+ const struct bkey_alloc_unpacked src)
+{
+ bch2_alloc_pack_v2(dst, src);
}
static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
{
unsigned i, bytes = offsetof(struct bch_alloc, data);
- for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_FIELD_BYTES); i++)
+ for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
if (a->fields & (1 << i))
- bytes += BCH_ALLOC_FIELD_BYTES[i];
+ bytes += BCH_ALLOC_V1_FIELD_BYTES[i];
return DIV_ROUND_UP(bytes, sizeof(u64));
}
-const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
+const char *bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
return NULL;
}
-void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
+const char *bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
- struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
- const void *d = a.v->data;
- unsigned i;
+ struct bkey_alloc_unpacked u;
- pr_buf(out, "gen %u", a.v->gen);
+ if (k.k->p.inode >= c->sb.nr_devices ||
+ !c->devs[k.k->p.inode])
+ return "invalid device";
+
+ if (bch2_alloc_unpack_v2(&u, k))
+ return "unpack error";
- for (i = 0; i < BCH_ALLOC_FIELD_NR; i++)
- if (a.v->fields & (1 << i))
- pr_buf(out, " %s %llu",
- bch2_alloc_field_names[i],
- get_alloc_field(a.v, &d, i));
+ return NULL;
+}
+
+void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
+
+ pr_buf(out, "gen %u oldest_gen %u data_type %u",
+ u.gen, u.oldest_gen, u.data_type);
+#define x(_name, ...) pr_buf(out, #_name " %llu ", (u64) u._name);
+ BCH_ALLOC_FIELDS_V2()
+#undef x
}
static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id,
struct bucket *g;
struct bkey_alloc_unpacked u;
- if (level || k.k->type != KEY_TYPE_alloc)
+ if (level ||
+ (k.k->type != KEY_TYPE_alloc &&
+ k.k->type != KEY_TYPE_alloc_v2))
return 0;
ca = bch_dev_bkey_exists(c, k.k->p.inode);
- g = __bucket(ca, k.k->p.offset, 0);
+ g = bucket(ca, k.k->p.offset);
u = bch2_alloc_unpack(k);
g->_mark.gen = u.gen;
int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
{
- struct bch_dev *ca;
- unsigned i;
- int ret = 0;
+ int ret;
down_read(&c->gc_lock);
- ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC,
+ ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_alloc,
NULL, bch2_alloc_read_fn);
up_read(&c->gc_lock);
return ret;
}
- percpu_down_write(&c->mark_lock);
- bch2_dev_usage_from_buckets(c);
- percpu_up_write(&c->mark_lock);
-
- mutex_lock(&c->bucket_clock[READ].lock);
- for_each_member_device(ca, c, i) {
- down_read(&ca->bucket_lock);
- bch2_recalc_oldest_io(c, ca, READ);
- up_read(&ca->bucket_lock);
- }
- mutex_unlock(&c->bucket_clock[READ].lock);
-
- mutex_lock(&c->bucket_clock[WRITE].lock);
- for_each_member_device(ca, c, i) {
- down_read(&ca->bucket_lock);
- bch2_recalc_oldest_io(c, ca, WRITE);
- up_read(&ca->bucket_lock);
- }
- mutex_unlock(&c->bucket_clock[WRITE].lock);
-
return 0;
}
struct bch_fs *c = trans->c;
struct bkey_s_c k;
struct bch_dev *ca;
- struct bucket_array *ba;
struct bucket *g;
struct bucket_mark m;
struct bkey_alloc_unpacked old_u, new_u;
- __BKEY_PADDED(k, 8) alloc_key; /* hack: */
- struct bkey_i_alloc *a;
+ struct bkey_alloc_buf a;
int ret;
retry:
bch2_trans_begin(trans);
ret = bch2_btree_key_cache_flush(trans,
- BTREE_ID_ALLOC, iter->pos);
+ BTREE_ID_alloc, iter->pos);
if (ret)
goto err;
percpu_down_read(&c->mark_lock);
ca = bch_dev_bkey_exists(c, iter->pos.inode);
- ba = bucket_array(ca);
-
- g = &ba->b[iter->pos.offset];
+ g = bucket(ca, iter->pos.offset);
m = READ_ONCE(g->mark);
- new_u = alloc_mem_to_key(g, m);
+ new_u = alloc_mem_to_key(iter, g, m);
percpu_up_read(&c->mark_lock);
if (!bkey_alloc_unpacked_cmp(old_u, new_u))
return 0;
- a = bkey_alloc_init(&alloc_key.k);
- a->k.p = iter->pos;
- bch2_alloc_pack(a, new_u);
-
- bch2_trans_update(trans, iter, &a->k_i,
+ bch2_alloc_pack(c, &a, new_u);
+ bch2_trans_update(trans, iter, &a.k,
BTREE_TRIGGER_NORUN);
ret = bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE|
- flags);
+ BTREE_INSERT_NOFAIL|flags);
err:
if (ret == -EINTR)
goto retry;
return ret;
}
-int bch2_dev_alloc_write(struct bch_fs *c, struct bch_dev *ca, unsigned flags)
+int bch2_alloc_write(struct bch_fs *c, unsigned flags)
{
struct btree_trans trans;
struct btree_iter *iter;
- u64 first_bucket, nbuckets;
+ struct bch_dev *ca;
+ unsigned i;
int ret = 0;
- percpu_down_read(&c->mark_lock);
- first_bucket = bucket_array(ca)->first_bucket;
- nbuckets = bucket_array(ca)->nbuckets;
- percpu_up_read(&c->mark_lock);
-
- BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
-
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-
- iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC,
- POS(ca->dev_idx, first_bucket),
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_alloc, POS_MIN,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- while (iter->pos.offset < nbuckets) {
- bch2_trans_cond_resched(&trans);
-
- ret = bch2_alloc_write_key(&trans, iter, flags);
- if (ret)
- break;
- bch2_btree_iter_next_slot(iter);
- }
-
- bch2_trans_exit(&trans);
-
- return ret;
-}
+ for_each_member_device(ca, c, i) {
+ bch2_btree_iter_set_pos(iter,
+ POS(ca->dev_idx, ca->mi.first_bucket));
-int bch2_alloc_write(struct bch_fs *c, unsigned flags)
-{
- struct bch_dev *ca;
- unsigned i;
- int ret = 0;
+ while (iter->pos.offset < ca->mi.nbuckets) {
+ bch2_trans_cond_resched(&trans);
- for_each_rw_member(ca, c, i) {
- bch2_dev_alloc_write(c, ca, flags);
- if (ret) {
- percpu_ref_put(&ca->io_ref);
- break;
+ ret = bch2_alloc_write_key(&trans, iter, flags);
+ if (ret) {
+ percpu_ref_put(&ca->io_ref);
+ goto err;
+ }
+ bch2_btree_iter_next_slot(iter);
}
}
-
+err:
+ bch2_trans_iter_put(&trans, iter);
+ bch2_trans_exit(&trans);
return ret;
}
/* Bucket IO clocks: */
-static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw)
-{
- struct bucket_clock *clock = &c->bucket_clock[rw];
- struct bucket_array *buckets = bucket_array(ca);
- struct bucket *g;
- u16 max_last_io = 0;
- unsigned i;
-
- lockdep_assert_held(&c->bucket_clock[rw].lock);
-
- /* Recalculate max_last_io for this device: */
- for_each_bucket(g, buckets)
- max_last_io = max(max_last_io, bucket_last_io(c, g, rw));
-
- ca->max_last_bucket_io[rw] = max_last_io;
-
- /* Recalculate global max_last_io: */
- max_last_io = 0;
-
- for_each_member_device(ca, c, i)
- max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]);
-
- clock->max_last_io = max_last_io;
-}
-
-static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw)
-{
- struct bucket_clock *clock = &c->bucket_clock[rw];
- struct bucket_array *buckets;
- struct bch_dev *ca;
- struct bucket *g;
- unsigned i;
-
- trace_rescale_prios(c);
-
- for_each_member_device(ca, c, i) {
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
-
- for_each_bucket(g, buckets)
- g->io_time[rw] = clock->hand -
- bucket_last_io(c, g, rw) / 2;
-
- bch2_recalc_oldest_io(c, ca, rw);
-
- up_read(&ca->bucket_lock);
- }
-}
-
-static inline u64 bucket_clock_freq(u64 capacity)
-{
- return max(capacity >> 10, 2028ULL);
-}
-
-static void bch2_inc_clock_hand(struct io_timer *timer)
-{
- struct bucket_clock *clock = container_of(timer,
- struct bucket_clock, rescale);
- struct bch_fs *c = container_of(clock,
- struct bch_fs, bucket_clock[clock->rw]);
- struct bch_dev *ca;
- u64 capacity;
- unsigned i;
-
- mutex_lock(&clock->lock);
-
- /* if clock cannot be advanced more, rescale prio */
- if (clock->max_last_io >= U16_MAX - 2)
- bch2_rescale_bucket_io_times(c, clock->rw);
-
- BUG_ON(clock->max_last_io >= U16_MAX - 2);
-
- for_each_member_device(ca, c, i)
- ca->max_last_bucket_io[clock->rw]++;
- clock->max_last_io++;
- clock->hand++;
-
- mutex_unlock(&clock->lock);
-
- capacity = READ_ONCE(c->capacity);
-
- if (!capacity)
- return;
-
- /*
- * we only increment when 0.1% of the filesystem capacity has been read
- * or written too, this determines if it's time
- *
- * XXX: we shouldn't really be going off of the capacity of devices in
- * RW mode (that will be 0 when we're RO, yet we can still service
- * reads)
- */
- timer->expire += bucket_clock_freq(capacity);
-
- bch2_io_timer_add(&c->io_clock[clock->rw], timer);
-}
-
-static void bch2_bucket_clock_init(struct bch_fs *c, int rw)
-{
- struct bucket_clock *clock = &c->bucket_clock[rw];
-
- clock->hand = 1;
- clock->rw = rw;
- clock->rescale.fn = bch2_inc_clock_hand;
- clock->rescale.expire = bucket_clock_freq(c->capacity);
- mutex_init(&clock->lock);
-}
-
int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
size_t bucket_nr, int rw)
{
struct bch_dev *ca = bch_dev_bkey_exists(c, dev);
struct btree_iter *iter;
struct bucket *g;
- struct bkey_i_alloc *a;
+ struct bkey_alloc_buf *a;
struct bkey_alloc_unpacked u;
- u16 *time;
+ u64 *time, now;
int ret = 0;
- iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, POS(dev, bucket_nr),
+ iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, POS(dev, bucket_nr),
BTREE_ITER_CACHED|
BTREE_ITER_CACHED_NOFILL|
BTREE_ITER_INTENT);
- if (IS_ERR(iter))
- return PTR_ERR(iter);
+ ret = bch2_btree_iter_traverse(iter);
+ if (ret)
+ goto out;
- a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
+ a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
ret = PTR_ERR_OR_ZERO(a);
if (ret)
goto out;
percpu_down_read(&c->mark_lock);
g = bucket(ca, bucket_nr);
- u = alloc_mem_to_key(g, READ_ONCE(g->mark));
+ u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark));
percpu_up_read(&c->mark_lock);
- bkey_alloc_init(&a->k_i);
- a->k.p = iter->pos;
-
time = rw == READ ? &u.read_time : &u.write_time;
- if (*time == c->bucket_clock[rw].hand)
+ now = atomic64_read(&c->io_clock[rw].now);
+ if (*time == now)
goto out;
- *time = c->bucket_clock[rw].hand;
-
- bch2_alloc_pack(a, u);
+ *time = now;
- ret = bch2_trans_update(trans, iter, &a->k_i, 0) ?:
+ bch2_alloc_pack(c, a, u);
+ ret = bch2_trans_update(trans, iter, &a->k, 0) ?:
bch2_trans_commit(trans, NULL, NULL, 0);
out:
bch2_trans_iter_put(trans, iter);
static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
{
unsigned long gc_count = c->gc_count;
- u64 available;
+ s64 available;
+ unsigned i;
int ret = 0;
ca->allocator_state = ALLOCATOR_BLOCKED;
if (gc_count != c->gc_count)
ca->inc_gen_really_needs_gc = 0;
- available = max_t(s64, 0, dev_buckets_available(ca) -
- ca->inc_gen_really_needs_gc);
+ available = dev_buckets_available(ca);
+ available -= ca->inc_gen_really_needs_gc;
+
+ spin_lock(&c->freelist_lock);
+ for (i = 0; i < RESERVE_NR; i++)
+ available -= fifo_used(&ca->free[i]);
+ spin_unlock(&c->freelist_lock);
+
+ available = max(available, 0LL);
if (available > fifo_free(&ca->free_inc) ||
(available &&
- (!fifo_full(&ca->free[RESERVE_BTREE]) ||
- !fifo_full(&ca->free[RESERVE_MOVINGGC]))))
+ !fifo_full(&ca->free[RESERVE_MOVINGGC])))
break;
up_read(&c->gc_lock);
return ret;
}
-static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
- size_t bucket,
- struct bucket_mark mark)
+static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
+ struct bucket_mark m)
{
u8 gc_gen;
- if (!is_available_bucket(mark))
+ if (!is_available_bucket(m))
+ return false;
+
+ if (m.owned_by_allocator)
return false;
if (ca->buckets_nouse &&
- test_bit(bucket, ca->buckets_nouse))
+ test_bit(b, ca->buckets_nouse))
return false;
- gc_gen = bucket_gc_gen(ca, bucket);
+ gc_gen = bucket_gc_gen(bucket(ca, b));
if (gc_gen >= BUCKET_GC_GEN_MAX / 2)
ca->inc_gen_needs_gc++;
/*
* Determines what order we're going to reuse buckets, smallest bucket_key()
* first.
- *
- *
- * - We take into account the read prio of the bucket, which gives us an
- * indication of how hot the data is -- we scale the prio so that the prio
- * farthest from the clock is worth 1/8th of the closest.
- *
- * - The number of sectors of cached data in the bucket, which gives us an
- * indication of the cost in cache misses this eviction will cause.
- *
- * - If hotness * sectors used compares equal, we pick the bucket with the
- * smallest bucket_gc_gen() - since incrementing the same bucket's generation
- * number repeatedly forces us to run mark and sweep gc to avoid generation
- * number wraparound.
*/
-static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca,
- size_t b, struct bucket_mark m)
+static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
+ u64 now, u64 last_seq_ondisk)
{
- unsigned last_io = bucket_last_io(c, bucket(ca, b), READ);
- unsigned max_last_io = ca->max_last_bucket_io[READ];
-
- /*
- * Time since last read, scaled to [0, 8) where larger value indicates
- * more recently read data:
- */
- unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io;
+ unsigned used = bucket_sectors_used(m);
- /* How much we want to keep the data in this bucket: */
- unsigned long data_wantness =
- (hotness + 1) * bucket_sectors_used(m);
-
- unsigned long needs_journal_commit =
- bucket_needs_journal_commit(m, c->journal.last_seq_ondisk);
+ if (used) {
+ /*
+ * Prefer to keep buckets that have been read more recently, and
+ * buckets that have more data in them:
+ */
+ u64 last_read = max_t(s64, 0, now - g->io_time[READ]);
+ u32 last_read_scaled = min_t(u64, U32_MAX, div_u64(last_read, used));
- return (data_wantness << 9) |
- (needs_journal_commit << 8) |
- (bucket_gc_gen(ca, b) / 16);
+ return -last_read_scaled;
+ } else {
+ /*
+ * Prefer to use buckets with smaller gc_gen so that we don't
+ * have to walk the btree and recalculate oldest_gen - but shift
+ * off the low bits so that buckets will still have equal sort
+ * keys when there's only a small difference, so that we can
+ * keep sequential buckets together:
+ */
+ return (bucket_needs_journal_commit(m, last_seq_ondisk) << 4)|
+ (bucket_gc_gen(g) >> 4);
+ }
}
static inline int bucket_alloc_cmp(alloc_heap *h,
{
struct bucket_array *buckets;
struct alloc_heap_entry e = { 0 };
+ u64 now, last_seq_ondisk;
size_t b, i, nr = 0;
- ca->alloc_heap.used = 0;
-
- mutex_lock(&c->bucket_clock[READ].lock);
down_read(&ca->bucket_lock);
buckets = bucket_array(ca);
-
- bch2_recalc_oldest_io(c, ca, READ);
+ ca->alloc_heap.used = 0;
+ now = atomic64_read(&c->io_clock[READ].now);
+ last_seq_ondisk = c->journal.last_seq_ondisk;
/*
* Find buckets with lowest read priority, by building a maxheap sorted
* all buckets have been visited.
*/
for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
- struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
- unsigned long key = bucket_sort_key(c, ca, b, m);
+ struct bucket *g = &buckets->b[b];
+ struct bucket_mark m = READ_ONCE(g->mark);
+ unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk);
if (!bch2_can_invalidate_bucket(ca, b, m))
continue;
}
up_read(&ca->bucket_lock);
- mutex_unlock(&c->bucket_clock[READ].lock);
}
static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
ca->inc_gen_needs_gc = 0;
switch (ca->mi.replacement) {
- case CACHE_REPLACEMENT_LRU:
+ case BCH_CACHE_REPLACEMENT_lru:
find_reclaimable_buckets_lru(c, ca);
break;
- case CACHE_REPLACEMENT_FIFO:
+ case BCH_CACHE_REPLACEMENT_fifo:
find_reclaimable_buckets_fifo(c, ca);
break;
- case CACHE_REPLACEMENT_RANDOM:
+ case BCH_CACHE_REPLACEMENT_random:
find_reclaimable_buckets_random(c, ca);
break;
}
struct btree_iter *iter,
u64 *journal_seq, unsigned flags)
{
-#if 0
- __BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key;
-#else
- /* hack: */
- __BKEY_PADDED(k, 8) alloc_key;
-#endif
struct bch_fs *c = trans->c;
- struct bkey_i_alloc *a;
+ struct bkey_alloc_buf a;
struct bkey_alloc_unpacked u;
struct bucket *g;
struct bucket_mark m;
/* first, put on free_inc and mark as owned by allocator: */
percpu_down_read(&c->mark_lock);
- spin_lock(&c->freelist_lock);
-
- verify_not_on_freelist(c, ca, b);
-
- BUG_ON(!fifo_push(&ca->free_inc, b));
-
g = bucket(ca, b);
m = READ_ONCE(g->mark);
- invalidating_cached_data = m.cached_sectors != 0;
+ BUG_ON(m.dirty_sectors);
+
+ bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0);
+
+ spin_lock(&c->freelist_lock);
+ verify_not_on_freelist(c, ca, b);
+ BUG_ON(!fifo_push(&ca->free_inc, b));
+ spin_unlock(&c->freelist_lock);
/*
* If we're not invalidating cached data, we only increment the bucket
* gen in memory here, the incremented gen will be updated in the btree
* by bch2_trans_mark_pointer():
*/
+ if (!m.cached_sectors &&
+ !bucket_needs_journal_commit(m, c->journal.last_seq_ondisk)) {
+ BUG_ON(m.data_type);
+ bucket_cmpxchg(g, m, m.gen++);
+ percpu_up_read(&c->mark_lock);
+ goto out;
+ }
- if (!invalidating_cached_data)
- bch2_invalidate_bucket(c, ca, b, &m);
- else
- bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0);
-
- spin_unlock(&c->freelist_lock);
percpu_up_read(&c->mark_lock);
- if (!invalidating_cached_data)
- goto out;
-
/*
* If the read-only path is trying to shut down, we can't be generating
* new btree updates:
goto out;
}
- BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
-
bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b));
retry:
ret = bch2_btree_iter_traverse(iter);
percpu_down_read(&c->mark_lock);
g = bucket(ca, iter->pos.offset);
m = READ_ONCE(g->mark);
- u = alloc_mem_to_key(g, m);
+ u = alloc_mem_to_key(iter, g, m);
percpu_up_read(&c->mark_lock);
u.data_type = 0;
u.dirty_sectors = 0;
u.cached_sectors = 0;
- u.read_time = c->bucket_clock[READ].hand;
- u.write_time = c->bucket_clock[WRITE].hand;
-
- a = bkey_alloc_init(&alloc_key.k);
- a->k.p = iter->pos;
- bch2_alloc_pack(a, u);
+ u.read_time = atomic64_read(&c->io_clock[READ].now);
+ u.write_time = atomic64_read(&c->io_clock[WRITE].now);
- bch2_trans_update(trans, iter, &a->k_i,
+ bch2_alloc_pack(c, &a, u);
+ bch2_trans_update(trans, iter, &a.k,
BTREE_TRIGGER_BUCKET_INVALIDATE);
/*
BTREE_INSERT_NOUNLOCK|
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE|
- BTREE_INSERT_USE_ALLOC_RESERVE|
+ BTREE_INSERT_JOURNAL_RESERVED|
flags);
if (ret == -EINTR)
goto retry;
int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
-
- iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC,
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_alloc,
POS(ca->dev_idx, 0),
BTREE_ITER_CACHED|
BTREE_ITER_CACHED_NOFILL|
(!fifo_empty(&ca->free_inc)
? BTREE_INSERT_NOWAIT : 0));
+ bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
/* If we used NOWAIT, don't return the error: */
return 0;
}
+static inline bool allocator_thread_running(struct bch_dev *ca)
+{
+ return ca->mi.state == BCH_MEMBER_STATE_rw &&
+ test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags);
+}
+
/**
* bch_allocator_thread - move buckets from free_inc to reserves
*
int ret;
set_freezable();
- ca->allocator_state = ALLOCATOR_RUNNING;
while (1) {
+ if (!allocator_thread_running(ca)) {
+ ca->allocator_state = ALLOCATOR_STOPPED;
+ if (kthread_wait_freezable(allocator_thread_running(ca)))
+ break;
+ }
+
+ ca->allocator_state = ALLOCATOR_RUNNING;
+
cond_resched();
if (kthread_should_stop())
break;
return 0;
p = kthread_create(bch2_allocator_thread, ca,
- "bch_alloc[%s]", ca->name);
- if (IS_ERR(p))
+ "bch-alloc/%s", ca->name);
+ if (IS_ERR(p)) {
+ bch_err(ca->fs, "error creating allocator thread: %li",
+ PTR_ERR(p));
return PTR_ERR(p);
+ }
get_task_struct(p);
rcu_assign_pointer(ca->alloc_thread, p);
void bch2_fs_allocator_background_init(struct bch_fs *c)
{
spin_lock_init(&c->freelist_lock);
- bch2_bucket_clock_init(c, READ);
- bch2_bucket_clock_init(c, WRITE);
c->pd_controllers_update_seconds = 5;
INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
#include "debug.h"
struct bkey_alloc_unpacked {
+ u64 bucket;
+ u8 dev;
u8 gen;
+ u8 oldest_gen;
+ u8 data_type;
#define x(_name, _bits) u##_bits _name;
- BCH_ALLOC_FIELDS()
+ BCH_ALLOC_FIELDS_V2()
#undef x
};
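+/*
+ * Scratch buffer big enough for a bkey plus either on-disk alloc value at
+ * its maximum encoded size: the _pad unions below sum the worst-case bytes
+ * for each field in BCH_ALLOC_FIELDS_V1()/_V2().
+ */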
+struct bkey_alloc_buf {
+ struct bkey_i k;
+
+ union {
+ struct {
+#define x(_name, _bits) + _bits / 8
+ u8 _pad[8 + BCH_ALLOC_FIELDS_V1()];
+#undef x
+ } _v1;
+ struct {
+#define x(_name, _bits) + 8 + _bits / 8
+ u8 _pad[8 + BCH_ALLOC_FIELDS_V2()];
+#undef x
+ } _v2;
+ };
+} __attribute__((packed, aligned(8)));
+
/* How out of date a pointer gen is allowed to be: */
#define BUCKET_GC_GEN_MAX 96U
static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l,
struct bkey_alloc_unpacked r)
{
- return l.gen != r.gen
-#define x(_name, _bits) || l._name != r._name
- BCH_ALLOC_FIELDS()
+ return l.gen != r.gen ||
+ l.oldest_gen != r.oldest_gen ||
+ l.data_type != r.data_type
+#define x(_name, ...) || l._name != r._name
+ BCH_ALLOC_FIELDS_V2()
#undef x
;
}
struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c);
-void bch2_alloc_pack(struct bkey_i_alloc *,
+void bch2_alloc_pack(struct bch_fs *, struct bkey_alloc_buf *,
const struct bkey_alloc_unpacked);
int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
static inline struct bkey_alloc_unpacked
-alloc_mem_to_key(struct bucket *g, struct bucket_mark m)
+alloc_mem_to_key(struct btree_iter *iter,
+ struct bucket *g, struct bucket_mark m)
{
return (struct bkey_alloc_unpacked) {
+ .dev = iter->pos.inode,
+ .bucket = iter->pos.offset,
.gen = m.gen,
.oldest_gen = g->oldest_gen,
.data_type = m.data_type,
#define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9)
-const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c);
+const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c);
+const char *bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_alloc (struct bkey_ops) { \
- .key_invalid = bch2_alloc_invalid, \
+ .key_invalid = bch2_alloc_v1_invalid, \
+ .val_to_text = bch2_alloc_to_text, \
+}
+
+#define bch2_bkey_ops_alloc_v2 (struct bkey_ops) { \
+ .key_invalid = bch2_alloc_v2_invalid, \
.val_to_text = bch2_alloc_to_text, \
}
static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca,
size_t bucket)
{
- if (expensive_debug_checks(c)) {
+ if (bch2_expensive_debug_checks) {
size_t iter;
long i;
unsigned j;
void bch2_dev_allocator_stop(struct bch_dev *);
int bch2_dev_allocator_start(struct bch_dev *);
-int bch2_dev_alloc_write(struct bch_fs *, struct bch_dev *, unsigned);
int bch2_alloc_write(struct bch_fs *, unsigned);
void bch2_fs_allocator_background_init(struct bch_fs *);
rcu_read_lock();
buckets = bucket_array(ca);
- for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++)
- if (is_available_bucket(buckets->b[b].mark))
+ for (b = buckets->first_bucket; b < buckets->nbuckets; b++)
+ if (is_available_bucket(buckets->b[b].mark) &&
+ !buckets->b[b].mark.owned_by_allocator)
goto success;
b = -1;
success:
static inline unsigned open_buckets_reserved(enum alloc_reserve reserve)
{
switch (reserve) {
- case RESERVE_ALLOC:
- return 0;
case RESERVE_BTREE:
+ case RESERVE_BTREE_MOVINGGC:
+ return 0;
+ case RESERVE_MOVINGGC:
return OPEN_BUCKETS_COUNT / 4;
default:
return OPEN_BUCKETS_COUNT / 2;
bool may_alloc_partial,
struct closure *cl)
{
- struct bucket_array *buckets;
struct open_bucket *ob;
- long bucket = 0;
+ long b = 0;
spin_lock(&c->freelist_lock);
return ERR_PTR(-OPEN_BUCKETS_EMPTY);
}
- if (likely(fifo_pop(&ca->free[RESERVE_NONE], bucket)))
+ if (likely(fifo_pop(&ca->free[RESERVE_NONE], b)))
goto out;
switch (reserve) {
- case RESERVE_ALLOC:
- if (fifo_pop(&ca->free[RESERVE_BTREE], bucket))
- goto out;
- break;
- case RESERVE_BTREE:
- if (fifo_used(&ca->free[RESERVE_BTREE]) * 2 >=
- ca->free[RESERVE_BTREE].size &&
- fifo_pop(&ca->free[RESERVE_BTREE], bucket))
- goto out;
- break;
+ case RESERVE_BTREE_MOVINGGC:
case RESERVE_MOVINGGC:
- if (fifo_pop(&ca->free[RESERVE_MOVINGGC], bucket))
+ if (fifo_pop(&ca->free[RESERVE_MOVINGGC], b))
goto out;
break;
default:
trace_bucket_alloc_fail(ca, reserve);
return ERR_PTR(-FREELIST_EMPTY);
out:
- verify_not_on_freelist(c, ca, bucket);
+ verify_not_on_freelist(c, ca, b);
ob = bch2_open_bucket_alloc(c);
spin_lock(&ob->lock);
- buckets = bucket_array(ca);
ob->valid = true;
ob->sectors_free = ca->mi.bucket_size;
ob->alloc_reserve = reserve;
ob->ptr = (struct bch_extent_ptr) {
.type = 1 << BCH_EXTENT_ENTRY_ptr,
- .gen = buckets->b[bucket].mark.gen,
- .offset = bucket_to_sector(ca, bucket),
+ .gen = bucket(ca, b)->mark.gen,
+ .offset = bucket_to_sector(ca, b),
.dev = ca->dev_idx,
};
* it's to a device we don't want:
*/
-static void bucket_alloc_from_stripe(struct bch_fs *c,
- struct open_buckets *ptrs,
- struct write_point *wp,
- struct bch_devs_mask *devs_may_alloc,
- u16 target,
- unsigned erasure_code,
- unsigned nr_replicas,
- unsigned *nr_effective,
- bool *have_cache,
- unsigned flags)
+static enum bucket_alloc_ret
+bucket_alloc_from_stripe(struct bch_fs *c,
+ struct open_buckets *ptrs,
+ struct write_point *wp,
+ struct bch_devs_mask *devs_may_alloc,
+ u16 target,
+ unsigned erasure_code,
+ unsigned nr_replicas,
+ unsigned *nr_effective,
+ bool *have_cache,
+ unsigned flags,
+ struct closure *cl)
{
struct dev_alloc_list devs_sorted;
struct ec_stripe_head *h;
unsigned i, ec_idx;
if (!erasure_code)
- return;
+ return 0;
if (nr_replicas < 2)
- return;
+ return 0;
if (ec_open_bucket(c, ptrs))
- return;
+ return 0;
- h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1);
+ h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1,
+ wp == &c->copygc_write_point,
+ cl);
+ if (IS_ERR(h))
+ return -PTR_ERR(h);
if (!h)
- return;
+ return 0;
devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
for (i = 0; i < devs_sorted.nr; i++)
- open_bucket_for_each(c, &h->s->blocks, ob, ec_idx)
+ for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
+ if (!h->s->blocks[ec_idx])
+ continue;
+
+ ob = c->open_buckets + h->s->blocks[ec_idx];
if (ob->ptr.dev == devs_sorted.devs[i] &&
- !test_and_set_bit(h->s->data_block_idx[ec_idx],
- h->s->blocks_allocated))
+ !test_and_set_bit(ec_idx, h->s->blocks_allocated))
goto got_bucket;
+ }
goto out_put_head;
got_bucket:
ca = bch_dev_bkey_exists(c, ob->ptr.dev);
- ob->ec_idx = h->s->data_block_idx[ec_idx];
+ ob->ec_idx = ec_idx;
ob->ec = h->s;
add_new_bucket(c, ptrs, devs_may_alloc,
atomic_inc(&h->s->pin);
out_put_head:
bch2_ec_stripe_head_put(c, h);
+ return 0;
}
/* Sector allocator */
}
if (!ec_open_bucket(c, ptrs)) {
- bucket_alloc_from_stripe(c, ptrs, wp, &devs,
+ ret = bucket_alloc_from_stripe(c, ptrs, wp, &devs,
target, erasure_code,
nr_replicas, nr_effective,
- have_cache, flags);
+ have_cache, flags, _cl);
+ if (ret == FREELIST_EMPTY ||
+ ret == OPEN_BUCKETS_EMPTY)
+ return ret;
if (*nr_effective >= nr_replicas)
return 0;
}
if (!drop && ob->ec) {
mutex_lock(&ob->ec->lock);
- open_bucket_for_each(c, &ob->ec->blocks, ob2, j)
- drop |= ob2->ptr.dev == ca->dev_idx;
- open_bucket_for_each(c, &ob->ec->parity, ob2, j)
+ for (j = 0; j < ob->ec->new_stripe.key.v.nr_blocks; j++) {
+ if (!ob->ec->blocks[j])
+ continue;
+
+ ob2 = c->open_buckets + ob->ec->blocks[j];
drop |= ob2->ptr.dev == ca->dev_idx;
+ }
mutex_unlock(&ob->ec->lock);
}
struct ec_bucket_buf;
-/* There's two of these clocks, one for reads and one for writes: */
-struct bucket_clock {
- /*
- * "now" in (read/write) IO time - incremented whenever we do X amount
- * of reads or writes.
- *
- * Goes with the bucket read/write prios: when we read or write to a
- * bucket we reset the bucket's prio to the current hand; thus hand -
- * prio = time since bucket was last read/written.
- *
- * The units are some amount (bytes/sectors) of data read/written, and
- * the units can change on the fly if we need to rescale to fit
- * everything in a u16 - your only guarantee is that the units are
- * consistent.
- */
- u16 hand;
- u16 max_last_io;
-
- int rw;
-
- struct io_timer rescale;
- struct mutex lock;
-};
-
-/* There is one reserve for each type of btree, one for prios and gens
- * and one for moving GC */
enum alloc_reserve {
- RESERVE_ALLOC = -1,
- RESERVE_BTREE = 0,
- RESERVE_MOVINGGC = 1,
- RESERVE_NONE = 2,
- RESERVE_NR = 3,
+ RESERVE_BTREE_MOVINGGC = -2,
+ RESERVE_BTREE = -1,
+ RESERVE_MOVINGGC = 0,
+ RESERVE_NONE = 1,
+ RESERVE_NR = 2,
};
typedef FIFO(long) alloc_fifo;
u64 last_used;
unsigned long write_point;
enum bch_data_type type;
- bool is_ec;
/* calculated based on how many pointers we're actually going to use: */
unsigned sectors_free;
#include <linux/semaphore.h>
#include <linux/seqlock.h>
#include <linux/shrinker.h>
+#include <linux/srcu.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <linux/zstd.h>
dynamic_fault("bcachefs:meta:write:" name)
#ifdef __KERNEL__
-#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name)
+#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name)
+#define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum)
#else
-#define bch2_fmt(_c, fmt) fmt "\n"
+#define bch2_fmt(_c, fmt) fmt "\n"
+#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum)
#endif
#define bch_info(c, fmt, ...) \
printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_err(c, fmt, ...) \
printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+
#define bch_err_ratelimited(c, fmt, ...) \
printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_err_inum_ratelimited(c, _inum, fmt, ...) \
+ printk_ratelimited(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
#define bch_verbose(c, fmt, ...) \
do { \
BCH_DEBUG_PARAM(debug_check_bkeys, \
"Run bkey_debugcheck (primarily checking GC/allocation "\
"information) when iterating over keys") \
+ BCH_DEBUG_PARAM(debug_check_btree_accounting, \
+ "Verify btree accounting for keys within a node") \
BCH_DEBUG_PARAM(verify_btree_ondisk, \
"Reread btree nodes at various points to verify the " \
"mergesort in the read path against modifications " \
#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS()
#endif
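+/*
+ * The selected debug params are declared as real booleans below; when
+ * CONFIG_BCACHEFS_DEBUG is not set, the debug-only ones are additionally
+ * defined as compile-time false constants so the checks they guard can be
+ * optimized out.
+ */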
+#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name;
+BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+
+#ifndef CONFIG_BCACHEFS_DEBUG
+#define BCH_DEBUG_PARAM(name, description) static const bool bch2_##name;
+BCH_DEBUG_PARAMS_DEBUG()
+#undef BCH_DEBUG_PARAM
+#endif
+
#define BCH_TIME_STATS() \
x(btree_node_mem_alloc) \
x(btree_node_split) \
GC_PHASE_START,
GC_PHASE_SB,
- GC_PHASE_BTREE_EC,
- GC_PHASE_BTREE_EXTENTS,
- GC_PHASE_BTREE_INODES,
- GC_PHASE_BTREE_DIRENTS,
- GC_PHASE_BTREE_XATTRS,
- GC_PHASE_BTREE_ALLOC,
- GC_PHASE_BTREE_QUOTAS,
- GC_PHASE_BTREE_REFLINK,
+ GC_PHASE_BTREE_stripes,
+ GC_PHASE_BTREE_extents,
+ GC_PHASE_BTREE_inodes,
+ GC_PHASE_BTREE_dirents,
+ GC_PHASE_BTREE_xattrs,
+ GC_PHASE_BTREE_alloc,
+ GC_PHASE_BTREE_quotas,
+ GC_PHASE_BTREE_reflink,
GC_PHASE_PENDING_DELETE,
GC_PHASE_ALLOC,
unsigned long *buckets_nouse;
struct rw_semaphore bucket_lock;
- struct bch_dev_usage __percpu *usage[2];
+ struct bch_dev_usage *usage_base;
+ struct bch_dev_usage __percpu *usage[JOURNAL_BUF_NR];
+ struct bch_dev_usage __percpu *usage_gc;
/* Allocator: */
struct task_struct __rcu *alloc_thread;
size_t fifo_last_bucket;
- /* last calculated minimum prio */
- u16 max_last_bucket_io[2];
-
size_t inc_gen_needs_gc;
size_t inc_gen_really_needs_gc;
atomic64_t rebalance_work;
struct journal_device journal;
+ u64 prev_journal_sector;
struct work_struct io_error_work;
BCH_FS_ERRORS_FIXED,
/* misc: */
- BCH_FS_FIXED_GENS,
- BCH_FS_ALLOC_WRITTEN,
+ BCH_FS_NEED_ANOTHER_GC,
+ BCH_FS_DELETED_NODES,
+ BCH_FS_NEED_ALLOC_WRITE,
BCH_FS_REBUILD_REPLICAS,
BCH_FS_HOLD_BTREE_WRITES,
};
struct journal_key {
enum btree_id btree_id:8;
unsigned level:8;
+ bool allocated;
struct bkey_i *k;
u32 journal_seq;
u32 journal_offset;
} *d;
size_t nr;
+ size_t size;
u64 journal_seq_base;
};
+struct btree_iter_buf {
+ struct btree_iter *iter;
+};
+
struct bch_fs {
struct closure cl;
struct bch_replicas_cpu replicas_gc;
struct mutex replicas_gc_lock;
+ struct journal_entry_res btree_root_journal_res;
struct journal_entry_res replicas_journal_res;
+ struct journal_entry_res clock_journal_res;
+ struct journal_entry_res dev_usage_journal_res;
struct bch_disk_groups_cpu __rcu *disk_groups;
uuid_le user_uuid;
u16 version;
+ u16 version_min;
u16 encoded_extent_max;
u8 nr_devices;
struct mutex btree_trans_lock;
struct list_head btree_trans_list;
mempool_t btree_iters_pool;
+ struct btree_iter_buf __percpu *btree_iters_bufs;
+
+ struct srcu_struct btree_trans_barrier;
struct btree_key_cache btree_key_cache;
struct workqueue_struct *wq;
/* copygc needs its own workqueue for index updates.. */
struct workqueue_struct *copygc_wq;
- struct workqueue_struct *journal_reclaim_wq;
/* ALLOCATION */
struct delayed_work pd_controllers_update;
unsigned bucket_size_max;
atomic64_t sectors_available;
+ struct mutex sectors_available_lock;
struct bch_fs_pcpu __percpu *pcpu;
seqcount_t usage_lock;
struct bch_fs_usage *usage_base;
- struct bch_fs_usage __percpu *usage[2];
+ struct bch_fs_usage __percpu *usage[JOURNAL_BUF_NR];
struct bch_fs_usage __percpu *usage_gc;
+ u64 __percpu *online_reserved;
/* single element mempool: */
struct mutex usage_scratch_lock;
- struct bch_fs_usage *usage_scratch;
-
- /*
- * When we invalidate buckets, we use both the priority and the amount
- * of good data to determine which buckets to reuse first - to weight
- * those together consistently we keep track of the smallest nonzero
- * priority of any bucket.
- */
- struct bucket_clock bucket_clock[2];
+ struct bch_fs_usage_online *usage_scratch;
struct io_clock io_clock[2];
* Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos]
* has been marked by GC.
*
- * gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.)
+ * gc_cur_phase is a superset of btree_ids (BTREE_ID_extents etc.)
*
* Protected by gc_pos_lock. Only written to by GC thread, so GC thread
* can read without a lock.
struct bio_set dio_write_bioset;
struct bio_set dio_read_bioset;
+
+ atomic64_t btree_writes_nr;
+ atomic64_t btree_writes_sectors;
struct bio_list btree_write_error_list;
struct work_struct btree_write_error_work;
spinlock_t btree_write_error_lock;
struct mutex verify_lock;
#endif
- u64 unused_inode_hint;
+ u64 *unused_inode_hints;
+ unsigned inode_shard_bits;
/*
* A btree node on disk could have too many bsets for an iterator to fit
struct journal journal;
struct list_head journal_entries;
struct journal_keys journal_keys;
+ struct list_head journal_iters;
u64 last_bucket_seq_cleanup;
unsigned copy_gc_enabled:1;
bool promote_whole_extents;
-#define BCH_DEBUG_PARAM(name, description) bool name;
- BCH_DEBUG_PARAMS_ALL()
-#undef BCH_DEBUG_PARAM
-
struct time_stats times[BCH_TIME_STAT_NR];
};
#define KEY_SNAPSHOT_MAX ((__u32)~0U)
#define KEY_SIZE_MAX ((__u32)~0U)
-static inline struct bpos POS(__u64 inode, __u64 offset)
+static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot)
{
- struct bpos ret;
-
- ret.inode = inode;
- ret.offset = offset;
- ret.snapshot = 0;
-
- return ret;
+ return (struct bpos) {
+ .inode = inode,
+ .offset = offset,
+ .snapshot = snapshot,
+ };
}
-#define POS_MIN POS(0, 0)
-#define POS_MAX POS(KEY_INODE_MAX, KEY_OFFSET_MAX)
+#define POS_MIN SPOS(0, 0, 0)
+#define POS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX)
+#define POS(_inode, _offset) SPOS(_inode, _offset, 0)
/* Empty placeholder struct, for container_of() */
struct bch_val {
x(discard, 1) \
x(error, 2) \
x(cookie, 3) \
- x(whiteout, 4) \
+ x(hash_whiteout, 4) \
x(btree_ptr, 5) \
x(extent, 6) \
x(reservation, 7) \
x(reflink_v, 16) \
x(inline_data, 17) \
x(btree_ptr_v2, 18) \
- x(indirect_inline_data, 19)
+ x(indirect_inline_data, 19) \
+ x(alloc_v2, 20)
enum bch_bkey_type {
#define x(name, nr) KEY_TYPE_##name = nr,
KEY_TYPE_MAX,
};
+struct bch_deleted {
+ struct bch_val v;
+};
+
+struct bch_discard {
+ struct bch_val v;
+};
+
+struct bch_error {
+ struct bch_val v;
+};
+
struct bch_cookie {
struct bch_val v;
__le64 cookie;
};
+struct bch_hash_whiteout {
+ struct bch_val v;
+};
+
/* Extents */
/*
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:5,
block:8,
- idx:51;
+ redundancy:4,
+ idx:47;
#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 idx:51,
+ __u64 idx:47,
+ redundancy:4,
block:8,
type:5;
#endif
__u64 mem_ptr;
__le64 seq;
__le16 sectors_written;
- /* In case we ever decide to do variable size btree nodes: */
- __le16 sectors;
+ __le16 flags;
struct bpos min_key;
struct bch_extent_ptr start[0];
__u64 _data[0];
} __attribute__((packed, aligned(8)));
+LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1);
+
struct bch_extent {
struct bch_val v;
#define BKEY_EXTENT_VAL_U64s_MAX \
(1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
-#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX)
-
/* Maximum possible size of an entire extent, key + value: */
#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
} __attribute__((packed, aligned(8)));
#define BCH_INODE_FIELDS() \
- x(bi_atime, 64) \
- x(bi_ctime, 64) \
- x(bi_mtime, 64) \
- x(bi_otime, 64) \
+ x(bi_atime, 96) \
+ x(bi_ctime, 96) \
+ x(bi_mtime, 96) \
+ x(bi_otime, 96) \
x(bi_size, 64) \
x(bi_sectors, 64) \
x(bi_uid, 32) \
x(bi_foreground_target, 16) \
x(bi_background_target, 16) \
x(bi_erasure_code, 16) \
- x(bi_fields_set, 16)
+ x(bi_fields_set, 16) \
+ x(bi_dir, 64) \
+ x(bi_dir_offset, 64)
/* subset of BCH_INODE_FIELDS */
#define BCH_INODE_OPTS() \
__BCH_INODE_I_SIZE_DIRTY= 5,
__BCH_INODE_I_SECTORS_DIRTY= 6,
__BCH_INODE_UNLINKED = 7,
+ __BCH_INODE_BACKPTR_UNTRUSTED = 8,
/* bits 20+ reserved for packed fields below: */
};
#define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY)
#define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY)
#define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED)
+#define BCH_INODE_BACKPTR_UNTRUSTED (1 << __BCH_INODE_BACKPTR_UNTRUSTED)
LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24);
-LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 32);
+LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
+LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32);
/* Dirents */
__u8 data[];
} __attribute__((packed, aligned(8)));
-#define BCH_ALLOC_FIELDS() \
+#define BCH_ALLOC_FIELDS_V1() \
x(read_time, 16) \
x(write_time, 16) \
x(data_type, 8) \
x(dirty_sectors, 16) \
x(cached_sectors, 16) \
- x(oldest_gen, 8)
+ x(oldest_gen, 8) \
+ x(stripe, 32) \
+ x(stripe_redundancy, 8)
+
+struct bch_alloc_v2 {
+ struct bch_val v;
+ __u8 nr_fields;
+ __u8 gen;
+ __u8 oldest_gen;
+ __u8 data_type;
+ __u8 data[];
+} __attribute__((packed, aligned(8)));
+
+#define BCH_ALLOC_FIELDS_V2() \
+ x(read_time, 64) \
+ x(write_time, 64) \
+ x(dirty_sectors, 16) \
+ x(cached_sectors, 16) \
+ x(stripe, 32) \
+ x(stripe_redundancy, 8)
enum {
-#define x(name, bytes) BCH_ALLOC_FIELD_##name,
- BCH_ALLOC_FIELDS()
+#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
+ BCH_ALLOC_FIELDS_V1()
#undef x
BCH_ALLOC_FIELD_NR
};
-static const unsigned BCH_ALLOC_FIELD_BYTES[] = {
-#define x(name, bits) [BCH_ALLOC_FIELD_##name] = bits / 8,
- BCH_ALLOC_FIELDS()
-#undef x
-};
-
-#define x(name, bits) + (bits / 8)
-static const unsigned BKEY_ALLOC_VAL_U64s_MAX =
- DIV_ROUND_UP(offsetof(struct bch_alloc, data)
- BCH_ALLOC_FIELDS(), sizeof(u64));
-#undef x
-
-#define BKEY_ALLOC_U64s_MAX (BKEY_U64s + BKEY_ALLOC_VAL_U64s_MAX)
-
/* Quotas: */
enum quota_types {
LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40);
#endif
+#define BCH_MEMBER_STATES() \
+ x(rw, 0) \
+ x(ro, 1) \
+ x(failed, 2) \
+ x(spare, 3)
+
enum bch_member_state {
- BCH_MEMBER_STATE_RW = 0,
- BCH_MEMBER_STATE_RO = 1,
- BCH_MEMBER_STATE_FAILED = 2,
- BCH_MEMBER_STATE_SPARE = 3,
- BCH_MEMBER_STATE_NR = 4,
+#define x(t, n) BCH_MEMBER_STATE_##t = n,
+ BCH_MEMBER_STATES()
+#undef x
+ BCH_MEMBER_STATE_NR
};
-enum cache_replacement {
- CACHE_REPLACEMENT_LRU = 0,
- CACHE_REPLACEMENT_FIFO = 1,
- CACHE_REPLACEMENT_RANDOM = 2,
- CACHE_REPLACEMENT_NR = 3,
+#define BCH_CACHE_REPLACEMENT_POLICIES() \
+ x(lru, 0) \
+ x(fifo, 1) \
+ x(random, 2)
+
+enum bch_cache_replacement_policies {
+#define x(t, n) BCH_CACHE_REPLACEMENT_##t = n,
+ BCH_CACHE_REPLACEMENT_POLICIES()
+#undef x
+ BCH_CACHE_REPLACEMENT_NR
};
struct bch_sb_field_members {
struct bch_sb_field field;
__le32 flags;
- __le16 read_clock;
- __le16 write_clock;
+ __le16 _read_clock; /* no longer used */
+ __le16 _write_clock;
__le64 journal_seq;
union {
bcachefs_metadata_version_new_versioning = 10,
bcachefs_metadata_version_bkey_renumber = 10,
bcachefs_metadata_version_inode_btree_change = 11,
- bcachefs_metadata_version_max = 12,
+ bcachefs_metadata_version_snapshot = 12,
+ bcachefs_metadata_version_inode_backpointers = 13,
+ bcachefs_metadata_version_max = 14,
};
#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1)
LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61);
-LE64_BITMASK(BCH_SB_REFLINK, struct bch_sb, flags[0], 61, 62);
+/* bit 61 was reflink option */
+LE64_BITMASK(BCH_SB_BIG_ENDIAN, struct bch_sb, flags[0], 62, 63);
/* 61-64 unused */
LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64);
LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
+LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28);
/*
* Features:
x(btree_ptr_v2, 11) \
x(extents_above_btree_updates, 12) \
x(btree_updates_journalled, 13) \
- x(reflink_inline_data, 14)
+ x(reflink_inline_data, 14) \
+ x(new_varint, 15) \
+ x(journal_no_flush, 16) \
+ x(alloc_v2, 17) \
+ x(extents_across_btree_nodes, 18)
+
+#define BCH_SB_FEATURES_ALWAYS \
+ ((1ULL << BCH_FEATURE_new_extent_overwrite)| \
+ (1ULL << BCH_FEATURE_extents_above_btree_updates)|\
+ (1ULL << BCH_FEATURE_btree_updates_journalled)|\
+ (1ULL << BCH_FEATURE_alloc_v2)|\
+ (1ULL << BCH_FEATURE_extents_across_btree_nodes))
#define BCH_SB_FEATURES_ALL \
- ((1ULL << BCH_FEATURE_new_siphash)| \
- (1ULL << BCH_FEATURE_new_extent_overwrite)| \
+ (BCH_SB_FEATURES_ALWAYS| \
+ (1ULL << BCH_FEATURE_new_siphash)| \
(1ULL << BCH_FEATURE_btree_ptr_v2)| \
- (1ULL << BCH_FEATURE_extents_above_btree_updates))
+ (1ULL << BCH_FEATURE_new_varint)| \
+ (1ULL << BCH_FEATURE_journal_no_flush))
enum bch_sb_feature {
#define x(f, n) BCH_FEATURE_##f,
BCH_FEATURE_NR,
};
+#define BCH_SB_COMPAT() \
+ x(alloc_info, 0) \
+ x(alloc_metadata, 1) \
+ x(extents_above_btree_updates_done, 2) \
+ x(bformat_overflow_done, 3)
+
enum bch_sb_compat {
- BCH_COMPAT_FEAT_ALLOC_INFO = 0,
- BCH_COMPAT_FEAT_ALLOC_METADATA = 1,
+#define x(f, n) BCH_COMPAT_##f,
+ BCH_SB_COMPAT()
+#undef x
+ BCH_COMPAT_NR,
};
/* options: */
#define BCH_REPLICAS_MAX 4U
+#define BCH_BKEY_PTRS_MAX 16U
+
+#define BCH_ERROR_ACTIONS() \
+ x(continue, 0) \
+ x(ro, 1) \
+ x(panic, 2)
+
enum bch_error_actions {
- BCH_ON_ERROR_CONTINUE = 0,
- BCH_ON_ERROR_RO = 1,
- BCH_ON_ERROR_PANIC = 2,
- BCH_NR_ERROR_ACTIONS = 3,
+#define x(t, n) BCH_ON_ERROR_##t = n,
+ BCH_ERROR_ACTIONS()
+#undef x
+ BCH_ON_ERROR_NR
};
enum bch_str_hash_type {
BCH_STR_HASH_NR = 4,
};
+#define BCH_STR_HASH_OPTS() \
+ x(crc32c, 0) \
+ x(crc64, 1) \
+ x(siphash, 2)
+
enum bch_str_hash_opts {
- BCH_STR_HASH_OPT_CRC32C = 0,
- BCH_STR_HASH_OPT_CRC64 = 1,
- BCH_STR_HASH_OPT_SIPHASH = 2,
- BCH_STR_HASH_OPT_NR = 3,
+#define x(t, n) BCH_STR_HASH_OPT_##t = n,
+ BCH_STR_HASH_OPTS()
+#undef x
+ BCH_STR_HASH_OPT_NR
};
enum bch_csum_type {
}
}
+#define BCH_CSUM_OPTS() \
+ x(none, 0) \
+ x(crc32c, 1) \
+ x(crc64, 2)
+
enum bch_csum_opts {
- BCH_CSUM_OPT_NONE = 0,
- BCH_CSUM_OPT_CRC32C = 1,
- BCH_CSUM_OPT_CRC64 = 2,
- BCH_CSUM_OPT_NR = 3,
+#define x(t, n) BCH_CSUM_OPT_##t = n,
+ BCH_CSUM_OPTS()
+#undef x
+ BCH_CSUM_OPT_NR
};
#define BCH_COMPRESSION_TYPES() \
x(incompressible, 5)
enum bch_compression_type {
-#define x(t, n) BCH_COMPRESSION_TYPE_##t,
+#define x(t, n) BCH_COMPRESSION_TYPE_##t = n,
BCH_COMPRESSION_TYPES()
#undef x
BCH_COMPRESSION_TYPE_NR
x(zstd, 3)
enum bch_compression_opts {
-#define x(t, n) BCH_COMPRESSION_OPT_##t,
+#define x(t, n) BCH_COMPRESSION_OPT_##t = n,
BCH_COMPRESSION_OPTS()
#undef x
BCH_COMPRESSION_OPT_NR
x(blacklist, 3) \
x(blacklist_v2, 4) \
x(usage, 5) \
- x(data_usage, 6)
+ x(data_usage, 6) \
+ x(clock, 7) \
+ x(dev_usage, 8)
enum {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
struct bch_replicas_entry r;
} __attribute__((packed));
+struct jset_entry_clock {
+ struct jset_entry entry;
+ __u8 rw;
+ __u8 pad[7];
+ __le64 time;
+} __attribute__((packed));
+
+struct jset_entry_dev_usage_type {
+ __le64 buckets;
+ __le64 sectors;
+ __le64 fragmented;
+} __attribute__((packed));
+
+struct jset_entry_dev_usage {
+ struct jset_entry entry;
+ __le32 dev;
+ __u32 pad;
+
+ __le64 buckets_ec;
+ __le64 buckets_unavailable;
+
+ struct jset_entry_dev_usage_type d[];
+} __attribute__((packed));
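+/*
+ * jset_entry_clock carries a snapshot of one IO clock (rw selects the read
+ * or write clock); jset_entry_dev_usage carries per-device usage counts,
+ * with an array of {buckets, sectors, fragmented} totals.
+ */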
+
/*
* On disk format for a journal entry:
* seq is monotonically increasing; every journal entry has its own unique
__u8 encrypted_start[0];
- __le16 read_clock;
- __le16 write_clock;
+ __le16 _read_clock; /* no longer used */
+ __le16 _write_clock;
/* Sequence number of oldest dirty journal entry */
__le64 last_seq;
LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4);
LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5);
+LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6);
#define BCH_JOURNAL_BUCKETS_MIN 8
/* Btree: */
-#define BCH_BTREE_IDS() \
- x(EXTENTS, 0, "extents") \
- x(INODES, 1, "inodes") \
- x(DIRENTS, 2, "dirents") \
- x(XATTRS, 3, "xattrs") \
- x(ALLOC, 4, "alloc") \
- x(QUOTAS, 5, "quotas") \
- x(EC, 6, "stripes") \
- x(REFLINK, 7, "reflink")
+#define BCH_BTREE_IDS() \
+ x(extents, 0) \
+ x(inodes, 1) \
+ x(dirents, 2) \
+ x(xattrs, 3) \
+ x(alloc, 4) \
+ x(quotas, 5) \
+ x(stripes, 6) \
+ x(reflink, 7)
enum btree_id {
-#define x(kwd, val, name) BTREE_ID_##kwd = val,
+#define x(kwd, val) BTREE_ID_##kwd = val,
BCH_BTREE_IDS()
#undef x
BTREE_ID_NR
/* Closed interval: */
struct bpos min_key;
struct bpos max_key;
- struct bch_extent_ptr ptr;
+ struct bch_extent_ptr _ptr; /* not used anymore */
struct bkey_format format;
union {
#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2)
#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3)
+#define BCH_FORCE_IF_LOST \
+ (BCH_FORCE_IF_DATA_LOST| \
+ BCH_FORCE_IF_METADATA_LOST)
#define BCH_FORCE_IF_DEGRADED \
(BCH_FORCE_IF_DATA_DEGRADED| \
BCH_FORCE_IF_METADATA_DEGRADED)
#define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super)
#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx)
#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize)
+#define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc, 15, struct bch_ioctl_disk_resize_journal)
/* ioctl below act on a particular file, not the filesystem as a whole: */
};
enum bch_data_ops {
- BCH_DATA_OP_SCRUB = 0,
- BCH_DATA_OP_REREPLICATE = 1,
- BCH_DATA_OP_MIGRATE = 2,
- BCH_DATA_OP_NR = 3,
+ BCH_DATA_OP_SCRUB = 0,
+ BCH_DATA_OP_REREPLICATE = 1,
+ BCH_DATA_OP_MIGRATE = 2,
+ BCH_DATA_OP_REWRITE_OLD_NODES = 3,
+ BCH_DATA_OP_NR = 4,
};
/*
* job. The file descriptor is O_CLOEXEC.
*/
struct bch_ioctl_data {
- __u32 op;
+ __u16 op;
+ __u8 start_btree;
+ __u8 end_btree;
__u32 flags;
- struct bpos start;
- struct bpos end;
+ struct bpos start_pos;
+ struct bpos end_pos;
union {
struct {
__u64 nbuckets;
};
+/*
+ * BCH_IOCTL_DISK_RESIZE_JOURNAL: resize journal on a device
+ *
+ * @dev - member to resize
+ * @nbuckets - new number of buckets
+ */
+struct bch_ioctl_disk_resize_journal {
+ __u32 flags;
+ __u32 pad;
+ __u64 dev;
+ __u64 nbuckets;
+};
+
#endif /* _BCACHEFS_IOCTL_H */
if ((*p & mask) != mask) {
*p += 1ULL << offset;
- EBUG_ON(bkey_cmp_packed(b, out, &k) <= 0);
+ EBUG_ON(bch2_bkey_cmp_packed(b, out, &k) <= 0);
return true;
}
static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i,
unsigned bits, u64 offset)
{
- offset = bits == 64 ? 0 : min(offset, U64_MAX - ((1ULL << bits) - 1));
+ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
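+ /*
+  * Double shift: computes ~(~0ULL << unpacked_bits) without an undefined
+  * 64-bit shift when unpacked_bits == 64:
+  */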
+ u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
+
+ bits = min(bits, unpacked_bits);
+
+ offset = bits == unpacked_bits ? 0 : min(offset, unpacked_max - ((1ULL << bits) - 1));
f->bits_per_field[i] = bits;
f->field_offset[i] = cpu_to_le64(offset);
return "incorrect number of fields";
for (i = 0; i < f->nr_fields; i++) {
+ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+ u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
u64 field_offset = le64_to_cpu(f->field_offset[i]);
- if (f->bits_per_field[i] > 64)
+ if (f->bits_per_field[i] > unpacked_bits)
return "field too large";
- if (field_offset &&
- (f->bits_per_field[i] == 64 ||
- (field_offset + ((1ULL << f->bits_per_field[i]) - 1) <
- field_offset)))
+ if ((f->bits_per_field[i] == unpacked_bits) && field_offset)
+ return "offset + bits overflow";
+
+ if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) &
+ unpacked_mask) <
+ field_offset)
return "offset + bits overflow";
bits += f->bits_per_field[i];
high_word(f, r),
b->nr_key_bits);
- EBUG_ON(ret != bkey_cmp(bkey_unpack_pos(b, l),
+ EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l),
bkey_unpack_pos(b, r)));
return ret;
}
const struct bkey_packed *l,
const struct bpos *r)
{
- return bkey_cmp(bkey_unpack_pos_format_checked(b, l), *r);
+ return bpos_cmp(bkey_unpack_pos_format_checked(b, l), *r);
}
__pure __flatten
-int __bch2_bkey_cmp_packed(const struct bkey_packed *l,
- const struct bkey_packed *r,
- const struct btree *b)
+int bch2_bkey_cmp_packed(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bkey_packed *r)
{
struct bkey unpacked;
r = (void*) &unpacked;
}
- return bkey_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p);
+ return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p);
}
__pure __flatten
const struct bkey *l_unpacked;
return unlikely(l_unpacked = packed_to_bkey_c(l))
- ? bkey_cmp(l_unpacked->p, *r)
+ ? bpos_cmp(l_unpacked->p, *r)
: __bch2_bkey_cmp_left_packed_format_checked(b, l, r);
}
struct bkey_packed p;
struct bkey_format test_format = {
- .key_u64s = 2,
+ .key_u64s = 3,
.nr_fields = BKEY_NR_FIELDS,
.bits_per_field = {
13,
64,
+ 32,
},
};
#define bkey_next(_k) vstruct_next(_k)
-static inline struct bkey_packed *bkey_next_skip_noops(struct bkey_packed *k,
- struct bkey_packed *end)
-{
- k = bkey_next(k);
-
- while (k != end && !k->u64s)
- k = (void *) ((u64 *) k + 1);
- return k;
-}
-
#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s)
static inline size_t bkey_val_bytes(const struct bkey *k)
#define bkey_whiteout(_k) \
((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_discard)
-#define bkey_packed_typecheck(_k) \
-({ \
- BUILD_BUG_ON(!type_is(_k, struct bkey *) && \
- !type_is(_k, struct bkey_packed *)); \
- type_is(_k, struct bkey_packed *); \
-})
-
enum bkey_lr_packed {
BKEY_PACKED_BOTH,
BKEY_PACKED_RIGHT,
BKEY_PACKED_NONE,
};
-#define bkey_lr_packed_typecheck(_l, _r) \
- (!bkey_packed_typecheck(_l) + ((!bkey_packed_typecheck(_r)) << 1))
-
#define bkey_lr_packed(_l, _r) \
((_l)->format + ((_r)->format << 1))
const struct bpos *);
__pure
-int __bch2_bkey_cmp_packed(const struct bkey_packed *,
- const struct bkey_packed *,
- const struct btree *);
+int bch2_bkey_cmp_packed(const struct btree *,
+ const struct bkey_packed *,
+ const struct bkey_packed *);
__pure
int __bch2_bkey_cmp_left_packed(const struct btree *,
return bkey_cmp_left_packed(b, l, &r);
}
-/*
- * If @_l or @_r are struct bkey * (not bkey_packed *), uses type information to
- * skip dispatching on k->format:
- */
-#define bkey_cmp_packed(_b, _l, _r) \
-({ \
- int _cmp; \
- \
- switch (bkey_lr_packed_typecheck(_l, _r)) { \
- case BKEY_PACKED_NONE: \
- _cmp = bkey_cmp(((struct bkey *) (_l))->p, \
- ((struct bkey *) (_r))->p); \
- break; \
- case BKEY_PACKED_LEFT: \
- _cmp = bkey_cmp_left_packed((_b), \
- (struct bkey_packed *) (_l), \
- &((struct bkey *) (_r))->p); \
- break; \
- case BKEY_PACKED_RIGHT: \
- _cmp = -bkey_cmp_left_packed((_b), \
- (struct bkey_packed *) (_r), \
- &((struct bkey *) (_l))->p); \
- break; \
- case BKEY_PACKED_BOTH: \
- _cmp = __bch2_bkey_cmp_packed((void *) (_l), \
- (void *) (_r), (_b)); \
- break; \
- } \
- _cmp; \
-})
-
-#if 1
+static __always_inline int bpos_cmp(struct bpos l, struct bpos r)
+{
+ return cmp_int(l.inode, r.inode) ?:
+ cmp_int(l.offset, r.offset) ?:
+ cmp_int(l.snapshot, r.snapshot);
+}
+
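+/* Like bpos_cmp() above, but the snapshot field is ignored: */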
static __always_inline int bkey_cmp(struct bpos l, struct bpos r)
{
- if (l.inode != r.inode)
- return l.inode < r.inode ? -1 : 1;
- if (l.offset != r.offset)
- return l.offset < r.offset ? -1 : 1;
- if (l.snapshot != r.snapshot)
- return l.snapshot < r.snapshot ? -1 : 1;
- return 0;
+ return cmp_int(l.inode, r.inode) ?:
+ cmp_int(l.offset, r.offset);
}
-#else
-int bkey_cmp(struct bpos l, struct bpos r);
-#endif
static inline struct bpos bpos_min(struct bpos l, struct bpos r)
{
- return bkey_cmp(l, r) < 0 ? l : r;
+ return bpos_cmp(l, r) < 0 ? l : r;
+}
+
+static inline struct bpos bpos_max(struct bpos l, struct bpos r)
+{
+ return bpos_cmp(l, r) > 0 ? l : r;
+}
+
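+/*
+ * Subtract with borrow: a -= b + borrow, setting borrow for the next (more
+ * significant) word - used below to treat a bpos as one wide integer:
+ */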
+#define sbb(a, b, borrow) \
+do { \
+ typeof(a) d1, d2; \
+ \
+ d1 = a - borrow; \
+ borrow = d1 > a; \
+ \
+ d2 = d1 - b; \
+ borrow += d2 > d1; \
+ a = d2; \
+} while (0)
+
+/* returns a - b: */
+static inline struct bpos bpos_sub(struct bpos a, struct bpos b)
+{
+ int borrow = 0;
+
+ sbb(a.snapshot, b.snapshot, borrow);
+ sbb(a.offset, b.offset, borrow);
+ sbb(a.inode, b.inode, borrow);
+ return a;
+}
+
+static inline struct bpos bpos_diff(struct bpos l, struct bpos r)
+{
+ if (bpos_cmp(l, r) > 0)
+ swap(l, r);
+
+ return bpos_sub(r, l);
}
void bch2_bpos_swab(struct bpos *);
format->bits_per_field[BKEY_FIELD_SNAPSHOT];
}
-static inline struct bpos bkey_successor(struct bpos p)
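+/*
+ * The successor/predecessor helpers below treat a bpos as a single wide
+ * integer, carrying/borrowing from snapshot into offset into inode; the
+ * _nosnap variants clear the snapshot field and step only offset:inode.
+ */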
+static inline struct bpos bpos_successor(struct bpos p)
{
- struct bpos ret = p;
+ if (!++p.snapshot &&
+ !++p.offset &&
+ !++p.inode)
+ BUG();
- if (!++ret.offset)
- BUG_ON(!++ret.inode);
+ return p;
+}
- return ret;
+static inline struct bpos bpos_predecessor(struct bpos p)
+{
+ if (!p.snapshot-- &&
+ !p.offset-- &&
+ !p.inode--)
+ BUG();
+
+ return p;
}
-static inline struct bpos bkey_predecessor(struct bpos p)
+static inline struct bpos bpos_nosnap_successor(struct bpos p)
{
- struct bpos ret = p;
+ p.snapshot = 0;
- if (!ret.offset--)
- BUG_ON(!ret.inode--);
+ if (!++p.offset &&
+ !++p.inode)
+ BUG();
- return ret;
+ return p;
+}
+
+static inline struct bpos bpos_nosnap_predecessor(struct bpos p)
+{
+ p.snapshot = 0;
+
+ if (!p.offset-- &&
+ !p.inode--)
+ BUG();
+
+ return p;
}
static inline u64 bkey_start_offset(const struct bkey *k)
* bkey_i_extent to a bkey_i - since that's always safe, instead of conversion
* functions.
*/
-#define BKEY_VAL_ACCESSORS(name) \
+#define x(name, ...) \
struct bkey_i_##name { \
union { \
struct bkey k; \
return k; \
}
-BKEY_VAL_ACCESSORS(cookie);
-BKEY_VAL_ACCESSORS(btree_ptr);
-BKEY_VAL_ACCESSORS(extent);
-BKEY_VAL_ACCESSORS(reservation);
-BKEY_VAL_ACCESSORS(inode);
-BKEY_VAL_ACCESSORS(inode_generation);
-BKEY_VAL_ACCESSORS(dirent);
-BKEY_VAL_ACCESSORS(xattr);
-BKEY_VAL_ACCESSORS(alloc);
-BKEY_VAL_ACCESSORS(quota);
-BKEY_VAL_ACCESSORS(stripe);
-BKEY_VAL_ACCESSORS(reflink_p);
-BKEY_VAL_ACCESSORS(reflink_v);
-BKEY_VAL_ACCESSORS(inline_data);
-BKEY_VAL_ACCESSORS(btree_ptr_v2);
-BKEY_VAL_ACCESSORS(indirect_inline_data);
+BCH_BKEY_TYPES();
+#undef x
/* byte order helpers */
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_BUF_H
+#define _BCACHEFS_BKEY_BUF_H
+
+#include "bcachefs.h"
+
+struct bkey_buf {
+ struct bkey_i *k;
+ u64 onstack[12];
+};
+
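+/*
+ * A bkey_buf starts out pointing at its small on-stack buffer and switches
+ * to an allocation from c->large_bkey_pool the first time a key too big for
+ * the on-stack buffer is stored in it. A typical (hypothetical) caller:
+ *
+ *   struct bkey_buf tmp;
+ *
+ *   bch2_bkey_buf_init(&tmp);
+ *   bch2_bkey_buf_reassemble(&tmp, c, k);
+ *   ...use tmp.k...
+ *   bch2_bkey_buf_exit(&tmp, c);
+ */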
+static inline void bch2_bkey_buf_realloc(struct bkey_buf *s,
+ struct bch_fs *c, unsigned u64s)
+{
+ if (s->k == (void *) s->onstack &&
+ u64s > ARRAY_SIZE(s->onstack)) {
+ s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS);
+ memcpy(s->k, s->onstack, sizeof(s->onstack));
+ }
+}
+
+static inline void bch2_bkey_buf_reassemble(struct bkey_buf *s,
+ struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ bch2_bkey_buf_realloc(s, c, k.k->u64s);
+ bkey_reassemble(s->k, k);
+}
+
+static inline void bch2_bkey_buf_copy(struct bkey_buf *s,
+ struct bch_fs *c,
+ struct bkey_i *src)
+{
+ bch2_bkey_buf_realloc(s, c, src->k.u64s);
+ bkey_copy(s->k, src);
+}
+
+static inline void bch2_bkey_buf_unpack(struct bkey_buf *s,
+ struct bch_fs *c,
+ struct btree *b,
+ struct bkey_packed *src)
+{
+ bch2_bkey_buf_realloc(s, c, BKEY_U64s +
+ bkeyp_val_u64s(&b->format, src));
+ bch2_bkey_unpack(b, s->k, src);
+}
+
+static inline void bch2_bkey_buf_init(struct bkey_buf *s)
+{
+ s->k = (void *) s->onstack;
+}
+
+static inline void bch2_bkey_buf_exit(struct bkey_buf *s, struct bch_fs *c)
+{
+ if (s->k != (void *) s->onstack)
+ mempool_free(s->k, &c->large_bkey_pool);
+ s->k = NULL;
+}
+
+#endif /* _BCACHEFS_BKEY_BUF_H */
.key_invalid = key_type_cookie_invalid, \
}
-#define bch2_bkey_ops_whiteout (struct bkey_ops) { \
+#define bch2_bkey_ops_hash_whiteout (struct bkey_ops) { \
.key_invalid = empty_val_key_invalid, \
}
if (k.k->u64s < BKEY_U64s)
return "u64s too small";
- if (type == BKEY_TYPE_BTREE &&
+ if (type == BKEY_TYPE_btree &&
bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
return "value too big";
return "nonzero size field";
}
- if (k.k->p.snapshot)
+ if (type != BKEY_TYPE_btree &&
+ !btree_type_has_snapshots(type) &&
+ k.k->p.snapshot)
return "nonzero snapshot";
- if (type != BKEY_TYPE_BTREE &&
+ if (type != BKEY_TYPE_btree &&
+ btree_type_has_snapshots(type) &&
+ k.k->p.snapshot != U32_MAX)
+ return "invalid snapshot field";
+
+ if (type != BKEY_TYPE_btree &&
!bkey_cmp(k.k->p, POS_MAX))
return "POS_MAX key";
const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
{
- if (bkey_cmp(k.k->p, b->data->min_key) < 0)
+ if (bpos_cmp(k.k->p, b->data->min_key) < 0)
return "key before start of btree node";
- if (bkey_cmp(k.k->p, b->data->max_key) > 0)
+ if (bpos_cmp(k.k->p, b->data->max_key) > 0)
return "key past end of btree node";
return NULL;
void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
{
- const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type];
const char *invalid;
BUG_ON(!k.k->u64s);
bch2_bkey_val_to_text(&PBUF(buf), c, k);
bch2_fs_inconsistent(c, "invalid bkey %s: %s", buf, invalid);
- return;
}
-
- if (ops->key_debugcheck)
- ops->key_debugcheck(c, k);
}
void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)
{
- if (!bkey_cmp(pos, POS_MIN))
+ if (!bpos_cmp(pos, POS_MIN))
pr_buf(out, "POS_MIN");
- else if (!bkey_cmp(pos, POS_MAX))
+ else if (!bpos_cmp(pos, POS_MAX))
pr_buf(out, "POS_MAX");
- else
- pr_buf(out, "%llu:%llu", pos.inode, pos.offset);
+ else {
+ if (pos.inode == U64_MAX)
+ pr_buf(out, "U64_MAX");
+ else
+ pr_buf(out, "%llu", pos.inode);
+ pr_buf(out, ":");
+ if (pos.offset == U64_MAX)
+ pr_buf(out, "U64_MAX");
+ else
+ pr_buf(out, "%llu", pos.offset);
+ pr_buf(out, ":");
+ if (pos.snapshot == U32_MAX)
+ pr_buf(out, "U32_MAX");
+ else
+ pr_buf(out, "%u", pos.snapshot);
+ }
}
void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k)
{
if (k) {
- pr_buf(out, "u64s %u type %s ", k->u64s,
- bch2_bkey_types[k->type]);
+ pr_buf(out, "u64s %u type ", k->u64s);
+
+ if (k->type < KEY_TYPE_MAX)
+ pr_buf(out, "%s ", bch2_bkey_types[k->type]);
+ else
+ pr_buf(out, "%u ", k->type);
bch2_bpos_to_text(out, k->p);
- pr_buf(out, " snap %u len %u ver %llu",
- k->p.snapshot, k->size, k->version.lo);
+ pr_buf(out, " len %u ver %llu", k->size, k->version.lo);
} else {
pr_buf(out, "(null)");
}
void bch2_val_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
- const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type];
+ if (k.k->type < KEY_TYPE_MAX) {
+ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type];
- if (likely(ops->val_to_text))
- ops->val_to_text(out, c, k);
+ if (likely(ops->val_to_text))
+ ops->val_to_text(out, c, k);
+ } else {
+ pr_buf(out, "(invalid type %u)", k.k->type);
+ }
}
void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c,
const struct bkey_ops *ops = &bch2_bkey_ops[l.k->type];
enum merge_result ret;
- if (key_merging_disabled(c) ||
+ if (bch2_key_merging_disabled ||
!ops->key_merge ||
l.k->type != r.k->type ||
bversion_cmp(l.k->version, r.k->version) ||
- bkey_cmp(l.k->p, bkey_start_pos(r.k)))
+ bpos_cmp(l.k->p, bkey_start_pos(r.k)))
return BCH_MERGE_NOMERGE;
ret = ops->key_merge(c, l, r);
u8 old;
u8 new;
} bkey_renumber_table[] = {
- {BKEY_TYPE_BTREE, 128, KEY_TYPE_btree_ptr },
- {BKEY_TYPE_EXTENTS, 128, KEY_TYPE_extent },
- {BKEY_TYPE_EXTENTS, 129, KEY_TYPE_extent },
- {BKEY_TYPE_EXTENTS, 130, KEY_TYPE_reservation },
- {BKEY_TYPE_INODES, 128, KEY_TYPE_inode },
- {BKEY_TYPE_INODES, 130, KEY_TYPE_inode_generation },
- {BKEY_TYPE_DIRENTS, 128, KEY_TYPE_dirent },
- {BKEY_TYPE_DIRENTS, 129, KEY_TYPE_whiteout },
- {BKEY_TYPE_XATTRS, 128, KEY_TYPE_xattr },
- {BKEY_TYPE_XATTRS, 129, KEY_TYPE_whiteout },
- {BKEY_TYPE_ALLOC, 128, KEY_TYPE_alloc },
- {BKEY_TYPE_QUOTAS, 128, KEY_TYPE_quota },
+ {BKEY_TYPE_btree, 128, KEY_TYPE_btree_ptr },
+ {BKEY_TYPE_extents, 128, KEY_TYPE_extent },
+ {BKEY_TYPE_extents, 129, KEY_TYPE_extent },
+ {BKEY_TYPE_extents, 130, KEY_TYPE_reservation },
+ {BKEY_TYPE_inodes, 128, KEY_TYPE_inode },
+ {BKEY_TYPE_inodes, 130, KEY_TYPE_inode_generation },
+ {BKEY_TYPE_dirents, 128, KEY_TYPE_dirent },
+ {BKEY_TYPE_dirents, 129, KEY_TYPE_hash_whiteout },
+ {BKEY_TYPE_xattrs, 128, KEY_TYPE_xattr },
+ {BKEY_TYPE_xattrs, 129, KEY_TYPE_hash_whiteout },
+ {BKEY_TYPE_alloc, 128, KEY_TYPE_alloc },
+ {BKEY_TYPE_quotas, 128, KEY_TYPE_quota },
};
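The table maps the old per-btree on-disk type numbers (128 and up) to the flat KEY_TYPE_* namespace; note the whiteouts in the hashed btrees (dirents, xattrs) now map to the renamed KEY_TYPE_hash_whiteout. One example, from my reading of the table:

/*
 * e.g. a dirents-btree key written by an old version with on-disk type 129
 * is renumbered to KEY_TYPE_hash_whiteout when read in.
 */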
void bch2_bkey_renumber(enum btree_node_type btree_node_type,
const struct bkey_ops *ops;
struct bkey uk;
struct bkey_s u;
+ unsigned nr_compat = 5;
int i;
/*
* Do these operations in reverse order in the write path:
*/
- for (i = 0; i < 4; i++)
- switch (!write ? i : 3 - i) {
+ for (i = 0; i < nr_compat; i++)
+ switch (!write ? i : nr_compat - 1 - i) {
case 0:
if (big_endian != CPU_BIG_ENDIAN)
bch2_bkey_swab_key(f, k);
break;
case 2:
if (version < bcachefs_metadata_version_inode_btree_change &&
- btree_id == BTREE_ID_INODES) {
+ btree_id == BTREE_ID_inodes) {
if (!bkey_packed(k)) {
struct bkey_i *u = packed_to_bkey(k);
swap(u->k.p.inode, u->k.p.offset);
}
break;
case 3:
+ if (version < bcachefs_metadata_version_snapshot &&
+ (level || btree_type_has_snapshots(btree_id))) {
+ struct bkey_i *u = packed_to_bkey(k);
+
+ if (u) {
+ u->k.p.snapshot = write
+ ? 0 : U32_MAX;
+ } else {
+ u64 min_packed = f->field_offset[BKEY_FIELD_SNAPSHOT];
+ u64 max_packed = min_packed +
+ ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]);
+
+ uk = __bch2_bkey_unpack_key(f, k);
+ uk.p.snapshot = write
+ ? min_packed : min_t(u64, U32_MAX, max_packed);
+
+ BUG_ON(!bch2_bkey_pack_key(k, &uk, f));
+ }
+ }
+
+ break;
+ case 4:
if (!bkey_packed(k)) {
u = bkey_i_to_s(packed_to_bkey(k));
} else {
/* Returns reason for being invalid if invalid, else NULL: */
const char * (*key_invalid)(const struct bch_fs *,
struct bkey_s_c);
- void (*key_debugcheck)(struct bch_fs *, struct bkey_s_c);
void (*val_to_text)(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
void (*swab)(struct bkey_s);
+++ /dev/null
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BKEY_ON_STACK_H
-#define _BCACHEFS_BKEY_ON_STACK_H
-
-#include "bcachefs.h"
-
-struct bkey_on_stack {
- struct bkey_i *k;
- u64 onstack[12];
-};
-
-static inline void bkey_on_stack_realloc(struct bkey_on_stack *s,
- struct bch_fs *c, unsigned u64s)
-{
- if (s->k == (void *) s->onstack &&
- u64s > ARRAY_SIZE(s->onstack)) {
- s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS);
- memcpy(s->k, s->onstack, sizeof(s->onstack));
- }
-}
-
-static inline void bkey_on_stack_reassemble(struct bkey_on_stack *s,
- struct bch_fs *c,
- struct bkey_s_c k)
-{
- bkey_on_stack_realloc(s, c, k.k->u64s);
- bkey_reassemble(s->k, k);
-}
-
-static inline void bkey_on_stack_init(struct bkey_on_stack *s)
-{
- s->k = (void *) s->onstack;
-}
-
-static inline void bkey_on_stack_exit(struct bkey_on_stack *s,
- struct bch_fs *c)
-{
- if (s->k != (void *) s->onstack)
- mempool_free(s->k, &c->large_bkey_pool);
- s->k = NULL;
-}
-
-#endif /* _BCACHEFS_BKEY_ON_STACK_H */
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
#include "bkey_sort.h"
#include "bset.h"
#include "extents.h"
return !iter->used;
}
-static inline void __sort_iter_sift(struct sort_iter *iter,
- unsigned from,
- sort_cmp_fn cmp)
+static inline void sort_iter_sift(struct sort_iter *iter, unsigned from,
+ sort_cmp_fn cmp)
{
unsigned i;
swap(iter->data[i], iter->data[i + 1]);
}
-static inline void sort_iter_sift(struct sort_iter *iter, sort_cmp_fn cmp)
-{
-
- __sort_iter_sift(iter, 0, cmp);
-}
-
static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp)
{
unsigned i = iter->used;
while (i--)
- __sort_iter_sift(iter, i, cmp);
+ sort_iter_sift(iter, i, cmp);
}
static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter)
return !sort_iter_end(iter) ? iter->data->k : NULL;
}
-static inline void __sort_iter_advance(struct sort_iter *iter,
- unsigned idx, sort_cmp_fn cmp)
+static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
{
- struct sort_iter_set *i = iter->data + idx;
+ struct sort_iter_set *i = iter->data;
- BUG_ON(idx >= iter->used);
+ BUG_ON(!iter->used);
- i->k = bkey_next_skip_noops(i->k, i->end);
+ i->k = bkey_next(i->k);
BUG_ON(i->k > i->end);
if (i->k == i->end)
- array_remove_item(iter->data, iter->used, idx);
+ array_remove_item(iter->data, iter->used, 0);
else
- __sort_iter_sift(iter, idx, cmp);
-}
-
-static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
-{
- __sort_iter_advance(iter, 0, cmp);
+ sort_iter_sift(iter, 0, cmp);
}
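sort_iter keeps its constituent sets ordered by each set's current front key; sort_iter_sort() establishes that order and sort_iter_advance() restores it after the front key is consumed. A rough sketch of the drain loop the sort routines below build on (cmp stands for one of the *_cmp functions defined below; the wrapper function is illustrative only):

static void example_drain(struct sort_iter *iter, sort_cmp_fn cmp)
{
	struct bkey_packed *k;

	sort_iter_sort(iter, cmp);

	while ((k = sort_iter_next(iter, cmp))) {
		/*
		 * k is the smallest not-yet-consumed key across all sets;
		 * emit it, or merge it with duplicates via sort_iter_peek()
		 */
	}
}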
static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter,
struct bkey_packed *l,
struct bkey_packed *r)
{
- return bkey_cmp_packed(b, l, r) ?:
+ return bch2_bkey_cmp_packed(b, l, r) ?:
cmp_int((unsigned long) l, (unsigned long) r);
}
* and should be dropped.
*/
return iter->used >= 2 &&
- !bkey_cmp_packed(iter->b,
+ !bch2_bkey_cmp_packed(iter->b,
iter->data[0].k,
iter->data[1].k);
}
sort_iter_sort(iter, key_sort_fix_overlapping_cmp);
while ((k = sort_iter_peek(iter))) {
- if (!bkey_whiteout(k) &&
+ if (!bkey_deleted(k) &&
!should_drop_next_key(iter)) {
bkey_copy(out, k);
btree_keys_account_key_add(&nr, 0, out);
struct bkey_packed **out,
struct bkey_s k)
{
- if (!bkey_whiteout(k.k)) {
+ if (!bkey_deleted(k.k)) {
if (!bch2_bkey_pack_key(*out, k.k, f))
memcpy_u64s_small(*out, k.k, BKEY_U64s);
memset(&nr, 0, sizeof(nr));
while ((in = bch2_btree_node_iter_next_all(src_iter, src))) {
- if (filter_whiteouts && bkey_whiteout(in))
+ if (filter_whiteouts && bkey_deleted(in))
continue;
if (bch2_bkey_transform(out_f, out, bkey_packed(in)
bool filter_whiteouts)
{
struct bkey_packed *out = vstruct_last(dst), *k_packed;
- struct bkey_on_stack k;
+ struct bkey_buf k;
struct btree_nr_keys nr;
memset(&nr, 0, sizeof(nr));
- bkey_on_stack_init(&k);
+ bch2_bkey_buf_init(&k);
while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) {
- if (filter_whiteouts && bkey_whiteout(k_packed))
+ if (filter_whiteouts && bkey_deleted(k_packed))
continue;
/*
* node; we have to make a copy of the entire key before calling
* normalize
*/
- bkey_on_stack_realloc(&k, c, k_packed->u64s + BKEY_U64s);
+ bch2_bkey_buf_realloc(&k, c, k_packed->u64s + BKEY_U64s);
bch2_bkey_unpack(src, k.k, k_packed);
if (filter_whiteouts &&
}
dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
- bkey_on_stack_exit(&k, c);
+ bch2_bkey_buf_exit(&k, c);
return nr;
}
struct bkey_packed *l,
struct bkey_packed *r)
{
- return bkey_cmp_packed(b, l, r) ?:
+ return bch2_bkey_cmp_packed(b, l, r) ?:
(int) bkey_deleted(r) - (int) bkey_deleted(l) ?:
(int) l->needs_whiteout - (int) r->needs_whiteout;
}
while ((in = sort_iter_next(iter, sort_keys_cmp))) {
bool needs_whiteout = false;
- if (bkey_whiteout(in) &&
+ if (bkey_deleted(in) &&
(filter_whiteouts || !in->needs_whiteout))
continue;
while ((next = sort_iter_peek(iter)) &&
- !bkey_cmp_packed(iter->b, in, next)) {
+ !bch2_bkey_cmp_packed(iter->b, in, next)) {
BUG_ON(in->needs_whiteout &&
next->needs_whiteout);
needs_whiteout |= in->needs_whiteout;
in = sort_iter_next(iter, sort_keys_cmp);
}
- if (bkey_whiteout(in)) {
+ if (bkey_deleted(in)) {
memcpy_u64s(out, in, bkeyp_key_u64s(f, in));
set_bkeyp_val_u64s(f, out, 0);
} else {
return (u64 *) out - (u64 *) dst;
}
-
-/* Compat code for btree_node_old_extent_overwrite: */
-
-/*
- * If keys compare equal, compare by pointer order:
- *
- * Necessary for sort_fix_overlapping() - if there are multiple keys that
- * compare equal in different sets, we have to process them newest to oldest.
- */
-static inline int extent_sort_fix_overlapping_cmp(struct btree *b,
- struct bkey_packed *l,
- struct bkey_packed *r)
-{
- struct bkey ul = bkey_unpack_key(b, l);
- struct bkey ur = bkey_unpack_key(b, r);
-
- return bkey_cmp(bkey_start_pos(&ul),
- bkey_start_pos(&ur)) ?:
- cmp_int((unsigned long) r, (unsigned long) l);
-}
-
-/*
- * The algorithm in extent_sort_fix_overlapping() relies on keys in the same
- * bset being ordered by start offset - but 0 size whiteouts (which are always
- * KEY_TYPE_deleted) break this ordering, so we need to skip over them:
- */
-static void extent_iter_advance(struct sort_iter *iter, unsigned idx)
-{
- struct sort_iter_set *i = iter->data + idx;
-
- do {
- i->k = bkey_next_skip_noops(i->k, i->end);
- } while (i->k != i->end && bkey_deleted(i->k));
-
- if (i->k == i->end)
- array_remove_item(iter->data, iter->used, idx);
- else
- __sort_iter_sift(iter, idx, extent_sort_fix_overlapping_cmp);
-}
-
-struct btree_nr_keys
-bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
- struct sort_iter *iter)
-{
- struct btree *b = iter->b;
- struct bkey_format *f = &b->format;
- struct sort_iter_set *_l = iter->data, *_r = iter->data + 1;
- struct bkey_packed *out = dst->start;
- struct bkey l_unpacked, r_unpacked;
- struct bkey_s l, r;
- struct btree_nr_keys nr;
- struct bkey_on_stack split;
- unsigned i;
-
- memset(&nr, 0, sizeof(nr));
- bkey_on_stack_init(&split);
-
- sort_iter_sort(iter, extent_sort_fix_overlapping_cmp);
- for (i = 0; i < iter->used;) {
- if (bkey_deleted(iter->data[i].k))
- __sort_iter_advance(iter, i,
- extent_sort_fix_overlapping_cmp);
- else
- i++;
- }
-
- while (!sort_iter_end(iter)) {
- l = __bkey_disassemble(b, _l->k, &l_unpacked);
-
- if (iter->used == 1) {
- extent_sort_append(c, f, &nr, &out, l);
- extent_iter_advance(iter, 0);
- continue;
- }
-
- r = __bkey_disassemble(b, _r->k, &r_unpacked);
-
- /* If current key and next key don't overlap, just append */
- if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) {
- extent_sort_append(c, f, &nr, &out, l);
- extent_iter_advance(iter, 0);
- continue;
- }
-
- /* Skip 0 size keys */
- if (!r.k->size) {
- extent_iter_advance(iter, 1);
- continue;
- }
-
- /*
- * overlap: keep the newer key and trim the older key so they
- * don't overlap. comparing pointers tells us which one is
- * newer, since the bsets are appended one after the other.
- */
-
- /* can't happen because of comparison func */
- BUG_ON(_l->k < _r->k &&
- !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k)));
-
- if (_l->k > _r->k) {
- /* l wins, trim r */
- if (bkey_cmp(l.k->p, r.k->p) >= 0) {
- extent_iter_advance(iter, 1);
- } else {
- bch2_cut_front_s(l.k->p, r);
- extent_save(b, _r->k, r.k);
- __sort_iter_sift(iter, 1,
- extent_sort_fix_overlapping_cmp);
- }
- } else if (bkey_cmp(l.k->p, r.k->p) > 0) {
-
- /*
- * r wins, but it overlaps in the middle of l - split l:
- */
- bkey_on_stack_reassemble(&split, c, l.s_c);
- bch2_cut_back(bkey_start_pos(r.k), split.k);
-
- bch2_cut_front_s(r.k->p, l);
- extent_save(b, _l->k, l.k);
-
- __sort_iter_sift(iter, 0,
- extent_sort_fix_overlapping_cmp);
-
- extent_sort_append(c, f, &nr, &out,
- bkey_i_to_s(split.k));
- } else {
- bch2_cut_back_s(bkey_start_pos(r.k), l);
- extent_save(b, _l->k, l.k);
- }
- }
-
- dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
-
- bkey_on_stack_exit(&split, c);
- return nr;
-}
-
-static inline int sort_extents_cmp(struct btree *b,
- struct bkey_packed *l,
- struct bkey_packed *r)
-{
- return bkey_cmp_packed(b, l, r) ?:
- (int) bkey_deleted(l) - (int) bkey_deleted(r);
-}
-
-unsigned bch2_sort_extents(struct bkey_packed *dst,
- struct sort_iter *iter,
- bool filter_whiteouts)
-{
- struct bkey_packed *in, *out = dst;
-
- sort_iter_sort(iter, sort_extents_cmp);
-
- while ((in = sort_iter_next(iter, sort_extents_cmp))) {
- if (bkey_deleted(in))
- continue;
-
- if (bkey_whiteout(in) &&
- (filter_whiteouts || !in->needs_whiteout))
- continue;
-
- bkey_copy(out, in);
- out = bkey_next(out);
- }
-
- return (u64 *) out - (u64 *) dst;
-}
-
-static inline int sort_extent_whiteouts_cmp(struct btree *b,
- struct bkey_packed *l,
- struct bkey_packed *r)
-{
- struct bkey ul = bkey_unpack_key(b, l);
- struct bkey ur = bkey_unpack_key(b, r);
-
- return bkey_cmp(bkey_start_pos(&ul), bkey_start_pos(&ur));
-}
-
-unsigned bch2_sort_extent_whiteouts(struct bkey_packed *dst,
- struct sort_iter *iter)
-{
- const struct bkey_format *f = &iter->b->format;
- struct bkey_packed *in, *out = dst;
- struct bkey_i l, r;
- bool prev = false, l_packed = false;
- u64 max_packed_size = bkey_field_max(f, BKEY_FIELD_SIZE);
- u64 max_packed_offset = bkey_field_max(f, BKEY_FIELD_OFFSET);
- u64 new_size;
-
- max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX);
-
- sort_iter_sort(iter, sort_extent_whiteouts_cmp);
-
- while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) {
- if (bkey_deleted(in))
- continue;
-
- EBUG_ON(bkeyp_val_u64s(f, in));
- EBUG_ON(in->type != KEY_TYPE_discard);
-
- r.k = bkey_unpack_key(iter->b, in);
-
- if (prev &&
- bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) {
- if (bkey_cmp(l.k.p, r.k.p) >= 0)
- continue;
-
- new_size = l_packed
- ? min(max_packed_size, max_packed_offset -
- bkey_start_offset(&l.k))
- : KEY_SIZE_MAX;
-
- new_size = min(new_size, r.k.p.offset -
- bkey_start_offset(&l.k));
-
- BUG_ON(new_size < l.k.size);
-
- bch2_key_resize(&l.k, new_size);
-
- if (bkey_cmp(l.k.p, r.k.p) >= 0)
- continue;
-
- bch2_cut_front(l.k.p, &r);
- }
-
- if (prev) {
- if (!bch2_bkey_pack(out, &l, f)) {
- BUG_ON(l_packed);
- bkey_copy(out, &l);
- }
- out = bkey_next(out);
- }
-
- l = r;
- prev = true;
- l_packed = bkey_packed(in);
- }
-
- if (prev) {
- if (!bch2_bkey_pack(out, &l, f)) {
- BUG_ON(l_packed);
- bkey_copy(out, &l);
- }
- out = bkey_next(out);
- }
-
- return (u64 *) out - (u64 *) dst;
-}
struct btree_nr_keys
bch2_key_sort_fix_overlapping(struct bch_fs *, struct bset *,
struct sort_iter *);
-struct btree_nr_keys
-bch2_extent_sort_fix_overlapping(struct bch_fs *, struct bset *,
- struct sort_iter *);
struct btree_nr_keys
bch2_sort_repack(struct bset *, struct btree *,
unsigned bch2_sort_keys(struct bkey_packed *,
struct sort_iter *, bool);
-unsigned bch2_sort_extents(struct bkey_packed *,
- struct sort_iter *, bool);
-
-unsigned bch2_sort_extent_whiteouts(struct bkey_packed *,
- struct sort_iter *);
#endif /* _BCACHEFS_BKEY_SORT_H */
for (_k = i->start;
_k < vstruct_last(i);
_k = _n) {
- _n = bkey_next_skip_noops(_k, vstruct_last(i));
+ _n = bkey_next(_k);
k = bkey_disassemble(b, _k, &uk);
if (c)
n = bkey_unpack_key(b, _n);
- if (bkey_cmp(bkey_start_pos(&n), k.k->p) < 0) {
+ if (bpos_cmp(n.p, k.k->p) < 0) {
printk(KERN_ERR "Key skipped backwards\n");
continue;
}
if (!bkey_deleted(k.k) &&
- !bkey_cmp(n.p, k.k->p))
+ !bpos_cmp(n.p, k.k->p))
printk(KERN_ERR "Duplicate keys\n");
}
}
for_each_bset(b, t)
bset_tree_for_each_key(b, t, k)
- if (!bkey_whiteout(k))
+ if (!bkey_deleted(k))
btree_keys_account_key_add(&nr, t - b->set, k);
BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
return ro_aux_tree_base(b, t)->f + idx;
}
-static void bset_aux_tree_verify(struct btree *b)
+static void bset_aux_tree_verify(const struct btree *b)
{
#ifdef CONFIG_BCACHEFS_DEBUG
- struct bset_tree *t;
+ const struct bset_tree *t;
for_each_bset(b, t) {
if (t->aux_data_offset == U16_MAX)
#endif
}
-void bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks)
+void bch2_btree_keys_init(struct btree *b)
{
unsigned i;
b->nsets = 0;
memset(&b->nr, 0, sizeof(b->nr));
-#ifdef CONFIG_BCACHEFS_DEBUG
- b->expensive_debug_checks = expensive_debug_checks;
-#endif
+
for (i = 0; i < MAX_BSETS; i++)
b->set[i].data_offset = U16_MAX;
struct bkey_packed *k = btree_bkey_first(b, t);
unsigned j = 0;
- if (!btree_keys_expensive_checks(b))
+ if (!bch2_expensive_debug_checks)
return;
BUG_ON(bset_has_ro_aux_tree(t));
goto start;
while (1) {
if (rw_aux_to_bkey(b, t, j) == k) {
- BUG_ON(bkey_cmp(rw_aux_tree(b, t)[j].k,
+ BUG_ON(bpos_cmp(rw_aux_tree(b, t)[j].k,
bkey_unpack_pos(b, k)));
start:
if (++j == t->size)
rw_aux_tree(b, t)[j - 1].offset);
}
- k = bkey_next_skip_noops(k, btree_bkey_last(b, t));
+ k = bkey_next(k);
BUG_ON(k >= btree_bkey_last(b, t));
}
}
return (u16) v;
}
-static void make_bfloat(struct btree *b, struct bset_tree *t,
- unsigned j,
- struct bkey_packed *min_key,
- struct bkey_packed *max_key)
+__always_inline
+static inline void __make_bfloat(struct btree *b, struct bset_tree *t,
+ unsigned j,
+ struct bkey_packed *min_key,
+ struct bkey_packed *max_key)
{
struct bkey_float *f = bkey_float(b, t, j);
struct bkey_packed *m = tree_to_bkey(b, t, j);
- struct bkey_packed *l, *r;
+ struct bkey_packed *l = is_power_of_2(j)
+ ? min_key
+ : tree_to_prev_bkey(b, t, j >> ffs(j));
+ struct bkey_packed *r = is_power_of_2(j + 1)
+ ? max_key
+ : tree_to_bkey(b, t, j >> (ffz(j) + 1));
unsigned mantissa;
int shift, exponent, high_bit;
- if (is_power_of_2(j)) {
- l = min_key;
-
- if (!l->u64s) {
- if (!bkey_pack_pos(l, b->data->min_key, b)) {
- struct bkey_i tmp;
-
- bkey_init(&tmp.k);
- tmp.k.p = b->data->min_key;
- bkey_copy(l, &tmp);
- }
- }
- } else {
- l = tree_to_prev_bkey(b, t, j >> ffs(j));
-
- EBUG_ON(m < l);
- }
-
- if (is_power_of_2(j + 1)) {
- r = max_key;
-
- if (!r->u64s) {
- if (!bkey_pack_pos(r, t->max_key, b)) {
- struct bkey_i tmp;
-
- bkey_init(&tmp.k);
- tmp.k.p = t->max_key;
- bkey_copy(r, &tmp);
- }
- }
- } else {
- r = tree_to_bkey(b, t, j >> (ffz(j) + 1));
-
- EBUG_ON(m > r);
- }
-
/*
* for failed bfloats, the lookup code falls back to comparing against
* the original key.
f->mantissa = mantissa;
}
+static void make_bfloat(struct btree *b, struct bset_tree *t,
+ unsigned j,
+ struct bkey_packed *min_key,
+ struct bkey_packed *max_key)
+{
+ struct bkey_i *k;
+
+ if (is_power_of_2(j) &&
+ !min_key->u64s) {
+ if (!bkey_pack_pos(min_key, b->data->min_key, b)) {
+ k = (void *) min_key;
+ bkey_init(&k->k);
+ k->k.p = b->data->min_key;
+ }
+ }
+
+ if (is_power_of_2(j + 1) &&
+ !max_key->u64s) {
+ if (!bkey_pack_pos(max_key, b->data->max_key, b)) {
+ k = (void *) max_key;
+ bkey_init(&k->k);
+ k->k.p = b->data->max_key;
+ }
+ }
+
+ __make_bfloat(b, t, j, min_key, max_key);
+}
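The is_power_of_2() checks encode where a node sits in the eytzinger1 layout: indices 1, 2, 4, ... form the leftmost spine and indices where j + 1 is a power of two form the rightmost spine, so only those nodes need the btree node's own min_key/max_key as an outer bound - which is why the unpacked-key fallback for those two bounds could move out of __make_bfloat() into the wrapper above. Sketch of the layout, as I read it:

/*
 *                1
 *            2       3
 *          4   5   6   7
 *
 * leftmost spine  (is_power_of_2(j)):     1, 2, 4, ...  -> left bound  = min_key
 * rightmost spine (is_power_of_2(j + 1)): 1, 3, 7, ...  -> right bound = max_key
 * every other node takes both bounds from ancestor keys already in the tree.
 */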
+
/* bytes remaining - only valid for last bset: */
-static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t)
+static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t)
{
bset_aux_tree_verify(b);
return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64);
}
-static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t)
+static unsigned bset_ro_tree_capacity(const struct btree *b, const struct bset_tree *t)
{
return __bset_tree_capacity(b, t) /
(sizeof(struct bkey_float) + sizeof(u8));
}
-static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t)
+static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_tree *t)
{
return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree);
}
-static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
+static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
{
struct bkey_packed *k;
}
}
-static void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
+static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
{
struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
- struct bkey_packed min_key, max_key;
+ struct bkey_i min_key, max_key;
unsigned j, cacheline = 1;
- /* signal to make_bfloat() that they're uninitialized: */
- min_key.u64s = max_key.u64s = 0;
-
t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
bset_ro_tree_capacity(b, t));
retry:
/* First we figure out where the first key in each cacheline is */
eytzinger1_for_each(j, t->size) {
while (bkey_to_cacheline(b, t, k) < cacheline)
- prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t));
+ prev = k, k = bkey_next(k);
if (k >= btree_bkey_last(b, t)) {
/* XXX: this path sucks */
}
while (k != btree_bkey_last(b, t))
- prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t));
+ prev = k, k = bkey_next(k);
+
+ if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) {
+ bkey_init(&min_key.k);
+ min_key.k.p = b->data->min_key;
+ }
- t->max_key = bkey_unpack_pos(b, prev);
+ if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) {
+ bkey_init(&max_key.k);
+ max_key.k.p = b->data->max_key;
+ }
/* Then we build the tree */
eytzinger1_for_each(j, t->size)
- make_bfloat(b, t, j, &min_key, &max_key);
+ __make_bfloat(b, t, j,
+ bkey_to_packed(&min_key),
+ bkey_to_packed(&max_key));
}
static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
struct bkey_packed *p, *i, *ret = NULL, *orig_k = k;
while ((p = __bkey_prev(b, t, k)) && !ret) {
- for (i = p; i != k; i = bkey_next_skip_noops(i, k))
+ for (i = p; i != k; i = bkey_next(i))
if (i->type >= min_key_type)
ret = i;
k = p;
}
- if (btree_keys_expensive_checks(b)) {
+ if (bch2_expensive_debug_checks) {
BUG_ON(ret >= orig_k);
for (i = ret
- ? bkey_next_skip_noops(ret, orig_k)
+ ? bkey_next(ret)
: btree_bkey_first(b, t);
i != orig_k;
- i = bkey_next_skip_noops(i, orig_k))
+ i = bkey_next(i))
BUG_ON(i->type >= min_key_type);
}
/* signal to make_bfloat() that they're uninitialized: */
min_key.u64s = max_key.u64s = 0;
- if (bkey_next_skip_noops(k, btree_bkey_last(b, t)) == btree_bkey_last(b, t)) {
- t->max_key = bkey_unpack_pos(b, k);
-
+ if (bkey_next(k) == btree_bkey_last(b, t)) {
for (j = 1; j < t->size; j = j * 2 + 1)
make_bfloat(b, t, j, &min_key, &max_key);
}
struct bkey_packed *k = start;
while (1) {
- k = bkey_next_skip_noops(k, end);
+ k = bkey_next(k);
if (k == end)
break;
if (bch2_bkey_pack_key(&packed, &insert->k, f))
src = &packed;
- if (!bkey_whiteout(&insert->k))
+ if (!bkey_deleted(&insert->k))
btree_keys_account_key_add(&b->nr, t - b->set, src);
if (src->u64s != clobber_u64s) {
__flatten
static struct bkey_packed *bset_search_write_set(const struct btree *b,
struct bset_tree *t,
- struct bpos *search,
- const struct bkey_packed *packed_search)
+ struct bpos *search)
{
unsigned l = 0, r = t->size;
while (l + 1 != r) {
unsigned m = (l + r) >> 1;
- if (bkey_cmp(rw_aux_tree(b, t)[m].k, *search) < 0)
+ if (bpos_cmp(rw_aux_tree(b, t)[m].k, *search) < 0)
l = m;
else
r = m;
__flatten
static struct bkey_packed *bset_search_tree(const struct btree *b,
- struct bset_tree *t,
- struct bpos *search,
+ const struct bset_tree *t,
+ const struct bpos *search,
const struct bkey_packed *packed_search)
{
struct ro_aux_tree *base = ro_aux_tree_base(b, t);
prefetch(&base->f[n << 4]);
f = &base->f[n];
-
- if (!unlikely(packed_search))
- goto slowpath;
if (unlikely(f->exponent >= BFLOAT_FAILED))
goto slowpath;
case BSET_NO_AUX_TREE:
return btree_bkey_first(b, t);
case BSET_RW_AUX_TREE:
- return bset_search_write_set(b, t, search, lossy_packed_search);
+ return bset_search_write_set(b, t, search);
case BSET_RO_AUX_TREE:
- /*
- * Each node in the auxiliary search tree covers a certain range
- * of bits, and keys above and below the set it covers might
- * differ outside those bits - so we have to special case the
- * start and end - handle that here:
- */
-
- if (bkey_cmp(*search, t->max_key) > 0)
- return btree_bkey_last(b, t);
-
return bset_search_tree(b, t, search, lossy_packed_search);
default:
unreachable();
while (m != btree_bkey_last(b, t) &&
bkey_iter_cmp_p_or_unp(b, m,
lossy_packed_search, search) < 0)
- m = bkey_next_skip_noops(m, btree_bkey_last(b, t));
+ m = bkey_next(m);
if (!packed_search)
while (m != btree_bkey_last(b, t) &&
bkey_iter_pos_cmp(b, m, search) < 0)
- m = bkey_next_skip_noops(m, btree_bkey_last(b, t));
+ m = bkey_next(m);
- if (btree_keys_expensive_checks(b)) {
+ if (bch2_expensive_debug_checks) {
struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
BUG_ON(prev &&
return m;
}
-/*
- * Returns the first key greater than or equal to @search
- */
-static __always_inline __flatten
-struct bkey_packed *bch2_bset_search(struct btree *b,
- struct bset_tree *t,
- struct bpos *search,
- struct bkey_packed *packed_search,
- const struct bkey_packed *lossy_packed_search)
-{
- struct bkey_packed *m = __bch2_bset_search(b, t, search,
- lossy_packed_search);
-
- return bch2_bset_search_linear(b, t, search,
- packed_search, lossy_packed_search, m);
-}
-
/* Btree node iterator */
static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter,
static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
struct btree *b, struct bpos *search)
{
- struct bset_tree *t;
+ struct bkey_packed *k;
trace_bkey_pack_pos_fail(search);
- for_each_bset(b, t)
- __bch2_btree_node_iter_push(iter, b,
- bch2_bset_search(b, t, search, NULL, NULL),
- btree_bkey_last(b, t));
+ bch2_btree_node_iter_init_from_start(iter, b);
- bch2_btree_node_iter_sort(iter, b);
+ while ((k = bch2_btree_node_iter_peek(iter, b)) &&
+ bkey_iter_pos_cmp(b, k, search) < 0)
+ bch2_btree_node_iter_advance(iter, b);
}
/**
* to the search key is going to have 0 sectors after the search key.
*
* But this does mean that we can't just search for
- * bkey_successor(start_of_range) to get the first extent that overlaps with
+ * bpos_successor(start_of_range) to get the first extent that overlaps with
* the range we want - if we're unlucky and there's an extent that ends
* exactly where we searched, then there could be a deleted key at the same
* position and we'd get that when we search instead of the preceding extent
struct bkey_packed *k[MAX_BSETS];
unsigned i;
- EBUG_ON(bkey_cmp(*search, b->data->min_key) < 0);
+ EBUG_ON(bpos_cmp(*search, b->data->min_key) < 0);
+ EBUG_ON(bpos_cmp(*search, b->data->max_key) > 0);
bset_aux_tree_verify(b);
memset(iter, 0, sizeof(*iter));
void bch2_btree_node_iter_advance(struct btree_node_iter *iter,
struct btree *b)
{
- if (btree_keys_expensive_checks(b)) {
+ if (bch2_expensive_debug_checks) {
bch2_btree_node_iter_verify(iter, b);
bch2_btree_node_iter_next_check(iter, b);
}
struct bset_tree *t;
unsigned end = 0;
- if (btree_keys_expensive_checks(b))
+ if (bch2_expensive_debug_checks)
bch2_btree_node_iter_verify(iter, b);
for_each_bset(b, t) {
iter->data[0].k = __btree_node_key_to_offset(b, prev);
iter->data[0].end = end;
- if (btree_keys_expensive_checks(b))
+ if (bch2_expensive_debug_checks)
bch2_btree_node_iter_verify(iter, b);
return prev;
}
-struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *iter,
- struct btree *b,
- unsigned min_key_type)
+struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *iter,
+ struct btree *b)
{
struct bkey_packed *prev;
do {
prev = bch2_btree_node_iter_prev_all(iter, b);
- } while (prev && prev->type < min_key_type);
+ } while (prev && bkey_deleted(prev));
return prev;
}
uk = bkey_unpack_key(b, k);
pr_buf(out,
" failed unpacked at depth %u\n"
- "\t%llu:%llu\n",
- ilog2(j),
- uk.p.inode, uk.p.offset);
+ "\t",
+ ilog2(j));
+ bch2_bpos_to_text(out, uk.p);
+ pr_buf(out, "\n");
break;
}
}
#include <linux/kernel.h>
#include <linux/types.h>
-#include "bcachefs_format.h"
+#include "bcachefs.h"
#include "bkey.h"
#include "bkey_methods.h"
#include "btree_types.h"
* first key in that range of bytes again.
*/
-extern bool bch2_expensive_debug_checks;
-
-static inline bool btree_keys_expensive_checks(const struct btree *b)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- return bch2_expensive_debug_checks || *b->expensive_debug_checks;
-#else
- return false;
-#endif
-}
-
enum bset_aux_tree_type {
BSET_NO_AUX_TREE,
BSET_RO_AUX_TREE,
#define BSET_CACHELINE 128
-static inline size_t btree_keys_cachelines(struct btree *b)
+static inline size_t btree_keys_cachelines(const struct btree *b)
{
return (1U << b->byte_order) / BSET_CACHELINE;
}
-static inline size_t btree_aux_data_bytes(struct btree *b)
+static inline size_t btree_aux_data_bytes(const struct btree *b)
{
return btree_keys_cachelines(b) * 8;
}
-static inline size_t btree_aux_data_u64s(struct btree *b)
+static inline size_t btree_aux_data_u64s(const struct btree *b)
{
return btree_aux_data_bytes(b) / sizeof(u64);
}
compiled_unpack_fn unpack_fn = b->aux_data;
unpack_fn(dst, src);
- if (btree_keys_expensive_checks(b)) {
+ if (bch2_expensive_debug_checks) {
struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
BUG_ON(memcmp(dst, &dst2, sizeof(*dst)));
#define bset_tree_for_each_key(_b, _t, _k) \
for (_k = btree_bkey_first(_b, _t); \
_k != btree_bkey_last(_b, _t); \
- _k = bkey_next_skip_noops(_k, btree_bkey_last(_b, _t)))
+ _k = bkey_next(_k))
static inline bool bset_has_ro_aux_tree(struct bset_tree *t)
{
return ((void *) i) + round_up(vstruct_bytes(i), block_bytes);
}
-void bch2_btree_keys_init(struct btree *, bool *);
+void bch2_btree_keys_init(struct btree *);
void bch2_bset_init_first(struct btree *, struct bset *);
void bch2_bset_init_next(struct bch_fs *, struct btree *,
EBUG_ON(r_packed && !bkey_packed(r_packed));
if (unlikely(!bkey_packed(l)))
- return bkey_cmp(packed_to_bkey_c(l)->p, *r);
+ return bpos_cmp(packed_to_bkey_c(l)->p, *r);
if (likely(r_packed))
return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b);
static inline struct bkey_packed *
bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
{
- return bch2_bkey_prev_filter(b, t, k, KEY_TYPE_discard + 1);
-}
-
-enum bch_extent_overlap {
- BCH_EXTENT_OVERLAP_ALL = 0,
- BCH_EXTENT_OVERLAP_BACK = 1,
- BCH_EXTENT_OVERLAP_FRONT = 2,
- BCH_EXTENT_OVERLAP_MIDDLE = 3,
-};
-
-/* Returns how k overlaps with m */
-static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
- const struct bkey *m)
-{
- int cmp1 = bkey_cmp(k->p, m->p) < 0;
- int cmp2 = bkey_cmp(bkey_start_pos(k),
- bkey_start_pos(m)) > 0;
-
- return (cmp1 << 1) + cmp2;
+ return bch2_bkey_prev_filter(b, t, k, 1);
}
/* Btree key iteration */
const struct bkey_packed *l,
const struct bkey_packed *r)
{
- return bkey_cmp_packed(b, l, r)
+ return bch2_bkey_cmp_packed(b, l, r)
?: (int) bkey_deleted(r) - (int) bkey_deleted(l)
?: cmp_int(l, r);
}
}
static inline struct bkey_packed *
-bch2_btree_node_iter_peek_filter(struct btree_node_iter *iter,
- struct btree *b,
- unsigned min_key_type)
+bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, struct btree *b)
{
- while (!bch2_btree_node_iter_end(iter)) {
- struct bkey_packed *k = __bch2_btree_node_iter_peek_all(iter, b);
-
- if (k->type >= min_key_type)
- return k;
-
- bch2_btree_node_iter_advance(iter, b);
- }
-
- return NULL;
-}
-
-static inline struct bkey_packed *
-bch2_btree_node_iter_peek_all(struct btree_node_iter *iter,
- struct btree *b)
-{
- return bch2_btree_node_iter_peek_filter(iter, b, 0);
+ return !bch2_btree_node_iter_end(iter)
+ ? __btree_node_offset_to_key(b, iter->data->k)
+ : NULL;
}
static inline struct bkey_packed *
bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b)
{
- return bch2_btree_node_iter_peek_filter(iter, b, KEY_TYPE_discard + 1);
+ struct bkey_packed *k;
+
+ while ((k = bch2_btree_node_iter_peek_all(iter, b)) &&
+ bkey_deleted(k))
+ bch2_btree_node_iter_advance(iter, b);
+
+ return k;
}
static inline struct bkey_packed *
struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *,
struct btree *);
-struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *,
- struct btree *, unsigned);
-
-static inline struct bkey_packed *
-bch2_btree_node_iter_prev(struct btree_node_iter *iter, struct btree *b)
-{
- return bch2_btree_node_iter_prev_filter(iter, b, KEY_TYPE_discard + 1);
-}
+struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *,
+ struct btree *);
struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *,
struct btree *,
static inline void bch2_verify_btree_nr_keys(struct btree *b)
{
- if (btree_keys_expensive_checks(b))
+ if (bch2_debug_check_btree_accounting)
__bch2_verify_btree_nr_keys(b);
}
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
+#include "error.h"
#include <linux/prefetch.h>
#include <linux/sched/mm.h>
#include <trace/events/bcachefs.h>
-const char * const bch2_btree_ids[] = {
-#define x(kwd, val, name) name,
- BCH_BTREE_IDS()
-#undef x
- NULL
-};
-
void bch2_recalc_btree_reserve(struct bch_fs *c)
{
unsigned i, reserve = 16;
b->c.level = level;
b->c.btree_id = id;
+ if (level)
+ six_lock_pcpu_alloc(&b->c.lock);
+ else
+ six_lock_pcpu_free_rcu(&b->c.lock);
+
mutex_lock(&bc->lock);
ret = __bch2_btree_node_hash_insert(bc, b);
if (!ret)
* - unless btree verify mode is enabled, since it runs out of
* the post write cleanup:
*/
- if (verify_btree_ondisk(c))
+ if (bch2_verify_btree_ondisk)
bch2_btree_node_write(c, b, SIX_LOCK_intent);
else
__bch2_btree_node_write(c, b, SIX_LOCK_read);
unsigned long freed = 0;
unsigned i, flags;
- if (btree_shrinker_disabled(c))
+ if (bch2_btree_shrinker_disabled)
return SHRINK_STOP;
/* Return -1 if we can't do anything right now */
clear_btree_node_accessed(b);
}
- memalloc_nofs_restore(flags);
mutex_unlock(&bc->lock);
out:
+ memalloc_nofs_restore(flags);
return (unsigned long) freed * btree_pages(c);
}
btree_cache.shrink);
struct btree_cache *bc = &c->btree_cache;
- if (btree_shrinker_disabled(c))
+ if (bch2_btree_shrinker_disabled)
return 0;
return btree_cache_can_free(bc) * btree_pages(c);
if (btree_node_dirty(b))
bch2_btree_complete_write(c, b, btree_current_write(b));
- clear_btree_node_dirty(b);
+ clear_btree_node_dirty(c, b);
btree_node_data_free(c, b);
}
+ BUG_ON(atomic_read(&c->btree_cache.dirty));
+
while (!list_empty(&bc->freed)) {
b = list_first_entry(&bc->freed, struct btree, list);
list_del(&b->list);
+ six_lock_pcpu_free(&b->c.lock);
kfree(b);
}
bc->shrink.scan_objects = bch2_btree_cache_scan;
bc->shrink.seeks = 4;
bc->shrink.batch = btree_pages(c) * 2;
- register_shrinker(&bc->shrink);
+ ret = register_shrinker(&bc->shrink);
out:
pr_verbose_init(c->opts, "ret %i", ret);
return ret;
b->sib_u64s[0] = 0;
b->sib_u64s[1] = 0;
b->whiteout_u64s = 0;
- bch2_btree_keys_init(b, &c->expensive_debug_checks);
+ bch2_btree_keys_init(b);
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
start_time);
*/
struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter,
const struct bkey_i *k, unsigned level,
- enum six_lock_type lock_type)
+ enum six_lock_type lock_type,
+ unsigned long trace_ip)
{
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
btree_node_unlock(iter, level + 1);
if (!btree_node_lock(b, k->k.p, level, iter, lock_type,
- lock_node_check_fn, (void *) k)) {
+ lock_node_check_fn, (void *) k, trace_ip)) {
if (b->hash_val != btree_ptr_hash_val(k))
goto retry;
return ERR_PTR(-EINTR);
return ERR_PTR(-EIO);
}
- EBUG_ON(b->c.btree_id != iter->btree_id ||
- BTREE_NODE_LEVEL(b->data) != level ||
- bkey_cmp(b->data->max_key, k->k.p));
+ EBUG_ON(b->c.btree_id != iter->btree_id);
+ EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
+ EBUG_ON(bpos_cmp(b->data->max_key, k->k.p));
+ EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+ bpos_cmp(b->data->min_key,
+ bkey_i_to_btree_ptr_v2(&b->key)->v.min_key));
return b;
}
struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
const struct bkey_i *k,
enum btree_id btree_id,
- unsigned level)
+ unsigned level,
+ bool nofill)
{
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
retry:
b = btree_cache_find(bc, k);
if (unlikely(!b)) {
+ if (nofill)
+ goto out;
+
b = bch2_btree_node_fill(c, NULL, k, btree_id,
level, SIX_LOCK_read, true);
if (!b)
goto retry;
+ if (IS_ERR(b) &&
+ !bch2_btree_cache_cannibalize_lock(c, NULL))
+ goto retry;
+
if (IS_ERR(b))
- return b;
+ goto out;
} else {
lock_node:
ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k);
if (unlikely(btree_node_read_error(b))) {
six_unlock_read(&b->c.lock);
- return ERR_PTR(-EIO);
- }
-
- EBUG_ON(b->c.btree_id != btree_id ||
- BTREE_NODE_LEVEL(b->data) != level ||
- bkey_cmp(b->data->max_key, k->k.p));
-
- return b;
-}
-
-struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
- struct btree_iter *iter,
- struct btree *b,
- enum btree_node_sibling sib)
-{
- struct btree_trans *trans = iter->trans;
- struct btree *parent;
- struct btree_node_iter node_iter;
- struct bkey_packed *k;
- BKEY_PADDED(k) tmp;
- struct btree *ret = NULL;
- unsigned level = b->c.level;
-
- parent = btree_iter_node(iter, level + 1);
- if (!parent)
- return NULL;
-
- /*
- * There's a corner case where a btree_iter might have a node locked
- * that is just outside its current pos - when
- * bch2_btree_iter_set_pos_same_leaf() gets to the end of the node.
- *
- * But the lock ordering checks in __bch2_btree_node_lock() go off of
- * iter->pos, not the node's key: so if the iterator is marked as
- * needing to be traversed, we risk deadlock if we don't bail out here:
- */
- if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE)
- return ERR_PTR(-EINTR);
-
- if (!bch2_btree_node_relock(iter, level + 1)) {
- ret = ERR_PTR(-EINTR);
+ b = ERR_PTR(-EIO);
goto out;
}
- node_iter = iter->l[parent->c.level].iter;
-
- k = bch2_btree_node_iter_peek_all(&node_iter, parent);
- BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p));
-
- k = sib == btree_prev_sib
- ? bch2_btree_node_iter_prev(&node_iter, parent)
- : (bch2_btree_node_iter_advance(&node_iter, parent),
- bch2_btree_node_iter_peek(&node_iter, parent));
- if (!k)
- goto out;
-
- bch2_bkey_unpack(parent, &tmp.k, k);
-
- ret = bch2_btree_node_get(c, iter, &tmp.k, level,
- SIX_LOCK_intent);
-
- if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) {
- struct btree_iter *linked;
-
- if (!bch2_btree_node_relock(iter, level + 1))
- goto out;
-
- /*
- * We might have got -EINTR because trylock failed, and we're
- * holding other locks that would cause us to deadlock:
- */
- trans_for_each_iter(trans, linked)
- if (btree_iter_cmp(iter, linked) < 0)
- __bch2_btree_iter_unlock(linked);
-
- if (sib == btree_prev_sib)
- btree_node_unlock(iter, level);
-
- ret = bch2_btree_node_get(c, iter, &tmp.k, level,
- SIX_LOCK_intent);
-
- /*
- * before btree_iter_relock() calls btree_iter_verify_locks():
- */
- if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
- btree_node_unlock(iter, level + 1);
-
- if (!bch2_btree_node_relock(iter, level)) {
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
-
- if (!IS_ERR(ret)) {
- six_unlock_intent(&ret->c.lock);
- ret = ERR_PTR(-EINTR);
- }
- }
-
- bch2_trans_relock(trans);
- }
+ EBUG_ON(b->c.btree_id != btree_id);
+ EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
+ EBUG_ON(bpos_cmp(b->data->max_key, k->k.p));
+ EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+ bpos_cmp(b->data->min_key,
+ bkey_i_to_btree_ptr_v2(&b->key)->v.min_key));
out:
- if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
- btree_node_unlock(iter, level + 1);
-
- if (PTR_ERR_OR_ZERO(ret) == -EINTR)
- bch2_btree_iter_upgrade(iter, level + 2);
-
- BUG_ON(!IS_ERR(ret) && !btree_node_locked(iter, level));
-
- if (!IS_ERR_OR_NULL(ret)) {
- struct btree *n1 = ret, *n2 = b;
-
- if (sib != btree_prev_sib)
- swap(n1, n2);
-
- BUG_ON(bkey_cmp(bkey_successor(n1->key.k.p),
- n2->data->min_key));
- }
-
- bch2_btree_trans_verify_locks(trans);
-
- return ret;
+ bch2_btree_cache_cannibalize_unlock(c);
+ return b;
}
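The new nofill argument turns bch2_btree_node_get_noiter() into a cache-only lookup: if the node isn't already in the btree node cache it returns NULL instead of issuing a read. A hedged sketch of the calling pattern (the gc topology-repair code below uses the same shape to patch an already-cached child; the wrapper function is illustrative only):

static void example_peek_cached(struct bch_fs *c, const struct bkey_i *k,
				enum btree_id btree_id, unsigned level)
{
	/* nofill == true: NULL if not cached; may still be an ERR_PTR on error */
	struct btree *b = bch2_btree_node_get_noiter(c, k, btree_id, level, true);

	if (!IS_ERR_OR_NULL(b)) {
		/* ... inspect or update the cached node in place ... */
		six_unlock_read(&b->c.lock);
	}
}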
void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter,
- const struct bkey_i *k, unsigned level)
+ const struct bkey_i *k,
+ enum btree_id btree_id, unsigned level)
{
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
- BUG_ON(!btree_node_locked(iter, level + 1));
+ BUG_ON(iter && !btree_node_locked(iter, level + 1));
BUG_ON(level >= BTREE_MAX_DEPTH);
b = btree_cache_find(bc, k);
if (b)
return;
- bch2_btree_node_fill(c, iter, k, iter->btree_id,
- level, SIX_LOCK_read, false);
+ bch2_btree_node_fill(c, iter, k, btree_id, level, SIX_LOCK_read, false);
}
void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
bch2_btree_keys_stats(b, &stats);
- pr_buf(out,
- "l %u %llu:%llu - %llu:%llu:\n"
- " ptrs: ",
- b->c.level,
- b->data->min_key.inode,
- b->data->min_key.offset,
- b->data->max_key.inode,
- b->data->max_key.offset);
+ pr_buf(out, "l %u ", b->c.level);
+ bch2_bpos_to_text(out, b->data->min_key);
+ pr_buf(out, " - ");
+ bch2_bpos_to_text(out, b->data->max_key);
+ pr_buf(out, ":\n"
+ " ptrs: ");
bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+
pr_buf(out, "\n"
" format: u64s %u fields %u %u %u %u %u\n"
" unpack fn len: %u\n"
" bytes used %zu/%zu (%zu%% full)\n"
- " sib u64s: %u, %u (merge threshold %zu)\n"
+ " sib u64s: %u, %u (merge threshold %u)\n"
" nr packed keys %u\n"
" nr unpacked keys %u\n"
" floats %zu\n"
b->nr.live_u64s * 100 / btree_max_u64s(c),
b->sib_u64s[0],
b->sib_u64s[1],
- BTREE_FOREGROUND_MERGE_THRESHOLD(c),
+ c->btree_foreground_merge_threshold,
b->nr.packed_keys,
b->nr.unpacked_keys,
stats.floats,
stats.failed);
}
+
+void bch2_btree_cache_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ pr_buf(out, "nr nodes:\t\t%u\n", c->btree_cache.used);
+ pr_buf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty));
+ pr_buf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock);
+}
struct btree_iter;
-extern const char * const bch2_btree_ids[];
-
void bch2_recalc_btree_reserve(struct bch_fs *);
void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *);
struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
const struct bkey_i *, unsigned,
- enum six_lock_type);
+ enum six_lock_type, unsigned long);
struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *,
- enum btree_id, unsigned);
-
-struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *,
- struct btree *, enum btree_node_sibling);
+ enum btree_id, unsigned, bool);
void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *,
- const struct bkey_i *, unsigned);
+ const struct bkey_i *, enum btree_id, unsigned);
void bch2_fs_btree_cache_exit(struct bch_fs *);
int bch2_fs_btree_cache_init(struct bch_fs *);
#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3)
#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \
(BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \
- (BTREE_FOREGROUND_MERGE_THRESHOLD(c) << 2))
+ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) >> 2))
#define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->c.btree_id].b)
void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *,
struct btree *);
+void bch2_btree_cache_to_text(struct printbuf *, struct bch_fs *);
#endif /* _BCACHEFS_BTREE_CACHE_H */
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "bkey_methods.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
#include "btree_locking.h"
#include "btree_update_interior.h"
#include "btree_io.h"
__gc_pos_set(c, new_pos);
}
+/*
+ * Missing: if an interior btree node is empty, we need to do something -
+ * perhaps just kill it
+ */
static int bch2_gc_check_topology(struct bch_fs *c,
- struct bkey_s_c k,
- struct bpos *expected_start,
- struct bpos expected_end,
+ struct btree *b,
+ struct bkey_buf *prev,
+ struct bkey_buf cur,
bool is_last)
{
+ struct bpos node_start = b->data->min_key;
+ struct bpos node_end = b->data->max_key;
+ struct bpos expected_start = bkey_deleted(&prev->k->k)
+ ? node_start
+ : bpos_successor(prev->k->k.p);
+ char buf1[200], buf2[200];
+ bool update_min = false;
+ bool update_max = false;
int ret = 0;
- if (k.k->type == KEY_TYPE_btree_ptr_v2) {
- struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
+ if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) {
+ struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k);
- if (fsck_err_on(bkey_cmp(*expected_start, bp.v->min_key), c,
- "btree node with incorrect min_key: got %llu:%llu, should be %llu:%llu",
- bp.v->min_key.inode,
- bp.v->min_key.offset,
- expected_start->inode,
- expected_start->offset)) {
- BUG();
+ if (bkey_deleted(&prev->k->k)) {
+ struct printbuf out = PBUF(buf1);
+ pr_buf(&out, "start of node: ");
+ bch2_bpos_to_text(&out, node_start);
+ } else {
+ bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k));
}
- }
- *expected_start = bkey_cmp(k.k->p, POS_MAX)
- ? bkey_successor(k.k->p)
- : k.k->p;
+ if (fsck_err_on(bpos_cmp(expected_start, bp->v.min_key), c,
+ "btree node with incorrect min_key at btree %s level %u:\n"
+ " prev %s\n"
+ " cur %s",
+ bch2_btree_ids[b->c.btree_id], b->c.level,
+ buf1,
+ (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2)))
+ update_min = true;
+ }
if (fsck_err_on(is_last &&
- bkey_cmp(k.k->p, expected_end), c,
- "btree node with incorrect max_key: got %llu:%llu, should be %llu:%llu",
- k.k->p.inode,
- k.k->p.offset,
- expected_end.inode,
- expected_end.offset)) {
- BUG();
+ bpos_cmp(cur.k->k.p, node_end), c,
+ "btree node with incorrect max_key at btree %s level %u:\n"
+ " %s\n"
+ " expected %s",
+ bch2_btree_ids[b->c.btree_id], b->c.level,
+ (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1),
+ (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2)))
+ update_max = true;
+
+ bch2_bkey_buf_copy(prev, c, cur.k);
+
+ if (update_min || update_max) {
+ struct bkey_i *new;
+ struct bkey_i_btree_ptr_v2 *bp = NULL;
+ struct btree *n;
+
+ if (update_max) {
+ ret = bch2_journal_key_delete(c, b->c.btree_id,
+ b->c.level, cur.k->k.p);
+ if (ret)
+ return ret;
+ }
+
+ new = kmalloc(bkey_bytes(&cur.k->k), GFP_KERNEL);
+ if (!new) {
+ bch_err(c, "%s: error allocating new key", __func__);
+ return -ENOMEM;
+ }
+
+ bkey_copy(new, cur.k);
+
+ if (new->k.type == KEY_TYPE_btree_ptr_v2)
+ bp = bkey_i_to_btree_ptr_v2(new);
+
+ if (update_min)
+ bp->v.min_key = expected_start;
+ if (update_max)
+ new->k.p = node_end;
+ if (bp)
+ SET_BTREE_PTR_RANGE_UPDATED(&bp->v, true);
+
+ ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level, new);
+ if (ret) {
+ kfree(new);
+ return ret;
+ }
+
+ n = bch2_btree_node_get_noiter(c, cur.k, b->c.btree_id,
+ b->c.level - 1, true);
+ if (n) {
+ mutex_lock(&c->btree_cache.lock);
+ bch2_btree_node_hash_remove(&c->btree_cache, n);
+
+ bkey_copy(&n->key, new);
+ if (update_min)
+ n->data->min_key = expected_start;
+ if (update_max)
+ n->data->max_key = node_end;
+
+ ret = __bch2_btree_node_hash_insert(&c->btree_cache, n);
+ BUG_ON(ret);
+ mutex_unlock(&c->btree_cache.lock);
+ six_unlock_read(&n->c.lock);
+ }
+ }
+fsck_err:
+ return ret;
+}
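What the check above enforces: within a parent node, the children's key ranges must tile the parent's [min_key, max_key] exactly, and any fix is staged as a journal overlay key rather than written to the btree directly. An illustration of the invariant (the child names are mine):

/*
 * parent covers [min_key, max_key], children A, B, C in key order:
 *
 *   A.min_key == parent->data->min_key
 *   B.min_key == bpos_successor(A.max_key)
 *   C.min_key == bpos_successor(B.max_key)
 *   C.max_key == parent->data->max_key
 *
 * (for btree node pointers, max_key is the key's k.p; min_key lives in the
 * btree_ptr_v2 value)
 *
 * A violation gets a corrected key inserted via bch2_journal_key_insert(),
 * so subsequent passes that read through the journal-keys overlay see the
 * fixed topology.
 */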
+
+static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
+ unsigned level, bool is_root,
+ struct bkey_s_c *k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p = { 0 };
+ bool do_update = false;
+ int ret = 0;
+
+ bkey_for_each_ptr_decode(k->k, ptrs, p, entry) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
+ struct bucket *g = PTR_BUCKET(ca, &p.ptr, true);
+ struct bucket *g2 = PTR_BUCKET(ca, &p.ptr, false);
+
+ if (fsck_err_on(!g->gen_valid, c,
+ "bucket %u:%zu data type %s ptr gen %u missing in alloc btree",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ p.ptr.gen)) {
+ if (p.ptr.cached) {
+ g2->_mark.gen = g->_mark.gen = p.ptr.gen;
+ g2->gen_valid = g->gen_valid = true;
+ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
+ } else {
+ do_update = true;
+ }
+ }
+
+ if (fsck_err_on(gen_cmp(p.ptr.gen, g->mark.gen) > 0, c,
+ "bucket %u:%zu data type %s ptr gen in the future: %u > %u",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ p.ptr.gen, g->mark.gen)) {
+ if (p.ptr.cached) {
+ g2->_mark.gen = g->_mark.gen = p.ptr.gen;
+ g2->gen_valid = g->gen_valid = true;
+ g2->_mark.data_type = 0;
+ g2->_mark.dirty_sectors = 0;
+ g2->_mark.cached_sectors = 0;
+ set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
+ } else {
+ do_update = true;
+ }
+ }
+
+ if (fsck_err_on(!p.ptr.cached &&
+ gen_cmp(p.ptr.gen, g->mark.gen) < 0, c,
+ "bucket %u:%zu data type %s stale dirty ptr: %u < %u",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ p.ptr.gen, g->mark.gen))
+ do_update = true;
+
+ if (p.has_ec) {
+ struct stripe *m = genradix_ptr(&c->stripes[true], p.ec.idx);
+
+ if (fsck_err_on(!m || !m->alive, c,
+ "pointer to nonexistent stripe %llu",
+ (u64) p.ec.idx))
+ do_update = true;
+
+ if (fsck_err_on(!bch2_ptr_matches_stripe_m(m, p), c,
+ "pointer does not match stripe %llu",
+ (u64) p.ec.idx))
+ do_update = true;
+ }
+ }
+
+ if (do_update) {
+ struct bkey_ptrs ptrs;
+ union bch_extent_entry *entry;
+ struct bch_extent_ptr *ptr;
+ struct bkey_i *new;
+
+ if (is_root) {
+ bch_err(c, "cannot update btree roots yet");
+ return -EINVAL;
+ }
+
+ new = kmalloc(bkey_bytes(k->k), GFP_KERNEL);
+ if (!new) {
+ bch_err(c, "%s: error allocating new key", __func__);
+ return -ENOMEM;
+ }
+
+ bkey_reassemble(new, *k);
+
+ bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bucket *g = PTR_BUCKET(ca, ptr, true);
+
+ (ptr->cached &&
+ (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) ||
+ (!ptr->cached &&
+ gen_cmp(ptr->gen, g->mark.gen) < 0);
+ }));
+again:
+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ bkey_extent_entry_for_each(ptrs, entry) {
+ if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) {
+ struct stripe *m = genradix_ptr(&c->stripes[true],
+ entry->stripe_ptr.idx);
+ union bch_extent_entry *next_ptr;
+
+ bkey_extent_entry_for_each_from(ptrs, next_ptr, entry)
+ if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr)
+ goto found;
+ next_ptr = NULL;
+found:
+ if (!next_ptr) {
+ bch_err(c, "aieee, found stripe ptr with no data ptr");
+ continue;
+ }
+
+ if (!m || !m->alive ||
+ !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block],
+ &next_ptr->ptr,
+ m->sectors)) {
+ bch2_bkey_extent_entry_drop(new, entry);
+ goto again;
+ }
+ }
+ }
+
+ ret = bch2_journal_key_insert(c, btree_id, level, new);
+ if (ret)
+ kfree(new);
+ else
+ *k = bkey_i_to_s_c(new);
}
fsck_err:
return ret;
/* marking of btree keys/nodes: */
-static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
+static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id,
+ unsigned level, bool is_root,
+ struct bkey_s_c k,
u8 *max_stale, bool initial)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
int ret = 0;
if (initial) {
- BUG_ON(journal_seq_verify(c) &&
+ BUG_ON(bch2_journal_seq_verify &&
k.k->version.lo > journal_cur_seq(&c->journal));
- /* XXX change to fsck check */
if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c,
"key version number higher than recorded: %llu > %llu",
k.k->version.lo,
"superblock not marked as containing replicas (type %u)",
k.k->type)) {
ret = bch2_mark_bkey_replicas(c, k);
- if (ret)
- return ret;
- }
-
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g = PTR_BUCKET(ca, ptr, true);
- struct bucket *g2 = PTR_BUCKET(ca, ptr, false);
-
- if (mustfix_fsck_err_on(!g->gen_valid, c,
- "bucket %u:%zu data type %s ptr gen %u missing in alloc btree",
- ptr->dev, PTR_BUCKET_NR(ca, ptr),
- bch2_data_types[ptr_data_type(k.k, ptr)],
- ptr->gen)) {
- g2->_mark.gen = g->_mark.gen = ptr->gen;
- g2->gen_valid = g->gen_valid = true;
- }
-
- if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c,
- "bucket %u:%zu data type %s ptr gen in the future: %u > %u",
- ptr->dev, PTR_BUCKET_NR(ca, ptr),
- bch2_data_types[ptr_data_type(k.k, ptr)],
- ptr->gen, g->mark.gen)) {
- g2->_mark.gen = g->_mark.gen = ptr->gen;
- g2->gen_valid = g->gen_valid = true;
- g2->_mark.data_type = 0;
- g2->_mark.dirty_sectors = 0;
- g2->_mark.cached_sectors = 0;
- set_bit(BCH_FS_FIXED_GENS, &c->flags);
+ if (ret) {
+ bch_err(c, "error marking bkey replicas: %i", ret);
+ goto err;
}
}
+
+ ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, &k);
}
bkey_for_each_ptr(ptrs, ptr) {
bch2_mark_key(c, k, 0, k.k->size, NULL, 0, flags);
fsck_err:
+err:
+ if (ret)
+ bch_err(c, "%s: ret %i", __func__, ret);
return ret;
}
static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
bool initial)
{
- struct bpos next_node_start = b->data->min_key;
struct btree_node_iter iter;
struct bkey unpacked;
struct bkey_s_c k;
+ struct bkey_buf prev, cur;
int ret = 0;
*max_stale = 0;
return 0;
bch2_btree_node_iter_init_from_start(&iter, b);
+ bch2_bkey_buf_init(&prev);
+ bch2_bkey_buf_init(&cur);
+ bkey_init(&prev.k->k);
while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) {
- bch2_bkey_debugcheck(c, b, k);
-
- ret = bch2_gc_mark_key(c, k, max_stale, initial);
+ ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false,
+ k, max_stale, initial);
if (ret)
break;
bch2_btree_node_iter_advance(&iter, b);
if (b->c.level) {
- ret = bch2_gc_check_topology(c, k,
- &next_node_start,
- b->data->max_key,
+ bch2_bkey_buf_reassemble(&cur, c, k);
+
+ ret = bch2_gc_check_topology(c, b, &prev, cur,
bch2_btree_node_iter_end(&iter));
if (ret)
break;
}
}
+ bch2_bkey_buf_exit(&cur, c);
+ bch2_bkey_buf_exit(&prev, c);
return ret;
}
static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
- bool initial, bool metadata_only)
+ bool initial)
{
struct btree_trans trans;
struct btree_iter *iter;
struct btree *b;
- unsigned depth = metadata_only ? 1
- : expensive_debug_checks(c) ? 0
+ unsigned depth = bch2_expensive_debug_checks ? 0
: !btree_node_type_needs_gc(btree_id) ? 1
: 0;
u8 max_stale = 0;
if (max_stale > 64)
bch2_btree_node_rewrite(c, iter,
b->data->keys.seq,
- BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_NOWAIT|
BTREE_INSERT_GC_LOCK_HELD);
- else if (!btree_gc_rewrite_disabled(c) &&
- (btree_gc_always_rewrite(c) || max_stale > 16))
+ else if (!bch2_btree_gc_rewrite_disabled &&
+ (bch2_btree_gc_always_rewrite || max_stale > 16))
bch2_btree_node_rewrite(c, iter,
b->data->keys.seq,
BTREE_INSERT_NOWAIT|
bch2_trans_cond_resched(&trans);
}
+ bch2_trans_iter_put(&trans, iter);
+
ret = bch2_trans_exit(&trans) ?: ret;
if (ret)
return ret;
mutex_lock(&c->btree_root_lock);
b = c->btree_roots[btree_id].b;
if (!btree_node_fake(b))
- ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key),
+ ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true,
+ bkey_i_to_s_c(&b->key),
&max_stale, initial);
gc_pos_set(c, gc_pos_btree_root(b->c.btree_id));
mutex_unlock(&c->btree_root_lock);
}
static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
- struct journal_keys *journal_keys,
unsigned target_depth)
{
struct btree_and_journal_iter iter;
struct bkey_s_c k;
- struct bpos next_node_start = b->data->min_key;
+ struct bkey_buf cur, prev;
u8 max_stale = 0;
int ret = 0;
- bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b);
+ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+ bch2_bkey_buf_init(&prev);
+ bch2_bkey_buf_init(&cur);
+ bkey_init(&prev.k->k);
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
- bch2_bkey_debugcheck(c, b, k);
-
- BUG_ON(bkey_cmp(k.k->p, b->data->min_key) < 0);
- BUG_ON(bkey_cmp(k.k->p, b->data->max_key) > 0);
+ BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0);
+ BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0);
- ret = bch2_gc_mark_key(c, k, &max_stale, true);
- if (ret)
+ ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false,
+ k, &max_stale, true);
+ if (ret) {
+ bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret);
break;
+ }
if (b->c.level) {
- struct btree *child;
- BKEY_PADDED(k) tmp;
-
- bkey_reassemble(&tmp.k, k);
- k = bkey_i_to_s_c(&tmp.k);
+ bch2_bkey_buf_reassemble(&cur, c, k);
+ k = bkey_i_to_s_c(cur.k);
bch2_btree_and_journal_iter_advance(&iter);
- ret = bch2_gc_check_topology(c, k,
- &next_node_start,
- b->data->max_key,
+ ret = bch2_gc_check_topology(c, b,
+ &prev, cur,
!bch2_btree_and_journal_iter_peek(&iter).k);
if (ret)
break;
+ } else {
+ bch2_btree_and_journal_iter_advance(&iter);
+ }
+ }
- if (b->c.level > target_depth) {
- child = bch2_btree_node_get_noiter(c, &tmp.k,
- b->c.btree_id, b->c.level - 1);
- ret = PTR_ERR_OR_ZERO(child);
- if (ret)
- break;
+ if (b->c.level > target_depth) {
+ bch2_btree_and_journal_iter_exit(&iter);
+ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
- ret = bch2_gc_btree_init_recurse(c, child,
- journal_keys, target_depth);
- six_unlock_read(&child->c.lock);
+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+ struct btree *child;
+ bch2_bkey_buf_reassemble(&cur, c, k);
+ bch2_btree_and_journal_iter_advance(&iter);
+
+ child = bch2_btree_node_get_noiter(c, cur.k,
+ b->c.btree_id, b->c.level - 1,
+ false);
+ ret = PTR_ERR_OR_ZERO(child);
+
+ if (fsck_err_on(ret == -EIO, c,
+ "unreadable btree node")) {
+ ret = bch2_journal_key_delete(c, b->c.btree_id,
+ b->c.level, cur.k->k.p);
if (ret)
- break;
+ return ret;
+
+ set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+ continue;
}
- } else {
- bch2_btree_and_journal_iter_advance(&iter);
+
+ if (ret) {
+ bch_err(c, "%s: error %i getting btree node",
+ __func__, ret);
+ break;
+ }
+
+ ret = bch2_gc_btree_init_recurse(c, child,
+ target_depth);
+ six_unlock_read(&child->c.lock);
+
+ if (ret)
+ break;
}
}
-
+fsck_err:
+ bch2_bkey_buf_exit(&cur, c);
+ bch2_bkey_buf_exit(&prev, c);
+ bch2_btree_and_journal_iter_exit(&iter);
return ret;
}
static int bch2_gc_btree_init(struct bch_fs *c,
- struct journal_keys *journal_keys,
- enum btree_id btree_id,
- bool metadata_only)
+ enum btree_id btree_id)
{
struct btree *b;
- unsigned target_depth = metadata_only ? 1
- : expensive_debug_checks(c) ? 0
- : !btree_node_type_needs_gc(btree_id) ? 1
+ unsigned target_depth = bch2_expensive_debug_checks ? 0
+ : !btree_node_type_needs_gc(btree_id) ? 1
: 0;
u8 max_stale = 0;
+ char buf[100];
int ret = 0;
b = c->btree_roots[btree_id].b;
return 0;
six_lock_read(&b->c.lock, NULL, NULL);
- if (fsck_err_on(bkey_cmp(b->data->min_key, POS_MIN), c,
- "btree root with incorrect min_key: %llu:%llu",
- b->data->min_key.inode,
- b->data->min_key.offset)) {
+ if (fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c,
+ "btree root with incorrect min_key: %s",
+ (bch2_bpos_to_text(&PBUF(buf), b->data->min_key), buf))) {
BUG();
}
- if (fsck_err_on(bkey_cmp(b->data->max_key, POS_MAX), c,
- "btree root with incorrect min_key: %llu:%llu",
- b->data->max_key.inode,
- b->data->max_key.offset)) {
+ if (fsck_err_on(bpos_cmp(b->data->max_key, POS_MAX), c,
+ "btree root with incorrect max_key: %s",
+ (bch2_bpos_to_text(&PBUF(buf), b->data->max_key), buf))) {
BUG();
}
if (b->c.level >= target_depth)
- ret = bch2_gc_btree_init_recurse(c, b,
- journal_keys, target_depth);
+ ret = bch2_gc_btree_init_recurse(c, b, target_depth);
if (!ret)
- ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key),
+ ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true,
+ bkey_i_to_s_c(&b->key),
&max_stale, true);
fsck_err:
six_unlock_read(&b->c.lock);
+ if (ret)
+ bch_err(c, "%s: ret %i", __func__, ret);
return ret;
}
(int) btree_id_to_gc_phase(r);
}
-static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
- bool initial, bool metadata_only)
+static int bch2_gc_btrees(struct bch_fs *c, bool initial)
{
enum btree_id ids[BTREE_ID_NR];
unsigned i;
for (i = 0; i < BTREE_ID_NR; i++) {
enum btree_id id = ids[i];
int ret = initial
- ? bch2_gc_btree_init(c, journal_keys,
- id, metadata_only)
- : bch2_gc_btree(c, id, initial, metadata_only);
- if (ret)
+ ? bch2_gc_btree_init(c, id)
+ : bch2_gc_btree(c, id, initial);
+ if (ret) {
+ bch_err(c, "%s: ret %i", __func__, ret);
return ret;
+ }
}
return 0;
ca->mi.nbuckets * sizeof(struct bucket));
ca->buckets[1] = NULL;
- free_percpu(ca->usage[1]);
- ca->usage[1] = NULL;
+ free_percpu(ca->usage_gc);
+ ca->usage_gc = NULL;
}
free_percpu(c->usage_gc);
}
static int bch2_gc_done(struct bch_fs *c,
- bool initial, bool metadata_only)
+ bool initial)
{
struct bch_dev *ca;
- bool verify = !metadata_only &&
- (!initial ||
- (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)));
- unsigned i;
+ bool verify = (!initial ||
+ (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)));
+ unsigned i, dev;
int ret = 0;
#define copy_field(_f, _msg, ...) \
fsck_err(c, _msg ": got %llu, should be %llu" \
, ##__VA_ARGS__, dst->_f, src->_f); \
dst->_f = src->_f; \
- ret = 1; \
+ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \
}
#define copy_stripe_field(_f, _msg, ...) \
if (dst->_f != src->_f) { \
if (verify) \
fsck_err(c, "stripe %zu has wrong "_msg \
": got %u, should be %u", \
- dst_iter.pos, ##__VA_ARGS__, \
+ iter.pos, ##__VA_ARGS__, \
dst->_f, src->_f); \
dst->_f = src->_f; \
- dst->dirty = true; \
- ret = 1; \
+ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \
}
#define copy_bucket_field(_f) \
if (dst->b[b].mark._f != src->b[b].mark._f) { \
bch2_data_types[dst->b[b].mark.data_type],\
dst->b[b].mark._f, src->b[b].mark._f); \
dst->b[b]._mark._f = src->b[b].mark._f; \
- ret = 1; \
+ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \
}
#define copy_dev_field(_f, _msg, ...) \
copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__)
#define copy_fs_field(_f, _msg, ...) \
copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__)
- if (!metadata_only) {
- struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0);
- struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0);
+ {
+ struct genradix_iter iter = genradix_iter_init(&c->stripes[1], 0);
struct stripe *dst, *src;
- unsigned i;
-
- c->ec_stripes_heap.used = 0;
-
- while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) &&
- (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) {
- BUG_ON(src_iter.pos != dst_iter.pos);
- copy_stripe_field(alive, "alive");
- copy_stripe_field(sectors, "sectors");
- copy_stripe_field(algorithm, "algorithm");
- copy_stripe_field(nr_blocks, "nr_blocks");
- copy_stripe_field(nr_redundant, "nr_redundant");
- copy_stripe_field(blocks_nonempty,
- "blocks_nonempty");
+ while ((src = genradix_iter_peek(&iter, &c->stripes[1]))) {
+ dst = genradix_ptr_alloc(&c->stripes[0], iter.pos, GFP_KERNEL);
+
+ if (dst->alive != src->alive ||
+ dst->sectors != src->sectors ||
+ dst->algorithm != src->algorithm ||
+ dst->nr_blocks != src->nr_blocks ||
+ dst->nr_redundant != src->nr_redundant) {
+ bch_err(c, "unexpected stripe inconsistency at bch2_gc_done, confused");
+ ret = -EINVAL;
+ goto fsck_err;
+ }
for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++)
copy_stripe_field(block_sectors[i],
"block_sectors[%u]", i);
- if (dst->alive) {
- spin_lock(&c->ec_stripes_heap_lock);
- bch2_stripes_heap_insert(c, dst, dst_iter.pos);
- spin_unlock(&c->ec_stripes_heap_lock);
- }
+ dst->blocks_nonempty = 0;
+ for (i = 0; i < dst->nr_blocks; i++)
+ dst->blocks_nonempty += dst->block_sectors[i] != 0;
- genradix_iter_advance(&dst_iter, &c->stripes[0]);
- genradix_iter_advance(&src_iter, &c->stripes[1]);
+ genradix_iter_advance(&iter, &c->stripes[1]);
}
}
- for_each_member_device(ca, c, i) {
+ for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+ bch2_fs_usage_acc_to_base(c, i);
+
+ for_each_member_device(ca, c, dev) {
struct bucket_array *dst = __bucket_array(ca, 0);
struct bucket_array *src = __bucket_array(ca, 1);
size_t b;
dst->b[b].oldest_gen = src->b[b].oldest_gen;
}
- };
- bch2_fs_usage_acc_to_base(c, 0);
- bch2_fs_usage_acc_to_base(c, 1);
+ {
+ struct bch_dev_usage *dst = ca->usage_base;
+ struct bch_dev_usage *src = (void *)
+ bch2_acc_percpu_u64s((void *) ca->usage_gc,
+ dev_usage_u64s());
- bch2_dev_usage_from_buckets(c);
+ copy_dev_field(buckets_ec, "buckets_ec");
+ copy_dev_field(buckets_unavailable, "buckets_unavailable");
+
+ for (i = 0; i < BCH_DATA_NR; i++) {
+ copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]);
+ copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]);
+ copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]);
+ }
+ }
+ };
{
unsigned nr = fs_usage_u64s(c);
copy_fs_field(hidden, "hidden");
copy_fs_field(btree, "btree");
+ copy_fs_field(data, "data");
+ copy_fs_field(cached, "cached");
+ copy_fs_field(reserved, "reserved");
+ copy_fs_field(nr_inodes,"nr_inodes");
- if (!metadata_only) {
- copy_fs_field(data, "data");
- copy_fs_field(cached, "cached");
- copy_fs_field(reserved, "reserved");
- copy_fs_field(nr_inodes,"nr_inodes");
-
- for (i = 0; i < BCH_REPLICAS_MAX; i++)
- copy_fs_field(persistent_reserved[i],
- "persistent_reserved[%i]", i);
- }
+ for (i = 0; i < BCH_REPLICAS_MAX; i++)
+ copy_fs_field(persistent_reserved[i],
+ "persistent_reserved[%i]", i);
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
cpu_replicas_entry(&c->replicas, i);
char buf[80];
- if (metadata_only &&
- (e->data_type == BCH_DATA_user ||
- e->data_type == BCH_DATA_cached))
- continue;
-
bch2_replicas_entry_to_text(&PBUF(buf), e);
copy_fs_field(replicas[i], "%s", buf);
#undef copy_stripe_field
#undef copy_field
fsck_err:
+ if (ret)
+ bch_err(c, "%s: ret %i", __func__, ret);
return ret;
}
-static int bch2_gc_start(struct bch_fs *c,
- bool metadata_only)
+static int bch2_gc_start(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i;
for_each_member_device(ca, c, i) {
BUG_ON(ca->buckets[1]);
- BUG_ON(ca->usage[1]);
+ BUG_ON(ca->usage_gc);
ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket),
return -ENOMEM;
}
- ca->usage[1] = alloc_percpu(struct bch_dev_usage);
- if (!ca->usage[1]) {
- bch_err(c, "error allocating ca->usage[gc]");
+ ca->usage_gc = alloc_percpu(struct bch_dev_usage);
+ if (!ca->usage_gc) {
+ bch_err(c, "error allocating ca->usage_gc");
percpu_ref_put(&ca->ref);
return -ENOMEM;
}
d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen;
d->gen_valid = s->gen_valid;
-
- if (metadata_only &&
- (s->mark.data_type == BCH_DATA_user ||
- s->mark.data_type == BCH_DATA_cached)) {
- d->_mark = s->mark;
- d->_mark.owned_by_allocator = 0;
- }
}
};
* move around - if references move backwards in the ordering GC
* uses, GC could skip past them
*/
-int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys,
- bool initial, bool metadata_only)
+int bch2_gc(struct bch_fs *c, bool initial)
{
struct bch_dev *ca;
u64 start_time = local_clock();
closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c));
again:
- ret = bch2_gc_start(c, metadata_only);
+ ret = bch2_gc_start(c);
if (ret)
goto out;
bch2_mark_superblocks(c);
- ret = bch2_gc_btrees(c, journal_keys, initial, metadata_only);
+ ret = bch2_gc_btrees(c, initial);
if (ret)
goto out;
bch2_mark_allocator_buckets(c);
c->gc_count++;
-out:
- if (!ret &&
- (test_bit(BCH_FS_FIXED_GENS, &c->flags) ||
- (!iter && test_restart_gc(c)))) {
+
+ if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) ||
+ (!iter && bch2_test_restart_gc)) {
/*
* XXX: make sure gens we fixed got saved
*/
if (iter++ <= 2) {
- bch_info(c, "Fixed gens, restarting mark and sweep:");
- clear_bit(BCH_FS_FIXED_GENS, &c->flags);
+ bch_info(c, "Second GC pass needed, restarting:");
+ clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
percpu_down_write(&c->mark_lock);
bch_info(c, "Unable to fix bucket gens, looping");
ret = -EINVAL;
}
-
+out:
if (!ret) {
bch2_journal_block(&c->journal);
percpu_down_write(&c->mark_lock);
- ret = bch2_gc_done(c, initial, metadata_only);
+ ret = bch2_gc_done(c, initial);
bch2_journal_unblock(&c->journal);
} else {
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
- struct bkey_on_stack sk;
+ struct bkey_buf sk;
int ret = 0;
- bkey_on_stack_init(&sk);
+ bch2_bkey_buf_init(&sk);
bch2_trans_init(&trans, c, 0, 0);
iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN,
- BTREE_ITER_PREFETCH);
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_ALL_SNAPSHOTS);
while ((k = bch2_btree_iter_peek(iter)).k &&
!(ret = bkey_err(k))) {
if (gc_btree_gens_key(c, k)) {
- bkey_on_stack_reassemble(&sk, c, k);
+ bch2_bkey_buf_reassemble(&sk, c, k);
bch2_extent_normalize(c, bkey_i_to_s(sk.k));
bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k));
}
}
- bch2_btree_iter_next(iter);
+ bch2_btree_iter_advance(iter);
}
+ bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
- bkey_on_stack_exit(&sk, c);
+ bch2_bkey_buf_exit(&sk, c);
return ret;
}
/* Find a format that all keys in @old_nodes can pack into */
bch2_bkey_format_init(&format_state);
+ /*
+ * XXX: this won't correctly take into account the new min/max keys:
+ */
for (i = 0; i < nr_old_nodes; i++)
__bch2_btree_calc_format(&format_state, old_nodes[i]);
}
if (bch2_keylist_realloc(&keylist, NULL, 0,
- (BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) {
+ BKEY_BTREE_PTR_U64s_MAX * nr_old_nodes)) {
trace_btree_gc_coalesce_fail(c,
BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC);
return;
}
- as = bch2_btree_update_start(iter->trans, iter->btree_id,
+ as = bch2_btree_update_start(iter, old_nodes[0]->c.level,
btree_update_reserve_required(c, parent) + nr_old_nodes,
BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE,
- NULL);
+ BTREE_INSERT_USE_RESERVE);
if (IS_ERR(as)) {
trace_btree_gc_coalesce_fail(c,
BTREE_GC_COALESCE_FAIL_RESERVE_GET);
k < vstruct_last(s2) &&
vstruct_blocks_plus(n1->data, c->block_bits,
u64s + k->u64s) <= blocks;
- k = bkey_next_skip_noops(k, vstruct_last(s2))) {
+ k = bkey_next(k)) {
last = k;
u64s += k->u64s;
}
n1->key.k.p = n1->data->max_key =
bkey_unpack_pos(n1, last);
- n2->data->min_key = bkey_successor(n1->data->max_key);
+ n2->data->min_key = bpos_successor(n1->data->max_key);
memcpy_u64s(vstruct_last(s1),
s2->start, u64s);
unsigned j;
for (j = 0; j < nr_new_nodes; j++)
- if (!bkey_cmp(old_nodes[i]->key.k.p,
+ if (!bpos_cmp(old_nodes[i]->key.k.p,
new_nodes[j]->key.k.p))
goto next;
struct btree *b;
bool kthread = (current->flags & PF_KTHREAD) != 0;
unsigned i;
+ int ret = 0;
/* Sliding window of adjacent btree nodes */
struct btree *merge[GC_MERGE_NODES];
lock_seq[0] = merge[0]->c.lock.state.seq;
if (kthread && kthread_should_stop()) {
- bch2_trans_exit(&trans);
- return -ESHUTDOWN;
+ ret = -ESHUTDOWN;
+ break;
}
bch2_trans_cond_resched(&trans);
memset(merge + 1, 0,
(GC_MERGE_NODES - 1) * sizeof(merge[0]));
}
- return bch2_trans_exit(&trans);
+ bch2_trans_iter_put(&trans, iter);
+
+ return bch2_trans_exit(&trans) ?: ret;
}
/**
{
struct bch_fs *c = arg;
struct io_clock *clock = &c->io_clock[WRITE];
- unsigned long last = atomic_long_read(&clock->now);
+ unsigned long last = atomic64_read(&clock->now);
unsigned last_kick = atomic_read(&c->kick_gc);
int ret;
if (c->btree_gc_periodic) {
unsigned long next = last + c->capacity / 16;
- if (atomic_long_read(&clock->now) >= next)
+ if (atomic64_read(&clock->now) >= next)
break;
bch2_io_clock_schedule_timeout(clock, next);
}
__set_current_state(TASK_RUNNING);
- last = atomic_long_read(&clock->now);
+ last = atomic64_read(&clock->now);
last_kick = atomic_read(&c->kick_gc);
/*
* Full gc is currently incompatible with btree key cache:
*/
#if 0
- ret = bch2_gc(c, NULL, false, false);
+ ret = bch2_gc(c, false, false);
#else
ret = bch2_gc_gens(c);
#endif
{
struct task_struct *p;
- BUG_ON(c->gc_thread);
+ if (c->gc_thread)
+ return 0;
- p = kthread_create(bch2_gc_thread, c, "bch_gc");
- if (IS_ERR(p))
+ p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name);
+ if (IS_ERR(p)) {
+ bch_err(c, "error creating gc thread: %li", PTR_ERR(p));
return PTR_ERR(p);
+ }
get_task_struct(p);
c->gc_thread = p;
void bch2_coalesce(struct bch_fs *);
-struct journal_keys;
-int bch2_gc(struct bch_fs *, struct journal_keys *, bool, bool);
+int bch2_gc(struct bch_fs *, bool);
int bch2_gc_gens(struct bch_fs *);
void bch2_gc_thread_stop(struct bch_fs *);
int bch2_gc_thread_start(struct bch_fs *);
static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
{
- if (l.phase != r.phase)
- return l.phase < r.phase ? -1 : 1;
- if (bkey_cmp(l.pos, r.pos))
- return bkey_cmp(l.pos, r.pos);
- if (l.level != r.level)
- return l.level < r.level ? -1 : 1;
- return 0;
+ return cmp_int(l.phase, r.phase) ?:
+ bpos_cmp(l.pos, r.pos) ?:
+ cmp_int(l.level, r.level);
}
static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id)
{
switch (id) {
-#define x(n, v, s) case BTREE_ID_##n: return GC_PHASE_BTREE_##n;
+#define x(name, v) case BTREE_ID_##name: return GC_PHASE_BTREE_##name;
BCH_BTREE_IDS()
#undef x
default:
static void verify_no_dups(struct btree *b,
struct bkey_packed *start,
- struct bkey_packed *end,
- bool extents)
+ struct bkey_packed *end)
{
#ifdef CONFIG_BCACHEFS_DEBUG
struct bkey_packed *k, *p;
if (start == end)
return;
- for (p = start, k = bkey_next_skip_noops(start, end);
+ for (p = start, k = bkey_next(start);
k != end;
- p = k, k = bkey_next_skip_noops(k, end)) {
+ p = k, k = bkey_next(k)) {
struct bkey l = bkey_unpack_key(b, p);
struct bkey r = bkey_unpack_key(b, k);
- BUG_ON(extents
- ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0
- : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0);
- //BUG_ON(bkey_cmp_packed(&b->format, p, k) >= 0);
+ BUG_ON(bpos_cmp(l.p, bkey_start_pos(&r)) >= 0);
}
#endif
}
{
struct bkey_packed *k;
- for (k = i->start;
- k != vstruct_last(i);
- k = bkey_next_skip_noops(k, vstruct_last(i)))
+ for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
k->needs_whiteout = v;
}
break;
for (b = a; c = 2 * b + 1, (d = c + 1) < n;)
- b = bkey_cmp_packed(bt,
+ b = bch2_bkey_cmp_packed(bt,
ptrs[c],
ptrs[d]) >= 0 ? c : d;
if (d == n)
b = c;
while (b != a &&
- bkey_cmp_packed(bt,
+ bch2_bkey_cmp_packed(bt,
ptrs[a],
ptrs[b]) >= 0)
b = (b - 1) / 2;
}
verify_no_dups(b, new_whiteouts,
- (void *) ((u64 *) new_whiteouts + b->whiteout_u64s),
- btree_node_old_extent_overwrite(b));
+ (void *) ((u64 *) new_whiteouts + b->whiteout_u64s));
memcpy_u64s(unwritten_whiteouts_start(c, b),
new_whiteouts, b->whiteout_u64s);
}
}
-static bool bch2_compact_extent_whiteouts(struct bch_fs *c,
- struct btree *b,
- enum compact_mode mode)
-{
- const struct bkey_format *f = &b->format;
- struct bset_tree *t;
- struct bkey_packed *whiteouts = NULL;
- struct bkey_packed *u_start, *u_pos;
- struct sort_iter sort_iter;
- unsigned bytes, whiteout_u64s = 0, u64s;
- bool used_mempool, compacting = false;
-
- BUG_ON(!btree_node_is_extents(b));
-
- for_each_bset(b, t)
- if (should_compact_bset(b, t, whiteout_u64s != 0, mode))
- whiteout_u64s += bset_dead_u64s(b, t);
-
- if (!whiteout_u64s)
- return false;
-
- bch2_sort_whiteouts(c, b);
-
- sort_iter_init(&sort_iter, b);
-
- whiteout_u64s += b->whiteout_u64s;
- bytes = whiteout_u64s * sizeof(u64);
-
- whiteouts = btree_bounce_alloc(c, bytes, &used_mempool);
- u_start = u_pos = whiteouts;
-
- memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b),
- b->whiteout_u64s);
- u_pos = (void *) u_pos + b->whiteout_u64s * sizeof(u64);
-
- sort_iter_add(&sort_iter, u_start, u_pos);
-
- for_each_bset(b, t) {
- struct bset *i = bset(b, t);
- struct bkey_packed *k, *n, *out, *start, *end;
- struct btree_node_entry *src = NULL, *dst = NULL;
-
- if (t != b->set && !bset_written(b, i)) {
- src = container_of(i, struct btree_node_entry, keys);
- dst = max(write_block(b),
- (void *) btree_bkey_last(b, t - 1));
- }
-
- if (src != dst)
- compacting = true;
-
- if (!should_compact_bset(b, t, compacting, mode)) {
- if (src != dst) {
- memmove(dst, src, sizeof(*src) +
- le16_to_cpu(src->keys.u64s) *
- sizeof(u64));
- i = &dst->keys;
- set_btree_bset(b, t, i);
- }
- continue;
- }
-
- compacting = true;
- u_start = u_pos;
- start = i->start;
- end = vstruct_last(i);
-
- if (src != dst) {
- memmove(dst, src, sizeof(*src));
- i = &dst->keys;
- set_btree_bset(b, t, i);
- }
-
- out = i->start;
-
- for (k = start; k != end; k = n) {
- n = bkey_next_skip_noops(k, end);
-
- if (bkey_deleted(k))
- continue;
-
- BUG_ON(bkey_whiteout(k) &&
- k->needs_whiteout &&
- bkey_written(b, k));
-
- if (bkey_whiteout(k) && !k->needs_whiteout)
- continue;
-
- if (bkey_whiteout(k)) {
- memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k));
- set_bkeyp_val_u64s(f, u_pos, 0);
- u_pos = bkey_next(u_pos);
- } else {
- bkey_copy(out, k);
- out = bkey_next(out);
- }
- }
-
- sort_iter_add(&sort_iter, u_start, u_pos);
-
- i->u64s = cpu_to_le16((u64 *) out - i->_data);
- set_btree_bset_end(b, t);
- bch2_bset_set_no_aux_tree(b, t);
- }
-
- b->whiteout_u64s = (u64 *) u_pos - (u64 *) whiteouts;
-
- BUG_ON((void *) unwritten_whiteouts_start(c, b) <
- (void *) btree_bkey_last(b, bset_tree_last(b)));
-
- u64s = bch2_sort_extent_whiteouts(unwritten_whiteouts_start(c, b),
- &sort_iter);
-
- BUG_ON(u64s > b->whiteout_u64s);
- BUG_ON(u_pos != whiteouts && !u64s);
-
- if (u64s != b->whiteout_u64s) {
- void *src = unwritten_whiteouts_start(c, b);
-
- b->whiteout_u64s = u64s;
- memmove_u64s_up(unwritten_whiteouts_start(c, b), src, u64s);
- }
-
- verify_no_dups(b,
- unwritten_whiteouts_start(c, b),
- unwritten_whiteouts_end(c, b),
- true);
-
- btree_bounce_free(c, bytes, used_mempool, whiteouts);
-
- bch2_btree_build_aux_trees(b);
-
- bch_btree_keys_u64s_remaining(c, b);
- bch2_verify_btree_nr_keys(b);
-
- return true;
-}
-
static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode)
{
struct bset_tree *t;
out = i->start;
for (k = start; k != end; k = n) {
- n = bkey_next_skip_noops(k, end);
+ n = bkey_next(k);
- if (!bkey_whiteout(k)) {
+ if (!bkey_deleted(k)) {
bkey_copy(out, k);
out = bkey_next(out);
} else {
bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
enum compact_mode mode)
{
- return !btree_node_old_extent_overwrite(b)
- ? bch2_drop_whiteouts(b, mode)
- : bch2_compact_extent_whiteouts(c, b, mode);
+ return bch2_drop_whiteouts(b, mode);
}
static void btree_node_sort(struct bch_fs *c, struct btree *b,
start_time = local_clock();
- if (btree_node_old_extent_overwrite(b))
- filter_whiteouts = bset_written(b, start_bset);
-
- u64s = (btree_node_old_extent_overwrite(b)
- ? bch2_sort_extents
- : bch2_sort_keys)(out->keys.start,
- &sort_iter,
- filter_whiteouts);
+ u64s = bch2_sort_keys(out->keys.start, &sort_iter, filter_whiteouts);
out->keys.u64s = cpu_to_le16(u64s);
bch2_btree_iter_reinit_node(iter, b);
}
-static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
- struct btree *b, struct bset *i,
- unsigned offset, int write)
+static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c,
+ struct btree *b)
{
- pr_buf(out, "error validating btree node %sat btree %u level %u/%u\n"
- "pos ",
- write ? "before write " : "",
- b->c.btree_id, b->c.level,
+ pr_buf(out, "%s level %u/%u\n ",
+ bch2_btree_ids[b->c.btree_id],
+ b->c.level,
c->btree_roots[b->c.btree_id].level);
bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+}
- pr_buf(out, " node offset %u", b->written);
+static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
+ struct bch_dev *ca,
+ struct btree *b, struct bset *i,
+ unsigned offset, int write)
+{
+ pr_buf(out, "error validating btree node ");
+ if (write)
+ pr_buf(out, "before write ");
+ if (ca)
+ pr_buf(out, "on %s ", ca->name);
+ pr_buf(out, "at btree ");
+ btree_pos_to_text(out, c, b);
+
+ pr_buf(out, "\n node offset %u", b->written);
if (i)
pr_buf(out, " bset u64s %u", le16_to_cpu(i->u64s));
}
BTREE_RETRY_READ = 64,
};
-#define btree_err(type, c, b, i, msg, ...) \
+#define btree_err(type, c, ca, b, i, msg, ...) \
({ \
__label__ out; \
char _buf[300]; \
+ char *_buf2 = _buf; \
struct printbuf out = PBUF(_buf); \
\
- btree_err_msg(&out, c, b, i, b->written, write); \
+ _buf2 = kmalloc(4096, GFP_ATOMIC); \
+ if (_buf2) \
+ out = _PBUF(_buf2, 4096); \
+ \
+ btree_err_msg(&out, c, ca, b, i, b->written, write); \
pr_buf(&out, ": " msg, ##__VA_ARGS__); \
\
if (type == BTREE_ERR_FIXABLE && \
write == READ && \
!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \
- mustfix_fsck_err(c, "%s", _buf); \
+ mustfix_fsck_err(c, "%s", _buf2); \
goto out; \
} \
\
switch (write) { \
case READ: \
- bch_err(c, "%s", _buf); \
+ bch_err(c, "%s", _buf2); \
\
switch (type) { \
case BTREE_ERR_FIXABLE: \
} \
break; \
case WRITE: \
- bch_err(c, "corrupt metadata before write: %s", _buf); \
+ bch_err(c, "corrupt metadata before write: %s", _buf2); \
\
if (bch2_fs_inconsistent(c)) { \
ret = BCH_FSCK_ERRORS_NOT_FIXED; \
break; \
} \
out: \
+ if (_buf2 != _buf) \
+ kfree(_buf2); \
true; \
})
#define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false)
-static int validate_bset(struct bch_fs *c, struct btree *b,
- struct bset *i, unsigned sectors,
- int write, bool have_retry)
+static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
+ struct btree *b, struct bset *i,
+ unsigned sectors, int write, bool have_retry)
{
unsigned version = le16_to_cpu(i->version);
const char *err;
+ char buf1[100];
+ char buf2[100];
int ret = 0;
btree_err_on((version != BCH_BSET_VERSION_OLD &&
version < bcachefs_metadata_version_min) ||
version >= bcachefs_metadata_version_max,
- BTREE_ERR_FATAL, c, b, i,
+ BTREE_ERR_FATAL, c, ca, b, i,
"unsupported bset version");
+ if (btree_err_on(version < c->sb.version_min,
+ BTREE_ERR_FIXABLE, c, NULL, b, i,
+ "bset version %u older than superblock version_min %u",
+ version, c->sb.version_min)) {
+ mutex_lock(&c->sb_lock);
+ c->disk_sb.sb->version_min = cpu_to_le16(version);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+ }
+
+ if (btree_err_on(version > c->sb.version,
+ BTREE_ERR_FIXABLE, c, NULL, b, i,
+ "bset version %u newer than superblock version %u",
+ version, c->sb.version)) {
+ mutex_lock(&c->sb_lock);
+ c->disk_sb.sb->version = cpu_to_le16(version);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+ }
+
+ btree_err_on(BSET_SEPARATE_WHITEOUTS(i),
+ BTREE_ERR_FATAL, c, ca, b, i,
+ "BSET_SEPARATE_WHITEOUTS no longer supported");
+
if (btree_err_on(b->written + sectors > c->opts.btree_node_size,
- BTREE_ERR_FIXABLE, c, b, i,
+ BTREE_ERR_FIXABLE, c, ca, b, i,
"bset past end of btree node")) {
i->u64s = 0;
return 0;
}
btree_err_on(b->written && !i->u64s,
- BTREE_ERR_FIXABLE, c, b, i,
+ BTREE_ERR_FIXABLE, c, ca, b, i,
"empty bset");
if (!b->written) {
/* XXX endianness */
btree_err_on(bp->seq != bn->keys.seq,
- BTREE_ERR_MUST_RETRY, c, b, NULL,
+ BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
"incorrect sequence number (wrong btree node)");
}
btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id,
- BTREE_ERR_MUST_RETRY, c, b, i,
+ BTREE_ERR_MUST_RETRY, c, ca, b, i,
"incorrect btree id");
btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level,
- BTREE_ERR_MUST_RETRY, c, b, i,
+ BTREE_ERR_MUST_RETRY, c, ca, b, i,
"incorrect level");
- if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) {
- u64 *p = (u64 *) &bn->ptr;
-
- *p = swab64(*p);
- }
-
if (!write)
compat_btree_node(b->c.level, b->c.btree_id, version,
BSET_BIG_ENDIAN(i), write, bn);
struct bch_btree_ptr_v2 *bp =
&bkey_i_to_btree_ptr_v2(&b->key)->v;
- btree_err_on(bkey_cmp(b->data->min_key, bp->min_key),
- BTREE_ERR_MUST_RETRY, c, b, NULL,
- "incorrect min_key: got %llu:%llu should be %llu:%llu",
- b->data->min_key.inode,
- b->data->min_key.offset,
- bp->min_key.inode,
- bp->min_key.offset);
+ if (BTREE_PTR_RANGE_UPDATED(bp)) {
+ b->data->min_key = bp->min_key;
+ b->data->max_key = b->key.k.p;
+ }
+
+ btree_err_on(bpos_cmp(b->data->min_key, bp->min_key),
+ BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
+ "incorrect min_key: got %s should be %s",
+ (bch2_bpos_to_text(&PBUF(buf1), bn->min_key), buf1),
+ (bch2_bpos_to_text(&PBUF(buf2), bp->min_key), buf2));
}
- btree_err_on(bkey_cmp(bn->max_key, b->key.k.p),
- BTREE_ERR_MUST_RETRY, c, b, i,
- "incorrect max key %llu:%llu",
- bn->max_key.inode,
- bn->max_key.offset);
+ btree_err_on(bpos_cmp(bn->max_key, b->key.k.p),
+ BTREE_ERR_MUST_RETRY, c, ca, b, i,
+ "incorrect max key %s",
+ (bch2_bpos_to_text(&PBUF(buf1), bn->max_key), buf1));
if (write)
compat_btree_node(b->c.level, b->c.btree_id, version,
BSET_BIG_ENDIAN(i), write, bn);
- /* XXX: ideally we would be validating min_key too */
-#if 0
- /*
- * not correct anymore, due to btree node write error
- * handling
- *
- * need to add bn->seq to btree keys and verify
- * against that
- */
- btree_err_on(!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key),
- bn->ptr),
- BTREE_ERR_FATAL, c, b, i,
- "incorrect backpointer");
-#endif
err = bch2_bkey_format_validate(&bn->format);
btree_err_on(err,
- BTREE_ERR_FATAL, c, b, i,
+ BTREE_ERR_FATAL, c, ca, b, i,
"invalid bkey format: %s", err);
compat_bformat(b->c.level, b->c.btree_id, version,
{
unsigned version = le16_to_cpu(i->version);
struct bkey_packed *k, *prev = NULL;
- bool seen_non_whiteout = false;
int ret = 0;
- if (!BSET_SEPARATE_WHITEOUTS(i)) {
- seen_non_whiteout = true;
- *whiteout_u64s = 0;
- }
-
for (k = i->start;
k != vstruct_last(i);) {
struct bkey_s u;
const char *invalid;
if (btree_err_on(bkey_next(k) > vstruct_last(i),
- BTREE_ERR_FIXABLE, c, b, i,
+ BTREE_ERR_FIXABLE, c, NULL, b, i,
"key extends past end of bset")) {
i->u64s = cpu_to_le16((u64 *) k - i->_data);
break;
}
if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
- BTREE_ERR_FIXABLE, c, b, i,
+ BTREE_ERR_FIXABLE, c, NULL, b, i,
"invalid bkey format %u", k->format)) {
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
memmove_u64s_down(k, bkey_next(k),
char buf[160];
bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c);
- btree_err(BTREE_ERR_FIXABLE, c, b, i,
- "invalid bkey:\n%s\n%s", invalid, buf);
+ btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i,
+ "invalid bkey: %s\n%s", invalid, buf);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
memmove_u64s_down(k, bkey_next(k),
BSET_BIG_ENDIAN(i), write,
&b->format, k);
- /*
- * with the separate whiteouts thing (used for extents), the
- * second set of keys actually can have whiteouts too, so we
- * can't solely go off bkey_whiteout()...
- */
-
- if (!seen_non_whiteout &&
- (!bkey_whiteout(k) ||
- (prev && bkey_iter_cmp(b, prev, k) > 0))) {
- *whiteout_u64s = k->_data - i->_data;
- seen_non_whiteout = true;
- } else if (prev && bkey_iter_cmp(b, prev, k) > 0) {
+ if (prev && bkey_iter_cmp(b, prev, k) > 0) {
char buf1[80];
char buf2[80];
struct bkey up = bkey_unpack_key(b, prev);
bch2_bkey_to_text(&PBUF(buf2), u.k);
bch2_dump_bset(c, b, i, 0);
- btree_err(BTREE_ERR_FATAL, c, b, i,
- "keys out of order: %s > %s",
- buf1, buf2);
- /* XXX: repair this */
+
+ if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i,
+ "keys out of order: %s > %s",
+ buf1, buf2)) {
+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
+ memmove_u64s_down(k, bkey_next(k),
+ (u64 *) vstruct_end(i) - (u64 *) k);
+ continue;
+ }
}
prev = k;
- k = bkey_next_skip_noops(k, vstruct_last(i));
+ k = bkey_next(k);
}
fsck_err:
return ret;
}
-int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry)
+int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
+ struct btree *b, bool have_retry)
{
struct btree_node_entry *bne;
struct sort_iter *iter;
unsigned u64s;
int ret, retry_read = 0, write = READ;
+ b->version_ondisk = U16_MAX;
+
iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
sort_iter_init(iter, b);
iter->size = (btree_blocks(c) + 1) * 2;
if (bch2_meta_read_fault("btree"))
- btree_err(BTREE_ERR_MUST_RETRY, c, b, NULL,
+ btree_err(BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
"dynamic fault");
btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
- BTREE_ERR_MUST_RETRY, c, b, NULL,
+ BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
"bad magic");
btree_err_on(!b->data->keys.seq,
- BTREE_ERR_MUST_RETRY, c, b, NULL,
+ BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
"bad btree header");
if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
&bkey_i_to_btree_ptr_v2(&b->key)->v;
btree_err_on(b->data->keys.seq != bp->seq,
- BTREE_ERR_MUST_RETRY, c, b, NULL,
+ BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
"got wrong btree node (seq %llx want %llx)",
b->data->keys.seq, bp->seq);
}
i = &b->data->keys;
btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
- BTREE_ERR_WANT_RETRY, c, b, i,
+ BTREE_ERR_WANT_RETRY, c, ca, b, i,
"unknown checksum type %llu",
BSET_CSUM_TYPE(i));
csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
btree_err_on(bch2_crc_cmp(csum, b->data->csum),
- BTREE_ERR_WANT_RETRY, c, b, i,
+ BTREE_ERR_WANT_RETRY, c, ca, b, i,
"invalid checksum");
bset_encrypt(c, i, b->written << 9);
- if (btree_node_is_extents(b) &&
- !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) {
- set_btree_node_old_extent_overwrite(b);
- set_btree_node_need_rewrite(b);
- }
+ btree_err_on(btree_node_is_extents(b) &&
+ !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
+ BTREE_ERR_FATAL, c, NULL, b, NULL,
+ "btree node does not have NEW_EXTENT_OVERWRITE set");
sectors = vstruct_sectors(b->data, c->block_bits);
} else {
break;
btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
- BTREE_ERR_WANT_RETRY, c, b, i,
+ BTREE_ERR_WANT_RETRY, c, ca, b, i,
"unknown checksum type %llu",
BSET_CSUM_TYPE(i));
csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
btree_err_on(bch2_crc_cmp(csum, bne->csum),
- BTREE_ERR_WANT_RETRY, c, b, i,
+ BTREE_ERR_WANT_RETRY, c, ca, b, i,
"invalid checksum");
bset_encrypt(c, i, b->written << 9);
sectors = vstruct_sectors(bne, c->block_bits);
}
- ret = validate_bset(c, b, i, sectors,
+ b->version_ondisk = min(b->version_ondisk,
+ le16_to_cpu(i->version));
+
+ ret = validate_bset(c, ca, b, i, sectors,
READ, have_retry);
if (ret)
goto fsck_err;
true);
btree_err_on(blacklisted && first,
- BTREE_ERR_FIXABLE, c, b, i,
+ BTREE_ERR_FIXABLE, c, ca, b, i,
"first btree node bset has blacklisted journal seq");
if (blacklisted && !first)
continue;
bset_byte_offset(b, bne) < btree_bytes(c);
bne = (void *) bne + block_bytes(c))
btree_err_on(bne->keys.seq == b->data->keys.seq,
- BTREE_ERR_WANT_RETRY, c, b, NULL,
+ BTREE_ERR_WANT_RETRY, c, ca, b, NULL,
"found bset signature after last bset");
sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool);
set_btree_bset(b, b->set, &b->data->keys);
- b->nr = (btree_node_old_extent_overwrite(b)
- ? bch2_extent_sort_fix_overlapping
- : bch2_key_sort_fix_overlapping)(c, &sorted->keys, iter);
+ b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter);
u64s = le16_to_cpu(sorted->keys.u64s);
*sorted = *b->data;
const char *invalid = bch2_bkey_val_invalid(c, u.s_c);
if (invalid ||
- (inject_invalid_keys(c) &&
+ (bch2_inject_invalid_keys &&
!bversion_cmp(u.k->version, MAX_VERSION))) {
char buf[160];
bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c);
- btree_err(BTREE_ERR_FIXABLE, c, b, i,
+ btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i,
"invalid bkey %s: %s", buf, invalid);
btree_keys_account_key_drop(&b->nr, 0, k);
bp.v->mem_ptr = 0;
}
- k = bkey_next_skip_noops(k, vstruct_last(i));
+ k = bkey_next(k);
}
bch2_bset_build_aux_tree(b, b->set, false);
bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- if (ca->mi.state != BCH_MEMBER_STATE_RW)
+ if (ca->mi.state != BCH_MEMBER_STATE_rw)
set_btree_node_need_rewrite(b);
}
out:
struct btree *b = rb->bio.bi_private;
struct bio *bio = &rb->bio;
struct bch_io_failures failed = { .nr = 0 };
+ char buf[200];
+ struct printbuf out;
bool can_retry;
goto start;
bio->bi_status = BLK_STS_REMOVED;
}
start:
- bch2_dev_io_err_on(bio->bi_status, ca, "btree read: %s",
- bch2_blk_status_to_str(bio->bi_status));
+ out = PBUF(buf);
+ btree_pos_to_text(&out, c, b);
+ bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s",
+ bch2_blk_status_to_str(bio->bi_status), buf);
if (rb->have_ioref)
percpu_ref_put(&ca->io_ref);
rb->have_ioref = false;
&failed, &rb->pick) > 0;
if (!bio->bi_status &&
- !bch2_btree_node_read_done(c, b, can_retry))
+ !bch2_btree_node_read_done(c, ca, b, can_retry))
break;
if (!can_retry) {
struct btree_write_bio *wbio)
{
struct btree *b = wbio->wbio.bio.bi_private;
- __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
+ struct bkey_buf k;
struct bch_extent_ptr *ptr;
struct btree_trans trans;
struct btree_iter *iter;
int ret;
+ bch2_bkey_buf_init(&k);
bch2_trans_init(&trans, c, 0, 0);
iter = bch2_trans_get_node_iter(&trans, b->c.btree_id, b->key.k.p,
BUG_ON(!btree_node_hashed(b));
- bkey_copy(&tmp.k, &b->key);
+ bch2_bkey_buf_copy(&k, c, &b->key);
- bch2_bkey_drop_ptrs(bkey_i_to_s(&tmp.k), ptr,
+ bch2_bkey_drop_ptrs(bkey_i_to_s(k.k), ptr,
bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
- if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&tmp.k)))
+ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(k.k)))
goto err;
- ret = bch2_btree_node_update_key(c, iter, b, &tmp.k);
+ ret = bch2_btree_node_update_key(c, iter, b, k.k);
if (ret == -EINTR)
goto retry;
if (ret)
goto err;
out:
+ bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
+ bch2_bkey_buf_exit(&k, c);
bio_put(&wbio->wbio.bio);
btree_node_write_done(c, b);
return;
if (wbio->have_ioref)
bch2_latency_acct(ca, wbio->submit_time, WRITE);
- if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write: %s",
+ if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write error: %s",
bch2_blk_status_to_str(bio->bi_status)) ||
bch2_meta_write_fault("btree")) {
spin_lock_irqsave(&c->btree_write_error_lock, flags);
unsigned whiteout_u64s = 0;
int ret;
- if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_BTREE))
+ if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_btree))
return -1;
- ret = validate_bset(c, b, i, sectors, WRITE, false) ?:
- validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false);
- if (ret)
+ ret = validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false) ?:
+ validate_bset(c, NULL, b, i, sectors, WRITE, false);
+ if (ret) {
bch2_inconsistent_error(c);
+ dump_stack();
+ }
return ret;
}
struct bset *i;
struct btree_node *bn = NULL;
struct btree_node_entry *bne = NULL;
- BKEY_PADDED(key) k;
+ struct bkey_buf k;
struct bch_extent_ptr *ptr;
struct sort_iter sort_iter;
struct nonce nonce;
bool validate_before_checksum = false;
void *data;
+ bch2_bkey_buf_init(&k);
+
if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
return;
if (!btree_node_may_write(b))
return;
+ if (old & (1 << BTREE_NODE_never_write))
+ return;
+
if (old & (1 << BTREE_NODE_write_in_flight)) {
btree_node_wait_on_io(b);
continue;
new ^= (1 << BTREE_NODE_write_idx);
} while (cmpxchg_acquire(&b->flags, old, new) != old);
+ atomic_dec(&c->btree_cache.dirty);
+
BUG_ON(btree_node_fake(b));
BUG_ON((b->will_make_reachable != 0) != !b->written);
seq = max(seq, le64_to_cpu(i->journal_seq));
}
+ BUG_ON(b->written && !seq);
+
+ /* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */
+ bytes += 8;
+
data = btree_bounce_alloc(c, bytes, &used_mempool);
if (!b->written) {
i->journal_seq = cpu_to_le64(seq);
i->u64s = 0;
- if (!btree_node_old_extent_overwrite(b)) {
- sort_iter_add(&sort_iter,
- unwritten_whiteouts_start(c, b),
- unwritten_whiteouts_end(c, b));
- SET_BSET_SEPARATE_WHITEOUTS(i, false);
- } else {
- memcpy_u64s(i->start,
- unwritten_whiteouts_start(c, b),
- b->whiteout_u64s);
- i->u64s = cpu_to_le16(b->whiteout_u64s);
- SET_BSET_SEPARATE_WHITEOUTS(i, true);
- }
+ sort_iter_add(&sort_iter,
+ unwritten_whiteouts_start(c, b),
+ unwritten_whiteouts_end(c, b));
+ SET_BSET_SEPARATE_WHITEOUTS(i, false);
b->whiteout_u64s = 0;
- u64s = btree_node_old_extent_overwrite(b)
- ? bch2_sort_extents(vstruct_last(i), &sort_iter, false)
- : bch2_sort_keys(i->start, &sort_iter, false);
+ u64s = bch2_sort_keys(i->start, &sort_iter, false);
le16_add_cpu(&i->u64s, u64s);
set_needs_whiteout(i, false);
validate_before_checksum = true;
/* validate_bset will be modifying: */
- if (le16_to_cpu(i->version) < bcachefs_metadata_version_max)
+ if (le16_to_cpu(i->version) < bcachefs_metadata_version_current)
validate_before_checksum = true;
/* if we're going to be encrypting, check metadata validity first: */
* just make all btree node writes FUA to keep things sane.
*/
- bkey_copy(&k.key, &b->key);
+ bch2_bkey_buf_copy(&k, c, &b->key);
- bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&k.key)), ptr)
+ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(k.k)), ptr)
ptr->offset += b->written;
b->written += sectors_to_write;
+ atomic64_inc(&c->btree_writes_nr);
+ atomic64_add(sectors_to_write, &c->btree_writes_sectors);
+
/* XXX: submitting IO with btree locks held: */
- bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, &k.key);
+ bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, k.k);
+ bch2_bkey_buf_exit(&k, c);
return;
err:
set_btree_node_noevict(b);
__bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
}
-void bch2_btree_verify_flushed(struct bch_fs *c)
-{
- struct bucket_table *tbl;
- struct rhash_head *pos;
- struct btree *b;
- unsigned i;
-
- rcu_read_lock();
- for_each_cached_btree(b, c, tbl, i, pos) {
- unsigned long flags = READ_ONCE(b->flags);
-
- BUG_ON((flags & (1 << BTREE_NODE_dirty)) ||
- (flags & (1 << BTREE_NODE_write_in_flight)));
- }
- rcu_read_unlock();
-}
-
void bch2_dirty_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c)
{
struct bucket_table *tbl;
struct btree;
struct btree_iter;
+static inline bool btree_node_dirty(struct btree *b)
+{
+ return test_bit(BTREE_NODE_dirty, &b->flags);
+}
+
+static inline void set_btree_node_dirty(struct bch_fs *c, struct btree *b)
+{
+ if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags))
+ atomic_inc(&c->btree_cache.dirty);
+}
+
+static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b)
+{
+ if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags))
+ atomic_dec(&c->btree_cache.dirty);
+}
+
struct btree_read_bio {
struct bch_fs *c;
u64 start_time;
void bch2_btree_init_next(struct bch_fs *, struct btree *,
struct btree_iter *);
-int bch2_btree_node_read_done(struct bch_fs *, struct btree *, bool);
+int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *,
+ struct btree *, bool);
void bch2_btree_node_read(struct bch_fs *, struct btree *, bool);
int bch2_btree_root_read(struct bch_fs *, enum btree_id,
const struct bkey_i *, unsigned);
void bch2_btree_flush_all_reads(struct bch_fs *);
void bch2_btree_flush_all_writes(struct bch_fs *);
-void bch2_btree_verify_flushed(struct bch_fs *);
void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *);
static inline void compat_bformat(unsigned level, enum btree_id btree_id,
- unsigned version, unsigned big_endian,
- int write, struct bkey_format *f)
+ unsigned version, unsigned big_endian,
+ int write, struct bkey_format *f)
{
if (version < bcachefs_metadata_version_inode_btree_change &&
- btree_id == BTREE_ID_INODES) {
+ btree_id == BTREE_ID_inodes) {
swap(f->bits_per_field[BKEY_FIELD_INODE],
f->bits_per_field[BKEY_FIELD_OFFSET]);
swap(f->field_offset[BKEY_FIELD_INODE],
f->field_offset[BKEY_FIELD_OFFSET]);
}
+
+ if (version < bcachefs_metadata_version_snapshot &&
+ (level || btree_type_has_snapshots(btree_id))) {
+ u64 max_packed =
+ ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]);
+
+ f->field_offset[BKEY_FIELD_SNAPSHOT] = write
+ ? 0
+ : U32_MAX - max_packed;
+ }
}
static inline void compat_bpos(unsigned level, enum btree_id btree_id,
bch2_bpos_swab(p);
if (version < bcachefs_metadata_version_inode_btree_change &&
- btree_id == BTREE_ID_INODES)
+ btree_id == BTREE_ID_inodes)
swap(p->inode, p->offset);
}
{
if (version < bcachefs_metadata_version_inode_btree_change &&
btree_node_type_is_extents(btree_id) &&
- bkey_cmp(bn->min_key, POS_MIN) &&
+ bpos_cmp(bn->min_key, POS_MIN) &&
write)
- bn->min_key = bkey_predecessor(bn->min_key);
+ bn->min_key = bpos_nosnap_predecessor(bn->min_key);
+
+ if (version < bcachefs_metadata_version_snapshot &&
+ write)
+ bn->max_key.snapshot = 0;
compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key);
compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key);
+ if (version < bcachefs_metadata_version_snapshot &&
+ !write)
+ bn->max_key.snapshot = U32_MAX;
+
if (version < bcachefs_metadata_version_inode_btree_change &&
btree_node_type_is_extents(btree_id) &&
- bkey_cmp(bn->min_key, POS_MIN) &&
+ bpos_cmp(bn->min_key, POS_MIN) &&
!write)
- bn->min_key = bkey_successor(bn->min_key);
+ bn->min_key = bpos_nosnap_successor(bn->min_key);
}
#endif /* _BCACHEFS_BTREE_IO_H */
#include "bcachefs.h"
#include "bkey_methods.h"
+#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "debug.h"
+#include "error.h"
#include "extents.h"
#include "journal.h"
+#include "replicas.h"
#include <linux/prefetch.h>
#include <trace/events/bcachefs.h>
+static void btree_iter_set_search_pos(struct btree_iter *, struct bpos);
+
+static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
+{
+ EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES);
+
+ /* Are we iterating over keys in all snapshots? */
+ if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
+ p = bpos_successor(p);
+ } else {
+ p = bpos_nosnap_successor(p);
+ p.snapshot = iter->snapshot;
+ }
+
+ return p;
+}
+
+static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
+{
+ EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES);
+
+ /* Are we iterating over keys in all snapshots? */
+ if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
+ p = bpos_predecessor(p);
+ } else {
+ p = bpos_nosnap_predecessor(p);
+ p.snapshot = iter->snapshot;
+ }
+
+ return p;
+}
+
static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
{
return l < BTREE_MAX_DEPTH &&
if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
bkey_cmp(pos, POS_MAX))
- pos = bkey_successor(pos);
+ pos = bkey_successor(iter, pos);
return pos;
}
static inline bool btree_iter_pos_before_node(struct btree_iter *iter,
struct btree *b)
{
- return bkey_cmp(btree_iter_search_key(iter), b->data->min_key) < 0;
+ return bpos_cmp(iter->real_pos, b->data->min_key) < 0;
}
static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
struct btree *b)
{
- return bkey_cmp(b->key.k.p, btree_iter_search_key(iter)) < 0;
+ return bpos_cmp(b->key.k.p, iter->real_pos) < 0;
}
static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
unsigned level, struct btree_iter *iter,
enum six_lock_type type,
- six_lock_should_sleep_fn should_sleep_fn,
- void *p)
+ six_lock_should_sleep_fn should_sleep_fn, void *p,
+ unsigned long ip)
{
struct btree_trans *trans = iter->trans;
- struct btree_iter *linked;
+ struct btree_iter *linked, *deadlock_iter = NULL;
u64 start_time = local_clock();
- bool ret = true;
+ unsigned reason = 9;
+ bool ret;
/* Check if it's safe to block: */
trans_for_each_iter(trans, linked) {
*/
if (type == SIX_LOCK_intent &&
linked->nodes_locked != linked->nodes_intent_locked) {
- if (!(trans->nounlock)) {
- linked->locks_want = max_t(unsigned,
- linked->locks_want,
- __fls(linked->nodes_locked) + 1);
- if (!btree_iter_get_locks(linked, true, false))
- ret = false;
- } else {
- ret = false;
+ linked->locks_want = max_t(unsigned,
+ linked->locks_want,
+ __fls(linked->nodes_locked) + 1);
+ if (!btree_iter_get_locks(linked, true, false)) {
+ deadlock_iter = linked;
+ reason = 1;
+ }
+ }
+
+ if (linked->btree_id != iter->btree_id) {
+ if (linked->btree_id > iter->btree_id) {
+ deadlock_iter = linked;
+ reason = 3;
}
+ continue;
+ }
+
+ /*
+ * Within the same btree, cached iterators come before non
+ * cached iterators:
+ */
+ if (btree_iter_is_cached(linked) != btree_iter_is_cached(iter)) {
+ if (btree_iter_is_cached(iter)) {
+ deadlock_iter = linked;
+ reason = 4;
+ }
+ continue;
}
/*
* another iterator has possible descendants locked of the node
* we're about to lock, it must have the ancestors locked too:
*/
- if (linked->btree_id == iter->btree_id &&
- level > __fls(linked->nodes_locked)) {
- if (!(trans->nounlock)) {
- linked->locks_want =
- max(level + 1, max_t(unsigned,
- linked->locks_want,
- iter->locks_want));
- if (!btree_iter_get_locks(linked, true, false))
- ret = false;
- } else {
- ret = false;
+ if (level > __fls(linked->nodes_locked)) {
+ linked->locks_want =
+ max(level + 1, max_t(unsigned,
+ linked->locks_want,
+ iter->locks_want));
+ if (!btree_iter_get_locks(linked, true, false)) {
+ deadlock_iter = linked;
+ reason = 5;
}
}
/* Must lock btree nodes in key order: */
- if ((cmp_int(iter->btree_id, linked->btree_id) ?:
- -cmp_int(btree_iter_type(iter), btree_iter_type(linked))) < 0)
- ret = false;
-
- if (iter->btree_id == linked->btree_id &&
- btree_node_locked(linked, level) &&
- bkey_cmp(pos, btree_node_pos((void *) linked->l[level].b,
- btree_iter_type(linked))) <= 0)
- ret = false;
+ if (btree_node_locked(linked, level) &&
+ bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b,
+ btree_iter_type(linked))) <= 0) {
+ deadlock_iter = linked;
+ reason = 7;
+ }
/*
* Recheck if this is a node we already have locked - since one
}
}
- if (unlikely(!ret)) {
- trace_trans_restart_would_deadlock(iter->trans->ip);
+ if (unlikely(deadlock_iter)) {
+ trace_trans_restart_would_deadlock(iter->trans->ip, ip,
+ reason,
+ deadlock_iter->btree_id,
+ btree_iter_type(deadlock_iter),
+ iter->btree_id,
+ btree_iter_type(iter));
return false;
}
if (six_trylock_type(&b->c.lock, type))
return true;
- if (six_lock_type(&b->c.lock, type, should_sleep_fn, p))
- return false;
+#ifdef CONFIG_BCACHEFS_DEBUG
+ trans->locking_iter_idx = iter->idx;
+ trans->locking_pos = pos;
+ trans->locking_btree_id = iter->btree_id;
+ trans->locking_level = level;
+ trans->locking = b;
+#endif
- bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)],
- start_time);
- return true;
+ ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0;
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+ trans->locking = NULL;
+#endif
+ if (ret)
+ bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)],
+ start_time);
+ return ret;
}
/* Btree iterator locking: */
{
struct btree_iter *iter;
- trans_for_each_iter_all(trans, iter)
+ trans_for_each_iter(trans, iter)
bch2_btree_iter_verify_locks(iter);
}
#else
return false;
}
-bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *iter,
- unsigned new_locks_want)
+void __bch2_btree_iter_downgrade(struct btree_iter *iter,
+ unsigned new_locks_want)
{
- unsigned l = iter->level;
+ unsigned l;
- EBUG_ON(iter->locks_want >= new_locks_want);
+ EBUG_ON(iter->locks_want < new_locks_want);
iter->locks_want = new_locks_want;
- do {
- if (!btree_iter_node(iter, l))
- break;
-
- if (!bch2_btree_node_upgrade(iter, l)) {
- iter->locks_want = l;
- return false;
- }
-
- l++;
- } while (l < iter->locks_want);
-
- return true;
-}
-
-void __bch2_btree_iter_downgrade(struct btree_iter *iter,
- unsigned downgrade_to)
-{
- unsigned l, new_locks_want = downgrade_to ?:
- (iter->flags & BTREE_ITER_INTENT ? 1 : 0);
-
- if (iter->locks_want < downgrade_to) {
- iter->locks_want = new_locks_want;
-
- while (iter->nodes_locked &&
- (l = __fls(iter->nodes_locked)) >= iter->locks_want) {
- if (l > iter->level) {
- btree_node_unlock(iter, l);
- } else {
- if (btree_node_intent_locked(iter, l)) {
- six_lock_downgrade(&iter->l[l].b->c.lock);
- iter->nodes_intent_locked ^= 1 << l;
- }
- break;
+ while (iter->nodes_locked &&
+ (l = __fls(iter->nodes_locked)) >= iter->locks_want) {
+ if (l > iter->level) {
+ btree_node_unlock(iter, l);
+ } else {
+ if (btree_node_intent_locked(iter, l)) {
+ six_lock_downgrade(&iter->l[l].b->c.lock);
+ iter->nodes_intent_locked ^= 1 << l;
}
+ break;
}
}
bool bch2_trans_relock(struct btree_trans *trans)
{
struct btree_iter *iter;
- bool ret = true;
trans_for_each_iter(trans, iter)
- if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
- ret &= bch2_btree_iter_relock(iter, true);
-
- return ret;
+ if (btree_iter_keep(trans, iter) &&
+ !bch2_btree_iter_relock(iter, true))
+ return false;
+ return true;
}
void bch2_trans_unlock(struct btree_trans *trans)
static void bch2_btree_iter_verify_level(struct btree_iter *iter,
unsigned level)
{
- struct bpos pos = btree_iter_search_key(iter);
- struct btree_iter_level *l = &iter->l[level];
- struct btree_node_iter tmp = l->iter;
- bool locked = btree_node_locked(iter, level);
+ struct btree_iter_level *l;
+ struct btree_node_iter tmp;
+ bool locked;
struct bkey_packed *p, *k;
- char buf1[100], buf2[100];
+ char buf1[100], buf2[100], buf3[100];
const char *msg;
- if (!debug_check_iterators(iter->trans->c))
+ if (!bch2_debug_check_iterators)
return;
+ l = &iter->l[level];
+ tmp = l->iter;
+ locked = btree_node_locked(iter, level);
+
if (btree_iter_type(iter) == BTREE_ITER_CACHED) {
if (!level)
bch2_btree_iter_verify_cached(iter);
if (!bch2_btree_node_relock(iter, level))
return;
- /*
- * Ideally this invariant would always be true, and hopefully in the
- * future it will be, but for now set_pos_same_leaf() breaks it:
- */
- BUG_ON(iter->uptodate < BTREE_ITER_NEED_TRAVERSE &&
- !btree_iter_pos_in_node(iter, l->b));
+ BUG_ON(!btree_iter_pos_in_node(iter, l->b));
/*
* node iterators don't use leaf node iterator:
* whiteouts)
*/
p = level || btree_node_type_is_extents(iter->btree_id)
- ? bch2_btree_node_iter_prev_filter(&tmp, l->b, KEY_TYPE_discard)
+ ? bch2_btree_node_iter_prev(&tmp, l->b)
: bch2_btree_node_iter_prev_all(&tmp, l->b);
k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
- if (p && bkey_iter_pos_cmp(l->b, p, &pos) >= 0) {
+ if (p && bkey_iter_pos_cmp(l->b, p, &iter->real_pos) >= 0) {
msg = "before";
goto err;
}
- if (k && bkey_iter_pos_cmp(l->b, k, &pos) < 0) {
+ if (k && bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) {
msg = "after";
goto err;
}
btree_node_unlock(iter, level);
return;
err:
- strcpy(buf1, "(none)");
strcpy(buf2, "(none)");
+ strcpy(buf3, "(none)");
+
+ bch2_bpos_to_text(&PBUF(buf1), iter->real_pos);
if (p) {
struct bkey uk = bkey_unpack_key(l->b, p);
- bch2_bkey_to_text(&PBUF(buf1), &uk);
+ bch2_bkey_to_text(&PBUF(buf2), &uk);
}
if (k) {
struct bkey uk = bkey_unpack_key(l->b, k);
- bch2_bkey_to_text(&PBUF(buf2), &uk);
+ bch2_bkey_to_text(&PBUF(buf3), &uk);
}
panic("iterator should be %s key at level %u:\n"
- "iter pos %s %llu:%llu\n"
+ "iter pos %s\n"
"prev key %s\n"
"cur key %s\n",
- msg, level,
- iter->flags & BTREE_ITER_IS_EXTENTS ? ">" : "=>",
- iter->pos.inode, iter->pos.offset,
- buf1, buf2);
+ msg, level, buf1, buf2, buf3);
}
static void bch2_btree_iter_verify(struct btree_iter *iter)
{
+ enum btree_iter_type type = btree_iter_type(iter);
unsigned i;
- bch2_btree_trans_verify_locks(iter->trans);
+ EBUG_ON(iter->btree_id >= BTREE_ID_NR);
+
+ BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+ iter->pos.snapshot != iter->snapshot);
+
+ BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
+ (iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
+
+ BUG_ON(type == BTREE_ITER_NODES &&
+ !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
+
+ BUG_ON(type != BTREE_ITER_NODES &&
+ (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+ !btree_type_has_snapshots(iter->btree_id));
+
+ bch2_btree_iter_verify_locks(iter);
for (i = 0; i < BTREE_MAX_DEPTH; i++)
bch2_btree_iter_verify_level(iter, i);
}
+static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
+{
+ enum btree_iter_type type = btree_iter_type(iter);
+
+ BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+ iter->pos.snapshot != iter->snapshot);
+
+ BUG_ON((type == BTREE_ITER_KEYS ||
+ type == BTREE_ITER_CACHED) &&
+ (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 ||
+ bkey_cmp(iter->pos, iter->k.p) > 0));
+}
+
void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b)
{
struct btree_iter *iter;
- if (!debug_check_iterators(trans->c))
+ if (!bch2_debug_check_iterators)
return;
trans_for_each_iter_with_node(trans, b, iter)
static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned l) {}
static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
+static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
#endif
struct bkey_packed *where)
{
struct btree_iter_level *l = &iter->l[b->c.level];
- struct bpos pos = btree_iter_search_key(iter);
if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
return;
- if (bkey_iter_pos_cmp(l->b, where, &pos) < 0)
+ if (bkey_iter_pos_cmp(l->b, where, &iter->real_pos) < 0)
bch2_btree_node_iter_advance(&l->iter, l->b);
btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
bool iter_current_key_modified =
orig_iter_pos >= offset &&
orig_iter_pos <= offset + clobber_u64s;
- struct bpos iter_pos = btree_iter_search_key(iter);
btree_node_iter_for_each(node_iter, set)
if (set->end == old_end)
/* didn't find the bset in the iterator - might have to readd it: */
if (new_u64s &&
- bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) {
+ bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) {
bch2_btree_node_iter_push(node_iter, b, where, end);
goto fixup_done;
} else {
return;
if (new_u64s &&
- bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) {
+ bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) {
set->k = offset;
} else if (set->k < offset + clobber_u64s) {
set->k = offset + new_u64s;
__bch2_btree_node_iter_fix(iter, b, node_iter, t,
where, clobber_u64s, new_u64s);
- if (debug_check_iterators(iter->trans->c))
+ if (bch2_debug_check_iterators)
bch2_btree_node_iter_verify(node_iter, b);
}
ret = bkey_disassemble(l->b, k, u);
- if (debug_check_bkeys(iter->trans->c))
+ if (bch2_debug_check_bkeys)
bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
return ret;
}
/* peek_all() doesn't skip deleted keys */
-static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter,
- struct btree_iter_level *l,
- struct bkey *u)
+static inline struct bkey_s_c btree_iter_level_peek_all(struct btree_iter *iter,
+ struct btree_iter_level *l,
+ struct bkey *u)
{
return __btree_iter_unpack(iter, l, u,
bch2_btree_node_iter_peek_all(&l->iter, l->b));
}
-static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter,
- struct btree_iter_level *l)
+static inline struct bkey_s_c btree_iter_level_peek(struct btree_iter *iter,
+ struct btree_iter_level *l)
{
- return __btree_iter_unpack(iter, l, &iter->k,
+ struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k,
bch2_btree_node_iter_peek(&l->iter, l->b));
+
+ iter->real_pos = k.k ? k.k->p : l->b->key.k.p;
+ return k;
}
-static inline struct bkey_s_c __btree_iter_prev(struct btree_iter *iter,
- struct btree_iter_level *l)
+static inline struct bkey_s_c btree_iter_level_prev(struct btree_iter *iter,
+ struct btree_iter_level *l)
{
- return __btree_iter_unpack(iter, l, &iter->k,
+ struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k,
bch2_btree_node_iter_prev(&l->iter, l->b));
+
+ iter->real_pos = k.k ? k.k->p : l->b->data->min_key;
+ return k;
}
static inline bool btree_iter_advance_to_pos(struct btree_iter *iter,
struct btree_iter_level *l,
int max_advance)
{
- struct bpos pos = btree_iter_search_key(iter);
struct bkey_packed *k;
int nr_advanced = 0;
while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
- bkey_iter_pos_cmp(l->b, k, &pos) < 0) {
+ bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) {
if (max_advance > 0 && nr_advanced >= max_advance)
return false;
if (!k ||
bkey_deleted(k) ||
bkey_cmp_left_packed(l->b, k, &b->key.k.p)) {
- char buf[100];
+ char buf1[100];
+ char buf2[100];
+ char buf3[100];
+ char buf4[100];
struct bkey uk = bkey_unpack_key(b, k);
- bch2_bkey_to_text(&PBUF(buf), &uk);
- panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n",
- buf, b->key.k.p.inode, b->key.k.p.offset);
+ bch2_dump_btree_node(iter->trans->c, l->b);
+ bch2_bpos_to_text(&PBUF(buf1), iter->real_pos);
+ bch2_bkey_to_text(&PBUF(buf2), &uk);
+ bch2_bpos_to_text(&PBUF(buf3), b->data->min_key);
+		bch2_bpos_to_text(&PBUF(buf4), b->data->max_key);
+ panic("parent iter doesn't point to new node:\n"
+ "iter pos %s %s\n"
+ "iter key %s\n"
+ "new node %s-%s\n",
+ bch2_btree_ids[iter->btree_id], buf1,
+ buf2, buf3, buf4);
}
if (!parent_locked)
static inline void __btree_iter_init(struct btree_iter *iter,
unsigned level)
{
- struct bpos pos = btree_iter_search_key(iter);
struct btree_iter_level *l = &iter->l[level];
- bch2_btree_node_iter_init(&l->iter, l->b, &pos);
+ bch2_btree_node_iter_init(&l->iter, l->b, &iter->real_pos);
+
+ /*
+	 * Iterators to interior nodes should always be pointed at the first
+	 * non-whiteout:
+ */
+ if (level)
+ bch2_btree_node_iter_peek(&l->iter, l->b);
btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
}
trans_for_each_iter(iter->trans, linked)
if (linked->l[level].b == b) {
- __btree_node_unlock(linked, level);
+ btree_node_unlock(linked, level);
linked->l[level].b = BTREE_ITER_NO_NODE_DROP;
}
}
}
static inline int btree_iter_lock_root(struct btree_iter *iter,
- unsigned depth_want)
+ unsigned depth_want,
+ unsigned long trace_ip)
{
struct bch_fs *c = iter->trans->c;
struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b;
lock_type = __btree_lock_want(iter, iter->level);
if (unlikely(!btree_node_lock(b, POS_MAX, iter->level,
iter, lock_type,
- lock_root_check_fn, rootp)))
+ lock_root_check_fn, rootp,
+ trace_ip)))
return -EINTR;
if (likely(b == READ_ONCE(*rootp) &&
struct btree_iter_level *l = &iter->l[iter->level];
struct btree_node_iter node_iter = l->iter;
struct bkey_packed *k;
- BKEY_PADDED(k) tmp;
+ struct bkey_buf tmp;
unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
? (iter->level > 1 ? 0 : 2)
: (iter->level > 1 ? 1 : 16);
bool was_locked = btree_node_locked(iter, iter->level);
+ bch2_bkey_buf_init(&tmp);
+
while (nr) {
if (!bch2_btree_node_relock(iter, iter->level))
- return;
+ break;
bch2_btree_node_iter_advance(&node_iter, l->b);
k = bch2_btree_node_iter_peek(&node_iter, l->b);
if (!k)
break;
- bch2_bkey_unpack(l->b, &tmp.k, k);
- bch2_btree_node_prefetch(c, iter, &tmp.k, iter->level - 1);
+ bch2_bkey_buf_unpack(&tmp, c, l->b, k);
+ bch2_btree_node_prefetch(c, iter, tmp.k, iter->btree_id,
+ iter->level - 1);
}
if (!was_locked)
btree_node_unlock(iter, iter->level);
+
+ bch2_bkey_buf_exit(&tmp, c);
}
static noinline void btree_node_mem_ptr_set(struct btree_iter *iter,
btree_node_unlock(iter, plevel);
}
-static __always_inline int btree_iter_down(struct btree_iter *iter)
+static __always_inline int btree_iter_down(struct btree_iter *iter,
+ unsigned long trace_ip)
{
struct bch_fs *c = iter->trans->c;
struct btree_iter_level *l = &iter->l[iter->level];
struct btree *b;
unsigned level = iter->level - 1;
enum six_lock_type lock_type = __btree_lock_want(iter, level);
- BKEY_PADDED(k) tmp;
+ struct bkey_buf tmp;
+ int ret;
EBUG_ON(!btree_node_locked(iter, iter->level));
- bch2_bkey_unpack(l->b, &tmp.k,
+ bch2_bkey_buf_init(&tmp);
+ bch2_bkey_buf_unpack(&tmp, c, l->b,
bch2_btree_node_iter_peek(&l->iter, l->b));
- b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type);
- if (unlikely(IS_ERR(b)))
- return PTR_ERR(b);
+ b = bch2_btree_node_get(c, iter, tmp.k, level, lock_type, trace_ip);
+ ret = PTR_ERR_OR_ZERO(b);
+ if (unlikely(ret))
+ goto err;
mark_btree_node_locked(iter, level, lock_type);
btree_iter_node_set(iter, b);
- if (tmp.k.k.type == KEY_TYPE_btree_ptr_v2 &&
- unlikely(b != btree_node_mem_ptr(&tmp.k)))
+ if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 &&
+ unlikely(b != btree_node_mem_ptr(tmp.k)))
btree_node_mem_ptr_set(iter, level + 1, b);
if (iter->flags & BTREE_ITER_PREFETCH)
btree_iter_prefetch(iter);
iter->level = level;
-
- return 0;
-}
-
-static void btree_iter_up(struct btree_iter *iter)
-{
- btree_node_unlock(iter, iter->level++);
+err:
+ bch2_bkey_buf_exit(&tmp, c);
+ return ret;
}
-static int btree_iter_traverse_one(struct btree_iter *);
+static int btree_iter_traverse_one(struct btree_iter *, unsigned long);
static int __btree_iter_traverse_all(struct btree_trans *trans, int ret)
{
sorted[nr_sorted++] = iter->idx;
#define btree_iter_cmp_by_idx(_l, _r) \
- btree_iter_cmp(&trans->iters[_l], &trans->iters[_r])
+ btree_iter_lock_cmp(&trans->iters[_l], &trans->iters[_r])
bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx);
#undef btree_iter_cmp_by_idx
bch2_trans_unlock(trans);
+ cond_resched();
if (unlikely(ret == -ENOMEM)) {
struct closure cl;
if (!(trans->iters_linked & (1ULL << idx)))
continue;
- ret = btree_iter_traverse_one(&trans->iters[idx]);
+ ret = btree_iter_traverse_one(&trans->iters[idx], _THIS_IP_);
if (ret)
goto retry_all;
}
!bch2_btree_node_relock(iter, l))
return false;
- if (check_pos <= 0 && btree_iter_pos_before_node(iter, iter->l[l].b))
+ if (check_pos < 0 && btree_iter_pos_before_node(iter, iter->l[l].b))
return false;
- if (check_pos >= 0 && btree_iter_pos_after_node(iter, iter->l[l].b))
+ if (check_pos > 0 && btree_iter_pos_after_node(iter, iter->l[l].b))
return false;
return true;
}
* On error, caller (peek_node()/peek_key()) must return NULL; the error is
* stashed in the iterator and returned from bch2_trans_exit().
*/
-static int btree_iter_traverse_one(struct btree_iter *iter)
+static int btree_iter_traverse_one(struct btree_iter *iter,
+ unsigned long trace_ip)
{
unsigned depth_want = iter->level;
if (unlikely(iter->level >= BTREE_MAX_DEPTH))
return 0;
- /*
- * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos
- * here unnecessary
- */
iter->level = btree_iter_up_until_good_node(iter, 0);
- /*
- * If we've got a btree node locked (i.e. we aren't about to relock the
- * root) - advance its node iterator if necessary:
- *
- * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary
- */
- if (is_btree_node(iter, iter->level)) {
- BUG_ON(!btree_iter_pos_in_node(iter, iter->l[iter->level].b));
-
- btree_iter_advance_to_pos(iter, &iter->l[iter->level], -1);
- }
-
/*
* Note: iter->nodes[iter->level] may be temporarily NULL here - that
* would indicate to other code that we got to the end of the btree,
*/
while (iter->level > depth_want) {
int ret = btree_iter_node(iter, iter->level)
- ? btree_iter_down(iter)
- : btree_iter_lock_root(iter, depth_want);
+ ? btree_iter_down(iter, trace_ip)
+ : btree_iter_lock_root(iter, depth_want, trace_ip);
if (unlikely(ret)) {
if (ret == 1)
return 0;
return 0;
}
-int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
+static int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
{
struct btree_trans *trans = iter->trans;
int ret;
ret = bch2_trans_cond_resched(trans) ?:
- btree_iter_traverse_one(iter);
+ btree_iter_traverse_one(iter, _RET_IP_);
if (unlikely(ret))
ret = __btree_iter_traverse_all(trans, ret);
return ret;
}
-static inline void bch2_btree_iter_checks(struct btree_iter *iter)
+/*
+ * Note:
+ * bch2_btree_iter_traverse() is for external users, btree_iter_traverse() is
+ * for internal btree iterator users.
+ *
+ * bch2_btree_iter_traverse() sets iter->real_pos to iter->pos;
+ * btree_iter_traverse() does not:
+ */
+static inline int __must_check
+btree_iter_traverse(struct btree_iter *iter)
{
- enum btree_iter_type type = btree_iter_type(iter);
-
- EBUG_ON(iter->btree_id >= BTREE_ID_NR);
+ return iter->uptodate >= BTREE_ITER_NEED_RELOCK
+ ? __bch2_btree_iter_traverse(iter)
+ : 0;
+}
- BUG_ON((type == BTREE_ITER_KEYS ||
- type == BTREE_ITER_CACHED) &&
- (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 ||
- bkey_cmp(iter->pos, iter->k.p) > 0));
+int __must_check
+bch2_btree_iter_traverse(struct btree_iter *iter)
+{
+ btree_iter_set_search_pos(iter, btree_iter_search_key(iter));
- bch2_btree_iter_verify_locks(iter);
- bch2_btree_iter_verify_level(iter, iter->level);
+ return btree_iter_traverse(iter);
}
/* Iterate across nodes (leaf and interior nodes) */
int ret;
EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES);
- bch2_btree_iter_checks(iter);
-
- if (iter->uptodate == BTREE_ITER_UPTODATE)
- return iter->l[iter->level].b;
+ bch2_btree_iter_verify(iter);
- ret = bch2_btree_iter_traverse(iter);
+ ret = btree_iter_traverse(iter);
if (ret)
return NULL;
if (!b)
return NULL;
- BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0);
+ BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0);
- iter->pos = b->key.k.p;
- iter->uptodate = BTREE_ITER_UPTODATE;
+ iter->pos = iter->real_pos = b->key.k.p;
bch2_btree_iter_verify(iter);
int ret;
EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES);
- bch2_btree_iter_checks(iter);
+ bch2_btree_iter_verify(iter);
/* already got to end? */
if (!btree_iter_node(iter, iter->level))
bch2_trans_cond_resched(iter->trans);
- btree_iter_up(iter);
+ btree_node_unlock(iter, iter->level);
+ iter->l[iter->level].b = BTREE_ITER_NO_NODE_UP;
+ iter->level++;
- if (!bch2_btree_node_relock(iter, iter->level))
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
-
- ret = bch2_btree_iter_traverse(iter);
+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
+ ret = btree_iter_traverse(iter);
if (ret)
return NULL;
if (!b)
return NULL;
- if (bkey_cmp(iter->pos, b->key.k.p) < 0) {
+ if (bpos_cmp(iter->pos, b->key.k.p) < 0) {
/*
* Haven't gotten to the end of the parent node: go back down to
* the next child node
*/
+ btree_iter_set_search_pos(iter, bpos_successor(iter->pos));
- /*
- * We don't really want to be unlocking here except we can't
- * directly tell btree_iter_traverse() "traverse to this level"
- * except by setting iter->level, so we have to unlock so we
- * don't screw up our lock invariants:
- */
- if (btree_node_read_locked(iter, iter->level))
- btree_node_unlock(iter, iter->level);
-
- iter->pos = bkey_successor(iter->pos);
- iter->level = iter->min_depth;
+ /* Unlock to avoid screwing up our lock invariants: */
+ btree_node_unlock(iter, iter->level);
+ iter->level = iter->min_depth;
btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
- ret = bch2_btree_iter_traverse(iter);
+ bch2_btree_iter_verify(iter);
+
+ ret = btree_iter_traverse(iter);
if (ret)
return NULL;
b = iter->l[iter->level].b;
}
- iter->pos = b->key.k.p;
- iter->uptodate = BTREE_ITER_UPTODATE;
+ iter->pos = iter->real_pos = b->key.k.p;
bch2_btree_iter_verify(iter);
/* Iterate across keys (in leaf nodes only) */
-void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos)
-{
- struct btree_iter_level *l = &iter->l[0];
-
- EBUG_ON(iter->level != 0);
- EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0);
- EBUG_ON(!btree_node_locked(iter, 0));
- EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0);
-
- bkey_init(&iter->k);
- iter->k.p = iter->pos = new_pos;
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
-
- btree_iter_advance_to_pos(iter, l, -1);
-
- /*
- * XXX:
- * keeping a node locked that's outside (even just outside) iter->pos
- * breaks __bch2_btree_node_lock(). This seems to only affect
- * bch2_btree_node_get_sibling so for now it's fixed there, but we
- * should try to get rid of this corner case.
- *
- * (this behaviour is currently needed for BTREE_INSERT_NOUNLOCK)
- */
-
- if (bch2_btree_node_iter_end(&l->iter) &&
- btree_iter_pos_after_node(iter, l->b))
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
-}
-
-static void btree_iter_pos_changed(struct btree_iter *iter, int cmp)
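+/* Update iter->real_pos, the position the iterator actually searches from: */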
+static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos)
{
+ int cmp = bpos_cmp(new_pos, iter->real_pos);
unsigned l = iter->level;
if (!cmp)
goto out;
+ iter->real_pos = new_pos;
+
if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) {
btree_node_unlock(iter, 0);
iter->l[0].b = BTREE_ITER_NO_NODE_UP;
btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
else
btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
+
+ bch2_btree_iter_verify(iter);
}
-void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos,
- bool strictly_greater)
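+/* Advance iter->pos past the current key; returns false if already at POS_MAX: */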
+inline bool bch2_btree_iter_advance(struct btree_iter *iter)
{
- struct bpos old = btree_iter_search_key(iter);
- int cmp;
-
- iter->flags &= ~BTREE_ITER_IS_EXTENTS;
- iter->flags |= strictly_greater ? BTREE_ITER_IS_EXTENTS : 0;
+ struct bpos pos = iter->k.p;
+ bool ret = bpos_cmp(pos, POS_MAX) != 0;
- bkey_init(&iter->k);
- iter->k.p = iter->pos = new_pos;
-
- cmp = bkey_cmp(btree_iter_search_key(iter), old);
-
- btree_iter_pos_changed(iter, cmp);
+ if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+ pos = bkey_successor(iter, pos);
+ bch2_btree_iter_set_pos(iter, pos);
+ return ret;
}
-void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
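+/* Move iter->pos back before the current key; returns false if already at POS_MIN: */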
+inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
{
- int cmp = bkey_cmp(new_pos, iter->pos);
-
- bkey_init(&iter->k);
- iter->k.p = iter->pos = new_pos;
+ struct bpos pos = bkey_start_pos(&iter->k);
+ bool ret = bpos_cmp(pos, POS_MIN) != 0;
- btree_iter_pos_changed(iter, cmp);
+ if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+ pos = bkey_predecessor(iter, pos);
+ bch2_btree_iter_set_pos(iter, pos);
+ return ret;
}
static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter)
{
- struct btree_iter_level *l = &iter->l[0];
- bool ret;
-
- bkey_init(&iter->k);
- iter->k.p = iter->pos = l->b->key.k.p;
+ struct bpos next_pos = iter->l[0].b->key.k.p;
+ bool ret = bpos_cmp(next_pos, POS_MAX) != 0;
- ret = bkey_cmp(iter->pos, POS_MAX) != 0;
- if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
- iter->k.p = iter->pos = bkey_successor(iter->pos);
+ /*
+ * Typically, we don't want to modify iter->pos here, since that
+ * indicates where we searched from - unless we got to the end of the
+	 * btree, in which case we want iter->pos to reflect that:
+ */
+ if (ret)
+ btree_iter_set_search_pos(iter, bpos_successor(next_pos));
+ else
+ bch2_btree_iter_set_pos(iter, POS_MAX);
- btree_iter_pos_changed(iter, 1);
return ret;
}
static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter)
{
- struct btree_iter_level *l = &iter->l[0];
- bool ret;
-
- bkey_init(&iter->k);
- iter->k.p = iter->pos = l->b->data->min_key;
- iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
-
- ret = bkey_cmp(iter->pos, POS_MIN) != 0;
- if (ret) {
- iter->k.p = iter->pos = bkey_predecessor(iter->pos);
+ struct bpos next_pos = iter->l[0].b->data->min_key;
+ bool ret = bpos_cmp(next_pos, POS_MIN) != 0;
- if (iter->flags & BTREE_ITER_IS_EXTENTS)
- iter->k.p = iter->pos = bkey_predecessor(iter->pos);
- }
+ if (ret)
+ btree_iter_set_search_pos(iter, bpos_predecessor(next_pos));
+ else
+ bch2_btree_iter_set_pos(iter, POS_MIN);
- btree_iter_pos_changed(iter, -1);
return ret;
}
-/**
- * btree_iter_peek_uptodate - given an iterator that is uptodate, return the key
- * it currently points to
- */
-static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter)
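+/*
+ * Return the first update queued in this transaction that is at or after pos
+ * in the given btree, if any:
+ */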
+static struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans,
+ enum btree_id btree_id, struct bpos pos)
{
- struct btree_iter_level *l = &iter->l[0];
- struct bkey_s_c ret = { .k = &iter->k };
-
- if (!bkey_deleted(&iter->k)) {
- struct bkey_packed *_k =
- __bch2_btree_node_iter_peek_all(&l->iter, l->b);
-
- ret.v = bkeyp_val(&l->b->format, _k);
-
- if (debug_check_iterators(iter->trans->c)) {
- struct bkey k = bkey_unpack_key(l->b, _k);
+ struct btree_insert_entry *i;
- BUG_ON(memcmp(&k, &iter->k, sizeof(k)));
+ trans_for_each_update2(trans, i)
+ if ((cmp_int(btree_id, i->iter->btree_id) ?:
+ bkey_cmp(pos, i->k->k.p)) <= 0) {
+ if (btree_id == i->iter->btree_id)
+ return i->k;
+ break;
}
- if (debug_check_bkeys(iter->trans->c))
- bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
- }
-
- return ret;
+ return NULL;
}
-/**
- * bch2_btree_iter_peek: returns first key greater than or equal to iterator's
- * current position
- */
-struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
+static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bool with_updates)
{
- struct btree_iter_level *l = &iter->l[0];
+ struct bpos search_key = btree_iter_search_key(iter);
+ struct bkey_i *next_update = with_updates
+ ? btree_trans_peek_updates(iter->trans, iter->btree_id, search_key)
+ : NULL;
struct bkey_s_c k;
int ret;
EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
- bch2_btree_iter_checks(iter);
+ bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify_entry_exit(iter);
- if (iter->uptodate == BTREE_ITER_UPTODATE &&
- !bkey_deleted(&iter->k))
- return btree_iter_peek_uptodate(iter);
+ btree_iter_set_search_pos(iter, search_key);
while (1) {
- ret = bch2_btree_iter_traverse(iter);
+ ret = btree_iter_traverse(iter);
if (unlikely(ret))
return bkey_s_c_err(ret);
- k = __btree_iter_peek(iter, l);
- if (likely(k.k))
+ k = btree_iter_level_peek(iter, &iter->l[0]);
+
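+		/* An update queued at or before the btree key takes precedence: */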
+ if (next_update &&
+ bpos_cmp(next_update->k.p, iter->real_pos) <= 0)
+ k = bkey_i_to_s_c(next_update);
+
+ if (likely(k.k)) {
+ if (bkey_deleted(k.k)) {
+ btree_iter_set_search_pos(iter,
+ bkey_successor(iter, k.k->p));
+ continue;
+ }
+
break;
+ }
if (!btree_iter_set_pos_to_next_leaf(iter))
return bkey_s_c_null;
}
/*
- * iter->pos should always be equal to the key we just
- * returned - except extents can straddle iter->pos:
+	 * iter->pos should be monotonically increasing, and always be equal to
+ * the key we just returned - except extents can straddle iter->pos:
*/
- if (!(iter->flags & BTREE_ITER_IS_EXTENTS) ||
- bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
+ if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
iter->pos = bkey_start_pos(k.k);
- iter->uptodate = BTREE_ITER_UPTODATE;
-
- bch2_btree_iter_verify_level(iter, 0);
+ bch2_btree_iter_verify_entry_exit(iter);
+ bch2_btree_iter_verify(iter);
return k;
}
+/**
+ * bch2_btree_iter_peek: returns first key greater than or equal to iterator's
+ * current position
+ */
+struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
+{
+ return __btree_iter_peek(iter, false);
+}
+
/**
* bch2_btree_iter_next: returns first key greater than iterator's current
* position
*/
struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
{
- if (unlikely(!bkey_cmp(iter->k.p, POS_MAX)))
+ if (!bch2_btree_iter_advance(iter))
return bkey_s_c_null;
- bch2_btree_iter_set_pos(iter,
- (iter->flags & BTREE_ITER_IS_EXTENTS)
- ? iter->k.p
- : bkey_successor(iter->k.p));
-
return bch2_btree_iter_peek(iter);
}
-static struct bkey_s_c __btree_trans_updates_peek(struct btree_iter *iter)
-{
- struct bpos pos = btree_iter_search_key(iter);
- struct btree_trans *trans = iter->trans;
- struct btree_insert_entry *i;
-
- trans_for_each_update2(trans, i)
- if ((cmp_int(iter->btree_id, i->iter->btree_id) ?:
- bkey_cmp(pos, i->k->k.p)) <= 0)
- break;
-
- return i < trans->updates2 + trans->nr_updates2 &&
- iter->btree_id == i->iter->btree_id
- ? bkey_i_to_s_c(i->k)
- : bkey_s_c_null;
-}
-
-static struct bkey_s_c __bch2_btree_iter_peek_with_updates(struct btree_iter *iter)
-{
- struct btree_iter_level *l = &iter->l[0];
- struct bkey_s_c k = __btree_iter_peek(iter, l);
- struct bkey_s_c u = __btree_trans_updates_peek(iter);
-
- if (k.k && (!u.k || bkey_cmp(k.k->p, u.k->p) < 0))
- return k;
- if (u.k && bkey_cmp(u.k->p, l->b->key.k.p) <= 0) {
- iter->k = *u.k;
- return u;
- }
- return bkey_s_c_null;
-}
-
struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter)
{
- struct bkey_s_c k;
- int ret;
-
- EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
- bch2_btree_iter_checks(iter);
-
- while (1) {
- ret = bch2_btree_iter_traverse(iter);
- if (unlikely(ret))
- return bkey_s_c_err(ret);
-
- k = __bch2_btree_iter_peek_with_updates(iter);
-
- if (k.k && bkey_deleted(k.k)) {
- bch2_btree_iter_set_pos(iter,
- (iter->flags & BTREE_ITER_IS_EXTENTS)
- ? iter->k.p
- : bkey_successor(iter->k.p));
- continue;
- }
-
- if (likely(k.k))
- break;
-
- if (!btree_iter_set_pos_to_next_leaf(iter))
- return bkey_s_c_null;
- }
-
- /*
- * iter->pos should always be equal to the key we just
- * returned - except extents can straddle iter->pos:
- */
- if (!(iter->flags & BTREE_ITER_IS_EXTENTS) ||
- bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
- iter->pos = bkey_start_pos(k.k);
-
- iter->uptodate = BTREE_ITER_UPTODATE;
- return k;
+ return __btree_iter_peek(iter, true);
}
struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter)
{
- if (unlikely(!bkey_cmp(iter->k.p, POS_MAX)))
+ if (!bch2_btree_iter_advance(iter))
return bkey_s_c_null;
- bch2_btree_iter_set_pos(iter,
- (iter->flags & BTREE_ITER_IS_EXTENTS)
- ? iter->k.p
- : bkey_successor(iter->k.p));
-
return bch2_btree_iter_peek_with_updates(iter);
}
*/
struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
{
- struct bpos pos = iter->pos;
struct btree_iter_level *l = &iter->l[0];
struct bkey_s_c k;
int ret;
EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
- bch2_btree_iter_checks(iter);
+ bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify_entry_exit(iter);
- if (iter->uptodate == BTREE_ITER_UPTODATE &&
- !bkey_deleted(&iter->k))
- return btree_iter_peek_uptodate(iter);
+ btree_iter_set_search_pos(iter, iter->pos);
while (1) {
- ret = bch2_btree_iter_traverse(iter);
- if (unlikely(ret))
- return bkey_s_c_err(ret);
+ ret = btree_iter_traverse(iter);
+ if (unlikely(ret)) {
+ k = bkey_s_c_err(ret);
+ goto no_key;
+ }
- k = __btree_iter_peek(iter, l);
- if (!k.k || bkey_cmp(bkey_start_pos(k.k), pos) > 0)
- k = __btree_iter_prev(iter, l);
+ k = btree_iter_level_peek(iter, l);
+ if (!k.k ||
+ ((iter->flags & BTREE_ITER_IS_EXTENTS)
+ ? bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0
+ : bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0))
+ k = btree_iter_level_prev(iter, l);
if (likely(k.k))
break;
- if (!btree_iter_set_pos_to_prev_leaf(iter))
- return bkey_s_c_null;
+ if (!btree_iter_set_pos_to_prev_leaf(iter)) {
+ k = bkey_s_c_null;
+ goto no_key;
+ }
}
- EBUG_ON(bkey_cmp(bkey_start_pos(k.k), pos) > 0);
- iter->pos = bkey_start_pos(k.k);
- iter->uptodate = BTREE_ITER_UPTODATE;
+ EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0);
+
+ /* Extents can straddle iter->pos: */
+ if (bkey_cmp(k.k->p, iter->pos) < 0)
+ iter->pos = k.k->p;
+out:
+ bch2_btree_iter_verify_entry_exit(iter);
+ bch2_btree_iter_verify(iter);
return k;
+no_key:
+ /*
+ * btree_iter_level_peek() may have set iter->k to a key we didn't want, and
+ * then we errored going to the previous leaf - make sure it's
+ * consistent with iter->pos:
+ */
+ bkey_init(&iter->k);
+ iter->k.p = iter->pos;
+ goto out;
}
/**
*/
struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
{
- struct bpos pos = bkey_start_pos(&iter->k);
-
- EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
- bch2_btree_iter_checks(iter);
-
- if (unlikely(!bkey_cmp(pos, POS_MIN)))
+ if (!bch2_btree_iter_rewind(iter))
return bkey_s_c_null;
- bch2_btree_iter_set_pos(iter, bkey_predecessor(pos));
-
return bch2_btree_iter_peek_prev(iter);
}
static inline struct bkey_s_c
__bch2_btree_iter_peek_slot_extents(struct btree_iter *iter)
{
- struct btree_iter_level *l = &iter->l[0];
- struct btree_node_iter node_iter;
struct bkey_s_c k;
- struct bkey n;
- int ret;
+ struct bpos pos, next_start;
/* keys & holes can't span inode numbers: */
if (iter->pos.offset == KEY_OFFSET_MAX) {
if (iter->pos.inode == KEY_INODE_MAX)
return bkey_s_c_null;
- bch2_btree_iter_set_pos(iter, bkey_successor(iter->pos));
-
- ret = bch2_btree_iter_traverse(iter);
- if (unlikely(ret))
- return bkey_s_c_err(ret);
+ bch2_btree_iter_set_pos(iter, bkey_successor(iter, iter->pos));
}
- /*
- * iterator is now at the correct position for inserting at iter->pos,
- * but we need to keep iterating until we find the first non whiteout so
- * we know how big a hole we have, if any:
- */
-
- node_iter = l->iter;
- k = __btree_iter_unpack(iter, l, &iter->k,
- bch2_btree_node_iter_peek(&node_iter, l->b));
-
- if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) {
- /*
- * We're not setting iter->uptodate because the node iterator
- * doesn't necessarily point at the key we're returning:
- */
+ pos = iter->pos;
+ k = bch2_btree_iter_peek(iter);
+ iter->pos = pos;
- EBUG_ON(bkey_cmp(k.k->p, iter->pos) <= 0);
- bch2_btree_iter_verify_level(iter, 0);
+ if (bkey_err(k))
return k;
- }
- /* hole */
+ if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0)
+ return k;
- if (!k.k)
- k.k = &l->b->key.k;
+ next_start = k.k ? bkey_start_pos(k.k) : POS_MAX;
- bkey_init(&n);
- n.p = iter->pos;
- bch2_key_resize(&n,
+ bkey_init(&iter->k);
+ iter->k.p = iter->pos;
+ bch2_key_resize(&iter->k,
min_t(u64, KEY_SIZE_MAX,
- (k.k->p.inode == n.p.inode
- ? bkey_start_offset(k.k)
+ (next_start.inode == iter->pos.inode
+ ? next_start.offset
: KEY_OFFSET_MAX) -
- n.p.offset));
+ iter->pos.offset));
- EBUG_ON(!n.size);
+ EBUG_ON(!iter->k.size);
- iter->k = n;
- iter->uptodate = BTREE_ITER_UPTODATE;
+ bch2_btree_iter_verify_entry_exit(iter);
+ bch2_btree_iter_verify(iter);
- bch2_btree_iter_verify_level(iter, 0);
return (struct bkey_s_c) { &iter->k, NULL };
}
int ret;
EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
- bch2_btree_iter_checks(iter);
+ bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify_entry_exit(iter);
- if (iter->uptodate == BTREE_ITER_UPTODATE)
- return btree_iter_peek_uptodate(iter);
-
- ret = bch2_btree_iter_traverse(iter);
- if (unlikely(ret))
- return bkey_s_c_err(ret);
+ btree_iter_set_search_pos(iter, btree_iter_search_key(iter));
if (iter->flags & BTREE_ITER_IS_EXTENTS)
return __bch2_btree_iter_peek_slot_extents(iter);
- k = __btree_iter_peek_all(iter, l, &iter->k);
+ ret = btree_iter_traverse(iter);
+ if (unlikely(ret))
+ return bkey_s_c_err(ret);
+
+ k = btree_iter_level_peek_all(iter, l, &iter->k);
EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0);
k = (struct bkey_s_c) { &iter->k, NULL };
}
- iter->uptodate = BTREE_ITER_UPTODATE;
- bch2_btree_iter_verify_level(iter, 0);
+ bch2_btree_iter_verify_entry_exit(iter);
+ bch2_btree_iter_verify(iter);
return k;
}
struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
{
- if (unlikely(!bkey_cmp(iter->k.p, POS_MAX)))
+ if (!bch2_btree_iter_advance(iter))
return bkey_s_c_null;
- bch2_btree_iter_set_pos(iter,
- (iter->flags & BTREE_ITER_IS_EXTENTS)
- ? iter->k.p
- : bkey_successor(iter->k.p));
+ return bch2_btree_iter_peek_slot(iter);
+}
+
+struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
+{
+ if (!bch2_btree_iter_rewind(iter))
+ return bkey_s_c_null;
return bch2_btree_iter_peek_slot(iter);
}
int ret;
EBUG_ON(btree_iter_type(iter) != BTREE_ITER_CACHED);
- bch2_btree_iter_checks(iter);
+ bch2_btree_iter_verify(iter);
- ret = bch2_btree_iter_traverse(iter);
+ ret = btree_iter_traverse(iter);
if (unlikely(ret))
return bkey_s_c_err(ret);
}
static inline void bch2_btree_iter_init(struct btree_trans *trans,
- struct btree_iter *iter, enum btree_id btree_id,
- struct bpos pos, unsigned flags)
+ struct btree_iter *iter, enum btree_id btree_id)
{
struct bch_fs *c = trans->c;
unsigned i;
- if (btree_node_type_is_extents(btree_id) &&
- !(flags & BTREE_ITER_NODES))
- flags |= BTREE_ITER_IS_EXTENTS;
-
iter->trans = trans;
- iter->pos = pos;
- bkey_init(&iter->k);
- iter->k.p = pos;
- iter->flags = flags;
iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
iter->btree_id = btree_id;
iter->level = 0;
iter->min_depth = 0;
- iter->locks_want = flags & BTREE_ITER_INTENT ? 1 : 0;
+ iter->locks_want = 0;
iter->nodes_locked = 0;
iter->nodes_intent_locked = 0;
for (i = 0; i < ARRAY_SIZE(iter->l); i++)
return 0;
BUG_ON(trans->iters + iter->idx != iter);
+ BUG_ON(!btree_iter_live(trans, iter));
ret = btree_iter_err(iter);
if (IS_ERR_OR_NULL(iter))
return 0;
- trans->iters_touched &= ~(1ULL << iter->idx);
+ set_btree_iter_dontneed(trans, iter);
return bch2_trans_iter_put(trans, iter);
}
-static int bch2_trans_realloc_iters(struct btree_trans *trans,
- unsigned new_size)
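+/* Out of iterator slots: dump every iterator and pending update, then panic: */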
+noinline __cold
+static void btree_trans_iter_alloc_fail(struct btree_trans *trans)
{
- void *p, *new_iters, *new_updates, *new_updates2;
- size_t iters_bytes;
- size_t updates_bytes;
-
- new_size = roundup_pow_of_two(new_size);
-
- BUG_ON(new_size > BTREE_ITER_MAX);
-
- if (new_size <= trans->size)
- return 0;
-
- BUG_ON(trans->used_mempool);
- bch2_trans_unlock(trans);
-
- iters_bytes = sizeof(struct btree_iter) * new_size;
- updates_bytes = sizeof(struct btree_insert_entry) * new_size;
-
- p = kmalloc(iters_bytes +
- updates_bytes +
- updates_bytes, GFP_NOFS);
- if (p)
- goto success;
-
- p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
- new_size = BTREE_ITER_MAX;
-
- trans->used_mempool = true;
-success:
- new_iters = p; p += iters_bytes;
- new_updates = p; p += updates_bytes;
- new_updates2 = p; p += updates_bytes;
-
- memcpy(new_iters, trans->iters,
- sizeof(struct btree_iter) * trans->nr_iters);
- memcpy(new_updates, trans->updates,
- sizeof(struct btree_insert_entry) * trans->nr_updates);
- memcpy(new_updates2, trans->updates2,
- sizeof(struct btree_insert_entry) * trans->nr_updates2);
-
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
- memset(trans->iters, POISON_FREE,
- sizeof(struct btree_iter) * trans->nr_iters +
- sizeof(struct btree_insert_entry) * trans->nr_iters);
-
- if (trans->iters != trans->iters_onstack)
- kfree(trans->iters);
-
- trans->iters = new_iters;
- trans->updates = new_updates;
- trans->updates2 = new_updates2;
- trans->size = new_size;
+ struct btree_iter *iter;
+ struct btree_insert_entry *i;
+ char buf[100];
- if (trans->iters_live) {
- trace_trans_restart_iters_realloced(trans->ip, trans->size);
- return -EINTR;
+ trans_for_each_iter(trans, iter)
+ printk(KERN_ERR "iter: btree %s pos %s%s%s%s %pS\n",
+ bch2_btree_ids[iter->btree_id],
+ (bch2_bpos_to_text(&PBUF(buf), iter->pos), buf),
+ btree_iter_live(trans, iter) ? " live" : "",
+ (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "",
+ iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "",
+ (void *) iter->ip_allocated);
+
+ trans_for_each_update(trans, i) {
+ char buf[300];
+
+ bch2_bkey_val_to_text(&PBUF(buf), trans->c, bkey_i_to_s_c(i->k));
+ printk(KERN_ERR "update: btree %s %s\n",
+ bch2_btree_ids[i->iter->btree_id], buf);
}
-
- return 0;
+	panic("trans iter overflow\n");
}
static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans)
{
- unsigned idx = __ffs64(~trans->iters_linked);
+ unsigned idx;
- if (idx < trans->nr_iters)
- goto got_slot;
+ if (unlikely(trans->iters_linked ==
+ ~((~0ULL << 1) << (BTREE_ITER_MAX - 1))))
+ btree_trans_iter_alloc_fail(trans);
- if (trans->nr_iters == trans->size) {
- int ret;
-
- if (trans->nr_iters >= BTREE_ITER_MAX) {
- struct btree_iter *iter;
-
- trans_for_each_iter(trans, iter) {
- pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps",
- bch2_btree_ids[iter->btree_id],
- iter->pos.inode,
- iter->pos.offset,
- (trans->iters_live & (1ULL << iter->idx)) ? " live" : "",
- (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "",
- iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "",
- (void *) iter->ip_allocated);
- }
-
- panic("trans iter oveflow\n");
- }
+ idx = __ffs64(~trans->iters_linked);
- ret = bch2_trans_realloc_iters(trans, trans->size * 2);
- if (ret)
- return ERR_PTR(ret);
- }
-
- idx = trans->nr_iters++;
- BUG_ON(trans->nr_iters > trans->size);
-
- trans->iters[idx].idx = idx;
-got_slot:
- BUG_ON(trans->iters_linked & (1ULL << idx));
- trans->iters_linked |= 1ULL << idx;
- trans->iters[idx].flags = 0;
+ trans->iters_linked |= 1ULL << idx;
+ trans->iters[idx].idx = idx;
+ trans->iters[idx].flags = 0;
return &trans->iters[idx];
}
dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT;
}
-static inline struct bpos bpos_diff(struct bpos l, struct bpos r)
-{
- if (bkey_cmp(l, r) > 0)
- swap(l, r);
-
- return POS(r.inode - l.inode, r.offset - l.offset);
-}
-
-static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans,
- unsigned btree_id, struct bpos pos,
- unsigned flags)
+struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
+ unsigned btree_id, struct bpos pos,
+ unsigned locks_want,
+ unsigned depth,
+ unsigned flags)
{
struct btree_iter *iter, *best = NULL;
- BUG_ON(trans->nr_iters > BTREE_ITER_MAX);
+ if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES &&
+ !btree_type_has_snapshots(btree_id))
+ flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
+
+ if (!(flags & BTREE_ITER_ALL_SNAPSHOTS))
+ pos.snapshot = btree_type_has_snapshots(btree_id)
+ ? U32_MAX : 0;
trans_for_each_iter(trans, iter) {
if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE))
continue;
if (best &&
- bkey_cmp(bpos_diff(best->pos, pos),
- bpos_diff(iter->pos, pos)) < 0)
+ bkey_cmp(bpos_diff(best->real_pos, pos),
+ bpos_diff(iter->real_pos, pos)) > 0)
continue;
best = iter;
if (!best) {
iter = btree_trans_iter_alloc(trans);
- if (IS_ERR(iter))
- return iter;
-
- bch2_btree_iter_init(trans, iter, btree_id, pos, flags);
- } else if ((trans->iters_live & (1ULL << best->idx)) ||
- (best->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) {
+ bch2_btree_iter_init(trans, iter, btree_id);
+ } else if (btree_iter_keep(trans, best)) {
iter = btree_trans_iter_alloc(trans);
- if (IS_ERR(iter))
- return iter;
-
btree_iter_copy(iter, best);
} else {
iter = best;
}
- iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
- iter->flags &= ~BTREE_ITER_USER_FLAGS;
- iter->flags |= flags & BTREE_ITER_USER_FLAGS;
+ trans->iters_live |= 1ULL << iter->idx;
+ trans->iters_touched |= 1ULL << iter->idx;
- if (iter->flags & BTREE_ITER_INTENT)
- bch2_btree_iter_upgrade(iter, 1);
- else
- bch2_btree_iter_downgrade(iter);
+ if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES &&
+ btree_node_type_is_extents(btree_id) &&
+ !(flags & BTREE_ITER_NOT_EXTENTS) &&
+ !(flags & BTREE_ITER_ALL_SNAPSHOTS))
+ flags |= BTREE_ITER_IS_EXTENTS;
- BUG_ON(iter->btree_id != btree_id);
- BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE);
- BUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT);
- BUG_ON(iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT);
- BUG_ON(trans->iters_live & (1ULL << iter->idx));
+ iter->flags = flags;
- trans->iters_live |= 1ULL << iter->idx;
- trans->iters_touched |= 1ULL << iter->idx;
+ iter->snapshot = pos.snapshot;
- return iter;
-}
+ locks_want = min(locks_want, BTREE_MAX_DEPTH);
-struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
- enum btree_id btree_id,
- struct bpos pos, unsigned flags)
-{
- struct btree_iter *iter =
- __btree_trans_get_iter(trans, btree_id, pos, flags);
+ if (locks_want > iter->locks_want) {
+ iter->locks_want = locks_want;
+ btree_iter_get_locks(iter, true, false);
+ } else if (locks_want < iter->locks_want) {
+ __bch2_btree_iter_downgrade(iter, locks_want);
+ }
+
+ while (iter->level < depth) {
+ btree_node_unlock(iter, iter->level);
+ iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT;
+ iter->level++;
+ }
+
+ while (iter->level > depth)
+ iter->l[--iter->level].b = BTREE_ITER_NO_NODE_INIT;
+
+ iter->min_depth = depth;
+
+ bch2_btree_iter_set_pos(iter, pos);
+ btree_iter_set_search_pos(iter, btree_iter_search_key(iter));
- if (!IS_ERR(iter))
- __bch2_btree_iter_set_pos(iter, pos,
- btree_node_type_is_extents(btree_id));
return iter;
}
unsigned flags)
{
struct btree_iter *iter =
- __btree_trans_get_iter(trans, btree_id, pos,
- flags|BTREE_ITER_NODES);
- unsigned i;
+ __bch2_trans_get_iter(trans, btree_id, pos,
+ locks_want, depth,
+ BTREE_ITER_NODES|
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_ALL_SNAPSHOTS|
+ flags);
- BUG_ON(IS_ERR(iter));
BUG_ON(bkey_cmp(iter->pos, pos));
-
- iter->locks_want = locks_want;
- iter->level = depth;
- iter->min_depth = depth;
-
- for (i = 0; i < ARRAY_SIZE(iter->l); i++)
- iter->l[i].b = NULL;
- iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT;
+ BUG_ON(iter->locks_want != min(locks_want, BTREE_MAX_DEPTH));
+ BUG_ON(iter->level != depth);
+ BUG_ON(iter->min_depth != depth);
+ iter->ip_allocated = _RET_IP_;
return iter;
}
struct btree_iter *iter;
iter = btree_trans_iter_alloc(trans);
- if (IS_ERR(iter))
- return iter;
-
btree_iter_copy(iter, src);
trans->iters_live |= 1ULL << iter->idx;
* We don't need to preserve this iter since it's cheap to copy it
* again - this will cause trans_iter_put() to free it right away:
*/
- trans->iters_touched &= ~(1ULL << iter->idx);
+ set_btree_iter_dontneed(trans, iter);
return iter;
}
trans->iters_touched &= trans->iters_live;
- trans->need_reset = 0;
trans->nr_updates = 0;
trans->nr_updates2 = 0;
trans->mem_top = 0;
+ trans->hooks = NULL;
trans->extra_journal_entries = NULL;
trans->extra_journal_entry_u64s = 0;
(void *) &trans->fs_usage_deltas->memset_start);
}
+ if (!(flags & TRANS_RESET_NOUNLOCK))
+ bch2_trans_cond_resched(trans);
+
if (!(flags & TRANS_RESET_NOTRAVERSE))
bch2_btree_iter_traverse_all(trans);
}
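+/*
+ * Allocate the fixed-size iterator and update arrays, preferring the percpu
+ * buffer over the mempool:
+ */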
+static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c)
+{
+ size_t iters_bytes = sizeof(struct btree_iter) * BTREE_ITER_MAX;
+ size_t updates_bytes = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX;
+ void *p = NULL;
+
+ BUG_ON(trans->used_mempool);
+
+#ifdef __KERNEL__
+ p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL);
+#endif
+ if (!p)
+ p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
+
+ trans->iters = p; p += iters_bytes;
+ trans->updates = p; p += updates_bytes;
+ trans->updates2 = p; p += updates_bytes;
+}
+
void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
unsigned expected_nr_iters,
size_t expected_mem_bytes)
{
- memset(trans, 0, offsetof(struct btree_trans, iters_onstack));
+ memset(trans, 0, sizeof(*trans));
+ trans->c = c;
+ trans->ip = _RET_IP_;
/*
* reallocating iterators currently completely breaks
- * bch2_trans_iter_put():
+	 * bch2_trans_iter_put(), so we always allocate the max:
*/
- expected_nr_iters = BTREE_ITER_MAX;
-
- trans->c = c;
- trans->ip = _RET_IP_;
- trans->size = ARRAY_SIZE(trans->iters_onstack);
- trans->iters = trans->iters_onstack;
- trans->updates = trans->updates_onstack;
- trans->updates2 = trans->updates2_onstack;
- trans->fs_usage_deltas = NULL;
+ bch2_trans_alloc_iters(trans, c);
- if (expected_nr_iters > trans->size)
- bch2_trans_realloc_iters(trans, expected_nr_iters);
+ if (expected_mem_bytes) {
+ trans->mem_bytes = roundup_pow_of_two(expected_mem_bytes);
+ trans->mem = kmalloc(trans->mem_bytes, GFP_KERNEL|__GFP_NOFAIL);
+ }
- if (expected_mem_bytes)
- bch2_trans_preload_mem(trans, expected_mem_bytes);
+ trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
#ifdef CONFIG_BCACHEFS_DEBUG
trans->pid = current->pid;
int bch2_trans_exit(struct btree_trans *trans)
{
+ struct bch_fs *c = trans->c;
+
bch2_trans_unlock(trans);
#ifdef CONFIG_BCACHEFS_DEBUG
+ if (trans->iters_live) {
+ struct btree_iter *iter;
+
+ bch_err(c, "btree iterators leaked!");
+ trans_for_each_iter(trans, iter)
+ if (btree_iter_live(trans, iter))
+ printk(KERN_ERR " btree %s allocated at %pS\n",
+ bch2_btree_ids[iter->btree_id],
+ (void *) iter->ip_allocated);
+ /* Be noisy about this: */
+ bch2_fatal_error(c);
+ }
+
mutex_lock(&trans->c->btree_trans_lock);
list_del(&trans->list);
mutex_unlock(&trans->c->btree_trans_lock);
#endif
+ srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
+
bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres);
kfree(trans->fs_usage_deltas);
kfree(trans->mem);
- if (trans->used_mempool)
+
+#ifdef __KERNEL__
+ /*
+ * Userspace doesn't have a real percpu implementation:
+ */
+ trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters);
+#endif
+ if (trans->iters)
mempool_free(trans->iters, &trans->c->btree_iters_pool);
- else if (trans->iters != trans->iters_onstack)
- kfree(trans->iters);
+
trans->mem = (void *) 0x1;
trans->iters = (void *) 0x1;
return trans->error ? -EIO : 0;
}
-static void bch2_btree_iter_node_to_text(struct printbuf *out,
- struct btree_bkey_cached_common *_b,
- enum btree_iter_type type)
+static void __maybe_unused
+bch2_btree_iter_node_to_text(struct printbuf *out,
+ struct btree_bkey_cached_common *_b,
+ enum btree_iter_type type)
{
- pr_buf(out, " %px l=%u %s:",
- _b, _b->level, bch2_btree_ids[_b->btree_id]);
+ pr_buf(out, " l=%u %s:",
+ _b->level, bch2_btree_ids[_b->btree_id]);
bch2_bpos_to_text(out, btree_node_pos(_b, type));
}
+#ifdef CONFIG_BCACHEFS_DEBUG
+static bool trans_has_btree_nodes_locked(struct btree_trans *trans)
+{
+ struct btree_iter *iter;
+
+ trans_for_each_iter(trans, iter)
+ if (btree_iter_type(iter) != BTREE_ITER_CACHED &&
+ iter->nodes_locked)
+ return true;
+ return false;
+}
+#endif
+
void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
{
#ifdef CONFIG_BCACHEFS_DEBUG
mutex_lock(&c->btree_trans_lock);
list_for_each_entry(trans, &c->btree_trans_list, list) {
- pr_buf(out, "%i %px %ps\n", trans->pid, trans, (void *) trans->ip);
+ if (!trans_has_btree_nodes_locked(trans))
+ continue;
+
+ pr_buf(out, "%i %ps\n", trans->pid, (void *) trans->ip);
trans_for_each_iter(trans, iter) {
if (!iter->nodes_locked)
continue;
- pr_buf(out, " iter %u %s:",
+ pr_buf(out, " iter %u %c %s:",
iter->idx,
+ btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b',
bch2_btree_ids[iter->btree_id]);
bch2_bpos_to_text(out, iter->pos);
pr_buf(out, "\n");
b = READ_ONCE(trans->locking);
if (b) {
- pr_buf(out, " locking iter %u l=%u %s:",
+ iter = &trans->iters[trans->locking_iter_idx];
+ pr_buf(out, " locking iter %u %c l=%u %s:",
trans->locking_iter_idx,
+ btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b',
trans->locking_level,
bch2_btree_ids[trans->locking_btree_id]);
bch2_bpos_to_text(out, trans->locking_pos);
-
pr_buf(out, " node ");
bch2_btree_iter_node_to_text(out,
(void *) b,
- btree_iter_type(&trans->iters[trans->locking_iter_idx]));
+ btree_iter_type(iter));
pr_buf(out, "\n");
}
}
void bch2_fs_btree_iter_exit(struct bch_fs *c)
{
mempool_exit(&c->btree_iters_pool);
+ cleanup_srcu_struct(&c->btree_trans_barrier);
}
int bch2_fs_btree_iter_init(struct bch_fs *c)
INIT_LIST_HEAD(&c->btree_trans_list);
mutex_init(&c->btree_trans_lock);
- return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
+ return init_srcu_struct(&c->btree_trans_barrier) ?:
+ mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
sizeof(struct btree_iter) * nr +
sizeof(struct btree_insert_entry) * nr +
sizeof(struct btree_insert_entry) * nr);
/* Iterate over iters within a transaction: */
-#define trans_for_each_iter_all(_trans, _iter) \
- for (_iter = (_trans)->iters; \
- _iter < (_trans)->iters + (_trans)->nr_iters; \
- _iter++)
-
static inline struct btree_iter *
__trans_next_iter(struct btree_trans *trans, unsigned idx)
{
- EBUG_ON(idx < trans->nr_iters && trans->iters[idx].idx != idx);
+ u64 l;
+
+ if (idx == BTREE_ITER_MAX)
+ return NULL;
- for (; idx < trans->nr_iters; idx++)
- if (trans->iters_linked & (1ULL << idx))
- return &trans->iters[idx];
+ l = trans->iters_linked >> idx;
+ if (!l)
+ return NULL;
- return NULL;
+ idx += __ffs64(l);
+ EBUG_ON(idx >= BTREE_ITER_MAX);
+ EBUG_ON(trans->iters[idx].idx != idx);
+ return &trans->iters[idx];
}
#define trans_for_each_iter(_trans, _iter) \
void bch2_trans_unlock(struct btree_trans *);
bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned);
-bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *, unsigned);
static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter,
unsigned new_locks_want)
new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
return iter->locks_want < new_locks_want
- ? (!iter->trans->nounlock
- ? __bch2_btree_iter_upgrade(iter, new_locks_want)
- : __bch2_btree_iter_upgrade_nounlock(iter, new_locks_want))
+ ? __bch2_btree_iter_upgrade(iter, new_locks_want)
: iter->uptodate <= BTREE_ITER_NEED_PEEK;
}
static inline void bch2_btree_iter_downgrade(struct btree_iter *iter)
{
- if (iter->locks_want > (iter->flags & BTREE_ITER_INTENT) ? 1 : 0)
- __bch2_btree_iter_downgrade(iter, 0);
+ unsigned new_locks_want = (iter->flags & BTREE_ITER_INTENT ? 1 : 0);
+
+ if (iter->locks_want > new_locks_want)
+ __bch2_btree_iter_downgrade(iter, new_locks_want);
}
void bch2_trans_downgrade(struct btree_trans *);
void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *);
-int __must_check __bch2_btree_iter_traverse(struct btree_iter *);
-
-static inline int __must_check
-bch2_btree_iter_traverse(struct btree_iter *iter)
-{
- return iter->uptodate >= BTREE_ITER_NEED_RELOCK
- ? __bch2_btree_iter_traverse(iter)
- : 0;
-}
+int __must_check bch2_btree_iter_traverse(struct btree_iter *);
int bch2_btree_iter_traverse_all(struct btree_trans *);
struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *);
-void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos);
-void __bch2_btree_iter_set_pos(struct btree_iter *, struct bpos, bool);
-void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos);
+bool bch2_btree_iter_advance(struct btree_iter *);
+bool bch2_btree_iter_rewind(struct btree_iter *);
-static inline int btree_iter_cmp(const struct btree_iter *l,
- const struct btree_iter *r)
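+/* Set iter->pos and reset iter->k to an empty, deleted key at that position: */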
+static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
+{
+ if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
+ new_pos.snapshot = iter->snapshot;
+
+ iter->k.type = KEY_TYPE_deleted;
+ iter->k.p.inode = iter->pos.inode = new_pos.inode;
+ iter->k.p.offset = iter->pos.offset = new_pos.offset;
+ iter->k.p.snapshot = iter->pos.snapshot = new_pos.snapshot;
+ iter->k.size = 0;
+}
+
+/* Sort order for locking btree iterators: */
+static inline int btree_iter_lock_cmp(const struct btree_iter *l,
+ const struct btree_iter *r)
{
return cmp_int(l->btree_id, r->btree_id) ?:
- -cmp_int(btree_iter_type(l), btree_iter_type(r)) ?:
+ -cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?:
bkey_cmp(l->pos, r->pos);
}
#define for_each_btree_key(_trans, _iter, _btree_id, \
_start, _flags, _k, _ret) \
- for ((_ret) = PTR_ERR_OR_ZERO((_iter) = \
- bch2_trans_get_iter((_trans), (_btree_id), \
- (_start), (_flags))) ?: \
- PTR_ERR_OR_ZERO(((_k) = \
- __bch2_btree_iter_peek(_iter, _flags)).k); \
- !_ret && (_k).k; \
- (_ret) = PTR_ERR_OR_ZERO(((_k) = \
- __bch2_btree_iter_next(_iter, _flags)).k))
+ for ((_iter) = bch2_trans_get_iter((_trans), (_btree_id), \
+ (_start), (_flags)), \
+ (_k) = __bch2_btree_iter_peek(_iter, _flags); \
+ !((_ret) = bkey_err(_k)) && (_k).k; \
+ (_k) = __bch2_btree_iter_next(_iter, _flags))
#define for_each_btree_key_continue(_iter, _flags, _k, _ret) \
for ((_k) = __bch2_btree_iter_peek(_iter, _flags); \
void bch2_trans_unlink_iters(struct btree_trans *);
struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id,
- struct bpos, unsigned);
+ struct bpos, unsigned,
+ unsigned, unsigned);
static inline struct btree_iter *
bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id,
struct bpos pos, unsigned flags)
{
struct btree_iter *iter =
- __bch2_trans_get_iter(trans, btree_id, pos, flags);
-
- if (!IS_ERR(iter))
- iter->ip_allocated = _THIS_IP_;
+ __bch2_trans_get_iter(trans, btree_id, pos,
+ (flags & BTREE_ITER_INTENT) != 0, 0,
+ flags);
+ iter->ip_allocated = _THIS_IP_;
return iter;
}
struct btree_iter *iter =
__bch2_trans_copy_iter(trans, src);
- if (!IS_ERR(iter))
- iter->ip_allocated = _THIS_IP_;
+ iter->ip_allocated = _THIS_IP_;
return iter;
-
}
struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *,
enum btree_id, struct bpos,
unsigned, unsigned, unsigned);
+static inline bool btree_iter_live(struct btree_trans *trans, struct btree_iter *iter)
+{
+ return (trans->iters_live & (1ULL << iter->idx)) != 0;
+}
+
+static inline bool btree_iter_keep(struct btree_trans *trans, struct btree_iter *iter)
+{
+ return btree_iter_live(trans, iter) ||
+ (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT);
+}
+
+static inline void set_btree_iter_dontneed(struct btree_trans *trans, struct btree_iter *iter)
+{
+ trans->iters_touched &= ~(1ULL << iter->idx);
+}
+
#define TRANS_RESET_NOTRAVERSE (1 << 0)
+#define TRANS_RESET_NOUNLOCK (1 << 1)
void bch2_trans_reset(struct btree_trans *, unsigned);
#include "journal.h"
#include "journal_reclaim.h"
+#include <linux/sched/mm.h>
#include <trace/events/bcachefs.h>
+static struct kmem_cache *bch2_key_cache;
+
static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
const void *obj)
{
const struct bkey_cached_key *key = arg->key;
return cmp_int(ck->key.btree_id, key->btree_id) ?:
- bkey_cmp(ck->key.pos, key->pos);
+ bpos_cmp(ck->key.pos, key->pos);
}
static const struct rhashtable_params bch2_btree_key_cache_params = {
};
__flatten
-static inline struct bkey_cached *
-btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
+inline struct bkey_cached *
+bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
{
struct bkey_cached_key key = {
.btree_id = btree_id,
BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
bch2_btree_key_cache_params));
memset(&ck->key, ~0, sizeof(ck->key));
+
+ atomic_long_dec(&c->nr_keys);
}
-static void bkey_cached_free(struct btree_key_cache *c,
+static void bkey_cached_free(struct btree_key_cache *bc,
struct bkey_cached *ck)
{
- list_move(&ck->list, &c->freed);
+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+
+ BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
+
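+	/* Record the SRCU grace period after which this entry may actually be freed: */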
+ ck->btree_trans_barrier_seq =
+ start_poll_synchronize_srcu(&c->btree_trans_barrier);
+
+ list_move_tail(&ck->list, &bc->freed);
+ bc->nr_freed++;
kfree(ck->k);
ck->k = NULL;
{
struct bkey_cached *ck;
- list_for_each_entry(ck, &c->freed, list)
- if (bkey_cached_lock_for_evict(ck))
- return ck;
+ ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO);
+ if (likely(ck)) {
+ INIT_LIST_HEAD(&ck->list);
+ six_lock_init(&ck->c.lock);
+ BUG_ON(!six_trylock_intent(&ck->c.lock));
+ BUG_ON(!six_trylock_write(&ck->c.lock));
+ return ck;
+ }
+
+ return NULL;
+}
- list_for_each_entry(ck, &c->clean, list)
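+/*
+ * Reuse an existing entry: first try the freed list, then evict a clean
+ * (non-dirty) entry from the hash table:
+ */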
+static struct bkey_cached *
+bkey_cached_reuse(struct btree_key_cache *c)
+{
+ struct bucket_table *tbl;
+ struct rhash_head *pos;
+ struct bkey_cached *ck;
+ unsigned i;
+
+ mutex_lock(&c->lock);
+ list_for_each_entry_reverse(ck, &c->freed, list)
if (bkey_cached_lock_for_evict(ck)) {
- bkey_cached_evict(c, ck);
+ c->nr_freed--;
+ list_del(&ck->list);
+ mutex_unlock(&c->lock);
return ck;
}
+ mutex_unlock(&c->lock);
- ck = kzalloc(sizeof(*ck), GFP_NOFS);
- if (!ck)
- return NULL;
-
- INIT_LIST_HEAD(&ck->list);
- six_lock_init(&ck->c.lock);
- BUG_ON(!six_trylock_intent(&ck->c.lock));
- BUG_ON(!six_trylock_write(&ck->c.lock));
+ rcu_read_lock();
+ tbl = rht_dereference_rcu(c->table.tbl, &c->table);
+ for (i = 0; i < tbl->size; i++)
+ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
+ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
+ bkey_cached_lock_for_evict(ck)) {
+ bkey_cached_evict(c, ck);
+ rcu_read_unlock();
+ return ck;
+ }
+ }
+ rcu_read_unlock();
- return ck;
+ return NULL;
}
static struct bkey_cached *
struct bpos pos)
{
struct bkey_cached *ck;
+ bool was_new = true;
ck = bkey_cached_alloc(c);
- if (!ck)
- return ERR_PTR(-ENOMEM);
+
+ if (unlikely(!ck)) {
+ ck = bkey_cached_reuse(c);
+ if (unlikely(!ck))
+ return ERR_PTR(-ENOMEM);
+
+ was_new = false;
+ }
ck->c.level = 0;
ck->c.btree_id = btree_id;
ck->key.btree_id = btree_id;
ck->key.pos = pos;
ck->valid = false;
+ ck->flags = 1U << BKEY_CACHED_ACCESSED;
- BUG_ON(ck->flags);
-
- if (rhashtable_lookup_insert_fast(&c->table,
+ if (unlikely(rhashtable_lookup_insert_fast(&c->table,
&ck->hash,
- bch2_btree_key_cache_params)) {
+ bch2_btree_key_cache_params))) {
/* We raced with another fill: */
- bkey_cached_free(c, ck);
+
+ if (likely(was_new)) {
+ six_unlock_write(&ck->c.lock);
+ six_unlock_intent(&ck->c.lock);
+ kfree(ck);
+ } else {
+ mutex_lock(&c->lock);
+ bkey_cached_free(c, ck);
+ mutex_unlock(&c->lock);
+ }
+
return NULL;
}
- list_move(&ck->list, &c->clean);
+ atomic_long_inc(&c->nr_keys);
+
six_unlock_write(&ck->c.lock);
return ck;
iter = bch2_trans_get_iter(trans, ck->key.btree_id,
ck->key.pos, BTREE_ITER_SLOTS);
- if (IS_ERR(iter))
- return PTR_ERR(iter);
-
k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
- if (ret) {
- bch2_trans_iter_put(trans, iter);
- return ret;
- }
+ if (ret)
+ goto err;
if (!bch2_btree_node_relock(ck_iter, 0)) {
- bch2_trans_iter_put(trans, iter);
trace_transaction_restart_ip(trans->ip, _THIS_IP_);
- return -EINTR;
+ ret = -EINTR;
+ goto err;
}
if (k.k->u64s > ck->u64s) {
new_u64s = roundup_pow_of_two(k.k->u64s);
new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS);
if (!new_k) {
- bch2_trans_iter_put(trans, iter);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto err;
}
}
bch2_btree_node_unlock_write(ck_iter->l[0].b, ck_iter);
/* We're not likely to need this iterator again: */
- bch2_trans_iter_free(trans, iter);
-
- return 0;
+ set_btree_iter_dontneed(trans, iter);
+err:
+ bch2_trans_iter_put(trans, iter);
+ return ret;
}
static int bkey_cached_check_fn(struct six_lock *lock, void *p)
const struct btree_iter *iter = p;
return ck->key.btree_id == iter->btree_id &&
- !bkey_cmp(ck->key.pos, iter->pos) ? 0 : -1;
+ !bpos_cmp(ck->key.pos, iter->pos) ? 0 : -1;
}
+__flatten
int bch2_btree_iter_traverse_cached(struct btree_iter *iter)
{
struct btree_trans *trans = iter->trans;
goto fill;
}
retry:
- ck = btree_key_cache_find(c, iter->btree_id, iter->pos);
+ ck = bch2_btree_key_cache_find(c, iter->btree_id, iter->pos);
if (!ck) {
if (iter->flags & BTREE_ITER_CACHED_NOCREATE) {
iter->l[0].b = NULL;
return 0;
}
- mutex_lock(&c->btree_key_cache.lock);
ck = btree_key_cache_create(&c->btree_key_cache,
iter->btree_id, iter->pos);
- mutex_unlock(&c->btree_key_cache.lock);
-
ret = PTR_ERR_OR_ZERO(ck);
if (ret)
goto err;
enum six_lock_type lock_want = __btree_lock_want(iter, 0);
if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want,
- bkey_cached_check_fn, iter)) {
+ bkey_cached_check_fn, iter, _THIS_IP_)) {
if (ck->key.btree_id != iter->btree_id ||
- bkey_cmp(ck->key.pos, iter->pos)) {
+ bpos_cmp(ck->key.pos, iter->pos)) {
goto retry;
}
}
if (ck->key.btree_id != iter->btree_id ||
- bkey_cmp(ck->key.pos, iter->pos)) {
+ bpos_cmp(ck->key.pos, iter->pos)) {
six_unlock_type(&ck->c.lock, lock_want);
goto retry;
}
goto err;
}
+ if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
+ set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
+
iter->uptodate = BTREE_ITER_NEED_PEEK;
- bch2_btree_iter_downgrade(iter);
+
+ if (!(iter->flags & BTREE_ITER_INTENT))
+ bch2_btree_iter_downgrade(iter);
+ else if (!iter->locks_want) {
+ if (!__bch2_btree_iter_upgrade(iter, 1))
+ ret = -EINTR;
+ }
+
return ret;
err:
if (ret != -EINTR) {
static int btree_key_cache_flush_pos(struct btree_trans *trans,
struct bkey_cached_key key,
u64 journal_seq,
+ unsigned commit_flags,
bool evict)
{
struct bch_fs *c = trans->c;
struct journal *j = &c->journal;
struct btree_iter *c_iter = NULL, *b_iter = NULL;
- struct bkey_cached *ck;
+ struct bkey_cached *ck = NULL;
int ret;
b_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos,
BTREE_ITER_SLOTS|
BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(b_iter);
- if (ret)
- goto out;
-
c_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos,
BTREE_ITER_CACHED|
BTREE_ITER_CACHED_NOFILL|
BTREE_ITER_CACHED_NOCREATE|
BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(c_iter);
- if (ret)
- goto out;
retry:
ret = bch2_btree_iter_traverse(c_iter);
if (ret)
BTREE_INSERT_NOUNLOCK|
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE|
- BTREE_INSERT_USE_ALLOC_RESERVE|
- BTREE_INSERT_JOURNAL_RESERVED|
- BTREE_INSERT_JOURNAL_RECLAIM);
+ (ck->journal.seq == journal_last_seq(j)
+ ? BTREE_INSERT_JOURNAL_RESERVED
+ : 0)|
+ commit_flags);
err:
if (ret == -EINTR)
goto retry;
- BUG_ON(ret && !bch2_journal_error(j));
+ if (ret == -EAGAIN)
+ goto out;
- if (ret)
+ if (ret) {
+ bch2_fs_fatal_err_on(!bch2_journal_error(j), c,
+ "error flushing key cache: %i", ret);
goto out;
+ }
bch2_journal_pin_drop(j, &ck->journal);
bch2_journal_preres_put(j, &ck->res);
- clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
+
+ BUG_ON(!btree_node_locked(c_iter, 0));
if (!evict) {
- mutex_lock(&c->btree_key_cache.lock);
- list_move_tail(&ck->list, &c->btree_key_cache.clean);
- mutex_unlock(&c->btree_key_cache.lock);
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
+ atomic_long_dec(&c->btree_key_cache.nr_dirty);
+ }
} else {
evict:
BUG_ON(!btree_node_intent_locked(c_iter, 0));
six_lock_write(&ck->c.lock, NULL, NULL);
- mutex_lock(&c->btree_key_cache.lock);
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
+ atomic_long_dec(&c->btree_key_cache.nr_dirty);
+ }
+
bkey_cached_evict(&c->btree_key_cache, ck);
+
+ mutex_lock(&c->btree_key_cache.lock);
bkey_cached_free(&c->btree_key_cache, ck);
mutex_unlock(&c->btree_key_cache.lock);
}
return ret;
}
-static void btree_key_cache_journal_flush(struct journal *j,
- struct journal_entry_pin *pin,
- u64 seq)
+int bch2_btree_key_cache_journal_flush(struct journal *j,
+ struct journal_entry_pin *pin, u64 seq)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bkey_cached *ck =
container_of(pin, struct bkey_cached, journal);
struct bkey_cached_key key;
struct btree_trans trans;
+ int ret = 0;
+
+ int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
six_lock_read(&ck->c.lock, NULL, NULL);
key = ck->key;
if (ck->journal.seq != seq ||
!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
six_unlock_read(&ck->c.lock);
- return;
+ goto unlock;
}
six_unlock_read(&ck->c.lock);
bch2_trans_init(&trans, c, 0, 0);
- btree_key_cache_flush_pos(&trans, key, seq, false);
+ ret = btree_key_cache_flush_pos(&trans, key, seq,
+ BTREE_INSERT_JOURNAL_RECLAIM, false);
bch2_trans_exit(&trans);
+unlock:
+ srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
+
+ return ret;
}
/*
struct bkey_cached_key key = { id, pos };
/* Fastpath - assume it won't be found: */
- if (!btree_key_cache_find(c, id, pos))
+ if (!bch2_btree_key_cache_find(c, id, pos))
return 0;
- return btree_key_cache_flush_pos(trans, key, 0, true);
+ return btree_key_cache_flush_pos(trans, key, 0, 0, true);
}
bool bch2_btree_insert_key_cached(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct bkey_cached *ck = (void *) iter->l[0].b;
+ bool kick_reclaim = false;
BUG_ON(insert->u64s > ck->u64s);
ck->valid = true;
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- mutex_lock(&c->btree_key_cache.lock);
- list_del_init(&ck->list);
-
set_bit(BKEY_CACHED_DIRTY, &ck->flags);
- mutex_unlock(&c->btree_key_cache.lock);
+ atomic_long_inc(&c->btree_key_cache.nr_dirty);
+
+ if (bch2_nr_btree_keys_need_flush(c))
+ kick_reclaim = true;
}
bch2_journal_pin_update(&c->journal, trans->journal_res.seq,
- &ck->journal, btree_key_cache_journal_flush);
+ &ck->journal, bch2_btree_key_cache_journal_flush);
+
+ if (kick_reclaim)
+ journal_reclaim_kick(&c->journal);
return true;
}
void bch2_btree_key_cache_verify_clean(struct btree_trans *trans,
enum btree_id id, struct bpos pos)
{
- BUG_ON(btree_key_cache_find(trans->c, id, pos));
+ BUG_ON(bch2_btree_key_cache_find(trans->c, id, pos));
}
#endif
-void bch2_fs_btree_key_cache_exit(struct btree_key_cache *c)
+static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct bch_fs *c = container_of(shrink, struct bch_fs,
+ btree_key_cache.shrink);
+ struct btree_key_cache *bc = &c->btree_key_cache;
+ struct bucket_table *tbl;
+ struct bkey_cached *ck, *t;
+ size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
+ unsigned start, flags;
+ int srcu_idx;
+
+ /* Return -1 if we can't do anything right now */
+ if (sc->gfp_mask & __GFP_FS)
+ mutex_lock(&bc->lock);
+ else if (!mutex_trylock(&bc->lock))
+ return -1;
+
+ srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
+ flags = memalloc_nofs_save();
+
+ /*
+ * Newest freed entries are at the end of the list - once we hit one
+ * that's too new to be freed, we can bail out:
+ */
+ list_for_each_entry_safe(ck, t, &bc->freed, list) {
+ if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
+ ck->btree_trans_barrier_seq))
+ break;
+
+ list_del(&ck->list);
+ kmem_cache_free(bch2_key_cache, ck);
+ bc->nr_freed--;
+ scanned++;
+ freed++;
+ }
+
+ if (scanned >= nr)
+ goto out;
+
+ rcu_read_lock();
+ tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
+ if (bc->shrink_iter >= tbl->size)
+ bc->shrink_iter = 0;
+ start = bc->shrink_iter;
+
+ do {
+ struct rhash_head *pos, *next;
+
+ pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter));
+
+ while (!rht_is_a_nulls(pos)) {
+ next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
+ ck = container_of(pos, struct bkey_cached, hash);
+
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags))
+ goto next;
+
+ if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
+ clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
+ else if (bkey_cached_lock_for_evict(ck)) {
+ bkey_cached_evict(bc, ck);
+ bkey_cached_free(bc, ck);
+ }
+
+ scanned++;
+ if (scanned >= nr)
+ break;
+next:
+ pos = next;
+ }
+
+ bc->shrink_iter++;
+ if (bc->shrink_iter >= tbl->size)
+ bc->shrink_iter = 0;
+ } while (scanned < nr && bc->shrink_iter != start);
+
+ rcu_read_unlock();
+out:
+ memalloc_nofs_restore(flags);
+ srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
+ mutex_unlock(&bc->lock);
+
+ return freed;
+}
+
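A note on the shrinker contract as used here (standard kernel behaviour, not something this patch changes): the count callback below reports how many cached keys could plausibly be reclaimed, while the scan callback above returns the number it actually freed. Returning -1 from the scan path is equivalent to SHRINK_STOP ((~0UL) once cast to unsigned long), which tells the shrinker core to back off when the key cache lock can't be taken from a reclaim context without __GFP_FS.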
+static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
+ struct shrink_control *sc)
{
+ struct bch_fs *c = container_of(shrink, struct bch_fs,
+ btree_key_cache.shrink);
+ struct btree_key_cache *bc = &c->btree_key_cache;
+
+ return atomic_long_read(&bc->nr_keys);
+}
+
+void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
+{
+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+ struct bucket_table *tbl;
struct bkey_cached *ck, *n;
+ struct rhash_head *pos;
+ unsigned i;
- mutex_lock(&c->lock);
- list_for_each_entry_safe(ck, n, &c->clean, list) {
+ if (bc->shrink.list.next)
+ unregister_shrinker(&bc->shrink);
+
+ mutex_lock(&bc->lock);
+
+ rcu_read_lock();
+ tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
+ for (i = 0; i < tbl->size; i++)
+ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
+ bkey_cached_evict(bc, ck);
+ list_add(&ck->list, &bc->freed);
+ }
+ rcu_read_unlock();
+
+ list_for_each_entry_safe(ck, n, &bc->freed, list) {
+ cond_resched();
+
+ bch2_journal_pin_drop(&c->journal, &ck->journal);
+ bch2_journal_preres_put(&c->journal, &ck->res);
+
+ list_del(&ck->list);
kfree(ck->k);
- kfree(ck);
+ kmem_cache_free(bch2_key_cache, ck);
}
- list_for_each_entry_safe(ck, n, &c->freed, list)
- kfree(ck);
- mutex_unlock(&c->lock);
- rhashtable_destroy(&c->table);
+ BUG_ON(atomic_long_read(&bc->nr_dirty) && !bch2_journal_error(&c->journal));
+ BUG_ON(atomic_long_read(&bc->nr_keys));
+
+ mutex_unlock(&bc->lock);
+
+ if (bc->table_init_done)
+ rhashtable_destroy(&bc->table);
}
void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
{
mutex_init(&c->lock);
INIT_LIST_HEAD(&c->freed);
- INIT_LIST_HEAD(&c->clean);
}
int bch2_fs_btree_key_cache_init(struct btree_key_cache *c)
{
- return rhashtable_init(&c->table, &bch2_btree_key_cache_params);
+ int ret;
+
+ c->shrink.seeks = 1;
+ c->shrink.count_objects = bch2_btree_key_cache_count;
+ c->shrink.scan_objects = bch2_btree_key_cache_scan;
+
+ ret = register_shrinker(&c->shrink);
+ if (ret)
+ return ret;
+
+ ret = rhashtable_init(&c->table, &bch2_btree_key_cache_params);
+ if (ret)
+ return ret;
+
+ c->table_init_done = true;
+ return 0;
}
void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
{
- struct bucket_table *tbl;
- struct bkey_cached *ck;
- struct rhash_head *pos;
- size_t i;
+ pr_buf(out, "nr_freed:\t%zu\n", c->nr_freed);
+ pr_buf(out, "nr_keys:\t%zu\n", atomic_long_read(&c->nr_keys));
+ pr_buf(out, "nr_dirty:\t%zu\n", atomic_long_read(&c->nr_dirty));
+}
- mutex_lock(&c->lock);
- tbl = rht_dereference_rcu(c->table.tbl, &c->table);
+void bch2_btree_key_cache_exit(void)
+{
+ if (bch2_key_cache)
+ kmem_cache_destroy(bch2_key_cache);
+}
- for (i = 0; i < tbl->size; i++) {
- rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
- pr_buf(out, "%s:",
- bch2_btree_ids[ck->key.btree_id]);
- bch2_bpos_to_text(out, ck->key.pos);
+int __init bch2_btree_key_cache_init(void)
+{
+ bch2_key_cache = KMEM_CACHE(bkey_cached, 0);
+ if (!bch2_key_cache)
+ return -ENOMEM;
- if (test_bit(BKEY_CACHED_DIRTY, &ck->flags))
- pr_buf(out, " journal seq %llu", ck->journal.seq);
- pr_buf(out, "\n");
- }
- }
- mutex_unlock(&c->lock);
+ return 0;
}
#ifndef _BCACHEFS_BTREE_KEY_CACHE_H
#define _BCACHEFS_BTREE_KEY_CACHE_H
+static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c)
+{
+ size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
+ size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
+ size_t max_dirty = 1024 + nr_keys / 2;
+
+ return max_t(ssize_t, 0, nr_dirty - max_dirty);
+}
+
+static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c)
+{
+ size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty);
+ size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
+ size_t max_dirty = 4096 + (nr_keys * 3) / 4;
+
+ return nr_dirty > max_dirty &&
+ test_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags);
+}
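To make the scaling of these thresholds concrete (the numbers are purely illustrative): with nr_keys = 100,000 cached keys, bch2_nr_btree_keys_need_flush() starts reporting work for journal reclaim once nr_dirty exceeds 1024 + 100,000/2 = 51,024, while bch2_btree_key_cache_must_wait() begins throttling transactions that would dirty additional cached keys once nr_dirty exceeds 4096 + (3 * 100,000)/4 = 79,096 and journal reclaim has started.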
+
+int bch2_btree_key_cache_journal_flush(struct journal *,
+ struct journal_entry_pin *, u64);
+
+struct bkey_cached *
+bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos);
+
int bch2_btree_iter_traverse_cached(struct btree_iter *);
bool bch2_btree_insert_key_cached(struct btree_trans *,
void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *);
+void bch2_btree_key_cache_exit(void);
+int __init bch2_btree_key_cache_init(void);
+
#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */
return BTREE_NODE_UNLOCKED;
}
-static inline void __btree_node_unlock(struct btree_iter *iter, unsigned level)
+static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
{
int lock_type = btree_node_locked_type(iter, level);
mark_btree_node_unlocked(iter, level);
}
-static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
-{
- EBUG_ON(!level && iter->trans->nounlock);
-
- __btree_node_unlock(iter, level);
-}
-
static inline void __bch2_btree_iter_unlock(struct btree_iter *iter)
{
btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned,
struct btree_iter *, enum six_lock_type,
- six_lock_should_sleep_fn, void *);
+ six_lock_should_sleep_fn, void *,
+ unsigned long);
static inline bool btree_node_lock(struct btree *b,
struct bpos pos, unsigned level,
struct btree_iter *iter,
enum six_lock_type type,
- six_lock_should_sleep_fn should_sleep_fn, void *p)
+ six_lock_should_sleep_fn should_sleep_fn, void *p,
+ unsigned long ip)
{
struct btree_trans *trans = iter->trans;
- bool ret;
EBUG_ON(level >= BTREE_MAX_DEPTH);
EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx)));
-#ifdef CONFIG_BCACHEFS_DEBUG
- trans->locking = b;
- trans->locking_iter_idx = iter->idx;
- trans->locking_pos = pos;
- trans->locking_btree_id = iter->btree_id;
- trans->locking_level = level;
-#endif
- ret = likely(six_trylock_type(&b->c.lock, type)) ||
+ return likely(six_trylock_type(&b->c.lock, type)) ||
btree_node_lock_increment(trans, b, level, type) ||
__bch2_btree_node_lock(b, pos, level, iter, type,
- should_sleep_fn, p);
-
-#ifdef CONFIG_BCACHEFS_DEBUG
- trans->locking = NULL;
-#endif
- return ret;
+ should_sleep_fn, p, ip);
}
bool __bch2_btree_node_relock(struct btree_iter *, unsigned);
u16 data_offset;
u16 aux_data_offset;
u16 end_offset;
-
- struct bpos max_key;
};
struct btree_write {
struct btree_alloc {
struct open_buckets ob;
- BKEY_PADDED(k);
+ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX);
};
struct btree_bkey_cached_common {
u16 written;
u8 nsets;
u8 nr_key_bits;
+ u16 version_ondisk;
struct bkey_format format;
u8 byte_order;
u8 unpack_fn_len;
+ struct btree_write writes[2];
+
+ /* Key/pointer for this btree node */
+ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+
/*
* XXX: add a delete sequence number, so when bch2_btree_node_relock()
* fails because the lock sequence number has changed - i.e. the
/* lru list */
struct list_head list;
-
- struct btree_write writes[2];
-
-#ifdef CONFIG_BCACHEFS_DEBUG
- bool *expensive_debug_checks;
-#endif
-
- /* Key/pointer for this btree node */
- __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
};
struct btree_cache {
/* Number of elements in live + freeable lists */
unsigned used;
unsigned reserve;
+ atomic_t dirty;
struct shrinker shrink;
/*
#define BTREE_ITER_SET_POS_AFTER_COMMIT (1 << 8)
#define BTREE_ITER_CACHED_NOFILL (1 << 9)
#define BTREE_ITER_CACHED_NOCREATE (1 << 10)
-
-#define BTREE_ITER_USER_FLAGS \
- (BTREE_ITER_SLOTS \
- |BTREE_ITER_INTENT \
- |BTREE_ITER_PREFETCH \
- |BTREE_ITER_CACHED_NOFILL \
- |BTREE_ITER_CACHED_NOCREATE)
+#define BTREE_ITER_NOT_EXTENTS (1 << 11)
+#define BTREE_ITER_ALL_SNAPSHOTS (1 << 12)
enum btree_iter_uptodate {
BTREE_ITER_UPTODATE = 0,
struct btree_iter {
struct btree_trans *trans;
struct bpos pos;
+ /* what we're searching for/what the iterator actually points to: */
+ struct bpos real_pos;
struct bpos pos_after_commit;
+ /* When we're filtering by snapshot, the snapshot ID we're looking for: */
+ unsigned snapshot;
u16 flags;
u8 idx;
return iter->flags & BTREE_ITER_TYPE;
}
+static inline bool btree_iter_is_cached(const struct btree_iter *iter)
+{
+ return btree_iter_type(iter) == BTREE_ITER_CACHED;
+}
+
static inline struct btree_iter_level *iter_l(struct btree_iter *iter)
{
return iter->l + iter->level;
struct btree_key_cache {
struct mutex lock;
struct rhashtable table;
+ bool table_init_done;
struct list_head freed;
- struct list_head clean;
+ struct shrinker shrink;
+ unsigned shrink_iter;
+
+ size_t nr_freed;
+ atomic_long_t nr_keys;
+ atomic_long_t nr_dirty;
};
struct bkey_cached_key {
struct bpos pos;
} __attribute__((packed, aligned(4)));
-#define BKEY_CACHED_DIRTY 0
+#define BKEY_CACHED_ACCESSED 0
+#define BKEY_CACHED_DIRTY 1
struct bkey_cached {
struct btree_bkey_cached_common c;
unsigned long flags;
u8 u64s;
bool valid;
+ u32 btree_trans_barrier_seq;
struct bkey_cached_key key;
struct rhash_head hash;
struct btree_insert_entry {
unsigned trigger_flags;
+ u8 bkey_type;
+ enum btree_id btree_id:8;
+ u8 level;
unsigned trans_triggers_run:1;
+ unsigned is_extent:1;
struct bkey_i *k;
struct btree_iter *iter;
};
#define BTREE_ITER_MAX 32
#endif
+struct btree_trans_commit_hook;
+typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *);
+
+struct btree_trans_commit_hook {
+ btree_trans_commit_hook_fn *fn;
+ struct btree_trans_commit_hook *next;
+};
+
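The commit hook added here gives callers a way to run a callback from inside the transaction commit path; the hook list is walked in bch2_trans_commit_write_locked() further down, and a nonzero return from a hook aborts that commit attempt. A minimal sketch of how a caller might wire one up, assuming only the declarations in this patch (struct btree_trans_commit_hook, bch2_trans_commit_hook()); the wrapper struct and function names are hypothetical:

/* Illustrative only -- example_hook and example_hook_fn are hypothetical */
struct example_hook {
	struct btree_trans_commit_hook	h;	/* must stay live until the commit runs */
	u64				arg;	/* hypothetical per-hook state */
};

static int example_hook_fn(struct btree_trans *trans,
			   struct btree_trans_commit_hook *h)
{
	struct example_hook *e = container_of(h, struct example_hook, h);

	/* Called with btree write locks held, before the updates are inserted */
	return e->arg ? 0 : -EINVAL;
}

static void example_queue_hook(struct btree_trans *trans, struct example_hook *e)
{
	e->h.fn = example_hook_fn;
	bch2_trans_commit_hook(trans, &e->h);	/* registers the hook; the commit path walks trans->hooks */
}

Since only a pointer is chained into trans->hooks, the hook object has to remain valid until the commit has actually run.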
struct btree_trans {
struct bch_fs *c;
#ifdef CONFIG_BCACHEFS_DEBUG
pid_t pid;
#endif
unsigned long ip;
+ int srcu_idx;
- u64 iters_linked;
- u64 iters_live;
- u64 iters_touched;
-
- u8 nr_iters;
u8 nr_updates;
u8 nr_updates2;
- u8 size;
unsigned used_mempool:1;
unsigned error:1;
- unsigned nounlock:1;
- unsigned need_reset:1;
unsigned in_traverse_all:1;
+ u64 iters_linked;
+ u64 iters_live;
+ u64 iters_touched;
+
unsigned mem_top;
unsigned mem_bytes;
void *mem;
struct btree_insert_entry *updates2;
/* update path: */
+ struct btree_trans_commit_hook *hooks;
struct jset_entry *extra_journal_entries;
unsigned extra_journal_entry_u64s;
struct journal_entry_pin *journal_pin;
unsigned journal_u64s;
unsigned journal_preres_u64s;
struct replicas_delta_list *fs_usage_deltas;
-
- struct btree_iter iters_onstack[2];
- struct btree_insert_entry updates_onstack[2];
- struct btree_insert_entry updates2_onstack[2];
};
#define BTREE_FLAG(flag) \
BTREE_NODE_just_written,
BTREE_NODE_dying,
BTREE_NODE_fake,
- BTREE_NODE_old_extent_overwrite,
BTREE_NODE_need_rewrite,
+ BTREE_NODE_never_write,
};
BTREE_FLAG(read_in_flight);
BTREE_FLAG(read_error);
-BTREE_FLAG(dirty);
BTREE_FLAG(need_write);
BTREE_FLAG(noevict);
BTREE_FLAG(write_idx);
BTREE_FLAG(just_written);
BTREE_FLAG(dying);
BTREE_FLAG(fake);
-BTREE_FLAG(old_extent_overwrite);
BTREE_FLAG(need_rewrite);
+BTREE_FLAG(never_write);
static inline struct btree_write *btree_current_write(struct btree *b)
{
}
enum btree_node_type {
-#define x(kwd, val, name) BKEY_TYPE_##kwd = val,
+#define x(kwd, val) BKEY_TYPE_##kwd = val,
BCH_BTREE_IDS()
#undef x
- BKEY_TYPE_BTREE,
+ BKEY_TYPE_btree,
};
/* Type of a key in btree @id at level @level: */
static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id)
{
- return level ? BKEY_TYPE_BTREE : (enum btree_node_type) id;
+ return level ? BKEY_TYPE_btree : (enum btree_node_type) id;
}
/* Type of keys @b contains: */
static inline bool btree_node_type_is_extents(enum btree_node_type type)
{
switch (type) {
- case BKEY_TYPE_EXTENTS:
- case BKEY_TYPE_REFLINK:
+ case BKEY_TYPE_extents:
+ case BKEY_TYPE_reflink:
return true;
default:
return false;
return btree_node_type_is_extents(btree_iter_key_type(iter));
}
+#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \
+ ((1U << BKEY_TYPE_extents)| \
+ (1U << BKEY_TYPE_inodes)| \
+ (1U << BKEY_TYPE_stripes)| \
+ (1U << BKEY_TYPE_reflink)| \
+ (1U << BKEY_TYPE_btree))
+
+#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \
+ ((1U << BKEY_TYPE_alloc)| \
+ (1U << BKEY_TYPE_stripes))
+
#define BTREE_NODE_TYPE_HAS_TRIGGERS \
- ((1U << BKEY_TYPE_EXTENTS)| \
- (1U << BKEY_TYPE_ALLOC)| \
- (1U << BKEY_TYPE_INODES)| \
- (1U << BKEY_TYPE_REFLINK)| \
- (1U << BKEY_TYPE_EC)| \
- (1U << BKEY_TYPE_BTREE))
+ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \
+ BTREE_NODE_TYPE_HAS_MEM_TRIGGERS)
-#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \
- ((1U << BKEY_TYPE_EXTENTS)| \
- (1U << BKEY_TYPE_INODES)| \
- (1U << BKEY_TYPE_EC)| \
- (1U << BKEY_TYPE_REFLINK))
+#define BTREE_ID_HAS_SNAPSHOTS \
+ ((1U << BTREE_ID_extents)| \
+ (1U << BTREE_ID_inodes)| \
+ (1U << BTREE_ID_dirents)| \
+ (1U << BTREE_ID_xattrs))
+
+static inline bool btree_type_has_snapshots(enum btree_id id)
+{
+ return (1 << id) & BTREE_ID_HAS_SNAPSHOTS;
+}
enum btree_trigger_flags {
__BTREE_TRIGGER_NORUN, /* Don't run triggers at all */
BTREE_INSERT_ENOSPC,
BTREE_INSERT_NEED_MARK_REPLICAS,
BTREE_INSERT_NEED_JOURNAL_RES,
+ BTREE_INSERT_NEED_JOURNAL_RECLAIM,
};
enum btree_gc_coalesce_fail_reason {
__BTREE_INSERT_NOCHECK_RW,
__BTREE_INSERT_LAZY_RW,
__BTREE_INSERT_USE_RESERVE,
- __BTREE_INSERT_USE_ALLOC_RESERVE,
__BTREE_INSERT_JOURNAL_REPLAY,
__BTREE_INSERT_JOURNAL_RESERVED,
__BTREE_INSERT_JOURNAL_RECLAIM,
/* for copygc, or when merging btree nodes */
#define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE)
-#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE)
/* Insert is for journal replay - don't get journal reservations: */
#define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY)
int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
struct disk_reservation *, u64 *, int flags);
-int bch2_btree_delete_at_range(struct btree_trans *, struct btree_iter *,
- struct bpos, u64 *);
+int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id,
+ struct bpos, struct bpos, u64 *);
int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
struct bpos, struct bpos, u64 *);
int bch2_trans_update(struct btree_trans *, struct btree_iter *,
struct bkey_i *, enum btree_trigger_flags);
+void bch2_trans_commit_hook(struct btree_trans *,
+ struct btree_trans_commit_hook *);
int __bch2_trans_commit(struct btree_trans *);
/**
#include "btree_iter.h"
#include "btree_locking.h"
#include "buckets.h"
+#include "error.h"
#include "extents.h"
#include "journal.h"
#include "journal_reclaim.h"
struct bkey_s_c k;
struct bkey_s_c_btree_ptr_v2 bp;
struct bkey unpacked;
+ char buf1[100], buf2[100];
BUG_ON(!b->c.level);
break;
bp = bkey_s_c_to_btree_ptr_v2(k);
- BUG_ON(bkey_cmp(next_node, bp.v->min_key));
+ if (bpos_cmp(next_node, bp.v->min_key)) {
+ bch2_dump_btree_node(c, b);
+ panic("expected next min_key %s got %s\n",
+ (bch2_bpos_to_text(&PBUF(buf1), next_node), buf1),
+ (bch2_bpos_to_text(&PBUF(buf2), bp.v->min_key), buf2));
+ }
bch2_btree_node_iter_advance(&iter, b);
if (bch2_btree_node_iter_end(&iter)) {
- BUG_ON(bkey_cmp(k.k->p, b->key.k.p));
+ if (bpos_cmp(k.k->p, b->key.k.p)) {
+ bch2_dump_btree_node(c, b);
+ panic("expected end %s got %s\n",
+ (bch2_bpos_to_text(&PBUF(buf1), b->key.k.p), buf1),
+ (bch2_bpos_to_text(&PBUF(buf2), k.k->p), buf2));
+ }
break;
}
- next_node = bkey_successor(k.k->p);
+ next_node = bpos_successor(k.k->p);
}
#endif
}
struct bset_tree *t;
struct bkey uk;
- bch2_bkey_format_add_pos(s, b->data->min_key);
-
for_each_bset(b, t)
bset_tree_for_each_key(b, t, k)
- if (!bkey_whiteout(k)) {
+ if (!bkey_deleted(k)) {
uk = bkey_unpack_key(b, k);
bch2_bkey_format_add_key(s, &uk);
}
struct bkey_format_state s;
bch2_bkey_format_init(&s);
+ bch2_bkey_format_add_pos(&s, b->data->min_key);
+ bch2_bkey_format_add_pos(&s, b->data->max_key);
__bch2_btree_calc_format(&s, b);
return bch2_bkey_format_done(&s);
b->ob.nr = 0;
- clear_btree_node_dirty(b);
+ clear_btree_node_dirty(c, b);
btree_node_lock_type(c, b, SIX_LOCK_write);
__btree_node_free(c, b);
{
struct write_point *wp;
struct btree *b;
- BKEY_PADDED(k) tmp;
+ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
struct open_buckets ob = { .nr = 0 };
struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
unsigned nr_reserve;
enum alloc_reserve alloc_reserve;
- if (flags & BTREE_INSERT_USE_ALLOC_RESERVE) {
+ if (flags & BTREE_INSERT_USE_RESERVE) {
nr_reserve = 0;
- alloc_reserve = RESERVE_ALLOC;
- } else if (flags & BTREE_INSERT_USE_RESERVE) {
- nr_reserve = BTREE_NODE_RESERVE / 2;
- alloc_reserve = RESERVE_BTREE;
+ alloc_reserve = RESERVE_BTREE_MOVINGGC;
} else {
nr_reserve = BTREE_NODE_RESERVE;
- alloc_reserve = RESERVE_NONE;
+ alloc_reserve = RESERVE_BTREE;
}
mutex_lock(&c->btree_reserve_cache_lock);
mutex_unlock(&c->btree_reserve_cache_lock);
retry:
- wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, 0,
+ wp = bch2_alloc_sectors_start(c,
+ c->opts.metadata_target ?:
+ c->opts.foreground_target,
+ 0,
writepoint_ptr(&c->btree_write_point),
&devs_have,
res->nr_replicas,
b = as->prealloc_nodes[--as->nr_prealloc_nodes];
set_btree_node_accessed(b);
- set_btree_node_dirty(b);
+ set_btree_node_dirty(c, b);
set_btree_node_need_write(b);
bch2_bset_init_first(b, &b->data->keys);
b->c.level = level;
b->c.btree_id = as->btree_id;
+ b->version_ondisk = c->sb.version;
memset(&b->nr, 0, sizeof(b->nr));
b->data->magic = cpu_to_le64(bset_magic(c));
b->data->flags = 0;
SET_BTREE_NODE_ID(b->data, as->btree_id);
SET_BTREE_NODE_LEVEL(b->data, level);
- b->data->ptr = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)).start->ptr;
if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key);
bp->v.mem_ptr = 0;
bp->v.seq = b->data->keys.seq;
bp->v.sectors_written = 0;
- bp->v.sectors = cpu_to_le16(c->opts.btree_node_size);
}
- if (c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))
- SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true);
-
- if (btree_node_is_extents(b) &&
- !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) {
- set_btree_node_old_extent_overwrite(b);
- set_btree_node_need_rewrite(b);
- }
+ SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true);
bch2_btree_build_aux_trees(b);
goto err_free;
}
- ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key));
- if (ret)
- goto err_free;
-
as->prealloc_nodes[as->nr_prealloc_nodes++] = b;
}
{
struct bch_fs *c = as->c;
+ if (as->took_gc_lock)
+ up_read(&c->gc_lock);
+ as->took_gc_lock = false;
+
bch2_journal_preres_put(&c->journal, &as->journal_preres);
bch2_journal_pin_drop(&c->journal, &as->journal);
trans->journal_pin = &as->journal;
for_each_keylist_key(&as->new_keys, k) {
- ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k),
+ ret = bch2_trans_mark_key(trans,
+ bkey_s_c_null,
+ bkey_i_to_s_c(k),
0, 0, BTREE_TRIGGER_INSERT);
if (ret)
return ret;
}
for_each_keylist_key(&as->old_keys, k) {
- ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k),
+ ret = bch2_trans_mark_key(trans,
+ bkey_i_to_s_c(k),
+ bkey_s_c_null,
0, 0, BTREE_TRIGGER_OVERWRITE);
if (ret)
return ret;
{
struct bch_fs *c = as->c;
struct btree *b = as->b;
+ struct btree_trans trans;
u64 journal_seq = 0;
unsigned i;
int ret;
+ /*
+ * If we're already in an error state, it might be because a btree node
+ * was never written, and we might be trying to free that same btree
+ * node here, but it won't have been marked as allocated and we'll see
+ * spurious disk usage inconsistencies in the transactional part below
+ * if we don't skip it:
+ */
+ ret = bch2_journal_error(&c->journal);
+ if (ret)
+ goto err;
+
+ BUG_ON(!journal_pin_active(&as->journal));
+
/*
* We did an update to a parent node where the pointers we added pointed
* to child nodes that weren't written yet: now, the child nodes have
* journal reclaim does btree updates when flushing bkey_cached entries,
* which may require allocations as well.
*/
- ret = bch2_trans_do(c, &as->disk_res, &journal_seq,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE|
- BTREE_INSERT_USE_ALLOC_RESERVE|
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_JOURNAL_RECLAIM|
- BTREE_INSERT_JOURNAL_RESERVED,
- btree_update_nodes_written_trans(&trans, as));
- BUG_ON(ret && !bch2_journal_error(&c->journal));
-
+ bch2_trans_init(&trans, c, 0, 512);
+ ret = __bch2_trans_do(&trans, &as->disk_res, &journal_seq,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_JOURNAL_RECLAIM|
+ BTREE_INSERT_JOURNAL_RESERVED,
+ btree_update_nodes_written_trans(&trans, as));
+ bch2_trans_exit(&trans);
+
+ bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
+ "error %i in btree_update_nodes_written()", ret);
+err:
if (b) {
/*
* @b is the node we did the final insert into:
list_del(&as->write_blocked_list);
- if (!ret && as->b == b) {
+ /*
+ * Node might have been freed, recheck under
+ * btree_interior_update_lock:
+ */
+ if (as->b == b) {
struct bset *i = btree_bset_last(b);
BUG_ON(!b->c.level);
BUG_ON(!btree_node_dirty(b));
- i->journal_seq = cpu_to_le64(
- max(journal_seq,
- le64_to_cpu(i->journal_seq)));
-
- bch2_btree_add_journal_pin(c, b, journal_seq);
+ if (!ret) {
+ i->journal_seq = cpu_to_le64(
+ max(journal_seq,
+ le64_to_cpu(i->journal_seq)));
+
+ bch2_btree_add_journal_pin(c, b, journal_seq);
+ } else {
+ /*
+ * If we didn't get a journal sequence number we
+ * can't write this btree node, because recovery
+ * won't know to ignore this write:
+ */
+ set_btree_node_never_write(b);
+ }
}
mutex_unlock(&c->btree_interior_update_lock);
child->b = NULL;
child->mode = BTREE_INTERIOR_UPDATING_AS;
- /*
- * When we write a new btree root, we have to drop our journal pin
- * _before_ the new nodes are technically reachable; see
- * btree_update_nodes_written().
- *
- * This goes for journal pins that are recursively blocked on us - so,
- * just transfer the journal pin to the new interior update so
- * btree_update_nodes_written() can drop it.
- */
bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL);
- bch2_journal_pin_drop(&c->journal, &child->journal);
}
static void btree_update_updated_root(struct btree_update *as, struct btree *b)
closure_wake_up(&c->btree_interior_update_wait);
}
- clear_btree_node_dirty(b);
+ clear_btree_node_dirty(c, b);
clear_btree_node_need_write(b);
/*
{
BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE);
+ if (as->took_gc_lock)
+ up_read(&as->c->gc_lock);
+ as->took_gc_lock = false;
+
bch2_btree_reserve_put(as);
continue_at(&as->cl, btree_update_set_nodes_written, system_freezable_wq);
}
struct btree_update *
-bch2_btree_update_start(struct btree_trans *trans, enum btree_id id,
- unsigned nr_nodes, unsigned flags,
- struct closure *cl)
+bch2_btree_update_start(struct btree_iter *iter, unsigned level,
+ unsigned nr_nodes, unsigned flags)
{
+ struct btree_trans *trans = iter->trans;
struct bch_fs *c = trans->c;
struct btree_update *as;
+ struct closure cl;
int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
? BCH_DISK_RESERVATION_NOFAIL : 0;
- int journal_flags = (flags & BTREE_INSERT_JOURNAL_RESERVED)
- ? JOURNAL_RES_GET_RECLAIM : 0;
+ int journal_flags = 0;
int ret = 0;
+ if (flags & BTREE_INSERT_JOURNAL_RESERVED)
+ journal_flags |= JOURNAL_RES_GET_RESERVED;
+
+ closure_init_stack(&cl);
+retry:
/*
* This check isn't necessary for correctness - it's just to potentially
* prevent us from doing a lot of work that'll end up being wasted:
if (ret)
return ERR_PTR(ret);
+ /*
+ * XXX: figure out how far we might need to split,
+ * instead of locking/reserving all the way to the root:
+ */
+ if (!bch2_btree_iter_upgrade(iter, U8_MAX)) {
+ trace_trans_restart_iter_upgrade(trans->ip);
+ return ERR_PTR(-EINTR);
+ }
+
+ if (flags & BTREE_INSERT_GC_LOCK_HELD)
+ lockdep_assert_held(&c->gc_lock);
+ else if (!down_read_trylock(&c->gc_lock)) {
+ if (flags & BTREE_INSERT_NOUNLOCK)
+ return ERR_PTR(-EINTR);
+
+ bch2_trans_unlock(trans);
+ down_read(&c->gc_lock);
+ if (!bch2_trans_relock(trans)) {
+ up_read(&c->gc_lock);
+ return ERR_PTR(-EINTR);
+ }
+ }
+
as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO);
memset(as, 0, sizeof(*as));
closure_init(&as->cl, NULL);
as->c = c;
as->mode = BTREE_INTERIOR_NO_UPDATE;
- as->btree_id = id;
+ as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD);
+ as->btree_id = iter->btree_id;
INIT_LIST_HEAD(&as->list);
INIT_LIST_HEAD(&as->unwritten_list);
INIT_LIST_HEAD(&as->write_blocked_list);
BTREE_UPDATE_JOURNAL_RES,
journal_flags|JOURNAL_RES_GET_NONBLOCK);
if (ret == -EAGAIN) {
- if (flags & BTREE_INSERT_NOUNLOCK)
- return ERR_PTR(-EINTR);
+ /*
+ * this would be cleaner if bch2_journal_preres_get() took a
+ * closure argument
+ */
+ if (flags & BTREE_INSERT_NOUNLOCK) {
+ ret = -EINTR;
+ goto err;
+ }
bch2_trans_unlock(trans);
+ if (flags & BTREE_INSERT_JOURNAL_RECLAIM)
+ goto err;
+
ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
BTREE_UPDATE_JOURNAL_RES,
journal_flags);
if (ret)
- return ERR_PTR(ret);
+ goto err;
if (!bch2_trans_relock(trans)) {
ret = -EINTR;
if (ret)
goto err;
- ret = bch2_btree_reserve_get(as, nr_nodes, flags, cl);
+ ret = bch2_btree_reserve_get(as, nr_nodes, flags,
+ !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
if (ret)
goto err;
+ bch2_journal_pin_add(&c->journal,
+ atomic64_read(&c->journal.seq),
+ &as->journal, NULL);
+
mutex_lock(&c->btree_interior_update_lock);
list_add_tail(&as->list, &c->btree_interior_update_list);
mutex_unlock(&c->btree_interior_update_lock);
return as;
err:
bch2_btree_update_free(as);
+
+ if (ret == -EAGAIN) {
+ BUG_ON(flags & BTREE_INSERT_NOUNLOCK);
+
+ bch2_trans_unlock(trans);
+ closure_sync(&cl);
+ ret = -EINTR;
+ }
+
+ if (ret == -EINTR && bch2_trans_relock(trans))
+ goto retry;
+
return ERR_PTR(ret);
}
list_del_init(&b->list);
mutex_unlock(&c->btree_cache.lock);
+ if (b->c.level)
+ six_lock_pcpu_alloc(&b->c.lock);
+ else
+ six_lock_pcpu_free(&b->c.lock);
+
mutex_lock(&c->btree_root_lock);
BUG_ON(btree_node_root(c, b) &&
(b->c.level < btree_node_root(c, b)->c.level ||
struct bkey_i *insert,
struct btree_node_iter *node_iter)
{
+ struct bch_fs *c = as->c;
struct bkey_packed *k;
+ const char *invalid;
+
+ invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?:
+ bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert));
+ if (invalid) {
+ char buf[160];
+
+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert));
+ bch2_fs_inconsistent(c, "inserting invalid bkey %s: %s", buf, invalid);
+ dump_stack();
+ }
BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
ARRAY_SIZE(as->journal_entries));
bch2_btree_node_iter_advance(node_iter, b);
bch2_btree_bset_insert_key(iter, b, node_iter, insert);
- set_btree_node_dirty(b);
+ set_btree_node_dirty(c, b);
set_btree_node_need_write(b);
}
struct btree *n1,
struct btree_iter *iter)
{
+ struct bkey_format_state s;
size_t nr_packed = 0, nr_unpacked = 0;
struct btree *n2;
struct bset *set1, *set2;
- struct bkey_packed *k, *prev = NULL;
+ struct bkey_packed *k, *set2_start, *set2_end, *out, *prev = NULL;
+ struct bpos n1_pos;
n2 = bch2_btree_node_alloc(as, n1->c.level);
bch2_btree_update_add_new_node(as, n2);
SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data));
n2->key.k.p = n1->key.k.p;
- btree_node_set_format(n2, n2->data->format);
-
set1 = btree_bset_first(n1);
set2 = btree_bset_first(n2);
*/
k = set1->start;
while (1) {
- struct bkey_packed *n = bkey_next_skip_noops(k, vstruct_last(set1));
+ struct bkey_packed *n = bkey_next(k);
if (n == vstruct_last(set1))
break;
}
BUG_ON(!prev);
+ set2_start = k;
+ set2_end = vstruct_last(set1);
- btree_set_max(n1, bkey_unpack_pos(n1, prev));
- btree_set_min(n2, bkey_successor(n1->key.k.p));
-
- set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k);
- set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s));
-
+ set1->u64s = cpu_to_le16((u64 *) set2_start - set1->_data);
set_btree_bset_end(n1, n1->set);
- set_btree_bset_end(n2, n2->set);
-
- n2->nr.live_u64s = le16_to_cpu(set2->u64s);
- n2->nr.bset_u64s[0] = le16_to_cpu(set2->u64s);
- n2->nr.packed_keys = n1->nr.packed_keys - nr_packed;
- n2->nr.unpacked_keys = n1->nr.unpacked_keys - nr_unpacked;
n1->nr.live_u64s = le16_to_cpu(set1->u64s);
n1->nr.bset_u64s[0] = le16_to_cpu(set1->u64s);
n1->nr.packed_keys = nr_packed;
n1->nr.unpacked_keys = nr_unpacked;
+ n1_pos = bkey_unpack_pos(n1, prev);
+ if (as->c->sb.version < bcachefs_metadata_version_snapshot)
+ n1_pos.snapshot = U32_MAX;
+
+ btree_set_max(n1, n1_pos);
+ btree_set_min(n2, bpos_successor(n1->key.k.p));
+
+ bch2_bkey_format_init(&s);
+ bch2_bkey_format_add_pos(&s, n2->data->min_key);
+ bch2_bkey_format_add_pos(&s, n2->data->max_key);
+
+ for (k = set2_start; k != set2_end; k = bkey_next(k)) {
+ struct bkey uk = bkey_unpack_key(n1, k);
+ bch2_bkey_format_add_key(&s, &uk);
+ }
+
+ n2->data->format = bch2_bkey_format_done(&s);
+ btree_node_set_format(n2, n2->data->format);
+
+ out = set2->start;
+ memset(&n2->nr, 0, sizeof(n2->nr));
+
+ for (k = set2_start; k != set2_end; k = bkey_next(k)) {
+ BUG_ON(!bch2_bkey_transform(&n2->format, out, bkey_packed(k)
+ ? &n1->format : &bch2_bkey_format_current, k));
+ out->format = KEY_FORMAT_LOCAL_BTREE;
+ btree_keys_account_key_add(&n2->nr, 0, out);
+ out = bkey_next(out);
+ }
+
+ set2->u64s = cpu_to_le16((u64 *) out - set2->_data);
+ set_btree_bset_end(n2, n2->set);
+
BUG_ON(!set1->u64s);
BUG_ON(!set2->u64s);
- memcpy_u64s(set2->start,
- vstruct_end(set1),
- le16_to_cpu(set2->u64s));
-
btree_node_reset_sib_u64s(n1);
btree_node_reset_sib_u64s(n2);
struct bkey_packed *src, *dst, *n;
struct bset *i;
- BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE);
+ BUG_ON(btree_node_type(b) != BKEY_TYPE_btree);
bch2_btree_node_iter_init(&node_iter, b, &k->k.p);
i = btree_bset_first(b);
src = dst = i->start;
while (src != vstruct_last(i)) {
- n = bkey_next_skip_noops(src, vstruct_last(i));
+ n = bkey_next(src);
if (!bkey_deleted(src)) {
memmove_u64s_down(dst, src, src->u64s);
dst = bkey_next(dst);
src = n;
}
+ /* Also clear out the unwritten whiteouts area: */
+ b->whiteout_u64s = 0;
+
i->u64s = cpu_to_le16((u64 *) dst - i->_data);
set_btree_bset_end(b, b->set);
* the node the iterator points to:
*/
while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
- (bkey_cmp_packed(b, k, &insert->k) >= 0))
+ (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0))
;
for_each_keylist_key(keys, insert)
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
+ lockdep_assert_held(&c->gc_lock);
BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level));
BUG_ON(!b->c.level);
BUG_ON(!as || as->b);
bch2_verify_keylist_sorted(keys);
- if (as->must_rewrite)
- goto split;
-
bch2_btree_node_lock_for_insert(c, b, iter);
if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) {
goto split;
}
+ btree_node_interior_verify(c, b);
+
bch2_btree_insert_keys_interior(as, b, iter, keys);
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
bch2_btree_node_unlock_write(b, iter);
btree_node_interior_verify(c, b);
-
- /*
- * when called from the btree_split path the new nodes aren't added to
- * the btree iterator yet, so the merge path's unlock/wait/relock dance
- * won't work:
- */
- bch2_foreground_maybe_merge(c, iter, b->c.level,
- flags|BTREE_INSERT_NOUNLOCK);
return;
split:
btree_split(as, b, iter, keys, flags);
int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
unsigned flags)
{
- struct btree_trans *trans = iter->trans;
struct btree *b = iter_l(iter)->b;
struct btree_update *as;
- struct closure cl;
+ unsigned l;
int ret = 0;
- struct btree_insert_entry *i;
-
- /*
- * We already have a disk reservation and open buckets pinned; this
- * allocation must not block:
- */
- trans_for_each_update(trans, i)
- if (btree_node_type_needs_gc(i->iter->btree_id))
- flags |= BTREE_INSERT_USE_RESERVE;
-
- closure_init_stack(&cl);
-
- /* Hack, because gc and splitting nodes doesn't mix yet: */
- if (!(flags & BTREE_INSERT_GC_LOCK_HELD) &&
- !down_read_trylock(&c->gc_lock)) {
- if (flags & BTREE_INSERT_NOUNLOCK) {
- trace_transaction_restart_ip(trans->ip, _THIS_IP_);
- return -EINTR;
- }
-
- bch2_trans_unlock(trans);
- down_read(&c->gc_lock);
- if (!bch2_trans_relock(trans))
- ret = -EINTR;
- }
-
- /*
- * XXX: figure out how far we might need to split,
- * instead of locking/reserving all the way to the root:
- */
- if (!bch2_btree_iter_upgrade(iter, U8_MAX)) {
- trace_trans_restart_iter_upgrade(trans->ip);
- ret = -EINTR;
- goto out;
- }
-
- as = bch2_btree_update_start(trans, iter->btree_id,
- btree_update_reserve_required(c, b), flags,
- !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
- if (IS_ERR(as)) {
- ret = PTR_ERR(as);
- if (ret == -EAGAIN) {
- BUG_ON(flags & BTREE_INSERT_NOUNLOCK);
- bch2_trans_unlock(trans);
- ret = -EINTR;
-
- trace_transaction_restart_ip(trans->ip, _THIS_IP_);
- }
- goto out;
- }
+ as = bch2_btree_update_start(iter, iter->level,
+ btree_update_reserve_required(c, b), flags);
+ if (IS_ERR(as))
+ return PTR_ERR(as);
btree_split(as, b, iter, NULL, flags);
bch2_btree_update_done(as);
- /*
- * We haven't successfully inserted yet, so don't downgrade all the way
- * back to read locks;
- */
- __bch2_btree_iter_downgrade(iter, 1);
-out:
- if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
- up_read(&c->gc_lock);
- closure_sync(&cl);
+ for (l = iter->level + 1; btree_iter_node(iter, l) && !ret; l++)
+ ret = bch2_foreground_maybe_merge(c, iter, l, flags);
+
return ret;
}
-void __bch2_foreground_maybe_merge(struct bch_fs *c,
- struct btree_iter *iter,
- unsigned level,
- unsigned flags,
- enum btree_node_sibling sib)
+int __bch2_foreground_maybe_merge(struct bch_fs *c,
+ struct btree_iter *iter,
+ unsigned level,
+ unsigned flags,
+ enum btree_node_sibling sib)
{
struct btree_trans *trans = iter->trans;
+ struct btree_iter *sib_iter = NULL;
struct btree_update *as;
struct bkey_format_state new_s;
struct bkey_format new_f;
struct bkey_i delete;
struct btree *b, *m, *n, *prev, *next, *parent;
- struct closure cl;
+ struct bpos sib_pos;
size_t sib_u64s;
- int ret = 0;
+ int ret = 0, ret2 = 0;
BUG_ON(!btree_node_locked(iter, level));
-
- closure_init_stack(&cl);
retry:
+ ret = bch2_btree_iter_traverse(iter);
+ if (ret)
+ goto err;
+
BUG_ON(!btree_node_locked(iter, level));
b = iter->l[level].b;
- parent = btree_node_parent(iter, b);
- if (!parent)
+ if ((sib == btree_prev_sib && !bpos_cmp(b->data->min_key, POS_MIN)) ||
+ (sib == btree_next_sib && !bpos_cmp(b->data->max_key, POS_MAX))) {
+ b->sib_u64s[sib] = U16_MAX;
goto out;
+ }
- if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c))
- goto out;
+ sib_pos = sib == btree_prev_sib
+ ? bpos_predecessor(b->data->min_key)
+ : bpos_successor(b->data->max_key);
- /* XXX: can't be holding read locks */
- m = bch2_btree_node_get_sibling(c, iter, b, sib);
- if (IS_ERR(m)) {
- ret = PTR_ERR(m);
+ sib_iter = bch2_trans_get_node_iter(trans, iter->btree_id,
+ sib_pos, U8_MAX, level,
+ BTREE_ITER_INTENT);
+ ret = bch2_btree_iter_traverse(sib_iter);
+ if (ret)
goto err;
- }
- /* NULL means no sibling: */
- if (!m) {
+ m = sib_iter->l[level].b;
+
+ if (btree_node_parent(iter, b) !=
+ btree_node_parent(sib_iter, m)) {
b->sib_u64s[sib] = U16_MAX;
goto out;
}
next = m;
}
+ BUG_ON(bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key));
+
bch2_bkey_format_init(&new_s);
- __bch2_btree_calc_format(&new_s, b);
- __bch2_btree_calc_format(&new_s, m);
+ bch2_bkey_format_add_pos(&new_s, prev->data->min_key);
+ __bch2_btree_calc_format(&new_s, prev);
+ __bch2_btree_calc_format(&new_s, next);
+ bch2_bkey_format_add_pos(&new_s, next->data->max_key);
new_f = bch2_bkey_format_done(&new_s);
sib_u64s = btree_node_u64s_with_format(b, &new_f) +
}
sib_u64s = min(sib_u64s, btree_max_u64s(c));
+ sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1);
b->sib_u64s[sib] = sib_u64s;
- if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) {
- six_unlock_intent(&m->c.lock);
+ if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
goto out;
- }
- /* We're changing btree topology, doesn't mix with gc: */
- if (!(flags & BTREE_INSERT_GC_LOCK_HELD) &&
- !down_read_trylock(&c->gc_lock))
- goto err_cycle_gc_lock;
-
- if (!bch2_btree_iter_upgrade(iter, U8_MAX)) {
- ret = -EINTR;
- goto err_unlock;
- }
-
- as = bch2_btree_update_start(trans, iter->btree_id,
+ parent = btree_node_parent(iter, b);
+ as = bch2_btree_update_start(iter, level,
btree_update_reserve_required(c, parent) + 1,
flags|
BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE,
- !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
- if (IS_ERR(as)) {
- ret = PTR_ERR(as);
- goto err_unlock;
- }
+ BTREE_INSERT_USE_RESERVE);
+ ret = PTR_ERR_OR_ZERO(as);
+ if (ret)
+ goto err;
trace_btree_merge(c, b);
bch2_btree_update_get_open_buckets(as, n);
six_lock_increment(&b->c.lock, SIX_LOCK_intent);
+ six_lock_increment(&m->c.lock, SIX_LOCK_intent);
bch2_btree_iter_node_drop(iter, b);
bch2_btree_iter_node_drop(iter, m);
six_unlock_intent(&n->c.lock);
bch2_btree_update_done(as);
-
- if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
- up_read(&c->gc_lock);
out:
bch2_btree_trans_verify_locks(trans);
+ bch2_trans_iter_free(trans, sib_iter);
/*
* Don't downgrade locks here: we're called after successful insert,
* split path, and downgrading to read locks in there is potentially
* confusing:
*/
- closure_sync(&cl);
- return;
-
-err_cycle_gc_lock:
- six_unlock_intent(&m->c.lock);
-
- if (flags & BTREE_INSERT_NOUNLOCK)
- goto out;
-
- bch2_trans_unlock(trans);
-
- down_read(&c->gc_lock);
- up_read(&c->gc_lock);
- ret = -EINTR;
- goto err;
-
-err_unlock:
- six_unlock_intent(&m->c.lock);
- if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
- up_read(&c->gc_lock);
+ return ret ?: ret2;
err:
- BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK));
-
- if ((ret == -EAGAIN || ret == -EINTR) &&
- !(flags & BTREE_INSERT_NOUNLOCK)) {
- bch2_trans_unlock(trans);
- closure_sync(&cl);
- ret = bch2_btree_iter_traverse(iter);
- if (ret)
- goto out;
+ bch2_trans_iter_put(trans, sib_iter);
+ sib_iter = NULL;
+ if (ret == -EINTR && bch2_trans_relock(trans))
goto retry;
+
+ if (ret == -EINTR && !(flags & BTREE_INSERT_NOUNLOCK)) {
+ ret2 = ret;
+ ret = bch2_btree_iter_traverse_all(trans);
+ if (!ret)
+ goto retry;
}
goto out;
}
-static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
- struct btree *b, unsigned flags,
- struct closure *cl)
+/**
+ * bch2_btree_node_rewrite - Rewrite/move a btree node
+ */
+int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
+ __le64 seq, unsigned flags)
{
- struct btree *n, *parent = btree_node_parent(iter, b);
+ struct btree *b, *n, *parent;
struct btree_update *as;
+ int ret;
- as = bch2_btree_update_start(iter->trans, iter->btree_id,
+ flags |= BTREE_INSERT_NOFAIL;
+retry:
+ ret = bch2_btree_iter_traverse(iter);
+ if (ret)
+ goto out;
+
+ b = bch2_btree_iter_peek_node(iter);
+ if (!b || b->data->keys.seq != seq)
+ goto out;
+
+ parent = btree_node_parent(iter, b);
+ as = bch2_btree_update_start(iter, b->c.level,
(parent
? btree_update_reserve_required(c, parent)
: 0) + 1,
- flags, cl);
- if (IS_ERR(as)) {
+ flags);
+ ret = PTR_ERR_OR_ZERO(as);
+ if (ret == -EINTR)
+ goto retry;
+ if (ret) {
trace_btree_gc_rewrite_node_fail(c, b);
- return PTR_ERR(as);
+ goto out;
}
bch2_btree_interior_update_will_free_node(as, b);
six_unlock_intent(&n->c.lock);
bch2_btree_update_done(as);
- return 0;
-}
-
-/**
- * bch_btree_node_rewrite - Rewrite/move a btree node
- *
- * Returns 0 on success, -EINTR or -EAGAIN on failure (i.e.
- * btree_check_reserve() has to wait)
- */
-int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
- __le64 seq, unsigned flags)
-{
- struct btree_trans *trans = iter->trans;
- struct closure cl;
- struct btree *b;
- int ret;
-
- flags |= BTREE_INSERT_NOFAIL;
-
- closure_init_stack(&cl);
-
- bch2_btree_iter_upgrade(iter, U8_MAX);
-
- if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) {
- if (!down_read_trylock(&c->gc_lock)) {
- bch2_trans_unlock(trans);
- down_read(&c->gc_lock);
- }
- }
-
- while (1) {
- ret = bch2_btree_iter_traverse(iter);
- if (ret)
- break;
-
- b = bch2_btree_iter_peek_node(iter);
- if (!b || b->data->keys.seq != seq)
- break;
-
- ret = __btree_node_rewrite(c, iter, b, flags, &cl);
- if (ret != -EAGAIN &&
- ret != -EINTR)
- break;
-
- bch2_trans_unlock(trans);
- closure_sync(&cl);
- }
-
+out:
bch2_btree_iter_downgrade(iter);
-
- if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
- up_read(&c->gc_lock);
-
- closure_sync(&cl);
return ret;
}
struct btree_update *as = NULL;
struct btree *new_hash = NULL;
struct closure cl;
- int ret;
+ int ret = 0;
closure_init_stack(&cl);
- if (!bch2_btree_iter_upgrade(iter, U8_MAX))
- return -EINTR;
-
- if (!down_read_trylock(&c->gc_lock)) {
- bch2_trans_unlock(iter->trans);
- down_read(&c->gc_lock);
-
- if (!bch2_trans_relock(iter->trans)) {
- ret = -EINTR;
- goto err;
- }
- }
-
/*
* check btree_ptr_hash_val() after @b is locked by
* btree_iter_traverse():
*/
if (btree_ptr_hash_val(new_key) != b->hash_val) {
- /* bch2_btree_reserve_get will unlock */
ret = bch2_btree_cache_cannibalize_lock(c, &cl);
if (ret) {
bch2_trans_unlock(iter->trans);
- up_read(&c->gc_lock);
closure_sync(&cl);
- down_read(&c->gc_lock);
-
- if (!bch2_trans_relock(iter->trans)) {
- ret = -EINTR;
- goto err;
- }
+ if (!bch2_trans_relock(iter->trans))
+ return -EINTR;
}
new_hash = bch2_btree_node_mem_alloc(c);
}
-retry:
- as = bch2_btree_update_start(iter->trans, iter->btree_id,
- parent ? btree_update_reserve_required(c, parent) : 0,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE|
- BTREE_INSERT_USE_ALLOC_RESERVE,
- &cl);
+ as = bch2_btree_update_start(iter, b->c.level,
+ parent ? btree_update_reserve_required(c, parent) : 0,
+ BTREE_INSERT_NOFAIL);
if (IS_ERR(as)) {
ret = PTR_ERR(as);
- if (ret == -EAGAIN)
- ret = -EINTR;
-
- if (ret == -EINTR) {
- bch2_trans_unlock(iter->trans);
- up_read(&c->gc_lock);
- closure_sync(&cl);
- down_read(&c->gc_lock);
-
- if (bch2_trans_relock(iter->trans))
- goto retry;
- }
-
goto err;
}
- ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(new_key));
- if (ret)
- goto err_free_update;
-
__bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key);
bch2_btree_iter_downgrade(iter);
six_unlock_write(&new_hash->c.lock);
six_unlock_intent(&new_hash->c.lock);
}
- up_read(&c->gc_lock);
closure_sync(&cl);
+ bch2_btree_cache_cannibalize_unlock(c);
return ret;
-err_free_update:
- bch2_btree_update_free(as);
- goto err;
}
/* Init code: */
BTREE_INTERIOR_UPDATING_AS,
} mode;
- unsigned must_rewrite:1;
unsigned nodes_written:1;
+ unsigned took_gc_lock:1;
enum btree_id btree_id;
void bch2_btree_update_done(struct btree_update *);
struct btree_update *
-bch2_btree_update_start(struct btree_trans *, enum btree_id, unsigned,
- unsigned, struct closure *);
+bch2_btree_update_start(struct btree_iter *, unsigned, unsigned, unsigned);
void bch2_btree_interior_update_will_free_node(struct btree_update *,
struct btree *);
unsigned);
int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned);
-void __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *,
- unsigned, unsigned, enum btree_node_sibling);
+int __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *,
+ unsigned, unsigned, enum btree_node_sibling);
-static inline void bch2_foreground_maybe_merge_sibling(struct bch_fs *c,
+static inline int bch2_foreground_maybe_merge_sibling(struct bch_fs *c,
struct btree_iter *iter,
unsigned level, unsigned flags,
enum btree_node_sibling sib)
struct btree *b;
if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE)
- return;
+ return 0;
if (!bch2_btree_node_relock(iter, level))
- return;
+ return 0;
b = iter->l[level].b;
if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
- return;
+ return 0;
- __bch2_foreground_maybe_merge(c, iter, level, flags, sib);
+ return __bch2_foreground_maybe_merge(c, iter, level, flags, sib);
}
-static inline void bch2_foreground_maybe_merge(struct bch_fs *c,
+static inline int bch2_foreground_maybe_merge(struct bch_fs *c,
struct btree_iter *iter,
unsigned level,
unsigned flags)
{
- bch2_foreground_maybe_merge_sibling(c, iter, level, flags,
- btree_prev_sib);
- bch2_foreground_maybe_merge_sibling(c, iter, level, flags,
- btree_next_sib);
+ return bch2_foreground_maybe_merge_sibling(c, iter, level, flags,
+ btree_prev_sib) ?:
+ bch2_foreground_maybe_merge_sibling(c, iter, level, flags,
+ btree_next_sib);
}
void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);
b->whiteout_u64s;
ssize_t total = c->opts.btree_node_size << 6;
+ /* Always leave one extra u64 for bch2_varint_decode: */
+ used++;
+
return total - used;
}
#include <linux/sort.h>
#include <trace/events/bcachefs.h>
+static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
+ const struct btree_insert_entry *r)
+{
+ return cmp_int(l->btree_id, r->btree_id) ?:
+ -cmp_int(l->level, r->level) ?:
+ bpos_cmp(l->k->k.p, r->k->k.p);
+}
+
static inline bool same_leaf_as_prev(struct btree_trans *trans,
struct btree_insert_entry *i)
{
EBUG_ON(btree_node_just_written(b));
EBUG_ON(bset_written(b, btree_bset_last(b)));
EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
- EBUG_ON(bkey_cmp(b->data->min_key, POS_MIN) &&
- bkey_cmp(bkey_start_pos(&insert->k),
- bkey_predecessor(b->data->min_key)) < 0);
- EBUG_ON(bkey_cmp(insert->k.p, b->data->min_key) < 0);
- EBUG_ON(bkey_cmp(insert->k.p, b->data->max_key) > 0);
+ EBUG_ON(bpos_cmp(insert->k.p, b->data->min_key) < 0);
+ EBUG_ON(bpos_cmp(insert->k.p, b->data->max_key) > 0);
EBUG_ON(insert->k.u64s >
bch_btree_keys_u64s_remaining(iter->trans->c, b));
EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
k = bch2_btree_node_iter_peek_all(node_iter, b);
- if (k && bkey_cmp_packed(b, k, &insert->k))
+ if (k && bkey_cmp_left_packed(b, k, &insert->k.p))
k = NULL;
/* @k is the key being overwritten/deleted, if any: */
- EBUG_ON(k && bkey_whiteout(k));
+ EBUG_ON(k && bkey_deleted(k));
/* Deleting, but not found? nothing to do: */
- if (bkey_whiteout(&insert->k) && !k)
+ if (bkey_deleted(&insert->k) && !k)
return false;
- if (bkey_whiteout(&insert->k)) {
+ if (bkey_deleted(&insert->k)) {
/* Deleting: */
btree_account_key_drop(b, k);
k->type = KEY_TYPE_deleted;
return true;
}
-static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
+static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
unsigned i, u64 seq)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
bch2_btree_node_write_cond(c, b,
(btree_current_write(b) == w && w->journal.seq == seq));
six_unlock_read(&b->c.lock);
+ return 0;
}
-static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
+static int btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
{
return __btree_node_flush(j, pin, 0, seq);
}
-static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
+static int btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
{
return __btree_node_flush(j, pin, 1, seq);
}
bch2_btree_add_journal_pin(c, b, trans->journal_res.seq);
if (unlikely(!btree_node_dirty(b)))
- set_btree_node_dirty(b);
+ set_btree_node_dirty(c, b);
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
u64s_added = (int) bset_u64s(t) - old_u64s;
/* Normal update interface: */
static inline void btree_insert_entry_checks(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i *insert)
+ struct btree_insert_entry *i)
{
struct bch_fs *c = trans->c;
- BUG_ON(bkey_cmp(insert->k.p, iter->pos));
- BUG_ON(debug_check_bkeys(c) &&
- bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
- __btree_node_type(iter->level, iter->btree_id)));
+ if (bch2_debug_check_bkeys) {
+ const char *invalid = bch2_bkey_invalid(c,
+ bkey_i_to_s_c(i->k), i->bkey_type);
+ if (invalid) {
+ char buf[200];
+
+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
+ panic("invalid bkey %s on insert: %s\n", buf, invalid);
+ }
+ }
+ BUG_ON(!i->is_extent && bpos_cmp(i->k->k.p, i->iter->real_pos));
+ BUG_ON(i->level != i->iter->level);
+ BUG_ON(i->btree_id != i->iter->btree_id);
}
static noinline int
BUG_ON(iter->level);
+ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
+ bch2_btree_key_cache_must_wait(trans->c) &&
+ !(trans->flags & BTREE_INSERT_JOURNAL_RECLAIM))
+ return BTREE_INSERT_NEED_JOURNAL_RECLAIM;
+
if (u64s <= ck->u64s)
return BTREE_INSERT_OK;
}
}
-static inline bool iter_has_trans_triggers(struct btree_iter *iter)
-{
- return BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << iter->btree_id);
-}
-
-static inline bool iter_has_nontrans_triggers(struct btree_iter *iter)
-{
- return (((BTREE_NODE_TYPE_HAS_TRIGGERS &
- ~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS)) |
- (1U << BTREE_ID_EC)) &
- (1U << iter->btree_id);
-}
-
static noinline void bch2_btree_iter_unlock_noinline(struct btree_iter *iter)
{
__bch2_btree_iter_unlock(iter);
struct btree_insert_entry **stopped_at)
{
struct bch_fs *c = trans->c;
- struct bch_fs_usage *fs_usage = NULL;
struct btree_insert_entry *i;
+ struct btree_trans_commit_hook *h;
unsigned u64s = 0;
bool marking = false;
int ret;
prefetch(&trans->c->journal.flags);
+ h = trans->hooks;
+ while (h) {
+ ret = h->fn(trans, h);
+ if (ret)
+ return ret;
+ h = h->next;
+ }
+
trans_for_each_update2(trans, i) {
/* Multiple inserts might go to same leaf: */
if (!same_leaf_as_prev(trans, i))
return ret;
}
- if (btree_node_type_needs_gc(i->iter->btree_id))
+ if (btree_node_type_needs_gc(i->bkey_type))
marking = true;
}
if (marking) {
percpu_down_read(&c->mark_lock);
- fs_usage = bch2_fs_usage_scratch_get(c);
+ }
+
+ /* Must be called under mark_lock: */
+ if (marking && trans->fs_usage_deltas &&
+ !bch2_replicas_delta_list_marked(c, trans->fs_usage_deltas)) {
+ ret = BTREE_INSERT_NEED_MARK_REPLICAS;
+ goto err;
}
/*
*/
if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) {
- if (journal_seq_verify(c))
+ if (bch2_journal_seq_verify)
trans_for_each_update2(trans, i)
i->k->k.version.lo = trans->journal_res.seq;
- else if (inject_invalid_keys(c))
+ else if (bch2_inject_invalid_keys)
trans_for_each_update2(trans, i)
i->k->k.version = MAX_VERSION;
}
- /* Must be called under mark_lock: */
- if (marking && trans->fs_usage_deltas &&
- bch2_replicas_delta_list_apply(c, fs_usage,
- trans->fs_usage_deltas)) {
- ret = BTREE_INSERT_NEED_MARK_REPLICAS;
- goto err;
- }
-
trans_for_each_update(trans, i)
- if (iter_has_nontrans_triggers(i->iter))
+ if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type))
bch2_mark_update(trans, i->iter, i->k,
- fs_usage, i->trigger_flags);
+ NULL, i->trigger_flags);
- if (marking)
- bch2_trans_fs_usage_apply(trans, fs_usage);
+ if (marking && trans->fs_usage_deltas)
+ bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas);
if (unlikely(c->gc_pos.phase))
bch2_trans_mark_gc(trans);
do_btree_insert_one(trans, i->iter, i->k);
err:
if (marking) {
- bch2_fs_usage_scratch_put(c, fs_usage);
percpu_up_read(&c->mark_lock);
}
return ret;
}
+static noinline int maybe_do_btree_merge(struct btree_trans *trans, struct btree_iter *iter)
+{
+ struct btree_insert_entry *i;
+ struct btree *b = iter_l(iter)->b;
+ struct bkey_s_c old;
+ int u64s_delta = 0;
+ int ret;
+
+ /*
+ * Inserting directly into interior nodes is an uncommon operation with
+ * various weird edge cases: also, a lot of things about
+ * BTREE_ITER_NODES iters need to be audited
+ */
+ if (unlikely(btree_iter_type(iter) != BTREE_ITER_KEYS))
+ return 0;
+
+ BUG_ON(iter->level);
+
+ trans_for_each_update2(trans, i) {
+ if (iter_l(i->iter)->b != b)
+ continue;
+
+ old = bch2_btree_iter_peek_slot(i->iter);
+ ret = bkey_err(old);
+ if (ret)
+ return ret;
+
+ u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0;
+ u64s_delta -= !bkey_deleted(old.k) ? old.k->u64s : 0;
+ }
+
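+ /* If the net size of this node isn't growing, it may now be mergeable with a sibling: */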
+ return u64s_delta <= 0
+ ? (bch2_foreground_maybe_merge(trans->c, iter, iter->level,
+ trans->flags & ~BTREE_INSERT_NOUNLOCK) ?: -EINTR)
+ : 0;
+}
+
/*
* Get journal reservation, take write locks, and attempt to do btree update(s):
*/
static inline int do_bch2_trans_commit(struct btree_trans *trans,
struct btree_insert_entry **stopped_at)
{
+ struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
struct btree_iter *iter;
int ret;
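+ /* Before taking write locks, check whether any nodes we're updating have shrunk below the foreground merge threshold and try merging them: */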
+ trans_for_each_update2(trans, i) {
+ struct btree *b;
+
+ BUG_ON(!btree_node_intent_locked(i->iter, i->level));
+
+ if (btree_iter_type(i->iter) == BTREE_ITER_CACHED)
+ continue;
+
+ b = iter_l(i->iter)->b;
+ if (b->sib_u64s[0] < c->btree_foreground_merge_threshold ||
+ b->sib_u64s[1] < c->btree_foreground_merge_threshold) {
+ ret = maybe_do_btree_merge(trans, i->iter);
+ if (unlikely(ret))
+ return ret;
+ }
+ }
+
trans_for_each_update2(trans, i)
- BUG_ON(!btree_node_intent_locked(i->iter, i->iter->level));
+ BUG_ON(!btree_node_intent_locked(i->iter, i->level));
- ret = bch2_journal_preres_get(&trans->c->journal,
+ ret = bch2_journal_preres_get(&c->journal,
&trans->journal_preres, trans->journal_preres_u64s,
JOURNAL_RES_GET_NONBLOCK|
- ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM)
- ? JOURNAL_RES_GET_RECLAIM : 0));
+ ((trans->flags & BTREE_INSERT_JOURNAL_RESERVED)
+ ? JOURNAL_RES_GET_RESERVED : 0));
if (unlikely(ret == -EAGAIN))
ret = bch2_trans_journal_preres_get_cold(trans,
trans->journal_preres_u64s);
/*
* Can't be holding any read locks when we go to take write locks:
+ * another thread could be holding an intent lock on the same node we
+ * have a read lock on, and it'll block trying to take a write lock
+ * (because we hold a read lock) and it could be blocking us by holding
+ its own read lock (while we're trying to take write locks).
*
* note - this must be done after bch2_trans_journal_preres_get_cold()
* or anything else that might call bch2_trans_relock(), since that
*/
trans_for_each_iter(trans, iter) {
if (iter->nodes_locked != iter->nodes_intent_locked) {
- EBUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT);
- EBUG_ON(trans->iters_live & (1ULL << iter->idx));
- bch2_btree_iter_unlock_noinline(iter);
+ if (btree_iter_keep(trans, iter)) {
+ if (!bch2_btree_iter_upgrade(iter, 1)) {
+ trace_trans_restart_upgrade(trans->ip);
+ return -EINTR;
+ }
+ } else {
+ bch2_btree_iter_unlock_noinline(iter);
+ }
}
}
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
trans_for_each_update2(trans, i)
- btree_insert_entry_checks(trans, i->iter, i->k);
+ btree_insert_entry_checks(trans, i);
bch2_btree_trans_verify_locks(trans);
trans_for_each_update2(trans, i)
if (!same_leaf_as_prev(trans, i))
- bch2_btree_node_lock_for_insert(trans->c,
+ bch2_btree_node_lock_for_insert(c,
iter_l(i->iter)->b, i->iter);
ret = bch2_trans_commit_write_locked(trans, stopped_at);
i->iter);
if (!ret && trans->journal_pin)
- bch2_journal_pin_add(&trans->c->journal, trans->journal_res.seq,
+ bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
trans->journal_pin, NULL);
/*
* Drop journal reservation after dropping write locks, since dropping
* the journal reservation may kick off a journal write:
*/
- bch2_journal_res_put(&trans->c->journal, &trans->journal_res);
+ bch2_journal_res_put(&c->journal, &trans->journal_res);
if (unlikely(ret))
return ret;
- if (trans->flags & BTREE_INSERT_NOUNLOCK)
- trans->nounlock = true;
+ bch2_trans_downgrade(trans);
- trans_for_each_update2(trans, i)
- if (btree_iter_type(i->iter) != BTREE_ITER_CACHED &&
- !same_leaf_as_prev(trans, i))
- bch2_foreground_maybe_merge(trans->c, i->iter,
- 0, trans->flags);
+ return 0;
+}
- trans->nounlock = false;
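+/* Wait condition for BTREE_INSERT_NEED_JOURNAL_RECLAIM: done when the key cache no longer needs to wait (kicking journal reclaim ourselves if the reclaim lock is free), or on journal error: */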
+static int journal_reclaim_wait_done(struct bch_fs *c)
+{
+ int ret;
- bch2_trans_downgrade(trans);
+ ret = bch2_journal_error(&c->journal);
+ if (ret)
+ return ret;
- return 0;
+ ret = !bch2_btree_key_cache_must_wait(c);
+ if (ret)
+ return ret;
+
+ if (mutex_trylock(&c->journal.reclaim_lock)) {
+ ret = bch2_journal_reclaim(&c->journal);
+ mutex_unlock(&c->journal.reclaim_lock);
+ }
+
+ if (!ret)
+ ret = !bch2_btree_key_cache_must_wait(c);
+ return ret;
}
static noinline
case BTREE_INSERT_NEED_MARK_REPLICAS:
bch2_trans_unlock(trans);
- trans_for_each_update(trans, i) {
- ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(i->k));
- if (ret)
- return ret;
- }
+ ret = bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas);
+ if (ret)
+ return ret;
if (bch2_trans_relock(trans))
return 0;
case BTREE_INSERT_NEED_JOURNAL_RES:
bch2_trans_unlock(trans);
+ if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
+ !(trans->flags & BTREE_INSERT_JOURNAL_RESERVED))
+ return -EAGAIN;
+
ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_CHECK);
if (ret)
return ret;
trace_trans_restart_journal_res_get(trans->ip);
ret = -EINTR;
break;
- default:
- BUG_ON(ret >= 0);
- break;
- }
+ case BTREE_INSERT_NEED_JOURNAL_RECLAIM:
+ bch2_trans_unlock(trans);
- if (ret == -EINTR) {
- int ret2 = bch2_btree_iter_traverse_all(trans);
+ wait_event(c->journal.reclaim_wait,
+ (ret = journal_reclaim_wait_done(c)));
- if (ret2) {
- trace_trans_restart_traverse(trans->ip);
- return ret2;
- }
+ if (!ret && bch2_trans_relock(trans))
+ return 0;
- trace_trans_restart_atomic(trans->ip);
+ trace_trans_restart_journal_reclaim(trans->ip);
+ ret = -EINTR;
+ break;
+ default:
+ BUG_ON(ret >= 0);
+ break;
}
return ret;
return 0;
}
-static void bch2_trans_update2(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i *insert)
+static void __bch2_trans_update2(struct btree_trans *trans,
+ struct btree_insert_entry n)
{
- struct btree_insert_entry *i, n = (struct btree_insert_entry) {
- .iter = iter, .k = insert
- };
-
- btree_insert_entry_checks(trans, n.iter, n.k);
-
- BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
+ struct btree_insert_entry *i;
- EBUG_ON(trans->nr_updates2 >= trans->nr_iters);
+ btree_insert_entry_checks(trans, &n);
- iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
+ EBUG_ON(trans->nr_updates2 >= BTREE_ITER_MAX);
- trans_for_each_update2(trans, i) {
- if (btree_iter_cmp(n.iter, i->iter) == 0) {
- *i = n;
- return;
- }
+ n.iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
- if (btree_iter_cmp(n.iter, i->iter) <= 0)
+ trans_for_each_update2(trans, i)
+ if (btree_insert_entry_cmp(&n, i) <= 0)
break;
- }
- array_insert_item(trans->updates2, trans->nr_updates2,
- i - trans->updates2, n);
+ if (i < trans->updates2 + trans->nr_updates2 &&
+ !btree_insert_entry_cmp(&n, i))
+ *i = n;
+ else
+ array_insert_item(trans->updates2, trans->nr_updates2,
+ i - trans->updates2, n);
+}
+
+static void bch2_trans_update2(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_i *insert)
+{
+ __bch2_trans_update2(trans, (struct btree_insert_entry) {
+ .bkey_type = __btree_node_type(iter->level, iter->btree_id),
+ .btree_id = iter->btree_id,
+ .level = iter->level,
+ .iter = iter,
+ .k = insert,
+ });
}
static int extent_update_to_keys(struct btree_trans *trans,
- struct btree_iter *orig_iter,
- struct bkey_i *insert)
+ struct btree_insert_entry n)
{
- struct btree_iter *iter;
int ret;
- ret = bch2_extent_can_insert(trans, orig_iter, insert);
+ if (bkey_deleted(&n.k->k))
+ return 0;
+
+ ret = bch2_extent_can_insert(trans, n.iter, n.k);
if (ret)
return ret;
- if (bkey_deleted(&insert->k))
- return 0;
-
- iter = bch2_trans_copy_iter(trans, orig_iter);
- if (IS_ERR(iter))
- return PTR_ERR(iter);
+ n.iter = bch2_trans_get_iter(trans, n.iter->btree_id, n.k->k.p,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_NOT_EXTENTS);
+ n.is_extent = false;
- iter->flags |= BTREE_ITER_INTENT;
- __bch2_btree_iter_set_pos(iter, insert->k.p, false);
- bch2_trans_update2(trans, iter, insert);
- bch2_trans_iter_put(trans, iter);
+ __bch2_trans_update2(trans, n);
+ bch2_trans_iter_put(trans, n.iter);
return 0;
}
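+/* Trim or whiteout existing extents that the new extent overwrites, queueing the replacement keys with bch2_trans_update2(): */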
static int extent_handle_overwrites(struct btree_trans *trans,
enum btree_id btree_id,
- struct bpos start, struct bpos end)
+ struct bkey_i *insert)
{
- struct btree_iter *iter = NULL, *update_iter;
+ struct btree_iter *iter, *update_iter;
+ struct bpos start = bkey_start_pos(&insert->k);
struct bkey_i *update;
struct bkey_s_c k;
int ret = 0;
- iter = bch2_trans_get_iter(trans, btree_id, start, BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(iter);
- if (ret)
- return ret;
-
+ iter = bch2_trans_get_iter(trans, btree_id, start,
+ BTREE_ITER_INTENT);
k = bch2_btree_iter_peek_with_updates(iter);
while (k.k && !(ret = bkey_err(k))) {
- if (bkey_cmp(end, bkey_start_pos(k.k)) <= 0)
+ if (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0)
break;
if (bkey_cmp(bkey_start_pos(k.k), start) < 0) {
- update_iter = bch2_trans_copy_iter(trans, iter);
- if ((ret = PTR_ERR_OR_ZERO(update_iter)))
- goto err;
-
update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
if ((ret = PTR_ERR_OR_ZERO(update)))
- goto err;
+ break;
bkey_reassemble(update, k);
+
bch2_cut_back(start, update);
- __bch2_btree_iter_set_pos(update_iter, update->k.p, false);
+ update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p,
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_INTENT);
bch2_trans_update2(trans, update_iter, update);
bch2_trans_iter_put(trans, update_iter);
}
- if (bkey_cmp(k.k->p, end) > 0) {
- update_iter = bch2_trans_copy_iter(trans, iter);
- if ((ret = PTR_ERR_OR_ZERO(update_iter)))
- goto err;
-
- update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ if (bkey_cmp(k.k->p, insert->k.p) < 0 ||
+ (!bkey_cmp(k.k->p, insert->k.p) && bkey_deleted(&insert->k))) {
+ update = bch2_trans_kmalloc(trans, sizeof(struct bkey));
if ((ret = PTR_ERR_OR_ZERO(update)))
- goto err;
+ break;
- bkey_reassemble(update, k);
- bch2_cut_front(end, update);
+ bkey_init(&update->k);
+ update->k.p = k.k->p;
- __bch2_btree_iter_set_pos(update_iter, update->k.p, false);
+ update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p,
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_INTENT);
bch2_trans_update2(trans, update_iter, update);
bch2_trans_iter_put(trans, update_iter);
- } else {
- update_iter = bch2_trans_copy_iter(trans, iter);
- if ((ret = PTR_ERR_OR_ZERO(update_iter)))
- goto err;
+ }
- update = bch2_trans_kmalloc(trans, sizeof(struct bkey));
+ if (bkey_cmp(k.k->p, insert->k.p) > 0) {
+ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
if ((ret = PTR_ERR_OR_ZERO(update)))
- goto err;
+ break;
- update->k = *k.k;
- set_bkey_val_u64s(&update->k, 0);
- update->k.type = KEY_TYPE_deleted;
- update->k.size = 0;
+ bkey_reassemble(update, k);
+ bch2_cut_front(insert->k.p, update);
- __bch2_btree_iter_set_pos(update_iter, update->k.p, false);
+ update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p,
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_INTENT);
bch2_trans_update2(trans, update_iter, update);
bch2_trans_iter_put(trans, update_iter);
+ break;
}
k = bch2_btree_iter_next_with_updates(iter);
}
-err:
- if (!IS_ERR_OR_NULL(iter))
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_put(trans, iter);
+
return ret;
}
struct btree_insert_entry *i = NULL;
struct btree_iter *iter;
bool trans_trigger_run;
- unsigned u64s;
+ unsigned u64s, reset_flags = 0;
int ret = 0;
- BUG_ON(trans->need_reset);
-
if (!trans->nr_updates)
- goto out_noupdates;
+ goto out_reset;
if (trans->flags & BTREE_INSERT_GC_LOCK_HELD)
lockdep_assert_held(&trans->c->gc_lock);
unlikely(!percpu_ref_tryget(&trans->c->writes))) {
ret = bch2_trans_commit_get_rw_cold(trans);
if (ret)
- return ret;
+ goto out_reset;
}
#ifdef CONFIG_BCACHEFS_DEBUG
if (btree_iter_type(i->iter) != BTREE_ITER_CACHED &&
!(i->trigger_flags & BTREE_TRIGGER_NORUN))
bch2_btree_key_cache_verify_clean(trans,
- i->iter->btree_id, i->iter->pos);
+ i->btree_id, i->k->k.p);
#endif
/*
trans_trigger_run = false;
trans_for_each_update(trans, i) {
- if (unlikely(i->iter->uptodate > BTREE_ITER_NEED_PEEK &&
- (ret = bch2_btree_iter_traverse(i->iter)))) {
- trace_trans_restart_traverse(trans->ip);
- goto out;
- }
-
- /*
- * We're not using bch2_btree_iter_upgrade here because
- * we know trans->nounlock can't be set:
- */
- if (unlikely(i->iter->locks_want < 1 &&
- !__bch2_btree_iter_upgrade(i->iter, 1))) {
- trace_trans_restart_upgrade(trans->ip);
- ret = -EINTR;
- goto out;
- }
-
- if (iter_has_trans_triggers(i->iter) &&
+ if ((BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) &&
!i->trans_triggers_run) {
i->trans_triggers_run = true;
trans_trigger_run = true;
/* Turn extents updates into keys: */
trans_for_each_update(trans, i)
- if (i->iter->flags & BTREE_ITER_IS_EXTENTS) {
- struct bpos start = bkey_start_pos(&i->k->k);
-
- while (i + 1 < trans->updates + trans->nr_updates &&
- i[0].iter->btree_id == i[1].iter->btree_id &&
- !bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k)))
- i++;
-
- ret = extent_handle_overwrites(trans, i->iter->btree_id,
- start, i->k->k.p);
- if (ret)
+ if (i->is_extent) {
+ ret = extent_handle_overwrites(trans, i->btree_id, i->k);
+ if (unlikely(ret))
goto out;
}
trans_for_each_update(trans, i) {
- if (i->iter->flags & BTREE_ITER_IS_EXTENTS) {
- ret = extent_update_to_keys(trans, i->iter, i->k);
- if (ret)
- goto out;
- } else {
- bch2_trans_update2(trans, i->iter, i->k);
- }
+ ret = i->is_extent
+ ? extent_update_to_keys(trans, *i)
+ : (__bch2_trans_update2(trans, *i), 0);
+ if (unlikely(ret))
+ goto out;
}
trans_for_each_update2(trans, i) {
- BUG_ON(i->iter->uptodate > BTREE_ITER_NEED_PEEK);
- BUG_ON(i->iter->locks_want < 1);
+ ret = bch2_btree_iter_traverse(i->iter);
+ if (unlikely(ret)) {
+ trace_trans_restart_traverse(trans->ip);
+ goto out;
+ }
+
+ if (unlikely(!bch2_btree_iter_upgrade(i->iter, i->level + 1))) {
+ trace_trans_restart_upgrade(trans->ip);
+ ret = -EINTR;
+ goto out;
+ }
+
+ BUG_ON(!btree_node_intent_locked(i->iter, i->level));
u64s = jset_u64s(i->k->k.u64s);
if (btree_iter_type(i->iter) == BTREE_ITER_CACHED &&
goto err;
trans_for_each_iter(trans, iter)
- if ((trans->iters_live & (1ULL << iter->idx)) &&
- (iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT)) {
- if (trans->flags & BTREE_INSERT_NOUNLOCK)
- bch2_btree_iter_set_pos_same_leaf(iter, iter->pos_after_commit);
- else
- bch2_btree_iter_set_pos(iter, iter->pos_after_commit);
- }
+ if (btree_iter_live(trans, iter) &&
+ (iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT))
+ bch2_btree_iter_set_pos(iter, iter->pos_after_commit);
out:
bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres);
if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
percpu_ref_put(&trans->c->writes);
-out_noupdates:
- bch2_trans_reset(trans, !ret ? TRANS_RESET_NOTRAVERSE : 0);
+out_reset:
+ if (!ret)
+ reset_flags |= TRANS_RESET_NOTRAVERSE;
+ if (!ret && (trans->flags & BTREE_INSERT_NOUNLOCK))
+ reset_flags |= TRANS_RESET_NOUNLOCK;
+ bch2_trans_reset(trans, reset_flags);
return ret;
err:
struct bkey_i *k, enum btree_trigger_flags flags)
{
struct btree_insert_entry *i, n = (struct btree_insert_entry) {
- .trigger_flags = flags, .iter = iter, .k = k
+ .trigger_flags = flags,
+ .bkey_type = __btree_node_type(iter->level, iter->btree_id),
+ .btree_id = iter->btree_id,
+ .level = iter->level,
+ .is_extent = (iter->flags & BTREE_ITER_IS_EXTENTS) != 0,
+ .iter = iter,
+ .k = k
};
- EBUG_ON(bkey_cmp(iter->pos,
- (iter->flags & BTREE_ITER_IS_EXTENTS)
- ? bkey_start_pos(&k->k)
- : k->k.p));
+ BUG_ON(trans->nr_updates >= BTREE_ITER_MAX);
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+ BUG_ON(bkey_cmp(iter->pos,
+ n.is_extent ? bkey_start_pos(&k->k) : k->k.p));
+
+ trans_for_each_update(trans, i) {
+ BUG_ON(bkey_cmp(i->iter->pos,
+ i->is_extent ? bkey_start_pos(&i->k->k) : i->k->k.p));
+
+ BUG_ON(i != trans->updates &&
+ btree_insert_entry_cmp(i - 1, i) >= 0);
+ }
+#endif
iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
- if (btree_node_type_is_extents(iter->btree_id)) {
+ if (n.is_extent) {
iter->pos_after_commit = k->k.p;
iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT;
}
/*
- * Pending updates are kept sorted: first, find position of new update:
+ * Pending updates are kept sorted: first, find position of new update,
+ * then delete/trim any updates the new update overwrites:
*/
- trans_for_each_update(trans, i)
- if (btree_iter_cmp(iter, i->iter) <= 0)
- break;
+ if (!n.is_extent) {
+ trans_for_each_update(trans, i)
+ if (btree_insert_entry_cmp(&n, i) <= 0)
+ break;
- /*
- * Now delete/trim any updates the new update overwrites:
- */
- if (i > trans->updates &&
- i[-1].iter->btree_id == iter->btree_id &&
- bkey_cmp(iter->pos, i[-1].k->k.p) < 0)
- bch2_cut_back(n.iter->pos, i[-1].k);
-
- while (i < trans->updates + trans->nr_updates &&
- iter->btree_id == i->iter->btree_id &&
- bkey_cmp(n.k->k.p, i->k->k.p) >= 0)
- array_remove_item(trans->updates, trans->nr_updates,
- i - trans->updates);
-
- if (i < trans->updates + trans->nr_updates &&
- iter->btree_id == i->iter->btree_id &&
- bkey_cmp(n.k->k.p, i->iter->pos) > 0) {
- /*
- * When we have an extent that overwrites the start of another
- * update, trimming that extent will mean the iterator's
- * position has to change since the iterator position has to
- * match the extent's start pos - but we don't want to change
- * the iterator pos if some other code is using it, so we may
- * need to clone it:
- */
- if (trans->iters_live & (1ULL << i->iter->idx)) {
- i->iter = bch2_trans_copy_iter(trans, i->iter);
- if (IS_ERR(i->iter)) {
- trans->need_reset = true;
- return PTR_ERR(i->iter);
+ if (i < trans->updates + trans->nr_updates &&
+ !btree_insert_entry_cmp(&n, i))
+ *i = n;
+ else
+ array_insert_item(trans->updates, trans->nr_updates,
+ i - trans->updates, n);
+ } else {
+ trans_for_each_update(trans, i)
+ if (btree_insert_entry_cmp(&n, i) < 0)
+ break;
+
+ while (i > trans->updates &&
+ i[-1].btree_id == n.btree_id &&
+ bkey_cmp(bkey_start_pos(&n.k->k),
+ bkey_start_pos(&i[-1].k->k)) <= 0) {
+ --i;
+ array_remove_item(trans->updates, trans->nr_updates,
+ i - trans->updates);
+ }
+
+ if (i > trans->updates &&
+ i[-1].btree_id == n.btree_id &&
+ bkey_cmp(bkey_start_pos(&n.k->k), i[-1].k->k.p) < 0)
+ bch2_cut_back(bkey_start_pos(&n.k->k), i[-1].k);
+
+ if (i < trans->updates + trans->nr_updates &&
+ i->btree_id == n.btree_id &&
+ bkey_cmp(n.k->k.p, bkey_start_pos(&i->k->k)) > 0) {
+ /* We don't handle splitting extents here: */
+ BUG_ON(bkey_cmp(bkey_start_pos(&n.k->k),
+ bkey_start_pos(&i->k->k)) > 0);
+
+ /*
+ * When we have an extent that overwrites the start of another
+ * update, trimming that extent will mean the iterator's
+ * position has to change since the iterator position has to
+ * match the extent's start pos - but we don't want to change
+ * the iterator pos if some other code is using it, so we may
+ * need to clone it:
+ */
+ if (btree_iter_live(trans, i->iter)) {
+ i->iter = bch2_trans_copy_iter(trans, i->iter);
+
+ i->iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
+ bch2_trans_iter_put(trans, i->iter);
}
- i->iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
- bch2_trans_iter_put(trans, i->iter);
+ bch2_cut_front(n.k->k.p, i->k);
+ bch2_btree_iter_set_pos(i->iter, n.k->k.p);
}
- bch2_cut_front(n.k->k.p, i->k);
- bch2_btree_iter_set_pos(i->iter, n.k->k.p);
+ array_insert_item(trans->updates, trans->nr_updates,
+ i - trans->updates, n);
}
- EBUG_ON(trans->nr_updates >= trans->nr_iters);
-
- array_insert_item(trans->updates, trans->nr_updates,
- i - trans->updates, n);
return 0;
}
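+/* Register a hook to run at transaction commit time; hooks are pushed onto the front of the list, so they run newest-first: */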
+void bch2_trans_commit_hook(struct btree_trans *trans,
+ struct btree_trans_commit_hook *h)
+{
+ h->next = trans->hooks;
+ trans->hooks = h;
+}
+
int __bch2_btree_insert(struct btree_trans *trans,
enum btree_id id, struct bkey_i *k)
{
iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k),
BTREE_ITER_INTENT);
- if (IS_ERR(iter))
- return PTR_ERR(iter);
ret = bch2_btree_iter_traverse(iter) ?:
bch2_trans_update(trans, iter, k, 0);
__bch2_btree_insert(&trans, id, k));
}
-int bch2_btree_delete_at_range(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos end,
- u64 *journal_seq)
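+/* Delete the key at the iterator's current position by inserting a whiteout and committing: */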
+int bch2_btree_delete_at(struct btree_trans *trans,
+ struct btree_iter *iter, unsigned flags)
+{
+ struct bkey_i k;
+
+ bkey_init(&k.k);
+ k.k.p = iter->pos;
+
+ bch2_trans_update(trans, iter, &k, 0);
+ return bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|flags);
+}
+
+int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
+ struct bpos start, struct bpos end,
+ u64 *journal_seq)
{
+ struct btree_iter *iter;
struct bkey_s_c k;
int ret = 0;
+
+ iter = bch2_trans_get_iter(trans, id, start, BTREE_ITER_INTENT);
retry:
while ((k = bch2_btree_iter_peek(iter)).k &&
!(ret = bkey_err(k)) &&
bkey_init(&delete.k);
+ /*
+ * This could probably be more efficient for extents:
+ */
+
/*
* For extents, iter.pos won't necessarily be the same as
* bkey_start_pos(k.k) (for non extents they always will be the
goto retry;
}
+ bch2_trans_iter_free(trans, iter);
return ret;
-
-}
-
-int bch2_btree_delete_at(struct btree_trans *trans,
- struct btree_iter *iter, unsigned flags)
-{
- struct bkey_i k;
-
- bkey_init(&k.k);
- k.k.p = iter->pos;
-
- bch2_trans_update(trans, iter, &k, 0);
- return bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE|flags);
}
/*
struct bpos start, struct bpos end,
u64 *journal_seq)
{
- struct btree_trans trans;
- struct btree_iter *iter;
- int ret = 0;
-
- /*
- * XXX: whether we need mem/more iters depends on whether this btree id
- * has triggers
- */
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512);
-
- iter = bch2_trans_get_iter(&trans, id, start, BTREE_ITER_INTENT);
-
- ret = bch2_btree_delete_at_range(&trans, iter, end, journal_seq);
- ret = bch2_trans_exit(&trans) ?: ret;
-
- BUG_ON(ret == -EINTR);
- return ret;
+ return bch2_trans_do(c, NULL, journal_seq, 0,
+ bch2_btree_delete_range_trans(&trans, id, start, end, journal_seq));
}
void bch2_fs_usage_initialize(struct bch_fs *c)
{
struct bch_fs_usage *usage;
+ struct bch_dev *ca;
unsigned i;
percpu_down_write(&c->mark_lock);
usage = c->usage_base;
- bch2_fs_usage_acc_to_base(c, 0);
- bch2_fs_usage_acc_to_base(c, 1);
+ for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+ bch2_fs_usage_acc_to_base(c, i);
for (i = 0; i < BCH_REPLICAS_MAX; i++)
usage->reserved += usage->persistent_reserved[i];
fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
}
- percpu_up_write(&c->mark_lock);
-}
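+ /* Superblock and journal buckets are accounted as hidden space: */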
+ for_each_member_device(ca, c, i) {
+ struct bch_dev_usage dev = bch2_dev_usage_read(ca);
-void bch2_fs_usage_scratch_put(struct bch_fs *c, struct bch_fs_usage *fs_usage)
-{
- if (fs_usage == c->usage_scratch)
- mutex_unlock(&c->usage_scratch_lock);
- else
- kfree(fs_usage);
+ usage->hidden += (dev.d[BCH_DATA_sb].buckets +
+ dev.d[BCH_DATA_journal].buckets) *
+ ca->mi.bucket_size;
+ }
+
+ percpu_up_write(&c->mark_lock);
}
-struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *c)
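+/* Per-device usage, like filesystem usage, is accumulated per journal buffer; return the percpu counters for the given journal sequence, or the gc counters: */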
+static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca,
+ unsigned journal_seq,
+ bool gc)
{
- struct bch_fs_usage *ret;
- unsigned bytes = fs_usage_u64s(c) * sizeof(u64);
-
- ret = kzalloc(bytes, GFP_NOWAIT|__GFP_NOWARN);
- if (ret)
- return ret;
-
- if (mutex_trylock(&c->usage_scratch_lock))
- goto out_pool;
-
- ret = kzalloc(bytes, GFP_NOFS);
- if (ret)
- return ret;
-
- mutex_lock(&c->usage_scratch_lock);
-out_pool:
- ret = c->usage_scratch;
- memset(ret, 0, bytes);
- return ret;
+ return this_cpu_ptr(gc
+ ? ca->usage_gc
+ : ca->usage[journal_seq & JOURNAL_BUF_MASK]);
}
struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
{
+ struct bch_fs *c = ca->fs;
struct bch_dev_usage ret;
+ unsigned seq, i, u64s = dev_usage_u64s();
- memset(&ret, 0, sizeof(ret));
- acc_u64s_percpu((u64 *) &ret,
- (u64 __percpu *) ca->usage[0],
- sizeof(ret) / sizeof(u64));
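+ /* Sum the base counters plus each journal buffer's percpu deltas, retrying if we race with bch2_fs_usage_acc_to_base(): */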
+ do {
+ seq = read_seqcount_begin(&c->usage_lock);
+ memcpy(&ret, ca->usage_base, u64s * sizeof(u64));
+ for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
+ acc_u64s_percpu((u64 *) &ret, (u64 __percpu *) ca->usage[i], u64s);
+ } while (read_seqcount_retry(&c->usage_lock, seq));
return ret;
}
{
return this_cpu_ptr(gc
? c->usage_gc
- : c->usage[journal_seq & 1]);
+ : c->usage[journal_seq & JOURNAL_BUF_MASK]);
}
u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
{
ssize_t offset = v - (u64 *) c->usage_base;
- unsigned seq;
+ unsigned i, seq;
u64 ret;
BUG_ON(offset < 0 || offset >= fs_usage_u64s(c));
do {
seq = read_seqcount_begin(&c->usage_lock);
- ret = *v +
- percpu_u64_get((u64 __percpu *) c->usage[0] + offset) +
- percpu_u64_get((u64 __percpu *) c->usage[1] + offset);
+ ret = *v;
+
+ for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+ ret += percpu_u64_get((u64 __percpu *) c->usage[i] + offset);
} while (read_seqcount_retry(&c->usage_lock, seq));
return ret;
}
-struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c)
+struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c)
{
- struct bch_fs_usage *ret;
- unsigned seq, v, u64s = fs_usage_u64s(c);
-retry:
- ret = kmalloc(u64s * sizeof(u64), GFP_NOFS);
- if (unlikely(!ret))
- return NULL;
+ struct bch_fs_usage_online *ret;
+ unsigned seq, i, u64s;
percpu_down_read(&c->mark_lock);
- v = fs_usage_u64s(c);
- if (unlikely(u64s != v)) {
- u64s = v;
+ ret = kmalloc(sizeof(struct bch_fs_usage_online) +
+ sizeof(u64) * c->replicas.nr, GFP_NOFS);
+ if (unlikely(!ret)) {
percpu_up_read(&c->mark_lock);
- kfree(ret);
- goto retry;
+ return NULL;
}
+ ret->online_reserved = percpu_u64_get(c->online_reserved);
+
+ u64s = fs_usage_u64s(c);
do {
seq = read_seqcount_begin(&c->usage_lock);
- memcpy(ret, c->usage_base, u64s * sizeof(u64));
- acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[0], u64s);
- acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[1], u64s);
+ memcpy(&ret->u, c->usage_base, u64s * sizeof(u64));
+ for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+ acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i], u64s);
} while (read_seqcount_retry(&c->usage_lock, seq));
return ret;
void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
{
- unsigned u64s = fs_usage_u64s(c);
+ struct bch_dev *ca;
+ unsigned i, u64s = fs_usage_u64s(c);
- BUG_ON(idx >= 2);
+ BUG_ON(idx >= ARRAY_SIZE(c->usage));
preempt_disable();
write_seqcount_begin(&c->usage_lock);
(u64 __percpu *) c->usage[idx], u64s);
percpu_memset(c->usage[idx], 0, u64s * sizeof(u64));
+ rcu_read_lock();
+ for_each_member_device_rcu(ca, c, i, NULL) {
+ u64s = dev_usage_u64s();
+
+ acc_u64s_percpu((u64 *) ca->usage_base,
+ (u64 __percpu *) ca->usage[idx], u64s);
+ percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64));
+ }
+ rcu_read_unlock();
+
write_seqcount_end(&c->usage_lock);
preempt_enable();
}
void bch2_fs_usage_to_text(struct printbuf *out,
struct bch_fs *c,
- struct bch_fs_usage *fs_usage)
+ struct bch_fs_usage_online *fs_usage)
{
unsigned i;
pr_buf(out, "capacity:\t\t\t%llu\n", c->capacity);
pr_buf(out, "hidden:\t\t\t\t%llu\n",
- fs_usage->hidden);
+ fs_usage->u.hidden);
pr_buf(out, "data:\t\t\t\t%llu\n",
- fs_usage->data);
+ fs_usage->u.data);
pr_buf(out, "cached:\t\t\t\t%llu\n",
- fs_usage->cached);
+ fs_usage->u.cached);
pr_buf(out, "reserved:\t\t\t%llu\n",
- fs_usage->reserved);
+ fs_usage->u.reserved);
pr_buf(out, "nr_inodes:\t\t\t%llu\n",
- fs_usage->nr_inodes);
+ fs_usage->u.nr_inodes);
pr_buf(out, "online reserved:\t\t%llu\n",
fs_usage->online_reserved);
for (i = 0;
- i < ARRAY_SIZE(fs_usage->persistent_reserved);
+ i < ARRAY_SIZE(fs_usage->u.persistent_reserved);
i++) {
pr_buf(out, "%u replicas:\n", i + 1);
pr_buf(out, "\treserved:\t\t%llu\n",
- fs_usage->persistent_reserved[i]);
+ fs_usage->u.persistent_reserved[i]);
}
for (i = 0; i < c->replicas.nr; i++) {
pr_buf(out, "\t");
bch2_replicas_entry_to_text(out, e);
- pr_buf(out, ":\t%llu\n", fs_usage->replicas[i]);
+ pr_buf(out, ":\t%llu\n", fs_usage->u.replicas[i]);
}
}
static u64 avail_factor(u64 r)
{
- return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1);
+ return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1);
}
-u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage *fs_usage)
+u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage)
{
- return min(fs_usage->hidden +
- fs_usage->btree +
- fs_usage->data +
- reserve_factor(fs_usage->reserved +
+ return min(fs_usage->u.hidden +
+ fs_usage->u.btree +
+ fs_usage->u.data +
+ reserve_factor(fs_usage->u.reserved +
fs_usage->online_reserved),
c->capacity);
}
data = bch2_fs_usage_read_one(c, &c->usage_base->data) +
bch2_fs_usage_read_one(c, &c->usage_base->btree);
reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) +
- bch2_fs_usage_read_one(c, &c->usage_base->online_reserved);
+ percpu_u64_get(c->online_reserved);
ret.used = min(ret.capacity, data + reserve_factor(reserved));
ret.free = ret.capacity - ret.used;
return !is_available_bucket(m);
}
-static inline int is_fragmented_bucket(struct bucket_mark m,
- struct bch_dev *ca)
+static inline int bucket_sectors_fragmented(struct bch_dev *ca,
+ struct bucket_mark m)
{
- if (!m.owned_by_allocator &&
- m.data_type == BCH_DATA_user &&
- bucket_sectors_used(m))
- return max_t(int, 0, (int) ca->mi.bucket_size -
- bucket_sectors_used(m));
- return 0;
+ return bucket_sectors_used(m)
+ ? max(0, (int) ca->mi.bucket_size - (int) bucket_sectors_used(m))
+ : 0;
}
static inline int is_stripe_data_bucket(struct bucket_mark m)
return m.stripe && m.data_type != BCH_DATA_parity;
}
-static inline int bucket_stripe_sectors(struct bucket_mark m)
-{
- return is_stripe_data_bucket(m) ? m.dirty_sectors : 0;
-}
-
static inline enum bch_data_type bucket_type(struct bucket_mark m)
{
return m.cached_sectors && !m.dirty_sectors
!is_available_bucket(new);
}
-int bch2_fs_usage_apply(struct bch_fs *c,
- struct bch_fs_usage *fs_usage,
- struct disk_reservation *disk_res,
- unsigned journal_seq)
-{
- s64 added = fs_usage->data + fs_usage->reserved;
- s64 should_not_have_added;
- int ret = 0;
-
- percpu_rwsem_assert_held(&c->mark_lock);
-
- /*
- * Not allowed to reduce sectors_available except by getting a
- * reservation:
- */
- should_not_have_added = added - (s64) (disk_res ? disk_res->sectors : 0);
- if (WARN_ONCE(should_not_have_added > 0,
- "disk usage increased by %lli more than reservation of %llu",
- added, disk_res ? disk_res->sectors : 0)) {
- atomic64_sub(should_not_have_added, &c->sectors_available);
- added -= should_not_have_added;
- ret = -1;
- }
-
- if (added > 0) {
- disk_res->sectors -= added;
- fs_usage->online_reserved -= added;
- }
-
- preempt_disable();
- acc_u64s((u64 *) fs_usage_ptr(c, journal_seq, false),
- (u64 *) fs_usage, fs_usage_u64s(c));
- preempt_enable();
-
- return ret;
-}
-
static inline void account_bucket(struct bch_fs_usage *fs_usage,
struct bch_dev_usage *dev_usage,
enum bch_data_type type,
if (type == BCH_DATA_sb || type == BCH_DATA_journal)
fs_usage->hidden += size;
- dev_usage->buckets[type] += nr;
+ dev_usage->d[type].buckets += nr;
}
static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
struct bch_fs_usage *fs_usage,
struct bucket_mark old, struct bucket_mark new,
- bool gc)
+ u64 journal_seq, bool gc)
{
struct bch_dev_usage *u;
percpu_rwsem_assert_held(&c->mark_lock);
preempt_disable();
- u = this_cpu_ptr(ca->usage[gc]);
+ if (!fs_usage)
+ fs_usage = fs_usage_ptr(c, journal_seq, gc);
+ u = dev_usage_ptr(ca, journal_seq, gc);
if (bucket_type(old))
account_bucket(fs_usage, u, bucket_type(old),
account_bucket(fs_usage, u, bucket_type(new),
1, ca->mi.bucket_size);
- u->buckets_alloc +=
- (int) new.owned_by_allocator - (int) old.owned_by_allocator;
+ u->buckets_ec += (int) new.stripe - (int) old.stripe;
u->buckets_unavailable +=
is_unavailable_bucket(new) - is_unavailable_bucket(old);
- u->buckets_ec += (int) new.stripe - (int) old.stripe;
- u->sectors_ec += bucket_stripe_sectors(new) -
- bucket_stripe_sectors(old);
-
- u->sectors[old.data_type] -= old.dirty_sectors;
- u->sectors[new.data_type] += new.dirty_sectors;
- u->sectors[BCH_DATA_cached] +=
+ u->d[old.data_type].sectors -= old.dirty_sectors;
+ u->d[new.data_type].sectors += new.dirty_sectors;
+ u->d[BCH_DATA_cached].sectors +=
(int) new.cached_sectors - (int) old.cached_sectors;
- u->sectors_fragmented +=
- is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca);
+
+ u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old);
+ u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new);
+
preempt_enable();
if (!is_available_bucket(old) && is_available_bucket(new))
bch2_wake_allocator(ca);
}
-__flatten
-void bch2_dev_usage_from_buckets(struct bch_fs *c)
-{
- struct bch_dev *ca;
- struct bucket_mark old = { .v.counter = 0 };
- struct bucket_array *buckets;
- struct bucket *g;
- unsigned i;
- int cpu;
-
- c->usage_base->hidden = 0;
-
- for_each_member_device(ca, c, i) {
- for_each_possible_cpu(cpu)
- memset(per_cpu_ptr(ca->usage[0], cpu), 0,
- sizeof(*ca->usage[0]));
-
- buckets = bucket_array(ca);
-
- for_each_bucket(g, buckets)
- bch2_dev_usage_update(c, ca, c->usage_base,
- old, g->mark, false);
- }
-}
-
-static inline int update_replicas(struct bch_fs *c,
- struct bch_fs_usage *fs_usage,
- struct bch_replicas_entry *r,
- s64 sectors)
+static inline void update_replicas(struct bch_fs *c,
+ struct bch_fs_usage *fs_usage,
+ struct bch_replicas_entry *r,
+ s64 sectors)
{
int idx = bch2_replicas_entry_idx(c, r);
- if (idx < 0)
- return -1;
-
- if (!fs_usage)
- return 0;
+ BUG_ON(idx < 0);
fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
fs_usage->replicas[idx] += sectors;
- return 0;
}
static inline void update_cached_sectors(struct bch_fs *c,
n = (void *) d->d + d->used;
n->delta = sectors;
memcpy(&n->r, r, replicas_entry_bytes(r));
+ bch2_replicas_entry_sort(&n->r);
d->used += b;
}
update_replicas_list(trans, &r.e, sectors);
}
-static inline struct replicas_delta *
-replicas_delta_next(struct replicas_delta *d)
-{
- return (void *) d + replicas_entry_bytes(&d->r) + 8;
-}
-
-int bch2_replicas_delta_list_apply(struct bch_fs *c,
- struct bch_fs_usage *fs_usage,
- struct replicas_delta_list *r)
-{
- struct replicas_delta *d = r->d;
- struct replicas_delta *top = (void *) r->d + r->used;
- unsigned i;
-
- for (d = r->d; d != top; d = replicas_delta_next(d))
- if (update_replicas(c, fs_usage, &d->r, d->delta)) {
- top = d;
- goto unwind;
- }
-
- if (!fs_usage)
- return 0;
-
- fs_usage->nr_inodes += r->nr_inodes;
-
- for (i = 0; i < BCH_REPLICAS_MAX; i++) {
- fs_usage->reserved += r->persistent_reserved[i];
- fs_usage->persistent_reserved[i] += r->persistent_reserved[i];
- }
-
- return 0;
-unwind:
- for (d = r->d; d != top; d = replicas_delta_next(d))
- update_replicas(c, fs_usage, &d->r, -d->delta);
- return -1;
-}
-
#define do_mark_fn(fn, c, pos, flags, ...) \
({ \
int gc, ret = 0; \
ret; \
})
-static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
- size_t b, struct bucket_mark *ret,
- bool gc)
-{
- struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc);
- struct bucket *g = __bucket(ca, b, gc);
- struct bucket_mark old, new;
-
- old = bucket_cmpxchg(g, new, ({
- BUG_ON(!is_available_bucket(new));
-
- new.owned_by_allocator = true;
- new.data_type = 0;
- new.cached_sectors = 0;
- new.dirty_sectors = 0;
- new.gen++;
- }));
-
- bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
-
- if (old.cached_sectors)
- update_cached_sectors(c, fs_usage, ca->dev_idx,
- -((s64) old.cached_sectors));
-
- if (!gc)
- *ret = old;
- return 0;
-}
-
-void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
- size_t b, struct bucket_mark *old)
-{
- do_mark_fn(__bch2_invalidate_bucket, c, gc_phase(GC_PHASE_START), 0,
- ca, b, old);
-
- if (!old->owned_by_allocator && old->cached_sectors)
- trace_invalidate(ca, bucket_to_sector(ca, b),
- old->cached_sectors);
-}
-
static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, bool owned_by_allocator,
bool gc)
{
- struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc);
struct bucket *g = __bucket(ca, b, gc);
struct bucket_mark old, new;
new.owned_by_allocator = owned_by_allocator;
}));
- bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
-
BUG_ON(!gc &&
!owned_by_allocator && !old.owned_by_allocator);
struct bucket_mark old_m, m;
/* We don't do anything for deletions - do we?: */
- if (new.k->type != KEY_TYPE_alloc)
+ if (new.k->type != KEY_TYPE_alloc &&
+ new.k->type != KEY_TYPE_alloc_v2)
return 0;
/*
m.data_type = u.data_type;
m.dirty_sectors = u.dirty_sectors;
m.cached_sectors = u.cached_sectors;
+ m.stripe = u.stripe != 0;
if (journal_seq) {
m.journal_seq_valid = 1;
}
}));
- bch2_dev_usage_update(c, ca, fs_usage, old_m, m, gc);
+ bch2_dev_usage_update(c, ca, fs_usage, old_m, m, journal_seq, gc);
g->io_time[READ] = u.read_time;
g->io_time[WRITE] = u.write_time;
g->oldest_gen = u.oldest_gen;
g->gen_valid = 1;
+ g->stripe = u.stripe;
+ g->stripe_redundancy = u.stripe_redundancy;
/*
* need to know if we're getting called from the invalidate path or
if (c)
bch2_dev_usage_update(c, ca, fs_usage_ptr(c, 0, gc),
- old, new, gc);
+ old, new, 0, gc);
return 0;
}
return 0;
}
-static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
+static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k,
unsigned ptr_idx,
struct bch_fs_usage *fs_usage,
- u64 journal_seq, unsigned flags,
- bool enabled)
+ u64 journal_seq, unsigned flags)
{
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
unsigned nr_data = s->nr_blocks - s->nr_redundant;
char buf[200];
int ret;
- if (enabled)
- g->ec_redundancy = s->nr_redundant;
+ if (g->stripe && g->stripe != k.k->p.offset) {
+ bch2_fs_inconsistent(c,
+ "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
+ ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen,
+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
+ return -EINVAL;
+ }
old = bucket_cmpxchg(g, new, ({
ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type,
if (ret)
return ret;
- if (new.stripe && enabled)
- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
- "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
- ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen,
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
-
- if (!new.stripe && !enabled)
- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
- "bucket %u:%zu gen %u: deleting stripe but not marked\n%s",
- ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen,
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
-
- new.stripe = enabled;
-
- if ((flags & BTREE_TRIGGER_GC) && parity) {
- new.data_type = enabled ? BCH_DATA_parity : 0;
- new.dirty_sectors = enabled ? le16_to_cpu(s->sectors): 0;
+ if (parity) {
+ new.data_type = BCH_DATA_parity;
+ new.dirty_sectors = le16_to_cpu(s->sectors);
}
if (journal_seq) {
}
}));
- if (!enabled)
- g->ec_redundancy = 0;
+ g->stripe = k.k->p.offset;
+ g->stripe_redundancy = s->nr_redundant;
- bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
+ bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc);
return 0;
}
old.v.counter,
new.v.counter)) != old.v.counter);
- bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
+ bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc);
BUG_ON(!gc && bucket_became_unavailable(old, new));
unsigned i;
int ret;
+ BUG_ON(gc && old_s);
+
if (!m || (old_s && !m->alive)) {
bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
idx);
}
if (!new_s) {
- /* Deleting: */
- for (i = 0; i < old_s->nr_blocks; i++) {
- ret = bucket_set_stripe(c, old, i, fs_usage,
- journal_seq, flags, false);
- if (ret)
- return ret;
- }
-
- if (!gc && m->on_heap) {
- spin_lock(&c->ec_stripes_heap_lock);
- bch2_stripes_heap_del(c, m, idx);
- spin_unlock(&c->ec_stripes_heap_lock);
- }
-
- if (gc)
- update_replicas(c, fs_usage, &m->r.e,
- -((s64) m->sectors * m->nr_redundant));
+ spin_lock(&c->ec_stripes_heap_lock);
+ bch2_stripes_heap_del(c, m, idx);
+ spin_unlock(&c->ec_stripes_heap_lock);
memset(m, 0, sizeof(*m));
} else {
- BUG_ON(old_s && new_s->nr_blocks != old_s->nr_blocks);
- BUG_ON(old_s && new_s->nr_redundant != old_s->nr_redundant);
-
- for (i = 0; i < new_s->nr_blocks; i++) {
- if (!old_s ||
- memcmp(new_s->ptrs + i,
- old_s->ptrs + i,
- sizeof(struct bch_extent_ptr))) {
-
- if (old_s) {
- bucket_set_stripe(c, old, i, fs_usage,
- journal_seq, flags, false);
- if (ret)
- return ret;
- }
- ret = bucket_set_stripe(c, new, i, fs_usage,
- journal_seq, flags, true);
- if (ret)
- return ret;
- }
- }
-
m->alive = true;
m->sectors = le16_to_cpu(new_s->sectors);
m->algorithm = new_s->algorithm;
m->block_sectors[i] =
stripe_blockcount_get(new_s, i);
m->blocks_nonempty += !!m->block_sectors[i];
- }
- if (gc && old_s)
- update_replicas(c, fs_usage, &m->r.e,
- -((s64) m->sectors * m->nr_redundant));
+ m->ptrs[i] = new_s->ptrs[i];
+ }
bch2_bkey_to_replicas(&m->r.e, new);
- if (gc)
- update_replicas(c, fs_usage, &m->r.e,
- ((s64) m->sectors * m->nr_redundant));
-
if (!gc) {
spin_lock(&c->ec_stripes_heap_lock);
bch2_stripes_heap_update(c, m, idx);
}
}
+ if (gc) {
+ /*
+ * gc recalculates these fields from stripe ptr
+ * references:
+ */
+ memset(m->block_sectors, 0, sizeof(m->block_sectors));
+ m->blocks_nonempty = 0;
+
+ for (i = 0; i < new_s->nr_blocks; i++) {
+ ret = mark_stripe_bucket(c, new, i, fs_usage,
+ journal_seq, flags);
+ if (ret)
+ return ret;
+ }
+
+ update_replicas(c, fs_usage, &m->r.e,
+ ((s64) m->sectors * m->nr_redundant));
+ }
+
return 0;
}
switch (k.k->type) {
case KEY_TYPE_alloc:
+ case KEY_TYPE_alloc_v2:
ret = bch2_mark_alloc(c, old, new, fs_usage, journal_seq, flags);
break;
case KEY_TYPE_btree_ptr:
ret = bch2_mark_stripe(c, old, new, fs_usage, journal_seq, flags);
break;
case KEY_TYPE_inode:
- if (!(flags & BTREE_TRIGGER_OVERWRITE))
- fs_usage->nr_inodes++;
- else
- fs_usage->nr_inodes--;
+ fs_usage->nr_inodes += new.k->type == KEY_TYPE_inode;
+ fs_usage->nr_inodes -= old.k->type == KEY_TYPE_inode;
break;
case KEY_TYPE_reservation: {
unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
unsigned flags)
{
struct bch_fs *c = trans->c;
- struct btree *b = iter_l(iter)->b;
- struct btree_node_iter node_iter = iter_l(iter)->iter;
- struct bkey_packed *_old;
struct bkey_s_c old;
struct bkey unpacked;
int ret = 0;
old = (struct bkey_s_c) { &unpacked, NULL };
if (!btree_node_type_is_extents(iter->btree_id)) {
+ /* iterators should be uptodate, shouldn't get errors here: */
if (btree_iter_type(iter) != BTREE_ITER_CACHED) {
- _old = bch2_btree_node_iter_peek(&node_iter, b);
- if (_old)
- old = bkey_disassemble(b, _old, &unpacked);
+ old = bch2_btree_iter_peek_slot(iter);
+ BUG_ON(bkey_err(old));
} else {
struct bkey_cached *ck = (void *) iter->l[0].b;
BTREE_TRIGGER_OVERWRITE|flags);
}
} else {
+ struct btree_iter *copy;
+
BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED);
bch2_mark_key_locked(c, old, bkey_i_to_s_c(new),
0, new->k.size,
fs_usage, trans->journal_res.seq,
BTREE_TRIGGER_INSERT|flags);
- while ((_old = bch2_btree_node_iter_peek(&node_iter, b))) {
- unsigned offset = 0;
- s64 sectors;
+ copy = bch2_trans_copy_iter(trans, iter);
- old = bkey_disassemble(b, _old, &unpacked);
- sectors = -((s64) old.k->size);
+ for_each_btree_key_continue(copy, 0, old, ret) {
+ unsigned offset = 0;
+ s64 sectors = -((s64) old.k->size);
flags |= BTREE_TRIGGER_OVERWRITE;
if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0)
- return 0;
+ break;
switch (bch2_extent_overlap(&new->k, old.k)) {
case BCH_EXTENT_OVERLAP_ALL:
trans->journal_res.seq, flags) ?: 1;
if (ret <= 0)
break;
-
- bch2_btree_node_iter_advance(&node_iter, b);
}
+ bch2_trans_iter_put(trans, copy);
}
return ret;
}
-void bch2_trans_fs_usage_apply(struct btree_trans *trans,
- struct bch_fs_usage *fs_usage)
+static noinline __cold
+void fs_usage_apply_warn(struct btree_trans *trans,
+ unsigned disk_res_sectors)
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
- static int warned_disk_usage = 0;
- u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
char buf[200];
- if (!bch2_fs_usage_apply(c, fs_usage, trans->disk_res,
- trans->journal_res.seq) ||
- warned_disk_usage ||
- xchg(&warned_disk_usage, 1))
- return;
-
- bch_err(c, "disk usage increased more than %llu sectors reserved",
+ bch_err(c, "disk usage increased more than %u sectors reserved",
disk_res_sectors);
trans_for_each_update(trans, i) {
pr_err("overlapping with");
if (btree_iter_type(i->iter) != BTREE_ITER_CACHED) {
- struct btree *b = iter_l(i->iter)->b;
- struct btree_node_iter node_iter = iter_l(i->iter)->iter;
- struct bkey_packed *_k;
-
- while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) {
- struct bkey unpacked;
- struct bkey_s_c k;
-
- pr_info("_k %px format %u", _k, _k->format);
- k = bkey_disassemble(b, _k, &unpacked);
+ struct btree_iter *copy = bch2_trans_copy_iter(trans, i->iter);
+ struct bkey_s_c k;
+ int ret;
- if (btree_node_is_extents(b)
+ for_each_btree_key_continue(copy, 0, k, ret) {
+ if (btree_node_type_is_extents(i->iter->btree_id)
? bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) <= 0
: bkey_cmp(i->k->k.p, k.k->p))
break;
bch2_bkey_val_to_text(&PBUF(buf), c, k);
pr_err("%s", buf);
-
- bch2_btree_node_iter_advance(&node_iter, b);
}
+ bch2_trans_iter_put(trans, copy);
} else {
struct bkey_cached *ck = (void *) i->iter->l[0].b;
}
}
+void bch2_trans_fs_usage_apply(struct btree_trans *trans,
+ struct replicas_delta_list *deltas)
+{
+ struct bch_fs *c = trans->c;
+ static int warned_disk_usage = 0;
+ bool warn = false;
+ unsigned disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
+ struct replicas_delta *d = deltas->d;
+ struct replicas_delta *top = (void *) deltas->d + deltas->used;
+ struct bch_fs_usage *dst;
+ s64 added = 0, should_not_have_added;
+ unsigned i;
+
+ percpu_rwsem_assert_held(&c->mark_lock);
+
+ preempt_disable();
+ dst = fs_usage_ptr(c, trans->journal_res.seq, false);
+
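+ /* Apply the accumulated replicas deltas to this journal buffer's usage counters, summing how many sectors of btree/user/parity data were added: */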
+ for (d = deltas->d; d != top; d = replicas_delta_next(d)) {
+ switch (d->r.data_type) {
+ case BCH_DATA_btree:
+ case BCH_DATA_user:
+ case BCH_DATA_parity:
+ added += d->delta;
+ }
+
+ update_replicas(c, dst, &d->r, d->delta);
+ }
+
+ dst->nr_inodes += deltas->nr_inodes;
+
+ for (i = 0; i < BCH_REPLICAS_MAX; i++) {
+ added += deltas->persistent_reserved[i];
+ dst->reserved += deltas->persistent_reserved[i];
+ dst->persistent_reserved[i] += deltas->persistent_reserved[i];
+ }
+
+ /*
+ * Not allowed to reduce sectors_available except by getting a
+ * reservation:
+ */
+ should_not_have_added = added - (s64) disk_res_sectors;
+ if (unlikely(should_not_have_added > 0)) {
+ atomic64_sub(should_not_have_added, &c->sectors_available);
+ added -= should_not_have_added;
+ warn = true;
+ }
+
+ if (added > 0) {
+ trans->disk_res->sectors -= added;
+ this_cpu_sub(*c->online_reserved, added);
+ }
+
+ preempt_enable();
+
+ if (unlikely(warn) && !xchg(&warned_disk_usage, 1))
+ fs_usage_apply_warn(trans, disk_res_sectors);
+}
+
/* trans_mark: */
static struct btree_iter *trans_get_update(struct btree_trans *trans,
bkey_cmp(pos, i->k->k.p) < 0
: !bkey_cmp(pos, i->iter->pos))) {
*k = bkey_i_to_s_c(i->k);
+
+ /* ugly hack.. */
+ BUG_ON(btree_iter_live(trans, i->iter));
+ trans->iters_live |= 1ULL << i->iter->idx;
return i->iter;
}
struct btree_iter **iter,
struct bkey_s_c *k)
{
- unsigned flags = btree_id != BTREE_ID_ALLOC
+ unsigned flags = btree_id != BTREE_ID_alloc
? BTREE_ITER_SLOTS
: BTREE_ITER_CACHED;
int ret;
*iter = bch2_trans_get_iter(trans, btree_id, pos,
flags|BTREE_ITER_INTENT);
- if (IS_ERR(*iter))
- return PTR_ERR(*iter);
-
*k = __bch2_btree_iter_peek(*iter, flags);
ret = bkey_err(*k);
if (ret)
return ret;
}
-static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter,
- const struct bch_extent_ptr *ptr,
- struct bkey_alloc_unpacked *u)
+static struct bkey_alloc_buf *
+bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter,
+ const struct bch_extent_ptr *ptr,
+ struct bkey_alloc_unpacked *u)
{
struct bch_fs *c = trans->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
struct bucket *g;
struct btree_iter *iter;
struct bkey_s_c k;
+ struct bkey_alloc_buf *a;
int ret;
- iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k);
+ a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
+ if (IS_ERR(a))
+ return a;
+
+ iter = trans_get_update(trans, BTREE_ID_alloc, pos, &k);
if (iter) {
*u = bch2_alloc_unpack(k);
} else {
- iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, pos,
+ iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, pos,
BTREE_ITER_CACHED|
BTREE_ITER_CACHED_NOFILL|
BTREE_ITER_INTENT);
- if (IS_ERR(iter))
- return PTR_ERR(iter);
-
ret = bch2_btree_iter_traverse(iter);
if (ret) {
bch2_trans_iter_put(trans, iter);
- return ret;
+ return ERR_PTR(ret);
}
percpu_down_read(&c->mark_lock);
g = bucket(ca, pos.offset);
- *u = alloc_mem_to_key(g, READ_ONCE(g->mark));
+ *u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark));
percpu_up_read(&c->mark_lock);
}
*_iter = iter;
- return 0;
+ return a;
}
static int bch2_trans_mark_pointer(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_iter *iter;
struct bkey_alloc_unpacked u;
- struct bkey_i_alloc *a;
+ struct bkey_alloc_buf *a;
int ret;
- ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u);
- if (ret)
- return ret;
+ a = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u);
+ if (IS_ERR(a))
+ return PTR_ERR(a);
ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, u.gen, &u.data_type,
&u.dirty_sectors, &u.cached_sectors);
if (ret)
goto out;
- a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
- ret = PTR_ERR_OR_ZERO(a);
- if (ret)
- goto out;
-
- bkey_alloc_init(&a->k_i);
- a->k.p = iter->pos;
- bch2_alloc_pack(a, u);
- bch2_trans_update(trans, iter, &a->k_i, 0);
+ bch2_alloc_pack(c, a, u);
+ bch2_trans_update(trans, iter, &a->k, 0);
out:
bch2_trans_iter_put(trans, iter);
return ret;
}
static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
- struct bch_extent_stripe_ptr p,
+ struct extent_ptr_decoded p,
s64 sectors, enum bch_data_type data_type)
{
struct bch_fs *c = trans->c;
struct bch_replicas_padded r;
int ret = 0;
- ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k);
+ ret = trans_get_key(trans, BTREE_ID_stripes, POS(0, p.ec.idx), &iter, &k);
if (ret < 0)
return ret;
if (k.k->type != KEY_TYPE_stripe) {
bch2_fs_inconsistent(c,
"pointer to nonexistent stripe %llu",
- (u64) p.idx);
+ (u64) p.ec.idx);
+ ret = -EIO;
+ goto out;
+ }
+
+ if (!bch2_ptr_matches_stripe(bkey_s_c_to_stripe(k).v, p)) {
+ bch2_fs_inconsistent(c,
+ "stripe pointer doesn't match stripe %llu",
+ (u64) p.ec.idx);
ret = -EIO;
goto out;
}
goto out;
bkey_reassemble(&s->k_i, k);
- stripe_blockcount_set(&s->v, p.block,
- stripe_blockcount_get(&s->v, p.block) +
+ stripe_blockcount_set(&s->v, p.ec.block,
+ stripe_blockcount_get(&s->v, p.ec.block) +
sectors);
bch2_trans_update(trans, iter, &s->k_i, 0);
dirty_sectors += disk_sectors;
r.e.devs[r.e.nr_devs++] = p.ptr.dev;
} else {
- ret = bch2_trans_mark_stripe_ptr(trans, p.ec,
+ ret = bch2_trans_mark_stripe_ptr(trans, p,
disk_sectors, data_type);
if (ret)
return ret;
return 0;
}
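+/* Update the alloc key for one block of a stripe: account parity sectors, and record (or clear) which stripe owns the bucket: */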
+static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans,
+ struct bkey_s_c_stripe s,
+ unsigned idx, bool deleting)
+{
+ struct bch_fs *c = trans->c;
+ const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
+ struct bkey_alloc_buf *a;
+ struct btree_iter *iter;
+ struct bkey_alloc_unpacked u;
+ bool parity = idx >= s.v->nr_blocks - s.v->nr_redundant;
+ int ret = 0;
+
+ a = bch2_trans_start_alloc_update(trans, &iter, ptr, &u);
+ if (IS_ERR(a))
+ return PTR_ERR(a);
+
+ if (parity) {
+ s64 sectors = le16_to_cpu(s.v->sectors);
+
+ if (deleting)
+ sectors = -sectors;
+
+ u.dirty_sectors += sectors;
+ u.data_type = u.dirty_sectors
+ ? BCH_DATA_parity
+ : 0;
+ }
+
+ if (!deleting) {
+ if (bch2_fs_inconsistent_on(u.stripe && u.stripe != s.k->p.offset, c,
+ "bucket %llu:%llu gen %u: multiple stripes using same bucket (%u, %llu)",
+ iter->pos.inode, iter->pos.offset, u.gen,
+ u.stripe, s.k->p.offset)) {
+ ret = -EIO;
+ goto err;
+ }
+
+ u.stripe = s.k->p.offset;
+ u.stripe_redundancy = s.v->nr_redundant;
+ } else {
+ u.stripe = 0;
+ u.stripe_redundancy = 0;
+ }
+
+ bch2_alloc_pack(c, a, u);
+ bch2_trans_update(trans, iter, &a->k, 0);
+err:
+ bch2_trans_iter_put(trans, iter);
+ return ret;
+}
+
static int bch2_trans_mark_stripe(struct btree_trans *trans,
- struct bkey_s_c k,
+ struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
{
- const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
- unsigned nr_data = s->nr_blocks - s->nr_redundant;
+ struct bkey_s_c_stripe old_s = { NULL };
+ struct bkey_s_c_stripe new_s = { NULL };
struct bch_replicas_padded r;
- struct bkey_alloc_unpacked u;
- struct bkey_i_alloc *a;
- struct btree_iter *iter;
- bool deleting = flags & BTREE_TRIGGER_OVERWRITE;
- s64 sectors = le16_to_cpu(s->sectors);
unsigned i;
int ret = 0;
- if (deleting)
- sectors = -sectors;
-
- bch2_bkey_to_replicas(&r.e, k);
- update_replicas_list(trans, &r.e, sectors * s->nr_redundant);
+ if (old.k->type == KEY_TYPE_stripe)
+ old_s = bkey_s_c_to_stripe(old);
+ if (new.k->type == KEY_TYPE_stripe)
+ new_s = bkey_s_c_to_stripe(new);
/*
- * The allocator code doesn't necessarily update bucket gens in the
- * btree when incrementing them, right before handing out new buckets -
- * we just need to persist those updates here along with the new stripe:
+ * If the pointers aren't changing, we don't need to do anything:
*/
+ if (new_s.k && old_s.k &&
+ new_s.v->nr_blocks == old_s.v->nr_blocks &&
+ new_s.v->nr_redundant == old_s.v->nr_redundant &&
+ !memcmp(old_s.v->ptrs, new_s.v->ptrs,
+ new_s.v->nr_blocks * sizeof(struct bch_extent_ptr)))
+ return 0;
- for (i = 0; i < s->nr_blocks && !ret; i++) {
- bool parity = i >= nr_data;
+ if (new_s.k) {
+ s64 sectors = le16_to_cpu(new_s.v->sectors);
- ret = bch2_trans_start_alloc_update(trans, &iter,
- &s->ptrs[i], &u);
- if (ret)
- break;
+ bch2_bkey_to_replicas(&r.e, new);
+ update_replicas_list(trans, &r.e, sectors * new_s.v->nr_redundant);
- if (parity) {
- u.dirty_sectors += sectors;
- u.data_type = u.dirty_sectors
- ? BCH_DATA_parity
- : 0;
+ for (i = 0; i < new_s.v->nr_blocks; i++) {
+ ret = bch2_trans_mark_stripe_alloc_ref(trans, new_s,
+ i, false);
+ if (ret)
+ return ret;
}
+ }
- a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
- ret = PTR_ERR_OR_ZERO(a);
- if (ret)
- goto put_iter;
-
- bkey_alloc_init(&a->k_i);
- a->k.p = iter->pos;
- bch2_alloc_pack(a, u);
- bch2_trans_update(trans, iter, &a->k_i, 0);
-put_iter:
- bch2_trans_iter_put(trans, iter);
+ if (old_s.k) {
+ s64 sectors = -((s64) le16_to_cpu(old_s.v->sectors));
+
+ bch2_bkey_to_replicas(&r.e, old);
+ update_replicas_list(trans, &r.e, sectors * old_s.v->nr_redundant);
+
+ for (i = 0; i < old_s.v->nr_blocks; i++) {
+ ret = bch2_trans_mark_stripe_alloc_ref(trans, old_s,
+ i, true);
+ if (ret)
+ return ret;
+ }
}
return ret;
__le64 *refcount;
s64 ret;
- ret = trans_get_key(trans, BTREE_ID_REFLINK,
+ ret = trans_get_key(trans, BTREE_ID_reflink,
POS(0, idx), &iter, &k);
if (ret < 0)
return ret;
}
bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k));
- BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
-
bch2_trans_update(trans, iter, n, 0);
out:
ret = sectors;
return ret;
}
-int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k,
+int bch2_trans_mark_key(struct btree_trans *trans,
+ struct bkey_s_c old,
+ struct bkey_s_c new,
unsigned offset, s64 sectors, unsigned flags)
{
- struct replicas_delta_list *d;
struct bch_fs *c = trans->c;
+ struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old;
+ struct replicas_delta_list *d;
+
+ BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)));
switch (k.k->type) {
case KEY_TYPE_btree_ptr:
return bch2_trans_mark_extent(trans, k, offset, sectors,
flags, BCH_DATA_user);
case KEY_TYPE_stripe:
- return bch2_trans_mark_stripe(trans, k, flags);
- case KEY_TYPE_inode:
- d = replicas_deltas_realloc(trans, 0);
+ return bch2_trans_mark_stripe(trans, old, new, flags);
+ case KEY_TYPE_inode: {
+ int nr = (new.k->type == KEY_TYPE_inode) -
+ (old.k->type == KEY_TYPE_inode);
+
+ if (nr) {
+ d = replicas_deltas_realloc(trans, 0);
+ d->nr_inodes += nr;
+ }
- if (!(flags & BTREE_TRIGGER_OVERWRITE))
- d->nr_inodes++;
- else
- d->nr_inodes--;
return 0;
+ }
case KEY_TYPE_reservation: {
unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
int bch2_trans_mark_update(struct btree_trans *trans,
struct btree_iter *iter,
- struct bkey_i *insert,
+ struct bkey_i *new,
unsigned flags)
{
- struct btree *b = iter_l(iter)->b;
- struct btree_node_iter node_iter = iter_l(iter)->iter;
- struct bkey_packed *_k;
+ struct bkey_s_c old;
int ret;
if (unlikely(flags & BTREE_TRIGGER_NORUN))
if (!btree_node_type_needs_gc(iter->btree_id))
return 0;
- ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(insert),
- 0, insert->k.size, BTREE_TRIGGER_INSERT);
- if (ret)
- return ret;
+ if (!btree_node_type_is_extents(iter->btree_id)) {
+ /* iterators should be uptodate, shouldn't get errors here: */
+ if (btree_iter_type(iter) != BTREE_ITER_CACHED) {
+ old = bch2_btree_iter_peek_slot(iter);
+ BUG_ON(bkey_err(old));
+ } else {
+ struct bkey_cached *ck = (void *) iter->l[0].b;
- if (btree_iter_type(iter) == BTREE_ITER_CACHED) {
- struct bkey_cached *ck = (void *) iter->l[0].b;
+ BUG_ON(!ck->valid);
+ old = bkey_i_to_s_c(ck->k);
+ }
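+		/*
+		 * If the key type is unchanged, run the insert and overwrite
+		 * triggers in a single call; otherwise run them separately:
+		 */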
- return bch2_trans_mark_key(trans, bkey_i_to_s_c(ck->k),
- 0, 0, BTREE_TRIGGER_OVERWRITE);
- }
+ if (old.k->type == new->k.type) {
+ ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0,
+ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
+ } else {
+ ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0,
+ BTREE_TRIGGER_INSERT|flags) ?:
+ bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0,
+ BTREE_TRIGGER_OVERWRITE|flags);
+ }
+ } else {
+ struct btree_iter *copy;
+ struct bkey _old;
- while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) {
- struct bkey unpacked;
- struct bkey_s_c k;
- unsigned offset = 0;
- s64 sectors = 0;
- unsigned flags = BTREE_TRIGGER_OVERWRITE;
+ EBUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED);
- k = bkey_disassemble(b, _k, &unpacked);
+ bkey_init(&_old);
+ old = (struct bkey_s_c) { &_old, NULL };
- if (btree_node_is_extents(b)
- ? bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0
- : bkey_cmp(insert->k.p, k.k->p))
- break;
+ ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new),
+ 0, new->k.size,
+ BTREE_TRIGGER_INSERT);
+ if (ret)
+ return ret;
- if (btree_node_is_extents(b)) {
- switch (bch2_extent_overlap(&insert->k, k.k)) {
+ copy = bch2_trans_copy_iter(trans, iter);
+
+ for_each_btree_key_continue(copy, 0, old, ret) {
+ unsigned offset = 0;
+ s64 sectors = -((s64) old.k->size);
+
+ flags |= BTREE_TRIGGER_OVERWRITE;
+
+ if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0)
+ break;
+
+ switch (bch2_extent_overlap(&new->k, old.k)) {
case BCH_EXTENT_OVERLAP_ALL:
offset = 0;
- sectors = -((s64) k.k->size);
+ sectors = -((s64) old.k->size);
break;
case BCH_EXTENT_OVERLAP_BACK:
- offset = bkey_start_offset(&insert->k) -
- bkey_start_offset(k.k);
- sectors = bkey_start_offset(&insert->k) -
- k.k->p.offset;
+ offset = bkey_start_offset(&new->k) -
+ bkey_start_offset(old.k);
+ sectors = bkey_start_offset(&new->k) -
+ old.k->p.offset;
break;
case BCH_EXTENT_OVERLAP_FRONT:
offset = 0;
- sectors = bkey_start_offset(k.k) -
- insert->k.p.offset;
+ sectors = bkey_start_offset(old.k) -
+ new->k.p.offset;
break;
case BCH_EXTENT_OVERLAP_MIDDLE:
- offset = bkey_start_offset(&insert->k) -
- bkey_start_offset(k.k);
- sectors = -((s64) insert->k.size);
+ offset = bkey_start_offset(&new->k) -
+ bkey_start_offset(old.k);
+ sectors = -((s64) new->k.size);
flags |= BTREE_TRIGGER_OVERWRITE_SPLIT;
break;
}
BUG_ON(sectors >= 0);
+
+ ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new),
+ offset, sectors, flags);
+ if (ret)
+ break;
}
+ bch2_trans_iter_put(trans, copy);
+ }
- ret = bch2_trans_mark_key(trans, k, offset, sectors, flags);
- if (ret)
- return ret;
+ return ret;
+}
+
+static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
+ struct bch_dev *ca, size_t b,
+ enum bch_data_type type,
+ unsigned sectors)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter *iter;
+ struct bkey_alloc_unpacked u;
+ struct bkey_alloc_buf *a;
+ struct bch_extent_ptr ptr = {
+ .dev = ca->dev_idx,
+ .offset = bucket_to_sector(ca, b),
+ };
+ int ret = 0;
+
+ a = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u);
+ if (IS_ERR(a))
+ return PTR_ERR(a);
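+
+	/* A bucket may hold only one type of data, and its sector count must not overflow: */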
- bch2_btree_node_iter_advance(&node_iter, b);
+ if (u.data_type && u.data_type != type) {
+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
+ "while marking %s",
+ iter->pos.inode, iter->pos.offset, u.gen,
+ bch2_data_types[u.data_type],
+ bch2_data_types[type],
+ bch2_data_types[type]);
+ ret = -EIO;
+ goto out;
}
- return 0;
+ if ((unsigned) (u.dirty_sectors + sectors) > ca->mi.bucket_size) {
+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ "bucket %llu:%llu gen %u data type %s sector count overflow: %u + %u > %u\n"
+ "while marking %s",
+ iter->pos.inode, iter->pos.offset, u.gen,
+ bch2_data_types[u.data_type ?: type],
+ u.dirty_sectors, sectors, ca->mi.bucket_size,
+ bch2_data_types[type]);
+ ret = -EIO;
+ goto out;
+ }
+
+ if (u.data_type == type &&
+ u.dirty_sectors == sectors)
+ goto out;
+
+ u.data_type = type;
+ u.dirty_sectors = sectors;
+
+ bch2_alloc_pack(c, a, u);
+ bch2_trans_update(trans, iter, &a->k, 0);
+out:
+ bch2_trans_iter_put(trans, iter);
+ return ret;
}
-/* Disk reservations: */
+int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
+ struct disk_reservation *res,
+ struct bch_dev *ca, size_t b,
+ enum bch_data_type type,
+ unsigned sectors)
+{
+ return __bch2_trans_do(trans, res, NULL, 0,
+ __bch2_trans_mark_metadata_bucket(trans, ca, b, type,
+ sectors));
-static u64 bch2_recalc_sectors_available(struct bch_fs *c)
+}
+
+static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
+ struct disk_reservation *res,
+ struct bch_dev *ca,
+ u64 start, u64 end,
+ enum bch_data_type type,
+ u64 *bucket, unsigned *bucket_sectors)
{
- percpu_u64_set(&c->pcpu->sectors_available, 0);
+ int ret = 0;
+
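+	/*
+	 * Walk the range one bucket at a time, accumulating sectors for the
+	 * current bucket and flushing when we cross into the next one:
+	 */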
+ do {
+ u64 b = sector_to_bucket(ca, start);
+ unsigned sectors =
+ min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
+
+ if (b != *bucket) {
+ if (*bucket_sectors) {
+ ret = bch2_trans_mark_metadata_bucket(trans, res, ca,
+ *bucket, type, *bucket_sectors);
+ if (ret)
+ return ret;
+ }
+
+ *bucket = b;
+ *bucket_sectors = 0;
+ }
+
+ *bucket_sectors += sectors;
+ start += sectors;
+ } while (!ret && start < end);
- return avail_factor(__bch2_fs_usage_read_short(c).free);
+ return 0;
}
-void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
+static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
+ struct disk_reservation *res,
+ struct bch_dev *ca)
{
- percpu_down_read(&c->mark_lock);
- this_cpu_sub(c->usage[0]->online_reserved,
- res->sectors);
- percpu_up_read(&c->mark_lock);
+ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
+ u64 bucket = 0;
+ unsigned i, bucket_sectors = 0;
+ int ret;
+
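+	/* Mark each superblock copy in the layout, then the journal buckets: */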
+ for (i = 0; i < layout->nr_superblocks; i++) {
+ u64 offset = le64_to_cpu(layout->sb_offset[i]);
+
+ if (offset == BCH_SB_SECTOR) {
+ ret = bch2_trans_mark_metadata_sectors(trans, res, ca,
+ 0, BCH_SB_SECTOR,
+ BCH_DATA_sb, &bucket, &bucket_sectors);
+ if (ret)
+ return ret;
+ }
- res->sectors = 0;
+ ret = bch2_trans_mark_metadata_sectors(trans, res, ca, offset,
+ offset + (1 << layout->sb_max_size_bits),
+ BCH_DATA_sb, &bucket, &bucket_sectors);
+ if (ret)
+ return ret;
+ }
+
+ if (bucket_sectors) {
+ ret = bch2_trans_mark_metadata_bucket(trans, res, ca,
+ bucket, BCH_DATA_sb, bucket_sectors);
+ if (ret)
+ return ret;
+ }
+
+ for (i = 0; i < ca->journal.nr; i++) {
+ ret = bch2_trans_mark_metadata_bucket(trans, res, ca,
+ ca->journal.buckets[i],
+ BCH_DATA_journal, ca->mi.bucket_size);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
}
+int bch2_trans_mark_dev_sb(struct bch_fs *c,
+ struct disk_reservation *res,
+ struct bch_dev *ca)
+{
+ return bch2_trans_do(c, res, NULL, 0,
+ __bch2_trans_mark_dev_sb(&trans, res, ca));
+}
+
+/* Disk reservations: */
+
#define SECTORS_CACHE 1024
int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
- unsigned sectors, int flags)
+ u64 sectors, int flags)
{
struct bch_fs_pcpu *pcpu;
u64 old, v, get;
if (get < sectors) {
preempt_enable();
- percpu_up_read(&c->mark_lock);
goto recalculate;
}
} while ((v = atomic64_cmpxchg(&c->sectors_available,
out:
pcpu->sectors_available -= sectors;
- this_cpu_add(c->usage[0]->online_reserved, sectors);
+ this_cpu_add(*c->online_reserved, sectors);
res->sectors += sectors;
preempt_enable();
return 0;
recalculate:
- percpu_down_write(&c->mark_lock);
+ mutex_lock(&c->sectors_available_lock);
- sectors_available = bch2_recalc_sectors_available(c);
+ percpu_u64_set(&c->pcpu->sectors_available, 0);
+ sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free);
if (sectors <= sectors_available ||
(flags & BCH_DISK_RESERVATION_NOFAIL)) {
atomic64_set(&c->sectors_available,
max_t(s64, 0, sectors_available - sectors));
- this_cpu_add(c->usage[0]->online_reserved, sectors);
+ this_cpu_add(*c->online_reserved, sectors);
res->sectors += sectors;
ret = 0;
} else {
ret = -ENOSPC;
}
- percpu_up_write(&c->mark_lock);
+ mutex_unlock(&c->sectors_available_lock);
+ percpu_up_read(&c->mark_lock);
return ret;
}
ca->mi.bucket_size / c->opts.btree_node_size);
/* XXX: these should be tunable */
size_t reserve_none = max_t(size_t, 1, nbuckets >> 9);
- size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 7);
+ size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 6);
size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12),
btree_reserve * 2);
bool resize = ca->buckets[0] != NULL;
!(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) *
sizeof(unsigned long),
GFP_KERNEL|__GFP_ZERO)) ||
- !init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) ||
!init_fifo(&free[RESERVE_MOVINGGC],
copygc_reserve, GFP_KERNEL) ||
!init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket));
- free_percpu(ca->usage[0]);
+ for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
+ free_percpu(ca->usage[i]);
+ kfree(ca->usage_base);
}
int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
{
- if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage)))
+ unsigned i;
+
+ ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
+ if (!ca->usage_base)
return -ENOMEM;
+ for (i = 0; i < ARRAY_SIZE(ca->usage); i++) {
+ ca->usage[i] = alloc_percpu(struct bch_dev_usage);
+ if (!ca->usage[i])
+ return -ENOMEM;
+ }
+
	return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
}
return __bucket(ca, b, false);
}
-static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw)
-{
- return c->bucket_clock[rw].hand - g->io_time[rw];
-}
-
/*
* bucket_gc_gen() returns the difference between the bucket's current gen and
* the oldest gen of any pointer into that bucket in the btree.
*/
-static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b)
+static inline u8 bucket_gc_gen(struct bucket *g)
{
- struct bucket *g = bucket(ca, b);
-
return g->mark.gen - g->oldest_gen;
}
return mark.dirty_sectors + mark.cached_sectors;
}
-static inline bool bucket_unused(struct bucket_mark mark)
-{
- return !mark.owned_by_allocator &&
- !mark.data_type &&
- !bucket_sectors_used(mark);
-}
-
static inline bool is_available_bucket(struct bucket_mark mark)
{
- return (!mark.owned_by_allocator &&
- !mark.dirty_sectors &&
- !mark.stripe);
+ return !mark.dirty_sectors && !mark.stripe;
}
static inline bool bucket_needs_journal_commit(struct bucket_mark m,
struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *);
-void bch2_dev_usage_from_buckets(struct bch_fs *);
-
static inline u64 __dev_buckets_available(struct bch_dev *ca,
struct bch_dev_usage stats)
{
READ_ONCE(c->replicas.nr);
}
-void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage *);
-struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *);
+static inline unsigned dev_usage_u64s(void)
+{
+ return sizeof(struct bch_dev_usage) / sizeof(u64);
+}
u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *);
-struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *);
+struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *);
void bch2_fs_usage_acc_to_base(struct bch_fs *, unsigned);
void bch2_fs_usage_to_text(struct printbuf *,
- struct bch_fs *, struct bch_fs_usage *);
+ struct bch_fs *, struct bch_fs_usage_online *);
-u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage *);
+u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage_online *);
struct bch_fs_usage_short
bch2_fs_usage_read_short(struct bch_fs *);
void bch2_bucket_seq_cleanup(struct bch_fs *);
void bch2_fs_usage_initialize(struct bch_fs *);
-void bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *,
- size_t, struct bucket_mark *);
void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *,
size_t, bool, struct gc_pos, unsigned);
void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned,
s64, struct bch_fs_usage *, u64, unsigned);
-int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
- struct disk_reservation *, unsigned);
int bch2_mark_update(struct btree_trans *, struct btree_iter *,
struct bkey_i *, struct bch_fs_usage *, unsigned);
-int bch2_replicas_delta_list_apply(struct bch_fs *,
- struct bch_fs_usage *,
- struct replicas_delta_list *);
-int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c,
+int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c,
unsigned, s64, unsigned);
int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter,
struct bkey_i *insert, unsigned);
-void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *);
+void bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
-/* disk reservations: */
+int bch2_trans_mark_metadata_bucket(struct btree_trans *,
+ struct disk_reservation *, struct bch_dev *,
+ size_t, enum bch_data_type, unsigned);
+int bch2_trans_mark_dev_sb(struct bch_fs *, struct disk_reservation *,
+ struct bch_dev *);
-void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *);
+/* disk reservations: */
static inline void bch2_disk_reservation_put(struct bch_fs *c,
struct disk_reservation *res)
{
- if (res->sectors)
- __bch2_disk_reservation_put(c, res);
+ this_cpu_sub(*c->online_reserved, res->sectors);
+ res->sectors = 0;
}
#define BCH_DISK_RESERVATION_NOFAIL (1 << 0)
int bch2_disk_reservation_add(struct bch_fs *,
- struct disk_reservation *,
- unsigned, int);
+ struct disk_reservation *,
+ u64, int);
static inline struct disk_reservation
bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas)
static inline int bch2_disk_reservation_get(struct bch_fs *c,
struct disk_reservation *res,
- unsigned sectors,
- unsigned nr_replicas,
+ u64 sectors, unsigned nr_replicas,
int flags)
{
*res = bch2_disk_reservation_init(c, nr_replicas);
const struct bucket_mark mark;
};
- u16 io_time[2];
+ u64 io_time[2];
u8 oldest_gen;
u8 gc_gen;
unsigned gen_valid:1;
- u8 ec_redundancy;
+ u8 stripe_redundancy;
+ u32 stripe;
};
struct bucket_array {
};
struct bch_dev_usage {
- u64 buckets[BCH_DATA_NR];
- u64 buckets_alloc;
+ u64 buckets_ec;
u64 buckets_unavailable;
- /* _compressed_ sectors: */
- u64 sectors[BCH_DATA_NR];
- u64 sectors_fragmented;
-
- u64 buckets_ec;
- u64 sectors_ec;
+ struct {
+ u64 buckets;
+ u64 sectors; /* _compressed_ sectors: */
+ u64 fragmented;
+ } d[BCH_DATA_NR];
};
struct bch_fs_usage {
/* all fields are in units of 512 byte sectors: */
-
- u64 online_reserved;
-
- /* fields after online_reserved are cleared/recalculated by gc: */
- u64 gc_start[0];
-
u64 hidden;
u64 btree;
u64 data;
u64 replicas[];
};
+struct bch_fs_usage_online {
+ u64 online_reserved;
+ struct bch_fs_usage u;
+};
+
struct bch_fs_usage_short {
u64 capacity;
u64 used;
u64 nr_inodes;
};
-struct replicas_delta {
- s64 delta;
- struct bch_replicas_entry r;
-} __packed;
-
-struct replicas_delta_list {
- unsigned size;
- unsigned used;
-
- struct {} memset_start;
- u64 nr_inodes;
- u64 persistent_reserved[BCH_REPLICAS_MAX];
- struct {} memset_end;
- struct replicas_delta d[0];
-};
-
/*
* A reservation for space on disk:
*/
#include "bcachefs_ioctl.h"
#include "buckets.h"
#include "chardev.h"
+#include "journal.h"
#include "move.h"
#include "replicas.h"
#include "super.h"
ctx->c = c;
ctx->arg = arg;
- ctx->thread = kthread_create(bch2_data_thread, ctx, "[bcachefs]");
+ ctx->thread = kthread_create(bch2_data_thread, ctx,
+ "bch-data/%s", c->name);
if (IS_ERR(ctx->thread)) {
ret = PTR_ERR(ctx->thread);
goto err;
{
struct bch_ioctl_fs_usage *arg = NULL;
struct bch_replicas_usage *dst_e, *dst_end;
- struct bch_fs_usage *src;
+ struct bch_fs_usage_online *src;
u32 replica_entries_bytes;
unsigned i;
int ret = 0;
arg->online_reserved = src->online_reserved;
for (i = 0; i < BCH_REPLICAS_MAX; i++)
- arg->persistent_reserved[i] = src->persistent_reserved[i];
+ arg->persistent_reserved[i] = src->u.persistent_reserved[i];
dst_e = arg->replicas;
dst_end = (void *) arg->replicas + replica_entries_bytes;
break;
}
- dst_e->sectors = src->replicas[i];
+ dst_e->sectors = src->u.replicas[i];
dst_e->r = *src_e;
/* recheck after setting nr_devs: */
arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket;
arg.available_buckets = arg.nr_buckets - src.buckets_unavailable;
arg.ec_buckets = src.buckets_ec;
- arg.ec_sectors = src.sectors_ec;
+ arg.ec_sectors = 0;
for (i = 0; i < BCH_DATA_NR; i++) {
- arg.buckets[i] = src.buckets[i];
- arg.sectors[i] = src.sectors[i];
+ arg.buckets[i] = src.d[i].buckets;
+ arg.sectors[i] = src.d[i].sectors;
}
percpu_ref_put(&ca->ref);
return ret;
}
+static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
+ struct bch_ioctl_disk_resize_journal arg)
+{
+ struct bch_dev *ca;
+ int ret;
+
+ if ((arg.flags & ~BCH_BY_INDEX) ||
+ arg.pad)
+ return -EINVAL;
+
+ ca = bch2_device_lookup(c, arg.dev, arg.flags);
+ if (IS_ERR(ca))
+ return PTR_ERR(ca);
+
+ ret = bch2_set_nr_journal_buckets(c, ca, arg.nbuckets);
+
+ percpu_ref_put(&ca->ref);
+ return ret;
+}
+
#define BCH_IOCTL(_name, _argtype) \
do { \
_argtype i; \
BCH_IOCTL(data, struct bch_ioctl_data);
case BCH_IOCTL_DISK_RESIZE:
BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize);
+ case BCH_IOCTL_DISK_RESIZE_JOURNAL:
+ BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal);
default:
return -ENOTTY;
bool data)
{
switch (type) {
- case BCH_CSUM_OPT_NONE:
+ case BCH_CSUM_OPT_none:
return BCH_CSUM_NONE;
- case BCH_CSUM_OPT_CRC32C:
+ case BCH_CSUM_OPT_crc32c:
return data ? BCH_CSUM_CRC32C : BCH_CSUM_CRC32C_NONZERO;
- case BCH_CSUM_OPT_CRC64:
+ case BCH_CSUM_OPT_crc64:
return data ? BCH_CSUM_CRC64 : BCH_CSUM_CRC64_NONZERO;
default:
BUG();
spin_lock(&clock->timer_lock);
- if (time_after_eq((unsigned long) atomic_long_read(&clock->now),
+ if (time_after_eq((unsigned long) atomic64_read(&clock->now),
timer->expire)) {
spin_unlock(&clock->timer_lock);
timer->fn(timer);
void __bch2_increment_clock(struct io_clock *clock, unsigned sectors)
{
struct io_timer *timer;
- unsigned long now = atomic_long_add_return(sectors, &clock->now);
+ unsigned long now = atomic64_add_return(sectors, &clock->now);
while ((timer = get_expired_timer(clock, now)))
timer->fn(timer);
unsigned i;
spin_lock(&clock->timer_lock);
- now = atomic_long_read(&clock->now);
+ now = atomic64_read(&clock->now);
for (i = 0; i < clock->timers.used; i++)
pr_buf(out, "%ps:\t%li\n",
int bch2_io_clock_init(struct io_clock *clock)
{
- atomic_long_set(&clock->now, 0);
+ atomic64_set(&clock->now, 0);
spin_lock_init(&clock->timer_lock);
clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus();
typedef HEAP(struct io_timer *) io_timer_heap;
struct io_clock {
- atomic_long_t now;
+ atomic64_t now;
u16 __percpu *pcpu_buf;
unsigned max_slop;
BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max);
- if (!IS_ENABLED(CONFIG_HIGHMEM) &&
+ if (!PageHighMem(bio_iter_page(bio, start)) &&
bio_phys_contig(bio, start))
return (struct bbuf) {
.b = page_address(bio_iter_page(bio, start)) +
ZSTD_CCtx *ctx = ZSTD_initCCtx(workspace,
ZSTD_CCtxWorkspaceBound(c->zstd_params.cParams));
+ /*
+ * ZSTD requires that when we decompress we pass in the exact
+ * compressed size - rounding it up to the nearest sector
+ * doesn't work, so we use the first 4 bytes of the buffer for
+ * that.
+ *
+ * Additionally, the ZSTD code seems to have a bug where it will
+ * write just past the end of the buffer - so subtract a fudge
+ * factor (7 bytes) from the dst buffer size to account for
+ * that.
+ */
size_t len = ZSTD_compressCCtx(ctx,
- dst + 4, dst_len - 4,
+ dst + 4, dst_len - 4 - 7,
src, src_len,
c->zstd_params);
if (ZSTD_isError(len))
v->written = 0;
v->c.level = b->c.level;
v->c.btree_id = b->c.btree_id;
- bch2_btree_keys_init(v, &c->expensive_debug_checks);
+ bch2_btree_keys_init(v);
if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
NULL, &pick) <= 0)
memcpy(n_ondisk, n_sorted, btree_bytes(c));
- if (bch2_btree_node_read_done(c, v, false))
+ if (bch2_btree_node_read_done(c, ca, v, false))
goto out;
n_sorted = c->verify_data->data;
bch2_trans_init(&trans, i->c, 0, 0);
- iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH);
+ iter = bch2_trans_get_iter(&trans, i->id, i->from,
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS);
k = bch2_btree_iter_peek(iter);
while (k.k && !(err = bkey_err(k))) {
if (!i->size)
break;
}
+ bch2_trans_iter_put(&trans, iter);
+
bch2_trans_exit(&trans);
return err < 0 ? err : i->ret;
if (err)
return err;
- if (!i->size || !bkey_cmp(POS_MAX, i->from))
+ if (!i->size || !bpos_cmp(POS_MAX, i->from))
return i->ret;
bch2_trans_init(&trans, i->c, 0, 0);
* can't easily correctly restart a btree node traversal across
* all nodes, meh
*/
- i->from = bkey_cmp(POS_MAX, b->key.k.p)
- ? bkey_successor(b->key.k.p)
+ i->from = bpos_cmp(POS_MAX, b->key.k.p)
+ ? bpos_successor(b->key.k.p)
: b->key.k.p;
if (!i->size)
break;
}
+ bch2_trans_iter_put(&trans, iter);
+
bch2_trans_exit(&trans);
return err < 0 ? err : i->ret;
if (err)
break;
- bch2_btree_iter_next(iter);
+ bch2_btree_iter_advance(iter);
i->from = iter->pos;
err = flush_buf(i);
struct btree;
struct bch_fs;
-#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name;
-BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
-#define BCH_DEBUG_PARAM(name, description) \
- static inline bool name(struct bch_fs *c) \
- { return bch2_##name || c->name; }
-BCH_DEBUG_PARAMS_ALWAYS()
-#undef BCH_DEBUG_PARAM
-
#ifdef CONFIG_BCACHEFS_DEBUG
-
-#define BCH_DEBUG_PARAM(name, description) \
- static inline bool name(struct bch_fs *c) \
- { return bch2_##name || c->name; }
-BCH_DEBUG_PARAMS_DEBUG()
-#undef BCH_DEBUG_PARAM
-
void __bch2_btree_verify(struct bch_fs *, struct btree *);
-
-#define bypass_torture_test(d) ((d)->bypass_torture_test)
-
-#else /* DEBUG */
-
-#define BCH_DEBUG_PARAM(name, description) \
- static inline bool name(struct bch_fs *c) { return false; }
-BCH_DEBUG_PARAMS_DEBUG()
-#undef BCH_DEBUG_PARAM
-
+#else
static inline void __bch2_btree_verify(struct bch_fs *c, struct btree *b) {}
-
-#define bypass_torture_test(d) 0
-
#endif
static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b)
{
- if (verify_btree_ondisk(c))
+ if (bch2_verify_btree_ondisk)
__bch2_btree_verify(c, b);
}
}
const struct bch_hash_desc bch2_dirent_hash_desc = {
- .btree_id = BTREE_ID_DIRENTS,
+ .btree_id = BTREE_ID_dirents,
.key_type = KEY_TYPE_dirent,
.hash_key = dirent_hash_key,
.hash_bkey = dirent_hash_bkey,
int bch2_dirent_create(struct btree_trans *trans,
u64 dir_inum, const struct bch_hash_info *hash_info,
u8 type, const struct qstr *name, u64 dst_inum,
- int flags)
+ u64 *dir_offset, int flags)
{
struct bkey_i_dirent *dirent;
int ret;
if (ret)
return ret;
- return bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
- dir_inum, &dirent->k_i, flags);
+ ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
+ dir_inum, &dirent->k_i, flags);
+ *dir_offset = dirent->k.p.offset;
+
+ return ret;
}
static void dirent_copy_target(struct bkey_i_dirent *dst,
int bch2_dirent_rename(struct btree_trans *trans,
u64 src_dir, struct bch_hash_info *src_hash,
u64 dst_dir, struct bch_hash_info *dst_hash,
- const struct qstr *src_name, u64 *src_inum,
- const struct qstr *dst_name, u64 *dst_inum,
+ const struct qstr *src_name, u64 *src_inum, u64 *src_offset,
+ const struct qstr *dst_name, u64 *dst_inum, u64 *dst_offset,
enum bch_rename_mode mode)
{
struct btree_iter *src_iter = NULL, *dst_iter = NULL;
new_dst->k.p = src_iter->pos;
bch2_trans_update(trans, src_iter,
&new_dst->k_i, 0);
- goto out;
+ goto out_set_offset;
} else {
/* If we're overwriting, we can't insert new_dst
* at a different slot because it has to
* overwrite old_dst - just make sure to use a
* whiteout when deleting src:
*/
- new_src->k.type = KEY_TYPE_whiteout;
+ new_src->k.type = KEY_TYPE_hash_whiteout;
}
} else {
/* Check if we need a whiteout to delete src: */
goto out;
if (ret)
- new_src->k.type = KEY_TYPE_whiteout;
+ new_src->k.type = KEY_TYPE_hash_whiteout;
}
}
bch2_trans_update(trans, src_iter, &new_src->k_i, 0);
bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0);
+out_set_offset:
+ *src_offset = new_src->k.p.offset;
+ *dst_offset = new_dst->k.p.offset;
out:
bch2_trans_iter_put(trans, src_iter);
bch2_trans_iter_put(trans, dst_iter);
k = bch2_btree_iter_peek_slot(iter);
inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum);
+ bch2_trans_iter_put(&trans, iter);
out:
bch2_trans_exit(&trans);
return inum;
struct bkey_s_c k;
int ret;
- for_each_btree_key(trans, iter, BTREE_ID_DIRENTS,
+ for_each_btree_key(trans, iter, BTREE_ID_dirents,
POS(dir_inum, 0), 0, k, ret) {
if (k.k->p.inode > dir_inum)
break;
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS,
+ for_each_btree_key(&trans, iter, BTREE_ID_dirents,
POS(inum, ctx->pos), 0, k, ret) {
if (k.k->p.inode > inum)
break;
break;
ctx->pos = dirent.k->p.offset + 1;
}
+ bch2_trans_iter_put(&trans, iter);
+
ret = bch2_trans_exit(&trans) ?: ret;
return ret;
int bch2_dirent_create(struct btree_trans *, u64,
const struct bch_hash_info *, u8,
- const struct qstr *, u64, int);
+ const struct qstr *, u64, u64 *, int);
int bch2_dirent_delete_at(struct btree_trans *,
const struct bch_hash_info *,
int bch2_dirent_rename(struct btree_trans *,
u64, struct bch_hash_info *,
u64, struct bch_hash_info *,
- const struct qstr *, u64 *,
- const struct qstr *, u64 *,
+ const struct qstr *, u64 *, u64 *,
+ const struct qstr *, u64 *, u64 *,
enum bch_rename_mode);
struct btree_iter *
#include "bcachefs.h"
#include "alloc_foreground.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
#include "bset.h"
#include "btree_gc.h"
#include "btree_update.h"
{
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+ if (!bkey_cmp(k.k->p, POS_MIN))
+ return "stripe at pos 0";
+
if (k.k->p.inode)
return "invalid stripe key";
stripe_blockcount_get(s, i));
}
-static int ptr_matches_stripe(struct bch_fs *c,
- struct bch_stripe *v,
- const struct bch_extent_ptr *ptr)
+/* returns blocknr in stripe that we matched: */
+static int bkey_matches_stripe(struct bch_stripe *s,
+ struct bkey_s_c k)
{
- unsigned i;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
+ unsigned i, nr_data = s->nr_blocks - s->nr_redundant;
- for (i = 0; i < v->nr_blocks - v->nr_redundant; i++) {
- const struct bch_extent_ptr *ptr2 = v->ptrs + i;
-
- if (ptr->dev == ptr2->dev &&
- ptr->gen == ptr2->gen &&
- ptr->offset >= ptr2->offset &&
- ptr->offset < ptr2->offset + le16_to_cpu(v->sectors))
- return i;
- }
-
- return -1;
-}
-
-static int extent_matches_stripe(struct bch_fs *c,
- struct bch_stripe *v,
- struct bkey_s_c k)
-{
-
- switch (k.k->type) {
- case KEY_TYPE_extent: {
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
- const struct bch_extent_ptr *ptr;
- int idx;
-
- extent_for_each_ptr(e, ptr) {
- idx = ptr_matches_stripe(c, v, ptr);
- if (idx >= 0)
- return idx;
- }
- break;
- }
- }
+ bkey_for_each_ptr(ptrs, ptr)
+ for (i = 0; i < nr_data; i++)
+ if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr,
+ le16_to_cpu(s->sectors)))
+ return i;
return -1;
}
return false;
}
+/* Stripe bufs: */
+
+static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
+{
+ unsigned i;
+
+ for (i = 0; i < buf->key.v.nr_blocks; i++) {
+ kvpfree(buf->data[i], buf->size << 9);
+ buf->data[i] = NULL;
+ }
+}
+
+static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
+ unsigned offset, unsigned size)
+{
+ struct bch_stripe *v = &buf->key.v;
+ unsigned csum_granularity = 1U << v->csum_granularity_bits;
+ unsigned end = offset + size;
+ unsigned i;
+
+ BUG_ON(end > le16_to_cpu(v->sectors));
+
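+	/* Round to checksum granularity so complete csum chunks can be verified: */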
+ offset = round_down(offset, csum_granularity);
+ end = min_t(unsigned, le16_to_cpu(v->sectors),
+ round_up(end, csum_granularity));
+
+ buf->offset = offset;
+ buf->size = end - offset;
+
+ memset(buf->valid, 0xFF, sizeof(buf->valid));
+
+ for (i = 0; i < buf->key.v.nr_blocks; i++) {
+ buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL);
+ if (!buf->data[i])
+ goto err;
+ }
+
+ return 0;
+err:
+ ec_stripe_buf_exit(buf);
+ return -ENOMEM;
+}
+
/* Checksumming: */
-static void ec_generate_checksums(struct ec_stripe_buf *buf)
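+/* Checksum one csum-granularity chunk of a single block in the stripe buffer: */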
+static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf,
+ unsigned block, unsigned offset)
{
struct bch_stripe *v = &buf->key.v;
unsigned csum_granularity = 1 << v->csum_granularity_bits;
- unsigned csums_per_device = stripe_csums_per_device(v);
- unsigned csum_bytes = bch_crc_bytes[v->csum_type];
- unsigned i, j;
+ unsigned end = buf->offset + buf->size;
+ unsigned len = min(csum_granularity, end - offset);
+
+ BUG_ON(offset >= end);
+ BUG_ON(offset < buf->offset);
+ BUG_ON(offset & (csum_granularity - 1));
+ BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
+ (len & (csum_granularity - 1)));
+
+ return bch2_checksum(NULL, v->csum_type,
+ null_nonce(),
+ buf->data[block] + ((offset - buf->offset) << 9),
+ len << 9);
+}
- if (!csum_bytes)
+static void ec_generate_checksums(struct ec_stripe_buf *buf)
+{
+ struct bch_stripe *v = &buf->key.v;
+ unsigned i, j, csums_per_device = stripe_csums_per_device(v);
+
+ if (!v->csum_type)
return;
BUG_ON(buf->offset);
BUG_ON(buf->size != le16_to_cpu(v->sectors));
- for (i = 0; i < v->nr_blocks; i++) {
- for (j = 0; j < csums_per_device; j++) {
- unsigned offset = j << v->csum_granularity_bits;
- unsigned len = min(csum_granularity, buf->size - offset);
-
- struct bch_csum csum =
- bch2_checksum(NULL, v->csum_type,
- null_nonce(),
- buf->data[i] + (offset << 9),
- len << 9);
-
- memcpy(stripe_csum(v, i, j), &csum, csum_bytes);
- }
- }
+ for (i = 0; i < v->nr_blocks; i++)
+ for (j = 0; j < csums_per_device; j++)
+ stripe_csum_set(v, i, j,
+ ec_block_checksum(buf, i, j << v->csum_granularity_bits));
}
static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
{
struct bch_stripe *v = &buf->key.v;
unsigned csum_granularity = 1 << v->csum_granularity_bits;
- unsigned csum_bytes = bch_crc_bytes[v->csum_type];
unsigned i;
- if (!csum_bytes)
+ if (!v->csum_type)
return;
for (i = 0; i < v->nr_blocks; i++) {
while (offset < end) {
unsigned j = offset >> v->csum_granularity_bits;
unsigned len = min(csum_granularity, end - offset);
- struct bch_csum csum;
+ struct bch_csum want = stripe_csum_get(v, i, j);
+ struct bch_csum got = ec_block_checksum(buf, i, offset);
- BUG_ON(offset & (csum_granularity - 1));
- BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
- ((offset + len) & (csum_granularity - 1)));
+ if (bch2_crc_cmp(want, got)) {
+ char buf2[200];
- csum = bch2_checksum(NULL, v->csum_type,
- null_nonce(),
- buf->data[i] + ((offset - buf->offset) << 9),
- len << 9);
+ bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&buf->key.k_i));
- if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) {
- __bcache_io_error(c,
- "checksum error while doing reconstruct read (%u:%u)",
- i, j);
+ bch_err_ratelimited(c,
+ "stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s",
+ (void *) _RET_IP_, i, j, v->csum_type,
+ want.lo, got.lo, buf2);
clear_bit(i, buf->valid);
break;
}
raid_gen(nr_data, v->nr_redundant, bytes, buf->data);
}
-static unsigned __ec_nr_failed(struct ec_stripe_buf *buf, unsigned nr)
-{
- return nr - bitmap_weight(buf->valid, nr);
-}
-
static unsigned ec_nr_failed(struct ec_stripe_buf *buf)
{
- return __ec_nr_failed(buf, buf->key.v.nr_blocks);
+ return buf->key.v.nr_blocks -
+ bitmap_weight(buf->valid, buf->key.v.nr_blocks);
}
static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
{
struct bch_stripe *v = &buf->key.v;
- unsigned i, failed[EC_STRIPE_MAX], nr_failed = 0;
+ unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0;
unsigned nr_data = v->nr_blocks - v->nr_redundant;
unsigned bytes = buf->size << 9;
if (ec_nr_failed(buf) > v->nr_redundant) {
- __bcache_io_error(c,
+ bch_err_ratelimited(c,
"error doing reconstruct read: unable to read enough blocks");
return -1;
}
static void ec_block_endio(struct bio *bio)
{
struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
+ struct bch_stripe *v = &ec_bio->buf->key.v;
+ struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx];
struct bch_dev *ca = ec_bio->ca;
struct closure *cl = bio->bi_private;
- if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s: %s",
+ if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s error: %s",
bio_data_dir(bio) ? "write" : "read",
bch2_blk_status_to_str(bio->bi_status)))
clear_bit(ec_bio->idx, ec_bio->buf->valid);
+ if (ptr_stale(ca, ptr)) {
+ bch_err_ratelimited(ca->fs,
+ "error %s stripe: stale pointer after io",
+ bio_data_dir(bio) == READ ? "reading from" : "writing to");
+ clear_bit(ec_bio->idx, ec_bio->buf->valid);
+ }
+
bio_put(&ec_bio->bio);
percpu_ref_put(&ca->io_ref);
closure_put(cl);
? BCH_DATA_user
: BCH_DATA_parity;
+ if (ptr_stale(ca, ptr)) {
+ bch_err_ratelimited(c,
+ "error %s stripe: stale pointer",
+ rw == READ ? "reading from" : "writing to");
+ clear_bit(idx, buf->valid);
+ return;
+ }
+
if (!bch2_dev_get_ioref(ca, rw)) {
clear_bit(idx, buf->valid);
return;
percpu_ref_put(&ca->io_ref);
}
-/* recovery read path: */
-int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
+static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe)
{
struct btree_trans trans;
struct btree_iter *iter;
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_trans_init(&trans, c, 0, 0);
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS(0, idx), BTREE_ITER_SLOTS);
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+ if (k.k->type != KEY_TYPE_stripe) {
+ ret = -ENOENT;
+ goto err;
+ }
+ bkey_reassemble(&stripe->key.k_i, k);
+err:
+ bch2_trans_exit(&trans);
+ return ret;
+}
+
+/* recovery read path: */
+int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
+{
struct ec_stripe_buf *buf;
struct closure cl;
- struct bkey_s_c k;
struct bch_stripe *v;
- unsigned stripe_idx;
- unsigned offset, end;
- unsigned i, nr_data, csum_granularity;
- int ret = 0, idx;
+ unsigned i, offset;
+ int ret = 0;
closure_init_stack(&cl);
BUG_ON(!rbio->pick.has_ec);
- stripe_idx = rbio->pick.ec.idx;
-
buf = kzalloc(sizeof(*buf), GFP_NOIO);
if (!buf)
return -ENOMEM;
- bch2_trans_init(&trans, c, 0, 0);
-
- iter = bch2_trans_get_iter(&trans, BTREE_ID_EC,
- POS(0, stripe_idx),
- BTREE_ITER_SLOTS);
- k = bch2_btree_iter_peek_slot(iter);
- if (bkey_err(k) || k.k->type != KEY_TYPE_stripe) {
- __bcache_io_error(c,
- "error doing reconstruct read: stripe not found");
+ ret = get_stripe_key(c, rbio->pick.ec.idx, buf);
+ if (ret) {
+ bch_err_ratelimited(c,
+ "error doing reconstruct read: error %i looking up stripe", ret);
kfree(buf);
- return bch2_trans_exit(&trans) ?: -EIO;
+ return -EIO;
}
- bkey_reassemble(&buf->key.k_i, k);
- bch2_trans_exit(&trans);
-
v = &buf->key.v;
- nr_data = v->nr_blocks - v->nr_redundant;
-
- idx = ptr_matches_stripe(c, v, &rbio->pick.ptr);
- BUG_ON(idx < 0);
-
- csum_granularity = 1U << v->csum_granularity_bits;
-
- offset = rbio->bio.bi_iter.bi_sector - v->ptrs[idx].offset;
- end = offset + bio_sectors(&rbio->bio);
-
- BUG_ON(end > le16_to_cpu(v->sectors));
-
- buf->offset = round_down(offset, csum_granularity);
- buf->size = min_t(unsigned, le16_to_cpu(v->sectors),
- round_up(end, csum_granularity)) - buf->offset;
-
- for (i = 0; i < v->nr_blocks; i++) {
- buf->data[i] = kmalloc(buf->size << 9, GFP_NOIO);
- if (!buf->data[i]) {
- ret = -ENOMEM;
- goto err;
- }
+ if (!bch2_ptr_matches_stripe(v, rbio->pick)) {
+ bch_err_ratelimited(c,
+ "error doing reconstruct read: pointer doesn't match stripe");
+ ret = -EIO;
+ goto err;
}
- memset(buf->valid, 0xFF, sizeof(buf->valid));
-
- for (i = 0; i < v->nr_blocks; i++) {
- struct bch_extent_ptr *ptr = v->ptrs + i;
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset;
+ if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) {
+ bch_err_ratelimited(c,
+ "error doing reconstruct read: read is bigger than stripe");
+ ret = -EIO;
+ goto err;
+ }
- if (ptr_stale(ca, ptr)) {
- __bcache_io_error(c,
- "error doing reconstruct read: stale pointer");
- clear_bit(i, buf->valid);
- continue;
- }
+ ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio));
+ if (ret)
+ goto err;
+ for (i = 0; i < v->nr_blocks; i++)
ec_block_io(c, buf, REQ_OP_READ, i, &cl);
- }
closure_sync(&cl);
if (ec_nr_failed(buf) > v->nr_redundant) {
- __bcache_io_error(c,
+ bch_err_ratelimited(c,
"error doing reconstruct read: unable to read enough blocks");
ret = -EIO;
goto err;
goto err;
memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
- buf->data[idx] + ((offset - buf->offset) << 9));
+ buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9));
err:
- for (i = 0; i < v->nr_blocks; i++)
- kfree(buf->data[i]);
+ ec_stripe_buf_exit(buf);
kfree(buf);
return ret;
}
static int ec_stripe_delete(struct bch_fs *c, size_t idx)
{
- //pr_info("deleting stripe %zu", idx);
- return bch2_btree_delete_range(c, BTREE_ID_EC,
+ return bch2_btree_delete_range(c, BTREE_ID_stripes,
POS(0, idx),
POS(0, idx + 1),
NULL);
/* stripe creation: */
static int ec_stripe_bkey_insert(struct bch_fs *c,
- struct ec_stripe_new *s,
- struct bkey_i_stripe *stripe)
+ struct bkey_i_stripe *stripe,
+ struct disk_reservation *res)
{
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
- struct bpos start_pos = POS(0, c->ec_stripe_hint);
+ struct bpos min_pos = POS(0, 1);
+ struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
int ret;
bch2_trans_init(&trans, c, 0, 0);
retry:
bch2_trans_begin(&trans);
- for_each_btree_key(&trans, iter, BTREE_ID_EC, start_pos,
+ for_each_btree_key(&trans, iter, BTREE_ID_stripes, start_pos,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) {
if (start_pos.offset) {
- start_pos = POS_MIN;
+ start_pos = min_pos;
bch2_btree_iter_set_pos(iter, start_pos);
continue;
}
bch2_trans_update(&trans, iter, &stripe->k_i, 0);
- ret = bch2_trans_commit(&trans, &s->res, NULL,
+ ret = bch2_trans_commit(&trans, res, NULL,
BTREE_INSERT_NOFAIL);
err:
bch2_trans_iter_put(&trans, iter);
return ret;
}
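+/* Update an existing stripe key, carrying over its per-block sector counts: */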
+static int ec_stripe_bkey_update(struct btree_trans *trans,
+ struct bkey_i_stripe *new)
+{
+ struct btree_iter *iter;
+ struct bkey_s_c k;
+ const struct bch_stripe *existing;
+ unsigned i;
+ int ret;
+
+ iter = bch2_trans_get_iter(trans, BTREE_ID_stripes,
+ new->k.p, BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (!k.k || k.k->type != KEY_TYPE_stripe) {
+ bch_err(trans->c, "error updating stripe: not found");
+ ret = -ENOENT;
+ goto err;
+ }
+
+ existing = bkey_s_c_to_stripe(k).v;
+
+ if (existing->nr_blocks != new->v.nr_blocks) {
+ bch_err(trans->c, "error updating stripe: nr_blocks does not match");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ for (i = 0; i < new->v.nr_blocks; i++)
+ stripe_blockcount_set(&new->v, i,
+ stripe_blockcount_get(existing, i));
+
+ bch2_trans_update(trans, iter, &new->k_i, 0);
+err:
+ bch2_trans_iter_put(trans, iter);
+ return ret;
+}
+
static void extent_stripe_ptr_add(struct bkey_s_extent e,
struct ec_stripe_buf *s,
struct bch_extent_ptr *ptr,
*dst = (struct bch_extent_stripe_ptr) {
.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
.block = block,
+ .redundancy = s->key.v.nr_redundant,
.idx = s->key.k.p.offset,
};
}
struct btree_iter *iter;
struct bkey_s_c k;
struct bkey_s_extent e;
- struct bkey_on_stack sk;
- int ret = 0, dev, idx;
+ struct bkey_buf sk;
+ int ret = 0, dev, block;
- bkey_on_stack_init(&sk);
+ bch2_bkey_buf_init(&sk);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
/* XXX this doesn't support the reflink btree */
- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
bkey_start_pos(pos),
BTREE_ITER_INTENT);
struct bch_extent_ptr *ptr, *ec_ptr = NULL;
if (extent_has_stripe_ptr(k, s->key.k.p.offset)) {
- bch2_btree_iter_next(iter);
+ bch2_btree_iter_advance(iter);
continue;
}
- idx = extent_matches_stripe(c, &s->key.v, k);
- if (idx < 0) {
- bch2_btree_iter_next(iter);
+ block = bkey_matches_stripe(&s->key.v, k);
+ if (block < 0) {
+ bch2_btree_iter_advance(iter);
continue;
}
- dev = s->key.v.ptrs[idx].dev;
+ dev = s->key.v.ptrs[block].dev;
- bkey_on_stack_reassemble(&sk, c, k);
+ bch2_bkey_buf_reassemble(&sk, c, k);
e = bkey_i_to_s_extent(sk.k);
bch2_bkey_drop_ptrs(e.s, ptr, ptr->dev != dev);
ec_ptr = (void *) bch2_bkey_has_device(e.s_c, dev);
BUG_ON(!ec_ptr);
- extent_stripe_ptr_add(e, s, ec_ptr, idx);
+ extent_stripe_ptr_add(e, s, ec_ptr, block);
bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k));
bch2_trans_update(&trans, iter, sk.k, 0);
ret = bch2_trans_commit(&trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE);
+ BTREE_INSERT_NOFAIL);
if (ret == -EINTR)
ret = 0;
if (ret)
break;
}
+ bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
- bkey_on_stack_exit(&sk, c);
+ bch2_bkey_buf_exit(&sk, c);
return ret;
}
struct open_bucket *ob;
struct bkey_i *k;
struct stripe *m;
- struct bch_stripe *v = &s->stripe.key.v;
+ struct bch_stripe *v = &s->new_stripe.key.v;
unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
- struct closure cl;
int ret;
BUG_ON(s->h->s == s);
- closure_init_stack(&cl);
+ closure_sync(&s->iodone);
if (s->err) {
if (s->err != -EROFS)
goto err;
}
+ if (s->have_existing_stripe) {
+ ec_validate_checksums(c, &s->existing_stripe);
+
+ if (ec_do_recov(c, &s->existing_stripe)) {
+ bch_err(c, "error creating stripe: error reading existing stripe");
+ goto err;
+ }
+
+ for (i = 0; i < nr_data; i++)
+ if (stripe_blockcount_get(&s->existing_stripe.key.v, i))
+ swap(s->new_stripe.data[i],
+ s->existing_stripe.data[i]);
+
+ ec_stripe_buf_exit(&s->existing_stripe);
+ }
+
BUG_ON(!s->allocated);
if (!percpu_ref_tryget(&c->writes))
goto err;
- BUG_ON(bitmap_weight(s->blocks_allocated,
- s->blocks.nr) != s->blocks.nr);
-
- ec_generate_ec(&s->stripe);
+ ec_generate_ec(&s->new_stripe);
- ec_generate_checksums(&s->stripe);
+ ec_generate_checksums(&s->new_stripe);
/* write p/q: */
for (i = nr_data; i < v->nr_blocks; i++)
- ec_block_io(c, &s->stripe, REQ_OP_WRITE, i, &cl);
+ ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone);
+ closure_sync(&s->iodone);
- closure_sync(&cl);
-
- for (i = nr_data; i < v->nr_blocks; i++)
- if (!test_bit(i, s->stripe.valid)) {
- bch_err(c, "error creating stripe: error writing redundancy buckets");
- goto err_put_writes;
- }
+ if (ec_nr_failed(&s->new_stripe)) {
+ bch_err(c, "error creating stripe: error writing redundancy buckets");
+ goto err_put_writes;
+ }
- ret = s->existing_stripe
- ? bch2_btree_insert(c, BTREE_ID_EC, &s->stripe.key.k_i,
- &s->res, NULL, BTREE_INSERT_NOFAIL)
- : ec_stripe_bkey_insert(c, s, &s->stripe.key);
+ ret = s->have_existing_stripe
+ ? bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL,
+ ec_stripe_bkey_update(&trans, &s->new_stripe.key))
+ : ec_stripe_bkey_insert(c, &s->new_stripe.key, &s->res);
if (ret) {
bch_err(c, "error creating stripe: error creating stripe key");
goto err_put_writes;
}
for_each_keylist_key(&s->keys, k) {
- ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k);
+ ret = ec_stripe_update_ptrs(c, &s->new_stripe, &k->k);
if (ret) {
- bch_err(c, "error creating stripe: error updating pointers");
+ bch_err(c, "error creating stripe: error %i updating pointers", ret);
break;
}
}
spin_lock(&c->ec_stripes_heap_lock);
- m = genradix_ptr(&c->stripes[0], s->stripe.key.k.p.offset);
-#if 0
- pr_info("created a %s stripe %llu",
- s->existing_stripe ? "existing" : "new",
- s->stripe.key.k.p.offset);
-#endif
+ m = genradix_ptr(&c->stripes[0], s->new_stripe.key.k.p.offset);
+
BUG_ON(m->on_heap);
- bch2_stripes_heap_insert(c, m, s->stripe.key.k.p.offset);
+ bch2_stripes_heap_insert(c, m, s->new_stripe.key.k.p.offset);
spin_unlock(&c->ec_stripes_heap_lock);
err_put_writes:
percpu_ref_put(&c->writes);
err:
bch2_disk_reservation_put(c, &s->res);
- open_bucket_for_each(c, &s->blocks, ob, i) {
- ob->ec = NULL;
- __bch2_open_bucket_put(c, ob);
- }
-
- bch2_open_buckets_put(c, &s->parity);
+ for (i = 0; i < v->nr_blocks; i++)
+ if (s->blocks[i]) {
+ ob = c->open_buckets + s->blocks[i];
+
+ if (i < nr_data) {
+ ob->ec = NULL;
+ __bch2_open_bucket_put(c, ob);
+ } else {
+ bch2_open_bucket_put(c, ob);
+ }
+ }
bch2_keylist_free(&s->keys, s->inline_keys);
- for (i = 0; i < s->stripe.key.v.nr_blocks; i++)
- kvpfree(s->stripe.data[i], s->stripe.size << 9);
+ ec_stripe_buf_exit(&s->existing_stripe);
+ ec_stripe_buf_exit(&s->new_stripe);
+ closure_debug_destroy(&s->iodone);
kfree(s);
}
ca = bch_dev_bkey_exists(c, ob->ptr.dev);
offset = ca->mi.bucket_size - ob->sectors_free;
- return ob->ec->stripe.data[ob->ec_idx] + (offset << 9);
+ return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
}
void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp,
if (!ob)
return;
- //pr_info("adding backpointer at %llu:%llu", pos.inode, pos.offset);
-
ec = ob->ec;
mutex_lock(&ec->lock);
static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
{
struct ec_stripe_new *s;
- unsigned i;
lockdep_assert_held(&h->lock);
return -ENOMEM;
mutex_init(&s->lock);
+ closure_init(&s->iodone, NULL);
atomic_set(&s->pin, 1);
s->c = c;
s->h = h;
s->nr_data = min_t(unsigned, h->nr_active_devs,
- EC_STRIPE_MAX) - h->redundancy;
+ BCH_BKEY_PTRS_MAX) - h->redundancy;
s->nr_parity = h->redundancy;
bch2_keylist_init(&s->keys, s->inline_keys);
- s->stripe.offset = 0;
- s->stripe.size = h->blocksize;
- memset(s->stripe.valid, 0xFF, sizeof(s->stripe.valid));
-
- ec_stripe_key_init(c, &s->stripe.key, s->nr_data,
+ ec_stripe_key_init(c, &s->new_stripe.key, s->nr_data,
s->nr_parity, h->blocksize);
- for (i = 0; i < s->stripe.key.v.nr_blocks; i++) {
- s->stripe.data[i] = kvpmalloc(s->stripe.size << 9, GFP_KERNEL);
- if (!s->stripe.data[i])
- goto err;
- }
-
h->s = s;
-
return 0;
-err:
- for (i = 0; i < s->stripe.key.v.nr_blocks; i++)
- kvpfree(s->stripe.data[i], s->stripe.size << 9);
- kfree(s);
- return -ENOMEM;
}
static struct ec_stripe_head *
ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
- unsigned algo, unsigned redundancy)
+ unsigned algo, unsigned redundancy,
+ bool copygc)
{
struct ec_stripe_head *h;
struct bch_dev *ca;
h->target = target;
h->algo = algo;
h->redundancy = redundancy;
+ h->copygc = copygc;
rcu_read_lock();
h->devs = target_rw_devs(c, BCH_DATA_user, target);
if (h->s &&
h->s->allocated &&
bitmap_weight(h->s->blocks_allocated,
- h->s->blocks.nr) == h->s->blocks.nr)
+ h->s->nr_data) == h->s->nr_data)
ec_stripe_set_pending(c, h);
mutex_unlock(&h->lock);
}
struct ec_stripe_head *__bch2_ec_stripe_head_get(struct bch_fs *c,
- unsigned target,
- unsigned algo,
- unsigned redundancy)
+ unsigned target,
+ unsigned algo,
+ unsigned redundancy,
+ bool copygc)
{
struct ec_stripe_head *h;
list_for_each_entry(h, &c->ec_stripe_head_list, list)
if (h->target == target &&
h->algo == algo &&
- h->redundancy == redundancy) {
+ h->redundancy == redundancy &&
+ h->copygc == copygc) {
mutex_lock(&h->lock);
goto found;
}
- h = ec_new_stripe_head_alloc(c, target, algo, redundancy);
+ h = ec_new_stripe_head_alloc(c, target, algo, redundancy, copygc);
found:
mutex_unlock(&c->ec_stripe_head_lock);
return h;
}
-/*
- * XXX: use a higher watermark for allocating open buckets here:
- */
-static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h)
+static enum bucket_alloc_ret
+new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
+ struct closure *cl)
{
- struct bch_devs_mask devs;
+ struct bch_devs_mask devs = h->devs;
struct open_bucket *ob;
- unsigned i, nr_have, nr_data =
- min_t(unsigned, h->nr_active_devs,
- EC_STRIPE_MAX) - h->redundancy;
+ struct open_buckets buckets;
+ unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
bool have_cache = true;
- int ret = 0;
-
- devs = h->devs;
-
- for_each_set_bit(i, h->s->blocks_allocated, EC_STRIPE_MAX) {
- __clear_bit(h->s->stripe.key.v.ptrs[i].dev, devs.d);
- --nr_data;
+ enum bucket_alloc_ret ret = ALLOC_SUCCESS;
+
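+	/* Note which blocks we already have, and exclude their devices from allocation: */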
+ for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) {
+ if (test_bit(i, h->s->blocks_gotten)) {
+ __clear_bit(h->s->new_stripe.key.v.ptrs[i].dev, devs.d);
+ if (i < h->s->nr_data)
+ nr_have_data++;
+ else
+ nr_have_parity++;
+ }
}
- BUG_ON(h->s->blocks.nr > nr_data);
- BUG_ON(h->s->parity.nr > h->redundancy);
-
- open_bucket_for_each(c, &h->s->parity, ob, i)
- __clear_bit(ob->ptr.dev, devs.d);
- open_bucket_for_each(c, &h->s->blocks, ob, i)
- __clear_bit(ob->ptr.dev, devs.d);
+ BUG_ON(nr_have_data > h->s->nr_data);
+ BUG_ON(nr_have_parity > h->s->nr_parity);
percpu_down_read(&c->mark_lock);
rcu_read_lock();
- if (h->s->parity.nr < h->redundancy) {
- nr_have = h->s->parity.nr;
-
- ret = bch2_bucket_alloc_set(c, &h->s->parity,
+ buckets.nr = 0;
+ if (nr_have_parity < h->s->nr_parity) {
+ ret = bch2_bucket_alloc_set(c, &buckets,
&h->parity_stripe,
&devs,
- h->redundancy,
- &nr_have,
+ h->s->nr_parity,
+ &nr_have_parity,
&have_cache,
- RESERVE_NONE,
+ h->copygc
+ ? RESERVE_MOVINGGC
+ : RESERVE_NONE,
0,
- NULL);
+ cl);
+
+ open_bucket_for_each(c, &buckets, ob, i) {
+ j = find_next_zero_bit(h->s->blocks_gotten,
+ h->s->nr_data + h->s->nr_parity,
+ h->s->nr_data);
+ BUG_ON(j >= h->s->nr_data + h->s->nr_parity);
+
+ h->s->blocks[j] = buckets.v[i];
+ h->s->new_stripe.key.v.ptrs[j] = ob->ptr;
+ __set_bit(j, h->s->blocks_gotten);
+ }
+
if (ret)
goto err;
}
- if (h->s->blocks.nr < nr_data) {
- nr_have = h->s->blocks.nr;
-
- ret = bch2_bucket_alloc_set(c, &h->s->blocks,
+ buckets.nr = 0;
+ if (nr_have_data < h->s->nr_data) {
+ ret = bch2_bucket_alloc_set(c, &buckets,
&h->block_stripe,
&devs,
- nr_data,
- &nr_have,
+ h->s->nr_data,
+ &nr_have_data,
&have_cache,
- RESERVE_NONE,
+ h->copygc
+ ? RESERVE_MOVINGGC
+ : RESERVE_NONE,
0,
- NULL);
+ cl);
+
+ open_bucket_for_each(c, &buckets, ob, i) {
+ j = find_next_zero_bit(h->s->blocks_gotten,
+ h->s->nr_data, 0);
+ BUG_ON(j >= h->s->nr_data);
+
+ h->s->blocks[j] = buckets.v[i];
+ h->s->new_stripe.key.v.ptrs[j] = ob->ptr;
+ __set_bit(j, h->s->blocks_gotten);
+ }
+
if (ret)
goto err;
}
/* XXX: doesn't obey target: */
static s64 get_existing_stripe(struct bch_fs *c,
- unsigned target,
- unsigned algo,
- unsigned redundancy)
+ struct ec_stripe_head *head)
{
ec_stripes_heap *h = &c->ec_stripes_heap;
struct stripe *m;
size_t heap_idx;
u64 stripe_idx;
+ s64 ret = -1;
if (may_create_new_stripe(c))
return -1;
spin_lock(&c->ec_stripes_heap_lock);
for (heap_idx = 0; heap_idx < h->used; heap_idx++) {
+ /* No blocks worth reusing, stripe will just be deleted: */
if (!h->data[heap_idx].blocks_nonempty)
continue;
stripe_idx = h->data[heap_idx].idx;
m = genradix_ptr(&c->stripes[0], stripe_idx);
- if (m->algorithm == algo &&
- m->nr_redundant == redundancy &&
+ if (m->algorithm == head->algo &&
+ m->nr_redundant == head->redundancy &&
+ m->sectors == head->blocksize &&
m->blocks_nonempty < m->nr_blocks - m->nr_redundant) {
bch2_stripes_heap_del(c, m, stripe_idx);
- spin_unlock(&c->ec_stripes_heap_lock);
- return stripe_idx;
+ ret = stripe_idx;
+ break;
}
}
-
spin_unlock(&c->ec_stripes_heap_lock);
- return -1;
+ return ret;
}
-static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe)
+static int __bch2_ec_stripe_head_reuse(struct bch_fs *c,
+ struct ec_stripe_head *h)
{
- struct btree_trans trans;
- struct btree_iter *iter;
- struct bkey_s_c k;
+ unsigned i;
+ s64 idx;
int ret;
- bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, idx), BTREE_ITER_SLOTS);
- k = bch2_btree_iter_peek_slot(iter);
- ret = bkey_err(k);
- if (!ret)
- bkey_reassemble(&stripe->key.k_i, k);
- bch2_trans_exit(&trans);
+ idx = get_existing_stripe(c, h);
+ if (idx < 0) {
+ bch_err(c, "failed to find an existing stripe");
+ return -ENOSPC;
+ }
+
+ h->s->have_existing_stripe = true;
+ ret = get_stripe_key(c, idx, &h->s->existing_stripe);
+ if (ret) {
+ bch2_fs_fatal_error(c, "error reading stripe key: %i", ret);
+ return ret;
+ }
+
+ if (ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize)) {
+ /*
+ * this is a problem: we have deleted from the
+ * stripes heap already
+ */
+ BUG();
+ }
+
+ BUG_ON(h->s->existing_stripe.size != h->blocksize);
+ BUG_ON(h->s->existing_stripe.size != le16_to_cpu(h->s->existing_stripe.key.v.sectors));
+
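+	/* Keep blocks that still hold data; read every block so parity can be rebuilt: */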
+ for (i = 0; i < h->s->existing_stripe.key.v.nr_blocks; i++) {
+ if (stripe_blockcount_get(&h->s->existing_stripe.key.v, i)) {
+ __set_bit(i, h->s->blocks_gotten);
+ __set_bit(i, h->s->blocks_allocated);
+ }
+
+ ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone);
+ }
+
+ bkey_copy(&h->s->new_stripe.key.k_i,
+ &h->s->existing_stripe.key.k_i);
+
+ return 0;
+}
+
+static int __bch2_ec_stripe_head_reserve(struct bch_fs *c,
+ struct ec_stripe_head *h)
+{
+ int ret;
+
+ ret = bch2_disk_reservation_get(c, &h->s->res,
+ h->blocksize,
+ h->s->nr_parity, 0);
+
+ if (ret) {
+ /*
+ * This means we need to wait for copygc to
+ * empty out buckets from existing stripes:
+ */
+ bch_err(c, "failed to reserve stripe");
+ }
return ret;
}
struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
unsigned target,
unsigned algo,
- unsigned redundancy)
+ unsigned redundancy,
+ bool copygc,
+ struct closure *cl)
{
- struct closure cl;
struct ec_stripe_head *h;
- struct open_bucket *ob;
- unsigned i, data_idx = 0;
- s64 idx;
int ret;
+ bool needs_stripe_new;
- closure_init_stack(&cl);
-
- h = __bch2_ec_stripe_head_get(c, target, algo, redundancy);
- if (!h)
- return NULL;
-
- if (!h->s && ec_new_stripe_alloc(c, h)) {
- bch2_ec_stripe_head_put(c, h);
+ h = __bch2_ec_stripe_head_get(c, target, algo, redundancy, copygc);
+ if (!h) {
+ bch_err(c, "no stripe head");
return NULL;
}
- if (!h->s->allocated) {
- if (!h->s->existing_stripe &&
- (idx = get_existing_stripe(c, target, algo, redundancy)) >= 0) {
- //pr_info("got existing stripe %llu", idx);
-
- h->s->existing_stripe = true;
- h->s->existing_stripe_idx = idx;
- if (get_stripe_key(c, idx, &h->s->stripe)) {
- /* btree error */
- BUG();
- }
-
- for (i = 0; i < h->s->stripe.key.v.nr_blocks; i++)
- if (stripe_blockcount_get(&h->s->stripe.key.v, i)) {
- __set_bit(i, h->s->blocks_allocated);
- ec_block_io(c, &h->s->stripe, READ, i, &cl);
- }
- }
-
- if (!h->s->existing_stripe &&
- !h->s->res.sectors) {
- ret = bch2_disk_reservation_get(c, &h->s->res,
- h->blocksize,
- h->s->nr_parity, 0);
- if (ret) {
- /* What should we do here? */
- bch_err(c, "unable to create new stripe: %i", ret);
- bch2_ec_stripe_head_put(c, h);
- h = NULL;
- goto out;
-
- }
-
- }
-
- if (new_stripe_alloc_buckets(c, h)) {
- bch2_ec_stripe_head_put(c, h);
- h = NULL;
- goto out;
+ needs_stripe_new = !h->s;
+ if (needs_stripe_new) {
+ if (ec_new_stripe_alloc(c, h)) {
+ ret = -ENOMEM;
+ bch_err(c, "failed to allocate new stripe");
+ goto err;
}
- open_bucket_for_each(c, &h->s->blocks, ob, i) {
- data_idx = find_next_zero_bit(h->s->blocks_allocated,
- h->s->nr_data, data_idx);
- BUG_ON(data_idx >= h->s->nr_data);
+ if (ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize))
+ BUG();
+ }
- h->s->stripe.key.v.ptrs[data_idx] = ob->ptr;
- h->s->data_block_idx[i] = data_idx;
- data_idx++;
- }
+ /*
+ * Try to reserve a new stripe before reusing an
+ * existing stripe. This will prevent unnecessary
+ * read amplification during write-oriented workloads.
+ */
+ ret = 0;
+ if (!h->s->allocated && !h->s->res.sectors && !h->s->have_existing_stripe)
+ ret = __bch2_ec_stripe_head_reserve(c, h);
+ if (ret && needs_stripe_new)
+ ret = __bch2_ec_stripe_head_reuse(c, h);
+ if (ret)
+ goto err;
- open_bucket_for_each(c, &h->s->parity, ob, i)
- h->s->stripe.key.v.ptrs[h->s->nr_data + i] = ob->ptr;
+ if (!h->s->allocated) {
+ ret = new_stripe_alloc_buckets(c, h, cl);
+ if (ret)
+ goto err;
- //pr_info("new stripe, blocks_allocated %lx", h->s->blocks_allocated[0]);
h->s->allocated = true;
}
-out:
- closure_sync(&cl);
+
return h;
+
+err:
+ bch2_ec_stripe_head_put(c, h);
+ return ERR_PTR(-ret);
}
void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
if (!h->s)
goto unlock;
- open_bucket_for_each(c, &h->s->blocks, ob, i)
- if (ob->ptr.dev == ca->dev_idx)
- goto found;
- open_bucket_for_each(c, &h->s->parity, ob, i)
+ for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) {
+ if (!h->s->blocks[i])
+ continue;
+
+ ob = c->open_buckets + h->s->blocks[i];
if (ob->ptr.dev == ca->dev_idx)
goto found;
+ }
goto unlock;
found:
h->s->err = -EROFS;
mutex_unlock(&c->ec_stripe_head_lock);
}
+void bch2_stripes_heap_start(struct bch_fs *c)
+{
+ struct genradix_iter iter;
+ struct stripe *m;
+
+ genradix_for_each(&c->stripes[0], iter, m)
+ if (m->alive)
+ bch2_stripes_heap_insert(c, m, iter.pos);
+}
+
static int __bch2_stripe_write_key(struct btree_trans *trans,
struct btree_iter *iter,
struct stripe *m,
size_t idx,
struct bkey_i_stripe *new_key)
{
- struct bch_fs *c = trans->c;
+ const struct bch_stripe *v;
struct bkey_s_c k;
unsigned i;
int ret;
if (k.k->type != KEY_TYPE_stripe)
return -EIO;
+ v = bkey_s_c_to_stripe(k).v;
+ for (i = 0; i < v->nr_blocks; i++)
+ if (m->block_sectors[i] != stripe_blockcount_get(v, i))
+ goto write;
+ return 0;
+write:
bkey_reassemble(&new_key->k_i, k);
- spin_lock(&c->ec_stripes_heap_lock);
-
for (i = 0; i < new_key->v.nr_blocks; i++)
stripe_blockcount_set(&new_key->v, i,
m->block_sectors[i]);
- m->dirty = false;
-
- spin_unlock(&c->ec_stripes_heap_lock);
bch2_trans_update(trans, iter, &new_key->k_i, 0);
return 0;
bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN,
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS_MIN,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
genradix_for_each(&c->stripes[0], giter, m) {
- if (!m->dirty)
+ if (!m->alive)
continue;
ret = __bch2_trans_do(&trans, NULL, NULL,
int ret = 0;
if (k.k->type == KEY_TYPE_stripe) {
- struct stripe *m;
-
ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?:
bch2_mark_key(c, k, 0, 0, NULL, 0,
BTREE_TRIGGER_NOATOMIC);
if (ret)
return ret;
-
- spin_lock(&c->ec_stripes_heap_lock);
- m = genradix_ptr(&c->stripes[0], k.k->p.offset);
- bch2_stripes_heap_insert(c, m, k.k->p.offset);
- spin_unlock(&c->ec_stripes_heap_lock);
}
return ret;
int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys)
{
- int ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_EC,
+ int ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_stripes,
NULL, bch2_stripes_read_fn);
if (ret)
bch_err(c, "error reading stripes: %i", ret);
int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
-
- iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, U64_MAX), 0);
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS(0, U64_MAX), 0);
k = bch2_btree_iter_prev(iter);
if (!IS_ERR_OR_NULL(k.k))
idx = k.k->p.offset + 1;
+
+ bch2_trans_iter_put(&trans, iter);
ret = bch2_trans_exit(&trans);
if (ret)
return ret;
size_t i;
spin_lock(&c->ec_stripes_heap_lock);
- for (i = 0; i < min(h->used, 20UL); i++) {
+ for (i = 0; i < min_t(size_t, h->used, 20); i++) {
m = genradix_ptr(&c->stripes[0], h->data[i].idx);
pr_buf(out, "%zu %u/%u+%u\n", h->data[i].idx,
h->target, h->algo, h->redundancy);
if (h->s)
- pr_buf(out, "\tpending: blocks %u allocated %u\n",
- h->s->blocks.nr,
+ pr_buf(out, "\tpending: blocks %u+%u allocated %u\n",
+ h->s->nr_data, h->s->nr_parity,
bitmap_weight(h->s->blocks_allocated,
- h->s->blocks.nr));
+ h->s->nr_data));
}
mutex_unlock(&c->ec_stripe_head_lock);
mutex_lock(&c->ec_stripe_new_lock);
list_for_each_entry(s, &c->ec_stripe_new_list, list) {
- pr_buf(out, "\tin flight: blocks %u allocated %u pin %u\n",
- s->blocks.nr,
- bitmap_weight(s->blocks_allocated,
- s->blocks.nr),
+ pr_buf(out, "\tin flight: blocks %u+%u pin %u\n",
+ s->nr_data, s->nr_parity,
atomic_read(&s->pin));
}
mutex_unlock(&c->ec_stripe_new_lock);
}
static inline void *stripe_csum(struct bch_stripe *s,
- unsigned dev, unsigned csum_idx)
+ unsigned block, unsigned csum_idx)
{
- return (void *) s + stripe_csum_offset(s, dev, csum_idx);
+ EBUG_ON(block >= s->nr_blocks);
+ EBUG_ON(csum_idx >= stripe_csums_per_device(s));
+
+ return (void *) s + stripe_csum_offset(s, block, csum_idx);
+}
+
+static inline struct bch_csum stripe_csum_get(struct bch_stripe *s,
+ unsigned block, unsigned csum_idx)
+{
+ struct bch_csum csum = { 0 };
+
+ memcpy(&csum, stripe_csum(s, block, csum_idx), bch_crc_bytes[s->csum_type]);
+ return csum;
+}
+
+static inline void stripe_csum_set(struct bch_stripe *s,
+ unsigned block, unsigned csum_idx,
+ struct bch_csum csum)
+{
+ memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]);
+}
+
+static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe_ptr,
+ const struct bch_extent_ptr *data_ptr,
+ unsigned sectors)
+{
+ return data_ptr->dev == stripe_ptr->dev &&
+ data_ptr->gen == stripe_ptr->gen &&
+ data_ptr->offset >= stripe_ptr->offset &&
+ data_ptr->offset < stripe_ptr->offset + sectors;
+}
+
+static inline bool bch2_ptr_matches_stripe(const struct bch_stripe *s,
+ struct extent_ptr_decoded p)
+{
+ unsigned nr_data = s->nr_blocks - s->nr_redundant;
+
+ BUG_ON(!p.has_ec);
+
+ if (p.ec.block >= nr_data)
+ return false;
+
+ return __bch2_ptr_matches_stripe(&s->ptrs[p.ec.block], &p.ptr,
+ le16_to_cpu(s->sectors));
+}
+
+static inline bool bch2_ptr_matches_stripe_m(const struct stripe *m,
+ struct extent_ptr_decoded p)
+{
+ unsigned nr_data = m->nr_blocks - m->nr_redundant;
+
+ BUG_ON(!p.has_ec);
+
+ if (p.ec.block >= nr_data)
+ return false;
+
+ return __bch2_ptr_matches_stripe(&m->ptrs[p.ec.block], &p.ptr,
+ m->sectors);
}
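+/*
+ * bch2_ptr_matches_stripe() and bch2_ptr_matches_stripe_m() check the same
+ * invariant against either the stripe key (bch_stripe) or the in-memory
+ * struct stripe: a data pointer claiming membership of stripe block
+ * p.ec.block must be on the same device, at the same generation, and at an
+ * offset inside that block; only data blocks (not parity) are valid targets.
+ * Illustrative caller sketch (names are assumed, not from this header):
+ *
+ *	if (p.has_ec && !bch2_ptr_matches_stripe(&s->v, p))
+ *		p.has_ec = false;	// stale/mismatched stripe reference
+ */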
struct bch_read_bio;
/* might not be buffering the entire stripe: */
unsigned offset;
unsigned size;
- unsigned long valid[BITS_TO_LONGS(EC_STRIPE_MAX)];
+ unsigned long valid[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
- void *data[EC_STRIPE_MAX];
+ void *data[BCH_BKEY_PTRS_MAX];
union {
struct bkey_i_stripe key;
struct ec_stripe_head *h;
struct mutex lock;
struct list_head list;
+ struct closure iodone;
/* counts in flight writes, stripe is created when pin == 0 */
atomic_t pin;
u8 nr_parity;
bool allocated;
bool pending;
- bool existing_stripe;
- u64 existing_stripe_idx;
-
- unsigned long blocks_allocated[BITS_TO_LONGS(EC_STRIPE_MAX)];
+ bool have_existing_stripe;
- struct open_buckets blocks;
- u8 data_block_idx[EC_STRIPE_MAX];
- struct open_buckets parity;
+ unsigned long blocks_gotten[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
+ unsigned long blocks_allocated[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
+ open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX];
struct disk_reservation res;
struct keylist keys;
u64 inline_keys[BKEY_U64s * 8];
- struct ec_stripe_buf stripe;
+ struct ec_stripe_buf new_stripe;
+ struct ec_stripe_buf existing_stripe;
};
struct ec_stripe_head {
unsigned target;
unsigned algo;
unsigned redundancy;
+ bool copygc;
struct bch_devs_mask devs;
unsigned nr_active_devs;
int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *);
void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *);
-struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, unsigned,
- unsigned, unsigned);
+struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *,
+ unsigned, unsigned, unsigned, bool, struct closure *);
void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t);
void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t);
void bch2_ec_flush_new_stripes(struct bch_fs *);
+void bch2_stripes_heap_start(struct bch_fs *);
+
struct journal_keys;
int bch2_stripes_read(struct bch_fs *, struct journal_keys *);
int bch2_stripes_write(struct bch_fs *, unsigned);
#include <linux/llist.h>
-#define EC_STRIPE_MAX 16
-
struct bch_replicas_padded {
struct bch_replicas_entry e;
- u8 pad[EC_STRIPE_MAX];
+ u8 pad[BCH_BKEY_PTRS_MAX];
};
struct stripe {
u8 nr_blocks;
u8 nr_redundant;
- unsigned alive:1;
- unsigned dirty:1;
+ unsigned alive:1; /* does a corresponding key exist in stripes btree? */
unsigned on_heap:1;
u8 blocks_nonempty;
- u16 block_sectors[EC_STRIPE_MAX];
+ u16 block_sectors[BCH_BKEY_PTRS_MAX];
+ struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX];
struct bch_replicas_padded r;
};
set_bit(BCH_FS_ERROR, &c->flags);
switch (c->opts.errors) {
- case BCH_ON_ERROR_CONTINUE:
+ case BCH_ON_ERROR_continue:
return false;
- case BCH_ON_ERROR_RO:
+ case BCH_ON_ERROR_ro:
if (bch2_fs_emergency_read_only(c))
bch_err(c, "emergency read only");
return true;
- case BCH_ON_ERROR_PANIC:
+ case BCH_ON_ERROR_panic:
panic(bch2_fmt(c, "panic after error"));
return true;
default:
bool dev;
down_write(&c->state_lock);
- dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_RO,
+ dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_ro,
BCH_FORCE_IF_DEGRADED);
if (dev
- ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_RO,
+ ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
BCH_FORCE_IF_DEGRADED)
: bch2_fs_emergency_read_only(c))
bch_err(ca,
/* Logs message and handles the error: */
#define bch2_dev_io_error(ca, fmt, ...) \
do { \
- printk_ratelimited(KERN_ERR bch2_fmt((ca)->fs, \
- "IO error on %s for " fmt), \
+ printk_ratelimited(KERN_ERR "bcachefs (%s): " fmt, \
(ca)->name, ##__VA_ARGS__); \
bch2_io_error(ca); \
} while (0)
+#define bch2_dev_inum_io_error(ca, _inum, _offset, fmt, ...) \
+do { \
+ printk_ratelimited(KERN_ERR "bcachefs (%s inum %llu offset %llu): " fmt,\
+ (ca)->name, (_inum), (_offset), ##__VA_ARGS__); \
+ bch2_io_error(ca); \
+} while (0)
+
#define bch2_dev_io_err_on(cond, ca, ...) \
({ \
bool _ret = (cond); \
_ret; \
})
-/* kill? */
-
-#define __bcache_io_error(c, fmt, ...) \
- printk_ratelimited(KERN_ERR bch2_fmt(c, \
- "IO error: " fmt), ##__VA_ARGS__)
-
-#define bcache_io_error(c, bio, fmt, ...) \
-do { \
- __bcache_io_error(c, fmt, ##__VA_ARGS__); \
- (bio)->bi_status = BLK_STS_IOERR; \
-} while (0)
+#define bch2_dev_inum_io_err_on(cond, ca, _inum, _offset, ...) \
+({ \
+ bool _ret = (cond); \
+ \
+ if (_ret) \
+ bch2_dev_inum_io_error(ca, _inum, _offset, __VA_ARGS__);\
+ _ret; \
+})
#endif /* _BCACHEFS_ERROR_H */
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
-#include "bkey_on_stack.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "buckets.h"
struct bkey_s_c r_k;
for_each_btree_key(trans, iter,
- BTREE_ID_REFLINK, POS(0, idx + offset),
+ BTREE_ID_reflink, POS(0, idx + offset),
BTREE_ITER_SLOTS, r_k, ret2) {
if (bkey_cmp(bkey_start_pos(r_k.k),
POS(0, idx + sectors)) >= 0)
struct bpos *end)
{
struct btree_trans *trans = iter->trans;
- struct btree *b;
- struct btree_node_iter node_iter;
- struct bkey_packed *_k;
- unsigned nr_iters = 0;
+ struct btree_iter *copy;
+ struct bkey_s_c k;
+ unsigned nr_iters = 0;
int ret;
- ret = bch2_btree_iter_traverse(iter);
- if (ret)
- return ret;
-
- b = iter->l[0].b;
- node_iter = iter->l[0].iter;
-
- BUG_ON(bkey_cmp(b->data->min_key, POS_MIN) &&
- bkey_cmp(bkey_start_pos(&insert->k),
- bkey_predecessor(b->data->min_key)) < 0);
-
- *end = bpos_min(insert->k.p, b->key.k.p);
+ *end = insert->k.p;
/* extent_update_to_keys(): */
nr_iters += 1;
if (ret < 0)
return ret;
- while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) {
- struct bkey unpacked;
- struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked);
+ copy = bch2_trans_copy_iter(trans, iter);
+
+ for_each_btree_key_continue(copy, 0, k, ret) {
unsigned offset = 0;
if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0)
&nr_iters, EXTENT_ITERS_MAX);
if (ret)
break;
-
- bch2_btree_node_iter_advance(&node_iter, b);
}
+ bch2_trans_iter_put(trans, copy);
return ret < 0 ? ret : 0;
}
struct btree_iter *iter,
struct bkey_i *insert)
{
- struct btree_iter_level *l = &iter->l[0];
- struct btree_node_iter node_iter = l->iter;
- struct bkey_packed *_k;
struct bkey_s_c k;
- struct bkey unpacked;
- int sectors;
-
- _k = bch2_btree_node_iter_peek(&node_iter, l->b);
- if (!_k)
- return BTREE_INSERT_OK;
+ int ret, sectors;
- k = bkey_disassemble(l->b, _k, &unpacked);
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
/* Check if we're splitting a compressed extent: */
return bch2_rand_range(l1 + l2) > l1;
}
- if (force_reconstruct_read(c))
+ if (bch2_force_reconstruct_read)
return p1.idx > p2.idx;
return p1.idx < p2.idx;
!bch2_dev_is_readable(ca))
p.idx++;
- if (force_reconstruct_read(c) &&
+ if (bch2_force_reconstruct_read &&
!p.idx && p.has_ec)
p.idx++;
const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
- if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
+ if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX)
return "value too big";
return bch2_bkey_ptrs_invalid(c, k);
}
-void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k)
+void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
- const char *err;
- char buf[160];
- struct bucket_mark mark;
- struct bch_dev *ca;
-
- if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
- return;
-
- if (!percpu_down_read_trylock(&c->mark_lock))
- return;
+ bch2_bkey_ptrs_to_text(out, c, k);
+}
- bkey_for_each_ptr(ptrs, ptr) {
- ca = bch_dev_bkey_exists(c, ptr->dev);
+const char *bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
- mark = ptr_bucket_mark(ca, ptr);
+ if (bkey_val_bytes(k.k) <= sizeof(*bp.v))
+ return "value too small";
- err = "stale";
- if (gen_after(mark.gen, ptr->gen))
- goto err;
+ if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
+ return "value too big";
- err = "inconsistent";
- if (mark.data_type != BCH_DATA_btree ||
- mark.dirty_sectors < c->opts.btree_node_size)
- goto err;
- }
-out:
- percpu_up_read(&c->mark_lock);
- return;
-err:
- bch2_fs_inconsistent(c, "%s btree pointer %s: bucket %zi gen %i mark %08x",
- err, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
- PTR_BUCKET_NR(ca, ptr),
- mark.gen, (unsigned) mark.v.counter);
- goto out;
-}
+ if (c->sb.version < bcachefs_metadata_version_snapshot &&
+ bp.v->min_key.snapshot)
+ return "invalid min_key.snapshot";
-void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- bch2_bkey_ptrs_to_text(out, c, k);
+ return bch2_bkey_ptrs_invalid(c, k);
}
void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
{
struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
- pr_buf(out, "seq %llx sectors %u written %u min_key ",
+ pr_buf(out, "seq %llx written %u min_key ",
le64_to_cpu(bp.v->seq),
- le16_to_cpu(bp.v->sectors),
le16_to_cpu(bp.v->sectors_written));
bch2_bpos_to_text(out, bp.v->min_key);
btree_node_type_is_extents(btree_id) &&
bkey_cmp(bp.v->min_key, POS_MIN))
bp.v->min_key = write
- ? bkey_predecessor(bp.v->min_key)
- : bkey_successor(bp.v->min_key);
+ ? bpos_nosnap_predecessor(bp.v->min_key)
+ : bpos_nosnap_successor(bp.v->min_key);
}
/* KEY_TYPE_extent: */
return bch2_bkey_ptrs_invalid(c, k);
}
-void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k)
-{
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- char buf[160];
-
- if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) ||
- !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
- return;
-
- if (!percpu_down_read_trylock(&c->mark_lock))
- return;
-
- extent_for_each_ptr_decode(e, p, entry) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
- struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr);
- unsigned stale = gen_after(mark.gen, p.ptr.gen);
- unsigned disk_sectors = ptr_disk_sectors(p);
- unsigned mark_sectors = p.ptr.cached
- ? mark.cached_sectors
- : mark.dirty_sectors;
-
- bch2_fs_inconsistent_on(stale && !p.ptr.cached, c,
- "stale dirty pointer (ptr gen %u bucket %u",
- p.ptr.gen, mark.gen);
-
- bch2_fs_inconsistent_on(stale > 96, c,
- "key too stale: %i", stale);
-
- bch2_fs_inconsistent_on(!stale &&
- (mark.data_type != BCH_DATA_user ||
- mark_sectors < disk_sectors), c,
- "extent pointer not marked: %s:\n"
- "type %u sectors %u < %u",
- (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf),
- mark.data_type,
- mark_sectors, disk_sectors);
- }
-
- percpu_up_read(&c->mark_lock);
-}
-
void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
}
bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
- unsigned nr_replicas)
+ unsigned nr_replicas, bool compressed)
{
struct btree_trans trans;
struct btree_iter *iter;
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos,
+ for_each_btree_key(&trans, iter, BTREE_ID_extents, pos,
BTREE_ITER_SLOTS, k, err) {
if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
break;
- if (nr_replicas > bch2_bkey_nr_ptrs_fully_allocated(k)) {
+ if (nr_replicas > bch2_bkey_replicas(c, k) ||
+ (!compressed && bch2_bkey_sectors_compressed(k))) {
ret = false;
break;
}
}
+ bch2_trans_iter_put(&trans, iter);
+
bch2_trans_exit(&trans);
return ret;
}
+unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p = { 0 };
+ unsigned replicas = 0;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if (p.ptr.cached)
+ continue;
+
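+ /*
+ * An erasure coded pointer counts as the pointer itself plus the
+ * stripe's redundancy, since its data can also be reconstructed
+ * from the stripe's parity blocks:
+ */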
+ if (p.has_ec)
+ replicas += p.ec.redundancy;
+
+ replicas++;
+ }
+
+ return replicas;
+}
+
static unsigned bch2_extent_ptr_durability(struct bch_fs *c,
struct extent_ptr_decoded p)
{
ca = bch_dev_bkey_exists(c, p.ptr.dev);
- if (ca->mi.state != BCH_MEMBER_STATE_FAILED)
+ if (ca->mi.state != BCH_MEMBER_STATE_failed)
durability = max_t(unsigned, durability, ca->mi.durability);
- if (p.has_ec) {
- struct stripe *s =
- genradix_ptr(&c->stripes[0], p.ec.idx);
+ if (p.has_ec)
+ durability += p.ec.redundancy;
- if (WARN_ON(!s))
- goto out;
-
- durability += s->nr_redundant;
- }
-out:
return durability;
}
}
}
+void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry)
+{
+ union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
+ union bch_extent_entry *next = extent_entry_next(entry);
+
+ memmove_u64s(entry, next, (u64 *) end - (u64 *) next);
+ k->k.u64s -= extent_entry_u64s(entry);
+}
+
void bch2_bkey_append_ptr(struct bkey_i *k,
struct bch_extent_ptr ptr)
{
/* will only happen if all pointers were cached: */
if (!bch2_bkey_nr_ptrs(k.s_c))
- k.k->type = KEY_TYPE_discard;
+ k.k->type = KEY_TYPE_deleted;
- return bkey_whiteout(k.k);
+ return bkey_deleted(k.k);
}
void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ struct bch_devs_list devs;
const union bch_extent_entry *entry;
struct bch_extent_crc_unpacked crc;
unsigned size_ondisk = k.k->size;
const char *reason;
unsigned nonce = UINT_MAX;
+ unsigned i;
- if (k.k->type == KEY_TYPE_btree_ptr)
+ if (k.k->type == KEY_TYPE_btree_ptr ||
+ k.k->type == KEY_TYPE_btree_ptr_v2)
size_ondisk = c->opts.btree_node_size;
- if (k.k->type == KEY_TYPE_btree_ptr_v2)
- size_ondisk = le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors);
bkey_extent_entry_for_each(ptrs, entry) {
if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
}
}
+ devs = bch2_bkey_devs(k);
+ bubble_sort(devs.devs, devs.nr, u8_cmp);
+ for (i = 0; i + 1 < devs.nr; i++)
+ if (devs.devs[i] == devs.devs[i + 1])
+ return "multiple ptrs to same device";
+
return NULL;
}
len = where.offset - bkey_start_offset(k.k);
- k.k->p = where;
+ k.k->p.offset = where.offset;
k.k->size = len;
if (!len) {
/* KEY_TYPE_btree_ptr: */
const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c);
-void bch2_btree_ptr_debugcheck(struct bch_fs *, struct bkey_s_c);
void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
+const char *bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
#define bch2_bkey_ops_btree_ptr (struct bkey_ops) { \
.key_invalid = bch2_btree_ptr_invalid, \
- .key_debugcheck = bch2_btree_ptr_debugcheck, \
.val_to_text = bch2_btree_ptr_to_text, \
.swab = bch2_ptr_swab, \
}
#define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) { \
- .key_invalid = bch2_btree_ptr_invalid, \
- .key_debugcheck = bch2_btree_ptr_debugcheck, \
+ .key_invalid = bch2_btree_ptr_v2_invalid, \
.val_to_text = bch2_btree_ptr_v2_to_text, \
.swab = bch2_ptr_swab, \
.compat = bch2_btree_ptr_v2_compat, \
/* KEY_TYPE_extent: */
const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c);
-void bch2_extent_debugcheck(struct bch_fs *, struct bkey_s_c);
void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
enum merge_result bch2_extent_merge(struct bch_fs *,
struct bkey_s, struct bkey_s);
#define bch2_bkey_ops_extent (struct bkey_ops) { \
.key_invalid = bch2_extent_invalid, \
- .key_debugcheck = bch2_extent_debugcheck, \
.val_to_text = bch2_extent_to_text, \
.swab = bch2_ptr_swab, \
.key_normalize = bch2_extent_normalize, \
unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c);
bool bch2_bkey_is_incompressible(struct bkey_s_c);
unsigned bch2_bkey_sectors_compressed(struct bkey_s_c);
-bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned);
+bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned, bool);
+
+unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c);
unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s,
unsigned, unsigned);
+void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *);
void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr);
void bch2_extent_ptr_decoded_append(struct bkey_i *,
struct extent_ptr_decoded *);
/* Generic extent code: */
+enum bch_extent_overlap {
+ BCH_EXTENT_OVERLAP_ALL = 0,
+ BCH_EXTENT_OVERLAP_BACK = 1,
+ BCH_EXTENT_OVERLAP_FRONT = 2,
+ BCH_EXTENT_OVERLAP_MIDDLE = 3,
+};
+
+/* Returns how k overlaps with m */
+static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
+ const struct bkey *m)
+{
+ int cmp1 = bkey_cmp(k->p, m->p) < 0;
+ int cmp2 = bkey_cmp(bkey_start_pos(k),
+ bkey_start_pos(m)) > 0;
+
+ return (cmp1 << 1) + cmp2;
+}
+
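+/*
+ * Worked example (illustrative): with m spanning sectors [16,32):
+ *
+ *	k = [ 0,40)  ->  cmp1=0 cmp2=0  ->  ALL    (k covers all of m)
+ *	k = [24,40)  ->  cmp1=0 cmp2=1  ->  BACK   (k overlaps the back of m)
+ *	k = [ 0,24)  ->  cmp1=1 cmp2=0  ->  FRONT  (k overlaps the front of m)
+ *	k = [20,28)  ->  cmp1=1 cmp2=1  ->  MIDDLE (k is strictly inside m)
+ */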
int bch2_cut_front_s(struct bpos, struct bkey_s);
int bch2_cut_back_s(struct bpos, struct bkey_s);
{
struct bch_fs *c = trans->c;
struct btree_iter *dir_iter = NULL;
+ struct btree_iter *inode_iter = NULL;
struct bch_hash_info hash = bch2_hash_info_init(c, new_inode);
- u64 now = bch2_current_time(trans->c);
+ u64 now = bch2_current_time(c);
+ u64 dir_offset = 0;
int ret;
dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT);
if (!name)
new_inode->bi_flags |= BCH_INODE_UNLINKED;
- ret = bch2_inode_create(trans, new_inode,
- BLOCKDEV_INODE_MAX, 0,
- &c->unused_inode_hint);
+ inode_iter = bch2_inode_create(trans, new_inode, U32_MAX);
+ ret = PTR_ERR_OR_ZERO(inode_iter);
if (ret)
goto err;
ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
mode_to_type(new_inode->bi_mode),
name, new_inode->bi_inum,
+ &dir_offset,
BCH_HASH_SET_MUST_CREATE);
if (ret)
goto err;
}
+
+ if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
+ new_inode->bi_dir = dir_u->bi_inum;
+ new_inode->bi_dir_offset = dir_offset;
+ }
+
+ /* XXX use bch2_btree_iter_set_snapshot() */
+ inode_iter->snapshot = U32_MAX;
+ bch2_btree_iter_set_pos(inode_iter, SPOS(0, new_inode->bi_inum, U32_MAX));
+
+ ret = bch2_inode_write(trans, inode_iter, new_inode);
err:
+ bch2_trans_iter_put(trans, inode_iter);
bch2_trans_iter_put(trans, dir_iter);
return ret;
}
u64 inum, struct bch_inode_unpacked *dir_u,
struct bch_inode_unpacked *inode_u, const struct qstr *name)
{
+ struct bch_fs *c = trans->c;
struct btree_iter *dir_iter = NULL, *inode_iter = NULL;
struct bch_hash_info dir_hash;
- u64 now = bch2_current_time(trans->c);
+ u64 now = bch2_current_time(c);
+ u64 dir_offset = 0;
int ret;
inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT);
inode_u->bi_ctime = now;
bch2_inode_nlink_inc(inode_u);
+ inode_u->bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED;
+
dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0);
ret = PTR_ERR_OR_ZERO(dir_iter);
if (ret)
dir_u->bi_mtime = dir_u->bi_ctime = now;
- dir_hash = bch2_hash_info_init(trans->c, dir_u);
+ dir_hash = bch2_hash_info_init(c, dir_u);
- ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
- mode_to_type(inode_u->bi_mode),
- name, inum, BCH_HASH_SET_MUST_CREATE) ?:
- bch2_inode_write(trans, dir_iter, dir_u) ?:
+ ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
+ mode_to_type(inode_u->bi_mode),
+ name, inum, &dir_offset,
+ BCH_HASH_SET_MUST_CREATE);
+ if (ret)
+ goto err;
+
+ if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
+ inode_u->bi_dir = dir_inum;
+ inode_u->bi_dir_offset = dir_offset;
+ }
+
+ ret = bch2_inode_write(trans, dir_iter, dir_u) ?:
bch2_inode_write(trans, inode_iter, inode_u);
err:
bch2_trans_iter_put(trans, dir_iter);
struct bch_inode_unpacked *inode_u,
const struct qstr *name)
{
+ struct bch_fs *c = trans->c;
struct btree_iter *dir_iter = NULL, *dirent_iter = NULL,
*inode_iter = NULL;
struct bch_hash_info dir_hash;
- u64 inum, now = bch2_current_time(trans->c);
+ u64 inum, now = bch2_current_time(c);
struct bkey_s_c k;
int ret;
if (ret)
goto err;
- dir_hash = bch2_hash_info_init(trans->c, dir_u);
+ dir_hash = bch2_hash_info_init(c, dir_u);
dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash,
name, BTREE_ITER_INTENT);
const struct qstr *dst_name,
enum bch_rename_mode mode)
{
+ struct bch_fs *c = trans->c;
struct btree_iter *src_dir_iter = NULL, *dst_dir_iter = NULL;
struct btree_iter *src_inode_iter = NULL, *dst_inode_iter = NULL;
struct bch_hash_info src_hash, dst_hash;
- u64 src_inode, dst_inode, now = bch2_current_time(trans->c);
+ u64 src_inode, src_offset, dst_inode, dst_offset;
+ u64 now = bch2_current_time(c);
int ret;
src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir,
if (ret)
goto err;
- src_hash = bch2_hash_info_init(trans->c, src_dir_u);
+ src_hash = bch2_hash_info_init(c, src_dir_u);
if (dst_dir != src_dir) {
dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir,
if (ret)
goto err;
- dst_hash = bch2_hash_info_init(trans->c, dst_dir_u);
+ dst_hash = bch2_hash_info_init(c, dst_dir_u);
} else {
dst_dir_u = src_dir_u;
dst_hash = src_hash;
ret = bch2_dirent_rename(trans,
src_dir, &src_hash,
dst_dir, &dst_hash,
- src_name, &src_inode,
- dst_name, &dst_inode,
+ src_name, &src_inode, &src_offset,
+ dst_name, &dst_inode, &dst_offset,
mode);
if (ret)
goto err;
goto err;
}
+ if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
+ src_inode_u->bi_dir = dst_dir_u->bi_inum;
+ src_inode_u->bi_dir_offset = dst_offset;
+
+ if (mode == BCH_RENAME_EXCHANGE) {
+ dst_inode_u->bi_dir = src_dir_u->bi_inum;
+ dst_inode_u->bi_dir_offset = src_offset;
+ }
+ }
+
if (mode == BCH_RENAME_OVERWRITE) {
if (S_ISDIR(src_inode_u->bi_mode) !=
S_ISDIR(dst_inode_u->bi_mode)) {
#include "bcachefs.h"
#include "alloc_foreground.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
#include "btree_update.h"
#include "buckets.h"
#include "clock.h"
#include <trace/events/bcachefs.h>
#include <trace/events/writeback.h>
+static inline struct address_space *faults_disabled_mapping(void)
+{
+ return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL);
+}
+
+static inline void set_fdm_dropped_locks(void)
+{
+ current->faults_disabled_mapping =
+ (void *) (((unsigned long) current->faults_disabled_mapping)|1);
+}
+
+static inline bool fdm_dropped_locks(void)
+{
+ return ((unsigned long) current->faults_disabled_mapping) & 1;
+}
+
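+/*
+ * current->faults_disabled_mapping points at the mapping a dio write is
+ * currently targeting, so a page fault on that same mapping (e.g. when the
+ * write's userspace buffer is mmapped from the file being written to) can
+ * bail out instead of deadlocking on the pagecache lock. The pointer's low
+ * bit doubles as a flag: the fault handler sets it after dropping and
+ * retaking locks, which tells the dio write path to shoot down the page
+ * cache and retry rather than fail.
+ */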
struct quota_res {
u64 sectors;
};
struct closure cl;
struct kiocb *req;
long ret;
+ bool should_dirty;
struct bch_read_bio rbio;
};
/* for newly allocated pages: */
static void __bch2_page_state_release(struct page *page)
{
- struct bch_page_state *s = __bch2_page_state(page);
-
- if (!s)
- return;
-
- ClearPagePrivate(page);
- set_page_private(page, 0);
- put_page(page);
- kfree(s);
+ kfree(detach_page_private(page));
}
static void bch2_page_state_release(struct page *page)
{
- struct bch_page_state *s = bch2_page_state(page);
-
- if (!s)
- return;
-
- ClearPagePrivate(page);
- set_page_private(page, 0);
- put_page(page);
- kfree(s);
+ EBUG_ON(!PageLocked(page));
+ __bch2_page_state_release(page);
}
/* for newly allocated pages: */
return NULL;
spin_lock_init(&s->lock);
- /*
- * migrate_page_move_mapping() assumes that pages with private data
- * have their count elevated by 1.
- */
- get_page(page);
- set_page_private(page, (unsigned long) s);
- SetPagePrivate(page);
+ attach_page_private(page, s);
return s;
}
vm_fault_t bch2_page_fault(struct vm_fault *vmf)
{
struct file *file = vmf->vma->vm_file;
+ struct address_space *mapping = file->f_mapping;
+ struct address_space *fdm = faults_disabled_mapping();
struct bch_inode_info *inode = file_bch_inode(file);
int ret;
+ if (fdm == mapping)
+ return VM_FAULT_SIGBUS;
+
+ /* Lock ordering: */
+ if (fdm > mapping) {
+ struct bch_inode_info *fdm_host = to_bch_ei(fdm->host);
+
+ if (bch2_pagecache_add_tryget(&inode->ei_pagecache_lock))
+ goto got_lock;
+
+ bch2_pagecache_block_put(&fdm_host->ei_pagecache_lock);
+
+ bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+ bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+
+ bch2_pagecache_block_get(&fdm_host->ei_pagecache_lock);
+
+ /* Signal that lock has been dropped: */
+ set_fdm_dropped_locks();
+ return VM_FAULT_SIGBUS;
+ }
+
bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+got_lock:
ret = filemap_fault(vmf);
bch2_pagecache_add_put(&inode->ei_pagecache_lock);
if (ret != MIGRATEPAGE_SUCCESS)
return ret;
- if (PagePrivate(page)) {
- ClearPagePrivate(page);
- get_page(newpage);
- set_page_private(newpage, page_private(page));
- set_page_private(page, 0);
- put_page(page);
- SetPagePrivate(newpage);
- }
+ if (PagePrivate(page))
+ attach_page_private(newpage, detach_page_private(page));
if (mode != MIGRATE_SYNC_NO_COPY)
migrate_page_copy(newpage, page);
bio_put(bio);
}
-static inline void page_state_init_for_read(struct page *page)
-{
- SetPagePrivate(page);
- page->private = 0;
-}
-
struct readpages_iter {
struct address_space *mapping;
struct page **pages;
unsigned nr_pages;
- unsigned nr_added;
unsigned idx;
pgoff_t offset;
};
static int readpages_iter_init(struct readpages_iter *iter,
- struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+ struct readahead_control *ractl)
{
+ unsigned i, nr_pages = readahead_count(ractl);
+
memset(iter, 0, sizeof(*iter));
- iter->mapping = mapping;
- iter->offset = list_last_entry(pages, struct page, lru)->index;
+ iter->mapping = ractl->mapping;
+ iter->offset = readahead_index(ractl);
+ iter->nr_pages = nr_pages;
iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!iter->pages)
return -ENOMEM;
- while (!list_empty(pages)) {
- struct page *page = list_last_entry(pages, struct page, lru);
-
- __bch2_page_state_create(page, __GFP_NOFAIL);
-
- iter->pages[iter->nr_pages++] = page;
- list_del(&page->lru);
+ nr_pages = __readahead_batch(ractl, iter->pages, nr_pages);
+ for (i = 0; i < nr_pages; i++) {
+ __bch2_page_state_create(iter->pages[i], __GFP_NOFAIL);
+ put_page(iter->pages[i]);
}
return 0;
static inline struct page *readpage_iter_next(struct readpages_iter *iter)
{
- struct page *page;
- unsigned i;
- int ret;
-
- BUG_ON(iter->idx > iter->nr_added);
- BUG_ON(iter->nr_added > iter->nr_pages);
-
- if (iter->idx < iter->nr_added)
- goto out;
-
- while (1) {
- if (iter->idx == iter->nr_pages)
- return NULL;
-
- ret = add_to_page_cache_lru_vec(iter->mapping,
- iter->pages + iter->nr_added,
- iter->nr_pages - iter->nr_added,
- iter->offset + iter->nr_added,
- GFP_NOFS);
- if (ret > 0)
- break;
-
- page = iter->pages[iter->nr_added];
- iter->idx++;
- iter->nr_added++;
-
- __bch2_page_state_release(page);
- put_page(page);
- }
-
- iter->nr_added += ret;
+ if (iter->idx >= iter->nr_pages)
+ return NULL;
- for (i = iter->idx; i < iter->nr_added; i++)
- put_page(iter->pages[i]);
-out:
EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx);
return iter->pages[iter->idx];
struct readpages_iter *readpages_iter)
{
struct bch_fs *c = trans->c;
- struct bkey_on_stack sk;
+ struct bkey_buf sk;
int flags = BCH_READ_RETRY_IF_STALE|
BCH_READ_MAY_PROMOTE;
int ret = 0;
rbio->c = c;
rbio->start_time = local_clock();
- bkey_on_stack_init(&sk);
+ bch2_bkey_buf_init(&sk);
retry:
while (1) {
struct bkey_s_c k;
unsigned bytes, sectors, offset_into_extent;
+ enum btree_id data_btree = BTREE_ID_extents;
bch2_btree_iter_set_pos(iter,
POS(inum, rbio->bio.bi_iter.bi_sector));
bkey_start_offset(k.k);
sectors = k.k->size - offset_into_extent;
- bkey_on_stack_reassemble(&sk, c, k);
+ bch2_bkey_buf_reassemble(&sk, c, k);
- ret = bch2_read_indirect_extent(trans,
+ ret = bch2_read_indirect_extent(trans, &data_btree,
&offset_into_extent, &sk);
if (ret)
break;
if (bkey_extent_is_allocation(k.k))
bch2_add_page_sectors(&rbio->bio, k);
- bch2_read_extent(trans, rbio, k, offset_into_extent, flags);
+ bch2_read_extent(trans, rbio, iter->pos,
+ data_btree, k, offset_into_extent, flags);
if (flags & BCH_READ_LAST_FRAGMENT)
break;
goto retry;
if (ret) {
- bcache_io_error(c, &rbio->bio, "btree IO error %i", ret);
+ bch_err_inum_ratelimited(c, inum,
+ "read error %i from btree lookup", ret);
+ rbio->bio.bi_status = BLK_STS_IOERR;
bio_endio(&rbio->bio);
}
- bkey_on_stack_exit(&sk, c);
+ bch2_bkey_buf_exit(&sk, c);
}
-int bch2_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+void bch2_readahead(struct readahead_control *ractl)
{
- struct bch_inode_info *inode = to_bch_ei(mapping->host);
+ struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
struct btree_trans trans;
struct readpages_iter readpages_iter;
int ret;
- ret = readpages_iter_init(&readpages_iter, mapping, pages, nr_pages);
+ ret = readpages_iter_init(&readpages_iter, ractl);
BUG_ON(ret);
bch2_trans_init(&trans, c, 0, 0);
-
- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN,
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN,
BTREE_ITER_SLOTS);
bch2_pagecache_add_get(&inode->ei_pagecache_lock);
bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+ bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
kfree(readpages_iter.pages);
-
- return 0;
}
static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));
bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN,
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN,
BTREE_ITER_SLOTS);
bchfs_read(&trans, iter, rbio, inum, NULL);
+ bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
}
unsigned i;
if (io->op.error) {
+ set_bit(EI_INODE_ERROR, &io->inode->ei_flags);
+
bio_for_each_segment_all(bvec, bio, iter) {
struct bch_page_state *s;
/* O_DIRECT reads */
+static void bio_check_or_release(struct bio *bio, bool check_dirty)
+{
+ if (check_dirty) {
+ bio_check_pages_dirty(bio);
+ } else {
+ bio_release_pages(bio, false);
+ bio_put(bio);
+ }
+}
+
static void bch2_dio_read_complete(struct closure *cl)
{
struct dio_read *dio = container_of(cl, struct dio_read, cl);
dio->req->ki_complete(dio->req, dio->ret, 0);
- bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */
+ bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
}
static void bch2_direct_IO_read_endio(struct bio *bio)
static void bch2_direct_IO_read_split_endio(struct bio *bio)
{
+ struct dio_read *dio = bio->bi_private;
+ bool should_dirty = dio->should_dirty;
+
bch2_direct_IO_read_endio(bio);
- bio_check_pages_dirty(bio); /* transfers ownership */
+ bio_check_or_release(bio, should_dirty);
}
static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
dio->req = req;
dio->ret = ret;
+ /*
+ * This is one of the sketchier things I've encountered: we have to skip
+ * dirtying the pages of requests that come from inside the kernel (i.e.
+ * from the loopback driver), because we'd deadlock on page_lock.
+ */
+ dio->should_dirty = iter_is_iovec(iter);
goto start;
while (iter->count) {
}
offset += bio->bi_iter.bi_size;
- bio_set_pages_dirty(bio);
+
+ if (dio->should_dirty)
+ bio_set_pages_dirty(bio);
if (iter->count)
closure_get(&dio->cl);
closure_sync(&dio->cl);
closure_debug_destroy(&dio->cl);
ret = dio->ret;
- bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */
+ bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
return ret;
} else {
return -EIOCBQUEUED;
struct bio *bio = &dio->op.wbio.bio;
struct bvec_iter_all iter;
struct bio_vec *bv;
- unsigned unaligned;
- bool sync = dio->sync;
+ unsigned unaligned, iter_count;
+ bool sync = dio->sync, dropped_locks;
long ret;
if (dio->loop)
goto loop;
while (1) {
+ iter_count = dio->iter.count;
+
if (kthread)
kthread_use_mm(dio->mm);
BUG_ON(current->faults_disabled_mapping);
ret = bio_iov_iter_get_pages(bio, &dio->iter);
+ dropped_locks = fdm_dropped_locks();
+
current->faults_disabled_mapping = NULL;
if (kthread)
kthread_unuse_mm(dio->mm);
+ /*
+ * If the fault handler returned an error but also signalled
+ * that it dropped & retook ei_pagecache_lock, we just need to
+ * re-shoot down the page cache and retry:
+ */
+ if (dropped_locks && ret)
+ ret = 0;
+
if (unlikely(ret < 0))
goto err;
+ if (unlikely(dropped_locks)) {
+ ret = write_invalidate_inode_pages_range(mapping,
+ req->ki_pos,
+ req->ki_pos + iter_count - 1);
+ if (unlikely(ret))
+ goto err;
+
+ if (!bio->bi_iter.bi_size)
+ continue;
+ }
+
unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1);
bio->bi_iter.bi_size -= unaligned;
iov_iter_revert(&dio->iter, unaligned);
dio->op.opts.data_replicas, 0);
if (unlikely(ret) &&
!bch2_check_range_allocated(c, dio->op.pos,
- bio_sectors(bio), dio->op.opts.data_replicas))
+ bio_sectors(bio),
+ dio->op.opts.data_replicas,
+ dio->op.opts.compression != 0))
goto err;
task_io_account_write(bio->bi_iter.bi_size);
bio_for_each_segment_all(bv, bio, iter)
put_page(bv->bv_page);
- if (!dio->iter.count || dio->op.error)
+
+ if (dio->op.error) {
+ set_bit(EI_INODE_ERROR, &inode->ei_flags);
+ break;
+ }
+
+ if (!dio->iter.count)
break;
bio_reset(bio);
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, start, 0, k, ret) {
+ for_each_btree_key(&trans, iter, BTREE_ID_extents, start, 0, k, ret) {
if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
break;
break;
}
}
+ bch2_trans_iter_put(&trans, iter);
return bch2_trans_exit(&trans) ?: ret;
}
bch2_trans_init(&trans, c, 0, 0);
iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, 0);
ret = PTR_ERR_OR_ZERO(iter);
+ bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
if (ret)
if (ret)
goto err;
- BUG_ON(inode->v.i_size < inode_u.bi_size);
+ WARN_ON(!test_bit(EI_INODE_ERROR, &inode->ei_flags) &&
+ inode->v.i_size < inode_u.bi_size);
if (iattr->ia_size > inode->v.i_size) {
ret = bch2_extend(inode, &inode_u, iattr);
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct address_space *mapping = inode->v.i_mapping;
- struct bkey_on_stack copy;
+ struct bkey_buf copy;
struct btree_trans trans;
- struct btree_iter *src, *dst;
+ struct btree_iter *src, *dst, *del;
loff_t shift, new_size;
u64 src_start;
- int ret;
+ int ret = 0;
if ((offset | len) & (block_bytes(c) - 1))
return -EINVAL;
- bkey_on_stack_init(©);
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256);
-
/*
* We need i_mutex to keep the page cache consistent with the extents
* btree, and the btree consistent with i_size - we don't need outside
goto err;
}
- src = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+ bch2_bkey_buf_init(©);
+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256);
+ src = bch2_trans_get_iter(&trans, BTREE_ID_extents,
POS(inode->v.i_ino, src_start >> 9),
BTREE_ITER_INTENT);
- BUG_ON(IS_ERR_OR_NULL(src));
-
dst = bch2_trans_copy_iter(&trans, src);
- BUG_ON(IS_ERR_OR_NULL(dst));
+ del = bch2_trans_copy_iter(&trans, src);
- while (1) {
+ while (ret == 0 || ret == -EINTR) {
struct disk_reservation disk_res =
bch2_disk_reservation_init(c, 0);
struct bkey_i delete;
? bch2_btree_iter_peek_prev(src)
: bch2_btree_iter_peek(src);
if ((ret = bkey_err(k)))
- goto bkey_err;
+ continue;
if (!k.k || k.k->p.inode != inode->v.i_ino)
break;
- BUG_ON(bkey_cmp(src->pos, bkey_start_pos(k.k)));
-
if (insert &&
bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0)
break;
reassemble:
- bkey_on_stack_reassemble(©, c, k);
+ bch2_bkey_buf_reassemble(©, c, k);
if (insert &&
bkey_cmp(bkey_start_pos(k.k), move_pos) < 0)
ret = bch2_extent_atomic_end(dst, copy.k, &atomic_end);
if (ret)
- goto bkey_err;
+ continue;
if (bkey_cmp(atomic_end, copy.k->k.p)) {
if (insert) {
delete.k.p = copy.k->k.p;
delete.k.size = copy.k->k.size;
delete.k.p.offset -= shift >> 9;
+ bch2_btree_iter_set_pos(del, bkey_start_pos(&delete.k));
next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p;
BUG_ON(ret);
}
- bch2_btree_iter_set_pos(src, bkey_start_pos(&delete.k));
-
- ret = bch2_trans_update(&trans, src, &delete, trigger_flags) ?:
+ ret = bch2_trans_update(&trans, del, &delete, trigger_flags) ?:
bch2_trans_update(&trans, dst, copy.k, trigger_flags) ?:
bch2_trans_commit(&trans, &disk_res,
&inode->ei_journal_seq,
BTREE_INSERT_NOFAIL);
bch2_disk_reservation_put(c, &disk_res);
-bkey_err:
+
if (!ret)
bch2_btree_iter_set_pos(src, next_pos);
-
- if (ret == -EINTR)
- ret = 0;
- if (ret)
- goto err;
-
- bch2_trans_cond_resched(&trans);
}
- bch2_trans_unlock(&trans);
+ bch2_trans_iter_put(&trans, del);
+ bch2_trans_iter_put(&trans, dst);
+ bch2_trans_iter_put(&trans, src);
+ bch2_trans_exit(&trans);
+ bch2_bkey_buf_exit(©, c);
+
+ if (ret)
+ goto err;
if (!insert) {
i_size_write(&inode->v, new_size);
mutex_unlock(&inode->ei_update_lock);
}
err:
- bch2_trans_exit(&trans);
- bkey_on_stack_exit(©, c);
bch2_pagecache_block_put(&inode->ei_pagecache_lock);
inode_unlock(&inode->v);
return ret;
truncate_pagecache_range(&inode->v, offset, end - 1);
}
- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
POS(inode->v.i_ino, block_start >> 9),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
end_pos = POS(inode->v.i_ino, block_end >> 9);
- while (bkey_cmp(iter->pos, end_pos) < 0) {
+ while (!ret && bkey_cmp(iter->pos, end_pos) < 0) {
s64 i_sectors_delta = 0;
struct disk_reservation disk_res = { 0 };
struct quota_res quota_res = { 0 };
bch2_disk_reservation_put(c, &disk_res);
if (ret == -EINTR)
ret = 0;
- if (ret)
- goto err;
}
+ bch2_trans_iter_put(&trans, iter);
+
+ if (ret)
+ goto err;
/*
* Do we need to extend the file?
ret = PTR_ERR_OR_ZERO(inode_iter);
} while (ret == -EINTR);
+ bch2_trans_iter_put(&trans, inode_iter);
bch2_trans_unlock(&trans);
if (ret)
u64 aligned_len;
loff_t ret = 0;
- if (!c->opts.reflink)
- return -EOPNOTSUPP;
-
if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY))
return -EINVAL;
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
+ for_each_btree_key(&trans, iter, BTREE_ID_extents,
POS(inode->v.i_ino, offset >> 9), 0, k, ret) {
if (k.k->p.inode != inode->v.i_ino) {
break;
} else if (k.k->p.offset >> 9 > isize)
break;
}
+ bch2_trans_iter_put(&trans, iter);
ret = bch2_trans_exit(&trans) ?: ret;
if (ret)
int pg_offset;
loff_t ret = -1;
- page = find_lock_entry(mapping, index);
- if (!page || xa_is_value(page))
+ page = find_lock_page(mapping, index);
+ if (!page)
return offset;
pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1));
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
+ for_each_btree_key(&trans, iter, BTREE_ID_extents,
POS(inode->v.i_ino, offset >> 9),
BTREE_ITER_SLOTS, k, ret) {
if (k.k->p.inode != inode->v.i_ino) {
offset = max(offset, bkey_start_offset(k.k) << 9);
}
}
+ bch2_trans_iter_put(&trans, iter);
ret = bch2_trans_exit(&trans) ?: ret;
if (ret)
int bch2_readpage(struct file *, struct page *);
int bch2_writepages(struct address_space *, struct writeback_control *);
-int bch2_readpages(struct file *, struct address_space *,
- struct list_head *, unsigned);
+void bch2_readahead(struct readahead_control *);
int bch2_write_begin(struct file *, struct address_space *, loff_t,
unsigned, unsigned, struct page **, void **);
struct bch_inode_info *src,
const char __user *name)
{
+ struct bch_hash_info hash = bch2_hash_info_init(c, &src->ei_inode);
struct bch_inode_info *dst;
struct inode *vinode = NULL;
char *kname = NULL;
qstr.name = kname;
ret = -ENOENT;
- inum = bch2_dirent_lookup(c, src->v.i_ino,
- &src->ei_str_hash,
+ inum = bch2_dirent_lookup(c, src->v.i_ino, &hash,
&qstr);
if (!inum)
goto err1;
#include "bcachefs.h"
#include "acl.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
#include "btree_update.h"
#include "buckets.h"
#include "chardev.h"
struct bch_inode_info *dst,
u64 journal_seq)
{
+ /*
+ * atomic64_cmpxchg has a fallback for archs that don't support it,
+ * cmpxchg does not:
+ */
+ atomic64_t *dst_seq = (void *) &dst->ei_journal_seq;
u64 old, v = READ_ONCE(dst->ei_journal_seq);
do {
old = v;
if (old >= journal_seq)
break;
- } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old);
+ } while ((v = atomic64_cmpxchg(dst_seq, old, journal_seq)) != old);
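+ /*
+ * i.e. an atomic max: ei_journal_seq only moves forward, and an
+ * updater that observes a larger value already installed stops early.
+ */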
bch2_journal_set_has_inum(&c->journal, dst->v.i_ino, journal_seq);
}
__pagecache_lock_put(lock, 1);
}
+bool bch2_pagecache_add_tryget(struct pagecache_lock *lock)
+{
+ return __pagecache_lock_tryget(lock, 1);
+}
+
void bch2_pagecache_add_get(struct pagecache_lock *lock)
{
__pagecache_lock_get(lock, 1);
return &inode->v;
}
+static int inum_test(struct inode *inode, void *p)
+{
+ unsigned long *ino = p;
+
+ return *ino == inode->i_ino;
+}
+
static struct bch_inode_info *
__bch2_create(struct bch_inode_info *dir, struct dentry *dentry,
umode_t mode, dev_t rdev, bool tmpfile)
if (!tmpfile)
mutex_lock(&dir->ei_update_lock);
- bch2_trans_init(&trans, c, 8, 1024);
+ bch2_trans_init(&trans, c, 8,
+ 2048 + (!tmpfile ? dentry->d_name.len : 0));
retry:
bch2_trans_begin(&trans);
* thread pulling the inode in and modifying it:
*/
- old = to_bch_ei(insert_inode_locked2(&inode->v));
- if (unlikely(old)) {
+ inode->v.i_state |= I_CREATING;
+ old = to_bch_ei(inode_insert5(&inode->v, inode->v.i_ino,
+ inum_test, NULL, &inode->v.i_ino));
+ BUG_ON(!old);
+
+ if (unlikely(old != inode)) {
/*
* We raced, another process pulled the new inode into cache
* before us:
{
struct bch_fs *c = vdir->i_sb->s_fs_info;
struct bch_inode_info *dir = to_bch_ei(vdir);
+ struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
struct inode *vinode = NULL;
u64 inum;
- inum = bch2_dirent_lookup(c, dir->v.i_ino,
- &dir->ei_str_hash,
+ inum = bch2_dirent_lookup(c, dir->v.i_ino, &hash,
&dentry->d_name);
if (inum)
mutex_lock(&inode->ei_update_lock);
bch2_trans_init(&trans, c, 4, 1024);
- do {
- bch2_trans_begin(&trans);
- ret = bch2_link_trans(&trans,
+ ret = __bch2_trans_do(&trans, NULL, &inode->ei_journal_seq,
+ BTREE_INSERT_NOUNLOCK,
+ bch2_link_trans(&trans,
dir->v.i_ino,
inode->v.i_ino, &dir_u, &inode_u,
- &dentry->d_name) ?:
- bch2_trans_commit(&trans, NULL,
- &inode->ei_journal_seq,
- BTREE_INSERT_NOUNLOCK);
- } while (ret == -EINTR);
+ &dentry->d_name));
if (likely(!ret)) {
BUG_ON(inode_u.bi_inum != inode->v.i_ino);
bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
bch2_trans_init(&trans, c, 4, 1024);
- do {
- bch2_trans_begin(&trans);
-
- ret = bch2_unlink_trans(&trans,
+ ret = __bch2_trans_do(&trans, NULL, &dir->ei_journal_seq,
+ BTREE_INSERT_NOUNLOCK|
+ BTREE_INSERT_NOFAIL,
+ bch2_unlink_trans(&trans,
dir->v.i_ino, &dir_u,
- &inode_u, &dentry->d_name) ?:
- bch2_trans_commit(&trans, NULL,
- &dir->ei_journal_seq,
- BTREE_INSERT_NOUNLOCK|
- BTREE_INSERT_NOFAIL);
- } while (ret == -EINTR);
+ &inode_u, &dentry->d_name));
if (likely(!ret)) {
BUG_ON(inode_u.bi_inum != inode->v.i_ino);
goto err;
}
-retry:
- bch2_trans_begin(&trans);
- ret = bch2_rename_trans(&trans,
- src_dir->v.i_ino, &src_dir_u,
- dst_dir->v.i_ino, &dst_dir_u,
- &src_inode_u,
- &dst_inode_u,
- &src_dentry->d_name,
- &dst_dentry->d_name,
- mode) ?:
- bch2_trans_commit(&trans, NULL,
- &journal_seq,
- BTREE_INSERT_NOUNLOCK);
- if (ret == -EINTR)
- goto retry;
+ ret = __bch2_trans_do(&trans, NULL, &journal_seq,
+ BTREE_INSERT_NOUNLOCK,
+ bch2_rename_trans(&trans,
+ src_dir->v.i_ino, &src_dir_u,
+ dst_dir->v.i_ino, &dst_dir_u,
+ &src_inode_u,
+ &dst_inode_u,
+ &src_dentry->d_name,
+ &dst_dentry->d_name,
+ mode));
if (unlikely(ret))
goto err;
bch2_setattr_copy(inode, &inode_u, attr);
if (attr->ia_valid & ATTR_MODE) {
- ret = bch2_acl_chmod(&trans, inode, inode_u.bi_mode, &acl);
+ ret = bch2_acl_chmod(&trans, &inode_u, inode_u.bi_mode, &acl);
if (ret)
goto btree_err;
}
BTREE_INSERT_NOUNLOCK|
BTREE_INSERT_NOFAIL);
btree_err:
+ bch2_trans_iter_put(&trans, inode_iter);
+
if (ret == -EINTR)
goto retry;
if (unlikely(ret))
struct fiemap_extent_info *info,
struct bkey_s_c k, unsigned flags)
{
- if (bkey_extent_is_data(k.k)) {
+ if (bkey_extent_is_direct_data(k.k)) {
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
}
return 0;
+ } else if (bkey_extent_is_inline_data(k.k)) {
+ return fiemap_fill_next_extent(info,
+ bkey_start_offset(k.k) << 9,
+ 0, k.k->size << 9,
+ flags|
+ FIEMAP_EXTENT_DATA_INLINE);
} else if (k.k->type == KEY_TYPE_reservation) {
return fiemap_fill_next_extent(info,
bkey_start_offset(k.k) << 9,
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
- struct bkey_on_stack cur, prev;
+ struct bkey_buf cur, prev;
struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
unsigned offset_into_extent, sectors;
bool have_extent = false;
if (start + len < start)
return -EINVAL;
- bkey_on_stack_init(&cur);
- bkey_on_stack_init(&prev);
+ bch2_bkey_buf_init(&cur);
+ bch2_bkey_buf_init(&prev);
bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
POS(ei->v.i_ino, start >> 9), 0);
retry:
while ((k = bch2_btree_iter_peek(iter)).k &&
!(ret = bkey_err(k)) &&
bkey_cmp(iter->pos, end) < 0) {
+ enum btree_id data_btree = BTREE_ID_extents;
+
if (!bkey_extent_is_data(k.k) &&
k.k->type != KEY_TYPE_reservation) {
- bch2_btree_iter_next(iter);
+ bch2_btree_iter_advance(iter);
continue;
}
bkey_start_offset(k.k);
sectors = k.k->size - offset_into_extent;
- bkey_on_stack_realloc(&cur, c, k.k->u64s);
- bkey_on_stack_realloc(&prev, c, k.k->u64s);
- bkey_reassemble(cur.k, k);
+ bch2_bkey_buf_reassemble(&cur, c, k);
- ret = bch2_read_indirect_extent(&trans,
+ ret = bch2_read_indirect_extent(&trans, &data_btree,
&offset_into_extent, &cur);
if (ret)
break;
k = bkey_i_to_s_c(cur.k);
+ bch2_bkey_buf_realloc(&prev, c, k.k->u64s);
sectors = min(sectors, k.k->size - offset_into_extent);
- if (offset_into_extent)
- bch2_cut_front(POS(k.k->p.inode,
- bkey_start_offset(k.k) +
- offset_into_extent),
- cur.k);
+ bch2_cut_front(POS(k.k->p.inode,
+ bkey_start_offset(k.k) +
+ offset_into_extent),
+ cur.k);
bch2_key_resize(&cur.k->k, sectors);
cur.k->k.p = iter->pos;
cur.k->k.p.offset += cur.k->k.size;
bkey_copy(prev.k, cur.k);
have_extent = true;
- if (k.k->type == KEY_TYPE_reflink_v)
- bch2_btree_iter_set_pos(iter, k.k->p);
- else
- bch2_btree_iter_next(iter);
+ bch2_btree_iter_set_pos(iter,
+ POS(iter->pos.inode, iter->pos.offset + sectors));
}
if (ret == -EINTR)
ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
FIEMAP_EXTENT_LAST);
+ bch2_trans_iter_put(&trans, iter);
ret = bch2_trans_exit(&trans) ?: ret;
- bkey_on_stack_exit(&cur, c);
- bkey_on_stack_exit(&prev, c);
+ bch2_bkey_buf_exit(&cur, c);
+ bch2_bkey_buf_exit(&prev, c);
return ret < 0 ? ret : 0;
}
.open = generic_file_open,
.fsync = bch2_fsync,
.splice_read = generic_file_splice_read,
- /*
- * Broken, on v5.3:
+#if 0
+ /* Busted: */
.splice_write = iter_file_splice_write,
- */
+#endif
.fallocate = bch2_fallocate_dispatch,
.unlocked_ioctl = bch2_fs_file_ioctl,
#ifdef CONFIG_COMPAT
.writepage = bch2_writepage,
.readpage = bch2_readpage,
.writepages = bch2_writepages,
- .readpages = bch2_readpages,
+ .readahead = bch2_readahead,
.set_page_dirty = __set_page_dirty_nobuffers,
.write_begin = bch2_write_begin,
.write_end = bch2_write_end,
inode->v.i_generation = bi->bi_generation;
inode->v.i_size = bi->bi_size;
+ inode->ei_flags = 0;
inode->ei_journal_seq = 0;
inode->ei_quota_reserved = 0;
- inode->ei_str_hash = bch2_hash_info_init(c, bi);
inode->ei_qid = bch_qid(bi);
inode->v.i_mapping->a_ops = &bch_address_space_operations;
KEY_TYPE_QUOTA_WARN);
bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
KEY_TYPE_QUOTA_WARN);
- bch2_inode_rm(c, inode->v.i_ino);
+ bch2_inode_rm(c, inode->v.i_ino, true);
}
}
struct bch_fs *c = sb->s_fs_info;
struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
unsigned shift = sb->s_blocksize_bits - 9;
+ /*
+ * this assumes inodes take up 64 bytes, which is a decent average
+ * number:
+ */
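+ /*
+ * capacity and used are in 512 byte sectors, hence:
+ * free bytes / 64 == free sectors * (512 / 64) == free sectors << 3
+ */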
+ u64 avail_inodes = ((usage.capacity - usage.used) << 3);
u64 fsid;
buf->f_type = BCACHEFS_STATFS_MAGIC;
buf->f_blocks = usage.capacity >> shift;
buf->f_bfree = (usage.capacity - usage.used) >> shift;
buf->f_bavail = buf->f_bfree;
- buf->f_files = 0;
- buf->f_ffree = 0;
+
+ buf->f_files = usage.nr_inodes + avail_inodes;
+ buf->f_ffree = avail_inodes;
fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
}
void bch2_pagecache_add_put(struct pagecache_lock *);
+bool bch2_pagecache_add_tryget(struct pagecache_lock *);
void bch2_pagecache_add_get(struct pagecache_lock *);
void bch2_pagecache_block_put(struct pagecache_lock *);
void bch2_pagecache_block_get(struct pagecache_lock *);
struct bch_inode_info {
struct inode v;
+ unsigned long ei_flags;
struct mutex ei_update_lock;
u64 ei_journal_seq;
struct mutex ei_quota_lock;
struct bch_qid ei_qid;
- struct bch_hash_info ei_str_hash;
-
/* copy of inode in btree: */
struct bch_inode_unpacked ei_inode;
};
+/*
+ * Set if we've gotten a btree error for this inode, and thus the vfs inode and
+ * btree inode may be inconsistent:
+ */
+#define EI_INODE_ERROR 0
+
#define to_bch_ei(_inode) \
container_of_or_null(_inode, struct bch_inode_info, v)
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
#include "btree_update.h"
#include "dirent.h"
#include "error.h"
u64 sectors = 0;
int ret;
- for_each_btree_key(trans, iter, BTREE_ID_EXTENTS,
+ for_each_btree_key(trans, iter, BTREE_ID_extents,
POS(inum, 0), 0, k, ret) {
if (k.k->p.inode != inum)
break;
buf[name.len] = '\0';
name.name = buf;
- ret = bch2_inode_find_by_inum_trans(trans, dir_inum, &dir_inode);
+ ret = __bch2_inode_find_by_inum_trans(trans, dir_inum, &dir_inode, 0);
if (ret && ret != -EINTR)
bch_err(c, "remove_dirent: err %i looking up directory inode", ret);
if (ret)
struct inode_walker *w, u64 inum)
{
if (inum != w->cur_inum) {
- int ret = bch2_inode_find_by_inum_trans(trans, inum,
- &w->inode);
+ int ret = __bch2_inode_find_by_inum_trans(trans, inum,
+ &w->inode, 0);
if (ret && ret != -ENOENT)
return ret;
bch2_trans_update(trans, k_iter, &delete, 0);
return bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode,
- tmp, BCH_HASH_SET_MUST_CREATE);
+ tmp, 0);
}
static int fsck_hash_delete_at(struct btree_trans *trans,
return 0;
iter = bch2_trans_copy_iter(trans, h->chain);
- BUG_ON(IS_ERR(iter));
for_each_btree_key_continue(iter, 0, k2, ret) {
if (bkey_cmp(k2.k->p, k.k->p) >= 0)
struct hash_check *h,
struct btree_iter *k_iter, struct bkey_s_c k)
{
- bool hole = (k.k->type != KEY_TYPE_whiteout &&
+ bool hole = (k.k->type != KEY_TYPE_hash_whiteout &&
k.k->type != desc.key_type);
if (hole || k.k->p.offset > h->chain_end + 1)
hash_stop_chain(trans, h);
if (!hole) {
- if (!h->chain) {
+ if (!h->chain)
h->chain = bch2_trans_copy_iter(trans, k_iter);
- BUG_ON(IS_ERR(h->chain));
- }
h->chain_end = k.k->p.offset;
}
bch_err(c, "hash_redo_key err %i", ret);
return ret;
}
- return 1;
+ return -EINTR;
}
ret = hash_check_duplicates(trans, desc, h, k_iter, k);
if (fsck_err(c, "cannot fix dirent by removing trailing garbage %s (%zu)\n"
"hash table key at wrong offset: btree %u, offset %llu, "
"hashed to %llu chain starts at %llu\n%s",
- buf, strlen(buf), BTREE_ID_DIRENTS,
+ buf, strlen(buf), BTREE_ID_dirents,
k->k->p.offset, hash, h->chain->pos.offset,
(bch2_bkey_val_to_text(&PBUF(buf), c,
*k), buf))) {
goto err;
}
-static int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size)
-{
- return bch2_btree_delete_range(c, BTREE_ID_EXTENTS,
- POS(inode_nr, round_up(new_size, block_bytes(c)) >> 9),
- POS(inode_nr + 1, 0), NULL);
-}
-
-static int bch2_fix_overlapping_extent(struct btree_trans *trans,
- struct btree_iter *iter,
+static int fix_overlapping_extent(struct btree_trans *trans,
struct bkey_s_c k, struct bpos cut_at)
{
- struct btree_iter *u_iter;
+ struct btree_iter *iter;
struct bkey_i *u;
int ret;
bkey_reassemble(u, k);
bch2_cut_front(cut_at, u);
- u_iter = bch2_trans_copy_iter(trans, iter);
- ret = PTR_ERR_OR_ZERO(u_iter);
- if (ret)
- return ret;
/*
- * We don't want to go through the
- * extent_handle_overwrites path:
+ * We don't want to go through the extent_handle_overwrites path:
+ *
+ * XXX: this is going to screw up disk accounting, extent triggers
+ * assume things about extent overwrites - we should be running the
+ * triggers manually here
*/
- __bch2_btree_iter_set_pos(u_iter, u->k.p, false);
+ iter = bch2_trans_get_iter(trans, BTREE_ID_extents, u->k.p,
+ BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS);
- /*
- * XXX: this is going to leave disk space
- * accounting slightly wrong
- */
- ret = bch2_trans_update(trans, u_iter, u, 0);
- bch2_trans_iter_put(trans, u_iter);
- return ret;
+ BUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
+ bch2_trans_update(trans, iter, u, BTREE_TRIGGER_NORUN);
+ bch2_trans_iter_put(trans, iter);
+
+ return bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW);
}
/*
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
- struct bkey_on_stack prev;
- u64 i_sectors;
+ struct bkey_buf prev;
+ u64 i_sectors = 0;
int ret = 0;
- bkey_on_stack_init(&prev);
+ bch2_bkey_buf_init(&prev);
prev.k->k = KEY(0, 0, 0);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
bch_verbose(c, "checking extents");
- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
POS(BCACHEFS_ROOT_INO, 0),
BTREE_ITER_INTENT);
retry:
- for_each_btree_key_continue(iter, 0, k, ret) {
+ while ((k = bch2_btree_iter_peek(iter)).k &&
+ !(ret = bkey_err(k))) {
+ if (w.have_inode &&
+ w.cur_inum != k.k->p.inode &&
+ !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) &&
+ fsck_err_on(w.inode.bi_sectors != i_sectors, c,
+ "inode %llu has incorrect i_sectors: got %llu, should be %llu",
+ w.inode.bi_inum,
+ w.inode.bi_sectors, i_sectors)) {
+ struct btree_iter *inode_iter =
+ bch2_trans_get_iter(&trans, BTREE_ID_inodes,
+ POS(0, w.cur_inum),
+ BTREE_ITER_INTENT);
+
+ w.inode.bi_sectors = i_sectors;
+
+ ret = __bch2_trans_do(&trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ bch2_inode_write(&trans, inode_iter, &w.inode));
+ bch2_trans_iter_put(&trans, inode_iter);
+ if (ret)
+ break;
+ }
+
if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) {
char buf1[200];
char buf2[200];
bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k));
bch2_bkey_val_to_text(&PBUF(buf2), c, k);
- if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) {
- ret = __bch2_trans_do(&trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
- bch2_fix_overlapping_extent(&trans,
- iter, k, prev.k->k.p));
- if (ret)
- goto err;
- }
+ if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2))
+ return fix_overlapping_extent(&trans, k, prev.k->k.p) ?: -EINTR;
}
- bkey_on_stack_reassemble(&prev, c, k);
ret = walk_inode(&trans, &w, k.k->p.inode);
if (ret)
break;
+ if (w.first_this_inode)
+ i_sectors = 0;
+
if (fsck_err_on(!w.have_inode, c,
- "extent type %u for missing inode %llu",
- k.k->type, k.k->p.inode) ||
+ "extent type %u for missing inode %llu",
+ k.k->type, k.k->p.inode) ||
fsck_err_on(w.have_inode &&
- !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c,
- "extent type %u for non regular file, inode %llu mode %o",
- k.k->type, k.k->p.inode, w.inode.bi_mode)) {
- bch2_trans_unlock(&trans);
-
- ret = bch2_inode_truncate(c, k.k->p.inode, 0);
- if (ret)
- goto err;
- continue;
+ !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c,
+ "extent type %u for non regular file, inode %llu mode %o",
+ k.k->type, k.k->p.inode, w.inode.bi_mode)) {
+ bch2_fs_lazy_rw(c);
+ return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents,
+ POS(k.k->p.inode, 0),
+ POS(k.k->p.inode, U64_MAX),
+ NULL) ?: -EINTR;
}
- if (fsck_err_on(w.first_this_inode &&
- w.have_inode &&
- !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) &&
- w.inode.bi_sectors !=
- (i_sectors = bch2_count_inode_sectors(&trans, w.cur_inum)),
- c, "inode %llu has incorrect i_sectors: got %llu, should be %llu",
- w.inode.bi_inum,
- w.inode.bi_sectors, i_sectors)) {
- struct bkey_inode_buf p;
-
- w.inode.bi_sectors = i_sectors;
-
- bch2_trans_unlock(&trans);
-
- bch2_inode_pack(&p, &w.inode);
-
- ret = bch2_btree_insert(c, BTREE_ID_INODES,
- &p.inode.k_i, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW);
- if (ret) {
- bch_err(c, "error in fsck: error %i updating inode", ret);
- goto err;
- }
-
- /* revalidate iterator: */
- k = bch2_btree_iter_peek(iter);
+ if (fsck_err_on(w.have_inode &&
+ !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
+ k.k->type != KEY_TYPE_reservation &&
+ k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c,
+ "extent type %u offset %llu past end of inode %llu, i_size %llu",
+ k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) {
+ bch2_fs_lazy_rw(c);
+ return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents,
+ POS(k.k->p.inode, round_up(w.inode.bi_size, block_bytes(c))),
+ POS(k.k->p.inode, U64_MAX),
+ NULL) ?: -EINTR;
}
- if (fsck_err_on(w.have_inode &&
- !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
- k.k->type != KEY_TYPE_reservation &&
- k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c,
- "extent type %u offset %llu past end of inode %llu, i_size %llu",
- k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) {
- bch2_trans_unlock(&trans);
+ if (bkey_extent_is_allocation(k.k))
+ i_sectors += k.k->size;
+ bch2_bkey_buf_reassemble(&prev, c, k);
- ret = bch2_inode_truncate(c, k.k->p.inode,
- w.inode.bi_size);
- if (ret)
- goto err;
- continue;
- }
+ bch2_btree_iter_advance(iter);
}
-err:
fsck_err:
if (ret == -EINTR)
goto retry;
- bkey_on_stack_exit(&prev, c);
+ bch2_trans_iter_put(&trans, iter);
+ bch2_bkey_buf_exit(&prev, c);
return bch2_trans_exit(&trans) ?: ret;
}
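The explicit peek/advance loop used above (and repeated in the dirents and xattrs passes below) is the same idiom each time; stripped to a skeleton, using only calls visible in the patch:

	while ((k = bch2_btree_iter_peek(iter)).k &&
	       !(ret = bkey_err(k))) {
		/*
		 * examine k; fixes that commit a transaction return or break
		 * with -EINTR so the whole pass restarts from the retry: label
		 */
		bch2_btree_iter_advance(iter);
	}
	bch2_trans_iter_put(&trans, iter);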
hash_check_init(&h);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS,
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_dirents,
POS(BCACHEFS_ROOT_INO, 0), 0);
retry:
- for_each_btree_key_continue(iter, 0, k, ret) {
+ while ((k = bch2_btree_iter_peek(iter)).k &&
+ !(ret = bkey_err(k))) {
struct bkey_s_c_dirent d;
struct bch_inode_unpacked target;
bool have_target;
continue;
}
- ret = bch2_inode_find_by_inum_trans(&trans, d_inum, &target);
+ ret = __bch2_inode_find_by_inum_trans(&trans, d_inum, &target, 0);
if (ret && ret != -ENOENT)
break;
continue;
}
+ if (!target.bi_nlink &&
+ !(target.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) &&
+ (target.bi_dir != k.k->p.inode ||
+ target.bi_dir_offset != k.k->p.offset) &&
+ (fsck_err_on(c->sb.version >= bcachefs_metadata_version_inode_backpointers, c,
+ "inode %llu has wrong backpointer:\n"
+ "got %llu:%llu\n"
+ "should be %llu:%llu",
+ d_inum,
+ target.bi_dir,
+ target.bi_dir_offset,
+ k.k->p.inode,
+ k.k->p.offset) ||
+ c->opts.version_upgrade)) {
+ struct bkey_inode_buf p;
+
+ target.bi_dir = k.k->p.inode;
+ target.bi_dir_offset = k.k->p.offset;
+ bch2_trans_unlock(&trans);
+
+ bch2_inode_pack(c, &p, &target);
+
+ ret = bch2_btree_insert(c, BTREE_ID_inodes,
+ &p.inode.k_i, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW);
+ if (ret) {
+ bch_err(c, "error in fsck: error %i updating inode", ret);
+ goto err;
+ }
+ continue;
+ }
+
if (fsck_err_on(have_target &&
d.v->d_type !=
mode_to_type(target.bi_mode), c,
goto err;
}
+
+ bch2_btree_iter_advance(iter);
}
hash_stop_chain(&trans, &h);
if (ret == -EINTR)
goto retry;
+ bch2_trans_iter_put(&trans, h.chain);
+ bch2_trans_iter_put(&trans, iter);
return bch2_trans_exit(&trans) ?: ret;
}
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS,
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs,
POS(BCACHEFS_ROOT_INO, 0), 0);
retry:
- for_each_btree_key_continue(iter, 0, k, ret) {
+ while ((k = bch2_btree_iter_peek(iter)).k &&
+ !(ret = bkey_err(k))) {
ret = walk_inode(&trans, &w, k.k->p.inode);
if (ret)
break;
k.k->p.inode)) {
ret = bch2_btree_delete_at(&trans, iter, 0);
if (ret)
- goto err;
+ break;
continue;
}
ret = hash_check_key(&trans, bch2_xattr_hash_desc,
&h, iter, k);
if (ret)
- goto fsck_err;
+ break;
+
+ bch2_btree_iter_advance(iter);
}
-err:
fsck_err:
if (ret == -EINTR)
goto retry;
+
+ bch2_trans_iter_put(&trans, h.chain);
+ bch2_trans_iter_put(&trans, iter);
return bch2_trans_exit(&trans) ?: ret;
}
bch_verbose(c, "checking root directory");
- ret = bch2_inode_find_by_inum(c, BCACHEFS_ROOT_INO, root_inode);
+ ret = bch2_trans_do(c, NULL, NULL, 0,
+ __bch2_inode_find_by_inum_trans(&trans, BCACHEFS_ROOT_INO,
+ root_inode, 0));
if (ret && ret != -ENOENT)
return ret;
0, NULL);
root_inode->bi_inum = BCACHEFS_ROOT_INO;
- bch2_inode_pack(&packed, root_inode);
+ bch2_inode_pack(c, &packed, root_inode);
- return bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i,
+ return bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i,
NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW);
goto create_lostfound;
}
- ret = bch2_inode_find_by_inum(c, inum, lostfound_inode);
+ ret = bch2_trans_do(c, NULL, NULL, 0,
+ __bch2_inode_find_by_inum_trans(&trans, inum, lostfound_inode, 0));
if (ret && ret != -ENOENT)
return ret;
return ret;
}
-struct inode_bitmap {
- unsigned long *bits;
- size_t size;
-};
+typedef GENRADIX(unsigned long) inode_bitmap;
-static inline bool inode_bitmap_test(struct inode_bitmap *b, size_t nr)
+static inline bool inode_bitmap_test(inode_bitmap *b, size_t nr)
{
- return nr < b->size ? test_bit(nr, b->bits) : false;
+ unsigned long *w = genradix_ptr(b, nr / BITS_PER_LONG);
+ return w ? test_bit(nr & (BITS_PER_LONG - 1), w) : false;
}
-static inline int inode_bitmap_set(struct inode_bitmap *b, size_t nr)
+static inline int inode_bitmap_set(inode_bitmap *b, size_t nr)
{
- if (nr >= b->size) {
- size_t new_size = max_t(size_t, max_t(size_t,
- PAGE_SIZE * 8,
- b->size * 2),
- nr + 1);
- void *n;
-
- new_size = roundup_pow_of_two(new_size);
- n = krealloc(b->bits, new_size / 8, GFP_KERNEL|__GFP_ZERO);
- if (!n) {
- return -ENOMEM;
- }
+ unsigned long *w = genradix_ptr_alloc(b, nr / BITS_PER_LONG, GFP_KERNEL);
- b->bits = n;
- b->size = new_size;
- }
+ if (!w)
+ return -ENOMEM;
- __set_bit(nr, b->bits);
+ *w |= 1UL << (nr & (BITS_PER_LONG - 1));
return 0;
}
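The genradix-backed bitmap stores bit nr in word nr / BITS_PER_LONG at bit position nr % BITS_PER_LONG, so only the words that are actually touched get allocated. A minimal userspace sketch of the same indexing (illustrative only, not part of the patch):

	#include <limits.h>
	#include <stdbool.h>
	#include <stddef.h>

	#define WORD_BITS (sizeof(unsigned long) * CHAR_BIT)

	/* test bit 'nr' in a flat array of 'nwords' words */
	static bool flat_bitmap_test(const unsigned long *words, size_t nwords, size_t nr)
	{
		size_t w = nr / WORD_BITS;

		return w < nwords && ((words[w] >> (nr % WORD_BITS)) & 1);
	}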
static int check_directory_structure(struct bch_fs *c,
struct bch_inode_unpacked *lostfound_inode)
{
- struct inode_bitmap dirs_done = { NULL, 0 };
+ inode_bitmap dirs_done;
struct pathbuf path = { 0, 0, NULL };
struct pathbuf_entry *e;
struct btree_trans trans;
/* DFS: */
restart_dfs:
+ genradix_init(&dirs_done);
had_unreachable = false;
ret = inode_bitmap_set(&dirs_done, BCACHEFS_ROOT_INO);
if (e->offset == U64_MAX)
goto up;
- for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS,
+ for_each_btree_key(&trans, iter, BTREE_ID_dirents,
POS(e->inum, e->offset + 1), 0, k, ret) {
if (k.k->p.inode != e->inum)
break;
path.nr--;
}
- iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS_MIN, 0);
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS_MIN, 0);
retry:
for_each_btree_key_continue(iter, 0, k, ret) {
if (k.k->type != KEY_TYPE_inode)
if (had_unreachable) {
bch_info(c, "reattached unreachable directories, restarting pass to check for loops");
- kfree(dirs_done.bits);
+ genradix_free(&dirs_done);
kfree(path.entries);
memset(&dirs_done, 0, sizeof(dirs_done));
memset(&path, 0, sizeof(path));
err:
fsck_err:
ret = bch2_trans_exit(&trans) ?: ret;
- kfree(dirs_done.bits);
+ genradix_free(&dirs_done);
kfree(path.entries);
return ret;
}
if (inum < range_start || inum >= *range_end)
return;
+ if (inum - range_start >= SIZE_MAX / sizeof(struct nlink)) {
+ *range_end = inum;
+ return;
+ }
+
link = genradix_ptr_alloc(links, inum - range_start, GFP_KERNEL);
if (!link) {
bch_verbose(c, "allocation failed during fsck - will need another pass");
inc_link(c, links, range_start, range_end, BCACHEFS_ROOT_INO, false);
- for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, 0, k, ret) {
+ for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN, 0, k, ret) {
switch (k.k->type) {
case KEY_TYPE_dirent:
d = bkey_s_c_to_dirent(k);
bch2_trans_cond_resched(&trans);
}
+ bch2_trans_iter_put(&trans, iter);
+
ret = bch2_trans_exit(&trans) ?: ret;
if (ret)
bch_err(c, "error in fsck: btree error %i while walking dirents", ret);
bch2_fs_lazy_rw(c);
- ret = bch2_inode_rm(c, u.bi_inum);
+ ret = bch2_inode_rm(c, u.bi_inum, false);
if (ret)
bch_err(c, "error in fsck: error %i while deleting inode", ret);
return ret;
* XXX: need to truncate partial blocks too here - or ideally
* just switch units to bytes and that issue goes away
*/
-
- ret = bch2_inode_truncate(c, u.bi_inum, u.bi_size);
+ ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
+ POS(u.bi_inum, round_up(u.bi_size, block_bytes(c))),
+ POS(u.bi_inum, U64_MAX),
+ NULL);
if (ret) {
bch_err(c, "error in fsck: error %i truncating inode", ret);
return ret;
do_update = true;
}
+ if (!S_ISDIR(u.bi_mode) &&
+ u.bi_nlink &&
+ !(u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) &&
+ (fsck_err_on(c->sb.version >= bcachefs_metadata_version_inode_backpointers, c,
+ "inode missing BCH_INODE_BACKPTR_UNTRUSTED flags") ||
+ c->opts.version_upgrade)) {
+ u.bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED;
+ do_update = true;
+ }
+
if (do_update) {
struct bkey_inode_buf p;
- bch2_inode_pack(&p, &u);
+ bch2_inode_pack(c, &p, &u);
+ p.inode.k.p = iter->pos;
ret = __bch2_trans_do(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES,
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes,
POS(0, range_start), 0);
nlinks_iter = genradix_iter_init(links, 0);
while ((k = bch2_btree_iter_peek(iter)).k &&
- !(ret2 = bkey_err(k))) {
+ !(ret2 = bkey_err(k)) &&
+ iter->pos.offset < range_end) {
peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links);
if (!link && (!k.k || iter->pos.offset >= range_end))
break;
nlinks_pos = range_start + nlinks_iter.pos;
- if (iter->pos.offset > nlinks_pos) {
+
+ if (link && nlinks_pos < iter->pos.offset) {
/* Should have been caught by dirents pass: */
- need_fsck_err_on(link && link->count, c,
+ need_fsck_err_on(link->count, c,
"missing inode %llu (nlink %u)",
nlinks_pos, link->count);
genradix_iter_advance(&nlinks_iter, links);
goto peek_nlinks;
}
- if (iter->pos.offset < nlinks_pos || !link)
+ if (!link || nlinks_pos > iter->pos.offset)
link = &zero_links;
if (k.k && k.k->type == KEY_TYPE_inode) {
if (nlinks_pos == iter->pos.offset)
genradix_iter_advance(&nlinks_iter, links);
- bch2_btree_iter_next(iter);
+ bch2_btree_iter_advance(iter);
bch2_trans_cond_resched(&trans);
}
fsck_err:
+ bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
if (ret2)
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, 0, k, ret) {
+ for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, 0, k, ret) {
if (k.k->type != KEY_TYPE_inode)
continue;
BCH_INODE_I_SECTORS_DIRTY|
BCH_INODE_UNLINKED)) {
ret = check_inode(&trans, NULL, iter, inode, NULL);
- BUG_ON(ret == -EINTR);
if (ret)
break;
}
}
+ bch2_trans_iter_put(&trans, iter);
+
BUG_ON(ret == -EINTR);
return bch2_trans_exit(&trans) ?: ret;
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "btree_key_cache.h"
#include "bkey_methods.h"
#include "btree_update.h"
#include "error.h"
#include "extents.h"
#include "inode.h"
#include "str_hash.h"
+#include "varint.h"
#include <linux/random.h>
return bytes;
}
-void bch2_inode_pack(struct bkey_inode_buf *packed,
- const struct bch_inode_unpacked *inode)
+static noinline void bch2_inode_pack_v1(struct bkey_inode_buf *packed,
+ const struct bch_inode_unpacked *inode)
{
- u8 *out = packed->inode.v.fields;
+ struct bkey_i_inode *k = &packed->inode;
+ u8 *out = k->v.fields;
u8 *end = (void *) &packed[1];
u8 *last_nonzero_field = out;
unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
unsigned bytes;
- bkey_inode_init(&packed->inode.k_i);
- packed->inode.k.p.offset = inode->bi_inum;
- packed->inode.v.bi_hash_seed = inode->bi_hash_seed;
- packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags);
- packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode);
-
-#define x(_name, _bits) \
+#define x(_name, _bits) \
out += inode_encode_field(out, end, 0, inode->_name); \
nr_fields++; \
\
set_bkey_val_bytes(&packed->inode.k, bytes);
memset_u64s_tail(&packed->inode.v, 0, bytes);
- SET_INODE_NR_FIELDS(&packed->inode.v, nr_fields);
+ SET_INODE_NR_FIELDS(&k->v, nr_fields);
+}
+
+static void bch2_inode_pack_v2(struct bkey_inode_buf *packed,
+ const struct bch_inode_unpacked *inode)
+{
+ struct bkey_i_inode *k = &packed->inode;
+ u8 *out = k->v.fields;
+ u8 *end = (void *) &packed[1];
+ u8 *last_nonzero_field = out;
+ unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
+ unsigned bytes;
+ int ret;
+
+#define x(_name, _bits) \
+ nr_fields++; \
+ \
+ if (inode->_name) { \
+ ret = bch2_varint_encode(out, inode->_name); \
+ out += ret; \
+ \
+ if (_bits > 64) \
+ *out++ = 0; \
+ \
+ last_nonzero_field = out; \
+ last_nonzero_fieldnr = nr_fields; \
+ } else { \
+ *out++ = 0; \
+ \
+ if (_bits > 64) \
+ *out++ = 0; \
+ }
+
+ BCH_INODE_FIELDS()
+#undef x
+ BUG_ON(out > end);
+
+ out = last_nonzero_field;
+ nr_fields = last_nonzero_fieldnr;
+
+ bytes = out - (u8 *) &packed->inode.v;
+ set_bkey_val_bytes(&packed->inode.k, bytes);
+ memset_u64s_tail(&packed->inode.v, 0, bytes);
+
+ SET_INODE_NR_FIELDS(&k->v, nr_fields);
+}
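With the v1/v2 split, every v2 field goes through the bch2_varint_encode()/bch2_varint_decode() helpers, whose calling convention is visible above: bytes written or consumed as the return value. A hedged round-trip sketch relying only on that convention (buffer size is a generous assumption):

	/* illustrative only: encode one u64 field and decode it back */
	u8 buf[16];
	u64 in = 4096, out;
	int len = bch2_varint_encode(buf, in);		/* bytes written */

	BUG_ON(bch2_varint_decode(buf, buf + len, &out) != len);
	BUG_ON(out != in);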
+
+void bch2_inode_pack(struct bch_fs *c,
+ struct bkey_inode_buf *packed,
+ const struct bch_inode_unpacked *inode)
+{
+ bkey_inode_init(&packed->inode.k_i);
+ packed->inode.k.p.offset = inode->bi_inum;
+ packed->inode.v.bi_hash_seed = inode->bi_hash_seed;
+ packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags);
+ packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode);
+
+ if (c->sb.features & (1ULL << BCH_FEATURE_new_varint)) {
+ SET_INODE_NEW_VARINT(&packed->inode.v, true);
+ bch2_inode_pack_v2(packed, inode);
+ } else {
+ bch2_inode_pack_v1(packed, inode);
+ }
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
struct bch_inode_unpacked unpacked;
BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed);
BUG_ON(unpacked.bi_mode != inode->bi_mode);
-#define x(_name, _bits) BUG_ON(unpacked._name != inode->_name);
+#define x(_name, _bits) if (unpacked._name != inode->_name) \
+ panic("unpacked %llu should be %llu", \
+ (u64) unpacked._name, (u64) inode->_name);
BCH_INODE_FIELDS()
#undef x
}
}
-int bch2_inode_unpack(struct bkey_s_c_inode inode,
- struct bch_inode_unpacked *unpacked)
+static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
+ struct bch_inode_unpacked *unpacked)
{
const u8 *in = inode.v->fields;
- const u8 *end = (void *) inode.v + bkey_val_bytes(inode.k);
+ const u8 *end = bkey_val_end(inode);
u64 field[2];
unsigned fieldnr = 0, field_bits;
int ret;
- unpacked->bi_inum = inode.k->p.offset;
- unpacked->bi_hash_seed = inode.v->bi_hash_seed;
- unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags);
- unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode);
-
#define x(_name, _bits) \
if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \
memset(&unpacked->_name, 0, \
#undef x
/* XXX: signal if there were more fields than expected? */
+ return 0;
+}
+
+static int bch2_inode_unpack_v2(struct bkey_s_c_inode inode,
+ struct bch_inode_unpacked *unpacked)
+{
+ const u8 *in = inode.v->fields;
+ const u8 *end = bkey_val_end(inode);
+ unsigned fieldnr = 0;
+ int ret;
+ u64 v[2];
+
+#define x(_name, _bits) \
+ if (fieldnr < INODE_NR_FIELDS(inode.v)) { \
+ ret = bch2_varint_decode(in, end, &v[0]); \
+ if (ret < 0) \
+ return ret; \
+ in += ret; \
+ \
+ if (_bits > 64) { \
+ ret = bch2_varint_decode(in, end, &v[1]); \
+ if (ret < 0) \
+ return ret; \
+ in += ret; \
+ } else { \
+ v[1] = 0; \
+ } \
+ } else { \
+ v[0] = v[1] = 0; \
+ } \
+ \
+ unpacked->_name = v[0]; \
+ if (v[1] || v[0] != unpacked->_name) \
+ return -1; \
+ fieldnr++;
+
+ BCH_INODE_FIELDS()
+#undef x
+
+ /* XXX: signal if there were more fields than expected? */
+ return 0;
+}
+
+int bch2_inode_unpack(struct bkey_s_c_inode inode,
+ struct bch_inode_unpacked *unpacked)
+{
+ unpacked->bi_inum = inode.k->p.offset;
+ unpacked->bi_hash_seed = inode.v->bi_hash_seed;
+ unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags);
+ unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode);
+
+ if (INODE_NEW_VARINT(inode.v)) {
+ return bch2_inode_unpack_v2(inode, unpacked);
+ } else {
+ return bch2_inode_unpack_v1(inode, unpacked);
+ }
return 0;
}
struct bkey_s_c k;
int ret;
- iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(0, inum),
- BTREE_ITER_SLOTS|flags);
- if (IS_ERR(iter))
- return iter;
-
- k = bch2_btree_iter_peek_slot(iter);
+ iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, inum),
+ BTREE_ITER_CACHED|flags);
+ k = bch2_btree_iter_peek_cached(iter);
ret = bkey_err(k);
if (ret)
goto err;
if (IS_ERR(inode_p))
return PTR_ERR(inode_p);
- bch2_inode_pack(inode_p, inode);
+ bch2_inode_pack(trans->c, inode_p, inode);
+ inode_p->inode.k.p.snapshot = iter->snapshot;
bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
return 0;
}
return;
}
+ pr_buf(out, "mode: %o ", unpacked.bi_mode);
+
#define x(_name, _bits) \
pr_buf(out, #_name ": %llu ", (u64) unpacked._name);
BCH_INODE_FIELDS()
}
}
-int bch2_inode_create(struct btree_trans *trans,
- struct bch_inode_unpacked *inode_u,
- u64 min, u64 max, u64 *hint)
+struct btree_iter *bch2_inode_create(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode_u,
+ u32 snapshot)
{
- struct bkey_inode_buf *inode_p;
+ struct bch_fs *c = trans->c;
struct btree_iter *iter = NULL;
struct bkey_s_c k;
- u64 start;
+ u64 min, max, start, pos, *hint;
int ret;
- if (!max)
- max = ULLONG_MAX;
+ u64 cpu = raw_smp_processor_id();
+ unsigned bits = (c->opts.inodes_32bit
+ ? 31 : 63) - c->inode_shard_bits;
+
+ min = (cpu << bits);
+ max = (cpu << bits) | ~(ULLONG_MAX << bits);
- if (trans->c->opts.inodes_32bit)
- max = min_t(u64, max, U32_MAX);
+ min = max_t(u64, min, BLOCKDEV_INODE_MAX);
+ hint = c->unused_inode_hints + cpu;
start = READ_ONCE(*hint);
if (start >= max || start < min)
start = min;
- inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
- if (IS_ERR(inode_p))
- return PTR_ERR(inode_p);
+ pos = start;
+ iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, pos),
+ BTREE_ITER_ALL_SNAPSHOTS|
+ BTREE_ITER_INTENT);
again:
- for_each_btree_key(trans, iter, BTREE_ID_INODES, POS(0, start),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
- if (bkey_cmp(iter->pos, POS(0, max)) > 0)
- break;
+ while ((k = bch2_btree_iter_peek(iter)).k &&
+ !(ret = bkey_err(k)) &&
+ bkey_cmp(k.k->p, POS(0, max)) < 0) {
+ while (pos < iter->pos.offset) {
+ if (!bch2_btree_key_cache_find(c, BTREE_ID_inodes, POS(0, pos)))
+ goto found_slot;
+
+ pos++;
+ }
- if (k.k->type != KEY_TYPE_inode)
- goto found_slot;
+ if (k.k->p.snapshot == snapshot &&
+ k.k->type != KEY_TYPE_inode &&
+ !bch2_btree_key_cache_find(c, BTREE_ID_inodes, SPOS(0, pos, snapshot))) {
+ bch2_btree_iter_next(iter);
+ continue;
+ }
+
+ /*
+ * We don't need to iterate over keys in every snapshot once
+ * we've found just one:
+ */
+ pos = iter->pos.offset + 1;
+ bch2_btree_iter_set_pos(iter, POS(0, pos));
}
- bch2_trans_iter_put(trans, iter);
+ while (!ret && pos < max) {
+ if (!bch2_btree_key_cache_find(c, BTREE_ID_inodes, POS(0, pos)))
+ goto found_slot;
- if (ret)
- return ret;
+ pos++;
+ }
- if (start != min) {
- /* Retry from start */
- start = min;
- goto again;
+ if (!ret && start == min)
+ ret = -ENOSPC;
+
+ if (ret) {
+ bch2_trans_iter_put(trans, iter);
+ return ERR_PTR(ret);
}
- return -ENOSPC;
+ /* Retry from start */
+ pos = start = min;
+ bch2_btree_iter_set_pos(iter, POS(0, pos));
+ goto again;
found_slot:
+ bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot));
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret) {
+ bch2_trans_iter_put(trans, iter);
+ return ERR_PTR(ret);
+ }
+
+ /* We may have raced while the iterator wasn't pointing at pos: */
+ if (k.k->type == KEY_TYPE_inode ||
+ bch2_btree_key_cache_find(c, BTREE_ID_inodes, k.k->p))
+ goto again;
+
*hint = k.k->p.offset;
inode_u->bi_inum = k.k->p.offset;
inode_u->bi_generation = bkey_generation(k);
-
- bch2_inode_pack(inode_p, inode_u);
- bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
- bch2_trans_iter_put(trans, iter);
- return 0;
+ return iter;
}
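The new allocator gives each CPU its own slice of the inode number space: bits is 31 or 63 minus c->inode_shard_bits, and cpu << bits selects the slice. A worked example, assuming the 32-bit-inodes option and a hypothetical inode_shard_bits of 3:

	/* bits = 31 - 3 = 28
	 * cpu 2: min = 2 << 28                            = 0x20000000
	 *        max = (2 << 28) | ~(ULLONG_MAX << 28)    = 0x2fffffff
	 * so CPU 2 only hands out inode numbers in that range,
	 * with min additionally clamped up to BLOCKDEV_INODE_MAX.
	 */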
-int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
+int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter *iter = NULL;
struct bkey_i_inode_generation delete;
struct bpos start = POS(inode_nr, 0);
struct bpos end = POS(inode_nr + 1, 0);
+ struct bch_inode_unpacked inode_u;
+ struct bkey_s_c k;
int ret;
+ bch2_trans_init(&trans, c, 0, 0);
+
/*
* If this was a directory, there shouldn't be any real dirents left -
* but there could be whiteouts (from hash collisions) that we should
* XXX: the dirent could ideally would delete whiteouts when they're no
* longer needed
*/
- ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS,
- start, end, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_XATTRS,
- start, end, NULL) ?:
- bch2_btree_delete_range(c, BTREE_ID_DIRENTS,
- start, end, NULL);
+ ret = bch2_btree_delete_range_trans(&trans, BTREE_ID_extents,
+ start, end, NULL) ?:
+ bch2_btree_delete_range_trans(&trans, BTREE_ID_xattrs,
+ start, end, NULL) ?:
+ bch2_btree_delete_range_trans(&trans, BTREE_ID_dirents,
+ start, end, NULL);
if (ret)
- return ret;
-
- bch2_trans_init(&trans, c, 0, 0);
-
- iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- do {
- struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
- u32 bi_generation = 0;
-
- ret = bkey_err(k);
- if (ret)
- break;
+ goto err;
+retry:
+ bch2_trans_begin(&trans);
+
+ if (cached) {
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS(0, inode_nr),
+ BTREE_ITER_CACHED|BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_cached(iter);
+ } else {
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS(0, inode_nr),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(iter);
+ }
- bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c,
- "inode %llu not found when deleting",
- inode_nr);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
- switch (k.k->type) {
- case KEY_TYPE_inode: {
- struct bch_inode_unpacked inode_u;
+ if (k.k->type != KEY_TYPE_inode) {
+ bch2_fs_inconsistent(trans.c,
+ "inode %llu not found when deleting",
+ inode_nr);
+ ret = -EIO;
+ goto err;
+ }
- if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u))
- bi_generation = inode_u.bi_generation + 1;
- break;
- }
- case KEY_TYPE_inode_generation: {
- struct bkey_s_c_inode_generation g =
- bkey_s_c_to_inode_generation(k);
- bi_generation = le32_to_cpu(g.v->bi_generation);
- break;
- }
- }
+ bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u);
- if (!bi_generation) {
- bkey_init(&delete.k);
- delete.k.p.offset = inode_nr;
- } else {
- bkey_inode_generation_init(&delete.k_i);
- delete.k.p.offset = inode_nr;
- delete.v.bi_generation = cpu_to_le32(bi_generation);
- }
+ bkey_inode_generation_init(&delete.k_i);
+ delete.k.p = iter->pos;
+ delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
- bch2_trans_update(&trans, iter, &delete.k_i, 0);
+ bch2_trans_update(&trans, iter, &delete.k_i, 0);
- ret = bch2_trans_commit(&trans, NULL, NULL,
- BTREE_INSERT_NOFAIL);
- } while (ret == -EINTR);
+ ret = bch2_trans_commit(&trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL);
+err:
+ bch2_trans_iter_put(&trans, iter);
+ if (ret == -EINTR)
+ goto retry;
bch2_trans_exit(&trans);
return ret;
}
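bch2_inode_rm() now follows the usual transaction-restart shape rather than a do/while on -EINTR; reduced to its skeleton, using only the calls already shown above:

	retry:
		bch2_trans_begin(&trans);
		/* look up the inode, queue the inode_generation key ... */
		ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL);
	err:
		bch2_trans_iter_put(&trans, iter);
		if (ret == -EINTR)
			goto retry;
		bch2_trans_exit(&trans);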
-int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr,
- struct bch_inode_unpacked *inode)
+int __bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr,
+ struct bch_inode_unpacked *inode,
+ unsigned flags)
{
struct btree_iter *iter;
struct bkey_s_c k;
int ret;
- iter = bch2_trans_get_iter(trans, BTREE_ID_INODES,
- POS(0, inode_nr), BTREE_ITER_SLOTS);
- if (IS_ERR(iter))
- return PTR_ERR(iter);
-
- k = bch2_btree_iter_peek_slot(iter);
+ iter = bch2_trans_get_iter(trans, BTREE_ID_inodes,
+ POS(0, inode_nr), flags);
+ k = (flags & BTREE_ITER_TYPE) == BTREE_ITER_CACHED
+ ? bch2_btree_iter_peek_cached(iter)
+ : bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
if (ret)
goto err;
return ret;
}
+int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr,
+ struct bch_inode_unpacked *inode)
+{
+ return __bch2_inode_find_by_inum_trans(trans, inode_nr,
+ inode, BTREE_ITER_CACHED);
+

+}
+
int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr,
struct bch_inode_unpacked *inode)
{
return bch2_trans_do(c, NULL, NULL, 0,
bch2_inode_find_by_inum_trans(&trans, inode_nr, inode));
}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_inode_pack_test(void)
-{
- struct bch_inode_unpacked *u, test_inodes[] = {
- {
- .bi_atime = U64_MAX,
- .bi_ctime = U64_MAX,
- .bi_mtime = U64_MAX,
- .bi_otime = U64_MAX,
- .bi_size = U64_MAX,
- .bi_sectors = U64_MAX,
- .bi_uid = U32_MAX,
- .bi_gid = U32_MAX,
- .bi_nlink = U32_MAX,
- .bi_generation = U32_MAX,
- .bi_dev = U32_MAX,
- },
- };
-
- for (u = test_inodes;
- u < test_inodes + ARRAY_SIZE(test_inodes);
- u++) {
- struct bkey_inode_buf p;
-
- bch2_inode_pack(&p, u);
- }
-}
-#endif
.val_to_text = bch2_inode_generation_to_text, \
}
+#if 0
+typedef struct {
+ u64 lo;
+ u32 hi;
+} __packed __aligned(4) u96;
+#endif
+typedef u64 u96;
+
struct bch_inode_unpacked {
u64 bi_inum;
__le64 bi_hash_seed;
#undef x
} __attribute__((packed, aligned(8)));
-void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *);
+void bch2_inode_pack(struct bch_fs *, struct bkey_inode_buf *,
+ const struct bch_inode_unpacked *);
int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *);
struct btree_iter *bch2_inode_peek(struct btree_trans *,
uid_t, gid_t, umode_t, dev_t,
struct bch_inode_unpacked *);
-int bch2_inode_create(struct btree_trans *,
- struct bch_inode_unpacked *,
- u64, u64, u64 *);
+struct btree_iter *bch2_inode_create(struct btree_trans *,
+ struct bch_inode_unpacked *, u32);
-int bch2_inode_rm(struct bch_fs *, u64);
+int bch2_inode_rm(struct bch_fs *, u64, bool);
+int __bch2_inode_find_by_inum_trans(struct btree_trans *, u64,
+ struct bch_inode_unpacked *, unsigned);
int bch2_inode_find_by_inum_trans(struct btree_trans *, u64,
struct bch_inode_unpacked *);
int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *);
}
}
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_inode_pack_test(void);
-#else
-static inline void bch2_inode_pack_test(void) {}
-#endif
-
#endif /* _BCACHEFS_INODE_H */
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
#include "bset.h"
#include "btree_update.h"
#include "buckets.h"
while (size) {
struct page *page = __bio_alloc_page_pool(c, &using_mempool);
- unsigned len = min(PAGE_SIZE, size);
+ unsigned len = min_t(size_t, PAGE_SIZE, size);
BUG_ON(!bio_add_page(bio, page, len, 0));
size -= len;
/* Extent update path: */
-static int sum_sector_overwrites(struct btree_trans *trans,
- struct btree_iter *extent_iter,
- struct bkey_i *new,
- bool may_allocate,
- bool *maybe_extending,
- s64 *delta)
+int bch2_sum_sector_overwrites(struct btree_trans *trans,
+ struct btree_iter *extent_iter,
+ struct bkey_i *new,
+ bool *maybe_extending,
+ bool *should_check_enospc,
+ s64 *i_sectors_delta,
+ s64 *disk_sectors_delta)
{
+ struct bch_fs *c = trans->c;
struct btree_iter *iter;
struct bkey_s_c old;
+ unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new));
+ bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new));
int ret = 0;
- *maybe_extending = true;
- *delta = 0;
+ *maybe_extending = true;
+ *should_check_enospc = false;
+ *i_sectors_delta = 0;
+ *disk_sectors_delta = 0;
iter = bch2_trans_copy_iter(trans, extent_iter);
- if (IS_ERR(iter))
- return PTR_ERR(iter);
for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) {
- if (!may_allocate &&
- bch2_bkey_nr_ptrs_fully_allocated(old) <
- bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new))) {
- ret = -ENOSPC;
- break;
- }
+ s64 sectors = min(new->k.p.offset, old.k->p.offset) -
+ max(bkey_start_offset(&new->k),
+ bkey_start_offset(old.k));
- *delta += (min(new->k.p.offset,
- old.k->p.offset) -
- max(bkey_start_offset(&new->k),
- bkey_start_offset(old.k))) *
+ *i_sectors_delta += sectors *
(bkey_extent_is_allocation(&new->k) -
bkey_extent_is_allocation(old.k));
+ *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new));
+ *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot
+ ? sectors * bch2_bkey_nr_ptrs_fully_allocated(old)
+ : 0;
+
+ if (!*should_check_enospc &&
+ (new_replicas > bch2_bkey_replicas(c, old) ||
+ (!new_compressed && bch2_bkey_sectors_compressed(old))))
+ *should_check_enospc = true;
+
if (bkey_cmp(old.k->p, new->k.p) >= 0) {
/*
* Check if there's already data above where we're
struct disk_reservation *disk_res,
u64 *journal_seq,
u64 new_i_size,
- s64 *i_sectors_delta)
+ s64 *i_sectors_delta_total)
{
/* this must live until after bch2_trans_commit(): */
struct bkey_inode_buf inode_p;
- bool extending = false;
- s64 delta = 0;
+ bool extending = false, should_check_enospc;
+ s64 i_sectors_delta = 0, disk_sectors_delta = 0;
int ret;
ret = bch2_extent_trim_atomic(k, iter);
if (ret)
return ret;
- ret = sum_sector_overwrites(trans, iter, k,
- disk_res && disk_res->sectors != 0,
- &extending, &delta);
+ ret = bch2_sum_sector_overwrites(trans, iter, k,
+ &extending,
+ &should_check_enospc,
+ &i_sectors_delta,
+ &disk_sectors_delta);
if (ret)
return ret;
+ if (disk_res &&
+ disk_sectors_delta > (s64) disk_res->sectors) {
+ ret = bch2_disk_reservation_add(trans->c, disk_res,
+ disk_sectors_delta - disk_res->sectors,
+ !should_check_enospc
+ ? BCH_DISK_RESERVATION_NOFAIL : 0);
+ if (ret)
+ return ret;
+ }
+
new_i_size = extending
? min(k->k.p.offset << 9, new_i_size)
: 0;
- if (delta || new_i_size) {
+ if (i_sectors_delta || new_i_size) {
struct btree_iter *inode_iter;
struct bch_inode_unpacked inode_u;
else
new_i_size = 0;
- inode_u.bi_sectors += delta;
+ inode_u.bi_sectors += i_sectors_delta;
+
+ if (i_sectors_delta || new_i_size) {
+ bch2_inode_pack(trans->c, &inode_p, &inode_u);
+
+ inode_p.inode.k.p.snapshot = iter->snapshot;
- if (delta || new_i_size) {
- bch2_inode_pack(&inode_p, &inode_u);
bch2_trans_update(trans, inode_iter,
&inode_p.inode.k_i, 0);
}
ret = bch2_trans_commit(trans, disk_res, journal_seq,
BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE);
- if (!ret && i_sectors_delta)
- *i_sectors_delta += delta;
+ BTREE_INSERT_NOFAIL);
+ if (ret)
+ return ret;
- return ret;
+ if (i_sectors_delta_total)
+ *i_sectors_delta_total += i_sectors_delta;
+ return 0;
}
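The per-key accounting in bch2_sum_sector_overwrites() is plain range intersection: sectors = min(end offsets) - max(start offsets). For example, a new extent covering sectors [100, 164) overlapping an old key at [132, 200) contributes min(164, 200) - max(100, 132) = 32 sectors, which is then added to *i_sectors_delta weighted by whether each of the two keys counts as an allocation, and to *disk_sectors_delta scaled by the pointer counts.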
int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
int ret = 0;
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
POS(inum, start),
BTREE_ITER_INTENT);
ret = bch2_fpunch_at(&trans, iter, POS(inum, end),
journal_seq, i_sectors_delta);
+
+ bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
if (ret == -EINTR)
int bch2_write_index_default(struct bch_write_op *op)
{
struct bch_fs *c = op->c;
- struct bkey_on_stack sk;
+ struct bkey_buf sk;
struct keylist *keys = &op->insert_keys;
struct bkey_i *k = bch2_keylist_front(keys);
struct btree_trans trans;
struct btree_iter *iter;
int ret;
- bkey_on_stack_init(&sk);
+ bch2_bkey_buf_init(&sk);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
bkey_start_pos(&k->k),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
k = bch2_keylist_front(keys);
- bkey_on_stack_realloc(&sk, c, k->k.u64s);
+ k->k.p.snapshot = iter->snapshot;
+
+ bch2_bkey_buf_realloc(&sk, c, k->k.u64s);
bkey_copy(sk.k, k);
bch2_cut_front(iter->pos, sk.k);
bch2_keylist_pop_front(keys);
} while (!bch2_keylist_empty(keys));
+ bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
- bkey_on_stack_exit(&sk, c);
+ bch2_bkey_buf_exit(&sk, c);
return ret;
}
n->submit_time = local_clock();
n->bio.bi_iter.bi_sector = ptr->offset;
- if (!journal_flushes_device(ca))
- n->bio.bi_opf |= REQ_FUA;
-
if (likely(n->have_ioref)) {
this_cpu_add(ca->io_done->sectors[WRITE][type],
bio_sectors(&n->bio));
op->written += sectors_start - keylist_sectors(keys);
if (ret) {
- __bcache_io_error(c, "btree IO error %i", ret);
+ bch_err_inum_ratelimited(c, op->pos.inode,
+ "write error %i from btree update", ret);
op->error = ret;
}
}
struct bch_fs *c = wbio->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
- if (bch2_dev_io_err_on(bio->bi_status, ca, "data write: %s",
+ if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
+ op->pos.inode,
+ op->pos.offset - bio_sectors(bio), /* XXX definitely wrong */
+ "data write error: %s",
bch2_blk_status_to_str(bio->bi_status)))
set_bit(wbio->dev, op->failed.d);
wbio_init(bio)->put_bio = false;
if (bio_sectors(bio) & (c->opts.block_size - 1)) {
- __bcache_io_error(c, "misaligned write");
+ bch_err_inum_ratelimited(c, op->pos.inode,
+ "misaligned write");
op->error = -EIO;
goto err;
}
if (c->opts.nochanges ||
!percpu_ref_tryget(&c->writes)) {
- if (!(op->flags & BCH_WRITE_FROM_INTERNAL))
- __bcache_io_error(c, "read only");
op->error = -EROFS;
goto err;
}
promote = __promote_alloc(c,
k.k->type == KEY_TYPE_reflink_v
- ? BTREE_ID_REFLINK
- : BTREE_ID_EXTENTS,
+ ? BTREE_ID_reflink
+ : BTREE_ID_extents,
k, pos, pick, opts, sectors, rbio);
if (!promote)
return NULL;
{
struct btree_trans trans;
struct btree_iter *iter;
- struct bkey_on_stack sk;
+ struct bkey_buf sk;
struct bkey_s_c k;
int ret;
flags &= ~BCH_READ_LAST_FRAGMENT;
flags |= BCH_READ_MUST_CLONE;
- bkey_on_stack_init(&sk);
+ bch2_bkey_buf_init(&sk);
bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
- rbio->pos, BTREE_ITER_SLOTS);
+ iter = bch2_trans_get_iter(&trans, rbio->data_btree,
+ rbio->read_pos, BTREE_ITER_SLOTS);
retry:
rbio->bio.bi_status = 0;
if (bkey_err(k))
goto err;
- bkey_on_stack_reassemble(&sk, c, k);
+ bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
bch2_trans_unlock(&trans);
if (!bch2_bkey_matches_ptr(c, k,
rbio->pick.ptr,
- rbio->pos.offset -
+ rbio->data_pos.offset -
rbio->pick.crc.offset)) {
/* extent we wanted to read no longer exists: */
rbio->hole = true;
goto out;
}
- ret = __bch2_read_extent(&trans, rbio, bvec_iter, k, 0, failed, flags);
+ ret = __bch2_read_extent(&trans, rbio, bvec_iter,
+ rbio->read_pos,
+ rbio->data_btree,
+ k, 0, failed, flags);
if (ret == READ_RETRY)
goto retry;
if (ret)
goto err;
out:
bch2_rbio_done(rbio);
+ bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
- bkey_on_stack_exit(&sk, c);
+ bch2_bkey_buf_exit(&sk, c);
return;
err:
rbio->bio.bi_status = BLK_STS_IOERR;
goto out;
}
-static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio,
- struct bvec_iter bvec_iter, u64 inode,
- struct bch_io_failures *failed, unsigned flags)
-{
- struct btree_trans trans;
- struct btree_iter *iter;
- struct bkey_on_stack sk;
- struct bkey_s_c k;
- int ret;
-
- flags &= ~BCH_READ_LAST_FRAGMENT;
- flags |= BCH_READ_MUST_CLONE;
-
- bkey_on_stack_init(&sk);
- bch2_trans_init(&trans, c, 0, 0);
-retry:
- bch2_trans_begin(&trans);
-
- for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
- POS(inode, bvec_iter.bi_sector),
- BTREE_ITER_SLOTS, k, ret) {
- unsigned bytes, sectors, offset_into_extent;
-
- bkey_on_stack_reassemble(&sk, c, k);
-
- offset_into_extent = iter->pos.offset -
- bkey_start_offset(k.k);
- sectors = k.k->size - offset_into_extent;
-
- ret = bch2_read_indirect_extent(&trans,
- &offset_into_extent, &sk);
- if (ret)
- break;
-
- k = bkey_i_to_s_c(sk.k);
-
- sectors = min(sectors, k.k->size - offset_into_extent);
-
- bch2_trans_unlock(&trans);
-
- bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
- swap(bvec_iter.bi_size, bytes);
-
- ret = __bch2_read_extent(&trans, rbio, bvec_iter, k,
- offset_into_extent, failed, flags);
- switch (ret) {
- case READ_RETRY:
- goto retry;
- case READ_ERR:
- goto err;
- };
-
- if (bytes == bvec_iter.bi_size)
- goto out;
-
- swap(bvec_iter.bi_size, bytes);
- bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
- }
-
- if (ret == -EINTR)
- goto retry;
- /*
- * If we get here, it better have been because there was an error
- * reading a btree node
- */
- BUG_ON(!ret);
- __bcache_io_error(c, "btree IO error: %i", ret);
-err:
- rbio->bio.bi_status = BLK_STS_IOERR;
-out:
- bch2_trans_exit(&trans);
- bkey_on_stack_exit(&sk, c);
- bch2_rbio_done(rbio);
-}
-
static void bch2_rbio_retry(struct work_struct *work)
{
struct bch_read_bio *rbio =
struct bch_fs *c = rbio->c;
struct bvec_iter iter = rbio->bvec_iter;
unsigned flags = rbio->flags;
- u64 inode = rbio->pos.inode;
+ u64 inode = rbio->read_pos.inode;
struct bch_io_failures failed = { .nr = 0 };
trace_read_retry(&rbio->bio);
flags |= BCH_READ_IN_RETRY;
flags &= ~BCH_READ_MAY_PROMOTE;
- if (flags & BCH_READ_NODECODE)
+ if (flags & BCH_READ_NODECODE) {
bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags);
- else
- bch2_read_retry(c, rbio, iter, inode, &failed, flags);
+ } else {
+ flags &= ~BCH_READ_LAST_FRAGMENT;
+ flags |= BCH_READ_MUST_CLONE;
+
+ __bch2_read(c, rbio, iter, inode, &failed, flags);
+ }
}
static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
struct bch_read_bio *rbio)
{
struct bch_fs *c = rbio->c;
- u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset;
+ u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
struct bch_extent_crc_unpacked new_crc;
struct btree_iter *iter = NULL;
struct bkey_i *new;
if (crc_is_compressed(rbio->pick.crc))
return 0;
- iter = bch2_trans_get_iter(trans, BTREE_ID_EXTENTS, rbio->pos,
+ iter = bch2_trans_get_iter(trans, rbio->data_btree, rbio->data_pos,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- if ((ret = PTR_ERR_OR_ZERO(iter)))
- goto out;
-
k = bch2_btree_iter_peek_slot(iter);
if ((ret = bkey_err(k)))
goto out;
- /*
- * going to be temporarily appending another checksum entry:
- */
- new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
- BKEY_EXTENT_U64s_MAX * 8);
- if ((ret = PTR_ERR_OR_ZERO(new)))
- goto out;
-
- bkey_reassemble(new, k);
- k = bkey_i_to_s_c(new);
-
if (bversion_cmp(k.k->version, rbio->version) ||
!bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
goto out;
goto out;
}
+ /*
+ * going to be temporarily appending another checksum entry:
+ */
+ new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
+ sizeof(struct bch_extent_crc128));
+ if ((ret = PTR_ERR_OR_ZERO(new)))
+ goto out;
+
+ bkey_reassemble(new, k);
+
if (!bch2_bkey_narrow_crcs(new, new_crc))
goto out;
return;
}
- bch2_dev_io_error(ca,
- "data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)",
- rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
+ bch2_dev_inum_io_error(ca, rbio->read_pos.inode, (u64) rbio->bvec_iter.bi_sector,
+ "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %u)",
rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
csum.hi, csum.lo, crc.csum_type);
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
return;
decompression_err:
- __bcache_io_error(c, "decompression error, inode %llu offset %llu",
- rbio->pos.inode,
- (u64) rbio->bvec_iter.bi_sector);
+ bch_err_inum_ratelimited(c, rbio->read_pos.inode,
+ "decompression error");
bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
return;
}
if (!rbio->split)
rbio->bio.bi_end_io = rbio->end_io;
- if (bch2_dev_io_err_on(bio->bi_status, ca, "data read; %s",
+ if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
+ rbio->read_pos.inode,
+ rbio->read_pos.offset,
+ "data read error: %s",
bch2_blk_status_to_str(bio->bi_status))) {
bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
return;
int __bch2_read_indirect_extent(struct btree_trans *trans,
unsigned *offset_into_extent,
- struct bkey_on_stack *orig_k)
+ struct bkey_buf *orig_k)
{
struct btree_iter *iter;
struct bkey_s_c k;
reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
*offset_into_extent;
- iter = bch2_trans_get_iter(trans, BTREE_ID_REFLINK,
+ iter = bch2_trans_get_iter(trans, BTREE_ID_reflink,
POS(0, reflink_offset),
BTREE_ITER_SLOTS);
- ret = PTR_ERR_OR_ZERO(iter);
- if (ret)
- return ret;
-
k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
if (ret)
if (k.k->type != KEY_TYPE_reflink_v &&
k.k->type != KEY_TYPE_indirect_inline_data) {
- __bcache_io_error(trans->c,
+ bch_err_inum_ratelimited(trans->c, orig_k->k->k.p.inode,
"pointer to nonexistent indirect extent");
ret = -EIO;
goto err;
}
*offset_into_extent = iter->pos.offset - bkey_start_offset(k.k);
- bkey_on_stack_reassemble(orig_k, trans->c, k);
+ bch2_bkey_buf_reassemble(orig_k, trans->c, k);
err:
bch2_trans_iter_put(trans, iter);
return ret;
}
int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
- struct bvec_iter iter, struct bkey_s_c k,
+ struct bvec_iter iter, struct bpos read_pos,
+ enum btree_id data_btree, struct bkey_s_c k,
unsigned offset_into_extent,
struct bch_io_failures *failed, unsigned flags)
{
struct bch_dev *ca;
struct promote_op *promote = NULL;
bool bounce = false, read_full = false, narrow_crcs = false;
- struct bpos pos = bkey_start_pos(k.k);
+ struct bpos data_pos = bkey_start_pos(k.k);
int pick_ret;
if (bkey_extent_is_inline_data(k.k)) {
goto hole;
if (pick_ret < 0) {
- __bcache_io_error(c, "no device to read from");
+ bch_err_inum_ratelimited(c, k.k->p.inode,
+ "no device to read from");
goto err;
}
pick.crc.offset ||
offset_into_extent));
- pos.offset += offset_into_extent;
+ data_pos.offset += offset_into_extent;
pick.ptr.offset += pick.crc.offset +
offset_into_extent;
offset_into_extent = 0;
/* XXX: only initialize this if needed */
rbio->devs_have = bch2_bkey_devs(k);
rbio->pick = pick;
- rbio->pos = pos;
+ rbio->read_pos = read_pos;
+ rbio->data_btree = data_btree;
+ rbio->data_pos = data_pos;
rbio->version = k.k->version;
rbio->promote = promote;
INIT_WORK(&rbio->work, NULL);
bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
- if (pick.ptr.cached)
+ /*
+ * If it's being moved internally, we don't want to flag it as a cache
+ * hit:
+ */
+ if (pick.ptr.cached && !(flags & BCH_READ_NODECODE))
bch2_bucket_io_time_reset(trans, pick.ptr.dev,
PTR_BUCKET_NR(ca, &pick.ptr), READ);
if (!rbio->pick.idx) {
if (!rbio->have_ioref) {
- __bcache_io_error(c, "no device to read from");
+ bch_err_inum_ratelimited(c, k.k->p.inode,
+ "no device to read from");
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
goto out;
}
ret = READ_RETRY;
}
+ if (!ret)
+ goto out_read_done;
+
return ret;
}
return 0;
}
-void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
+void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
+ struct bvec_iter bvec_iter, u64 inode,
+ struct bch_io_failures *failed, unsigned flags)
{
struct btree_trans trans;
struct btree_iter *iter;
- struct bkey_on_stack sk;
+ struct bkey_buf sk;
struct bkey_s_c k;
- unsigned flags = BCH_READ_RETRY_IF_STALE|
- BCH_READ_MAY_PROMOTE|
- BCH_READ_USER_MAPPED;
int ret;
- BUG_ON(rbio->_state);
BUG_ON(flags & BCH_READ_NODECODE);
- BUG_ON(flags & BCH_READ_IN_RETRY);
- rbio->c = c;
- rbio->start_time = local_clock();
-
- bkey_on_stack_init(&sk);
+ bch2_bkey_buf_init(&sk);
bch2_trans_init(&trans, c, 0, 0);
retry:
bch2_trans_begin(&trans);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
- POS(inode, rbio->bio.bi_iter.bi_sector),
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
+ POS(inode, bvec_iter.bi_sector),
BTREE_ITER_SLOTS);
while (1) {
unsigned bytes, sectors, offset_into_extent;
+ enum btree_id data_btree = BTREE_ID_extents;
bch2_btree_iter_set_pos(iter,
- POS(inode, rbio->bio.bi_iter.bi_sector));
+ POS(inode, bvec_iter.bi_sector));
k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
if (ret)
- goto err;
+ break;
offset_into_extent = iter->pos.offset -
bkey_start_offset(k.k);
sectors = k.k->size - offset_into_extent;
- bkey_on_stack_reassemble(&sk, c, k);
+ bch2_bkey_buf_reassemble(&sk, c, k);
- ret = bch2_read_indirect_extent(&trans,
+ ret = bch2_read_indirect_extent(&trans, &data_btree,
&offset_into_extent, &sk);
if (ret)
- goto err;
+ break;
k = bkey_i_to_s_c(sk.k);
*/
bch2_trans_unlock(&trans);
- bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
- swap(rbio->bio.bi_iter.bi_size, bytes);
+ bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
+ swap(bvec_iter.bi_size, bytes);
- if (rbio->bio.bi_iter.bi_size == bytes)
+ if (bvec_iter.bi_size == bytes)
flags |= BCH_READ_LAST_FRAGMENT;
- bch2_read_extent(&trans, rbio, k, offset_into_extent, flags);
+ ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter->pos,
+ data_btree, k,
+ offset_into_extent, failed, flags);
+ if (ret)
+ break;
if (flags & BCH_READ_LAST_FRAGMENT)
break;
- swap(rbio->bio.bi_iter.bi_size, bytes);
- bio_advance(&rbio->bio, bytes);
+ swap(bvec_iter.bi_size, bytes);
+ bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
}
-out:
- bch2_trans_exit(&trans);
- bkey_on_stack_exit(&sk, c);
- return;
-err:
- if (ret == -EINTR)
+ bch2_trans_iter_put(&trans, iter);
+
+ if (ret == -EINTR || ret == READ_RETRY || ret == READ_RETRY_AVOID)
goto retry;
- bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret);
- bch2_rbio_done(rbio);
- goto out;
+ if (ret) {
+ bch_err_inum_ratelimited(c, inode,
+ "read error %i from btree lookup", ret);
+ rbio->bio.bi_status = BLK_STS_IOERR;
+ bch2_rbio_done(rbio);
+ }
+ bch2_trans_exit(&trans);
+ bch2_bkey_buf_exit(&sk, c);
}
void bch2_fs_io_exit(struct bch_fs *c)
#define _BCACHEFS_IO_H
#include "checksum.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
#include "io_types.h"
#define to_wbio(_bio) \
: op->c->wq;
}
+int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *,
+ struct bkey_i *, bool *, bool *, s64 *, s64 *);
int bch2_extent_update(struct btree_trans *, struct btree_iter *,
struct bkey_i *, struct disk_reservation *,
u64 *, u64, s64 *);
struct extent_ptr_decoded;
int __bch2_read_indirect_extent(struct btree_trans *, unsigned *,
- struct bkey_on_stack *);
+ struct bkey_buf *);
static inline int bch2_read_indirect_extent(struct btree_trans *trans,
+ enum btree_id *data_btree,
unsigned *offset_into_extent,
- struct bkey_on_stack *k)
+ struct bkey_buf *k)
{
- return k->k->k.type == KEY_TYPE_reflink_p
- ? __bch2_read_indirect_extent(trans, offset_into_extent, k)
- : 0;
+ if (k->k->k.type != KEY_TYPE_reflink_p)
+ return 0;
+
+ *data_btree = BTREE_ID_reflink;
+ return __bch2_read_indirect_extent(trans, offset_into_extent, k);
}
enum bch_read_flags {
};
int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *,
- struct bvec_iter, struct bkey_s_c, unsigned,
+ struct bvec_iter, struct bpos, enum btree_id,
+ struct bkey_s_c, unsigned,
struct bch_io_failures *, unsigned);
static inline void bch2_read_extent(struct btree_trans *trans,
- struct bch_read_bio *rbio,
- struct bkey_s_c k,
- unsigned offset_into_extent,
- unsigned flags)
+ struct bch_read_bio *rbio, struct bpos read_pos,
+ enum btree_id data_btree, struct bkey_s_c k,
+ unsigned offset_into_extent, unsigned flags)
{
- __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, k,
- offset_into_extent, NULL, flags);
+ __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos,
+ data_btree, k, offset_into_extent, NULL, flags);
}
-void bch2_read(struct bch_fs *, struct bch_read_bio *, u64);
+void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
+ u64, struct bch_io_failures *, unsigned flags);
+
+static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
+ u64 inode)
+{
+ struct bch_io_failures failed = { .nr = 0 };
+
+ BUG_ON(rbio->_state);
+
+ rbio->c = c;
+ rbio->start_time = local_clock();
+
+ __bch2_read(c, rbio, rbio->bio.bi_iter, inode, &failed,
+ BCH_READ_RETRY_IF_STALE|
+ BCH_READ_MAY_PROMOTE|
+ BCH_READ_USER_MAPPED);
+}
static inline struct bch_read_bio *rbio_init(struct bio *bio,
struct bch_io_opts opts)
struct bch_devs_list devs_have;
struct extent_ptr_decoded pick;
- /* start pos of data we read (may not be pos of data we want) */
- struct bpos pos;
+
+ /*
+ * pos we read from - different from data_pos for indirect extents:
+ */
+ struct bpos read_pos;
+
+ /*
+ * start pos of data we read (may not be pos of data we want) - for
+ * promote, narrow extents paths:
+ */
+ enum btree_id data_btree;
+ struct bpos data_pos;
struct bversion version;
struct promote_op *promote;
#include "alloc_foreground.h"
#include "bkey_methods.h"
#include "btree_gc.h"
+#include "btree_update.h"
#include "buckets.h"
+#include "error.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include <trace/events/bcachefs.h>
-static inline struct journal_buf *journal_seq_to_buf(struct journal *, u64);
+static u64 last_unwritten_seq(struct journal *j)
+{
+ union journal_res_state s = READ_ONCE(j->reservations);
+
+ lockdep_assert_held(&j->lock);
+
+ return journal_cur_seq(j) - ((s.idx - s.unwritten_idx) & JOURNAL_BUF_MASK);
+}
+
+static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
+{
+ return seq >= last_unwritten_seq(j);
+}
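last_unwritten_seq() is ring arithmetic on the journal buffer indices. Assuming, say, four journal buffers (so JOURNAL_BUF_MASK == 3): with idx == 1 and unwritten_idx == 3 there are (1 - 3) & 3 == 2 entries still unwritten, so the oldest unwritten sequence is journal_cur_seq(j) - 2, and journal_seq_unwritten() returns true for that sequence and everything after it.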
static bool __journal_entry_is_open(union journal_res_state state)
{
return __journal_entry_is_open(j->reservations);
}
-static void journal_pin_new_entry(struct journal *j, int count)
+static inline struct journal_buf *
+journal_seq_to_buf(struct journal *j, u64 seq)
{
- struct journal_entry_pin_list *p;
+ struct journal_buf *buf = NULL;
- /*
- * The fifo_push() needs to happen at the same time as j->seq is
- * incremented for journal_last_seq() to be calculated correctly
- */
- atomic64_inc(&j->seq);
- p = fifo_push_ref(&j->pin);
+ EBUG_ON(seq > journal_cur_seq(j));
+ EBUG_ON(seq == journal_cur_seq(j) &&
+ j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL);
+
+ if (journal_seq_unwritten(j, seq)) {
+ buf = j->buf + (seq & JOURNAL_BUF_MASK);
+ EBUG_ON(le64_to_cpu(buf->data->seq) != seq);
+ }
+ return buf;
+}
+static void journal_pin_list_init(struct journal_entry_pin_list *p, int count)
+{
INIT_LIST_HEAD(&p->list);
+ INIT_LIST_HEAD(&p->key_cache_list);
INIT_LIST_HEAD(&p->flushed);
atomic_set(&p->count, count);
p->devs.nr = 0;
}
+static void journal_pin_new_entry(struct journal *j)
+{
+ /*
+ * The fifo_push() needs to happen at the same time as j->seq is
+ * incremented for journal_last_seq() to be calculated correctly
+ */
+ atomic64_inc(&j->seq);
+ journal_pin_list_init(fifo_push_ref(&j->pin), 1);
+}
+
static void bch2_journal_buf_init(struct journal *j)
{
struct journal_buf *buf = journal_cur_buf(j);
+ bkey_extent_init(&buf->key);
+ buf->noflush = false;
+ buf->must_flush = false;
+ buf->separate_flush = false;
+
memset(buf->has_inode, 0, sizeof(buf->has_inode));
memset(buf->data, 0, sizeof(*buf->data));
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
+ j->err_seq = journal_cur_seq(j);
journal_wake(j);
closure_wake_up(&journal_cur_buf(j)->wait);
}
/* journal entry close/open: */
-void __bch2_journal_buf_put(struct journal *j, bool need_write_just_set)
+void __bch2_journal_buf_put(struct journal *j)
{
- if (!need_write_just_set &&
- test_bit(JOURNAL_NEED_WRITE, &j->flags))
- bch2_time_stats_update(j->delay_time,
- j->need_write_time);
-
- clear_bit(JOURNAL_NEED_WRITE, &j->flags);
-
closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL);
}
/*
* Returns true if journal entry is now closed:
+ *
+ * We don't close a journal_buf until the next journal_buf is finished writing,
+ * and can be opened again - this also initializes the next journal_buf:
*/
static bool __journal_entry_close(struct journal *j)
{
struct journal_buf *buf = journal_cur_buf(j);
union journal_res_state old, new;
u64 v = atomic64_read(&j->reservations.counter);
- bool set_need_write = false;
unsigned sectors;
lockdep_assert_held(&j->lock);
if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) {
set_bit(JOURNAL_NEED_WRITE, &j->flags);
j->need_write_time = local_clock();
- set_need_write = true;
}
- if (new.prev_buf_unwritten)
- return false;
-
new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL;
new.idx++;
- new.prev_buf_unwritten = 1;
+
+ if (new.idx == new.unwritten_idx)
+ return false;
BUG_ON(journal_state_count(new, new.idx));
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
+ /* Close out old buffer: */
buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
sectors = vstruct_blocks_plus(buf->data, c->block_bits,
BUG_ON(sectors > buf->sectors);
buf->sectors = sectors;
- bkey_extent_init(&buf->key);
-
/*
* We have to set last_seq here, _before_ opening a new journal entry:
*
*/
buf->data->last_seq = cpu_to_le64(journal_last_seq(j));
- if (journal_entry_empty(buf->data))
- clear_bit(JOURNAL_NOT_EMPTY, &j->flags);
- else
- set_bit(JOURNAL_NOT_EMPTY, &j->flags);
+ __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq));
- journal_pin_new_entry(j, 1);
+ /* Initialize new buffer: */
+ journal_pin_new_entry(j);
bch2_journal_buf_init(j);
cancel_delayed_work(&j->write_work);
+ clear_bit(JOURNAL_NEED_WRITE, &j->flags);
bch2_journal_space_available(j);
- bch2_journal_buf_put(j, old.idx, set_need_write);
+ bch2_journal_buf_put(j, old.idx);
return true;
}
+static bool journal_entry_want_write(struct journal *j)
+{
+ union journal_res_state s = READ_ONCE(j->reservations);
+ bool ret = false;
+
+ /*
+ * Don't close it yet if we already have a write in flight, but do set
+ * NEED_WRITE:
+ */
+ if (s.idx != s.unwritten_idx)
+ set_bit(JOURNAL_NEED_WRITE, &j->flags);
+ else
+ ret = __journal_entry_close(j);
+
+ return ret;
+}
+
static bool journal_entry_close(struct journal *j)
{
bool ret;
spin_lock(&j->lock);
- ret = __journal_entry_close(j);
+ ret = journal_entry_want_write(j);
spin_unlock(&j->lock);
return ret;
*/
static int journal_entry_open(struct journal *j)
{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *buf = journal_cur_buf(j);
union journal_res_state old, new;
int u64s;
u64 v;
+ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
+
lockdep_assert_held(&j->lock);
BUG_ON(journal_entry_is_open(j));
if (j->blocked)
- return -EAGAIN;
+ return cur_entry_blocked;
if (j->cur_entry_error)
return j->cur_entry_error;
u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);
if (u64s <= le32_to_cpu(buf->data->u64s))
- return -ENOSPC;
+ return cur_entry_journal_full;
/*
* Must be set before marking the journal entry as open:
old.v = new.v = v;
if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
- return -EROFS;
+ return cur_entry_insufficient_devices;
/* Handle any already added entries */
new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
static bool journal_quiesced(struct journal *j)
{
- union journal_res_state state = READ_ONCE(j->reservations);
- bool ret = !state.prev_buf_unwritten && !__journal_entry_is_open(state);
+ union journal_res_state s = READ_ONCE(j->reservations);
+ bool ret = s.idx == s.unwritten_idx && !__journal_entry_is_open(s);
if (!ret)
journal_entry_close(j);
u64 bch2_inode_journal_seq(struct journal *j, u64 inode)
{
size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8));
- u64 seq = 0;
+ union journal_res_state s;
+ unsigned i;
+ u64 seq;
- if (!test_bit(h, j->buf[0].has_inode) &&
- !test_bit(h, j->buf[1].has_inode))
- return 0;
spin_lock(&j->lock);
- if (test_bit(h, journal_cur_buf(j)->has_inode))
- seq = journal_cur_seq(j);
- else if (test_bit(h, journal_prev_buf(j)->has_inode))
- seq = journal_cur_seq(j) - 1;
+ seq = journal_cur_seq(j);
+ s = READ_ONCE(j->reservations);
+ i = s.idx;
+
+ while (1) {
+ if (test_bit(h, j->buf[i].has_inode))
+ goto out;
+
+ if (i == s.unwritten_idx)
+ break;
+
+ i = (i - 1) & JOURNAL_BUF_MASK;
+ seq--;
+ }
+
+ seq = 0;
+out:
spin_unlock(&j->lock);
return seq;
* Don't want to close current journal entry, just need to
* invoke reclaim:
*/
- ret = -ENOSPC;
+ ret = cur_entry_journal_full;
goto unlock;
}
* there's still a previous one in flight:
*/
trace_journal_entry_full(c);
- ret = -EAGAIN;
+ ret = cur_entry_blocked;
} else {
ret = journal_entry_open(j);
}
unlock:
- if ((ret == -EAGAIN || ret == -ENOSPC) &&
- !j->res_get_blocked_start)
+ if ((ret && ret != cur_entry_insufficient_devices) &&
+ !j->res_get_blocked_start) {
j->res_get_blocked_start = local_clock() ?: 1;
+ trace_journal_full(c);
+ }
can_discard = j->can_discard;
spin_unlock(&j->lock);
if (!ret)
goto retry;
- if (ret == -ENOSPC) {
- WARN_ONCE(!can_discard && (flags & JOURNAL_RES_GET_RESERVED),
- "JOURNAL_RES_GET_RESERVED set but journal full");
-
- /*
- * Journal is full - can't rely on reclaim from work item due to
- * freezing:
- */
- trace_journal_full(c);
+ if ((ret == cur_entry_journal_full ||
+ ret == cur_entry_journal_pin_full) &&
+ !can_discard &&
+ j->reservations.idx == j->reservations.unwritten_idx &&
+ (flags & JOURNAL_RES_GET_RESERVED)) {
+ char *journal_debug_buf = kmalloc(4096, GFP_ATOMIC);
+
+ bch_err(c, "Journal stuck!");
+ if (journal_debug_buf) {
+ bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j);
+ bch_err(c, "%s", journal_debug_buf);
+
+ bch2_journal_pins_to_text(&_PBUF(journal_debug_buf, 4096), j);
+ bch_err(c, "Journal pins:\n%s", journal_debug_buf);
+ kfree(journal_debug_buf);
+ }
- if (!(flags & JOURNAL_RES_GET_NONBLOCK)) {
- if (can_discard) {
- bch2_journal_do_discards(j);
- goto retry;
- }
+ bch2_fatal_error(c);
+ dump_stack();
+ }
- if (mutex_trylock(&j->reclaim_lock)) {
- bch2_journal_reclaim(j);
- mutex_unlock(&j->reclaim_lock);
- }
+ /*
+ * Journal is full - can't rely on reclaim from work item due to
+ * freezing:
+ */
+ if ((ret == cur_entry_journal_full ||
+ ret == cur_entry_journal_pin_full) &&
+ !(flags & JOURNAL_RES_GET_NONBLOCK)) {
+ if (can_discard) {
+ bch2_journal_do_discards(j);
+ goto retry;
}
- ret = -EAGAIN;
+ if (mutex_trylock(&j->reclaim_lock)) {
+ bch2_journal_reclaim(j);
+ mutex_unlock(&j->reclaim_lock);
+ }
}
- return ret;
+ return ret == cur_entry_insufficient_devices ? -EROFS : -EAGAIN;
}
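+
+ /*
+ * Note on the error handling introduced above: the journal open/reservation
+ * slow path now returns internal cur_entry_* reasons (blocked, journal_full,
+ * journal_pin_full, insufficient_devices) rather than raw errnos; they are
+ * only translated at the end of this function, with insufficient_devices
+ * becoming -EROFS and everything else -EAGAIN.
+ */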
/*
unsigned new_u64s,
unsigned flags)
{
- bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags);
+ bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags, true);
- if (!ret)
- bch2_journal_reclaim_work(&j->reclaim_work.work);
+ if (!ret && mutex_trylock(&j->reclaim_lock)) {
+ bch2_journal_reclaim(j);
+ mutex_unlock(&j->reclaim_lock);
+ }
return ret;
}
/* journal flushing: */
-u64 bch2_journal_last_unwritten_seq(struct journal *j)
-{
- u64 seq;
-
- spin_lock(&j->lock);
- seq = journal_cur_seq(j);
- if (j->reservations.prev_buf_unwritten)
- seq--;
- spin_unlock(&j->lock);
-
- return seq;
-}
-
/**
- * bch2_journal_open_seq_async - try to open a new journal entry if @seq isn't
- * open yet, or wait if we cannot
+ * bch2_journal_flush_seq_async - wait for a journal entry to be written
*
- * used by the btree interior update machinery, when it needs to write a new
- * btree root - every journal entry contains the roots of all the btrees, so it
- * doesn't need to bother with getting a journal reservation
+ * Registers @parent to be woken once the journal entry containing @seq has
+ * been written, and triggers a write immediately if necessary
*/
-int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl)
+int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
+ struct closure *parent)
{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- int ret;
+ struct journal_buf *buf;
+ int ret = 0;
- spin_lock(&j->lock);
+ if (seq <= j->flushed_seq_ondisk)
+ return 1;
- /*
- * Can't try to open more than one sequence number ahead:
- */
- BUG_ON(journal_cur_seq(j) < seq && !journal_entry_is_open(j));
-
- if (journal_cur_seq(j) > seq ||
- journal_entry_is_open(j)) {
- spin_unlock(&j->lock);
- return 0;
- }
+ spin_lock(&j->lock);
- if (journal_cur_seq(j) < seq &&
- !__journal_entry_close(j)) {
- /* haven't finished writing out the previous one: */
- trace_journal_entry_full(c);
- ret = -EAGAIN;
- } else {
- BUG_ON(journal_cur_seq(j) != seq);
+ BUG_ON(seq > journal_cur_seq(j));
- ret = journal_entry_open(j);
+ /* Recheck under lock: */
+ if (j->err_seq && seq >= j->err_seq) {
+ ret = -EIO;
+ goto out;
}
- if ((ret == -EAGAIN || ret == -ENOSPC) &&
- !j->res_get_blocked_start)
- j->res_get_blocked_start = local_clock() ?: 1;
-
- if (ret == -EAGAIN || ret == -ENOSPC)
- closure_wait(&j->async_wait, cl);
-
- spin_unlock(&j->lock);
-
- if (ret == -ENOSPC) {
- trace_journal_full(c);
- bch2_journal_reclaim_work(&j->reclaim_work.work);
- ret = -EAGAIN;
+ if (seq <= j->flushed_seq_ondisk) {
+ ret = 1;
+ goto out;
}
- return ret;
-}
-
-static int journal_seq_error(struct journal *j, u64 seq)
-{
- union journal_res_state state = READ_ONCE(j->reservations);
+ /* if seq was written, but not flushed - flush a newer one instead */
+ seq = max(seq, last_unwritten_seq(j));
- if (seq == journal_cur_seq(j))
- return bch2_journal_error(j);
-
- if (seq + 1 == journal_cur_seq(j) &&
- !state.prev_buf_unwritten &&
- seq > j->seq_ondisk)
- return -EIO;
-
- return 0;
-}
-
-static inline struct journal_buf *
-journal_seq_to_buf(struct journal *j, u64 seq)
-{
- /* seq should be for a journal entry that has been opened: */
- BUG_ON(seq > journal_cur_seq(j));
- BUG_ON(seq == journal_cur_seq(j) &&
- j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL);
+recheck_need_open:
+ if (seq == journal_cur_seq(j) && !journal_entry_is_open(j)) {
+ struct journal_res res = { 0 };
- if (seq == journal_cur_seq(j))
- return journal_cur_buf(j);
- if (seq + 1 == journal_cur_seq(j) &&
- j->reservations.prev_buf_unwritten)
- return journal_prev_buf(j);
- return NULL;
-}
+ spin_unlock(&j->lock);
-/**
- * bch2_journal_wait_on_seq - wait for a journal entry to be written
- *
- * does _not_ cause @seq to be written immediately - if there is no other
- * activity to cause the relevant journal entry to be filled up or flushed it
- * can wait for an arbitrary amount of time (up to @j->write_delay_ms, which is
- * configurable).
- */
-void bch2_journal_wait_on_seq(struct journal *j, u64 seq,
- struct closure *parent)
-{
- struct journal_buf *buf;
+ ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
+ if (ret)
+ return ret;
- spin_lock(&j->lock);
+ seq = res.seq;
+ buf = j->buf + (seq & JOURNAL_BUF_MASK);
+ buf->must_flush = true;
+ set_bit(JOURNAL_NEED_WRITE, &j->flags);
- if ((buf = journal_seq_to_buf(j, seq))) {
- if (!closure_wait(&buf->wait, parent))
+ if (parent && !closure_wait(&buf->wait, parent))
BUG();
- if (seq == journal_cur_seq(j)) {
- smp_mb();
- if (bch2_journal_error(j))
- closure_wake_up(&buf->wait);
- }
- }
+ bch2_journal_res_put(j, &res);
- spin_unlock(&j->lock);
-}
-
-/**
- * bch2_journal_flush_seq_async - wait for a journal entry to be written
- *
- * like bch2_journal_wait_on_seq, except that it triggers a write immediately if
- * necessary
- */
-void bch2_journal_flush_seq_async(struct journal *j, u64 seq,
- struct closure *parent)
-{
- struct journal_buf *buf;
-
- spin_lock(&j->lock);
-
- if (parent &&
- (buf = journal_seq_to_buf(j, seq)))
- if (!closure_wait(&buf->wait, parent))
- BUG();
-
- if (seq == journal_cur_seq(j))
- __journal_entry_close(j);
- spin_unlock(&j->lock);
-}
+ spin_lock(&j->lock);
+ goto want_write;
+ }
-static int journal_seq_flushed(struct journal *j, u64 seq)
-{
- int ret;
+ /*
+ * if write was kicked off without a flush, flush the next sequence
+ * number instead
+ */
+ buf = journal_seq_to_buf(j, seq);
+ if (buf->noflush) {
+ seq++;
+ goto recheck_need_open;
+ }
- spin_lock(&j->lock);
- ret = seq <= j->seq_ondisk ? 1 : journal_seq_error(j, seq);
+ buf->must_flush = true;
+ if (parent && !closure_wait(&buf->wait, parent))
+ BUG();
+want_write:
if (seq == journal_cur_seq(j))
- __journal_entry_close(j);
+ journal_entry_want_write(j);
+out:
spin_unlock(&j->lock);
-
return ret;
}
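+
+ /*
+ * Hypothetical caller-side sketch for bch2_journal_flush_seq_async() above
+ * (illustrative only): it returns 1 if @seq is already flushed, 0 if @parent
+ * was put on the buffer's waitlist (and a write kicked off if needed), or a
+ * negative error:
+ *
+ *   struct closure cl;
+ *
+ *   closure_init_stack(&cl);
+ *   ret = bch2_journal_flush_seq_async(j, seq, &cl);
+ *   if (ret >= 0)
+ *           closure_sync(&cl);
+ */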
u64 start_time = local_clock();
int ret, ret2;
- ret = wait_event_killable(j->wait, (ret2 = journal_seq_flushed(j, seq)));
+ ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)));
- bch2_time_stats_update(j->flush_seq_time, start_time);
+ if (!ret)
+ bch2_time_stats_update(j->flush_seq_time, start_time);
return ret ?: ret2 < 0 ? ret2 : 0;
}
-/**
- * bch2_journal_meta_async - force a journal entry to be written
- */
-void bch2_journal_meta_async(struct journal *j, struct closure *parent)
-{
- struct journal_res res;
-
- memset(&res, 0, sizeof(res));
-
- bch2_journal_res_get(j, &res, jset_u64s(0), 0);
- bch2_journal_res_put(j, &res);
-
- bch2_journal_flush_seq_async(j, res.seq, parent);
-}
-
int bch2_journal_meta(struct journal *j)
{
struct journal_res res;
if (nr <= ja->nr)
return 0;
- ret = -ENOMEM;
new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL);
new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL);
- if (!new_buckets || !new_bucket_seq)
+ if (!new_buckets || !new_bucket_seq) {
+ ret = -ENOMEM;
goto err;
+ }
journal_buckets = bch2_sb_resize_journal(&ca->disk_sb,
- nr + sizeof(*journal_buckets) / sizeof(u64));
- if (!journal_buckets)
+ nr + sizeof(*journal_buckets) / sizeof(u64));
+ if (!journal_buckets) {
+ ret = -ENOSPC;
goto err;
+ }
/*
* We may be called from the device add path, before the new device has
goto err;
}
} else {
- ob = bch2_bucket_alloc(c, ca, RESERVE_ALLOC,
+ rcu_read_lock();
+ ob = bch2_bucket_alloc(c, ca, RESERVE_NONE,
false, cl);
+ rcu_read_unlock();
if (IS_ERR(ob)) {
ret = cl ? -EAGAIN : -ENOSPC;
goto err;
spin_lock(&c->journal.lock);
}
+ /*
+ * XXX
+ * For resize at runtime, we should be writing the new
+ * superblock before inserting into the journal array
+ */
+
pos = ja->nr ? (ja->cur_idx + 1) % ja->nr : 0;
__array_insert_item(ja->buckets, ja->nr, pos);
__array_insert_item(ja->bucket_seq, ja->nr, pos);
if (pos <= ja->cur_idx)
ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
- bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_journal,
- ca->mi.bucket_size,
- gc_phase(GC_PHASE_SB),
- 0);
+ if (!c || new_fs)
+ bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_journal,
+ ca->mi.bucket_size,
+ gc_phase(GC_PHASE_SB),
+ 0);
if (c) {
spin_unlock(&c->journal.lock);
percpu_up_read(&c->mark_lock);
}
+ if (c && !new_fs)
+ ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_trans_mark_metadata_bucket(&trans, NULL, ca,
+ bucket, BCH_DATA_journal,
+ ca->mi.bucket_size));
+
if (!new_fs)
bch2_open_bucket_put(c, ob);
- }
- ret = 0;
+ if (ret)
+ goto err;
+ }
err:
+ bch2_sb_resize_journal(&ca->disk_sb,
+ ja->nr + sizeof(*journal_buckets) / sizeof(u64));
kfree(new_bucket_seq);
kfree(new_buckets);
if (dynamic_fault("bcachefs:add:journal_alloc"))
return -ENOMEM;
+ /* 1/128th of the device by default: */
+ nr = ca->mi.nbuckets >> 7;
+
/*
- * clamp journal size to 1024 buckets or 512MB (in sectors), whichever
+ * clamp journal size to 8192 buckets or 8GB (in sectors), whichever
* is smaller:
*/
- nr = clamp_t(unsigned, ca->mi.nbuckets >> 8,
+ nr = clamp_t(unsigned, nr,
BCH_JOURNAL_BUCKETS_MIN,
- min(1 << 10,
- (1 << 20) / ca->mi.bucket_size));
+ min(1 << 13,
+ (1 << 24) / ca->mi.bucket_size));
return __bch2_set_nr_journal_buckets(ca, nr, true, NULL);
}
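+
+ /*
+ * Worked example of the new default sizing, for a hypothetical 1 TiB member
+ * with 512 KiB buckets: ~2M buckets / 128 = 16384, clamped to
+ * min(1 << 13, (1 << 24) / 1024) = 8192 buckets, i.e. a 4 GiB journal.
+ */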
static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
{
union journal_res_state state;
- struct journal_buf *w;
- bool ret;
+ bool ret = false;
+ unsigned i;
spin_lock(&j->lock);
state = READ_ONCE(j->reservations);
- w = j->buf + !state.idx;
+ i = state.idx;
- ret = state.prev_buf_unwritten &&
- bch2_bkey_has_device(bkey_i_to_s_c(&w->key), dev_idx);
+ while (i != state.unwritten_idx) {
+ i = (i - 1) & JOURNAL_BUF_MASK;
+ if (bch2_bkey_has_device(bkey_i_to_s_c(&j->buf[i].key), dev_idx))
+ ret = true;
+ }
spin_unlock(&j->lock);
return ret;
wait_event(j->wait, journal_entry_close(j));
- /* do we need to write another journal entry? */
- if (test_bit(JOURNAL_NOT_EMPTY, &j->flags))
- bch2_journal_meta(j);
+ /*
+ * Always write a new journal entry, to make sure the clock hands are up
+ * to date (and match the superblock)
+ */
+ bch2_journal_meta(j);
journal_quiesce(j);
BUG_ON(!bch2_journal_error(j) &&
- test_bit(JOURNAL_NOT_EMPTY, &j->flags));
+ test_bit(JOURNAL_REPLAY_DONE, &j->flags) &&
+ (journal_entry_is_open(j) ||
+ j->last_empty_seq + 1 != journal_cur_seq(j)));
cancel_delayed_work_sync(&j->write_work);
- cancel_delayed_work_sync(&j->reclaim_work);
+ bch2_journal_reclaim_stop(j);
}
int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
j->pin.back = cur_seq;
atomic64_set(&j->seq, cur_seq - 1);
- fifo_for_each_entry_ptr(p, &j->pin, seq) {
- INIT_LIST_HEAD(&p->list);
- INIT_LIST_HEAD(&p->flushed);
- atomic_set(&p->count, 1);
- p->devs.nr = 0;
- }
+ fifo_for_each_entry_ptr(p, &j->pin, seq)
+ journal_pin_list_init(p, 1);
list_for_each_entry(i, journal_entries, list) {
+ unsigned ptr;
+
seq = le64_to_cpu(i->j.seq);
BUG_ON(seq >= cur_seq);
if (seq < last_seq)
continue;
- journal_seq_pin(j, seq)->devs = i->devs;
+ p = journal_seq_pin(j, seq);
+
+ p->devs.nr = 0;
+ for (ptr = 0; ptr < i->nr_ptrs; ptr++)
+ bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev);
}
spin_lock(&j->lock);
set_bit(JOURNAL_STARTED, &j->flags);
+ j->last_flush_write = jiffies;
+
+ journal_pin_new_entry(j);
+
+ j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j);
- journal_pin_new_entry(j, 1);
bch2_journal_buf_init(j);
c->last_bucket_seq_cleanup = journal_cur_seq(j);
void bch2_fs_journal_exit(struct journal *j)
{
- kvpfree(j->buf[1].data, j->buf[1].buf_size);
- kvpfree(j->buf[0].data, j->buf[0].buf_size);
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(j->buf); i++)
+ kvpfree(j->buf[i].data, j->buf[i].buf_size);
free_fifo(&j->pin);
}
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
static struct lock_class_key res_key;
+ unsigned i;
int ret = 0;
pr_verbose_init(c->opts, "");
spin_lock_init(&j->err_lock);
init_waitqueue_head(&j->wait);
INIT_DELAYED_WORK(&j->write_work, journal_write_work);
- INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work);
+ init_waitqueue_head(&j->reclaim_wait);
init_waitqueue_head(&j->pin_flush_wait);
mutex_init(&j->reclaim_lock);
mutex_init(&j->discard_lock);
lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
- j->buf[0].buf_size = JOURNAL_ENTRY_SIZE_MIN;
- j->buf[1].buf_size = JOURNAL_ENTRY_SIZE_MIN;
j->write_delay_ms = 1000;
j->reclaim_delay_ms = 100;
- /* Btree roots: */
- j->entry_u64s_reserved +=
- BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX);
-
atomic64_set(&j->reservations.counter,
((union journal_res_state)
{ .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
- if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
- !(j->buf[0].data = kvpmalloc(j->buf[0].buf_size, GFP_KERNEL)) ||
- !(j->buf[1].data = kvpmalloc(j->buf[1].buf_size, GFP_KERNEL))) {
+ if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) {
ret = -ENOMEM;
goto out;
}
+ for (i = 0; i < ARRAY_SIZE(j->buf); i++) {
+ j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN;
+ j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL);
+ if (!j->buf[i].data) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
j->pin.front = j->pin.back = 1;
out:
pr_verbose_init(c->opts, "ret %i", ret);
/* debug: */
-void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
+void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
union journal_res_state s;
struct bch_dev *ca;
- unsigned iter;
+ unsigned i;
rcu_read_lock();
- spin_lock(&j->lock);
s = READ_ONCE(j->reservations);
pr_buf(out,
"seq:\t\t\t%llu\n"
"last_seq:\t\t%llu\n"
"last_seq_ondisk:\t%llu\n"
+ "flushed_seq_ondisk:\t%llu\n"
"prereserved:\t\t%u/%u\n"
+ "each entry reserved:\t%u\n"
+ "nr flush writes:\t%llu\n"
+ "nr noflush writes:\t%llu\n"
+ "nr direct reclaim:\t%llu\n"
+ "nr background reclaim:\t%llu\n"
"current entry sectors:\t%u\n"
+ "current entry error:\t%u\n"
"current entry:\t\t",
fifo_used(&j->pin),
journal_cur_seq(j),
journal_last_seq(j),
j->last_seq_ondisk,
+ j->flushed_seq_ondisk,
j->prereserved.reserved,
j->prereserved.remaining,
- j->cur_entry_sectors);
+ j->entry_u64s_reserved,
+ j->nr_flush_writes,
+ j->nr_noflush_writes,
+ j->nr_direct_reclaim,
+ j->nr_background_reclaim,
+ j->cur_entry_sectors,
+ j->cur_entry_error);
switch (s.cur_entry_offset) {
case JOURNAL_ENTRY_ERROR_VAL:
}
pr_buf(out,
- "current entry refs:\t%u\n"
- "prev entry unwritten:\t",
- journal_state_count(s, s.idx));
-
- if (s.prev_buf_unwritten)
- pr_buf(out, "yes, ref %u sectors %u\n",
- journal_state_count(s, !s.idx),
- journal_prev_buf(j)->sectors);
- else
- pr_buf(out, "no\n");
+ "current entry:\t\tidx %u refcount %u\n",
+ s.idx, journal_state_count(s, s.idx));
+
+ i = s.idx;
+ while (i != s.unwritten_idx) {
+ i = (i - 1) & JOURNAL_BUF_MASK;
+
+ pr_buf(out, "unwritten entry:\tidx %u refcount %u sectors %u\n",
+ i, journal_state_count(s, i), j->buf[i].sectors);
+ }
pr_buf(out,
"need write:\t\t%i\n"
test_bit(JOURNAL_NEED_WRITE, &j->flags),
test_bit(JOURNAL_REPLAY_DONE, &j->flags));
- for_each_member_device_rcu(ca, c, iter,
+ pr_buf(out, "space:\n");
+ pr_buf(out, "\tdiscarded\t%u:%u\n",
+ j->space[journal_space_discarded].next_entry,
+ j->space[journal_space_discarded].total);
+ pr_buf(out, "\tclean ondisk\t%u:%u\n",
+ j->space[journal_space_clean_ondisk].next_entry,
+ j->space[journal_space_clean_ondisk].total);
+ pr_buf(out, "\tclean\t\t%u:%u\n",
+ j->space[journal_space_clean].next_entry,
+ j->space[journal_space_clean].total);
+ pr_buf(out, "\ttotal\t\t%u:%u\n",
+ j->space[journal_space_total].next_entry,
+ j->space[journal_space_total].total);
+
+ for_each_member_device_rcu(ca, c, i,
&c->rw_devs[BCH_DATA_journal]) {
struct journal_device *ja = &ca->journal;
+ if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d))
+ continue;
+
if (!ja->nr)
continue;
pr_buf(out,
"dev %u:\n"
"\tnr\t\t%u\n"
+ "\tbucket size\t%u\n"
"\tavailable\t%u:%u\n"
- "\tdiscard_idx\t\t%u\n"
- "\tdirty_idx_ondisk\t%u (seq %llu)\n"
- "\tdirty_idx\t\t%u (seq %llu)\n"
+ "\tdiscard_idx\t%u\n"
+ "\tdirty_ondisk\t%u (seq %llu)\n"
+ "\tdirty_idx\t%u (seq %llu)\n"
"\tcur_idx\t\t%u (seq %llu)\n",
- iter, ja->nr,
+ i, ja->nr, ca->mi.bucket_size,
bch2_journal_dev_buckets_available(j, ja, journal_space_discarded),
ja->sectors_free,
ja->discard_idx,
ja->cur_idx, ja->bucket_seq[ja->cur_idx]);
}
- spin_unlock(&j->lock);
rcu_read_unlock();
}
+void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
+{
+ spin_lock(&j->lock);
+ __bch2_journal_debug_to_text(out, j);
+ spin_unlock(&j->lock);
+}
+
void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
{
struct journal_entry_pin_list *pin_list;
return j->buf + j->reservations.idx;
}
-static inline struct journal_buf *journal_prev_buf(struct journal *j)
-{
- return j->buf + !j->reservations.idx;
-}
-
/* Sequence number of oldest dirty journal entry */
static inline u64 journal_last_seq(struct journal *j)
static inline u64 journal_cur_seq(struct journal *j)
{
- BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
+ EBUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
return j->pin.back - 1;
}
static inline int journal_state_count(union journal_res_state s, int idx)
{
- return idx == 0 ? s.buf0_count : s.buf1_count;
+ switch (idx) {
+ case 0: return s.buf0_count;
+ case 1: return s.buf1_count;
+ case 2: return s.buf2_count;
+ case 3: return s.buf3_count;
+ }
+ BUG();
}
static inline void journal_state_inc(union journal_res_state *s)
{
s->buf0_count += s->idx == 0;
s->buf1_count += s->idx == 1;
+ s->buf2_count += s->idx == 2;
+ s->buf3_count += s->idx == 3;
}
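+
+ /*
+ * With four journal buffers each slot now has its own refcount in
+ * journal_res_state: e.g. a reservation taken while s.idx == 2 bumps
+ * buf2_count, and the matching bch2_journal_buf_put(j, 2) below drops it.
+ */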
static inline void bch2_journal_set_has_inode(struct journal *j,
enum btree_id id, unsigned level,
const void *data, unsigned u64s)
{
- memset(entry, 0, sizeof(*entry));
entry->u64s = cpu_to_le16(u64s);
- entry->type = type;
entry->btree_id = id;
entry->level = level;
+ entry->type = type;
+ entry->pad[0] = 0;
+ entry->pad[1] = 0;
+ entry->pad[2] = 0;
memcpy_u64s_small(entry->_data, data, u64s);
return jset_u64s(u64s);
return true;
}
-void __bch2_journal_buf_put(struct journal *, bool);
+void __bch2_journal_buf_put(struct journal *);
-static inline void bch2_journal_buf_put(struct journal *j, unsigned idx,
- bool need_write_just_set)
+static inline void bch2_journal_buf_put(struct journal *j, unsigned idx)
{
union journal_res_state s;
s.v = atomic64_sub_return(((union journal_res_state) {
.buf0_count = idx == 0,
.buf1_count = idx == 1,
+ .buf2_count = idx == 2,
+ .buf3_count = idx == 3,
}).v, &j->reservations.counter);
- if (!journal_state_count(s, idx)) {
- EBUG_ON(s.idx == idx || !s.prev_buf_unwritten);
- __bch2_journal_buf_put(j, need_write_just_set);
- }
+
+ EBUG_ON(((s.idx - idx) & 3) >
+ ((s.idx - s.unwritten_idx) & 3));
+
+ if (!journal_state_count(s, idx) && idx == s.unwritten_idx)
+ __bch2_journal_buf_put(j);
}
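+
+ /*
+ * The EBUG_ON above checks (mod 4) that the buffer being released lies
+ * within the window of unwritten buffers [unwritten_idx, idx]; the write is
+ * only kicked off when the last reference on the *oldest* unwritten buffer
+ * is dropped, so journal writes are issued strictly in sequence number order.
+ */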
/*
BCH_JSET_ENTRY_btree_keys,
0, 0, NULL, 0);
- bch2_journal_buf_put(j, res->idx, false);
+ bch2_journal_buf_put(j, res->idx);
res->ref = 0;
}
#define JOURNAL_RES_GET_NONBLOCK (1 << 0)
#define JOURNAL_RES_GET_CHECK (1 << 1)
#define JOURNAL_RES_GET_RESERVED (1 << 2)
-#define JOURNAL_RES_GET_RECLAIM (1 << 3)
static inline int journal_res_get_fast(struct journal *j,
struct journal_res *res,
!test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags))
return 0;
- if (flags & JOURNAL_RES_GET_CHECK)
- return 1;
-
new.cur_entry_offset += res->u64s;
journal_state_inc(&new);
+
+ /*
+ * If the refcount would overflow, we have to wait:
+ * XXX - tracepoint this:
+ */
+ if (!journal_state_count(new, new.idx))
+ return 0;
+
+ if (flags & JOURNAL_RES_GET_CHECK)
+ return 1;
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
static inline bool journal_check_may_get_unreserved(struct journal *j)
{
union journal_preres_state s = READ_ONCE(j->prereserved);
- bool ret = s.reserved <= s.remaining &&
+ bool ret = s.reserved < s.remaining &&
fifo_free(&j->pin) > 8;
lockdep_assert_held(&j->lock);
s.v = atomic64_sub_return(s.v, &j->prereserved.counter);
res->u64s = 0;
- closure_wake_up(&j->preres_wait);
+
+ if (unlikely(s.waiting)) {
+ clear_bit(ilog2((((union journal_preres_state) { .waiting = 1 }).v)),
+ (unsigned long *) &j->prereserved.v);
+ closure_wake_up(&j->preres_wait);
+ }
if (s.reserved <= s.remaining &&
!test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) {
static inline int bch2_journal_preres_get_fast(struct journal *j,
struct journal_preres *res,
unsigned new_u64s,
- unsigned flags)
+ unsigned flags,
+ bool set_waiting)
{
int d = new_u64s - res->u64s;
union journal_preres_state old, new;
u64 v = atomic64_read(&j->prereserved.counter);
+ int ret;
do {
old.v = new.v = v;
-
- new.reserved += d;
-
- /*
- * If we're being called from the journal reclaim path, we have
- * to unconditionally give out the pre-reservation, there's
- * nothing else sensible we can do - otherwise we'd recurse back
- * into the reclaim path and deadlock:
- */
-
- if (!(flags & JOURNAL_RES_GET_RECLAIM) &&
- new.reserved > new.remaining)
+ ret = 0;
+
+ if ((flags & JOURNAL_RES_GET_RESERVED) ||
+ new.reserved + d < new.remaining) {
+ new.reserved += d;
+ ret = 1;
+ } else if (set_waiting && !new.waiting)
+ new.waiting = true;
+ else
return 0;
} while ((v = atomic64_cmpxchg(&j->prereserved.counter,
old.v, new.v)) != old.v);
- res->u64s += d;
- return 1;
+ if (ret)
+ res->u64s += d;
+ return ret;
}
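+
+ /*
+ * The new 'waiting' bit pairs with the clear_bit()/closure_wake_up() in the
+ * preres put path above: a caller that can't get a pre-reservation sets it
+ * (at most once) so that the next release knows to wake j->preres_wait.
+ */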
static inline int bch2_journal_preres_get(struct journal *j,
if (new_u64s <= res->u64s)
return 0;
- if (bch2_journal_preres_get_fast(j, res, new_u64s, flags))
+ if (bch2_journal_preres_get_fast(j, res, new_u64s, flags, false))
return 0;
if (flags & JOURNAL_RES_GET_NONBLOCK)
struct journal_entry_res *,
unsigned);
-u64 bch2_journal_last_unwritten_seq(struct journal *);
-int bch2_journal_open_seq_async(struct journal *, u64, struct closure *);
-
-void bch2_journal_wait_on_seq(struct journal *, u64, struct closure *);
-void bch2_journal_flush_seq_async(struct journal *, u64, struct closure *);
+int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *);
void bch2_journal_flush_async(struct journal *, struct closure *);
-void bch2_journal_meta_async(struct journal *, struct closure *);
int bch2_journal_flush_seq(struct journal *, u64);
int bch2_journal_flush(struct journal *);
struct bch_dev;
-static inline bool journal_flushes_device(struct bch_dev *ca)
-{
- return true;
-}
-
static inline void bch2_journal_set_replay_done(struct journal *j)
{
BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
void bch2_journal_unblock(struct journal *);
void bch2_journal_block(struct journal *);
+void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
void bch2_journal_pins_to_text(struct printbuf *, struct journal *);
#include "btree_update_interior.h"
#include "buckets.h"
#include "checksum.h"
+#include "disk_groups.h"
#include "error.h"
#include "io.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
#include "replicas.h"
#include <trace/events/bcachefs.h>
+static void __journal_replay_free(struct journal_replay *i)
+{
+ list_del(&i->list);
+ kvpfree(i, offsetof(struct journal_replay, j) +
+ vstruct_bytes(&i->j));
+}
+
+static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
+{
+ i->ignore = true;
+
+ if (!c->opts.read_entire_journal)
+ __journal_replay_free(i);
+}
+
struct journal_list {
struct closure cl;
struct mutex lock;
* be replayed:
*/
static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
+ struct bch_extent_ptr entry_ptr,
struct journal_list *jlist, struct jset *j,
bool bad)
{
- struct journal_replay *i, *pos;
- struct bch_devs_list devs = { .nr = 0 };
+ struct journal_replay *i, *pos, *dup = NULL;
+ struct bch_extent_ptr *ptr;
struct list_head *where;
size_t bytes = vstruct_bytes(j);
- __le64 last_seq;
- int ret;
-
- last_seq = !list_empty(jlist->head)
- ? list_last_entry(jlist->head, struct journal_replay,
- list)->j.last_seq
- : 0;
+ u64 last_seq = 0;
+ int ret = JOURNAL_ENTRY_ADD_OK;
- if (!c->opts.read_entire_journal) {
- /* Is this entry older than the range we need? */
- if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) {
- ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
- goto out;
+ list_for_each_entry_reverse(i, jlist->head, list) {
+ if (!JSET_NO_FLUSH(&i->j)) {
+ last_seq = le64_to_cpu(i->j.last_seq);
+ break;
}
+ }
- /* Drop entries we don't need anymore */
+ /* Is this entry older than the range we need? */
+ if (!c->opts.read_entire_journal &&
+ le64_to_cpu(j->seq) < last_seq) {
+ ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
+ goto out;
+ }
+
+ /* Drop entries we don't need anymore */
+ if (!JSET_NO_FLUSH(j)) {
list_for_each_entry_safe(i, pos, jlist->head, list) {
if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
break;
- list_del(&i->list);
- kvpfree(i, offsetof(struct journal_replay, j) +
- vstruct_bytes(&i->j));
+ journal_replay_free(c, i);
}
}
where = jlist->head;
add:
- i = where->next != jlist->head
+ dup = where->next != jlist->head
? container_of(where->next, struct journal_replay, list)
: NULL;
+ if (dup && le64_to_cpu(j->seq) != le64_to_cpu(dup->j.seq))
+ dup = NULL;
+
/*
* Duplicate journal entries? If so we want the one that didn't have a
* checksum error:
*/
- if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
- if (i->bad) {
- devs = i->devs;
- list_del(&i->list);
- kvpfree(i, offsetof(struct journal_replay, j) +
- vstruct_bytes(&i->j));
+ if (dup) {
+ if (dup->bad) {
+ /* we'll replace @dup: */
} else if (bad) {
+ i = dup;
goto found;
} else {
- fsck_err_on(bytes != vstruct_bytes(&i->j) ||
- memcmp(j, &i->j, bytes), c,
+ fsck_err_on(bytes != vstruct_bytes(&dup->j) ||
+ memcmp(j, &dup->j, bytes), c,
"found duplicate but non identical journal entries (seq %llu)",
le64_to_cpu(j->seq));
+ i = dup;
goto found;
}
-
}
i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
goto out;
}
- list_add(&i->list, where);
- i->devs = devs;
- i->bad = bad;
+ i->nr_ptrs = 0;
+ i->bad = bad;
+ i->ignore = false;
memcpy(&i->j, j, bytes);
+
+ if (dup) {
+ i->nr_ptrs = dup->nr_ptrs;
+ memcpy(i->ptrs, dup->ptrs, sizeof(dup->ptrs));
+ __journal_replay_free(dup);
+ }
+
+ list_add(&i->list, where);
found:
- if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx))
- bch2_dev_list_add_dev(&i->devs, ca->dev_idx);
- else
- fsck_err_on(1, c, "duplicate journal entries on same device");
- ret = JOURNAL_ENTRY_ADD_OK;
+ for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) {
+ if (ptr->dev == ca->dev_idx) {
+ bch_err(c, "duplicate journal entry %llu on same device",
+ le64_to_cpu(i->j.seq));
+ goto out;
+ }
+ }
+
+ if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) {
+ bch_err(c, "found too many copies of journal entry %llu",
+ le64_to_cpu(i->j.seq));
+ goto out;
+ }
+
+ i->ptrs[i->nr_ptrs++] = entry_ptr;
out:
fsck_err:
return ret;
#define journal_entry_err_on(cond, c, msg, ...) \
((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false)
-static int journal_validate_key(struct bch_fs *c, struct jset *jset,
+#define FSCK_DELETED_KEY 5
+
+static int journal_validate_key(struct bch_fs *c, const char *where,
struct jset_entry *entry,
unsigned level, enum btree_id btree_id,
- struct bkey_i *k,
- const char *type, int write)
+ struct bkey_i *k, const char *type,
+ unsigned version, int big_endian, int write)
{
void *next = vstruct_next(entry);
const char *invalid;
- unsigned version = le32_to_cpu(jset->version);
int ret = 0;
if (journal_entry_err_on(!k->k.u64s, c,
- "invalid %s in journal: k->u64s 0", type)) {
+ "invalid %s in %s entry offset %zi/%u: k->u64s 0",
+ type, where,
+ (u64 *) k - entry->_data,
+ le16_to_cpu(entry->u64s))) {
entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
journal_entry_null_range(vstruct_next(entry), next);
- return 0;
+ return FSCK_DELETED_KEY;
}
if (journal_entry_err_on((void *) bkey_next(k) >
(void *) vstruct_next(entry), c,
- "invalid %s in journal: extends past end of journal entry",
- type)) {
+ "invalid %s in %s entry offset %zi/%u: extends past end of journal entry",
+ type, where,
+ (u64 *) k - entry->_data,
+ le16_to_cpu(entry->u64s))) {
entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
journal_entry_null_range(vstruct_next(entry), next);
- return 0;
+ return FSCK_DELETED_KEY;
}
if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c,
- "invalid %s in journal: bad format %u",
- type, k->k.format)) {
- le16_add_cpu(&entry->u64s, -k->k.u64s);
+ "invalid %s in %s entry offset %zi/%u: bad format %u",
+ type, where,
+ (u64 *) k - entry->_data,
+ le16_to_cpu(entry->u64s),
+ k->k.format)) {
+ le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
memmove(k, bkey_next(k), next - (void *) bkey_next(k));
journal_entry_null_range(vstruct_next(entry), next);
- return 0;
+ return FSCK_DELETED_KEY;
}
if (!write)
- bch2_bkey_compat(level, btree_id, version,
- JSET_BIG_ENDIAN(jset), write,
- NULL, bkey_to_packed(k));
+ bch2_bkey_compat(level, btree_id, version, big_endian,
+ write, NULL, bkey_to_packed(k));
invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k),
__btree_node_type(level, btree_id));
char buf[160];
bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k));
- mustfix_fsck_err(c, "invalid %s in journal: %s\n%s",
- type, invalid, buf);
+ mustfix_fsck_err(c, "invalid %s in %s entry offset %zi/%u: %s\n%s",
+ type, where,
+ (u64 *) k - entry->_data,
+ le16_to_cpu(entry->u64s),
+ invalid, buf);
- le16_add_cpu(&entry->u64s, -k->k.u64s);
+ le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
memmove(k, bkey_next(k), next - (void *) bkey_next(k));
journal_entry_null_range(vstruct_next(entry), next);
- return 0;
+ return FSCK_DELETED_KEY;
}
if (write)
- bch2_bkey_compat(level, btree_id, version,
- JSET_BIG_ENDIAN(jset), write,
- NULL, bkey_to_packed(k));
+ bch2_bkey_compat(level, btree_id, version, big_endian,
+ write, NULL, bkey_to_packed(k));
fsck_err:
return ret;
}
static int journal_entry_validate_btree_keys(struct bch_fs *c,
- struct jset *jset,
+ const char *where,
struct jset_entry *entry,
- int write)
+ unsigned version, int big_endian, int write)
{
- struct bkey_i *k;
+ struct bkey_i *k = entry->start;
- vstruct_for_each(entry, k) {
- int ret = journal_validate_key(c, jset, entry,
+ while (k != vstruct_last(entry)) {
+ int ret = journal_validate_key(c, where, entry,
entry->level,
entry->btree_id,
- k, "key", write);
- if (ret)
- return ret;
+ k, "key", version, big_endian, write);
+ if (ret == FSCK_DELETED_KEY)
+ continue;
+
+ k = bkey_next(k);
}
return 0;
}
static int journal_entry_validate_btree_root(struct bch_fs *c,
- struct jset *jset,
+ const char *where,
struct jset_entry *entry,
- int write)
+ unsigned version, int big_endian, int write)
{
struct bkey_i *k = entry->start;
int ret = 0;
return 0;
}
- return journal_validate_key(c, jset, entry, 1, entry->btree_id, k,
- "btree root", write);
+ return journal_validate_key(c, where, entry, 1, entry->btree_id, k,
+ "btree root", version, big_endian, write);
fsck_err:
return ret;
}
static int journal_entry_validate_prio_ptrs(struct bch_fs *c,
- struct jset *jset,
+ const char *where,
struct jset_entry *entry,
- int write)
+ unsigned version, int big_endian, int write)
{
/* obsolete, don't care: */
return 0;
}
static int journal_entry_validate_blacklist(struct bch_fs *c,
- struct jset *jset,
+ const char *where,
struct jset_entry *entry,
- int write)
+ unsigned version, int big_endian, int write)
{
int ret = 0;
}
static int journal_entry_validate_blacklist_v2(struct bch_fs *c,
- struct jset *jset,
+ const char *where,
struct jset_entry *entry,
- int write)
+ unsigned version, int big_endian, int write)
{
struct jset_entry_blacklist_v2 *bl_entry;
int ret = 0;
}
static int journal_entry_validate_usage(struct bch_fs *c,
- struct jset *jset,
+ const char *where,
struct jset_entry *entry,
- int write)
+ unsigned version, int big_endian, int write)
{
struct jset_entry_usage *u =
container_of(entry, struct jset_entry_usage, entry);
}
static int journal_entry_validate_data_usage(struct bch_fs *c,
- struct jset *jset,
+ const char *where,
struct jset_entry *entry,
- int write)
+ unsigned version, int big_endian, int write)
{
struct jset_entry_data_usage *u =
container_of(entry, struct jset_entry_data_usage, entry);
return ret;
}
+static int journal_entry_validate_clock(struct bch_fs *c,
+ const char *where,
+ struct jset_entry *entry,
+ unsigned version, int big_endian, int write)
+{
+ struct jset_entry_clock *clock =
+ container_of(entry, struct jset_entry_clock, entry);
+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+ int ret = 0;
+
+ if (journal_entry_err_on(bytes != sizeof(*clock),
+ c, "invalid journal entry clock: bad size")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+ if (journal_entry_err_on(clock->rw > 1,
+ c, "invalid journal entry clock: bad rw")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+fsck_err:
+ return ret;
+}
+
+static int journal_entry_validate_dev_usage(struct bch_fs *c,
+ const char *where,
+ struct jset_entry *entry,
+ unsigned version, int big_endian, int write)
+{
+ struct jset_entry_dev_usage *u =
+ container_of(entry, struct jset_entry_dev_usage, entry);
+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+ unsigned expected = sizeof(*u) + sizeof(u->d[0]) * 7; /* Current value of BCH_DATA_NR */
+ unsigned dev;
+ int ret = 0;
+
+ if (journal_entry_err_on(bytes < expected,
+ c, "invalid journal entry dev usage: bad size (%u < %u)",
+ bytes, expected)) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+ dev = le32_to_cpu(u->dev);
+
+ if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
+ c, "invalid journal entry dev usage: bad dev")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+ if (journal_entry_err_on(u->pad,
+ c, "invalid journal entry dev usage: bad pad")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+fsck_err:
+ return ret;
+}
+
struct jset_entry_ops {
- int (*validate)(struct bch_fs *, struct jset *,
- struct jset_entry *, int);
+ int (*validate)(struct bch_fs *, const char *,
+ struct jset_entry *, unsigned, int, int);
};
static const struct jset_entry_ops bch2_jset_entry_ops[] = {
#undef x
};
-static int journal_entry_validate(struct bch_fs *c, struct jset *jset,
- struct jset_entry *entry, int write)
+int bch2_journal_entry_validate(struct bch_fs *c, const char *where,
+ struct jset_entry *entry,
+ unsigned version, int big_endian, int write)
{
return entry->type < BCH_JSET_ENTRY_NR
- ? bch2_jset_entry_ops[entry->type].validate(c, jset,
- entry, write)
+ ? bch2_jset_entry_ops[entry->type].validate(c, where, entry,
+ version, big_endian, write)
: 0;
}
static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
int write)
{
+ char buf[100];
struct jset_entry *entry;
int ret = 0;
vstruct_for_each(jset, entry) {
+ scnprintf(buf, sizeof(buf), "jset %llu entry offset %zi/%u",
+ le64_to_cpu(jset->seq),
+ (u64 *) entry - jset->_data,
+ le32_to_cpu(jset->u64s));
+
if (journal_entry_err_on(vstruct_next(entry) >
vstruct_last(jset), c,
"journal entry extends past end of jset")) {
break;
}
- ret = journal_entry_validate(c, jset, entry, write);
+ ret = bch2_journal_entry_validate(c, buf, entry,
+ le32_to_cpu(jset->version),
+ JSET_BIG_ENDIAN(jset), write);
if (ret)
break;
}
version < bcachefs_metadata_version_min) ||
version >= bcachefs_metadata_version_max, c,
"%s sector %llu seq %llu: unknown journal entry version %u",
- ca->name, sector, le64_to_cpu(jset->seq),
+ ca ? ca->name : c->name,
+ sector, le64_to_cpu(jset->seq),
version)) {
- /* XXX: note we might have missing journal entries */
- return JOURNAL_ENTRY_BAD;
+ /* don't try to continue: */
+ return EINVAL;
}
+ if (bytes > (sectors_read << 9) &&
+ sectors_read < bucket_sectors_left)
+ return JOURNAL_ENTRY_REREAD;
+
if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c,
"%s sector %llu seq %llu: journal entry too big (%zu bytes)",
- ca->name, sector, le64_to_cpu(jset->seq), bytes)) {
- /* XXX: note we might have missing journal entries */
- return JOURNAL_ENTRY_BAD;
+ ca ? ca->name : c->name,
+ sector, le64_to_cpu(jset->seq), bytes)) {
+ ret = JOURNAL_ENTRY_BAD;
+ le32_add_cpu(&jset->u64s,
+ -((bytes - (bucket_sectors_left << 9)) / 8));
}
- if (bytes > sectors_read << 9)
- return JOURNAL_ENTRY_REREAD;
-
- if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c,
+ if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c,
"%s sector %llu seq %llu: journal entry with unknown csum type %llu",
- ca->name, sector, le64_to_cpu(jset->seq),
- JSET_CSUM_TYPE(jset)))
- return JOURNAL_ENTRY_BAD;
+ ca ? ca->name : c->name,
+ sector, le64_to_cpu(jset->seq),
+ JSET_CSUM_TYPE(jset))) {
+ ret = JOURNAL_ENTRY_BAD;
+ goto csum_done;
+ }
+
+ if (write)
+ goto csum_done;
csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset);
if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c,
"%s sector %llu seq %llu: journal checksum bad",
- ca->name, sector, le64_to_cpu(jset->seq))) {
- /* XXX: retry IO, when we start retrying checksum errors */
- /* XXX: note we might have missing journal entries */
- return JOURNAL_ENTRY_BAD;
- }
+ ca ? ca->name : c->name,
+ sector, le64_to_cpu(jset->seq)))
+ ret = JOURNAL_ENTRY_BAD;
bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
jset->encrypted_start,
vstruct_end(jset) - (void *) jset->encrypted_start);
-
- if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c,
- "invalid journal entry: last_seq > seq")) {
+csum_done:
+ /* last_seq is ignored when JSET_NO_FLUSH is true */
+ if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
+ le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c,
+ "invalid journal entry: last_seq > seq (%llu > %llu)",
+ le64_to_cpu(jset->last_seq),
+ le64_to_cpu(jset->seq))) {
jset->last_seq = jset->seq;
return JOURNAL_ENTRY_BAD;
}
-
- return 0;
fsck_err:
return ret;
}
+static int jset_validate_for_write(struct bch_fs *c, struct jset *jset)
+{
+ unsigned sectors = vstruct_sectors(jset, c->block_bits);
+
+ return jset_validate(c, NULL, jset, 0, sectors, sectors, WRITE) ?:
+ jset_validate_entries(c, jset, WRITE);
+}
+
struct journal_read_buf {
void *data;
size_t size;
bio_put(bio);
if (bch2_dev_io_err_on(ret, ca,
- "journal read from sector %llu",
+ "journal read error: sector %llu",
offset) ||
- bch2_meta_read_fault("journal"))
- return -EIO;
+ bch2_meta_read_fault("journal")) {
+ /*
+ * We don't error out of the recovery process
+ * here, since the relevant journal entry may be
+ * found on a different device, and missing or
+ * no journal entries will be handled later
+ */
+ return 0;
+ }
j = buf->data;
}
ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
mutex_lock(&jlist->lock);
- ret = journal_entry_add(c, ca, jlist, j, ret != 0);
+ ret = journal_entry_add(c, ca, (struct bch_extent_ptr) {
+ .dev = ca->dev_idx,
+ .offset = offset,
+ }, jlist, j, ret != 0);
mutex_unlock(&jlist->lock);
switch (ret) {
goto out;
}
-int bch2_journal_read(struct bch_fs *c, struct list_head *list)
+static void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
+ struct journal_replay *j)
+{
+ unsigned i;
+
+ for (i = 0; i < j->nr_ptrs; i++) {
+ struct bch_dev *ca = c->devs[j->ptrs[i].dev];
+ u64 offset;
+
+ div64_u64_rem(j->ptrs[i].offset, ca->mi.bucket_size, &offset);
+
+ if (i)
+ pr_buf(out, " ");
+ pr_buf(out, "%u:%llu (offset %llu)",
+ j->ptrs[i].dev,
+ (u64) j->ptrs[i].offset, offset);
+ }
+}
+
+int bch2_journal_read(struct bch_fs *c, struct list_head *list,
+ u64 *blacklist_seq, u64 *start_seq)
{
struct journal_list jlist;
- struct journal_replay *i;
+ struct journal_replay *i, *t;
struct bch_dev *ca;
unsigned iter;
size_t keys = 0, entries = 0;
bool degraded = false;
+ u64 seq, last_seq = 0;
int ret = 0;
closure_init_stack(&jlist.cl);
!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
continue;
- if ((ca->mi.state == BCH_MEMBER_STATE_RW ||
- ca->mi.state == BCH_MEMBER_STATE_RO) &&
+ if ((ca->mi.state == BCH_MEMBER_STATE_rw ||
+ ca->mi.state == BCH_MEMBER_STATE_ro) &&
percpu_ref_tryget(&ca->io_ref))
closure_call(&ca->journal.read,
bch2_journal_read_device,
if (jlist.ret)
return jlist.ret;
+ if (list_empty(list)) {
+ bch_info(c, "journal read done, but no entries found");
+ return 0;
+ }
+
+ i = list_last_entry(list, struct journal_replay, list);
+ *start_seq = le64_to_cpu(i->j.seq) + 1;
+
+ /*
+ * Find most recent flush entry, and ignore newer non flush entries -
+ * those entries will be blacklisted:
+ */
+ list_for_each_entry_safe_reverse(i, t, list, list) {
+ if (i->ignore)
+ continue;
+
+ if (!JSET_NO_FLUSH(&i->j)) {
+ last_seq = le64_to_cpu(i->j.last_seq);
+ *blacklist_seq = le64_to_cpu(i->j.seq) + 1;
+ break;
+ }
+
+ journal_replay_free(c, i);
+ }
+
+ if (!last_seq) {
+ fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
+ return -1;
+ }
+
+ /* Drop blacklisted entries and entries older than last_seq: */
+ list_for_each_entry_safe(i, t, list, list) {
+ if (i->ignore)
+ continue;
+
+ seq = le64_to_cpu(i->j.seq);
+ if (seq < last_seq) {
+ journal_replay_free(c, i);
+ continue;
+ }
+
+ if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
+ fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
+ "found blacklisted journal entry %llu", seq);
+
+ journal_replay_free(c, i);
+ }
+ }
+
+ /* Check for missing entries: */
+ seq = last_seq;
+ list_for_each_entry(i, list, list) {
+ if (i->ignore)
+ continue;
+
+ BUG_ON(seq > le64_to_cpu(i->j.seq));
+
+ while (seq < le64_to_cpu(i->j.seq)) {
+ u64 missing_start, missing_end;
+ char buf1[200], buf2[200];
+
+ while (seq < le64_to_cpu(i->j.seq) &&
+ bch2_journal_seq_is_blacklisted(c, seq, false))
+ seq++;
+
+ if (seq == le64_to_cpu(i->j.seq))
+ break;
+
+ missing_start = seq;
+
+ while (seq < le64_to_cpu(i->j.seq) &&
+ !bch2_journal_seq_is_blacklisted(c, seq, false))
+ seq++;
+
+ if (i->list.prev != list) {
+ struct printbuf out = PBUF(buf1);
+ struct journal_replay *p = list_prev_entry(i, list);
+
+ bch2_journal_ptrs_to_text(&out, c, p);
+ pr_buf(&out, " size %llu", vstruct_sectors(&p->j, c->block_bits));
+ } else
+ sprintf(buf1, "(none)");
+ bch2_journal_ptrs_to_text(&PBUF(buf2), c, i);
+
+ missing_end = seq - 1;
+ fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
+ " prev at %s\n"
+ " next at %s",
+ missing_start, missing_end,
+ last_seq, *blacklist_seq - 1,
+ buf1, buf2);
+ }
+
+ seq++;
+ }
+
list_for_each_entry(i, list, list) {
struct jset_entry *entry;
struct bkey_i *k, *_n;
- struct bch_replicas_padded replicas;
+ struct bch_replicas_padded replicas = {
+ .e.data_type = BCH_DATA_journal,
+ .e.nr_required = 1,
+ };
+ unsigned ptr;
char buf[80];
+ if (i->ignore)
+ continue;
+
ret = jset_validate_entries(c, &i->j, READ);
if (ret)
goto fsck_err;
+ for (ptr = 0; ptr < i->nr_ptrs; ptr++)
+ replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev;
+
+ bch2_replicas_entry_sort(&replicas.e);
+
/*
* If we're mounting in degraded mode - if we didn't read all
* the devices - this is wrong:
*/
- bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, i->devs);
-
if (!degraded &&
(test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c,
entries++;
}
- if (!list_empty(list)) {
- i = list_last_entry(list, struct journal_replay, list);
+ bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
+ keys, entries, *start_seq);
- bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
- keys, entries, le64_to_cpu(i->j.seq));
- }
+ if (*start_seq != *blacklist_seq)
+ bch_info(c, "dropped unflushed entries %llu-%llu",
+ *blacklist_seq, *start_seq - 1);
fsck_err:
return ret;
}
* it:
*/
if (!ca->mi.durability ||
- ca->mi.state != BCH_MEMBER_STATE_RW ||
+ ca->mi.state != BCH_MEMBER_STATE_rw ||
!ja->nr ||
bch2_bkey_has_device(bkey_i_to_s_c(&w->key),
ca->dev_idx) ||
unsigned sectors)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_devs_mask devs;
struct journal_device *ja;
struct bch_dev *ca;
struct dev_alloc_list devs_sorted;
+ unsigned target = c->opts.metadata_target ?:
+ c->opts.foreground_target;
unsigned i, replicas = 0, replicas_want =
READ_ONCE(c->opts.metadata_replicas);
rcu_read_lock();
+retry:
+ devs = target_rw_devs(c, BCH_DATA_journal, target);
- devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe,
- &c->rw_devs[BCH_DATA_journal]);
+ devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);
__journal_write_alloc(j, w, &devs_sorted,
sectors, &replicas, replicas_want);
__journal_write_alloc(j, w, &devs_sorted,
sectors, &replicas, replicas_want);
+
+ if (replicas < replicas_want && target) {
+ /* Retry from all devices: */
+ target = 0;
+ goto retry;
+ }
done:
rcu_read_unlock();
+ BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);
+
return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
}
return;
memcpy(new_buf, buf->data, buf->buf_size);
- kvpfree(buf->data, buf->buf_size);
- buf->data = new_buf;
- buf->buf_size = new_size;
+
+ spin_lock(&j->lock);
+ swap(buf->data, new_buf);
+ swap(buf->buf_size, new_size);
+ spin_unlock(&j->lock);
+
+ kvpfree(new_buf, new_size);
+}
+
+static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
+{
+ return j->buf + j->reservations.unwritten_idx;
}
static void journal_write_done(struct closure *cl)
{
struct journal *j = container_of(cl, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_buf *w = journal_prev_buf(j);
+ struct journal_buf *w = journal_last_unwritten_buf(j);
struct bch_devs_list devs =
bch2_bkey_devs(bkey_i_to_s_c(&w->key));
struct bch_replicas_padded replicas;
- u64 seq = le64_to_cpu(w->data->seq);
- u64 last_seq = le64_to_cpu(w->data->last_seq);
+ union journal_res_state old, new;
+ u64 v, seq, last_seq;
+ int err = 0;
bch2_time_stats_update(j->write_time, j->write_start_time);
if (!devs.nr) {
bch_err(c, "unable to write journal to sufficient devices");
- goto err;
+ err = -EIO;
+ } else {
+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs);
+ if (bch2_mark_replicas(c, &replicas.e))
+ err = -EIO;
}
- bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs);
-
- if (bch2_mark_replicas(c, &replicas.e))
- goto err;
+ if (err)
+ bch2_fatal_error(c);
spin_lock(&j->lock);
+ seq = le64_to_cpu(w->data->seq);
+ last_seq = le64_to_cpu(w->data->last_seq);
+
if (seq >= j->pin.front)
journal_seq_pin(j, seq)->devs = devs;
j->seq_ondisk = seq;
- j->last_seq_ondisk = last_seq;
- bch2_journal_space_available(j);
+ if (err && (!j->err_seq || seq < j->err_seq))
+ j->err_seq = seq;
+
+ if (!JSET_NO_FLUSH(w->data)) {
+ j->flushed_seq_ondisk = seq;
+ j->last_seq_ondisk = last_seq;
+ }
/*
* Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
* Must come before signaling write completion, for
* bch2_fs_journal_stop():
*/
- mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0);
-out:
+ journal_reclaim_kick(&c->journal);
+
/* also must come before signalling write completion: */
closure_debug_destroy(cl);
- BUG_ON(!j->reservations.prev_buf_unwritten);
- atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v,
- &j->reservations.counter);
+ v = atomic64_read(&j->reservations.counter);
+ do {
+ old.v = new.v = v;
+ BUG_ON(new.idx == new.unwritten_idx);
+
+ new.unwritten_idx++;
+ } while ((v = atomic64_cmpxchg(&j->reservations.counter,
+ old.v, new.v)) != old.v);
+
+ bch2_journal_space_available(j);
closure_wake_up(&w->wait);
journal_wake(j);
if (test_bit(JOURNAL_NEED_WRITE, &j->flags))
mod_delayed_work(system_freezable_wq, &j->write_work, 0);
spin_unlock(&j->lock);
- return;
-err:
- bch2_fatal_error(c);
- spin_lock(&j->lock);
- goto out;
+
+ if (new.unwritten_idx != new.idx &&
+ !journal_state_count(new, new.unwritten_idx))
+ closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL);
}
static void journal_write_endio(struct bio *bio)
struct bch_dev *ca = bio->bi_private;
struct journal *j = &ca->fs->journal;
- if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write: %s",
+ if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write error: %s",
bch2_blk_status_to_str(bio->bi_status)) ||
bch2_meta_write_fault("journal")) {
- struct journal_buf *w = journal_prev_buf(j);
+ struct journal_buf *w = journal_last_unwritten_buf(j);
unsigned long flags;
spin_lock_irqsave(&j->err_lock, flags);
percpu_ref_put(&ca->io_ref);
}
+static void do_journal_write(struct closure *cl)
+{
+ struct journal *j = container_of(cl, struct journal, io);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_dev *ca;
+ struct journal_buf *w = journal_last_unwritten_buf(j);
+ struct bch_extent_ptr *ptr;
+ struct bio *bio;
+ unsigned sectors = vstruct_sectors(w->data, c->block_bits);
+
+ extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
+ ca = bch_dev_bkey_exists(c, ptr->dev);
+ if (!percpu_ref_tryget(&ca->io_ref)) {
+ /* XXX: fix this */
+ bch_err(c, "missing device for journal write\n");
+ continue;
+ }
+
+ this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
+ sectors);
+
+ bio = ca->journal.bio;
+ bio_reset(bio);
+ bio_set_dev(bio, ca->disk_sb.bdev);
+ bio->bi_iter.bi_sector = ptr->offset;
+ bio->bi_end_io = journal_write_endio;
+ bio->bi_private = ca;
+ bio->bi_opf = REQ_OP_WRITE|REQ_SYNC|REQ_META;
+
+ BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
+ ca->prev_journal_sector = bio->bi_iter.bi_sector;
+
+ if (!JSET_NO_FLUSH(w->data))
+ bio->bi_opf |= REQ_FUA;
+ if (!JSET_NO_FLUSH(w->data) && !w->separate_flush)
+ bio->bi_opf |= REQ_PREFLUSH;
+
+ bch2_bio_map(bio, w->data, sectors << 9);
+
+ trace_journal_write(bio);
+ closure_bio_submit(bio, cl);
+
+ ca->journal.bucket_seq[ca->journal.cur_idx] =
+ le64_to_cpu(w->data->seq);
+ }
+
+ continue_at(cl, journal_write_done, system_highpri_wq);
+ return;
+}
+
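+ /*
+ * Flush handling is split between do_journal_write() above and
+ * bch2_journal_write() below: flush (non-JSET_NO_FLUSH) writes always carry
+ * REQ_FUA, and carry REQ_PREFLUSH themselves only when separate_flush is
+ * unset; with more than one rw member, separate_flush is set and a dedicated
+ * flush bio is submitted to every rw device before the journal data writes.
+ */
+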
void bch2_journal_write(struct closure *cl)
{
struct journal *j = container_of(cl, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
- struct journal_buf *w = journal_prev_buf(j);
+ struct journal_buf *w = journal_last_unwritten_buf(j);
struct jset_entry *start, *end;
struct jset *jset;
struct bio *bio;
- struct bch_extent_ptr *ptr;
+ char *journal_debug_buf = NULL;
bool validate_before_checksum = false;
- unsigned i, sectors, bytes, u64s;
+ unsigned i, sectors, bytes, u64s, nr_rw_members = 0;
int ret;
- bch2_journal_pin_put(j, le64_to_cpu(w->data->seq));
+ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
journal_buf_realloc(j, w);
jset = w->data;
j->write_start_time = local_clock();
+ spin_lock(&j->lock);
+ if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) &&
+ !w->must_flush &&
+ (jiffies - j->last_flush_write) < msecs_to_jiffies(j->write_delay_ms) &&
+ test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) {
+ w->noflush = true;
+ SET_JSET_NO_FLUSH(jset, true);
+ jset->last_seq = 0;
+
+ j->nr_noflush_writes++;
+ } else {
+ j->last_flush_write = jiffies;
+ j->nr_flush_writes++;
+ }
+ spin_unlock(&j->lock);
+
/*
* New btree roots are set by journalling them; when the journal entry
* gets written we have to propagate them to c->btree_roots
end = bch2_btree_roots_to_journal_entries(c, jset->start, end);
- end = bch2_journal_super_entries_add_common(c, end,
- le64_to_cpu(jset->seq));
+ bch2_journal_super_entries_add_common(c, &end,
+ le64_to_cpu(jset->seq));
u64s = (u64 *) end - (u64 *) start;
BUG_ON(u64s > j->entry_u64s_reserved);
journal_write_compact(jset);
- jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand);
- jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand);
jset->magic = cpu_to_le64(jset_magic(c));
-
jset->version = c->sb.version < bcachefs_metadata_version_new_versioning
? cpu_to_le32(BCH_JSET_VERSION_OLD)
: cpu_to_le32(c->sb.version);
SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
+ if (journal_entry_empty(jset))
+ j->last_empty_seq = le64_to_cpu(jset->seq);
+
if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
validate_before_checksum = true;
- if (le32_to_cpu(jset->version) < bcachefs_metadata_version_max)
+ if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current)
validate_before_checksum = true;
if (validate_before_checksum &&
- jset_validate_entries(c, jset, WRITE))
+ jset_validate_for_write(c, jset))
goto err;
bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
journal_nonce(jset), jset);
if (!validate_before_checksum &&
- jset_validate_entries(c, jset, WRITE))
+ jset_validate_for_write(c, jset))
goto err;
sectors = vstruct_sectors(jset, c->block_bits);
goto retry_alloc;
}
+ if (ret) {
+ journal_debug_buf = kmalloc(4096, GFP_ATOMIC);
+ if (journal_debug_buf)
+ __bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j);
+ }
+
/*
* write is allocated, no longer need to account for it in
* bch2_journal_space_available():
spin_unlock(&j->lock);
if (ret) {
- bch_err(c, "Unable to allocate journal write");
+ bch_err(c, "Unable to allocate journal write:\n%s",
+ journal_debug_buf);
+ kfree(journal_debug_buf);
bch2_fatal_error(c);
continue_at(cl, journal_write_done, system_highpri_wq);
return;
if (c->opts.nochanges)
goto no_io;
- extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
- ca = bch_dev_bkey_exists(c, ptr->dev);
- if (!percpu_ref_tryget(&ca->io_ref)) {
- /* XXX: fix this */
- bch_err(c, "missing device for journal write\n");
- continue;
- }
-
- this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
- sectors);
-
- bio = ca->journal.bio;
- bio_reset(bio);
- bio_set_dev(bio, ca->disk_sb.bdev);
- bio->bi_iter.bi_sector = ptr->offset;
- bio->bi_end_io = journal_write_endio;
- bio->bi_private = ca;
- bio_set_op_attrs(bio, REQ_OP_WRITE,
- REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
- bch2_bio_map(bio, jset, sectors << 9);
-
- trace_journal_write(bio);
- closure_bio_submit(bio, cl);
+ for_each_rw_member(ca, c, i)
+ nr_rw_members++;
- ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq);
- }
+ if (nr_rw_members > 1)
+ w->separate_flush = true;
- for_each_rw_member(ca, c, i)
- if (journal_flushes_device(ca) &&
- !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
+ if (!JSET_NO_FLUSH(jset) && w->separate_flush) {
+ for_each_rw_member(ca, c, i) {
percpu_ref_get(&ca->io_ref);
bio = ca->journal.bio;
bio->bi_private = ca;
closure_bio_submit(bio, cl);
}
+ }
+
+ bch2_bucket_seq_cleanup(c);
+ continue_at(cl, do_journal_write, system_highpri_wq);
+ return;
no_io:
bch2_bucket_seq_cleanup(c);
*/
struct journal_replay {
struct list_head list;
- struct bch_devs_list devs;
+ struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX];
+ unsigned nr_ptrs;
+
/* checksum error, but we may want to try using it anyways: */
bool bad;
+ bool ignore;
/* must be last: */
struct jset j;
};
for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \
vstruct_for_each_safe(entry, k, _n)
-int bch2_journal_read(struct bch_fs *, struct list_head *);
+int bch2_journal_entry_validate(struct bch_fs *, const char *, struct jset_entry *,
+ unsigned, int, int);
+
+int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *);
void bch2_journal_write(struct closure *);
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "btree_key_cache.h"
+#include "error.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "replicas.h"
#include "super.h"
+#include <linux/kthread.h>
+#include <linux/sched/mm.h>
+#include <trace/events/bcachefs.h>
+
/* Free space calculations: */
static unsigned journal_space_from(struct journal_device *ja,
old.v, new.v)) != old.v);
}
-static struct journal_space {
- unsigned next_entry;
- unsigned remaining;
-} __journal_space_available(struct journal *j, unsigned nr_devs_want,
- enum journal_space_from from)
+static inline unsigned get_unwritten_sectors(struct journal *j, unsigned *idx)
{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_dev *ca;
- unsigned sectors_next_entry = UINT_MAX;
- unsigned sectors_total = UINT_MAX;
- unsigned i, nr_devs = 0;
- unsigned unwritten_sectors = j->reservations.prev_buf_unwritten
- ? journal_prev_buf(j)->sectors
- : 0;
+ unsigned sectors = 0;
- rcu_read_lock();
- for_each_member_device_rcu(ca, c, i,
- &c->rw_devs[BCH_DATA_journal]) {
- struct journal_device *ja = &ca->journal;
- unsigned buckets_this_device, sectors_this_device;
+ while (!sectors && *idx != j->reservations.idx) {
+ sectors = j->buf[*idx].sectors;
- if (!ja->nr)
- continue;
+ *idx = (*idx + 1) & JOURNAL_BUF_MASK;
+ }
- buckets_this_device = bch2_journal_dev_buckets_available(j, ja, from);
- sectors_this_device = ja->sectors_free;
+ return sectors;
+}
- /*
- * We that we don't allocate the space for a journal entry
- * until we write it out - thus, account for it here:
- */
- if (unwritten_sectors >= sectors_this_device) {
- if (!buckets_this_device)
- continue;
+static struct journal_space
+journal_dev_space_available(struct journal *j, struct bch_dev *ca,
+ enum journal_space_from from)
+{
+ struct journal_device *ja = &ca->journal;
+ unsigned sectors, buckets, unwritten, idx = j->reservations.unwritten_idx;
- buckets_this_device--;
- sectors_this_device = ca->mi.bucket_size;
- }
+ if (from == journal_space_total)
+ return (struct journal_space) {
+ .next_entry = ca->mi.bucket_size,
+ .total = ca->mi.bucket_size * ja->nr,
+ };
- sectors_this_device -= unwritten_sectors;
+ buckets = bch2_journal_dev_buckets_available(j, ja, from);
+ sectors = ja->sectors_free;
- if (sectors_this_device < ca->mi.bucket_size &&
- buckets_this_device) {
- buckets_this_device--;
- sectors_this_device = ca->mi.bucket_size;
+ /*
+	 * Note that we don't allocate the space for a journal entry
+ * until we write it out - thus, account for it here:
+ */
+ while ((unwritten = get_unwritten_sectors(j, &idx))) {
+ if (unwritten >= sectors) {
+ if (!buckets) {
+ sectors = 0;
+ break;
+ }
+
+ buckets--;
+ sectors = ca->mi.bucket_size;
}
- if (!sectors_this_device)
+ sectors -= unwritten;
+ }
+
+ if (sectors < ca->mi.bucket_size && buckets) {
+ buckets--;
+ sectors = ca->mi.bucket_size;
+ }
+
+ return (struct journal_space) {
+ .next_entry = sectors,
+ .total = sectors + buckets * ca->mi.bucket_size,
+ };
+}
+
+static struct journal_space __journal_space_available(struct journal *j, unsigned nr_devs_want,
+ enum journal_space_from from)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_dev *ca;
+ unsigned i, pos, nr_devs = 0;
+ struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX];
+
+ BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space));
+
+ rcu_read_lock();
+ for_each_member_device_rcu(ca, c, i,
+ &c->rw_devs[BCH_DATA_journal]) {
+ if (!ca->journal.nr)
continue;
- sectors_next_entry = min(sectors_next_entry,
- sectors_this_device);
+ space = journal_dev_space_available(j, ca, from);
+ if (!space.next_entry)
+ continue;
- sectors_total = min(sectors_total,
- buckets_this_device * ca->mi.bucket_size +
- sectors_this_device);
+ for (pos = 0; pos < nr_devs; pos++)
+ if (space.total > dev_space[pos].total)
+ break;
- nr_devs++;
+ array_insert_item(dev_space, nr_devs, pos, space);
}
rcu_read_unlock();
if (nr_devs < nr_devs_want)
return (struct journal_space) { 0, 0 };
- return (struct journal_space) {
- .next_entry = sectors_next_entry,
- .remaining = max_t(int, 0, sectors_total - sectors_next_entry),
- };
+ /*
+ * We sorted largest to smallest, and we want the smallest out of the
+ * @nr_devs_want largest devices:
+ */
+ return dev_space[nr_devs_want - 1];
}
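A worked example may help with the device selection above; the numbers are invented:

/*
 * Illustrative example: per-device totals sorted largest to smallest are
 * { 100, 80, 60, 40 } sectors and nr_devs_want == 2; dev_space[1] (80
 * sectors) is returned - the smallest of the two largest devices, since
 * a replicated journal entry is limited by the most constrained device
 * it must be written to.
 */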
void bch2_journal_space_available(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
- struct journal_space discarded, clean_ondisk, clean;
- unsigned overhead, u64s_remaining = 0;
+ unsigned clean, clean_ondisk, total;
+ s64 u64s_remaining = 0;
unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
j->buf[1].buf_size >> 9);
unsigned i, nr_online = 0, nr_devs_want;
j->can_discard = can_discard;
if (nr_online < c->opts.metadata_replicas_required) {
- ret = -EROFS;
- goto out;
- }
-
- if (!fifo_free(&j->pin)) {
- ret = -ENOSPC;
+ ret = cur_entry_insufficient_devices;
goto out;
}
nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas);
- discarded = __journal_space_available(j, nr_devs_want, journal_space_discarded);
- clean_ondisk = __journal_space_available(j, nr_devs_want, journal_space_clean_ondisk);
- clean = __journal_space_available(j, nr_devs_want, journal_space_clean);
+ for (i = 0; i < journal_space_nr; i++)
+ j->space[i] = __journal_space_available(j, nr_devs_want, i);
+
+ clean_ondisk = j->space[journal_space_clean_ondisk].total;
+ clean = j->space[journal_space_clean].total;
+ total = j->space[journal_space_total].total;
- if (!discarded.next_entry)
- ret = -ENOSPC;
+ if (!clean_ondisk &&
+ j->reservations.idx ==
+ j->reservations.unwritten_idx) {
+ char *buf = kmalloc(4096, GFP_ATOMIC);
- overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) *
- journal_entry_overhead(j);
- u64s_remaining = clean.remaining << 6;
- u64s_remaining = max_t(int, 0, u64s_remaining - overhead);
+ bch_err(c, "journal stuck");
+ if (buf) {
+ __bch2_journal_debug_to_text(&_PBUF(buf, 4096), j);
+ pr_err("\n%s", buf);
+ kfree(buf);
+ }
+
+ bch2_fatal_error(c);
+ ret = cur_entry_journal_stuck;
+ } else if (!j->space[journal_space_discarded].next_entry)
+ ret = cur_entry_journal_full;
+ else if (!fifo_free(&j->pin))
+ ret = cur_entry_journal_pin_full;
+
+ if ((j->space[journal_space_clean_ondisk].next_entry <
+ j->space[journal_space_clean_ondisk].total) &&
+ (clean - clean_ondisk <= total / 8) &&
+	    (clean_ondisk * 2 > clean))
+ set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
+ else
+ clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
+
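The condition above that toggles JOURNAL_MAY_SKIP_FLUSH compares three space measurements; a minimal sketch of the size-based part of the predicate (hypothetical helper, ignoring the next_entry < total check) is:

static bool journal_may_skip_flush(unsigned clean, unsigned clean_ondisk,
				   unsigned total)
{
	/* little dirty-but-unflushed space, and most clean space already on disk */
	return clean - clean_ondisk <= total / 8 &&
	       clean_ondisk * 2 > clean;
}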
+ u64s_remaining = (u64) clean << 6;
+ u64s_remaining -= (u64) total << 3;
+ u64s_remaining = max(0LL, u64s_remaining);
u64s_remaining /= 4;
+ u64s_remaining = min_t(u64, u64s_remaining, U32_MAX);
out:
- j->cur_entry_sectors = !ret ? discarded.next_entry : 0;
+ j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0;
j->cur_entry_error = ret;
journal_set_remaining(j, u64s_remaining);
journal_check_may_get_unreserved(j);
while (!fifo_empty(&j->pin) &&
!atomic_read(&fifo_peek_front(&j->pin).count)) {
BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
+ BUG_ON(!list_empty(&fifo_peek_front(&j->pin).flushed));
BUG_ON(!fifo_pop(&j->pin, temp));
popped = true;
}
bch2_journal_space_available(j);
}
+void __bch2_journal_pin_put(struct journal *j, u64 seq)
+{
+ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
+
+ if (atomic_dec_and_test(&pin_list->count))
+ bch2_journal_reclaim_fast(j);
+}
+
void bch2_journal_pin_put(struct journal *j, u64 seq)
{
struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
if (!journal_pin_active(pin))
return;
+ if (j->flush_in_progress == pin)
+ j->flush_in_progress_dropped = true;
+
pin_list = journal_seq_pin(j, pin->seq);
pin->seq = 0;
list_del_init(&pin->list);
spin_unlock(&j->lock);
}
-static void bch2_journal_pin_add_locked(struct journal *j, u64 seq,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
+void bch2_journal_pin_set(struct journal *j, u64 seq,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
{
- struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
-
- __journal_pin_drop(j, pin);
-
- BUG_ON(!atomic_read(&pin_list->count) && seq == journal_last_seq(j));
-
- atomic_inc(&pin_list->count);
- pin->seq = seq;
- pin->flush = flush_fn;
-
- list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed);
-}
+ struct journal_entry_pin_list *pin_list;
-void __bch2_journal_pin_add(struct journal *j, u64 seq,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
-{
spin_lock(&j->lock);
- bch2_journal_pin_add_locked(j, seq, pin, flush_fn);
- spin_unlock(&j->lock);
- /*
- * If the journal is currently full, we might want to call flush_fn
- * immediately:
- */
- journal_wake(j);
-}
-
-void bch2_journal_pin_update(struct journal *j, u64 seq,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
-{
- if (journal_pin_active(pin) && pin->seq < seq)
+ if (seq < journal_last_seq(j)) {
+ /*
+ * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on
+ * the src pin - with the pin dropped, the entry to pin might no
+		 * longer exist, but that means there's no longer anything to
+ * copy and we can bail out here:
+ */
+ spin_unlock(&j->lock);
return;
+ }
- spin_lock(&j->lock);
+ pin_list = journal_seq_pin(j, seq);
- if (pin->seq != seq) {
- bch2_journal_pin_add_locked(j, seq, pin, flush_fn);
- } else {
- struct journal_entry_pin_list *pin_list =
- journal_seq_pin(j, seq);
+ __journal_pin_drop(j, pin);
- /*
- * If the pin is already pinning the right sequence number, it
- * still might've already been flushed:
- */
- list_move(&pin->list, &pin_list->list);
- }
+ atomic_inc(&pin_list->count);
+ pin->seq = seq;
+ pin->flush = flush_fn;
+ if (flush_fn == bch2_btree_key_cache_journal_flush)
+ list_add(&pin->list, &pin_list->key_cache_list);
+ else if (flush_fn)
+ list_add(&pin->list, &pin_list->list);
+ else
+ list_add(&pin->list, &pin_list->flushed);
spin_unlock(&j->lock);
/*
journal_wake(j);
}
-void bch2_journal_pin_copy(struct journal *j,
- struct journal_entry_pin *dst,
- struct journal_entry_pin *src,
- journal_pin_flush_fn flush_fn)
-{
- spin_lock(&j->lock);
-
- if (journal_pin_active(src) &&
- (!journal_pin_active(dst) || src->seq < dst->seq))
- bch2_journal_pin_add_locked(j, src->seq, dst, flush_fn);
-
- spin_unlock(&j->lock);
-}
-
/**
* bch2_journal_pin_flush: ensure journal pin callback is no longer running
*/
*/
static struct journal_entry_pin *
-journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq)
+journal_get_next_pin(struct journal *j,
+ bool get_any,
+ bool get_key_cache,
+ u64 max_seq, u64 *seq)
{
struct journal_entry_pin_list *pin_list;
struct journal_entry_pin *ret = NULL;
- if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags))
- return NULL;
-
- spin_lock(&j->lock);
-
- fifo_for_each_entry_ptr(pin_list, &j->pin, *seq)
- if (*seq > max_seq ||
- (ret = list_first_entry_or_null(&pin_list->list,
- struct journal_entry_pin, list)))
+ fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) {
+ if (*seq > max_seq && !get_any && !get_key_cache)
break;
- if (ret) {
- list_move(&ret->list, &pin_list->flushed);
- BUG_ON(j->flush_in_progress);
- j->flush_in_progress = ret;
- j->last_flushed = jiffies;
- }
+ if (*seq <= max_seq || get_any) {
+ ret = list_first_entry_or_null(&pin_list->list,
+ struct journal_entry_pin, list);
+ if (ret)
+ return ret;
+ }
- spin_unlock(&j->lock);
+ if (*seq <= max_seq || get_any || get_key_cache) {
+ ret = list_first_entry_or_null(&pin_list->key_cache_list,
+ struct journal_entry_pin, list);
+ if (ret)
+ return ret;
+ }
+ }
- return ret;
+ return NULL;
}
/* returns true if we did work */
-static bool journal_flush_pins(struct journal *j, u64 seq_to_flush,
- unsigned min_nr)
+static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush,
+ unsigned min_any,
+ unsigned min_key_cache)
{
struct journal_entry_pin *pin;
- bool ret = false;
+ size_t nr_flushed = 0;
+ journal_pin_flush_fn flush_fn;
u64 seq;
+ int err;
+
+ if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags))
+ return 0;
lockdep_assert_held(&j->reclaim_lock);
- while ((pin = journal_get_next_pin(j, min_nr
- ? U64_MAX : seq_to_flush, &seq))) {
- if (min_nr)
- min_nr--;
+ while (1) {
+ cond_resched();
+
+ j->last_flushed = jiffies;
+
+ spin_lock(&j->lock);
+ pin = journal_get_next_pin(j,
+ min_any != 0,
+ min_key_cache != 0,
+ seq_to_flush, &seq);
+ if (pin) {
+ BUG_ON(j->flush_in_progress);
+ j->flush_in_progress = pin;
+ j->flush_in_progress_dropped = false;
+ flush_fn = pin->flush;
+ }
+ spin_unlock(&j->lock);
+
+ if (!pin)
+ break;
- pin->flush(j, pin, seq);
+ if (min_key_cache && pin->flush == bch2_btree_key_cache_journal_flush)
+ min_key_cache--;
- BUG_ON(j->flush_in_progress != pin);
+ if (min_any)
+ min_any--;
+
+ err = flush_fn(j, pin, seq);
+
+ spin_lock(&j->lock);
+ /* Pin might have been dropped or rearmed: */
+ if (likely(!err && !j->flush_in_progress_dropped))
+ list_move(&pin->list, &journal_seq_pin(j, seq)->flushed);
j->flush_in_progress = NULL;
+ j->flush_in_progress_dropped = false;
+ spin_unlock(&j->lock);
+
wake_up(&j->pin_flush_wait);
- ret = true;
+
+ if (err)
+ break;
+
+ nr_flushed++;
}
- return ret;
+ return nr_flushed;
}
-/**
- * bch2_journal_reclaim - free up journal buckets
- *
- * Background journal reclaim writes out btree nodes. It should be run
- * early enough so that we never completely run out of journal buckets.
- *
- * High watermarks for triggering background reclaim:
- * - FIFO has fewer than 512 entries left
- * - fewer than 25% journal buckets free
- *
- * Background reclaim runs until low watermarks are reached:
- * - FIFO has more than 1024 entries left
- * - more than 50% journal buckets free
- *
- * As long as a reclaim can complete in the time it takes to fill up
- * 512 journal entries or 25% of all journal buckets, then
- * journal_next_bucket() should not stall.
- */
-void bch2_journal_reclaim(struct journal *j)
+static u64 journal_seq_to_flush(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
- unsigned iter, min_nr = 0;
u64 seq_to_flush = 0;
-
- lockdep_assert_held(&j->reclaim_lock);
-
- bch2_journal_do_discards(j);
+ unsigned iter;
spin_lock(&j->lock);
(j->pin.size >> 1));
spin_unlock(&j->lock);
+ return seq_to_flush;
+}
+
+/**
+ * bch2_journal_reclaim - free up journal buckets
+ *
+ * Background journal reclaim writes out btree nodes. It should be run
+ * early enough so that we never completely run out of journal buckets.
+ *
+ * High watermarks for triggering background reclaim:
+ * - FIFO has fewer than 512 entries left
+ * - fewer than 25% journal buckets free
+ *
+ * Background reclaim runs until low watermarks are reached:
+ * - FIFO has more than 1024 entries left
+ * - more than 50% journal buckets free
+ *
+ * As long as a reclaim can complete in the time it takes to fill up
+ * 512 journal entries or 25% of all journal buckets, then
+ * journal_next_bucket() should not stall.
+ */
+static int __bch2_journal_reclaim(struct journal *j, bool direct)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ bool kthread = (current->flags & PF_KTHREAD) != 0;
+ u64 seq_to_flush;
+ size_t min_nr, nr_flushed;
+ unsigned flags;
+ int ret = 0;
+
/*
- * If it's been longer than j->reclaim_delay_ms since we last flushed,
- * make sure to flush at least one journal pin:
+ * We can't invoke memory reclaim while holding the reclaim_lock -
+ * journal reclaim is required to make progress for memory reclaim
+ * (cleaning the caches), so we can't get stuck in memory reclaim while
+ * we're holding the reclaim lock:
*/
- if (time_after(jiffies, j->last_flushed +
- msecs_to_jiffies(j->reclaim_delay_ms)))
- min_nr = 1;
+ lockdep_assert_held(&j->reclaim_lock);
+ flags = memalloc_noreclaim_save();
+
+ do {
+ if (kthread && kthread_should_stop())
+ break;
+
+ if (bch2_journal_error(j)) {
+ ret = -EIO;
+ break;
+ }
+
+ bch2_journal_do_discards(j);
+
+ seq_to_flush = journal_seq_to_flush(j);
+ min_nr = 0;
+
+ /*
+ * If it's been longer than j->reclaim_delay_ms since we last flushed,
+ * make sure to flush at least one journal pin:
+ */
+ if (time_after(jiffies, j->last_flushed +
+ msecs_to_jiffies(j->reclaim_delay_ms)))
+ min_nr = 1;
+
+ if (j->prereserved.reserved * 2 > j->prereserved.remaining)
+ min_nr = 1;
+
+ if (fifo_free(&j->pin) <= 32)
+ min_nr = 1;
+
+ trace_journal_reclaim_start(c,
+ min_nr,
+ j->prereserved.reserved,
+ j->prereserved.remaining,
+ atomic_read(&c->btree_cache.dirty),
+ c->btree_cache.used,
+ atomic_long_read(&c->btree_key_cache.nr_dirty),
+ atomic_long_read(&c->btree_key_cache.nr_keys));
+
+ nr_flushed = journal_flush_pins(j, seq_to_flush,
+ min_nr,
+ min(bch2_nr_btree_keys_need_flush(c), 128UL));
+
+ if (direct)
+ j->nr_direct_reclaim += nr_flushed;
+ else
+ j->nr_background_reclaim += nr_flushed;
+ trace_journal_reclaim_finish(c, nr_flushed);
+
+ if (nr_flushed)
+ wake_up(&j->reclaim_wait);
+ } while (min_nr && nr_flushed && !direct);
+
+ memalloc_noreclaim_restore(flags);
- if (j->prereserved.reserved * 2 > j->prereserved.remaining) {
- seq_to_flush = max(seq_to_flush, journal_last_seq(j));
- min_nr = 1;
+ return ret;
+}
+
+int bch2_journal_reclaim(struct journal *j)
+{
+ return __bch2_journal_reclaim(j, true);
+}
+
+static int bch2_journal_reclaim_thread(void *arg)
+{
+ struct journal *j = arg;
+ unsigned long next;
+ int ret = 0;
+
+ set_freezable();
+
+ kthread_wait_freezable(test_bit(JOURNAL_RECLAIM_STARTED, &j->flags));
+
+ while (!ret && !kthread_should_stop()) {
+ j->reclaim_kicked = false;
+
+ mutex_lock(&j->reclaim_lock);
+ ret = __bch2_journal_reclaim(j, false);
+ mutex_unlock(&j->reclaim_lock);
+
+ next = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms);
+
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (kthread_should_stop())
+ break;
+ if (j->reclaim_kicked)
+ break;
+ if (time_after_eq(jiffies, next))
+ break;
+ schedule_timeout(next - jiffies);
+ try_to_freeze();
+ }
+ __set_current_state(TASK_RUNNING);
}
- journal_flush_pins(j, seq_to_flush, min_nr);
+ return 0;
+}
- if (!bch2_journal_error(j))
- queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work,
- msecs_to_jiffies(j->reclaim_delay_ms));
+void bch2_journal_reclaim_stop(struct journal *j)
+{
+ struct task_struct *p = j->reclaim_thread;
+
+ j->reclaim_thread = NULL;
+
+ if (p) {
+ kthread_stop(p);
+ put_task_struct(p);
+ }
}
-void bch2_journal_reclaim_work(struct work_struct *work)
+int bch2_journal_reclaim_start(struct journal *j)
{
- struct journal *j = container_of(to_delayed_work(work),
- struct journal, reclaim_work);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct task_struct *p;
- mutex_lock(&j->reclaim_lock);
- bch2_journal_reclaim(j);
- mutex_unlock(&j->reclaim_lock);
+ if (j->reclaim_thread)
+ return 0;
+
+ p = kthread_create(bch2_journal_reclaim_thread, j,
+ "bch-reclaim/%s", c->name);
+ if (IS_ERR(p)) {
+ bch_err(c, "error creating journal reclaim thread: %li", PTR_ERR(p));
+ return PTR_ERR(p);
+ }
+
+ get_task_struct(p);
+ j->reclaim_thread = p;
+ wake_up_process(p);
+ return 0;
}
static int journal_flush_done(struct journal *j, u64 seq_to_flush,
mutex_lock(&j->reclaim_lock);
- *did_work = journal_flush_pins(j, seq_to_flush, 0);
+ *did_work = journal_flush_pins(j, seq_to_flush, 0, 0) != 0;
spin_lock(&j->lock);
/*
#define JOURNAL_PIN (32 * 1024)
-enum journal_space_from {
- journal_space_discarded,
- journal_space_clean_ondisk,
- journal_space_clean,
-};
+static inline void journal_reclaim_kick(struct journal *j)
+{
+ struct task_struct *p = READ_ONCE(j->reclaim_thread);
+
+ if (p && !j->reclaim_kicked) {
+ j->reclaim_kicked = true;
+ if (p)
+ wake_up_process(p);
+ }
+}
unsigned bch2_journal_dev_buckets_available(struct journal *,
struct journal_device *,
return &j->pin.data[seq & j->pin.mask];
}
+void __bch2_journal_pin_put(struct journal *, u64);
void bch2_journal_pin_put(struct journal *, u64);
void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
-void __bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *,
- journal_pin_flush_fn);
+void bch2_journal_pin_set(struct journal *, u64, struct journal_entry_pin *,
+ journal_pin_flush_fn);
static inline void bch2_journal_pin_add(struct journal *j, u64 seq,
struct journal_entry_pin *pin,
journal_pin_flush_fn flush_fn)
{
if (unlikely(!journal_pin_active(pin) || pin->seq > seq))
- __bch2_journal_pin_add(j, seq, pin, flush_fn);
+ bch2_journal_pin_set(j, seq, pin, flush_fn);
}
-void bch2_journal_pin_update(struct journal *, u64,
- struct journal_entry_pin *,
- journal_pin_flush_fn);
+static inline void bch2_journal_pin_copy(struct journal *j,
+ struct journal_entry_pin *dst,
+ struct journal_entry_pin *src,
+ journal_pin_flush_fn flush_fn)
+{
+ /* Guard against racing with journal_pin_drop(src): */
+ u64 seq = READ_ONCE(src->seq);
-void bch2_journal_pin_copy(struct journal *,
- struct journal_entry_pin *,
- struct journal_entry_pin *,
- journal_pin_flush_fn);
+ if (seq)
+ bch2_journal_pin_add(j, seq, dst, flush_fn);
+}
+
+static inline void bch2_journal_pin_update(struct journal *j, u64 seq,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
+{
+ if (unlikely(!journal_pin_active(pin) || pin->seq < seq))
+ bch2_journal_pin_set(j, seq, pin, flush_fn);
+}
void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *);
void bch2_journal_do_discards(struct journal *);
-void bch2_journal_reclaim(struct journal *);
-void bch2_journal_reclaim_work(struct work_struct *);
+int bch2_journal_reclaim(struct journal *);
+
+void bch2_journal_reclaim_stop(struct journal *);
+int bch2_journal_reclaim_start(struct journal *);
bool bch2_journal_flush_pins(struct journal *, u64);
out:
mutex_unlock(&c->sb_lock);
- return ret;
+ return ret ?: bch2_blacklist_table_initialize(c);
}
static int journal_seq_blacklist_table_cmp(const void *_l,
struct journal_seq_blacklist_table *t;
unsigned i, nr = blacklist_nr_entries(bl);
- BUG_ON(c->journal_seq_blacklist_table);
-
if (!bl)
return 0;
journal_seq_blacklist_table_cmp,
NULL);
+ kfree(c->journal_seq_blacklist_table);
c->journal_seq_blacklist_table = t;
return 0;
}
#include "super_types.h"
#include "fifo.h"
-struct journal_res;
+#define JOURNAL_BUF_BITS 2
+#define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS)
+#define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1)
/*
- * We put two of these in struct journal; we used them for writes to the
- * journal that are being staged or in flight.
+ * We put JOURNAL_BUF_NR of these in struct journal; we use them for writes to
+ * the journal that are being staged or in flight.
*/
struct journal_buf {
struct jset *data;
- BKEY_PADDED(key);
+ __BKEY_PADDED(key, BCH_REPLICAS_MAX);
struct closure_waitlist wait;
unsigned disk_sectors; /* maximum size entry could have been, if
buf_size was bigger */
unsigned u64s_reserved;
+ bool noflush; /* write has already been kicked off, and was noflush */
+ bool must_flush; /* something wants a flush */
+ bool separate_flush;
/* bloom filter: */
unsigned long has_inode[1024 / sizeof(unsigned long)];
};
struct journal_entry_pin_list {
struct list_head list;
+ struct list_head key_cache_list;
struct list_head flushed;
atomic_t count;
struct bch_devs_list devs;
struct journal;
struct journal_entry_pin;
-typedef void (*journal_pin_flush_fn)(struct journal *j,
+typedef int (*journal_pin_flush_fn)(struct journal *j,
struct journal_entry_pin *, u64);
struct journal_entry_pin {
struct {
u64 cur_entry_offset:20,
- idx:1,
- prev_buf_unwritten:1,
- buf0_count:21,
- buf1_count:21;
+ idx:2,
+ unwritten_idx:2,
+ buf0_count:10,
+ buf1_count:10,
+ buf2_count:10,
+ buf3_count:10;
};
};
};
struct {
- u32 reserved;
- u32 remaining;
+ u64 waiting:1,
+ reserved:31,
+ remaining:32;
};
};
#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1)
#define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX)
+struct journal_space {
+	/* Units of 512-byte sectors: */
+ unsigned next_entry; /* How big the next journal entry can be */
+ unsigned total;
+};
+
+enum journal_space_from {
+ journal_space_discarded,
+ journal_space_clean_ondisk,
+ journal_space_clean,
+ journal_space_total,
+ journal_space_nr,
+};
+
/*
* JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP,
* either because something's waiting on the write to complete or because it's
JOURNAL_STARTED,
JOURNAL_RECLAIM_STARTED,
JOURNAL_NEED_WRITE,
- JOURNAL_NOT_EMPTY,
JOURNAL_MAY_GET_UNRESERVED,
+ JOURNAL_MAY_SKIP_FLUSH,
};
/* Embedded in struct bch_fs */
* 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if
* insufficient devices:
*/
- int cur_entry_error;
+ enum {
+ cur_entry_ok,
+ cur_entry_blocked,
+ cur_entry_journal_full,
+ cur_entry_journal_pin_full,
+ cur_entry_journal_stuck,
+ cur_entry_insufficient_devices,
+ } cur_entry_error;
union journal_preres_state prereserved;
* Two journal entries -- one is currently open for new entries, the
* other is possibly being written out.
*/
- struct journal_buf buf[2];
+ struct journal_buf buf[JOURNAL_BUF_NR];
spinlock_t lock;
/* seq, last_seq from the most recent journal entry successfully written */
u64 seq_ondisk;
+ u64 flushed_seq_ondisk;
u64 last_seq_ondisk;
+ u64 err_seq;
+ u64 last_empty_seq;
/*
* FIFO of journal entries whose btree updates have not yet been
struct journal_entry_pin_list *data;
} pin;
+ struct journal_space space[journal_space_nr];
+
u64 replay_journal_seq;
u64 replay_journal_seq_end;
struct write_point wp;
spinlock_t err_lock;
- struct delayed_work reclaim_work;
struct mutex reclaim_lock;
+ wait_queue_head_t reclaim_wait;
+ struct task_struct *reclaim_thread;
+ bool reclaim_kicked;
+ u64 nr_direct_reclaim;
+ u64 nr_background_reclaim;
+
unsigned long last_flushed;
struct journal_entry_pin *flush_in_progress;
+ bool flush_in_progress_dropped;
wait_queue_head_t pin_flush_wait;
/* protects advancing ja->discard_idx: */
unsigned write_delay_ms;
unsigned reclaim_delay_ms;
+ unsigned long last_flush_write;
u64 res_get_blocked_start;
u64 need_write_time;
u64 write_start_time;
+ u64 nr_flush_writes;
+ u64 nr_noflush_writes;
+
struct time_stats *write_time;
struct time_stats *delay_time;
struct time_stats *blocked_time;
*/
#include "bcachefs.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "buckets.h"
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
- struct bkey_on_stack sk;
+ struct bkey_buf sk;
int ret = 0;
- bkey_on_stack_init(&sk);
+ bch2_bkey_buf_init(&sk);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN,
while ((k = bch2_btree_iter_peek(iter)).k &&
!(ret = bkey_err(k))) {
if (!bch2_bkey_has_device(k, dev_idx)) {
- bch2_btree_iter_next(iter);
+ bch2_btree_iter_advance(iter);
continue;
}
- bkey_on_stack_reassemble(&sk, c, k);
+ bch2_bkey_buf_reassemble(&sk, c, k);
ret = drop_dev_ptrs(c, bkey_i_to_s(sk.k),
dev_idx, flags, false);
if (ret)
break;
}
+ bch2_trans_iter_put(&trans, iter);
ret = bch2_trans_exit(&trans) ?: ret;
- bkey_on_stack_exit(&sk, c);
+ bch2_bkey_buf_exit(&sk, c);
BUG_ON(ret == -EINTR);
static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
- return __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_EXTENTS) ?:
- __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_REFLINK);
+ return __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_extents) ?:
+ __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_reflink);
}
static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
struct btree_iter *iter;
struct closure cl;
struct btree *b;
+ struct bkey_buf k;
unsigned id;
int ret;
if (flags & BCH_FORCE_IF_METADATA_LOST)
return -EINVAL;
+ bch2_bkey_buf_init(&k);
bch2_trans_init(&trans, c, 0, 0);
closure_init_stack(&cl);
for (id = 0; id < BTREE_ID_NR; id++) {
for_each_btree_node(&trans, iter, id, POS_MIN,
BTREE_ITER_PREFETCH, b) {
- __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
retry:
if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key),
dev_idx))
continue;
- bkey_copy(&tmp.k, &b->key);
+ bch2_bkey_buf_copy(&k, c, &b->key);
- ret = drop_dev_ptrs(c, bkey_i_to_s(&tmp.k),
+ ret = drop_dev_ptrs(c, bkey_i_to_s(k.k),
dev_idx, flags, true);
if (ret) {
bch_err(c, "Cannot drop device without losing data");
- goto err;
+ break;
}
- ret = bch2_btree_node_update_key(c, iter, b, &tmp.k);
+ ret = bch2_btree_node_update_key(c, iter, b, k.k);
if (ret == -EINTR) {
b = bch2_btree_iter_peek_node(iter);
+ ret = 0;
goto retry;
}
if (ret) {
bch_err(c, "Error updating btree node key: %i", ret);
- goto err;
+ break;
}
}
bch2_trans_iter_free(&trans, iter);
+
+ if (ret)
+ goto err;
}
/* flush relevant btree updates */
ret = 0;
err:
ret = bch2_trans_exit(&trans) ?: ret;
+ bch2_bkey_buf_exit(&k, c);
BUG_ON(ret == -EINTR);
#include "bcachefs.h"
#include "alloc_foreground.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "btree_update_interior.h"
struct migrate_write *m =
container_of(op, struct migrate_write, op);
struct keylist *keys = &op->insert_keys;
+ struct bkey_buf _new, _insert;
int ret = 0;
+ bch2_bkey_buf_init(&_new);
+ bch2_bkey_buf_init(&_insert);
+ bch2_bkey_buf_realloc(&_insert, c, U8_MAX);
+
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
iter = bch2_trans_get_iter(&trans, m->btree_id,
struct bkey_s_c k;
struct bkey_i *insert;
struct bkey_i_extent *new;
- BKEY_PADDED(k) _new, _insert;
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
bool did_work = false;
- int nr;
+ bool extending = false, should_check_enospc;
+ s64 i_sectors_delta = 0, disk_sectors_delta = 0;
bch2_trans_reset(&trans, 0);
k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
- if (ret) {
- if (ret == -EINTR)
- continue;
- break;
- }
+ if (ret)
+ goto err;
new = bkey_i_to_extent(bch2_keylist_front(keys));
!bch2_bkey_matches_ptr(c, k, m->ptr, m->offset))
goto nomatch;
- bkey_reassemble(&_insert.k, k);
- insert = &_insert.k;
+ bkey_reassemble(_insert.k, k);
+ insert = _insert.k;
- bkey_copy(&_new.k, bch2_keylist_front(keys));
- new = bkey_i_to_extent(&_new.k);
+ bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys));
+ new = bkey_i_to_extent(_new.k);
bch2_cut_front(iter->pos, &new->k_i);
bch2_cut_front(iter->pos, insert);
op->opts.background_target,
op->opts.data_replicas);
- /*
- * If we're not fully overwriting @k, and it's compressed, we
- * need a reservation for all the pointers in @insert
- */
- nr = bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(insert)) -
- m->nr_ptrs_reserved;
+ ret = bch2_sum_sector_overwrites(&trans, iter, insert,
+ &extending,
+ &should_check_enospc,
+ &i_sectors_delta,
+ &disk_sectors_delta);
+ if (ret)
+ goto err;
- if (insert->k.size < k.k->size &&
- bch2_bkey_sectors_compressed(k) &&
- nr > 0) {
+ if (disk_sectors_delta > (s64) op->res.sectors) {
ret = bch2_disk_reservation_add(c, &op->res,
- keylist_sectors(keys) * nr, 0);
+ disk_sectors_delta - op->res.sectors,
+ !should_check_enospc
+ ? BCH_DISK_RESERVATION_NOFAIL : 0);
if (ret)
goto out;
-
- m->nr_ptrs_reserved += nr;
- goto next;
}
bch2_trans_update(&trans, iter, insert, 0);
ret = bch2_trans_commit(&trans, &op->res,
op_journal_seq(op),
BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE|
m->data_opts.btree_insert_flags);
+err:
if (!ret)
atomic_long_inc(&c->extent_migrate_done);
if (ret == -EINTR)
goto next;
}
out:
+ bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
+ bch2_bkey_buf_exit(&_insert, c);
+ bch2_bkey_buf_exit(&_new, c);
BUG_ON(ret == -EINTR);
return ret;
}
BUG_ON(!m->op.wbio.bio.bi_vcnt);
m->ptr = rbio->pick.ptr;
- m->offset = rbio->pos.offset - rbio->pick.crc.offset;
+ m->offset = rbio->data_pos.offset - rbio->pick.crc.offset;
m->op.devs_have = rbio->devs_have;
- m->op.pos = rbio->pos;
+ m->op.pos = rbio->data_pos;
m->op.version = rbio->version;
m->op.crc = rbio->pick.crc;
m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
* ctxt when doing wakeup
*/
closure_get(&ctxt->cl);
- bch2_read_extent(trans, &io->rbio, k, 0,
+ bch2_read_extent(trans, &io->rbio,
+ bkey_start_pos(k.k),
+ btree_id, k, 0,
BCH_READ_NODECODE|
BCH_READ_LAST_FRAGMENT);
return 0;
return ret;
}
+static int lookup_inode(struct btree_trans *trans, struct bpos pos,
+ struct bch_inode_unpacked *inode)
+{
+ struct btree_iter *iter;
+ struct bkey_s_c k;
+ int ret;
+
+ iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, pos,
+ BTREE_ITER_ALL_SNAPSHOTS);
+ k = bch2_btree_iter_peek(iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ ret = k.k->type == KEY_TYPE_inode ? 0 : -EIO;
+ if (ret)
+ goto err;
+
+ ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode);
+ if (ret)
+ goto err;
+err:
+ bch2_trans_iter_put(trans, iter);
+ return ret;
+}
+
static int __bch2_move_data(struct bch_fs *c,
struct moving_context *ctxt,
struct bch_ratelimit *rate,
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
- struct bkey_on_stack sk;
+ struct bkey_buf sk;
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
u64 delay, cur_inum = U64_MAX;
int ret = 0, ret2;
- bkey_on_stack_init(&sk);
+ bch2_bkey_buf_init(&sk);
bch2_trans_init(&trans, c, 0, 0);
stats->data_type = BCH_DATA_user;
stats->btree_id = btree_id;
- stats->pos = POS_MIN;
+ stats->pos = start;
iter = bch2_trans_get_iter(&trans, btree_id, start,
BTREE_ITER_PREFETCH);
try_to_freeze();
}
} while (delay);
-peek:
+
k = bch2_btree_iter_peek(iter);
stats->pos = iter->pos;
if (!bkey_extent_is_direct_data(k.k))
goto next_nondata;
- if (btree_id == BTREE_ID_EXTENTS &&
+ if (btree_id == BTREE_ID_extents &&
cur_inum != k.k->p.inode) {
struct bch_inode_unpacked inode;
- /* don't hold btree locks while looking up inode: */
- bch2_trans_unlock(&trans);
-
io_opts = bch2_opts_to_inode_opts(c->opts);
- if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode))
+
+ ret = lookup_inode(&trans,
+ SPOS(0, k.k->p.inode, k.k->p.snapshot),
+ &inode);
+ if (ret == -EINTR)
+ continue;
+
+ if (!ret)
bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode));
+
cur_inum = k.k->p.inode;
- goto peek;
}
switch ((data_cmd = pred(c, arg, k, &io_opts, &data_opts))) {
}
/* unlock before doing IO: */
- bkey_on_stack_reassemble(&sk, c, k);
+ bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
bch2_trans_unlock(&trans);
ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k,
data_cmd, data_opts);
if (ret2) {
+ if (ret2 == -EINTR) {
+ bch2_trans_reset(&trans, 0);
+ bch2_trans_cond_resched(&trans);
+ continue;
+ }
+
if (ret2 == -ENOMEM) {
/* memory allocation failure, wait for some IO to finish */
bch2_move_ctxt_wait_for_io(ctxt);
atomic64_add(k.k->size * bch2_bkey_nr_ptrs_allocated(k),
&stats->sectors_seen);
next_nondata:
- bch2_btree_iter_next(iter);
+ bch2_btree_iter_advance(iter);
bch2_trans_cond_resched(&trans);
}
out:
+
+ bch2_trans_iter_put(&trans, iter);
ret = bch2_trans_exit(&trans) ?: ret;
- bkey_on_stack_exit(&sk, c);
+ bch2_bkey_buf_exit(&sk, c);
return ret;
}
int bch2_move_data(struct bch_fs *c,
+ enum btree_id start_btree_id, struct bpos start_pos,
+ enum btree_id end_btree_id, struct bpos end_pos,
struct bch_ratelimit *rate,
struct write_point_specifier wp,
- struct bpos start,
- struct bpos end,
move_pred_fn pred, void *arg,
struct bch_move_stats *stats)
{
struct moving_context ctxt = { .stats = stats };
+ enum btree_id id;
int ret;
closure_init_stack(&ctxt.cl);
stats->data_type = BCH_DATA_user;
- ret = __bch2_move_data(c, &ctxt, rate, wp, start, end,
- pred, arg, stats, BTREE_ID_EXTENTS) ?:
- __bch2_move_data(c, &ctxt, rate, wp, start, end,
- pred, arg, stats, BTREE_ID_REFLINK);
+ for (id = start_btree_id;
+ id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1);
+ id++) {
+ stats->btree_id = id;
+
+ if (id != BTREE_ID_extents &&
+ id != BTREE_ID_reflink)
+ continue;
+
+ ret = __bch2_move_data(c, &ctxt, rate, wp,
+ id == start_btree_id ? start_pos : POS_MIN,
+ id == end_btree_id ? end_pos : POS_MAX,
+ pred, arg, stats, id);
+ if (ret)
+ break;
+ }
+
move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads));
closure_sync(&ctxt.cl);
return ret;
}
+typedef enum data_cmd (*move_btree_pred)(struct bch_fs *, void *,
+ struct btree *, struct bch_io_opts *,
+ struct data_opts *);
+
static int bch2_move_btree(struct bch_fs *c,
- move_pred_fn pred,
- void *arg,
+ enum btree_id start_btree_id, struct bpos start_pos,
+ enum btree_id end_btree_id, struct bpos end_pos,
+ move_btree_pred pred, void *arg,
struct bch_move_stats *stats)
{
+ bool kthread = (current->flags & PF_KTHREAD) != 0;
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
struct btree_trans trans;
struct btree_iter *iter;
struct btree *b;
- unsigned id;
+ enum btree_id id;
struct data_opts data_opts;
enum data_cmd cmd;
int ret = 0;
stats->data_type = BCH_DATA_btree;
- for (id = 0; id < BTREE_ID_NR; id++) {
+ for (id = start_btree_id;
+ id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1);
+ id++) {
stats->btree_id = id;
- for_each_btree_node(&trans, iter, id, POS_MIN,
+ for_each_btree_node(&trans, iter, id,
+ id == start_btree_id ? start_pos : POS_MIN,
BTREE_ITER_PREFETCH, b) {
+ if (kthread && kthread_should_stop())
+ goto out;
+
+ if ((cmp_int(id, end_btree_id) ?:
+ bkey_cmp(b->key.k.p, end_pos)) > 0)
+ break;
+
stats->pos = iter->pos;
- switch ((cmd = pred(c, arg,
- bkey_i_to_s_c(&b->key),
- &io_opts, &data_opts))) {
+ switch ((cmd = pred(c, arg, b, &io_opts, &data_opts))) {
case DATA_SKIP:
goto next;
case DATA_SCRUB:
ret = bch2_trans_iter_free(&trans, iter) ?: ret;
}
-
+out:
bch2_trans_exit(&trans);
+ if (ret)
+ bch_err(c, "error %i in bch2_move_btree", ret);
+
return ret;
}
return DATA_REWRITE;
}
+static enum data_cmd rereplicate_btree_pred(struct bch_fs *c, void *arg,
+ struct btree *b,
+ struct bch_io_opts *io_opts,
+ struct data_opts *data_opts)
+{
+ return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
+}
+
+static enum data_cmd migrate_btree_pred(struct bch_fs *c, void *arg,
+ struct btree *b,
+ struct bch_io_opts *io_opts,
+ struct data_opts *data_opts)
+{
+ return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
+}
+
+static bool bformat_needs_redo(struct bkey_format *f)
+{
+ unsigned i;
+
+ for (i = 0; i < f->nr_fields; i++) {
+ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+ u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
+ u64 field_offset = le64_to_cpu(f->field_offset[i]);
+
+ if (f->bits_per_field[i] > unpacked_bits)
+ return true;
+
+ if ((f->bits_per_field[i] == unpacked_bits) && field_offset)
+ return true;
+
+ if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) &
+ unpacked_mask) <
+ field_offset)
+ return true;
+ }
+
+ return false;
+}
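A concrete case for the overflow check above may make it clearer; the numbers are illustrative only:

/*
 * With 32 unpacked bits (unpacked_mask == 0xffffffff), a format field
 * with bits_per_field == 8 and field_offset == 0xffffff80 can encode
 * field_offset + 0xff == 0x10000007f, which masks down to 0x7f - less
 * than field_offset - so bformat_needs_redo() returns true and the
 * node gets rewritten with the current format.
 */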
+
+static enum data_cmd rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
+ struct btree *b,
+ struct bch_io_opts *io_opts,
+ struct data_opts *data_opts)
+{
+ if (b->version_ondisk != c->sb.version ||
+ btree_node_need_rewrite(b) ||
+ bformat_needs_redo(&b->format)) {
+ data_opts->target = 0;
+ data_opts->nr_replicas = 1;
+ data_opts->btree_insert_flags = 0;
+ return DATA_REWRITE;
+ }
+
+ return DATA_SKIP;
+}
+
+int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
+{
+ int ret;
+
+ ret = bch2_move_btree(c,
+ 0, POS_MIN,
+ BTREE_ID_NR, POS_MAX,
+ rewrite_old_nodes_pred, c, stats);
+ if (!ret) {
+ mutex_lock(&c->sb_lock);
+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_extents_above_btree_updates_done;
+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_bformat_overflow_done;
+ c->disk_sb.sb->version_min = c->disk_sb.sb->version;
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+ }
+
+ return ret;
+}
+
int bch2_data_job(struct bch_fs *c,
struct bch_move_stats *stats,
struct bch_ioctl_data op)
stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, -1);
- ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret;
+ ret = bch2_move_btree(c,
+ op.start_btree, op.start_pos,
+ op.end_btree, op.end_pos,
+ rereplicate_btree_pred, c, stats) ?: ret;
closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c));
ret = bch2_replicas_gc2(c) ?: ret;
- ret = bch2_move_data(c, NULL,
- writepoint_hashed((unsigned long) current),
- op.start,
- op.end,
+ ret = bch2_move_data(c,
+ op.start_btree, op.start_pos,
+ op.end_btree, op.end_pos,
+ NULL, writepoint_hashed((unsigned long) current),
rereplicate_pred, c, stats) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
break;
stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
- ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret;
+ ret = bch2_move_btree(c,
+ op.start_btree, op.start_pos,
+ op.end_btree, op.end_pos,
+ migrate_btree_pred, &op, stats) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
- ret = bch2_move_data(c, NULL,
- writepoint_hashed((unsigned long) current),
- op.start,
- op.end,
+ ret = bch2_move_data(c,
+ op.start_btree, op.start_pos,
+ op.end_btree, op.end_pos,
+ NULL, writepoint_hashed((unsigned long) current),
migrate_pred, &op, stats) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
break;
+ case BCH_DATA_OP_REWRITE_OLD_NODES:
+ ret = bch2_scan_old_btree_nodes(c, stats);
+ break;
default:
ret = -EINVAL;
}
struct bkey_s_c,
struct bch_io_opts *, struct data_opts *);
-int bch2_move_data(struct bch_fs *, struct bch_ratelimit *,
+int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
+
+int bch2_move_data(struct bch_fs *,
+ enum btree_id, struct bpos,
+ enum btree_id, struct bpos,
+ struct bch_ratelimit *,
struct write_point_specifier,
- struct bpos, struct bpos,
move_pred_fn, void *,
struct bch_move_stats *);
copygc_heap *h = &c->copygc_heap;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
+ struct extent_ptr_decoded p = { 0 };
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE;
data_opts->rewrite_dev = p.ptr.dev;
- if (p.has_ec) {
- struct stripe *m = genradix_ptr(&c->stripes[0], p.ec.idx);
-
- data_opts->nr_replicas += m->nr_redundant;
- }
+ if (p.has_ec)
+ data_opts->nr_replicas += p.ec.redundancy;
return DATA_REWRITE;
}
bucket_sectors_used(m) >= ca->mi.bucket_size)
continue;
- WARN_ON(m.stripe && !g->ec_redundancy);
+ WARN_ON(m.stripe && !g->stripe_redundancy);
e = (struct copygc_heap_entry) {
.dev = dev_idx,
.gen = m.gen,
- .replicas = 1 + g->ec_redundancy,
+ .replicas = 1 + g->stripe_redundancy,
.fragmentation = bucket_sectors_used(m) * (1U << 15)
/ ca->mi.bucket_size,
.sectors = bucket_sectors_used(m),
return -1;
}
+ /*
+ * Our btree node allocations also come out of RESERVE_MOVINGGC:
+ */
+ sectors_to_move = (sectors_to_move * 3) / 4;
+
for (i = h->data; i < h->data + h->used; i++)
sectors_to_move += i->sectors * i->replicas;
sizeof(h->data[0]),
bucket_offset_cmp, NULL);
- ret = bch2_move_data(c, &c->copygc_pd.rate,
+ ret = bch2_move_data(c,
+ 0, POS_MIN,
+ BTREE_ID_NR, POS_MAX,
+ &c->copygc_pd.rate,
writepoint_ptr(&c->copygc_write_point),
- POS_MIN, POS_MAX,
copygc_pred, NULL,
&move_stats);
fragmented_allowed += ((__dev_buckets_available(ca, usage) *
ca->mi.bucket_size) >> 1);
- fragmented += usage.sectors_fragmented;
+ fragmented += usage.d[BCH_DATA_user].fragmented;
}
return max_t(s64, 0, fragmented_allowed - fragmented);
{
struct bch_fs *c = arg;
struct io_clock *clock = &c->io_clock[WRITE];
- unsigned long last, wait;
+ u64 last, wait;
set_freezable();
if (kthread_wait_freezable(c->copy_gc_enabled))
break;
- last = atomic_long_read(&clock->now);
+ last = atomic64_read(&clock->now);
wait = bch2_copygc_wait_amount(c);
if (wait > clock->max_slop) {
if (bch2_fs_init_fault("copygc_start"))
return -ENOMEM;
- t = kthread_create(bch2_copygc_thread, c, "bch_copygc");
- if (IS_ERR(t))
+ t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name);
+ if (IS_ERR(t)) {
+ bch_err(c, "error creating copygc thread: %li", PTR_ERR(t));
return PTR_ERR(t);
+ }
get_task_struct(t);
#include "super-io.h"
#include "util.h"
+#define x(t, n) #t,
+
const char * const bch2_error_actions[] = {
- "continue",
- "remount-ro",
- "panic",
+ BCH_ERROR_ACTIONS()
NULL
};
const char * const bch2_sb_features[] = {
-#define x(f, n) #f,
BCH_SB_FEATURES()
-#undef x
+ NULL
+};
+
+const char * const bch2_sb_compat[] = {
+ BCH_SB_COMPAT()
+ NULL
+};
+
+const char * const bch2_btree_ids[] = {
+ BCH_BTREE_IDS()
NULL
};
const char * const bch2_csum_opts[] = {
- "none",
- "crc32c",
- "crc64",
+ BCH_CSUM_OPTS()
NULL
};
const char * const bch2_compression_opts[] = {
-#define x(t, n) #t,
BCH_COMPRESSION_OPTS()
-#undef x
NULL
};
const char * const bch2_str_hash_types[] = {
- "crc32c",
- "crc64",
- "siphash",
+ BCH_STR_HASH_OPTS()
NULL
};
const char * const bch2_data_types[] = {
-#define x(t, n) #t,
BCH_DATA_TYPES()
-#undef x
NULL
};
const char * const bch2_cache_replacement_policies[] = {
- "lru",
- "fifo",
- "random",
+ BCH_CACHE_REPLACEMENT_POLICIES()
NULL
};
-/* Default is -1; we skip past it for struct cached_dev's cache mode */
-const char * const bch2_cache_modes[] = {
- "default",
- "writethrough",
- "writeback",
- "writearound",
- "none",
+const char * const bch2_member_states[] = {
+ BCH_MEMBER_STATES()
NULL
};
-const char * const bch2_dev_state[] = {
- "readwrite",
- "readonly",
- "failed",
- "spare",
- NULL
-};
+#undef x
void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
{
extern const char * const bch2_error_actions[];
extern const char * const bch2_sb_features[];
+extern const char * const bch2_sb_compat[];
+extern const char * const bch2_btree_ids[];
extern const char * const bch2_csum_opts[];
extern const char * const bch2_compression_opts[];
extern const char * const bch2_str_hash_types[];
extern const char * const bch2_data_types[];
extern const char * const bch2_cache_replacement_policies[];
-extern const char * const bch2_cache_modes[];
-extern const char * const bch2_dev_state[];
+extern const char * const bch2_member_states[];
/*
* Mount options; we also store defaults in the superblock.
x(errors, u8, \
OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_STR(bch2_error_actions), \
- BCH_SB_ERROR_ACTION, BCH_ON_ERROR_RO, \
+ BCH_SB_ERROR_ACTION, BCH_ON_ERROR_ro, \
NULL, "Action to take on filesystem error") \
x(metadata_replicas, u8, \
OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
x(metadata_checksum, u8, \
OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_STR(bch2_csum_opts), \
- BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_CRC32C, \
+ BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
NULL, NULL) \
x(data_checksum, u8, \
OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
OPT_STR(bch2_csum_opts), \
- BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_CRC32C, \
+ BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
NULL, NULL) \
x(compression, u8, \
OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
x(str_hash, u8, \
OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_STR(bch2_str_hash_types), \
- BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_SIPHASH, \
+ BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \
NULL, "Hash function for directory entries and xattrs")\
+ x(metadata_target, u16, \
+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
+ OPT_FN(bch2_opt_target), \
+ BCH_SB_METADATA_TARGET, 0, \
+ "(target)", "Device or disk group for metadata writes") \
x(foreground_target, u16, \
OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
OPT_FN(bch2_opt_target), \
OPT_BOOL(), \
BCH_SB_PRJQUOTA, false, \
NULL, "Enable project quotas") \
- x(reflink, u8, \
- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH_SB_REFLINK, true, \
- NULL, "Enable reflink support") \
x(degraded, u8, \
OPT_MOUNT, \
OPT_BOOL(), \
NO_SB_OPT, false, \
NULL, "Allow mounting in degraded mode") \
+ x(very_degraded, u8, \
+ OPT_MOUNT, \
+ OPT_BOOL(), \
+ NO_SB_OPT, false, \
+ NULL, "Allow mounting in when data will be missing") \
x(discard, u8, \
OPT_MOUNT|OPT_DEVICE, \
OPT_BOOL(), \
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_QUOTAS, POS(type, 0),
+ for_each_btree_key(&trans, iter, BTREE_ID_quotas, POS(type, 0),
BTREE_ITER_PREFETCH, k, ret) {
if (k.k->p.inode != type)
break;
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN,
+ for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
BTREE_ITER_PREFETCH, k, ret) {
switch (k.k->type) {
case KEY_TYPE_inode:
if (c->opts.usrquota)
return -EINVAL;
- ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS,
+ ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
POS(QTYP_USR, 0),
POS(QTYP_USR + 1, 0),
NULL);
if (c->opts.grpquota)
return -EINVAL;
- ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS,
+ ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
POS(QTYP_GRP, 0),
POS(QTYP_GRP + 1, 0),
NULL);
if (c->opts.prjquota)
return -EINVAL;
- ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS,
+ ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
POS(QTYP_PRJ, 0),
POS(QTYP_PRJ + 1, 0),
NULL);
struct bkey_s_c k;
int ret;
- iter = bch2_trans_get_iter(trans, BTREE_ID_QUOTAS, new_quota->k.p,
+ iter = bch2_trans_get_iter(trans, BTREE_ID_quotas, new_quota->k.p,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
k = bch2_btree_iter_peek_slot(iter);
struct qc_dqblk *qdq)
{
struct bch_fs *c = sb->s_fs_info;
- struct btree_trans trans;
struct bkey_i_quota new_quota;
int ret;
bkey_quota_init(&new_quota.k_i);
new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid));
- bch2_trans_init(&trans, c, 0, 0);
-
ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOUNLOCK,
bch2_set_quota_trans(&trans, &new_quota, qdq)) ?:
__bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i));
- bch2_trans_exit(&trans);
-
return ret;
}
unsigned long start, prev_start;
unsigned long prev_run_time, prev_run_cputime;
unsigned long cputime, prev_cputime;
- unsigned long io_start;
+ u64 io_start;
long throttle;
set_freezable();
- io_start = atomic_long_read(&clock->now);
+ io_start = atomic64_read(&clock->now);
p = rebalance_work(c);
prev_start = jiffies;
prev_cputime = curr_cputime();
(20 - w.dev_most_full_percent),
50);
- if (atomic_long_read(&clock->now) + clock->max_slop <
+ if (atomic64_read(&clock->now) + clock->max_slop <
r->throttled_until_iotime) {
r->throttled_until_cputime = start + throttle;
r->state = REBALANCE_THROTTLED;
max(p.dev_most_full_percent, 1U) /
max(w.dev_most_full_percent, 1U));
- io_start = atomic_long_read(&clock->now);
+ io_start = atomic64_read(&clock->now);
p = w;
prev_start = start;
prev_cputime = cputime;
rebalance_work_reset(c);
bch2_move_data(c,
+ 0, POS_MIN,
+ BTREE_ID_NR, POS_MAX,
/* ratelimiting disabled for now */
NULL, /* &r->pd.rate, */
writepoint_ptr(&c->rebalance_write_point),
- POS_MIN, POS_MAX,
rebalance_pred, NULL,
&r->move_stats);
}
case REBALANCE_THROTTLED:
bch2_hprint(&PBUF(h1),
(r->throttled_until_iotime -
- atomic_long_read(&c->io_clock[WRITE].now)) << 9);
+ atomic64_read(&c->io_clock[WRITE].now)) << 9);
pr_buf(out, "throttled for %lu sec or %s io\n",
(r->throttled_until_cputime - jiffies) / HZ,
h1);
break;
case REBALANCE_RUNNING:
- pr_buf(out, "running\n");
- pr_buf(out, "pos %llu:%llu\n",
- r->move_stats.pos.inode,
- r->move_stats.pos.offset);
+ pr_buf(out, "running\n"
+ "pos ");
+ bch2_bpos_to_text(out, r->move_stats.pos);
+ pr_buf(out, "\n");
break;
}
}
{
struct task_struct *p;
+ if (c->rebalance.thread)
+ return 0;
+
if (c->opts.nochanges)
return 0;
- p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance");
- if (IS_ERR(p))
+ p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name);
+ if (IS_ERR(p)) {
+ bch_err(c, "error creating rebalance thread: %li", PTR_ERR(p));
return PTR_ERR(p);
+ }
get_task_struct(p);
rcu_assign_pointer(c->rebalance.thread, p);
atomic64_t work_unknown_dev;
enum rebalance_state state;
- unsigned long throttled_until_iotime;
+ u64 throttled_until_iotime;
unsigned long throttled_until_cputime;
struct bch_move_stats move_stats;
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "bkey_buf.h"
#include "alloc_background.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
+#include "move.h"
#include "quota.h"
#include "recovery.h"
#include "replicas.h"
size_t src, dst;
for (src = 0, dst = 0; src < keys->nr; src++)
- if (keys->d[src].btree_id != BTREE_ID_ALLOC)
+ if (keys->d[src].btree_id != BTREE_ID_alloc)
keys->d[dst++] = keys->d[src];
keys->nr = dst;
/* iterate over keys read from the journal: */
-static struct journal_key *journal_key_search(struct journal_keys *journal_keys,
- enum btree_id id, unsigned level,
- struct bpos pos)
+static int __journal_key_cmp(enum btree_id l_btree_id,
+ unsigned l_level,
+ struct bpos l_pos,
+ struct journal_key *r)
+{
+ return (cmp_int(l_btree_id, r->btree_id) ?:
+ cmp_int(l_level, r->level) ?:
+ bpos_cmp(l_pos, r->k->k.p));
+}
+
+static int journal_key_cmp(struct journal_key *l, struct journal_key *r)
+{
+ return (cmp_int(l->btree_id, r->btree_id) ?:
+ cmp_int(l->level, r->level) ?:
+ bpos_cmp(l->k->k.p, r->k->k.p));
+}
+
+static size_t journal_key_search(struct journal_keys *journal_keys,
+ enum btree_id id, unsigned level,
+ struct bpos pos)
{
size_t l = 0, r = journal_keys->nr, m;
while (l < r) {
m = l + ((r - l) >> 1);
- if ((cmp_int(id, journal_keys->d[m].btree_id) ?:
- cmp_int(level, journal_keys->d[m].level) ?:
- bkey_cmp(pos, journal_keys->d[m].k->k.p)) > 0)
+ if (__journal_key_cmp(id, level, pos, &journal_keys->d[m]) > 0)
l = m + 1;
else
r = m;
}
BUG_ON(l < journal_keys->nr &&
- (cmp_int(id, journal_keys->d[l].btree_id) ?:
- cmp_int(level, journal_keys->d[l].level) ?:
- bkey_cmp(pos, journal_keys->d[l].k->k.p)) > 0);
+ __journal_key_cmp(id, level, pos, &journal_keys->d[l]) > 0);
BUG_ON(l &&
- (cmp_int(id, journal_keys->d[l - 1].btree_id) ?:
- cmp_int(level, journal_keys->d[l - 1].level) ?:
- bkey_cmp(pos, journal_keys->d[l - 1].k->k.p)) <= 0);
+ __journal_key_cmp(id, level, pos, &journal_keys->d[l - 1]) <= 0);
+
+ return l;
+}
+
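+/*
+ * Fix up a live journal iterator after a key has been inserted into the keys
+ * array at @idx: if the insertion landed before the iterator's current
+ * position (or at it, when the new key sorts at or before the key most
+ * recently returned), bump the index so the iterator neither skips nor
+ * repeats a key:
+ */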
+static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsigned idx)
+{
+ struct bkey_i *n = iter->keys->d[idx].k;
+ struct btree_and_journal_iter *biter =
+ container_of(iter, struct btree_and_journal_iter, journal);
+
+ if (iter->idx > idx ||
+ (iter->idx == idx &&
+ biter->last &&
+ bpos_cmp(n->k.p, biter->unpacked.p) <= 0))
+ iter->idx++;
+}
+
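+/*
+ * Overlay @k onto the in-memory journal keys so that btree walks done during
+ * recovery see it; @k must be heap allocated, as the journal keys array takes
+ * ownership and frees it from bch2_journal_keys_free():
+ */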
+int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
+ unsigned level, struct bkey_i *k)
+{
+ struct journal_key n = {
+ .btree_id = id,
+ .level = level,
+ .k = k,
+ .allocated = true
+ };
+ struct journal_keys *keys = &c->journal_keys;
+ struct journal_iter *iter;
+ unsigned idx = journal_key_search(keys, id, level, k->k.p);
+
+ if (idx < keys->nr &&
+ journal_key_cmp(&n, &keys->d[idx]) == 0) {
+ if (keys->d[idx].allocated)
+ kfree(keys->d[idx].k);
+ keys->d[idx] = n;
+ return 0;
+ }
+
+ if (keys->nr == keys->size) {
+ struct journal_keys new_keys = {
+ .nr = keys->nr,
+ .size = keys->size * 2,
+ .journal_seq_base = keys->journal_seq_base,
+ };
+
+ new_keys.d = kvmalloc(sizeof(new_keys.d[0]) * new_keys.size, GFP_KERNEL);
+ if (!new_keys.d) {
+ bch_err(c, "%s: error allocating new key array (size %zu)",
+ __func__, new_keys.size);
+ return -ENOMEM;
+ }
+
+ memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
+ kvfree(keys->d);
+ *keys = new_keys;
+ }
+
+ array_insert_item(keys->d, keys->nr, idx, n);
- return l < journal_keys->nr ? journal_keys->d + l : NULL;
+ list_for_each_entry(iter, &c->journal_iters, list)
+ journal_iter_fix(c, iter, idx);
+
+ return 0;
+}
+
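+/*
+ * Deletion is done by overlaying a whiteout (an empty, zero size key) at
+ * @pos via bch2_journal_key_insert():
+ */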
+int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
+ unsigned level, struct bpos pos)
+{
+ struct bkey_i *whiteout =
+ kmalloc(sizeof(struct bkey), GFP_KERNEL);
+ int ret;
+
+ if (!whiteout) {
+ bch_err(c, "%s: error allocating new key", __func__);
+ return -ENOMEM;
+ }
+
+ bkey_init(&whiteout->k);
+ whiteout->k.p = pos;
+
+ ret = bch2_journal_key_insert(c, id, level, whiteout);
+ if (ret)
+ kfree(whiteout);
+ return ret;
}
static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter)
{
- if (iter->k &&
- iter->k < iter->keys->d + iter->keys->nr &&
- iter->k->btree_id == iter->btree_id &&
- iter->k->level == iter->level)
- return iter->k->k;
+ struct journal_key *k = iter->idx - iter->keys->nr
+ ? iter->keys->d + iter->idx : NULL;
+
+ if (k &&
+ k->btree_id == iter->btree_id &&
+ k->level == iter->level)
+ return k->k;
- iter->k = NULL;
+ iter->idx = iter->keys->nr;
return NULL;
}
static void bch2_journal_iter_advance(struct journal_iter *iter)
{
- if (iter->k)
- iter->k++;
+ if (iter->idx < iter->keys->nr)
+ iter->idx++;
}
-static void bch2_journal_iter_init(struct journal_iter *iter,
- struct journal_keys *journal_keys,
+static void bch2_journal_iter_exit(struct journal_iter *iter)
+{
+ list_del(&iter->list);
+}
+
+static void bch2_journal_iter_init(struct bch_fs *c,
+ struct journal_iter *iter,
enum btree_id id, unsigned level,
struct bpos pos)
{
iter->btree_id = id;
iter->level = level;
- iter->keys = journal_keys;
- iter->k = journal_key_search(journal_keys, id, level, pos);
+ iter->keys = &c->journal_keys;
+ iter->idx = journal_key_search(&c->journal_keys, id, level, pos);
+ list_add(&iter->list, &c->journal_iters);
}
static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
{
- return iter->btree
- ? bch2_btree_iter_peek(iter->btree)
- : bch2_btree_node_iter_peek_unpack(&iter->node_iter,
- iter->b, &iter->unpacked);
+ return bch2_btree_node_iter_peek_unpack(&iter->node_iter,
+ iter->b, &iter->unpacked);
}
static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
{
- if (iter->btree)
- bch2_btree_iter_next(iter->btree);
- else
- bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
+ bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
}
void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
bkey_i_to_s_c(bch2_journal_iter_peek(&iter->journal));
if (btree_k.k && journal_k.k) {
- int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p);
+ int cmp = bpos_cmp(btree_k.k->p, journal_k.k->p);
if (!cmp)
bch2_journal_iter_advance_btree(iter);
ret = iter->last == journal ? journal_k : btree_k;
if (iter->b &&
- bkey_cmp(ret.k->p, iter->b->data->max_key) > 0) {
- iter->journal.k = NULL;
+ bpos_cmp(ret.k->p, iter->b->data->max_key) > 0) {
+ iter->journal.idx = iter->journal.keys->nr;
iter->last = none;
return bkey_s_c_null;
}
return bch2_btree_and_journal_iter_peek(iter);
}
-void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *iter,
- struct btree_trans *trans,
- struct journal_keys *journal_keys,
- enum btree_id id, struct bpos pos)
+void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
{
- memset(iter, 0, sizeof(*iter));
-
- iter->btree = bch2_trans_get_iter(trans, id, pos, 0);
- bch2_journal_iter_init(&iter->journal, journal_keys, id, 0, pos);
+ bch2_journal_iter_exit(&iter->journal);
}
void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
- struct journal_keys *journal_keys,
+ struct bch_fs *c,
struct btree *b)
{
memset(iter, 0, sizeof(*iter));
iter->b = b;
bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b);
- bch2_journal_iter_init(&iter->journal, journal_keys,
+ bch2_journal_iter_init(c, &iter->journal,
b->c.btree_id, b->c.level, b->data->min_key);
}
/* Walk btree, overlaying keys from the journal: */
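+/*
+ * Prefetch a few child nodes ahead of the recursive walk below; @iter is
+ * passed by value so advancing this copy doesn't disturb the caller's
+ * iterator position:
+ */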
+static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b,
+ struct btree_and_journal_iter iter)
+{
+ unsigned i = 0, nr = b->c.level > 1 ? 2 : 16;
+ struct bkey_s_c k;
+ struct bkey_buf tmp;
+
+ BUG_ON(!b->c.level);
+
+ bch2_bkey_buf_init(&tmp);
+
+ while (i < nr &&
+ (k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+ bch2_bkey_buf_reassemble(&tmp, c, k);
+
+ bch2_btree_node_prefetch(c, NULL, tmp.k,
+ b->c.btree_id, b->c.level - 1);
+
+ bch2_btree_and_journal_iter_advance(&iter);
+ i++;
+ }
+
+ bch2_bkey_buf_exit(&tmp, c);
+}
+
static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b,
struct journal_keys *journal_keys,
enum btree_id btree_id,
{
struct btree_and_journal_iter iter;
struct bkey_s_c k;
+ struct bkey_buf tmp;
+ struct btree *child;
int ret = 0;
- bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b);
+ bch2_bkey_buf_init(&tmp);
+ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
ret = key_fn(c, btree_id, b->c.level, k);
if (ret)
break;
if (b->c.level) {
- struct btree *child;
- BKEY_PADDED(k) tmp;
-
- bkey_reassemble(&tmp.k, k);
- k = bkey_i_to_s_c(&tmp.k);
+ bch2_bkey_buf_reassemble(&tmp, c, k);
bch2_btree_and_journal_iter_advance(&iter);
- if (b->c.level > 0) {
- child = bch2_btree_node_get_noiter(c, &tmp.k,
- b->c.btree_id, b->c.level - 1);
- ret = PTR_ERR_OR_ZERO(child);
- if (ret)
- break;
+ child = bch2_btree_node_get_noiter(c, tmp.k,
+ b->c.btree_id, b->c.level - 1,
+ false);
- ret = (node_fn ? node_fn(c, b) : 0) ?:
- bch2_btree_and_journal_walk_recurse(c, child,
- journal_keys, btree_id, node_fn, key_fn);
- six_unlock_read(&child->c.lock);
+ ret = PTR_ERR_OR_ZERO(child);
+ if (ret)
+ break;
- if (ret)
- break;
- }
+ btree_and_journal_iter_prefetch(c, b, iter);
+
+ ret = (node_fn ? node_fn(c, b) : 0) ?:
+ bch2_btree_and_journal_walk_recurse(c, child,
+ journal_keys, btree_id, node_fn, key_fn);
+ six_unlock_read(&child->c.lock);
+
+ if (ret)
+ break;
} else {
bch2_btree_and_journal_iter_advance(&iter);
}
}
+ bch2_btree_and_journal_iter_exit(&iter);
+ bch2_bkey_buf_exit(&tmp, c);
return ret;
}
return cmp_int(l->btree_id, r->btree_id) ?:
cmp_int(l->level, r->level) ?:
- bkey_cmp(l->k->k.p, r->k->k.p) ?:
+ bpos_cmp(l->k->k.p, r->k->k.p) ?:
cmp_int(l->journal_seq, r->journal_seq) ?:
cmp_int(l->journal_offset, r->journal_offset);
}
void bch2_journal_keys_free(struct journal_keys *keys)
{
+ struct journal_key *i;
+
+ for (i = keys->d; i < keys->d + keys->nr; i++)
+ if (i->allocated)
+ kfree(i->k);
+
kvfree(keys->d);
keys->d = NULL;
keys->nr = 0;
static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
{
- struct journal_replay *p;
+ struct journal_replay *i;
struct jset_entry *entry;
struct bkey_i *k, *_n;
struct journal_keys keys = { NULL };
if (list_empty(journal_entries))
return keys;
- keys.journal_seq_base =
- le64_to_cpu(list_last_entry(journal_entries,
- struct journal_replay, list)->j.last_seq);
-
- list_for_each_entry(p, journal_entries, list) {
- if (le64_to_cpu(p->j.seq) < keys.journal_seq_base)
+ list_for_each_entry(i, journal_entries, list) {
+ if (i->ignore)
continue;
- for_each_jset_key(k, _n, entry, &p->j)
+ if (!keys.journal_seq_base)
+ keys.journal_seq_base = le64_to_cpu(i->j.seq);
+
+ for_each_jset_key(k, _n, entry, &i->j)
nr_keys++;
}
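+ /*
+ * The array is sized to a power of two; bch2_journal_key_insert() grows
+ * it by doubling if more keys are overlaid later:
+ */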
+ keys.size = roundup_pow_of_two(nr_keys);
- keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL);
+ keys.d = kvmalloc(sizeof(keys.d[0]) * keys.size, GFP_KERNEL);
if (!keys.d)
goto err;
- list_for_each_entry(p, journal_entries, list) {
- if (le64_to_cpu(p->j.seq) < keys.journal_seq_base)
+ list_for_each_entry(i, journal_entries, list) {
+ if (i->ignore)
continue;
- for_each_jset_key(k, _n, entry, &p->j)
+ BUG_ON(le64_to_cpu(i->j.seq) - keys.journal_seq_base > U32_MAX);
+
+ for_each_jset_key(k, _n, entry, &i->j)
keys.d[keys.nr++] = (struct journal_key) {
.btree_id = entry->btree_id,
.level = entry->level,
.k = k,
- .journal_seq = le64_to_cpu(p->j.seq) -
+ .journal_seq = le64_to_cpu(i->j.seq) -
keys.journal_seq_base,
- .journal_offset = k->_data - p->j._data,
+ .journal_offset = k->_data - i->j._data,
};
}
while (src + 1 < keys.d + keys.nr &&
src[0].btree_id == src[1].btree_id &&
src[0].level == src[1].level &&
- !bkey_cmp(src[0].k->k.p, src[1].k->k.p))
+ !bpos_cmp(src[0].k->k.p, src[1].k->k.p))
src++;
*dst++ = *src++;
bch2_journal_pin_put(j, j->replay_journal_seq++);
}
-static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id,
- struct bkey_i *k)
-{
- struct btree_trans trans;
- struct btree_iter *iter, *split_iter;
- /*
- * We might cause compressed extents to be split, so we need to pass in
- * a disk_reservation:
- */
- struct disk_reservation disk_res =
- bch2_disk_reservation_init(c, 0);
- struct bkey_i *split;
- struct bpos atomic_end;
- /*
- * Some extents aren't equivalent - w.r.t. what the triggers do
- * - if they're split:
- */
- bool remark_if_split = bch2_bkey_sectors_compressed(bkey_i_to_s_c(k)) ||
- k->k.type == KEY_TYPE_reflink_p;
- bool remark = false;
- int ret;
-
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-retry:
- bch2_trans_begin(&trans);
-
- iter = bch2_trans_get_iter(&trans, btree_id,
- bkey_start_pos(&k->k),
- BTREE_ITER_INTENT);
-
- do {
- ret = bch2_btree_iter_traverse(iter);
- if (ret)
- goto err;
-
- atomic_end = bpos_min(k->k.p, iter->l[0].b->key.k.p);
-
- split = bch2_trans_kmalloc(&trans, bkey_bytes(&k->k));
- ret = PTR_ERR_OR_ZERO(split);
- if (ret)
- goto err;
-
- if (!remark &&
- remark_if_split &&
- bkey_cmp(atomic_end, k->k.p) < 0) {
- ret = bch2_disk_reservation_add(c, &disk_res,
- k->k.size *
- bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(k)),
- BCH_DISK_RESERVATION_NOFAIL);
- BUG_ON(ret);
-
- remark = true;
- }
-
- bkey_copy(split, k);
- bch2_cut_front(iter->pos, split);
- bch2_cut_back(atomic_end, split);
-
- split_iter = bch2_trans_copy_iter(&trans, iter);
- ret = PTR_ERR_OR_ZERO(split_iter);
- if (ret)
- goto err;
-
- /*
- * It's important that we don't go through the
- * extent_handle_overwrites() and extent_update_to_keys() path
- * here: journal replay is supposed to treat extents like
- * regular keys
- */
- __bch2_btree_iter_set_pos(split_iter, split->k.p, false);
- bch2_trans_update(&trans, split_iter, split,
- BTREE_TRIGGER_NORUN);
-
- bch2_btree_iter_set_pos(iter, split->k.p);
-
- if (remark) {
- ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(split),
- 0, split->k.size,
- BTREE_TRIGGER_INSERT);
- if (ret)
- goto err;
- }
- } while (bkey_cmp(iter->pos, k->k.p) < 0);
-
- if (remark) {
- ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(k),
- 0, -((s64) k->k.size),
- BTREE_TRIGGER_OVERWRITE);
- if (ret)
- goto err;
- }
-
- ret = bch2_trans_commit(&trans, &disk_res, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_JOURNAL_REPLAY);
-err:
- if (ret == -EINTR)
- goto retry;
-
- bch2_disk_reservation_put(c, &disk_res);
-
- return bch2_trans_exit(&trans) ?: ret;
-}
-
static int __bch2_journal_replay_key(struct btree_trans *trans,
enum btree_id id, unsigned level,
struct bkey_i *k)
iter = bch2_trans_get_node_iter(trans, id, k->k.p,
BTREE_MAX_DEPTH, level,
BTREE_ITER_INTENT);
- if (IS_ERR(iter))
- return PTR_ERR(iter);
/*
* iter->flags & BTREE_ITER_IS_EXTENTS triggers the update path to run
* extent_handle_overwrites() and extent_update_to_keys() - but we don't
* want that here, journal replay is supposed to treat extents like
* regular keys:
*/
- __bch2_btree_iter_set_pos(iter, k->k.p, false);
+ BUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
ret = bch2_btree_iter_traverse(iter) ?:
bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN);
return ret;
}
-static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id id,
- unsigned level, struct bkey_i *k)
+static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k)
{
- return bch2_trans_do(c, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_JOURNAL_REPLAY,
- __bch2_journal_replay_key(&trans, id, level, k));
+ unsigned commit_flags = BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW;
+
+ if (!k->allocated)
+ commit_flags |= BTREE_INSERT_JOURNAL_REPLAY;
+
+ return bch2_trans_do(c, NULL, NULL, commit_flags,
+ __bch2_journal_replay_key(&trans, k->btree_id, k->level, k->k));
}
static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k)
struct btree_iter *iter;
int ret;
- iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, k->k.p,
+ iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, k->k.p,
BTREE_ITER_CACHED|
BTREE_ITER_CACHED_NOFILL|
BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(iter) ?:
- bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN);
+ ret = bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN);
bch2_trans_iter_put(trans, iter);
return ret;
}
return cmp_int(r->level, l->level) ?:
cmp_int(l->journal_seq, r->journal_seq) ?:
cmp_int(l->btree_id, r->btree_id) ?:
- bkey_cmp(l->k->k.p, r->k->k.p);
+ bpos_cmp(l->k->k.p, r->k->k.p);
}
static int bch2_journal_replay(struct bch_fs *c,
for_each_journal_key(keys, i) {
cond_resched();
- if (!i->level && i->btree_id == BTREE_ID_ALLOC) {
+ if (!i->level && i->btree_id == BTREE_ID_alloc) {
j->replay_journal_seq = keys.journal_seq_base + i->journal_seq;
ret = bch2_alloc_replay_key(c, i->k);
if (ret)
if (i->level) {
j->replay_journal_seq = keys.journal_seq_base + i->journal_seq;
- ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k);
+ ret = bch2_journal_replay_key(c, i);
if (ret)
goto err;
}
*/
set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags);
set_bit(JOURNAL_RECLAIM_STARTED, &j->flags);
+ journal_reclaim_kick(j);
j->replay_journal_seq = seq;
for_each_journal_key(keys, i) {
cond_resched();
- if (i->level || i->btree_id == BTREE_ID_ALLOC)
+ if (i->level || i->btree_id == BTREE_ID_alloc)
continue;
replay_now_at(j, keys.journal_seq_base + i->journal_seq);
- ret = i->k->k.size
- ? bch2_extent_replay_key(c, i->btree_id, i->k)
- : bch2_journal_replay_key(c, i->btree_id, i->level, i->k);
+ ret = bch2_journal_replay_key(c, i);
if (ret)
goto err;
}
bch2_journal_flush_all_pins(j);
return bch2_journal_error(j);
err:
- bch_err(c, "journal replay: error %d while replaying key", ret);
- return ret;
-}
-
-static bool journal_empty(struct list_head *journal)
-{
- return list_empty(journal) ||
- journal_entry_empty(&list_last_entry(journal,
- struct journal_replay, list)->j);
-}
-
-static int
-verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c,
- struct list_head *journal)
-{
- struct journal_replay *i =
- list_last_entry(journal, struct journal_replay, list);
- u64 start_seq = le64_to_cpu(i->j.last_seq);
- u64 end_seq = le64_to_cpu(i->j.seq);
- u64 seq = start_seq;
- int ret = 0;
-
- list_for_each_entry(i, journal, list) {
- if (le64_to_cpu(i->j.seq) < start_seq)
- continue;
-
- fsck_err_on(seq != le64_to_cpu(i->j.seq), c,
- "journal entries %llu-%llu missing! (replaying %llu-%llu)",
- seq, le64_to_cpu(i->j.seq) - 1,
- start_seq, end_seq);
-
- seq = le64_to_cpu(i->j.seq);
-
- fsck_err_on(bch2_journal_seq_is_blacklisted(c, seq, false), c,
- "found blacklisted journal entry %llu", seq);
-
- do {
- seq++;
- } while (bch2_journal_seq_is_blacklisted(c, seq, false));
- }
-fsck_err:
+ bch_err(c, "journal replay: error %d while replaying key at btree %s level %u",
+ ret, bch2_btree_ids[i->btree_id], i->level);
return ret;
}
case BCH_JSET_ENTRY_data_usage: {
struct jset_entry_data_usage *u =
container_of(entry, struct jset_entry_data_usage, entry);
+
ret = bch2_replicas_set_usage(c, &u->r,
le64_to_cpu(u->v));
break;
}
+ case BCH_JSET_ENTRY_dev_usage: {
+ struct jset_entry_dev_usage *u =
+ container_of(entry, struct jset_entry_dev_usage, entry);
+ struct bch_dev *ca = bch_dev_bkey_exists(c, u->dev);
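+ /*
+ * The number of per-data-type counters is derived from the entry size
+ * rather than assuming the current BCH_DATA_NR:
+ */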
+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+ unsigned nr_types = (bytes - sizeof(struct jset_entry_dev_usage)) /
+ sizeof(struct jset_entry_dev_usage_type);
+ unsigned i;
+
+ ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec);
+ ca->usage_base->buckets_unavailable = le64_to_cpu(u->buckets_unavailable);
+
+ for (i = 0; i < nr_types; i++) {
+ ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets);
+ ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors);
+ ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented);
+ }
+
+ break;
+ }
case BCH_JSET_ENTRY_blacklist: {
struct jset_entry_blacklist *bl_entry =
container_of(entry, struct jset_entry_blacklist, entry);
le64_to_cpu(bl_entry->end) + 1);
break;
}
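+ /*
+ * IO clock state is now carried as a journal entry rather than in
+ * dedicated fields of the superblock clean section:
+ */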
+ case BCH_JSET_ENTRY_clock: {
+ struct jset_entry_clock *clock =
+ container_of(entry, struct jset_entry_clock, entry);
+
+ atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time));
+ }
}
return ret;
struct bch_sb_field_clean *clean,
struct list_head *journal)
{
+ struct journal_replay *i;
struct jset_entry *entry;
int ret;
if (clean) {
- c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock);
- c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock);
-
for (entry = clean->start;
entry != vstruct_end(&clean->field);
entry = vstruct_next(entry)) {
return ret;
}
} else {
- struct journal_replay *i =
- list_last_entry(journal, struct journal_replay, list);
-
- c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
- c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);
+ list_for_each_entry(i, journal, list) {
+ if (i->ignore)
+ continue;
- list_for_each_entry(i, journal, list)
vstruct_for_each(&i->j, entry) {
ret = journal_replay_entry_early(c, entry);
if (ret)
return ret;
}
+ }
}
bch2_fs_usage_initialize(c);
struct bch_sb_field_clean *clean = *cleanp;
int ret = 0;
- if (!c->sb.clean || !j)
- return 0;
-
if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
"superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
le64_to_cpu(clean->journal_seq),
return 0;
}
- mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
- "superblock read clock %u doesn't match journal %u after clean shutdown",
- clean->read_clock, j->read_clock);
- mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
- "superblock write clock %u doesn't match journal %u after clean shutdown",
- clean->write_clock, j->write_clock);
-
for (i = 0; i < BTREE_ID_NR; i++) {
char buf1[200], buf2[200];
struct bkey_i *k1, *k2;
return ERR_PTR(-ENOMEM);
}
- if (le16_to_cpu(c->disk_sb.sb->version) <
- bcachefs_metadata_version_bkey_renumber)
- bch2_sb_clean_renumber(clean, READ);
+ ret = bch2_sb_clean_validate(c, clean, READ);
+ if (ret) {
+ mutex_unlock(&c->sb_lock);
+ return ERR_PTR(ret);
+ }
mutex_unlock(&c->sb_lock);
if (!r->alive)
continue;
- if (i == BTREE_ID_ALLOC &&
+ if (i == BTREE_ID_alloc &&
c->opts.reconstruct_alloc) {
- c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
continue;
}
if (r->error) {
- __fsck_err(c, i == BTREE_ID_ALLOC
+ __fsck_err(c, i == BTREE_ID_alloc
? FSCK_CAN_IGNORE : 0,
"invalid btree root %s",
bch2_btree_ids[i]);
- if (i == BTREE_ID_ALLOC)
- c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
+ if (i == BTREE_ID_alloc)
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
}
ret = bch2_btree_root_read(c, i, &r->key, r->level);
if (ret) {
- __fsck_err(c, i == BTREE_ID_ALLOC
+ __fsck_err(c, i == BTREE_ID_alloc
? FSCK_CAN_IGNORE : 0,
"error reading btree root %s",
bch2_btree_ids[i]);
- if (i == BTREE_ID_ALLOC)
- c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
+ if (i == BTREE_ID_alloc)
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
}
}
{
const char *err = "cannot allocate memory";
struct bch_sb_field_clean *clean = NULL;
- u64 journal_seq;
- bool write_sb = false, need_write_alloc = false;
+ struct jset *last_journal_entry = NULL;
+ u64 blacklist_seq, journal_seq;
+ bool write_sb = false;
int ret;
if (c->sb.clean)
bch_info(c, "recovering from clean shutdown, journal seq %llu",
le64_to_cpu(clean->journal_seq));
+ if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) {
+ bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (!c->sb.clean &&
+ !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) {
+ bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (!(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) {
+ bch_err(c, "filesystem may have incompatible bkey formats; run fsck from the compat branch to fix");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) {
+ bch_info(c, "alloc_v2 feature bit not set, fsck required");
+ c->opts.fsck = true;
+ c->opts.fix_errors = FSCK_OPT_YES;
+ }
+
if (!c->replicas.entries ||
c->opts.rebuild_replicas) {
bch_info(c, "building replicas info");
set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
}
+ ret = bch2_blacklist_table_initialize(c);
+ if (ret) {
+ bch_err(c, "error initializing blacklist table");
+ goto err;
+ }
+
if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
- struct jset *j;
+ struct journal_replay *i;
- ret = bch2_journal_read(c, &c->journal_entries);
+ ret = bch2_journal_read(c, &c->journal_entries,
+ &blacklist_seq, &journal_seq);
if (ret)
goto err;
- if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&c->journal_entries), c,
+ list_for_each_entry_reverse(i, &c->journal_entries, list)
+ if (!i->ignore) {
+ last_journal_entry = &i->j;
+ break;
+ }
+
+ if (mustfix_fsck_err_on(c->sb.clean &&
+ last_journal_entry &&
+ !journal_entry_empty(last_journal_entry), c,
"filesystem marked clean but journal not empty")) {
- c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
c->sb.clean = false;
}
- if (!c->sb.clean && list_empty(&c->journal_entries)) {
- bch_err(c, "no journal entries found");
- ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
- goto err;
+ if (!last_journal_entry) {
+ fsck_err_on(!c->sb.clean, c, "no journal entries found");
+ goto use_clean;
}
c->journal_keys = journal_keys_sort(&c->journal_entries);
goto err;
}
- j = &list_last_entry(&c->journal_entries,
- struct journal_replay, list)->j;
-
- ret = verify_superblock_clean(c, &clean, j);
- if (ret)
- goto err;
-
- journal_seq = le64_to_cpu(j->seq) + 1;
+ if (c->sb.clean && last_journal_entry) {
+ ret = verify_superblock_clean(c, &clean,
+ last_journal_entry);
+ if (ret)
+ goto err;
+ }
} else {
- journal_seq = le64_to_cpu(clean->journal_seq) + 1;
- }
+use_clean:
+ if (!clean) {
+ bch_err(c, "no superblock clean section found");
+ ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
+ goto err;
- if (!c->sb.clean &&
- !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) {
- bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix");
- ret = -EINVAL;
- goto err;
+ }
+ blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1;
}
if (c->opts.reconstruct_alloc) {
- c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
drop_alloc_keys(&c->journal_keys);
}
if (ret)
goto err;
- if (!c->sb.clean) {
+ /*
+ * After an unclean shutdown, skip the next few journal sequence
+ * numbers as they may have been referenced by btree writes that
+ * happened before their corresponding journal writes - those btree
+ * writes need to be ignored, by skipping and blacklisting the next few
+ * journal sequence numbers:
+ */
+ if (!c->sb.clean)
+ journal_seq += 8;
+
+ if (blacklist_seq != journal_seq) {
ret = bch2_journal_seq_blacklist_add(c,
- journal_seq,
- journal_seq + 4);
+ blacklist_seq, journal_seq);
if (ret) {
bch_err(c, "error creating new journal seq blacklist entry");
goto err;
}
-
- journal_seq += 4;
-
- /*
- * The superblock needs to be written before we do any btree
- * node writes: it will be in the read_write() path
- */
- }
-
- ret = bch2_blacklist_table_initialize(c);
-
- if (!list_empty(&c->journal_entries)) {
- ret = verify_journal_entries_not_blacklisted_or_missing(c,
- &c->journal_entries);
- if (ret)
- goto err;
}
ret = bch2_fs_journal_start(&c->journal, journal_seq,
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
- if ((c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) &&
- !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA))) {
- /*
- * interior btree node updates aren't consistent with the
- * journal; after an unclean shutdown we have to walk all
- * pointers to metadata:
- */
- bch_info(c, "starting metadata mark and sweep");
- err = "error in mark and sweep";
- ret = bch2_gc(c, &c->journal_keys, true, true);
- if (ret < 0)
- goto err;
- if (ret)
- need_write_alloc = true;
- bch_verbose(c, "mark and sweep done");
- }
-
if (c->opts.fsck ||
- !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) ||
+ !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)) ||
+ !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_metadata)) ||
test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) {
bch_info(c, "starting mark and sweep");
err = "error in mark and sweep";
- ret = bch2_gc(c, &c->journal_keys, true, false);
- if (ret < 0)
- goto err;
+ ret = bch2_gc(c, true);
if (ret)
- need_write_alloc = true;
+ goto err;
bch_verbose(c, "mark and sweep done");
}
+ bch2_stripes_heap_start(c);
+
clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
goto err;
bch_verbose(c, "journal replay done");
- if (need_write_alloc && !c->opts.nochanges) {
+ if (test_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags) &&
+ !c->opts.nochanges) {
/*
* note that even when filesystem was clean there might be work
* to do here, if we ran gc (because of fsck) which recalculated
goto err;
}
bch_verbose(c, "alloc write done");
-
- set_bit(BCH_FS_ALLOC_WRITTEN, &c->flags);
}
if (!c->sb.clean) {
bch_verbose(c, "quotas done");
}
+ if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) ||
+ !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) {
+ struct bch_move_stats stats = { 0 };
+
+ bch_info(c, "scanning for old btree nodes");
+ ret = bch2_fs_read_write(c);
+ if (ret)
+ goto err;
+
+ ret = bch2_scan_old_btree_nodes(c, &stats);
+ if (ret)
+ goto err;
+ bch_info(c, "scanning for old btree nodes done");
+ }
+
mutex_lock(&c->sb_lock);
if (c->opts.version_upgrade) {
- if (c->sb.version < bcachefs_metadata_version_new_versioning)
- c->disk_sb.sb->version_min =
- le16_to_cpu(bcachefs_metadata_version_min);
c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current);
c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL;
write_sb = true;
}
if (!test_bit(BCH_FS_ERROR, &c->flags)) {
- c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO;
+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_info;
write_sb = true;
}
bch_notice(c, "initializing new filesystem");
mutex_lock(&c->sb_lock);
- for_each_online_member(ca, c, i)
- bch2_mark_dev_superblock(c, ca, 0);
- mutex_unlock(&c->sb_lock);
+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_extents_above_btree_updates_done;
+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_bformat_overflow_done;
- mutex_lock(&c->sb_lock);
- c->disk_sb.sb->version = c->disk_sb.sb->version_min =
- le16_to_cpu(bcachefs_metadata_version_current);
- c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink;
- c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL;
+ if (c->opts.version_upgrade) {
+ c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current);
+ c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL;
+ bch2_write_super(c);
+ }
- bch2_write_super(c);
+ for_each_online_member(ca, c, i)
+ bch2_mark_dev_superblock(c, ca, 0);
mutex_unlock(&c->sb_lock);
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
bch2_inode_init(c, &root_inode, 0, 0,
S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
root_inode.bi_inum = BCACHEFS_ROOT_INO;
- bch2_inode_pack(&packed_inode, &root_inode);
+ bch2_inode_pack(c, &packed_inode, &root_inode);
+ packed_inode.inode.k.p.snapshot = U32_MAX;
err = "error creating root directory";
- ret = bch2_btree_insert(c, BTREE_ID_INODES,
+ ret = bch2_btree_insert(c, BTREE_ID_inodes,
&packed_inode.inode.k_i,
NULL, NULL, 0);
if (ret)
&lostfound,
0, 0, S_IFDIR|0700, 0,
NULL, NULL));
- if (ret)
+ if (ret) {
+ bch_err(c, "error creating lost+found");
goto err;
+ }
if (enabled_qtypes(c)) {
ret = bch2_fs_quota_read(c);
for (i = (keys).d; i < (keys).d + (keys).nr; (i)++)
struct journal_iter {
+ struct list_head list;
enum btree_id btree_id;
unsigned level;
+ size_t idx;
struct journal_keys *keys;
- struct journal_key *k;
};
/*
*/
struct btree_and_journal_iter {
- struct btree_iter *btree;
-
struct btree *b;
struct btree_node_iter node_iter;
struct bkey unpacked;
} last;
};
+int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
+ unsigned, struct bkey_i *);
+int bch2_journal_key_delete(struct bch_fs *, enum btree_id,
+ unsigned, struct bpos);
+
void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *);
-void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *,
- struct btree_trans *,
- struct journal_keys *,
- enum btree_id, struct bpos);
+void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
- struct journal_keys *,
+ struct bch_fs *,
struct btree *);
typedef int (*btree_walk_node_fn)(struct bch_fs *c, struct btree *b);
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
#include "btree_update.h"
#include "extents.h"
#include "inode.h"
if (orig->k.type == KEY_TYPE_inline_data)
bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data);
- for_each_btree_key(trans, reflink_iter, BTREE_ID_REFLINK,
+ for_each_btree_key(trans, reflink_iter, BTREE_ID_reflink,
POS(0, c->reflink_hint),
BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) {
if (reflink_iter->pos.inode) {
bch2_trans_update(trans, reflink_iter, r_v, 0);
r_p = bch2_trans_kmalloc(trans, sizeof(*r_p));
- if (IS_ERR(r_p))
- return PTR_ERR(r_p);
+ if (IS_ERR(r_p)) {
+ ret = PTR_ERR(r_p);
+ goto err;
+ }
orig->k.type = KEY_TYPE_reflink_p;
r_p = bkey_i_to_reflink_p(orig);
struct btree_trans trans;
struct btree_iter *dst_iter, *src_iter;
struct bkey_s_c src_k;
- BKEY_PADDED(k) new_dst;
- struct bkey_on_stack new_src;
+ struct bkey_buf new_dst, new_src;
struct bpos dst_end = dst_start, src_end = src_start;
struct bpos dst_want, src_want;
u64 src_done, dst_done;
int ret = 0, ret2 = 0;
- if (!c->opts.reflink)
- return -EOPNOTSUPP;
-
if (!percpu_ref_tryget(&c->writes))
return -EROFS;
dst_end.offset += remap_sectors;
src_end.offset += remap_sectors;
- bkey_on_stack_init(&new_src);
+ bch2_bkey_buf_init(&new_dst);
+ bch2_bkey_buf_init(&new_src);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096);
- src_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start,
+ src_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, src_start,
BTREE_ITER_INTENT);
- dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, dst_start,
+ dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, dst_start,
BTREE_ITER_INTENT);
- while (1) {
+ while (ret == 0 || ret == -EINTR) {
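+ /*
+ * Error handling is folded into the loop condition: -EINTR restarts
+ * the transaction, any other error ends the loop:
+ */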
bch2_trans_begin(&trans);
- trans.mem_top = 0;
-
if (fatal_signal_pending(current)) {
ret = -EINTR;
- goto err;
+ break;
}
src_k = get_next_src(src_iter, src_end);
ret = bkey_err(src_k);
if (ret)
- goto btree_err;
+ continue;
src_done = bpos_min(src_iter->pos, src_end).offset -
src_start.offset;
if (bkey_cmp(dst_iter->pos, dst_want) < 0) {
ret = bch2_fpunch_at(&trans, dst_iter, dst_want,
journal_seq, i_sectors_delta);
- if (ret)
- goto btree_err;
continue;
}
break;
if (src_k.k->type != KEY_TYPE_reflink_p) {
- bkey_on_stack_reassemble(&new_src, c, src_k);
+ bch2_bkey_buf_reassemble(&new_src, c, src_k);
src_k = bkey_i_to_s_c(new_src.k);
bch2_cut_front(src_iter->pos, new_src.k);
ret = bch2_make_extent_indirect(&trans, src_iter,
new_src.k);
if (ret)
- goto btree_err;
+ continue;
BUG_ON(src_k.k->type != KEY_TYPE_reflink_p);
}
struct bkey_s_c_reflink_p src_p =
bkey_s_c_to_reflink_p(src_k);
struct bkey_i_reflink_p *dst_p =
- bkey_reflink_p_init(&new_dst.k);
+ bkey_reflink_p_init(new_dst.k);
u64 offset = le64_to_cpu(src_p.v->idx) +
(src_iter->pos.offset -
BUG();
}
- new_dst.k.k.p = dst_iter->pos;
- bch2_key_resize(&new_dst.k.k,
+ new_dst.k->k.p = dst_iter->pos;
+ bch2_key_resize(&new_dst.k->k,
min(src_k.k->p.offset - src_iter->pos.offset,
dst_end.offset - dst_iter->pos.offset));
- ret = bch2_extent_update(&trans, dst_iter, &new_dst.k,
+ ret = bch2_extent_update(&trans, dst_iter, new_dst.k,
NULL, journal_seq,
new_i_size, i_sectors_delta);
if (ret)
- goto btree_err;
+ continue;
dst_done = dst_iter->pos.offset - dst_start.offset;
src_want = POS(src_start.inode, src_start.offset + dst_done);
bch2_btree_iter_set_pos(src_iter, src_want);
-btree_err:
- if (ret == -EINTR)
- ret = 0;
- if (ret)
- goto err;
}
+ bch2_trans_iter_put(&trans, dst_iter);
+ bch2_trans_iter_put(&trans, src_iter);
- BUG_ON(bkey_cmp(dst_iter->pos, dst_end));
-err:
+ BUG_ON(!ret && bkey_cmp(dst_iter->pos, dst_end));
BUG_ON(bkey_cmp(dst_iter->pos, dst_end) > 0);
dst_done = dst_iter->pos.offset - dst_start.offset;
ret2 = bch2_inode_write(&trans, inode_iter, &inode_u) ?:
bch2_trans_commit(&trans, NULL, journal_seq, 0);
}
+
+ bch2_trans_iter_put(&trans, inode_iter);
} while (ret2 == -EINTR);
ret = bch2_trans_exit(&trans) ?: ret;
- bkey_on_stack_exit(&new_src, c);
+ bch2_bkey_buf_exit(&new_src, c);
+ bch2_bkey_buf_exit(&new_dst, c);
percpu_ref_put(&c->writes);
/* Replicas tracking - in memory: */
-static inline int u8_cmp(u8 l, u8 r)
-{
- return cmp_int(l, r);
-}
-
static void verify_replicas_entry(struct bch_replicas_entry *e)
{
#ifdef CONFIG_BCACHEFS_DEBUG
#endif
}
-static void replicas_entry_sort(struct bch_replicas_entry *e)
+void bch2_replicas_entry_sort(struct bch_replicas_entry *e)
{
bubble_sort(e->devs, e->nr_devs, u8_cmp);
}
break;
}
- replicas_entry_sort(e);
+ bch2_replicas_entry_sort(e);
}
void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
for (i = 0; i < devs.nr; i++)
e->devs[e->nr_devs++] = devs.devs[i];
- replicas_entry_sort(e);
+ bch2_replicas_entry_sort(e);
}
static struct bch_replicas_cpu
BUG_ON(!new_entry->data_type);
verify_replicas_entry(new_entry);
- new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO);
+ new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL);
if (!new.entries)
return new;
int bch2_replicas_entry_idx(struct bch_fs *c,
struct bch_replicas_entry *search)
{
- replicas_entry_sort(search);
+ bch2_replicas_entry_sort(search);
return __replicas_entry_idx(&c->replicas, search);
}
static int replicas_table_update(struct bch_fs *c,
struct bch_replicas_cpu *new_r)
{
- struct bch_fs_usage __percpu *new_usage[2] = { NULL, NULL };
- struct bch_fs_usage *new_scratch = NULL;
+ struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR];
+ struct bch_fs_usage_online *new_scratch = NULL;
struct bch_fs_usage __percpu *new_gc = NULL;
struct bch_fs_usage *new_base = NULL;
- unsigned bytes = sizeof(struct bch_fs_usage) +
+ unsigned i, bytes = sizeof(struct bch_fs_usage) +
+ sizeof(u64) * new_r->nr;
+ unsigned scratch_bytes = sizeof(struct bch_fs_usage_online) +
sizeof(u64) * new_r->nr;
- int ret = -ENOMEM;
-
- if (!(new_base = kzalloc(bytes, GFP_NOIO)) ||
- !(new_usage[0] = __alloc_percpu_gfp(bytes, sizeof(u64),
- GFP_NOIO)) ||
- !(new_usage[1] = __alloc_percpu_gfp(bytes, sizeof(u64),
- GFP_NOIO)) ||
- !(new_scratch = kmalloc(bytes, GFP_NOIO)) ||
+ int ret = 0;
+
+ memset(new_usage, 0, sizeof(new_usage));
+
+ for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+ if (!(new_usage[i] = __alloc_percpu_gfp(bytes,
+ sizeof(u64), GFP_KERNEL)))
+ goto err;
+
+ if (!(new_base = kzalloc(bytes, GFP_KERNEL)) ||
+ !(new_scratch = kmalloc(scratch_bytes, GFP_KERNEL)) ||
(c->usage_gc &&
- !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO)))) {
- bch_err(c, "error updating replicas table: memory allocation failure");
+ !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL))))
goto err;
- }
+ for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+ if (c->usage[i])
+ __replicas_table_update_pcpu(new_usage[i], new_r,
+ c->usage[i], &c->replicas);
if (c->usage_base)
__replicas_table_update(new_base, new_r,
c->usage_base, &c->replicas);
- if (c->usage[0])
- __replicas_table_update_pcpu(new_usage[0], new_r,
- c->usage[0], &c->replicas);
- if (c->usage[1])
- __replicas_table_update_pcpu(new_usage[1], new_r,
- c->usage[1], &c->replicas);
if (c->usage_gc)
__replicas_table_update_pcpu(new_gc, new_r,
c->usage_gc, &c->replicas);
+ for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+ swap(c->usage[i], new_usage[i]);
swap(c->usage_base, new_base);
- swap(c->usage[0], new_usage[0]);
- swap(c->usage[1], new_usage[1]);
swap(c->usage_scratch, new_scratch);
swap(c->usage_gc, new_gc);
swap(c->replicas, *new_r);
- ret = 0;
-err:
+out:
free_percpu(new_gc);
kfree(new_scratch);
- free_percpu(new_usage[1]);
- free_percpu(new_usage[0]);
+ for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+ free_percpu(new_usage[i]);
kfree(new_base);
return ret;
+err:
+ bch_err(c, "error updating replicas table: memory allocation failure");
+ ret = -ENOMEM;
+ goto out;
}
static unsigned reserve_journal_replicas(struct bch_fs *c,
return 0;
}
+/* replicas delta list: */
+
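+/*
+ * Returns true if every replicas entry referenced by the delta list is
+ * already present in the filesystem's replicas table:
+ */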
+bool bch2_replicas_delta_list_marked(struct bch_fs *c,
+ struct replicas_delta_list *r)
+{
+ struct replicas_delta *d = r->d;
+ struct replicas_delta *top = (void *) r->d + r->used;
+
+ percpu_rwsem_assert_held(&c->mark_lock);
+
+ for (d = r->d; d != top; d = replicas_delta_next(d))
+ if (bch2_replicas_entry_idx(c, &d->r) < 0)
+ return false;
+ return true;
+}
+
+int bch2_replicas_delta_list_mark(struct bch_fs *c,
+ struct replicas_delta_list *r)
+{
+ struct replicas_delta *d = r->d;
+ struct replicas_delta *top = (void *) r->d + r->used;
+ int ret = 0;
+
+ for (d = r->d; !ret && d != top; d = replicas_delta_next(d))
+ ret = bch2_mark_replicas(c, &d->r);
+ return ret;
+}
+
+/* bkey replicas: */
+
bool bch2_bkey_replicas_marked(struct bch_fs *c,
struct bkey_s_c k)
{
return __bch2_mark_bkey_replicas(c, k, false);
}
+/*
+ * Old replicas_gc mechanism: only used for journal replicas entries now, should
+ * die at some point:
+ */
+
int bch2_replicas_gc_end(struct bch_fs *c, int ret)
{
unsigned i;
struct bch_replicas_cpu n;
if (!__replicas_has_entry(&c->replicas_gc, e) &&
- (c->usage_base->replicas[i] ||
- percpu_u64_get(&c->usage[0]->replicas[i]) ||
- percpu_u64_get(&c->usage[1]->replicas[i]))) {
+ bch2_fs_usage_read_one(c, &c->usage_base->replicas[i])) {
n = cpu_replicas_add_entry(&c->replicas_gc, e);
if (!n.entries) {
ret = -ENOSPC;
c->replicas_gc.entries = kcalloc(c->replicas_gc.nr,
c->replicas_gc.entry_size,
- GFP_NOIO);
+ GFP_KERNEL);
if (!c->replicas_gc.entries) {
mutex_unlock(&c->sb_lock);
bch_err(c, "error allocating c->replicas_gc");
return 0;
}
+/* New much simpler mechanism for clearing out unneeded replicas entries: */
+
int bch2_replicas_gc2(struct bch_fs *c)
{
struct bch_replicas_cpu new = { 0 };
if (e->data_type == BCH_DATA_journal ||
c->usage_base->replicas[i] ||
percpu_u64_get(&c->usage[0]->replicas[i]) ||
- percpu_u64_get(&c->usage[1]->replicas[i]))
+ percpu_u64_get(&c->usage[1]->replicas[i]) ||
+ percpu_u64_get(&c->usage[2]->replicas[i]) ||
+ percpu_u64_get(&c->usage[3]->replicas[i]))
memcpy(cpu_replicas_entry(&new, new.nr++),
e, new.entry_size);
}
nr++;
}
- cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO);
+ cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
if (!cpu_r->entries)
return -ENOMEM;
for_each_replicas_entry(sb_r, e) {
dst = cpu_replicas_entry(cpu_r, idx++);
memcpy(dst, e, replicas_entry_bytes(e));
- replicas_entry_sort(dst);
+ bch2_replicas_entry_sort(dst);
}
return 0;
entry_size += sizeof(struct bch_replicas_entry) -
sizeof(struct bch_replicas_entry_v0);
- cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO);
+ cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
if (!cpu_r->entries)
return -ENOMEM;
dst->nr_devs = e->nr_devs;
dst->nr_required = 1;
memcpy(dst->devs, e->devs, e->nr_devs);
- replicas_entry_sort(dst);
+ bch2_replicas_entry_sort(dst);
}
return 0;
/* Query replicas: */
-struct replicas_status __bch2_replicas_status(struct bch_fs *c,
- struct bch_devs_mask online_devs)
+bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
+ unsigned flags, bool print)
{
- struct bch_sb_field_members *mi;
struct bch_replicas_entry *e;
- unsigned i, nr_online, nr_offline;
- struct replicas_status ret;
-
- memset(&ret, 0, sizeof(ret));
-
- for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
- ret.replicas[i].redundancy = INT_MAX;
-
- mi = bch2_sb_get_members(c->disk_sb.sb);
+ bool ret = true;
percpu_down_read(&c->mark_lock);
-
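+ /*
+ * For each replicas entry: entries whose devices have all failed are
+ * skipped; otherwise, missing devices translate into the corresponding
+ * FORCE_IF_*_LOST/DEGRADED flags, which the caller must have passed for
+ * the check to succeed:
+ */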
for_each_cpu_replicas_entry(&c->replicas, e) {
- if (e->data_type >= ARRAY_SIZE(ret.replicas))
- panic("e %p data_type %u\n", e, e->data_type);
-
- nr_online = nr_offline = 0;
+ unsigned i, nr_online = 0, nr_failed = 0, dflags = 0;
+ bool metadata = e->data_type < BCH_DATA_user;
for (i = 0; i < e->nr_devs; i++) {
- BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi,
- e->devs[i]));
+ struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]);
- if (test_bit(e->devs[i], online_devs.d))
- nr_online++;
- else
- nr_offline++;
+ nr_online += test_bit(e->devs[i], devs.d);
+ nr_failed += ca->mi.state == BCH_MEMBER_STATE_failed;
}
- ret.replicas[e->data_type].redundancy =
- min(ret.replicas[e->data_type].redundancy,
- (int) nr_online - (int) e->nr_required);
-
- ret.replicas[e->data_type].nr_offline =
- max(ret.replicas[e->data_type].nr_offline,
- nr_offline);
- }
-
- percpu_up_read(&c->mark_lock);
-
- for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
- if (ret.replicas[i].redundancy == INT_MAX)
- ret.replicas[i].redundancy = 0;
+ if (nr_failed == e->nr_devs)
+ continue;
- return ret;
-}
+ if (nr_online < e->nr_required)
+ dflags |= metadata
+ ? BCH_FORCE_IF_METADATA_LOST
+ : BCH_FORCE_IF_DATA_LOST;
-struct replicas_status bch2_replicas_status(struct bch_fs *c)
-{
- return __bch2_replicas_status(c, bch2_online_devs(c));
-}
+ if (nr_online < e->nr_devs)
+ dflags |= metadata
+ ? BCH_FORCE_IF_METADATA_DEGRADED
+ : BCH_FORCE_IF_DATA_DEGRADED;
-static bool have_enough_devs(struct replicas_status s,
- enum bch_data_type type,
- bool force_if_degraded,
- bool force_if_lost)
-{
- return (!s.replicas[type].nr_offline || force_if_degraded) &&
- (s.replicas[type].redundancy >= 0 || force_if_lost);
-}
+ if (dflags & ~flags) {
+ if (print) {
+ char buf[100];
-bool bch2_have_enough_devs(struct replicas_status s, unsigned flags)
-{
- return (have_enough_devs(s, BCH_DATA_journal,
- flags & BCH_FORCE_IF_METADATA_DEGRADED,
- flags & BCH_FORCE_IF_METADATA_LOST) &&
- have_enough_devs(s, BCH_DATA_btree,
- flags & BCH_FORCE_IF_METADATA_DEGRADED,
- flags & BCH_FORCE_IF_METADATA_LOST) &&
- have_enough_devs(s, BCH_DATA_user,
- flags & BCH_FORCE_IF_DATA_DEGRADED,
- flags & BCH_FORCE_IF_DATA_LOST));
-}
+ bch2_replicas_entry_to_text(&PBUF(buf), e);
+ bch_err(c, "insufficient devices online (%u) for replicas entry %s",
+ nr_online, buf);
+ }
+ ret = false;
+ break;
+ }
-int bch2_replicas_online(struct bch_fs *c, bool meta)
-{
- struct replicas_status s = bch2_replicas_status(c);
+ }
+ percpu_up_read(&c->mark_lock);
- return (meta
- ? min(s.replicas[BCH_DATA_journal].redundancy,
- s.replicas[BCH_DATA_btree].redundancy)
- : s.replicas[BCH_DATA_user].redundancy) + 1;
+ return ret;
}
unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
int bch2_fs_replicas_init(struct bch_fs *c)
{
- c->journal.entry_u64s_reserved +=
- reserve_journal_replicas(c, &c->replicas);
+ bch2_journal_entry_res_resize(&c->journal,
+ &c->replicas_journal_res,
+ reserve_journal_replicas(c, &c->replicas));
return replicas_table_update(c, &c->replicas);
}
#include "eytzinger.h"
#include "replicas_types.h"
+void bch2_replicas_entry_sort(struct bch_replicas_entry *);
void bch2_replicas_entry_to_text(struct printbuf *,
struct bch_replicas_entry *);
void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
int bch2_mark_replicas(struct bch_fs *,
struct bch_replicas_entry *);
+struct replicas_delta {
+ s64 delta;
+ struct bch_replicas_entry r;
+} __packed;
+
+struct replicas_delta_list {
+ unsigned size;
+ unsigned used;
+
+ struct {} memset_start;
+ u64 nr_inodes;
+ u64 persistent_reserved[BCH_REPLICAS_MAX];
+ struct {} memset_end;
+ struct replicas_delta d[0];
+};
+
+static inline struct replicas_delta *
+replicas_delta_next(struct replicas_delta *d)
+{
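+ /* the + 8 covers the leading s64 delta field of struct replicas_delta: */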
+ return (void *) d + replicas_entry_bytes(&d->r) + 8;
+}
+
+bool bch2_replicas_delta_list_marked(struct bch_fs *, struct replicas_delta_list *);
+int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *);
+
void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c);
bool bch2_bkey_replicas_marked(struct bch_fs *, struct bkey_s_c);
int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c);
e->devs[0] = dev;
}
-struct replicas_status {
- struct {
- int redundancy;
- unsigned nr_offline;
- } replicas[BCH_DATA_NR];
-};
-
-struct replicas_status __bch2_replicas_status(struct bch_fs *,
- struct bch_devs_mask);
-struct replicas_status bch2_replicas_status(struct bch_fs *);
-bool bch2_have_enough_devs(struct replicas_status, unsigned);
+bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask,
+ unsigned, bool);
-int bch2_replicas_online(struct bch_fs *, bool);
unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
int bch2_replicas_gc_end(struct bch_fs *, int);
bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
{
switch (opt) {
- case BCH_STR_HASH_OPT_CRC32C:
+ case BCH_STR_HASH_OPT_crc32c:
return BCH_STR_HASH_CRC32C;
- case BCH_STR_HASH_OPT_CRC64:
+ case BCH_STR_HASH_OPT_crc64:
return BCH_STR_HASH_CRC64;
- case BCH_STR_HASH_OPT_SIPHASH:
+ case BCH_STR_HASH_OPT_siphash:
return c->sb.features & (1ULL << BCH_FEATURE_new_siphash)
? BCH_STR_HASH_SIPHASH
: BCH_STR_HASH_SIPHASH_OLD;
if (k.k->type == desc.key_type) {
if (!desc.cmp_key(k, key))
return iter;
- } else if (k.k->type == KEY_TYPE_whiteout) {
+ } else if (k.k->type == KEY_TYPE_hash_whiteout) {
;
} else {
/* hole, not found */
int ret;
iter = bch2_trans_copy_iter(trans, start);
- if (IS_ERR(iter))
- return PTR_ERR(iter);
bch2_btree_iter_next_slot(iter);
for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) {
if (k.k->type != desc.key_type &&
- k.k->type != KEY_TYPE_whiteout)
+ k.k->type != KEY_TYPE_hash_whiteout)
break;
if (k.k->type == desc.key_type &&
}
if (!slot &&
- !(flags & BCH_HASH_SET_MUST_REPLACE)) {
+ !(flags & BCH_HASH_SET_MUST_REPLACE))
slot = bch2_trans_copy_iter(trans, iter);
- if (IS_ERR(slot))
- return PTR_ERR(slot);
- }
- if (k.k->type != KEY_TYPE_whiteout)
+ if (k.k->type != KEY_TYPE_hash_whiteout)
goto not_found;
}
bkey_init(&delete->k);
delete->k.p = iter->pos;
- delete->k.type = ret ? KEY_TYPE_whiteout : KEY_TYPE_deleted;
+ delete->k.type = ret ? KEY_TYPE_hash_whiteout : KEY_TYPE_deleted;
bch2_trans_update(trans, iter, delete, 0);
return 0;
#include "error.h"
#include "io.h"
#include "journal.h"
+#include "journal_io.h"
#include "journal_seq_blacklist.h"
#include "replicas.h"
#include "quota.h"
return "Bad number of member devices";
if (!BCH_SB_META_REPLICAS_WANT(sb) ||
- BCH_SB_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
+ BCH_SB_META_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX)
return "Invalid number of metadata replicas";
if (!BCH_SB_META_REPLICAS_REQ(sb) ||
- BCH_SB_META_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
+ BCH_SB_META_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX)
return "Invalid number of metadata replicas";
if (!BCH_SB_DATA_REPLICAS_WANT(sb) ||
- BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
+ BCH_SB_DATA_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX)
return "Invalid number of data replicas";
if (!BCH_SB_DATA_REPLICAS_REQ(sb) ||
- BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
+ BCH_SB_DATA_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX)
return "Invalid number of data replicas";
if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
c->sb.uuid = src->uuid;
c->sb.user_uuid = src->user_uuid;
c->sb.version = le16_to_cpu(src->version);
+ c->sb.version_min = le16_to_cpu(src->version_min);
c->sb.nr_devices = src->nr_devices;
c->sb.clean = BCH_SB_CLEAN(src);
c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src);
ca->mi = bch2_mi_to_cpu(mi->members + i);
}
-/* doesn't copy member info */
static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src)
{
struct bch_sb_field *src_f, *dst_f;
bdev_logical_block_size(sb->bdev))
goto err;
- if (sb->mode & FMODE_WRITE)
- bdev_get_queue(sb->bdev)->backing_dev_info->capabilities
- |= BDI_CAP_STABLE_WRITES;
ret = 0;
sb->have_layout = true;
out:
/* XXX: return errors directly */
- if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write: %s",
+ if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write error: %s",
bch2_blk_status_to_str(bio->bi_status)))
ca->sb_write_error = 1;
if (test_bit(BCH_FS_ERROR, &c->flags))
SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1);
+ SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN);
+
for_each_online_member(ca, c, i)
bch2_sb_from_fs(c, ca);
nr_wrote = dev_mask_nr(&sb_written);
can_mount_with_written =
- bch2_have_enough_devs(__bch2_replicas_status(c, sb_written),
- BCH_FORCE_IF_DEGRADED);
+ bch2_have_enough_devs(c, sb_written, BCH_FORCE_IF_DEGRADED, false);
for (i = 0; i < ARRAY_SIZE(sb_written.d); i++)
sb_written.d[i] = ~sb_written.d[i];
can_mount_without_written =
- bch2_have_enough_devs(__bch2_replicas_status(c, sb_written),
- BCH_FORCE_IF_DEGRADED);
+ bch2_have_enough_devs(c, sb_written, BCH_FORCE_IF_DEGRADED, false);
/*
* If we would be able to mount _without_ the devices we successfully
* mount with the devices we did successfully write to:
*/
if (bch2_fs_fatal_err_on(!nr_wrote ||
+ !can_mount_with_written ||
(can_mount_without_written &&
!can_mount_with_written), c,
"Unable to write superblock to sufficient devices"))
/* BCH_SB_FIELD_clean: */
-void bch2_sb_clean_renumber(struct bch_sb_field_clean *clean, int write)
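+/*
+ * Validate each journal entry embedded in the superblock clean section; this
+ * replaces the old bkey renumbering pass:
+ */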
+int bch2_sb_clean_validate(struct bch_fs *c, struct bch_sb_field_clean *clean, int write)
{
struct jset_entry *entry;
+ int ret;
for (entry = clean->start;
entry < (struct jset_entry *) vstruct_end(&clean->field);
- entry = vstruct_next(entry))
- bch2_bkey_renumber(BKEY_TYPE_BTREE, bkey_to_packed(entry->start), write);
+ entry = vstruct_next(entry)) {
+ ret = bch2_journal_entry_validate(c, "superblock", entry,
+ le16_to_cpu(c->disk_sb.sb->version),
+ BCH_SB_BIG_ENDIAN(c->disk_sb.sb),
+ write);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
}
int bch2_fs_mark_dirty(struct bch_fs *c)
mutex_lock(&c->sb_lock);
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
- c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite;
- c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates;
- c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_updates_journalled;
+ c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALWAYS;
ret = bch2_write_super(c);
mutex_unlock(&c->sb_lock);
return ret;
}
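+/*
+ * Carve the next journal entry out of the buffer at *end: zero it, set its
+ * size from @size (in bytes), and advance *end past it:
+ */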
-static void
-entry_init_u64s(struct jset_entry *entry, unsigned u64s)
+static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
{
- memset(entry, 0, u64s * sizeof(u64));
+ struct jset_entry *entry = *end;
+ unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
+ memset(entry, 0, u64s * sizeof(u64));
/*
* The u64s field counts from the start of data, ignoring the shared
* fields.
*/
entry->u64s = u64s - 1;
-}
-static void
-entry_init_size(struct jset_entry *entry, size_t size)
-{
- unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
- entry_init_u64s(entry, u64s);
+ *end = vstruct_next(*end);
+ return entry;
}
-struct jset_entry *
-bch2_journal_super_entries_add_common(struct bch_fs *c,
- struct jset_entry *entry,
- u64 journal_seq)
+void bch2_journal_super_entries_add_common(struct bch_fs *c,
+ struct jset_entry **end,
+ u64 journal_seq)
{
- unsigned i;
+ struct bch_dev *ca;
+ unsigned i, dev;
- percpu_down_write(&c->mark_lock);
+ percpu_down_read(&c->mark_lock);
if (!journal_seq) {
- bch2_fs_usage_acc_to_base(c, 0);
- bch2_fs_usage_acc_to_base(c, 1);
+ for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+ bch2_fs_usage_acc_to_base(c, i);
} else {
- bch2_fs_usage_acc_to_base(c, journal_seq & 1);
+ bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK);
}
{
struct jset_entry_usage *u =
- container_of(entry, struct jset_entry_usage, entry);
+ container_of(jset_entry_init(end, sizeof(*u)),
+ struct jset_entry_usage, entry);
- entry_init_size(entry, sizeof(*u));
u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = FS_USAGE_INODES;
u->v = cpu_to_le64(c->usage_base->nr_inodes);
-
- entry = vstruct_next(entry);
}
{
struct jset_entry_usage *u =
- container_of(entry, struct jset_entry_usage, entry);
+ container_of(jset_entry_init(end, sizeof(*u)),
+ struct jset_entry_usage, entry);
- entry_init_size(entry, sizeof(*u));
u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = FS_USAGE_KEY_VERSION;
u->v = cpu_to_le64(atomic64_read(&c->key_version));
-
- entry = vstruct_next(entry);
}
for (i = 0; i < BCH_REPLICAS_MAX; i++) {
struct jset_entry_usage *u =
- container_of(entry, struct jset_entry_usage, entry);
+ container_of(jset_entry_init(end, sizeof(*u)),
+ struct jset_entry_usage, entry);
- entry_init_size(entry, sizeof(*u));
u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = FS_USAGE_RESERVED;
u->entry.level = i;
u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]);
-
- entry = vstruct_next(entry);
}
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
cpu_replicas_entry(&c->replicas, i);
struct jset_entry_data_usage *u =
- container_of(entry, struct jset_entry_data_usage, entry);
+ container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs),
+ struct jset_entry_data_usage, entry);
- entry_init_size(entry, sizeof(*u) + e->nr_devs);
u->entry.type = BCH_JSET_ENTRY_data_usage;
u->v = cpu_to_le64(c->usage_base->replicas[i]);
memcpy(&u->r, e, replicas_entry_bytes(e));
+ }
- entry = vstruct_next(entry);
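+ /*
+ * Also persist per-device usage, so journal_replay_entry_early() can
+ * restore it at mount time:
+ */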
+ for_each_member_device(ca, c, dev) {
+ unsigned b = sizeof(struct jset_entry_dev_usage) +
+ sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR;
+ struct jset_entry_dev_usage *u =
+ container_of(jset_entry_init(end, b),
+ struct jset_entry_dev_usage, entry);
+
+ u->entry.type = BCH_JSET_ENTRY_dev_usage;
+ u->dev = cpu_to_le32(dev);
+ u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec);
+ u->buckets_unavailable = cpu_to_le64(ca->usage_base->buckets_unavailable);
+
+ for (i = 0; i < BCH_DATA_NR; i++) {
+ u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets);
+ u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors);
+ u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented);
+ }
}
- percpu_up_write(&c->mark_lock);
+ percpu_up_read(&c->mark_lock);
- return entry;
+ for (i = 0; i < 2; i++) {
+ struct jset_entry_clock *clock =
+ container_of(jset_entry_init(end, sizeof(*clock)),
+ struct jset_entry_clock, entry);
+
+ clock->entry.type = BCH_JSET_ENTRY_clock;
+ clock->rw = i;
+ clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now));
+ }
}
void bch2_fs_mark_clean(struct bch_fs *c)
struct bch_sb_field_clean *sb_clean;
struct jset_entry *entry;
unsigned u64s;
+ int ret;
mutex_lock(&c->sb_lock);
if (BCH_SB_CLEAN(c->disk_sb.sb))
SET_BCH_SB_CLEAN(c->disk_sb.sb, true);
- c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO;
- c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA;
+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_info;
+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_metadata;
c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates);
c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_btree_updates_journalled);
}
sb_clean->flags = 0;
- sb_clean->read_clock = cpu_to_le16(c->bucket_clock[READ].hand);
- sb_clean->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand);
sb_clean->journal_seq = cpu_to_le64(journal_cur_seq(&c->journal) - 1);
/* Trying to catch outstanding bug: */
BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
entry = sb_clean->start;
- entry = bch2_journal_super_entries_add_common(c, entry, 0);
+ bch2_journal_super_entries_add_common(c, &entry, 0);
entry = bch2_btree_roots_to_journal_entries(c, entry, entry);
BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
memset(entry, 0,
vstruct_end(&sb_clean->field) - (void *) entry);
- if (le16_to_cpu(c->disk_sb.sb->version) <
- bcachefs_metadata_version_bkey_renumber)
- bch2_sb_clean_renumber(sb_clean, WRITE);
+ /*
+ * this should be in the write path, and we should be validating every
+ * superblock section:
+ */
+ ret = bch2_sb_clean_validate(c, sb_clean, WRITE);
+ if (ret) {
+ bch_err(c, "error writing marking filesystem clean: validate error");
+ goto out;
+ }
bch2_write_super(c);
out:
/* BCH_SB_FIELD_clean: */
-struct jset_entry *
-bch2_journal_super_entries_add_common(struct bch_fs *,
- struct jset_entry *, u64);
+void bch2_journal_super_entries_add_common(struct bch_fs *,
+ struct jset_entry **, u64);
-void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int);
+int bch2_sb_clean_validate(struct bch_fs *, struct bch_sb_field_clean *, int);
int bch2_fs_mark_dirty(struct bch_fs *);
void bch2_fs_mark_clean(struct bch_fs *);
#include <linux/device.h>
#include <linux/genhd.h>
#include <linux/idr.h>
-#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/random.h>
return c;
}
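+/*
+ * Reserve journal space for one BCH_JSET_ENTRY_dev_usage entry per member
+ * device; called again when devices are added or removed.
+ */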
+static void bch2_dev_usage_journal_reserve(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i, nr = 0, u64s =
+ ((sizeof(struct jset_entry_dev_usage) +
+ sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR)) /
+ sizeof(u64);
+
+ rcu_read_lock();
+ for_each_member_device_rcu(ca, c, i, NULL)
+ nr++;
+ rcu_read_unlock();
+
+ bch2_journal_entry_res_resize(&c->journal,
+ &c->dev_usage_journal_res, u64s * nr);
+}
+
/* Filesystem RO/RW: */
/*
bch2_copygc_stop(c);
bch2_gc_thread_stop(c);
- bch2_io_timer_del(&c->io_clock[READ], &c->bucket_clock[READ].rescale);
- bch2_io_timer_del(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale);
-
/*
* Flush journal before stopping allocators, because flushing journal
* blacklist entries involves allocating new btree nodes:
* the journal kicks off btree writes via reclaim - wait for in flight
* writes after stopping journal:
*/
- if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
- bch2_btree_flush_all_writes(c);
- else
- bch2_btree_verify_flushed(c);
+ bch2_btree_flush_all_writes(c);
/*
* After stopping journal:
void bch2_fs_read_only(struct bch_fs *c)
{
if (!test_bit(BCH_FS_RW, &c->flags)) {
- cancel_delayed_work_sync(&c->journal.reclaim_work);
+ BUG_ON(c->journal.reclaim_thread);
return;
}
(!early || c->opts.read_only)))
return -EROFS;
+ bch_info(c, "going read-write");
+
ret = bch2_fs_mark_dirty(c);
if (ret)
goto err;
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
- bch2_io_timer_add(&c->io_clock[READ], &c->bucket_clock[READ].rescale);
- bch2_io_timer_add(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale);
-
for_each_rw_member(ca, c, i) {
ret = bch2_dev_allocator_start(ca);
if (ret) {
set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
+ for_each_rw_member(ca, c, i)
+ bch2_wake_allocator(ca);
+
+ ret = bch2_journal_reclaim_start(&c->journal);
+ if (ret) {
+ bch_err(c, "error starting journal reclaim: %i", ret);
+ return ret;
+ }
+
if (!early) {
ret = bch2_fs_read_write_late(c);
if (ret)
percpu_ref_reinit(&c->writes);
set_bit(BCH_FS_RW, &c->flags);
-
- queue_delayed_work(c->journal_reclaim_wq,
- &c->journal.reclaim_work, 0);
return 0;
err:
__bch2_fs_read_only(c);
static void __bch2_fs_free(struct bch_fs *c)
{
unsigned i;
+ int cpu;
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_exit(&c->times[i]);
bch2_journal_entries_free(&c->journal_entries);
percpu_free_rwsem(&c->mark_lock);
kfree(c->usage_scratch);
- free_percpu(c->usage[1]);
- free_percpu(c->usage[0]);
+ for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+ free_percpu(c->usage[i]);
kfree(c->usage_base);
+
+ if (c->btree_iters_bufs)
+ for_each_possible_cpu(cpu)
+ kfree(per_cpu_ptr(c->btree_iters_bufs, cpu)->iter);
+
+ free_percpu(c->online_reserved);
+ free_percpu(c->btree_iters_bufs);
free_percpu(c->pcpu);
mempool_exit(&c->large_bkey_pool);
mempool_exit(&c->btree_bounce_pool);
kfree(c->replicas_gc.entries);
kfree(rcu_dereference_protected(c->disk_groups, 1));
kfree(c->journal_seq_blacklist_table);
+ kfree(c->unused_inode_hints);
free_heap(&c->copygc_heap);
- if (c->journal_reclaim_wq)
- destroy_workqueue(c->journal_reclaim_wq);
if (c->copygc_wq)
destroy_workqueue(c->copygc_wq);
if (c->wq)
bch2_blacklist_entries_gc);
INIT_LIST_HEAD(&c->journal_entries);
+ INIT_LIST_HEAD(&c->journal_iters);
INIT_LIST_HEAD(&c->fsck_errors);
mutex_init(&c->fsck_error_lock);
bch2_fs_btree_cache_init_early(&c->btree_cache);
+ mutex_init(&c->sectors_available_lock);
+
if (percpu_init_rwsem(&c->mark_lock))
goto err;
(btree_blocks(c) + 1) * 2 *
sizeof(struct sort_iter_set);
+ c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus()));
+
if (!(c->wq = alloc_workqueue("bcachefs",
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
- !(c->copygc_wq = alloc_workqueue("bcache_copygc",
+ !(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
- !(c->journal_reclaim_wq = alloc_workqueue("bcache_journal",
- WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
percpu_ref_init(&c->writes, bch2_writes_disabled,
PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
offsetof(struct btree_write_bio, wbio.bio)),
BIOSET_NEED_BVECS) ||
!(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
+ !(c->btree_iters_bufs = alloc_percpu(struct btree_iter_buf)) ||
+ !(c->online_reserved = alloc_percpu(u64)) ||
mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
btree_bytes(c)) ||
mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
+ !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
+ sizeof(u64), GFP_KERNEL)) ||
bch2_io_clock_init(&c->io_clock[READ]) ||
bch2_io_clock_init(&c->io_clock[WRITE]) ||
bch2_fs_journal_init(&c->journal) ||
bch2_dev_alloc(c, i))
goto err;
+ bch2_journal_entry_res_resize(&c->journal,
+ &c->btree_root_journal_res,
+ BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX));
+ bch2_dev_usage_journal_reserve(c);
+ bch2_journal_entry_res_resize(&c->journal,
+ &c->clock_journal_res,
+ (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2);
+
mutex_lock(&bch_fs_list_lock);
err = bch2_fs_online(c);
mutex_unlock(&bch_fs_list_lock);
static void bch2_dev_free(struct bch_dev *ca)
{
+ bch2_dev_allocator_stop(ca);
+
cancel_work_sync(&ca->io_error_work);
if (ca->kobj.state_in_sysfs &&
if (!ca)
goto err;
+ ca->fs = c;
+
+ if (ca->mi.state == BCH_MEMBER_STATE_rw &&
+ bch2_dev_allocator_start(ca)) {
+ bch2_dev_free(ca);
+ goto err;
+ }
+
bch2_dev_attach(c, ca, dev_idx);
out:
pr_verbose_init(c->opts, "ret %i", ret);
if (ret)
return ret;
- if (test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags) &&
- !percpu_u64_get(&ca->usage[0]->buckets[BCH_DATA_sb])) {
- mutex_lock(&c->sb_lock);
- bch2_mark_dev_superblock(ca->fs, ca, 0);
- mutex_unlock(&c->sb_lock);
- }
-
bch2_dev_sysfs_online(c, ca);
if (c->sb.nr_devices == 1)
enum bch_member_state new_state, int flags)
{
struct bch_devs_mask new_online_devs;
- struct replicas_status s;
struct bch_dev *ca2;
int i, nr_rw = 0, required;
lockdep_assert_held(&c->state_lock);
switch (new_state) {
- case BCH_MEMBER_STATE_RW:
+ case BCH_MEMBER_STATE_rw:
return true;
- case BCH_MEMBER_STATE_RO:
- if (ca->mi.state != BCH_MEMBER_STATE_RW)
+ case BCH_MEMBER_STATE_ro:
+ if (ca->mi.state != BCH_MEMBER_STATE_rw)
return true;
/* do we have enough devices to write to? */
for_each_member_device(ca2, c, i)
if (ca2 != ca)
- nr_rw += ca2->mi.state == BCH_MEMBER_STATE_RW;
+ nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw;
required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
? c->opts.metadata_replicas
: c->opts.data_replicas_required);
return nr_rw >= required;
- case BCH_MEMBER_STATE_FAILED:
- case BCH_MEMBER_STATE_SPARE:
- if (ca->mi.state != BCH_MEMBER_STATE_RW &&
- ca->mi.state != BCH_MEMBER_STATE_RO)
+ case BCH_MEMBER_STATE_failed:
+ case BCH_MEMBER_STATE_spare:
+ if (ca->mi.state != BCH_MEMBER_STATE_rw &&
+ ca->mi.state != BCH_MEMBER_STATE_ro)
return true;
/* do we have enough devices to read from? */
new_online_devs = bch2_online_devs(c);
__clear_bit(ca->dev_idx, new_online_devs.d);
- s = __bch2_replicas_status(c, new_online_devs);
-
- return bch2_have_enough_devs(s, flags);
+ return bch2_have_enough_devs(c, new_online_devs, flags, false);
default:
BUG();
}
static bool bch2_fs_may_start(struct bch_fs *c)
{
- struct replicas_status s;
struct bch_sb_field_members *mi;
struct bch_dev *ca;
- unsigned i, flags = c->opts.degraded
- ? BCH_FORCE_IF_DEGRADED
- : 0;
+ unsigned i, flags = 0;
+
+ if (c->opts.very_degraded)
+ flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST;
+
+ if (c->opts.degraded)
+ flags |= BCH_FORCE_IF_DEGRADED;
- if (!c->opts.degraded) {
+ if (!c->opts.degraded &&
+ !c->opts.very_degraded) {
mutex_lock(&c->sb_lock);
mi = bch2_sb_get_members(c->disk_sb.sb);
ca = bch_dev_locked(c, i);
if (!bch2_dev_is_online(ca) &&
- (ca->mi.state == BCH_MEMBER_STATE_RW ||
- ca->mi.state == BCH_MEMBER_STATE_RO)) {
+ (ca->mi.state == BCH_MEMBER_STATE_rw ||
+ ca->mi.state == BCH_MEMBER_STATE_ro)) {
mutex_unlock(&c->sb_lock);
return false;
}
mutex_unlock(&c->sb_lock);
}
- s = bch2_replicas_status(c);
-
- return bch2_have_enough_devs(s, flags);
+ return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true);
}
static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
{
lockdep_assert_held(&c->state_lock);
- BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW);
+ BUG_ON(ca->mi.state != BCH_MEMBER_STATE_rw);
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
if (!bch2_dev_state_allowed(c, ca, new_state, flags))
return -EINVAL;
- if (new_state != BCH_MEMBER_STATE_RW)
+ if (new_state != BCH_MEMBER_STATE_rw)
__bch2_dev_read_only(c, ca);
- bch_notice(ca, "%s", bch2_dev_state[new_state]);
+ bch_notice(ca, "%s", bch2_member_states[new_state]);
mutex_lock(&c->sb_lock);
mi = bch2_sb_get_members(c->disk_sb.sb);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- if (new_state == BCH_MEMBER_STATE_RW &&
+ if (new_state == BCH_MEMBER_STATE_rw &&
__bch2_dev_read_write(c, ca))
ret = -ENOMEM;
for (i = 0; i < ca->mi.nbuckets; i++) {
ret = bch2_btree_key_cache_flush(&trans,
- BTREE_ID_ALLOC, POS(ca->dev_idx, i));
+ BTREE_ID_alloc, POS(ca->dev_idx, i));
if (ret)
break;
}
if (ret)
return ret;
- return bch2_btree_delete_range(c, BTREE_ID_ALLOC,
+ return bch2_btree_delete_range(c, BTREE_ID_alloc,
POS(ca->dev_idx, 0),
POS(ca->dev_idx + 1, 0),
NULL);
*/
percpu_ref_put(&ca->ref);
- if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) {
+ if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
bch_err(ca, "Cannot remove without losing data");
goto err;
}
mutex_unlock(&c->sb_lock);
up_write(&c->state_lock);
+
+ bch2_dev_usage_journal_reserve(c);
return 0;
err:
- if (ca->mi.state == BCH_MEMBER_STATE_RW &&
+ if (ca->mi.state == BCH_MEMBER_STATE_rw &&
!percpu_ref_is_zero(&ca->io_ref))
__bch2_dev_read_write(c, ca);
up_write(&c->state_lock);
return ret;
}
-static void dev_usage_clear(struct bch_dev *ca)
-{
- struct bucket_array *buckets;
-
- percpu_memset(ca->usage[0], 0, sizeof(*ca->usage[0]));
-
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
-
- memset(buckets->b, 0, sizeof(buckets->b[0]) * buckets->nbuckets);
- up_read(&ca->bucket_lock);
-}
-
/* Add new device to running filesystem: */
int bch2_dev_add(struct bch_fs *c, const char *path)
{
* allocate the journal, reset all the marks, then remark after we
* attach...
*/
- bch2_mark_dev_superblock(ca->fs, ca, 0);
+ bch2_mark_dev_superblock(NULL, ca, 0);
err = "journal alloc failed";
ret = bch2_dev_journal_alloc(ca);
if (ret)
goto err;
- dev_usage_clear(ca);
-
down_write(&c->state_lock);
mutex_lock(&c->sb_lock);
ca->disk_sb.sb->dev_idx = dev_idx;
bch2_dev_attach(c, ca, dev_idx);
- bch2_mark_dev_superblock(c, ca, 0);
-
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- err = "alloc write failed";
- ret = bch2_dev_alloc_write(c, ca, 0);
+ bch2_dev_usage_journal_reserve(c);
+
+ err = "error marking superblock";
+ ret = bch2_trans_mark_dev_sb(c, NULL, ca);
if (ret)
- goto err;
+ goto err_late;
- if (ca->mi.state == BCH_MEMBER_STATE_RW) {
+ if (ca->mi.state == BCH_MEMBER_STATE_rw) {
err = __bch2_dev_read_write(c, ca);
if (err)
goto err_late;
bch_err(c, "Unable to add device: %s", err);
return ret;
err_late:
+ up_write(&c->state_lock);
bch_err(c, "Error going rw after adding device: %s", err);
return -EINVAL;
}
}
ca = bch_dev_locked(c, dev_idx);
- if (ca->mi.state == BCH_MEMBER_STATE_RW) {
+
+ if (bch2_trans_mark_dev_sb(c, NULL, ca)) {
+ err = "bch2_trans_mark_dev_sb() error";
+ goto err;
+ }
+
+ if (ca->mi.state == BCH_MEMBER_STATE_rw) {
err = __bch2_dev_read_write(c, ca);
if (err)
goto err;
return 0;
}
- if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) {
+ if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
bch_err(ca, "Cannot offline required disk");
up_write(&c->state_lock);
return -EINVAL;
bch2_debug_exit();
bch2_vfs_exit();
bch2_chardev_exit();
+ bch2_btree_key_cache_exit();
if (bcachefs_kset)
kset_unregister(bcachefs_kset);
}
static int __init bcachefs_init(void)
{
bch2_bkey_pack_test();
- bch2_inode_pack_test();
if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) ||
+ bch2_btree_key_cache_init() ||
bch2_chardev_init() ||
bch2_vfs_init() ||
bch2_debug_init())
static inline bool bch2_dev_is_readable(struct bch_dev *ca)
{
return bch2_dev_is_online(ca) &&
- ca->mi.state != BCH_MEMBER_STATE_FAILED;
+ ca->mi.state != BCH_MEMBER_STATE_failed;
}
static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw)
if (!percpu_ref_tryget(&ca->io_ref))
return false;
- if (ca->mi.state == BCH_MEMBER_STATE_RW ||
- (ca->mi.state == BCH_MEMBER_STATE_RO && rw == READ))
+ if (ca->mi.state == BCH_MEMBER_STATE_rw ||
+ (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ))
return true;
percpu_ref_put(&ca->io_ref);
__for_each_online_member(ca, c, iter, ~0)
#define for_each_rw_member(ca, c, iter) \
- __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_RW)
+ __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw)
#define for_each_readable_member(ca, c, iter) \
__for_each_online_member(ca, c, iter, \
- (1 << BCH_MEMBER_STATE_RW)|(1 << BCH_MEMBER_STATE_RO))
+ (1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro))
/*
* If a key exists that references a device, the device won't be going away and
struct bch_devs_list {
u8 nr;
- u8 devs[BCH_REPLICAS_MAX + 1];
+ u8 devs[BCH_BKEY_PTRS_MAX];
};
struct bch_member_cpu {
read_attribute(io_latency_stats_write);
read_attribute(congested);
+read_attribute(btree_avg_write_size);
+
read_attribute(bucket_quantiles_last_read);
read_attribute(bucket_quantiles_last_write);
read_attribute(bucket_quantiles_fragmentation);
read_attribute(journal_pins);
read_attribute(btree_updates);
read_attribute(dirty_btree_nodes);
+read_attribute(btree_cache);
read_attribute(btree_key_cache);
read_attribute(btree_transactions);
read_attribute(stripes_heap);
rw_attribute(pd_controllers_update_seconds);
-read_attribute(meta_replicas_have);
-read_attribute(data_replicas_have);
-
read_attribute(io_timers_read);
read_attribute(io_timers_write);
write_attribute(perf_test);
#endif /* CONFIG_BCACHEFS_TESTS */
-#define BCH_DEBUG_PARAM(name, description) \
- rw_attribute(name);
-
- BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
#define x(_name) \
static struct attribute sysfs_time_stat_##_name = \
{ .name = #_name, .mode = S_IRUGO };
return ret;
}
+static size_t bch2_btree_avg_write_size(struct bch_fs *c)
+{
+ u64 nr = atomic64_read(&c->btree_writes_nr);
+ u64 sectors = atomic64_read(&c->btree_writes_sectors);
+
+ return nr ? div64_u64(sectors, nr) : 0;
+}
+
static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
{
- struct bch_fs_usage *fs_usage = bch2_fs_usage_read(c);
+ struct bch_fs_usage_online *fs_usage = bch2_fs_usage_read(c);
if (!fs_usage)
return -ENOMEM;
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, 0, k, ret)
+ for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, 0, k, ret)
if (k.k->type == KEY_TYPE_extent) {
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
const union bch_extent_entry *entry;
sysfs_print(block_size, block_bytes(c));
sysfs_print(btree_node_size, btree_bytes(c));
sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c));
+ sysfs_hprint(btree_avg_write_size, bch2_btree_avg_write_size(c));
sysfs_print(read_realloc_races,
atomic_long_read(&c->read_realloc_races));
sysfs_print(promote_whole_extents, c->promote_whole_extents);
- sysfs_printf(meta_replicas_have, "%i", bch2_replicas_online(c, true));
- sysfs_printf(data_replicas_have, "%i", bch2_replicas_online(c, false));
-
/* Debugging: */
if (attr == &sysfs_alloc_debug)
return out.pos - buf;
}
+ if (attr == &sysfs_btree_cache) {
+ bch2_btree_cache_to_text(&out, c);
+ return out.pos - buf;
+ }
+
if (attr == &sysfs_btree_key_cache) {
bch2_btree_key_cache_to_text(&out, &c->btree_key_cache);
return out.pos - buf;
return out.pos - buf;
}
-#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name);
- BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
return 0;
}
/* Debugging: */
-#define BCH_DEBUG_PARAM(name, description) sysfs_strtoul(name, c->name);
- BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
if (!test_bit(BCH_FS_STARTED, &c->flags))
return -EPERM;
/* Debugging: */
if (attr == &sysfs_trigger_journal_flush)
- bch2_journal_meta_async(&c->journal, NULL);
+ bch2_journal_meta(&c->journal);
if (attr == &sysfs_trigger_btree_coalesce)
bch2_coalesce(c);
*/
#if 0
down_read(&c->state_lock);
- bch2_gc(c, NULL, false, false);
+ bch2_gc(c, false, false);
up_read(&c->state_lock);
#else
bch2_gc_gens(c);
if (threads_str &&
!(ret = kstrtouint(threads_str, 10, &threads)) &&
!(ret = bch2_strtoull_h(nr_str, &nr)))
- bch2_btree_perf_test(c, test, nr, threads);
- else
- size = ret;
+ ret = bch2_btree_perf_test(c, test, nr, threads);
kfree(tmp);
+
+ if (ret)
+ size = ret;
}
#endif
return size;
&sysfs_block_size,
&sysfs_btree_node_size,
&sysfs_btree_cache_size,
-
- &sysfs_meta_replicas_have,
- &sysfs_data_replicas_have,
+ &sysfs_btree_avg_write_size,
&sysfs_journal_write_delay_ms,
&sysfs_journal_reclaim_delay_ms,
&sysfs_journal_pins,
&sysfs_btree_updates,
&sysfs_dirty_btree_nodes,
+ &sysfs_btree_cache,
&sysfs_btree_key_cache,
&sysfs_btree_transactions,
&sysfs_stripes_heap,
&sysfs_io_timers_write,
&sysfs_internal_uuid,
-
-#define BCH_DEBUG_PARAM(name, description) &sysfs_##name,
- BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
NULL
};
{
int rw = (private ? 1 : 0);
- return bucket_last_io(c, bucket(ca, b), rw);
+ return atomic64_read(&c->io_clock[rw].now) - bucket(ca, b)->io_time[rw];
}
static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca,
size_t b, void *private)
{
- return bucket_gc_gen(ca, b);
+ return bucket_gc_gen(bucket(ca, b));
}
static int unsigned_cmp(const void *_l, const void *_r)
nr[c->open_buckets[i].type]++;
pr_buf(out,
- "free_inc: %zu/%zu\n"
- "free[RESERVE_BTREE]: %zu/%zu\n"
- "free[RESERVE_MOVINGGC]: %zu/%zu\n"
- "free[RESERVE_NONE]: %zu/%zu\n"
- "buckets:\n"
- " capacity: %llu\n"
- " alloc: %llu\n"
- " sb: %llu\n"
- " journal: %llu\n"
- " meta: %llu\n"
- " user: %llu\n"
- " cached: %llu\n"
- " erasure coded: %llu\n"
- " available: %lli\n"
- "sectors:\n"
- " sb: %llu\n"
- " journal: %llu\n"
- " meta: %llu\n"
- " user: %llu\n"
- " cached: %llu\n"
- " erasure coded: %llu\n"
- " fragmented: %llu\n"
- " copygc threshold: %llu\n"
- "freelist_wait: %s\n"
- "open buckets: %u/%u (reserved %u)\n"
- "open_buckets_wait: %s\n"
- "open_buckets_btree: %u\n"
- "open_buckets_user: %u\n"
- "btree reserve cache: %u\n",
- fifo_used(&ca->free_inc), ca->free_inc.size,
- fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size,
- fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
- fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size,
- ca->mi.nbuckets - ca->mi.first_bucket,
- stats.buckets_alloc,
- stats.buckets[BCH_DATA_sb],
- stats.buckets[BCH_DATA_journal],
- stats.buckets[BCH_DATA_btree],
- stats.buckets[BCH_DATA_user],
- stats.buckets[BCH_DATA_cached],
- stats.buckets_ec,
- __dev_buckets_available(ca, stats),
- stats.sectors[BCH_DATA_sb],
- stats.sectors[BCH_DATA_journal],
- stats.sectors[BCH_DATA_btree],
- stats.sectors[BCH_DATA_user],
- stats.sectors[BCH_DATA_cached],
- stats.sectors_ec,
- stats.sectors_fragmented,
- c->copygc_threshold,
- c->freelist_wait.list.first ? "waiting" : "empty",
- c->open_buckets_nr_free, OPEN_BUCKETS_COUNT,
- BTREE_NODE_OPEN_BUCKET_RESERVE,
- c->open_buckets_wait.list.first ? "waiting" : "empty",
- nr[BCH_DATA_btree],
- nr[BCH_DATA_user],
- c->btree_reserve_cache_nr);
+ "\t\t buckets\t sectors fragmented\n"
+ "capacity%16llu\n",
+ ca->mi.nbuckets - ca->mi.first_bucket);
+
+ for (i = 1; i < BCH_DATA_NR; i++)
+ pr_buf(out, "%-8s%16llu%16llu%16llu\n",
+ bch2_data_types[i], stats.d[i].buckets,
+ stats.d[i].sectors, stats.d[i].fragmented);
+
+ pr_buf(out,
+ "ec\t%16llu\n"
+ "available%15llu\n"
+ "\n"
+ "free_inc\t\t%zu/%zu\n"
+ "free[RESERVE_MOVINGGC]\t%zu/%zu\n"
+ "free[RESERVE_NONE]\t%zu/%zu\n"
+ "freelist_wait\t\t%s\n"
+ "open buckets\t\t%u/%u (reserved %u)\n"
+ "open_buckets_wait\t%s\n"
+ "open_buckets_btree\t%u\n"
+ "open_buckets_user\t%u\n"
+ "btree reserve cache\t%u\n",
+ stats.buckets_ec,
+ __dev_buckets_available(ca, stats),
+ fifo_used(&ca->free_inc), ca->free_inc.size,
+ fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
+ fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size,
+ c->freelist_wait.list.first ? "waiting" : "empty",
+ c->open_buckets_nr_free, OPEN_BUCKETS_COUNT,
+ BTREE_NODE_OPEN_BUCKET_RESERVE,
+ c->open_buckets_wait.list.first ? "waiting" : "empty",
+ nr[BCH_DATA_btree],
+ nr[BCH_DATA_user],
+ c->btree_reserve_cache_nr);
}
static const char * const bch2_rw[] = {
}
if (attr == &sysfs_state_rw) {
- bch2_string_opt_to_text(&out, bch2_dev_state,
+ bch2_string_opt_to_text(&out, bch2_member_states,
ca->mi.state);
pr_buf(&out, "\n");
return out.pos - buf;
{
int ret;
- ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS,
+ ret = bch2_btree_delete_range(c, BTREE_ID_extents,
POS(0, 0), POS(0, U64_MAX),
NULL);
BUG_ON(ret);
- ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS,
+ ret = bch2_btree_delete_range(c, BTREE_ID_xattrs,
POS(0, 0), POS(0, U64_MAX),
NULL);
BUG_ON(ret);
/* unit tests */
-static void test_delete(struct bch_fs *c, u64 nr)
+static int test_delete(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
struct btree_iter *iter;
bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, k.k.p,
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, k.k.p,
BTREE_ITER_INTENT);
ret = bch2_btree_iter_traverse(iter);
- BUG_ON(ret);
+ if (ret) {
+ bch_err(c, "lookup error in test_delete: %i", ret);
+ goto err;
+ }
ret = __bch2_trans_do(&trans, NULL, NULL, 0,
bch2_trans_update(&trans, iter, &k.k_i, 0));
- BUG_ON(ret);
+ if (ret) {
+ bch_err(c, "update error in test_delete: %i", ret);
+ goto err;
+ }
pr_info("deleting once");
ret = bch2_btree_delete_at(&trans, iter, 0);
- BUG_ON(ret);
+ if (ret) {
+ bch_err(c, "delete error (first) in test_delete: %i", ret);
+ goto err;
+ }
pr_info("deleting twice");
ret = bch2_btree_delete_at(&trans, iter, 0);
- BUG_ON(ret);
-
+ if (ret) {
+ bch_err(c, "delete error (second) in test_delete: %i", ret);
+ goto err;
+ }
+err:
+ bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
+ return ret;
}
-static void test_delete_written(struct bch_fs *c, u64 nr)
+static int test_delete_written(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
struct btree_iter *iter;
bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, k.k.p,
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, k.k.p,
BTREE_ITER_INTENT);
ret = bch2_btree_iter_traverse(iter);
- BUG_ON(ret);
+ if (ret) {
+ bch_err(c, "lookup error in test_delete_written: %i", ret);
+ goto err;
+ }
ret = __bch2_trans_do(&trans, NULL, NULL, 0,
bch2_trans_update(&trans, iter, &k.k_i, 0));
- BUG_ON(ret);
+ if (ret) {
+ bch_err(c, "update error in test_delete_written: %i", ret);
+ goto err;
+ }
bch2_journal_flush_all_pins(&c->journal);
ret = bch2_btree_delete_at(&trans, iter, 0);
- BUG_ON(ret);
-
+ if (ret) {
+ bch_err(c, "delete error in test_delete_written: %i", ret);
+ goto err;
+ }
+err:
+ bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
+ return ret;
}
-static void test_iterate(struct bch_fs *c, u64 nr)
+static int test_iterate(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter *iter = NULL;
struct bkey_s_c k;
u64 i;
- int ret;
+ int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
bkey_cookie_init(&k.k_i);
k.k.p.offset = i;
- ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i,
+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i,
NULL, NULL, 0);
- BUG_ON(ret);
+ if (ret) {
+ bch_err(c, "insert error in test_iterate: %i", ret);
+ goto err;
+ }
}
pr_info("iterating forwards");
i = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_XATTRS,
+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
POS_MIN, 0, k, ret) {
if (k.k->p.inode)
break;
BUG_ON(k.k->p.offset != --i);
BUG_ON(i);
-
+err:
+ bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
+ return ret;
}
-static void test_iterate_extents(struct bch_fs *c, u64 nr)
+static int test_iterate_extents(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter *iter = NULL;
struct bkey_s_c k;
u64 i;
- int ret;
+ int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
k.k.p.offset = i + 8;
k.k.size = 8;
- ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i,
+ ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i,
NULL, NULL, 0);
- BUG_ON(ret);
+ if (ret) {
+ bch_err(c, "insert error in test_iterate_extents: %i", ret);
+ goto err;
+ }
}
pr_info("iterating forwards");
i = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
+ for_each_btree_key(&trans, iter, BTREE_ID_extents,
POS_MIN, 0, k, ret) {
BUG_ON(bkey_start_offset(k.k) != i);
i = k.k->p.offset;
}
BUG_ON(i);
-
+err:
+ bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
+ return ret;
}
-static void test_iterate_slots(struct bch_fs *c, u64 nr)
+static int test_iterate_slots(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
u64 i;
- int ret;
+ int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
bkey_cookie_init(&k.k_i);
k.k.p.offset = i * 2;
- ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i,
+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i,
NULL, NULL, 0);
- BUG_ON(ret);
+ if (ret) {
+ bch_err(c, "insert error in test_iterate_slots: %i", ret);
+ goto err;
+ }
}
pr_info("iterating forwards");
i = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN,
+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN,
0, k, ret) {
if (k.k->p.inode)
break;
BUG_ON(k.k->p.offset != i);
i += 2;
}
- bch2_trans_iter_free(&trans, iter);
+ bch2_trans_iter_put(&trans, iter);
BUG_ON(i != nr * 2);
i = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN,
+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN,
BTREE_ITER_SLOTS, k, ret) {
BUG_ON(k.k->p.offset != i);
BUG_ON(bkey_deleted(k.k) != (i & 1));
if (i == nr * 2)
break;
}
-
+ bch2_trans_iter_put(&trans, iter);
+err:
bch2_trans_exit(&trans);
+ return ret;
}
-static void test_iterate_slots_extents(struct bch_fs *c, u64 nr)
+static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
u64 i;
- int ret;
+ int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
k.k.p.offset = i + 16;
k.k.size = 8;
- ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i,
+ ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i,
NULL, NULL, 0);
- BUG_ON(ret);
+ if (ret) {
+ bch_err(c, "insert error in test_iterate_slots_extents: %i", ret);
+ goto err;
+ }
}
pr_info("iterating forwards");
i = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN,
+ for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN,
0, k, ret) {
BUG_ON(bkey_start_offset(k.k) != i + 8);
BUG_ON(k.k->size != 8);
i += 16;
}
- bch2_trans_iter_free(&trans, iter);
+ bch2_trans_iter_put(&trans, iter);
BUG_ON(i != nr);
i = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN,
+ for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN,
BTREE_ITER_SLOTS, k, ret) {
BUG_ON(bkey_deleted(k.k) != !(i % 16));
if (i == nr)
break;
}
-
+ bch2_trans_iter_put(&trans, iter);
+err:
bch2_trans_exit(&trans);
+	return ret;
}
/*
* XXX: we really want to make sure we've got a btree with depth > 0 for these
* tests
*/
-static void test_peek_end(struct bch_fs *c, u64 nr)
+static int test_peek_end(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
struct btree_iter *iter;
bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS_MIN, 0);
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0);
k = bch2_btree_iter_peek(iter);
BUG_ON(k.k);
k = bch2_btree_iter_peek(iter);
BUG_ON(k.k);
+ bch2_trans_iter_put(&trans, iter);
+
bch2_trans_exit(&trans);
+ return 0;
}
-static void test_peek_end_extents(struct bch_fs *c, u64 nr)
+static int test_peek_end_extents(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
struct btree_iter *iter;
bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, 0);
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN, 0);
k = bch2_btree_iter_peek(iter);
BUG_ON(k.k);
k = bch2_btree_iter_peek(iter);
BUG_ON(k.k);
+ bch2_trans_iter_put(&trans, iter);
+
bch2_trans_exit(&trans);
+ return 0;
}
/* extent unit tests */
u64 test_version;
-static void insert_test_extent(struct bch_fs *c,
- u64 start, u64 end)
+static int insert_test_extent(struct bch_fs *c,
+ u64 start, u64 end)
{
struct bkey_i_cookie k;
int ret;
k.k_i.k.size = end - start;
k.k_i.k.version.lo = test_version++;
- ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i,
+ ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i,
NULL, NULL, 0);
- BUG_ON(ret);
+ if (ret)
+ bch_err(c, "insert error in insert_test_extent: %i", ret);
+ return ret;
}
-static void __test_extent_overwrite(struct bch_fs *c,
+static int __test_extent_overwrite(struct bch_fs *c,
u64 e1_start, u64 e1_end,
u64 e2_start, u64 e2_end)
{
- insert_test_extent(c, e1_start, e1_end);
- insert_test_extent(c, e2_start, e2_end);
+ int ret;
+
+ ret = insert_test_extent(c, e1_start, e1_end) ?:
+ insert_test_extent(c, e2_start, e2_end);
delete_test_keys(c);
+ return ret;
}
-static void test_extent_overwrite_front(struct bch_fs *c, u64 nr)
+static int test_extent_overwrite_front(struct bch_fs *c, u64 nr)
{
- __test_extent_overwrite(c, 0, 64, 0, 32);
- __test_extent_overwrite(c, 8, 64, 0, 32);
+ return __test_extent_overwrite(c, 0, 64, 0, 32) ?:
+ __test_extent_overwrite(c, 8, 64, 0, 32);
}
-static void test_extent_overwrite_back(struct bch_fs *c, u64 nr)
+static int test_extent_overwrite_back(struct bch_fs *c, u64 nr)
{
- __test_extent_overwrite(c, 0, 64, 32, 64);
- __test_extent_overwrite(c, 0, 64, 32, 72);
+ return __test_extent_overwrite(c, 0, 64, 32, 64) ?:
+ __test_extent_overwrite(c, 0, 64, 32, 72);
}
-static void test_extent_overwrite_middle(struct bch_fs *c, u64 nr)
+static int test_extent_overwrite_middle(struct bch_fs *c, u64 nr)
{
- __test_extent_overwrite(c, 0, 64, 32, 40);
+ return __test_extent_overwrite(c, 0, 64, 32, 40);
}
-static void test_extent_overwrite_all(struct bch_fs *c, u64 nr)
+static int test_extent_overwrite_all(struct bch_fs *c, u64 nr)
{
- __test_extent_overwrite(c, 32, 64, 0, 64);
- __test_extent_overwrite(c, 32, 64, 0, 128);
- __test_extent_overwrite(c, 32, 64, 32, 64);
- __test_extent_overwrite(c, 32, 64, 32, 128);
+ return __test_extent_overwrite(c, 32, 64, 0, 64) ?:
+ __test_extent_overwrite(c, 32, 64, 0, 128) ?:
+ __test_extent_overwrite(c, 32, 64, 32, 64) ?:
+ __test_extent_overwrite(c, 32, 64, 32, 128);
}
/* perf tests */
return v;
}
-static void rand_insert(struct bch_fs *c, u64 nr)
+static int rand_insert(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
struct bkey_i_cookie k;
- int ret;
+ int ret = 0;
u64 i;
bch2_trans_init(&trans, c, 0, 0);
for (i = 0; i < nr; i++) {
bkey_cookie_init(&k.k_i);
k.k.p.offset = test_rand();
+ k.k.p.snapshot = U32_MAX;
ret = __bch2_trans_do(&trans, NULL, NULL, 0,
- __bch2_btree_insert(&trans, BTREE_ID_XATTRS, &k.k_i));
-
- BUG_ON(ret);
+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i));
+ if (ret) {
+ bch_err(c, "error in rand_insert: %i", ret);
+ break;
+ }
}
bch2_trans_exit(&trans);
+ return ret;
}
-static void rand_lookup(struct bch_fs *c, u64 nr)
+static int rand_lookup(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
+ int ret = 0;
u64 i;
bch2_trans_init(&trans, c, 0, 0);
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0);
for (i = 0; i < nr; i++) {
- iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS,
- POS(0, test_rand()), 0);
+ bch2_btree_iter_set_pos(iter, POS(0, test_rand()));
k = bch2_btree_iter_peek(iter);
- bch2_trans_iter_free(&trans, iter);
+ ret = bkey_err(k);
+ if (ret) {
+ bch_err(c, "error in rand_lookup: %i", ret);
+ break;
+ }
}
+ bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
+ return ret;
}
-static void rand_mixed(struct bch_fs *c, u64 nr)
+static int rand_mixed(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
- int ret;
+ int ret = 0;
u64 i;
bch2_trans_init(&trans, c, 0, 0);
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0);
for (i = 0; i < nr; i++) {
- iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS,
- POS(0, test_rand()), 0);
+ bch2_btree_iter_set_pos(iter, POS(0, test_rand()));
k = bch2_btree_iter_peek(iter);
+ ret = bkey_err(k);
+ if (ret) {
+ bch_err(c, "lookup error in rand_mixed: %i", ret);
+ break;
+ }
if (!(i & 3) && k.k) {
struct bkey_i_cookie k;
ret = __bch2_trans_do(&trans, NULL, NULL, 0,
bch2_trans_update(&trans, iter, &k.k_i, 0));
-
- BUG_ON(ret);
+ if (ret) {
+ bch_err(c, "update error in rand_mixed: %i", ret);
+ break;
+ }
}
-
- bch2_trans_iter_free(&trans, iter);
}
+ bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
+ return ret;
}
static int __do_delete(struct btree_trans *trans, struct bpos pos)
struct bkey_s_c k;
int ret = 0;
- iter = bch2_trans_get_iter(trans, BTREE_ID_XATTRS, pos,
+ iter = bch2_trans_get_iter(trans, BTREE_ID_xattrs, pos,
BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(iter);
- if (ret)
- goto err;
-
k = bch2_btree_iter_peek(iter);
ret = bkey_err(k);
if (ret)
goto err;
+ if (!k.k)
+ goto err;
+
bkey_init(&delete.k);
delete.k.p = k.k->p;
return ret;
}
-static void rand_delete(struct bch_fs *c, u64 nr)
+static int rand_delete(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
- int ret;
+ int ret = 0;
u64 i;
bch2_trans_init(&trans, c, 0, 0);
ret = __bch2_trans_do(&trans, NULL, NULL, 0,
__do_delete(&trans, pos));
- BUG_ON(ret);
+ if (ret) {
+ bch_err(c, "error in rand_delete: %i", ret);
+ break;
+ }
}
bch2_trans_exit(&trans);
+ return ret;
}
-static void seq_insert(struct bch_fs *c, u64 nr)
+static int seq_insert(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
struct bkey_i_cookie insert;
- int ret;
+ int ret = 0;
u64 i = 0;
bkey_cookie_init(&insert.k_i);
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN,
+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
insert.k.p = iter->pos;
ret = __bch2_trans_do(&trans, NULL, NULL, 0,
bch2_trans_update(&trans, iter, &insert.k_i, 0));
-
- BUG_ON(ret);
+ if (ret) {
+ bch_err(c, "error in seq_insert: %i", ret);
+ break;
+ }
if (++i == nr)
break;
}
+ bch2_trans_iter_put(&trans, iter);
+
bch2_trans_exit(&trans);
+ return ret;
}
-static void seq_lookup(struct bch_fs *c, u64 nr)
+static int seq_lookup(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
- int ret;
+ int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, 0, k, ret)
+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, 0, k, ret)
;
+ bch2_trans_iter_put(&trans, iter);
+
bch2_trans_exit(&trans);
+ return ret;
}
-static void seq_overwrite(struct bch_fs *c, u64 nr)
+static int seq_overwrite(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
- int ret;
+ int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN,
+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN,
BTREE_ITER_INTENT, k, ret) {
struct bkey_i_cookie u;
ret = __bch2_trans_do(&trans, NULL, NULL, 0,
bch2_trans_update(&trans, iter, &u.k_i, 0));
-
- BUG_ON(ret);
+ if (ret) {
+ bch_err(c, "error in seq_overwrite: %i", ret);
+ break;
+ }
}
+ bch2_trans_iter_put(&trans, iter);
+
bch2_trans_exit(&trans);
+ return ret;
}
-static void seq_delete(struct bch_fs *c, u64 nr)
+static int seq_delete(struct bch_fs *c, u64 nr)
{
int ret;
- ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS,
+ ret = bch2_btree_delete_range(c, BTREE_ID_xattrs,
POS(0, 0), POS(0, U64_MAX),
NULL);
- BUG_ON(ret);
+ if (ret)
+ bch_err(c, "error in seq_delete: %i", ret);
+ return ret;
}
-typedef void (*perf_test_fn)(struct bch_fs *, u64);
+typedef int (*perf_test_fn)(struct bch_fs *, u64);
struct test_job {
struct bch_fs *c;
u64 start;
u64 finish;
+ int ret;
};
static int btree_perf_test_thread(void *data)
{
struct test_job *j = data;
+ int ret;
if (atomic_dec_and_test(&j->ready)) {
wake_up(&j->ready_wait);
wait_event(j->ready_wait, !atomic_read(&j->ready));
}
- j->fn(j->c, j->nr / j->nr_threads);
+ ret = j->fn(j->c, j->nr / j->nr_threads);
+ if (ret)
+ j->ret = ret;
if (atomic_dec_and_test(&j->done)) {
j->finish = sched_clock();
return 0;
}
-void bch2_btree_perf_test(struct bch_fs *c, const char *testname,
- u64 nr, unsigned nr_threads)
+int bch2_btree_perf_test(struct bch_fs *c, const char *testname,
+ u64 nr, unsigned nr_threads)
{
struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads };
char name_buf[20], nr_buf[20], per_sec_buf[20];
if (!j.fn) {
pr_err("unknown test %s", testname);
- return;
+ return -EINVAL;
}
//pr_info("running test %s:", testname);
time / NSEC_PER_SEC,
time * nr_threads / nr,
per_sec_buf);
+ return j.ret;
}
#endif /* CONFIG_BCACHEFS_TESTS */
#ifdef CONFIG_BCACHEFS_TESTS
-void bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned);
+int bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned);
#else
{
while (size) {
struct page *page = alloc_page(gfp_mask);
- unsigned len = min(PAGE_SIZE, size);
+ unsigned len = min_t(size_t, PAGE_SIZE, size);
if (!page)
return -ENOMEM;
#define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0)
#define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0)
-#define memcpy(dst, src, len) \
-({ \
- void *_dst = (dst); \
- const void *_src = (src); \
- size_t _len = (len); \
- \
- BUG_ON(!((void *) (_dst) >= (void *) (_src) + (_len) || \
- (void *) (_dst) + (_len) <= (void *) (_src))); \
- memcpy(_dst, _src, _len); \
-})
-
#else /* DEBUG */
#define EBUG_ON(cond)
#define cmp_int(l, r) ((l > r) - (l < r))
+static inline int u8_cmp(u8 l, u8 r)
+{
+ return cmp_int(l, r);
+}
+
#endif /* _BCACHEFS_UTIL_H */
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bitops.h>
+#include <asm/unaligned.h>
+
+#include "varint.h"
+
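+/*
+ * Varint format: the value is shifted left and stored little-endian; the
+ * number of low one-bits in the first byte (terminated by a zero bit) gives
+ * the number of extra bytes that follow. Values needing more than 8 encoded
+ * bytes are stored as a 255 byte followed by the raw 8-byte little-endian
+ * value (9 bytes total).
+ */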
+int bch2_varint_encode(u8 *out, u64 v)
+{
+ unsigned bits = fls64(v|1);
+ unsigned bytes = DIV_ROUND_UP(bits, 7);
+
+ if (likely(bytes < 9)) {
+ v <<= bytes;
+ v |= ~(~0 << (bytes - 1));
+ } else {
+ *out++ = 255;
+ bytes = 9;
+ }
+
+ put_unaligned_le64(v, out);
+ return bytes;
+}
+
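+/* Returns the number of bytes consumed, or -1 if the value would run past end: */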
+int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out)
+{
+ u64 v = get_unaligned_le64(in);
+ unsigned bytes = ffz(v & 255) + 1;
+
+ if (unlikely(in + bytes > end))
+ return -1;
+
+ if (likely(bytes < 9)) {
+ v >>= bytes;
+ v &= ~(~0ULL << (7 * bytes));
+ } else {
+ v = get_unaligned_le64(++in);
+ }
+
+ *out = v;
+ return bytes;
+}
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_VARINT_H
+#define _BCACHEFS_VARINT_H
+
+int bch2_varint_encode(u8 *, u64);
+int bch2_varint_decode(const u8 *, const u8 *, u64 *);
+
+#endif /* _BCACHEFS_VARINT_H */
}
const struct bch_hash_desc bch2_xattr_hash_desc = {
- .btree_id = BTREE_ID_XATTRS,
+ .btree_id = BTREE_ID_xattrs,
.key_type = KEY_TYPE_xattr,
.hash_key = xattr_hash_key,
.hash_bkey = xattr_hash_bkey,
int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode,
const char *name, void *buffer, size_t size, int type)
{
+ struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c_xattr xattr;
bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc,
- &inode->ei_str_hash, inode->v.i_ino,
+ iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, &hash,
+ inode->v.i_ino,
&X_SEARCH(type, name, strlen(name)),
0);
- if (IS_ERR(iter)) {
- bch2_trans_exit(&trans);
- BUG_ON(PTR_ERR(iter) == -EINTR);
-
- return PTR_ERR(iter) == -ENOENT ? -ENODATA : PTR_ERR(iter);
- }
+ ret = PTR_ERR_OR_ZERO(iter);
+ if (ret)
+ goto err;
xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));
ret = le16_to_cpu(xattr.v->x_val_len);
else
memcpy(buffer, xattr_val(xattr.v), ret);
}
-
+ bch2_trans_iter_put(&trans, iter);
+err:
bch2_trans_exit(&trans);
- return ret;
+
+ BUG_ON(ret == -EINTR);
+ return ret == -ENOENT ? -ENODATA : ret;
}
int bch2_xattr_set(struct btree_trans *trans, u64 inum,
}
static int bch2_xattr_list_bcachefs(struct bch_fs *c,
- struct bch_inode_info *inode,
+ struct bch_inode_unpacked *inode,
struct xattr_buf *buf,
bool all)
{
u64 v;
for (id = 0; id < Inode_opt_nr; id++) {
- v = bch2_inode_opt_get(&inode->ei_inode, id);
+ v = bch2_inode_opt_get(inode, id);
if (!v)
continue;
if (!all &&
- !(inode->ei_inode.bi_fields_set & (1 << id)))
+ !(inode->bi_fields_set & (1 << id)))
continue;
ret = __bch2_xattr_emit(prefix, bch2_inode_opts[id],
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_XATTRS,
+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
POS(inum, 0), 0, k, ret) {
BUG_ON(k.k->p.inode < inum);
if (ret)
break;
}
+ bch2_trans_iter_put(&trans, iter);
+
ret = bch2_trans_exit(&trans) ?: ret;
if (ret)
return ret;
- ret = bch2_xattr_list_bcachefs(c, inode, &buf, false);
+ ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false);
if (ret)
return ret;
- ret = bch2_xattr_list_bcachefs(c, inode, &buf, true);
+ ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true);
if (ret)
return ret;
{
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
return bch2_trans_do(c, NULL, &inode->ei_journal_seq, 0,
- bch2_xattr_set(&trans, inode->v.i_ino,
- &inode->ei_str_hash,
+ bch2_xattr_set(&trans, inode->v.i_ino, &hash,
name, value, size,
handler->flags, flags));
}
struct genradix_root *r;
struct genradix_node *n;
unsigned level, i;
+
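+	/* offset == SIZE_MAX means a previous advance overflowed; iteration is done: */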
+ if (iter->offset == SIZE_MAX)
+ return NULL;
+
restart:
r = READ_ONCE(radix->root);
if (!r)
(GENRADIX_ARY - 1);
while (!n->children[i]) {
+ size_t objs_per_ptr = genradix_depth_size(level);
+
+ if (iter->offset + objs_per_ptr < iter->offset) {
+ iter->offset = SIZE_MAX;
+ iter->pos = SIZE_MAX;
+ return NULL;
+ }
+
i++;
- iter->offset = round_down(iter->offset +
- genradix_depth_size(level),
- genradix_depth_size(level));
+ iter->offset = round_down(iter->offset + objs_per_ptr,
+ objs_per_ptr);
iter->pos = (iter->offset >> PAGE_SHIFT) *
objs_per_page;
if (i == GENRADIX_ARY)
ret = pthread_create(&p->thread, &attr, kthread_start_fn, p);
if (ret)
- die("pthread_create error %s", strerror(ret));
+ return ERR_PTR(-ret);
pthread_setname_np(p->thread, p->comm);
return p;
}
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Resizable, Scalable, Concurrent Hash Table
*
* Code partially derived from nft_hash
* Rewritten with rehash code from br_multicast plus single list
* pointer as suggested by Josh Triplett
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/atomic.h>
-#include <linux/cpumask.h>
#include <linux/kernel.h>
#include <linux/log2.h>
#include <linux/sched.h>
+#include <linux/rculist.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/jhash.h>
+#include <linux/overflow.h>
#include <linux/random.h>
#include <linux/rhashtable.h>
#include <linux/err.h>
+#include <linux/export.h>
#define HASH_DEFAULT_SIZE 64UL
#define HASH_MIN_SIZE 4U
-#define BUCKET_LOCKS_PER_CPU 32UL
+
+union nested_table {
+ union nested_table __rcu *table;
+ struct rhash_lock_head __rcu *bucket;
+};
static u32 head_hashfn(struct rhashtable *ht,
const struct bucket_table *tbl,
return rht_head_hashfn(ht, tbl, he, ht->p);
}
-static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl,
- gfp_t gfp)
-{
- unsigned int i, size;
- unsigned int nr_pcpus = num_possible_cpus();
+#ifdef CONFIG_PROVE_LOCKING
+#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT))
- nr_pcpus = min_t(unsigned int, nr_pcpus, 64UL);
- size = roundup_pow_of_two(nr_pcpus * ht->p.locks_mul);
+int lockdep_rht_mutex_is_held(struct rhashtable *ht)
+{
+ return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1;
+}
+EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);
- /* Never allocate more than 0.5 locks per bucket */
- size = min_t(unsigned int, size, tbl->size >> 1);
+int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash)
+{
+ if (!debug_locks)
+ return 1;
+ if (unlikely(tbl->nest))
+ return 1;
+ return bit_spin_is_locked(0, (unsigned long *)&tbl->buckets[hash]);
+}
+EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held);
+#else
+#define ASSERT_RHT_MUTEX(HT)
+#endif
- if (sizeof(spinlock_t) != 0) {
- tbl->locks = NULL;
- if (gfp != GFP_KERNEL)
- gfp |= __GFP_NOWARN | __GFP_NORETRY;
+static inline union nested_table *nested_table_top(
+ const struct bucket_table *tbl)
+{
+ /* The top-level bucket entry does not need RCU protection
+ * because it's set at the same time as tbl->nest.
+ */
+ return (void *)rcu_dereference_protected(tbl->buckets[0], 1);
+}
- if (!tbl->locks)
- tbl->locks = kmalloc_array(size, sizeof(spinlock_t),
- gfp);
- if (!tbl->locks)
- return -ENOMEM;
- for (i = 0; i < size; i++)
- spin_lock_init(&tbl->locks[i]);
+static void nested_table_free(union nested_table *ntbl, unsigned int size)
+{
+ const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+ const unsigned int len = 1 << shift;
+ unsigned int i;
+
+ ntbl = rcu_dereference_protected(ntbl->table, 1);
+ if (!ntbl)
+ return;
+
+ if (size > len) {
+ size >>= shift;
+ for (i = 0; i < len; i++)
+ nested_table_free(ntbl + i, size);
}
- tbl->locks_mask = size - 1;
- return 0;
+ kfree(ntbl);
+}
+
+static void nested_bucket_table_free(const struct bucket_table *tbl)
+{
+ unsigned int size = tbl->size >> tbl->nest;
+ unsigned int len = 1 << tbl->nest;
+ union nested_table *ntbl;
+ unsigned int i;
+
+ ntbl = nested_table_top(tbl);
+
+ for (i = 0; i < len; i++)
+ nested_table_free(ntbl + i, size);
+
+ kfree(ntbl);
}
static void bucket_table_free(struct bucket_table *tbl)
{
- if (tbl)
- kvfree(tbl->locks);
+ if (tbl->nest)
+ nested_bucket_table_free(tbl);
kvfree(tbl);
}
bucket_table_free(container_of(head, struct bucket_table, rcu));
}
+static union nested_table *nested_table_alloc(struct rhashtable *ht,
+ union nested_table __rcu **prev,
+ bool leaf)
+{
+ union nested_table *ntbl;
+ int i;
+
+ ntbl = rcu_dereference(*prev);
+ if (ntbl)
+ return ntbl;
+
+ ntbl = kzalloc(PAGE_SIZE, GFP_ATOMIC);
+
+ if (ntbl && leaf) {
+ for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0]); i++)
+ INIT_RHT_NULLS_HEAD(ntbl[i].bucket);
+ }
+
+ if (cmpxchg((union nested_table **)prev, NULL, ntbl) == NULL)
+ return ntbl;
+ /* Raced with another thread. */
+ kfree(ntbl);
+ return rcu_dereference(*prev);
+}
+
+static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht,
+ size_t nbuckets,
+ gfp_t gfp)
+{
+ const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+ struct bucket_table *tbl;
+ size_t size;
+
+ if (nbuckets < (1 << (shift + 1)))
+ return NULL;
+
+ size = sizeof(*tbl) + sizeof(tbl->buckets[0]);
+
+ tbl = kzalloc(size, gfp);
+ if (!tbl)
+ return NULL;
+
+ if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets,
+ false)) {
+ kfree(tbl);
+ return NULL;
+ }
+
+ tbl->nest = (ilog2(nbuckets) - 1) % shift + 1;
+
+ return tbl;
+}
+
static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
size_t nbuckets,
gfp_t gfp)
size_t size;
int i;
- size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]);
- if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER) ||
- gfp != GFP_KERNEL)
- tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY);
- if (tbl == NULL && gfp == GFP_KERNEL)
- tbl = vzalloc(size);
- if (tbl == NULL)
- return NULL;
+ tbl = kvzalloc(struct_size(tbl, buckets, nbuckets), gfp);
- tbl->size = nbuckets;
+ size = nbuckets;
- if (alloc_bucket_locks(ht, tbl, gfp) < 0) {
- bucket_table_free(tbl);
- return NULL;
+ if (tbl == NULL && (gfp & ~__GFP_NOFAIL) != GFP_KERNEL) {
+ tbl = nested_bucket_table_alloc(ht, nbuckets, gfp);
+ nbuckets = 0;
}
+ if (tbl == NULL)
+ return NULL;
+
+ tbl->size = size;
+
+ rcu_head_init(&tbl->rcu);
INIT_LIST_HEAD(&tbl->walkers);
- get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
+ tbl->hash_rnd = get_random_u32();
for (i = 0; i < nbuckets; i++)
- INIT_RHT_NULLS_HEAD(tbl->buckets[i], ht, i);
+ INIT_RHT_NULLS_HEAD(tbl->buckets[i]);
return tbl;
}
return new_tbl;
}
-static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
+static int rhashtable_rehash_one(struct rhashtable *ht,
+ struct rhash_lock_head __rcu **bkt,
+ unsigned int old_hash)
{
struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
- struct bucket_table *new_tbl = rhashtable_last_table(ht,
- rht_dereference_rcu(old_tbl->future_tbl, ht));
- struct rhash_head __rcu **pprev = &old_tbl->buckets[old_hash];
- int err = -ENOENT;
+ struct bucket_table *new_tbl = rhashtable_last_table(ht, old_tbl);
+ int err = -EAGAIN;
struct rhash_head *head, *next, *entry;
- spinlock_t *new_bucket_lock;
+ struct rhash_head __rcu **pprev = NULL;
unsigned int new_hash;
- rht_for_each(entry, old_tbl, old_hash) {
+ if (new_tbl->nest)
+ goto out;
+
+ err = -ENOENT;
+
+ rht_for_each_from(entry, rht_ptr(bkt, old_tbl, old_hash),
+ old_tbl, old_hash) {
err = 0;
next = rht_dereference_bucket(entry->next, old_tbl, old_hash);
new_hash = head_hashfn(ht, new_tbl, entry);
- new_bucket_lock = rht_bucket_lock(new_tbl, new_hash);
+ rht_lock(new_tbl, &new_tbl->buckets[new_hash]);
- spin_lock_nested(new_bucket_lock, SINGLE_DEPTH_NESTING);
- head = rht_dereference_bucket(new_tbl->buckets[new_hash],
- new_tbl, new_hash);
+ head = rht_ptr(new_tbl->buckets + new_hash, new_tbl, new_hash);
RCU_INIT_POINTER(entry->next, head);
- rcu_assign_pointer(new_tbl->buckets[new_hash], entry);
- spin_unlock(new_bucket_lock);
+ rht_assign_unlock(new_tbl, &new_tbl->buckets[new_hash], entry);
- rcu_assign_pointer(*pprev, next);
+ if (pprev)
+ rcu_assign_pointer(*pprev, next);
+ else
+			/* Need to preserve the bit lock. */
+ rht_assign_locked(bkt, next);
out:
return err;
}
-static void rhashtable_rehash_chain(struct rhashtable *ht,
+static int rhashtable_rehash_chain(struct rhashtable *ht,
unsigned int old_hash)
{
struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
- spinlock_t *old_bucket_lock;
+ struct rhash_lock_head __rcu **bkt = rht_bucket_var(old_tbl, old_hash);
+ int err;
- old_bucket_lock = rht_bucket_lock(old_tbl, old_hash);
+ if (!bkt)
+ return 0;
+ rht_lock(old_tbl, bkt);
- spin_lock_bh(old_bucket_lock);
- while (!rhashtable_rehash_one(ht, old_hash))
+ while (!(err = rhashtable_rehash_one(ht, bkt, old_hash)))
;
- old_tbl->rehash++;
- spin_unlock_bh(old_bucket_lock);
+
+ if (err == -ENOENT)
+ err = 0;
+ rht_unlock(old_tbl, bkt);
+
+ return err;
}
static int rhashtable_rehash_attach(struct rhashtable *ht,
struct bucket_table *old_tbl,
struct bucket_table *new_tbl)
{
- /* Protect future_tbl using the first bucket lock. */
- spin_lock_bh(old_tbl->locks);
-
- /* Did somebody beat us to it? */
- if (rcu_access_pointer(old_tbl->future_tbl)) {
- spin_unlock_bh(old_tbl->locks);
- return -EEXIST;
- }
-
/* Make insertions go into the new, empty table right away. Deletions
* and lookups will be attempted in both tables until we synchronize.
+ * As cmpxchg() provides strong barriers, we do not need
+ * rcu_assign_pointer().
*/
- rcu_assign_pointer(old_tbl->future_tbl, new_tbl);
- spin_unlock_bh(old_tbl->locks);
+ if (cmpxchg((struct bucket_table **)&old_tbl->future_tbl, NULL,
+ new_tbl) != NULL)
+ return -EEXIST;
return 0;
}
struct bucket_table *new_tbl;
struct rhashtable_walker *walker;
unsigned int old_hash;
+ int err;
new_tbl = rht_dereference(old_tbl->future_tbl, ht);
if (!new_tbl)
return 0;
- for (old_hash = 0; old_hash < old_tbl->size; old_hash++)
- rhashtable_rehash_chain(ht, old_hash);
+ for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
+ err = rhashtable_rehash_chain(ht, old_hash);
+ if (err)
+ return err;
+ cond_resched();
+ }
/* Publish the new table pointer. */
rcu_assign_pointer(ht->tbl, new_tbl);
spin_lock(&ht->lock);
list_for_each_entry(walker, &old_tbl->walkers, list)
walker->tbl = NULL;
- spin_unlock(&ht->lock);
/* Wait for readers. All new readers will see the new
* table, and thus no references to the old table will
* remain.
+ * We do this inside the locked region so that
+ * rhashtable_walk_stop() can use rcu_head_after_call_rcu()
+ * to check if it should not re-link the table.
*/
call_rcu(&old_tbl->rcu, bucket_table_free_rcu);
+ spin_unlock(&ht->lock);
return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0;
}
-static int rhashtable_expand(struct rhashtable *ht)
+static int rhashtable_rehash_alloc(struct rhashtable *ht,
+ struct bucket_table *old_tbl,
+ unsigned int size)
{
- struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
+ struct bucket_table *new_tbl;
int err;
- old_tbl = rhashtable_last_table(ht, old_tbl);
+ ASSERT_RHT_MUTEX(ht);
- new_tbl = bucket_table_alloc(ht, old_tbl->size * 2, GFP_KERNEL);
+ new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
if (new_tbl == NULL)
return -ENOMEM;
return err;
}
+/**
+ * rhashtable_shrink - Shrink hash table while allowing concurrent lookups
+ * @ht: the hash table to shrink
+ *
+ * This function shrinks the hash table to fit, i.e., the smallest
+ * size would not cause it to expand right away automatically.
+ *
+ * The caller must ensure that no concurrent resizing occurs by holding
+ * ht->mutex.
+ *
+ * The caller must ensure that no concurrent table mutations take place.
+ * It is however valid to have concurrent lookups if they are RCU protected.
+ *
+ * It is valid to have concurrent insertions and deletions protected by per
+ * bucket locks or concurrent RCU protected lookups and traversals.
+ */
static int rhashtable_shrink(struct rhashtable *ht)
{
- struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
+ struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
unsigned int nelems = atomic_read(&ht->nelems);
unsigned int size = 0;
- int err;
if (nelems)
size = roundup_pow_of_two(nelems * 3 / 2);
if (rht_dereference(old_tbl->future_tbl, ht))
return -EEXIST;
- new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
- if (new_tbl == NULL)
- return -ENOMEM;
-
- err = rhashtable_rehash_attach(ht, old_tbl, new_tbl);
- if (err)
- bucket_table_free(new_tbl);
-
- return err;
+ return rhashtable_rehash_alloc(ht, old_tbl, size);
}
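
As a quick check of the sizing rule above, the target is the smallest power of two that keeps the table below the usual 75% grow threshold, so shrinking to it should not trigger an immediate re-expansion (very small element counts are covered by the table's minimum size). An illustrative standalone computation:

/* Sketch: shrink target for a table holding 'nelems' entries. */
static unsigned int shrink_target(unsigned int nelems)
{
	unsigned int size = nelems ? roundup_pow_of_two(nelems * 3 / 2) : 0;

	/* e.g. nelems = 100: roundup_pow_of_two(150) = 256, and the grow
	 * threshold 256 * 3 / 4 = 192 is comfortably above 100. */
	return size;
}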
static void rht_deferred_worker(struct work_struct *work)
tbl = rhashtable_last_table(ht, tbl);
if (rht_grow_above_75(ht, tbl))
- rhashtable_expand(ht);
+ err = rhashtable_rehash_alloc(ht, tbl, tbl->size * 2);
else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl))
- rhashtable_shrink(ht);
+ err = rhashtable_shrink(ht);
+ else if (tbl->nest)
+ err = rhashtable_rehash_alloc(ht, tbl, tbl->size);
+
+ if (!err || err == -EEXIST) {
+ int nerr;
- err = rhashtable_rehash_table(ht);
+ nerr = rhashtable_rehash_table(ht);
+ err = err ?: nerr;
+ }
mutex_unlock(&ht->mutex);
schedule_work(&ht->run_work);
}
-static bool rhashtable_check_elasticity(struct rhashtable *ht,
- struct bucket_table *tbl,
- unsigned int hash)
-{
- unsigned int elasticity = ht->elasticity;
- struct rhash_head *head;
-
- rht_for_each(head, tbl, hash)
- if (!--elasticity)
- return true;
-
- return false;
-}
-
-int rhashtable_insert_rehash(struct rhashtable *ht,
- struct bucket_table *tbl)
+static int rhashtable_insert_rehash(struct rhashtable *ht,
+ struct bucket_table *tbl)
{
struct bucket_table *old_tbl;
struct bucket_table *new_tbl;
err = -ENOMEM;
- new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC);
+ new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC | __GFP_NOWARN);
if (new_tbl == NULL)
goto fail;
fail:
/* Do not fail the insert if someone else did a rehash. */
- if (likely(rcu_dereference_raw(tbl->future_tbl)))
+ if (likely(rcu_access_pointer(tbl->future_tbl)))
return 0;
/* Schedule async rehash to retry allocation in process context. */
return err;
}
-struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht,
- const void *key,
- struct rhash_head *obj,
- struct bucket_table *tbl)
+static void *rhashtable_lookup_one(struct rhashtable *ht,
+ struct rhash_lock_head __rcu **bkt,
+ struct bucket_table *tbl, unsigned int hash,
+ const void *key, struct rhash_head *obj)
{
+ struct rhashtable_compare_arg arg = {
+ .ht = ht,
+ .key = key,
+ };
+ struct rhash_head __rcu **pprev = NULL;
struct rhash_head *head;
- unsigned int hash;
- int err;
+ int elasticity;
+
+ elasticity = RHT_ELASTICITY;
+ rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) {
+ struct rhlist_head *list;
+ struct rhlist_head *plist;
+
+ elasticity--;
+ if (!key ||
+ (ht->p.obj_cmpfn ?
+ ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) :
+ rhashtable_compare(&arg, rht_obj(ht, head)))) {
+ pprev = &head->next;
+ continue;
+ }
- tbl = rhashtable_last_table(ht, tbl);
- hash = head_hashfn(ht, tbl, obj);
- spin_lock_nested(rht_bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING);
+ if (!ht->rhlist)
+ return rht_obj(ht, head);
- err = -EEXIST;
- if (key && rhashtable_lookup_fast(ht, key, ht->p))
- goto exit;
+ list = container_of(obj, struct rhlist_head, rhead);
+ plist = container_of(head, struct rhlist_head, rhead);
- err = -E2BIG;
- if (unlikely(rht_grow_above_max(ht, tbl)))
- goto exit;
+ RCU_INIT_POINTER(list->next, plist);
+ head = rht_dereference_bucket(head->next, tbl, hash);
+ RCU_INIT_POINTER(list->rhead.next, head);
+ if (pprev)
+ rcu_assign_pointer(*pprev, obj);
+ else
+ /* Need to preserve the bit lock */
+ rht_assign_locked(bkt, obj);
+
+ return NULL;
+ }
+
+ if (elasticity <= 0)
+ return ERR_PTR(-EAGAIN);
+
+ return ERR_PTR(-ENOENT);
+}
+
+static struct bucket_table *rhashtable_insert_one(
+ struct rhashtable *ht, struct rhash_lock_head __rcu **bkt,
+ struct bucket_table *tbl, unsigned int hash, struct rhash_head *obj,
+ void *data)
+{
+ struct bucket_table *new_tbl;
+ struct rhash_head *head;
+
+ if (!IS_ERR_OR_NULL(data))
+ return ERR_PTR(-EEXIST);
+
+ if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT)
+ return ERR_CAST(data);
+
+ new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+ if (new_tbl)
+ return new_tbl;
+
+ if (PTR_ERR(data) != -ENOENT)
+ return ERR_CAST(data);
- err = -EAGAIN;
- if (rhashtable_check_elasticity(ht, tbl, hash) ||
- rht_grow_above_100(ht, tbl))
- goto exit;
+ if (unlikely(rht_grow_above_max(ht, tbl)))
+ return ERR_PTR(-E2BIG);
- err = 0;
+ if (unlikely(rht_grow_above_100(ht, tbl)))
+ return ERR_PTR(-EAGAIN);
- head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash);
+ head = rht_ptr(bkt, tbl, hash);
RCU_INIT_POINTER(obj->next, head);
+ if (ht->rhlist) {
+ struct rhlist_head *list;
- rcu_assign_pointer(tbl->buckets[hash], obj);
+ list = container_of(obj, struct rhlist_head, rhead);
+ RCU_INIT_POINTER(list->next, NULL);
+ }
+
+ /* bkt is always the head of the list, so it holds
+ * the lock, which we need to preserve
+ */
+ rht_assign_locked(bkt, obj);
atomic_inc(&ht->nelems);
+ if (rht_grow_above_75(ht, tbl))
+ schedule_work(&ht->run_work);
+
+ return NULL;
+}
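
The "bkt ... holds the lock" comments above refer to the lock bit that lives in the bucket head pointer itself, which is why the head must be updated with rht_assign_locked() rather than a plain store. A stripped-down sketch of that idea, with illustrative names rather than the actual rhashtable internals:

/* Sketch: a spinlock folded into bit 0 of a bucket head pointer.
 * Entries are at least word aligned, so bit 0 of a valid pointer is
 * free to act as the lock bit. */
#include <linux/bit_spinlock.h>

static void bucket_lock(unsigned long *bkt)
{
	bit_spin_lock(0, bkt);
}

static void bucket_assign_locked(unsigned long *bkt, void *new_head)
{
	/* replace the head while keeping the lock bit set */
	WRITE_ONCE(*bkt, (unsigned long)new_head | 1UL);
}

static void bucket_unlock(unsigned long *bkt)
{
	bit_spin_unlock(0, bkt);
}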
+
+static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
+ struct rhash_head *obj)
+{
+ struct bucket_table *new_tbl;
+ struct bucket_table *tbl;
+ struct rhash_lock_head __rcu **bkt;
+ unsigned int hash;
+ void *data;
+
+ new_tbl = rcu_dereference(ht->tbl);
+
+ do {
+ tbl = new_tbl;
+ hash = rht_head_hashfn(ht, tbl, obj, ht->p);
+ if (rcu_access_pointer(tbl->future_tbl))
+ /* Failure is OK */
+ bkt = rht_bucket_var(tbl, hash);
+ else
+ bkt = rht_bucket_insert(ht, tbl, hash);
+ if (bkt == NULL) {
+ new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+ data = ERR_PTR(-EAGAIN);
+ } else {
+ rht_lock(tbl, bkt);
+ data = rhashtable_lookup_one(ht, bkt, tbl,
+ hash, key, obj);
+ new_tbl = rhashtable_insert_one(ht, bkt, tbl,
+ hash, obj, data);
+ if (PTR_ERR(new_tbl) != -EEXIST)
+ data = ERR_CAST(new_tbl);
+
+ rht_unlock(tbl, bkt);
+ }
+ } while (!IS_ERR_OR_NULL(new_tbl));
+
+ if (PTR_ERR(data) == -EAGAIN)
+ data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?:
+ -EAGAIN);
+
+ return data;
+}
+
+void *rhashtable_insert_slow(struct rhashtable *ht, const void *key,
+ struct rhash_head *obj)
+{
+ void *data;
+
+ do {
+ rcu_read_lock();
+ data = rhashtable_try_insert(ht, key, obj);
+ rcu_read_unlock();
+ } while (PTR_ERR(data) == -EAGAIN);
-exit:
- spin_unlock(rht_bucket_lock(tbl, hash));
+ return data;
+}
+EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
- if (err == 0)
+/**
+ * rhashtable_walk_enter - Initialise an iterator
+ * @ht: Table to walk over
+ * @iter: Hash table Iterator
+ *
+ * This function prepares a hash table walk.
+ *
+ * Note that if you restart a walk after rhashtable_walk_stop you
+ * may see the same object twice. Also, you may miss objects if
+ * there are removals in between rhashtable_walk_stop and the next
+ * call to rhashtable_walk_start.
+ *
+ * For a completely stable walk you should construct your own data
+ * structure outside the hash table.
+ *
+ * This function may be called from any process context, including
+ * non-preemptable context, but cannot be called from softirq or
+ * hardirq context.
+ *
+ * You must call rhashtable_walk_exit after this function returns.
+ */
+void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter)
+{
+ iter->ht = ht;
+ iter->p = NULL;
+ iter->slot = 0;
+ iter->skip = 0;
+ iter->end_of_table = 0;
+
+ spin_lock(&ht->lock);
+ iter->walker.tbl =
+ rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock));
+ list_add(&iter->walker.list, &iter->walker.tbl->walkers);
+ spin_unlock(&ht->lock);
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_enter);
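
Taken together with rhashtable_walk_start()/next()/stop()/exit() below, the iterator is normally driven by a loop like this hypothetical sketch (struct test_obj is the example type from the rhashtable_init() kerneldoc further down):

/* Sketch: walk every object in a table, restarting on -EAGAIN. */
static void walk_all(struct rhashtable *ht)
{
	struct rhashtable_iter iter;
	struct test_obj *obj;

	rhashtable_walk_enter(ht, &iter);
	do {
		rhashtable_walk_start(&iter);

		while ((obj = rhashtable_walk_next(&iter)) && !IS_ERR(obj))
			pr_info("key %d\n", obj->key);

		rhashtable_walk_stop(&iter);
		/* PTR_ERR(obj) == -EAGAIN: the table resized under us and
		 * the iterator rewound, so simply start over. */
	} while (PTR_ERR(obj) == -EAGAIN);
	rhashtable_walk_exit(&iter);
}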
+
+/**
+ * rhashtable_walk_exit - Free an iterator
+ * @iter: Hash table Iterator
+ *
+ * This function frees resources allocated by rhashtable_walk_enter.
+ */
+void rhashtable_walk_exit(struct rhashtable_iter *iter)
+{
+ spin_lock(&iter->ht->lock);
+ if (iter->walker.tbl)
+ list_del(&iter->walker.list);
+ spin_unlock(&iter->ht->lock);
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_exit);
+
+/**
+ * rhashtable_walk_start_check - Start a hash table walk
+ * @iter: Hash table iterator
+ *
+ * Start a hash table walk at the current iterator position. Note that we take
+ * the RCU lock in all cases including when we return an error. So you must
+ * always call rhashtable_walk_stop to clean up.
+ *
+ * Returns zero if successful.
+ *
+ * Returns -EAGAIN if resize event occurred. Note that the iterator
+ * will rewind back to the beginning and you may use it immediately
+ * by calling rhashtable_walk_next.
+ *
+ * rhashtable_walk_start is defined as an inline variant that returns
+ * void. This is preferred in cases where the caller would ignore
+ * resize events and always continue.
+ */
+int rhashtable_walk_start_check(struct rhashtable_iter *iter)
+ __acquires(RCU)
+{
+ struct rhashtable *ht = iter->ht;
+ bool rhlist = ht->rhlist;
+
+ rcu_read_lock();
+
+ spin_lock(&ht->lock);
+ if (iter->walker.tbl)
+ list_del(&iter->walker.list);
+ spin_unlock(&ht->lock);
+
+ if (iter->end_of_table)
+ return 0;
+ if (!iter->walker.tbl) {
+ iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht);
+ iter->slot = 0;
+ iter->skip = 0;
+ return -EAGAIN;
+ }
+
+ if (iter->p && !rhlist) {
+ /*
+ * We need to validate that 'p' is still in the table, and
+ * if so, update 'skip'
+ */
+ struct rhash_head *p;
+ int skip = 0;
+ rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
+ skip++;
+ if (p == iter->p) {
+ iter->skip = skip;
+ goto found;
+ }
+ }
+ iter->p = NULL;
+ } else if (iter->p && rhlist) {
+ /* Need to validate that 'list' is still in the table, and
+ * if so, update 'skip' and 'p'.
+ */
+ struct rhash_head *p;
+ struct rhlist_head *list;
+ int skip = 0;
+ rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
+ for (list = container_of(p, struct rhlist_head, rhead);
+ list;
+ list = rcu_dereference(list->next)) {
+ skip++;
+ if (list == iter->list) {
+ iter->p = p;
+ iter->skip = skip;
+ goto found;
+ }
+ }
+ }
+ iter->p = NULL;
+ }
+found:
+ return 0;
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_start_check);
+
+/**
+ * __rhashtable_walk_find_next - Find the next element in a table (or the first
+ * one in case of a new walk).
+ *
+ * @iter: Hash table iterator
+ *
+ * Returns the found object or NULL when the end of the table is reached.
+ *
+ * Returns -EAGAIN if resize event occurred.
+ */
+static void *__rhashtable_walk_find_next(struct rhashtable_iter *iter)
+{
+ struct bucket_table *tbl = iter->walker.tbl;
+ struct rhlist_head *list = iter->list;
+ struct rhashtable *ht = iter->ht;
+ struct rhash_head *p = iter->p;
+ bool rhlist = ht->rhlist;
+
+ if (!tbl)
return NULL;
- else if (err == -EAGAIN)
- return tbl;
+
+ for (; iter->slot < tbl->size; iter->slot++) {
+ int skip = iter->skip;
+
+ rht_for_each_rcu(p, tbl, iter->slot) {
+ if (rhlist) {
+ list = container_of(p, struct rhlist_head,
+ rhead);
+ do {
+ if (!skip)
+ goto next;
+ skip--;
+ list = rcu_dereference(list->next);
+ } while (list);
+
+ continue;
+ }
+ if (!skip)
+ break;
+ skip--;
+ }
+
+next:
+ if (!rht_is_a_nulls(p)) {
+ iter->skip++;
+ iter->p = p;
+ iter->list = list;
+ return rht_obj(ht, rhlist ? &list->rhead : p);
+ }
+
+ iter->skip = 0;
+ }
+
+ iter->p = NULL;
+
+ /* Ensure we see any new tables. */
+ smp_rmb();
+
+ iter->walker.tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+ if (iter->walker.tbl) {
+ iter->slot = 0;
+ iter->skip = 0;
+ return ERR_PTR(-EAGAIN);
+ } else {
+ iter->end_of_table = true;
+ }
+
+ return NULL;
+}
+
+/**
+ * rhashtable_walk_next - Return the next object and advance the iterator
+ * @iter: Hash table iterator
+ *
+ * Note that you must call rhashtable_walk_stop when you are finished
+ * with the walk.
+ *
+ * Returns the next object or NULL when the end of the table is reached.
+ *
+ * Returns -EAGAIN if resize event occurred. Note that the iterator
+ * will rewind back to the beginning and you may continue to use it.
+ */
+void *rhashtable_walk_next(struct rhashtable_iter *iter)
+{
+ struct rhlist_head *list = iter->list;
+ struct rhashtable *ht = iter->ht;
+ struct rhash_head *p = iter->p;
+ bool rhlist = ht->rhlist;
+
+ if (p) {
+ if (!rhlist || !(list = rcu_dereference(list->next))) {
+ p = rcu_dereference(p->next);
+ list = container_of(p, struct rhlist_head, rhead);
+ }
+ if (!rht_is_a_nulls(p)) {
+ iter->skip++;
+ iter->p = p;
+ iter->list = list;
+ return rht_obj(ht, rhlist ? &list->rhead : p);
+ }
+
+ /* At the end of this slot, switch to the next one and then find
+ * the next entry from that point.
+ */
+ iter->skip = 0;
+ iter->slot++;
+ }
+
+ return __rhashtable_walk_find_next(iter);
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_next);
+
+/**
+ * rhashtable_walk_peek - Return the next object but don't advance the iterator
+ * @iter: Hash table iterator
+ *
+ * Returns the next object or NULL when the end of the table is reached.
+ *
+ * Returns -EAGAIN if resize event occurred. Note that the iterator
+ * will rewind back to the beginning and you may continue to use it.
+ */
+void *rhashtable_walk_peek(struct rhashtable_iter *iter)
+{
+ struct rhlist_head *list = iter->list;
+ struct rhashtable *ht = iter->ht;
+ struct rhash_head *p = iter->p;
+
+ if (p)
+ return rht_obj(ht, ht->rhlist ? &list->rhead : p);
+
+ /* No object found in current iter, find next one in the table. */
+
+ if (iter->skip) {
+ /* A nonzero skip value points to the next entry in the table
+ * beyond the last one that was found. Decrement skip so
+ * we find the current value. __rhashtable_walk_find_next
+ * will restore the original value of skip assuming that
+ * the table hasn't changed.
+ */
+ iter->skip--;
+ }
+
+ return __rhashtable_walk_find_next(iter);
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_peek);
+
+/**
+ * rhashtable_walk_stop - Finish a hash table walk
+ * @iter: Hash table iterator
+ *
+ * Finish a hash table walk. Does not reset the iterator to the start of the
+ * hash table.
+ */
+void rhashtable_walk_stop(struct rhashtable_iter *iter)
+ __releases(RCU)
+{
+ struct rhashtable *ht;
+ struct bucket_table *tbl = iter->walker.tbl;
+
+ if (!tbl)
+ goto out;
+
+ ht = iter->ht;
+
+ spin_lock(&ht->lock);
+ if (rcu_head_after_call_rcu(&tbl->rcu, bucket_table_free_rcu))
+ /* This bucket table is being freed, don't re-link it. */
+ iter->walker.tbl = NULL;
else
- return ERR_PTR(err);
+ list_add(&iter->walker.list, &tbl->walkers);
+ spin_unlock(&ht->lock);
+
+out:
+ rcu_read_unlock();
}
+EXPORT_SYMBOL_GPL(rhashtable_walk_stop);
static size_t rounded_hashtable_size(const struct rhashtable_params *params)
{
- return max(roundup_pow_of_two(params->nelem_hint * 4 / 3),
- (unsigned long)params->min_size);
+ size_t retsize;
+
+ if (params->nelem_hint)
+ retsize = max(roundup_pow_of_two(params->nelem_hint * 4 / 3),
+ (unsigned long)params->min_size);
+ else
+ retsize = max(HASH_DEFAULT_SIZE,
+ (unsigned long)params->min_size);
+
+ return retsize;
}
static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed)
return jhash2(key, length, seed);
}
+/**
+ * rhashtable_init - initialize a new hash table
+ * @ht: hash table to be initialized
+ * @params: configuration parameters
+ *
+ * Initializes a new hash table based on the provided configuration
+ * parameters. A table can be configured either with a variable or
+ * fixed length key:
+ *
+ * Configuration Example 1: Fixed length keys
+ * struct test_obj {
+ * int key;
+ * void * my_member;
+ * struct rhash_head node;
+ * };
+ *
+ * struct rhashtable_params params = {
+ * .head_offset = offsetof(struct test_obj, node),
+ * .key_offset = offsetof(struct test_obj, key),
+ * .key_len = sizeof(int),
+ * .hashfn = jhash,
+ * };
+ *
+ * Configuration Example 2: Variable length keys
+ * struct test_obj {
+ * [...]
+ * struct rhash_head node;
+ * };
+ *
+ * u32 my_hash_fn(const void *data, u32 len, u32 seed)
+ * {
+ * struct test_obj *obj = data;
+ *
+ * return [... hash ...];
+ * }
+ *
+ * struct rhashtable_params params = {
+ * .head_offset = offsetof(struct test_obj, node),
+ * .hashfn = jhash,
+ * .obj_hashfn = my_hash_fn,
+ * };
+ */
int rhashtable_init(struct rhashtable *ht,
const struct rhashtable_params *params)
{
struct bucket_table *tbl;
size_t size;
- size = HASH_DEFAULT_SIZE;
-
if ((!params->key_len && !params->obj_hashfn) ||
(params->obj_hashfn && !params->obj_cmpfn))
return -EINVAL;
- if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT))
- return -EINVAL;
-
memset(ht, 0, sizeof(*ht));
mutex_init(&ht->mutex);
spin_lock_init(&ht->lock);
if (params->min_size)
ht->p.min_size = roundup_pow_of_two(params->min_size);
- if (params->max_size)
- ht->p.max_size = rounddown_pow_of_two(params->max_size);
+ /* Cap total entries at 2^31 to avoid nelems overflow. */
+ ht->max_elems = 1u << 31;
- if (params->insecure_max_entries)
- ht->p.insecure_max_entries =
- rounddown_pow_of_two(params->insecure_max_entries);
- else
- ht->p.insecure_max_entries = ht->p.max_size * 2;
-
- ht->p.min_size = max(ht->p.min_size, HASH_MIN_SIZE);
+ if (params->max_size) {
+ ht->p.max_size = rounddown_pow_of_two(params->max_size);
+ if (ht->p.max_size < ht->max_elems / 2)
+ ht->max_elems = ht->p.max_size * 2;
+ }
- if (params->nelem_hint)
- size = rounded_hashtable_size(&ht->p);
-
- /* The maximum (not average) chain length grows with the
- * size of the hash table, at a rate of (log N)/(log log N).
- * The value of 16 is selected so that even if the hash
- * table grew to 2^32 you would not expect the maximum
- * chain length to exceed it unless we are under attack
- * (or extremely unlucky).
- *
- * As this limit is only to detect attacks, we don't need
- * to set it to a lower value as you'd need the chain
- * length to vastly exceed 16 to have any real effect
- * on the system.
- */
- if (!params->insecure_elasticity)
- ht->elasticity = 16;
+ ht->p.min_size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE);
- if (params->locks_mul)
- ht->p.locks_mul = roundup_pow_of_two(params->locks_mul);
- else
- ht->p.locks_mul = BUCKET_LOCKS_PER_CPU;
+ size = rounded_hashtable_size(&ht->p);
ht->key_len = ht->p.key_len;
if (!params->hashfn) {
}
}
+ /*
+ * This is API initialization and thus we need to guarantee the
+ * initial rhashtable allocation. Upon failure, retry with the
+ * smallest possible size using __GFP_NOFAIL semantics.
+ */
tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
- if (tbl == NULL)
- return -ENOMEM;
+ if (unlikely(tbl == NULL)) {
+ size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE);
+ tbl = bucket_table_alloc(ht, size, GFP_KERNEL | __GFP_NOFAIL);
+ }
atomic_set(&ht->nelems, 0);
return 0;
}
+EXPORT_SYMBOL_GPL(rhashtable_init);
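
For context, here is roughly how a table built with the fixed-length-key parameters from the kerneldoc above is used; rhashtable_insert_fast()/rhashtable_lookup_fast()/rhashtable_remove_fast() are the standard fast-path helpers, not something added by this patch:

/* Sketch, assuming the 'struct test_obj' and 'params' definitions from
 * the rhashtable_init() example above. */
static int example(void)
{
	struct rhashtable ht;
	struct test_obj *obj, *found;
	int key = 42, err;

	err = rhashtable_init(&ht, &params);
	if (err)
		return err;

	obj = kzalloc(sizeof(*obj), GFP_KERNEL);
	if (!obj) {
		err = -ENOMEM;
		goto out_destroy;
	}
	obj->key = key;

	/* hashes obj->key and links obj->node into the matching bucket */
	err = rhashtable_insert_fast(&ht, &obj->node, params);
	if (err)
		goto out_free;

	found = rhashtable_lookup_fast(&ht, &key, params);
	WARN_ON(found != obj);

	err = rhashtable_remove_fast(&ht, &obj->node, params);
out_free:
	kfree(obj);
out_destroy:
	rhashtable_destroy(&ht);
	return err;
}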
-void rhashtable_destroy(struct rhashtable *ht)
+/**
+ * rhltable_init - initialize a new hash list table
+ * @hlt: hash list table to be initialized
+ * @params: configuration parameters
+ *
+ * Initializes a new hash list table.
+ *
+ * See documentation for rhashtable_init.
+ */
+int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params)
{
- struct bucket_table *tbl;
+ int err;
+
+ err = rhashtable_init(&hlt->ht, params);
+ hlt->ht.rhlist = true;
+ return err;
+}
+EXPORT_SYMBOL_GPL(rhltable_init);
+
+static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj,
+ void (*free_fn)(void *ptr, void *arg),
+ void *arg)
+{
+ struct rhlist_head *list;
+
+ if (!ht->rhlist) {
+ free_fn(rht_obj(ht, obj), arg);
+ return;
+ }
+
+ list = container_of(obj, struct rhlist_head, rhead);
+ do {
+ obj = &list->rhead;
+ list = rht_dereference(list->next, ht);
+ free_fn(rht_obj(ht, obj), arg);
+ } while (list);
+}
+
+/**
+ * rhashtable_free_and_destroy - free elements and destroy hash table
+ * @ht: the hash table to destroy
+ * @free_fn: callback to release resources of element
+ * @arg: pointer passed to free_fn
+ *
+ * Stops any async resize that may be in progress. If defined, invokes
+ * free_fn for each element to release its resources. Please note that
+ * RCU-protected readers may still be accessing the elements, so resources
+ * must be released in a compatible manner. Then frees the bucket array.
+ *
+ * This function will eventually sleep to wait for an async resize
+ * to complete. The caller is responsible for ensuring that no further
+ * write operations occur in parallel.
+ */
+void rhashtable_free_and_destroy(struct rhashtable *ht,
+ void (*free_fn)(void *ptr, void *arg),
+ void *arg)
+{
+ struct bucket_table *tbl, *next_tbl;
+ unsigned int i;
cancel_work_sync(&ht->run_work);
mutex_lock(&ht->mutex);
tbl = rht_dereference(ht->tbl, ht);
+restart:
+ if (free_fn) {
+ for (i = 0; i < tbl->size; i++) {
+ struct rhash_head *pos, *next;
+
+ cond_resched();
+ for (pos = rht_ptr_exclusive(rht_bucket(tbl, i)),
+ next = !rht_is_a_nulls(pos) ?
+ rht_dereference(pos->next, ht) : NULL;
+ !rht_is_a_nulls(pos);
+ pos = next,
+ next = !rht_is_a_nulls(pos) ?
+ rht_dereference(pos->next, ht) : NULL)
+ rhashtable_free_one(ht, pos, free_fn, arg);
+ }
+ }
+
+ next_tbl = rht_dereference(tbl->future_tbl, ht);
bucket_table_free(tbl);
+ if (next_tbl) {
+ tbl = next_tbl;
+ goto restart;
+ }
mutex_unlock(&ht->mutex);
}
+EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy);
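
A short hypothetical teardown using the new helper; it assumes the elements were allocated with kmalloc() and embed a struct rcu_head named rcu, since RCU readers may still hold references when free_fn runs:

/* Sketch: free every element and then the table itself. */
static void free_obj(void *ptr, void *arg)
{
	struct test_obj *obj = ptr;

	kfree_rcu(obj, rcu);	/* defer the free past any RCU readers */
}

static void teardown(struct rhashtable *ht)
{
	rhashtable_free_and_destroy(ht, free_obj, NULL);
}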
+
+void rhashtable_destroy(struct rhashtable *ht)
+{
+ return rhashtable_free_and_destroy(ht, NULL, NULL);
+}
+EXPORT_SYMBOL_GPL(rhashtable_destroy);
+
+struct rhash_lock_head __rcu **__rht_bucket_nested(
+ const struct bucket_table *tbl, unsigned int hash)
+{
+ const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+ unsigned int index = hash & ((1 << tbl->nest) - 1);
+ unsigned int size = tbl->size >> tbl->nest;
+ unsigned int subhash = hash;
+ union nested_table *ntbl;
+
+ ntbl = nested_table_top(tbl);
+ ntbl = rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash);
+ subhash >>= tbl->nest;
+
+ while (ntbl && size > (1 << shift)) {
+ index = subhash & ((1 << shift) - 1);
+ ntbl = rht_dereference_bucket_rcu(ntbl[index].table,
+ tbl, hash);
+ size >>= shift;
+ subhash >>= shift;
+ }
+
+ if (!ntbl)
+ return NULL;
+
+ return &ntbl[subhash].bucket;
+
+}
+EXPORT_SYMBOL_GPL(__rht_bucket_nested);
+
+struct rhash_lock_head __rcu **rht_bucket_nested(
+ const struct bucket_table *tbl, unsigned int hash)
+{
+ static struct rhash_lock_head __rcu *rhnull;
+
+ if (!rhnull)
+ INIT_RHT_NULLS_HEAD(rhnull);
+ return __rht_bucket_nested(tbl, hash) ?: &rhnull;
+}
+EXPORT_SYMBOL_GPL(rht_bucket_nested);
+
+struct rhash_lock_head __rcu **rht_bucket_nested_insert(
+ struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash)
+{
+ const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+ unsigned int index = hash & ((1 << tbl->nest) - 1);
+ unsigned int size = tbl->size >> tbl->nest;
+ union nested_table *ntbl;
+
+ ntbl = nested_table_top(tbl);
+ hash >>= tbl->nest;
+ ntbl = nested_table_alloc(ht, &ntbl[index].table,
+ size <= (1 << shift));
+
+ while (ntbl && size > (1 << shift)) {
+ index = hash & ((1 << shift) - 1);
+ size >>= shift;
+ hash >>= shift;
+ ntbl = nested_table_alloc(ht, &ntbl[index].table,
+ size <= (1 << shift));
+ }
+
+ if (!ntbl)
+ return NULL;
+
+ return &ntbl[hash].bucket;
+
+}
+EXPORT_SYMBOL_GPL(rht_bucket_nested_insert);
{
struct task_struct *p = malloc(sizeof(*p));
- mlockall(MCL_CURRENT|MCL_FUTURE);
-
memset(p, 0, sizeof(*p));
p->state = TASK_RUNNING;
struct meminfo {
u64 total;
u64 available;
-
};
static u64 parse_meminfo_line(const char *line)
f = fopen("/proc/meminfo", "r");
if (!f)
- die("error opening /proc/meminfo: %m");
+ return ret;
while ((len = getline(&line, &n, f)) != -1) {
if ((v = strcmp_prefix(line, "MemTotal:")))
return;
info = read_meminfo();
- want_shrink = (info.total >> 2) - info.available;
- if (want_shrink <= 0)
- return;
+ if (info.total && info.available) {
+ want_shrink = (info.total >> 2) - info.available;
+
+ if (want_shrink <= 0)
+ return;
+ } else {
+ /* If we weren't able to read /proc/meminfo, assume we're low
+ * on memory and shrink by a fixed amount: */
+
+ want_shrink = 8 << 20;
+ }
mutex_lock(&shrinker_lock);
list_for_each_entry(shrinker, &shrinker_list, list) {
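
The shrinker heuristic above frees caches whenever less than a quarter of system memory is available, falling back to a fixed 8 MiB request when /proc/meminfo cannot be read. A standalone userspace sketch of that logic (the parsing helper here is illustrative; the tool's own read_meminfo()/strcmp_prefix() differ in detail):

/* Sketch: compute how much the userspace shrinkers should give back. */
#include <inttypes.h>
#include <stdio.h>

struct meminfo { uint64_t total, available; };

static struct meminfo read_meminfo_sketch(void)
{
	struct meminfo info = { 0, 0 };
	char line[256];
	uint64_t kb;
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return info;	/* caller falls back to a fixed shrink */

	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, "MemTotal: %" SCNu64 " kB", &kb) == 1)
			info.total = kb << 10;
		else if (sscanf(line, "MemAvailable: %" SCNu64 " kB", &kb) == 1)
			info.available = kb << 10;
	}
	fclose(f);
	return info;
}

int main(void)
{
	struct meminfo info = read_meminfo_sketch();
	int64_t want_shrink = info.total && info.available
		? (int64_t)(info.total >> 2) - (int64_t)info.available
		: 8 << 20;	/* couldn't read meminfo: assume we're low */

	if (want_shrink > 0)
		printf("asking shrinkers for %" PRId64 " bytes\n", want_shrink);
	return 0;
}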
#include <linux/export.h>
#include <linux/log2.h>
+#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/rt.h>
#include <linux/six.h>
+#include <linux/slab.h>
#ifdef DEBUG
#define EBUG_ON(cond) BUG_ON(cond)
#define LOCK_VALS { \
[SIX_LOCK_read] = { \
.lock_val = __SIX_VAL(read_lock, 1), \
- .lock_fail = __SIX_LOCK_HELD_write, \
+ .lock_fail = __SIX_LOCK_HELD_write + __SIX_VAL(write_locking, 1),\
.unlock_val = -__SIX_VAL(read_lock, 1), \
.held_mask = __SIX_LOCK_HELD_read, \
.unlock_wakeup = SIX_LOCK_write, \
}
}
+static inline unsigned pcpu_read_count(struct six_lock *lock)
+{
+ unsigned read_count = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ read_count += *per_cpu_ptr(lock->readers, cpu);
+ return read_count;
+}
+
+struct six_lock_waiter {
+ struct list_head list;
+ struct task_struct *task;
+};
+
+/* This is probably up there with the more evil things I've done */
+#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l))
+
+static inline void six_lock_wakeup(struct six_lock *lock,
+ union six_lock_state state,
+ unsigned waitlist_id)
+{
+ if (waitlist_id == SIX_LOCK_write) {
+ if (state.write_locking && !state.read_lock) {
+ struct task_struct *p = READ_ONCE(lock->owner);
+ if (p)
+ wake_up_process(p);
+ }
+ } else {
+ struct list_head *wait_list = &lock->wait_list[waitlist_id];
+ struct six_lock_waiter *w, *next;
+
+ if (!(state.waiters & (1 << waitlist_id)))
+ return;
+
+ clear_bit(waitlist_bitnr(waitlist_id),
+ (unsigned long *) &lock->state.v);
+
+ raw_spin_lock(&lock->wait_lock);
+
+ list_for_each_entry_safe(w, next, wait_list, list) {
+ list_del_init(&w->list);
+
+ if (wake_up_process(w->task) &&
+ waitlist_id != SIX_LOCK_read) {
+ if (!list_empty(wait_list))
+ set_bit(waitlist_bitnr(waitlist_id),
+ (unsigned long *) &lock->state.v);
+ break;
+ }
+ }
+
+ raw_spin_unlock(&lock->wait_lock);
+ }
+}
+
static __always_inline bool do_six_trylock_type(struct six_lock *lock,
- enum six_lock_type type)
+ enum six_lock_type type,
+ bool try)
{
const struct six_lock_vals l[] = LOCK_VALS;
- union six_lock_state old;
- u64 v = READ_ONCE(lock->state.v);
+ union six_lock_state old, new;
+ bool ret;
+ u64 v;
EBUG_ON(type == SIX_LOCK_write && lock->owner != current);
+ EBUG_ON(type == SIX_LOCK_write && (lock->state.seq & 1));
- do {
- old.v = v;
+ EBUG_ON(type == SIX_LOCK_write && (try != !(lock->state.write_locking)));
- EBUG_ON(type == SIX_LOCK_write &&
- ((old.v & __SIX_LOCK_HELD_write) ||
- !(old.v & __SIX_LOCK_HELD_intent)));
+ /*
+ * Percpu reader mode:
+ *
+ * The basic idea behind this algorithm is that you can implement a lock
+ * between two threads without any atomics, just memory barriers:
+ *
+ * For two threads you'll need two variables, one variable for "thread a
+ * has the lock" and another for "thread b has the lock".
+ *
+ * To take the lock, a thread sets its variable indicating that it holds
+ * the lock, then issues a full memory barrier, then reads from the
+ * other thread's variable to check if the other thread thinks it has
+ * the lock. If we raced, we back off and retry/sleep.
+ */
- if (old.v & l[type].lock_fail)
- return false;
- } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
- old.v,
- old.v + l[type].lock_val)) != old.v);
+ if (type == SIX_LOCK_read && lock->readers) {
+retry:
+ preempt_disable();
+ this_cpu_inc(*lock->readers); /* signal that we own lock */
- six_set_owner(lock, type, old);
- return true;
+ smp_mb();
+
+ old.v = READ_ONCE(lock->state.v);
+ ret = !(old.v & l[type].lock_fail);
+
+ this_cpu_sub(*lock->readers, !ret);
+ preempt_enable();
+
+ /*
+ * If we failed because a writer was trying to take the
+ * lock, issue a wakeup because we might have caused a
+ * spurious trylock failure:
+ */
+ if (old.write_locking) {
+ struct task_struct *p = READ_ONCE(lock->owner);
+
+ if (p)
+ wake_up_process(p);
+ }
+
+ /*
+ * If we failed from the lock path and the waiting bit wasn't
+ * set, set it:
+ */
+ if (!try && !ret) {
+ v = old.v;
+
+ do {
+ new.v = old.v = v;
+
+ if (!(old.v & l[type].lock_fail))
+ goto retry;
+
+ if (new.waiters & (1 << type))
+ break;
+
+ new.waiters |= 1 << type;
+ } while ((v = atomic64_cmpxchg(&lock->state.counter,
+ old.v, new.v)) != old.v);
+ }
+ } else if (type == SIX_LOCK_write && lock->readers) {
+ if (try) {
+ atomic64_add(__SIX_VAL(write_locking, 1),
+ &lock->state.counter);
+ smp_mb__after_atomic();
+ }
+
+ ret = !pcpu_read_count(lock);
+
+ /*
+ * On success, we increment lock->seq; also we clear
+ * write_locking unless we failed from the lock path:
+ */
+ v = 0;
+ if (ret)
+ v += __SIX_VAL(seq, 1);
+ if (ret || try)
+ v -= __SIX_VAL(write_locking, 1);
+
+ if (try && !ret) {
+ old.v = atomic64_add_return(v, &lock->state.counter);
+ six_lock_wakeup(lock, old, SIX_LOCK_read);
+ } else {
+ atomic64_add(v, &lock->state.counter);
+ }
+ } else {
+ v = READ_ONCE(lock->state.v);
+ do {
+ new.v = old.v = v;
+
+ if (!(old.v & l[type].lock_fail)) {
+ new.v += l[type].lock_val;
+
+ if (type == SIX_LOCK_write)
+ new.write_locking = 0;
+ } else if (!try && type != SIX_LOCK_write &&
+ !(new.waiters & (1 << type)))
+ new.waiters |= 1 << type;
+ else
+ break; /* waiting bit already set */
+ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
+ old.v, new.v)) != old.v);
+
+ ret = !(old.v & l[type].lock_fail);
+
+ EBUG_ON(ret && !(lock->state.v & l[type].held_mask));
+ }
+
+ if (ret)
+ six_set_owner(lock, type, old);
+
+ EBUG_ON(type == SIX_LOCK_write && (try || ret) && (lock->state.write_locking));
+
+ return ret;
}
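
The percpu reader comment above describes a two-thread lock built purely from stores, a full barrier, and loads; the percpu fast path generalizes that to one counter per CPU checked against the writer's write_locking bit. Reduced to its two-thread essence, the protocol looks like this illustrative sketch (not code from the patch):

/* Sketch: thread A's half of a barrier-only lock; thread B runs the
 * mirror image with the roles of a_locked/b_locked swapped. */
static int a_locked, b_locked;

static bool trylock_a(void)
{
	WRITE_ONCE(a_locked, 1);	/* signal that we want the lock */
	smp_mb();			/* order our store before their load */
	if (READ_ONCE(b_locked)) {
		WRITE_ONCE(a_locked, 0);	/* raced: back off, retry/sleep */
		return false;
	}
	return true;
}

static void unlock_a(void)
{
	smp_mb();			/* critical section before the release */
	WRITE_ONCE(a_locked, 0);
}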
__always_inline __flatten
static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type)
{
- if (!do_six_trylock_type(lock, type))
+ if (!do_six_trylock_type(lock, type, true))
return false;
if (type != SIX_LOCK_write)
{
const struct six_lock_vals l[] = LOCK_VALS;
union six_lock_state old;
- u64 v = READ_ONCE(lock->state.v);
+ u64 v;
+
+ EBUG_ON(type == SIX_LOCK_write);
+
+ if (type == SIX_LOCK_read &&
+ lock->readers) {
+ bool ret;
+ preempt_disable();
+ this_cpu_inc(*lock->readers);
+
+ smp_mb();
+
+ old.v = READ_ONCE(lock->state.v);
+ ret = !(old.v & l[type].lock_fail) && old.seq == seq;
+
+ this_cpu_sub(*lock->readers, !ret);
+ preempt_enable();
+
+ /*
+ * Similar to the lock path, we may have caused a spurious write
+ * lock fail and need to issue a wakeup:
+ */
+ if (old.write_locking) {
+ struct task_struct *p = READ_ONCE(lock->owner);
+
+ if (p)
+ wake_up_process(p);
+ }
+
+ if (ret)
+ six_acquire(&lock->dep_map, 1);
+
+ return ret;
+ }
+
+ v = READ_ONCE(lock->state.v);
do {
old.v = v;
return true;
}
-struct six_lock_waiter {
- struct list_head list;
- struct task_struct *task;
-};
-
-/* This is probably up there with the more evil things I've done */
-#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l))
-
#ifdef CONFIG_LOCK_SPIN_ON_OWNER
static inline int six_can_spin_on_owner(struct six_lock *lock)
if (owner && !six_spin_on_owner(lock, owner))
break;
- if (do_six_trylock_type(lock, type)) {
+ if (do_six_trylock_type(lock, type, false)) {
osq_unlock(&lock->osq);
preempt_enable();
return true;
static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type,
six_lock_should_sleep_fn should_sleep_fn, void *p)
{
- const struct six_lock_vals l[] = LOCK_VALS;
- union six_lock_state old, new;
+ union six_lock_state old;
struct six_lock_waiter wait;
int ret = 0;
- u64 v;
+
+ if (type == SIX_LOCK_write) {
+ EBUG_ON(lock->state.write_locking);
+ atomic64_add(__SIX_VAL(write_locking, 1), &lock->state.counter);
+ smp_mb__after_atomic();
+ }
ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
if (ret)
- return ret;
+ goto out_before_sleep;
if (six_optimistic_spin(lock, type))
- return 0;
+ goto out_before_sleep;
lock_contended(&lock->dep_map, _RET_IP_);
raw_spin_unlock(&lock->wait_lock);
}
- ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
- if (ret)
+ if (do_six_trylock_type(lock, type, false))
break;
- v = READ_ONCE(lock->state.v);
- do {
- new.v = old.v = v;
-
- if (!(old.v & l[type].lock_fail))
- new.v += l[type].lock_val;
- else if (!(new.waiters & (1 << type)))
- new.waiters |= 1 << type;
- else
- break; /* waiting bit already set */
- } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
- old.v, new.v)) != old.v);
-
- if (!(old.v & l[type].lock_fail))
+ ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
+ if (ret)
break;
schedule();
}
- if (!ret)
- six_set_owner(lock, type, old);
-
__set_current_state(TASK_RUNNING);
if (!list_empty_careful(&wait.list)) {
list_del_init(&wait.list);
raw_spin_unlock(&lock->wait_lock);
}
+out_before_sleep:
+ if (ret && type == SIX_LOCK_write) {
+ old.v = atomic64_sub_return(__SIX_VAL(write_locking, 1),
+ &lock->state.counter);
+ six_lock_wakeup(lock, old, SIX_LOCK_read);
+ }
return ret;
}
if (type != SIX_LOCK_write)
six_acquire(&lock->dep_map, 0);
- ret = do_six_trylock_type(lock, type) ? 0
+ ret = do_six_trylock_type(lock, type, true) ? 0
: __six_lock_type_slowpath(lock, type, should_sleep_fn, p);
if (ret && type != SIX_LOCK_write)
return ret;
}
-static inline void six_lock_wakeup(struct six_lock *lock,
- union six_lock_state state,
- unsigned waitlist_id)
-{
- struct list_head *wait_list = &lock->wait_list[waitlist_id];
- struct six_lock_waiter *w, *next;
-
- if (waitlist_id == SIX_LOCK_write && state.read_lock)
- return;
-
- if (!(state.waiters & (1 << waitlist_id)))
- return;
-
- clear_bit(waitlist_bitnr(waitlist_id),
- (unsigned long *) &lock->state.v);
-
- if (waitlist_id == SIX_LOCK_write) {
- struct task_struct *p = READ_ONCE(lock->owner);
-
- if (p)
- wake_up_process(p);
- return;
- }
-
- raw_spin_lock(&lock->wait_lock);
-
- list_for_each_entry_safe(w, next, wait_list, list) {
- list_del_init(&w->list);
-
- if (wake_up_process(w->task) &&
- waitlist_id != SIX_LOCK_read) {
- if (!list_empty(wait_list))
- set_bit(waitlist_bitnr(waitlist_id),
- (unsigned long *) &lock->state.v);
- break;
- }
- }
-
- raw_spin_unlock(&lock->wait_lock);
-}
-
__always_inline __flatten
static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type)
{
const struct six_lock_vals l[] = LOCK_VALS;
union six_lock_state state;
- EBUG_ON(!(lock->state.v & l[type].held_mask));
EBUG_ON(type == SIX_LOCK_write &&
!(lock->state.v & __SIX_LOCK_HELD_intent));
lock->owner = NULL;
}
- state.v = atomic64_add_return_release(l[type].unlock_val,
- &lock->state.counter);
+ if (type == SIX_LOCK_read &&
+ lock->readers) {
+ smp_mb(); /* unlock barrier */
+ this_cpu_dec(*lock->readers);
+ smp_mb(); /* between unlocking and checking for waiters */
+ state.v = READ_ONCE(lock->state.v);
+ } else {
+ EBUG_ON(!(lock->state.v & l[type].held_mask));
+ state.v = atomic64_add_return_release(l[type].unlock_val,
+ &lock->state.counter);
+ }
+
six_lock_wakeup(lock, state, l[type].unlock_wakeup);
}
bool six_lock_tryupgrade(struct six_lock *lock)
{
- const struct six_lock_vals l[] = LOCK_VALS;
union six_lock_state old, new;
u64 v = READ_ONCE(lock->state.v);
do {
new.v = old.v = v;
- EBUG_ON(!(old.v & l[SIX_LOCK_read].held_mask));
-
- new.v += l[SIX_LOCK_read].unlock_val;
-
- if (new.v & l[SIX_LOCK_intent].lock_fail)
+ if (new.intent_lock)
return false;
- new.v += l[SIX_LOCK_intent].lock_val;
+ if (!lock->readers) {
+ EBUG_ON(!new.read_lock);
+ new.read_lock--;
+ }
+
+ new.intent_lock = 1;
} while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
old.v, new.v)) != old.v);
+ if (lock->readers)
+ this_cpu_dec(*lock->readers);
+
six_set_owner(lock, SIX_LOCK_intent, old);
- six_lock_wakeup(lock, new, l[SIX_LOCK_read].unlock_wakeup);
return true;
}
{
const struct six_lock_vals l[] = LOCK_VALS;
- EBUG_ON(type == SIX_LOCK_write);
six_acquire(&lock->dep_map, 0);
/* XXX: assert already locked, and that we don't overflow: */
switch (type) {
case SIX_LOCK_read:
- atomic64_add(l[type].lock_val, &lock->state.counter);
+ if (lock->readers) {
+ this_cpu_inc(*lock->readers);
+ } else {
+ EBUG_ON(!lock->state.read_lock &&
+ !lock->state.intent_lock);
+ atomic64_add(l[type].lock_val, &lock->state.counter);
+ }
break;
case SIX_LOCK_intent:
+ EBUG_ON(!lock->state.intent_lock);
lock->intent_lock_recurse++;
break;
case SIX_LOCK_write:
raw_spin_unlock(&lock->wait_lock);
}
EXPORT_SYMBOL_GPL(six_lock_wakeup_all);
+
+struct free_pcpu_rcu {
+ struct rcu_head rcu;
+ void __percpu *p;
+};
+
+static void free_pcpu_rcu_fn(struct rcu_head *_rcu)
+{
+ struct free_pcpu_rcu *rcu =
+ container_of(_rcu, struct free_pcpu_rcu, rcu);
+
+ free_percpu(rcu->p);
+ kfree(rcu);
+}
+
+void six_lock_pcpu_free_rcu(struct six_lock *lock)
+{
+ struct free_pcpu_rcu *rcu = kzalloc(sizeof(*rcu), GFP_KERNEL);
+
+ if (!rcu)
+ return;
+
+ rcu->p = lock->readers;
+ lock->readers = NULL;
+
+ call_rcu(&rcu->rcu, free_pcpu_rcu_fn);
+}
+EXPORT_SYMBOL_GPL(six_lock_pcpu_free_rcu);
+
+void six_lock_pcpu_free(struct six_lock *lock)
+{
+ BUG_ON(lock->readers && pcpu_read_count(lock));
+ BUG_ON(lock->state.read_lock);
+
+ free_percpu(lock->readers);
+ lock->readers = NULL;
+}
+EXPORT_SYMBOL_GPL(six_lock_pcpu_free);
+
+void six_lock_pcpu_alloc(struct six_lock *lock)
+{
+#ifdef __KERNEL__
+ if (!lock->readers)
+ lock->readers = alloc_percpu(unsigned);
+#endif
+}
+EXPORT_SYMBOL_GPL(six_lock_pcpu_alloc);
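
A brief sketch of how a caller opts a lock into percpu reader mode with the helpers above. six_lock_init(), six_lock_read() and six_unlock_read() are the pre-existing six-lock entry points; if the alloc_percpu() inside six_lock_pcpu_alloc() fails, lock->readers stays NULL and the lock simply keeps using the atomic fast path:

/* Sketch: a read-mostly lock switched to percpu reader mode. */
static struct six_lock example_lock;

static void example(void)
{
	six_lock_init(&example_lock);
	six_lock_pcpu_alloc(&example_lock);

	six_lock_read(&example_lock, NULL, NULL);
	/* ... read-side critical section ... */
	six_unlock_read(&example_lock);

	/* percpu counters may still be inspected under RCU, so defer */
	six_lock_pcpu_free_rcu(&example_lock);
}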
return 1;
return 2;
}
+
+struct bpos bpos_parse(char *buf)
+{
+ char *s = buf, *field;
+ u64 inode_v = 0, offset_v = 0;
+
+ if (!(field = strsep(&s, ":")) ||
+ kstrtoull(field, 10, &inode_v))
+ die("invalid bpos %s", buf);
+
+ if ((field = strsep(&s, ":")) &&
+ kstrtoull(field, 10, &offset_v))
+ die("invalid bpos %s", buf);
+
+ if (s)
+ die("invalid bpos %s", buf);
+
+ return (struct bpos) { .inode = inode_v, .offset = offset_v };
+}
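
For reference, how the new -s/-e position strings map onto bpos_parse(); note the buffer must be writable, since strsep() modifies it in place (hypothetical snippet):

/* Sketch: positions as accepted by the data job -s/-e options. */
char start_str[] = "4096:0", end_str[] = "4096";

struct bpos start = bpos_parse(start_str);	/* { .inode = 4096, .offset = 0 } */
struct bpos end   = bpos_parse(end_str);	/* missing offset defaults to 0 */
/* anything else, e.g. "4096:0:1" or "foo:0", calls die("invalid bpos ...") */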
_ret; \
})
+struct bpos bpos_parse(char *);
+
#endif /* _TOOLS_UTIL_H */