* zlib1g
In addition, a recent Rust toolchain is required (rustc, cargo), either by using
-[rustup](https://rustup.rs/) or make sure to use a distribution where rustc (>=1.64)
+[rustup](https://rustup.rs/) or by using a distribution where rustc (>=1.65)
is available.
Debian (Bullseye or later) and Ubuntu (20.04 or later): you can install these with
+VERSION=1.3.3
+
PREFIX?=/usr/local
PKG_CONFIG?=pkg-config
INSTALL=install
+LN=ln
ifeq ("$(origin V)", "command line")
BUILD_VERBOSE = $(V)
$(EXTRA_CFLAGS)
LDFLAGS+=$(CFLAGS) $(EXTRA_LDFLAGS)
-CARGO_ARGS=
+ifdef CARGO_TOOLCHAIN_VERSION
+ CARGO_TOOLCHAIN = +$(CARGO_TOOLCHAIN_VERSION)
+endif
+
+CARGO_ARGS=${CARGO_TOOLCHAIN}
CARGO=cargo $(CARGO_ARGS)
CARGO_PROFILE=release
# CARGO_PROFILE=debug
CARGO_BUILD_ARGS=--$(CARGO_PROFILE)
CARGO_BUILD=$(CARGO) build $(CARGO_BUILD_ARGS)
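# Illustrative usage of the toolchain pin above (the version number is an
# arbitrary example): `make CARGO_TOOLCHAIN_VERSION=1.70.0` makes every cargo
# invocation run as `cargo +1.70.0 ...` via rustup's toolchain shim.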
-VERSION?=$(shell git describe --dirty=+ 2>/dev/null || echo v0.1-nogit)
include Makefile.compiler
install: INITRAMFS_SCRIPT=$(INITRAMFS_DIR)/scripts/local-premount/bcachefs
install: bcachefs
$(INSTALL) -m0755 -D bcachefs -t $(DESTDIR)$(ROOT_SBINDIR)
- $(INSTALL) -m0755 fsck.bcachefs $(DESTDIR)$(ROOT_SBINDIR)
- $(INSTALL) -m0755 mkfs.bcachefs $(DESTDIR)$(ROOT_SBINDIR)
- $(INSTALL) -m0755 mount.bcachefs $(DESTDIR)$(ROOT_SBINDIR)
$(INSTALL) -m0644 -D bcachefs.8 -t $(DESTDIR)$(PREFIX)/share/man/man8/
$(INSTALL) -m0755 -D initramfs/script $(DESTDIR)$(INITRAMFS_SCRIPT)
$(INSTALL) -m0755 -D initramfs/hook $(DESTDIR)$(INITRAMFS_HOOK)
+ $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mkfs.bcachefs
+ $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/fsck.bcachefs
+ $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mount.bcachefs
+ $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mkfs.fuse.bcachefs
+ $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/fsck.fuse.bcachefs
+ $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mount.fuse.bcachefs
sed -i '/^# Note: make install replaces/,$$d' $(DESTDIR)$(INITRAMFS_HOOK)
echo "copy_exec $(ROOT_SBINDIR)/bcachefs /sbin/bcachefs" >> $(DESTDIR)$(INITRAMFS_HOOK)
doc: bcachefs-principles-of-operation.pdf
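+# Regenerate the Rust lockfiles with nightly cargo's MSRV-aware resolver
+# (-Zmsrv-policy), keeping dependency versions compatible with the minimum
+# supported rustc rather than always taking the newest releases.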
+.PHONY: cargo-update-msrv
+cargo-update-msrv:
+ cargo +nightly generate-lockfile --manifest-path rust-src/Cargo.toml -Zmsrv-policy
+ cargo +nightly generate-lockfile --manifest-path rust-src/bch_bindgen/Cargo.toml -Zmsrv-policy
+
.PHONY: update-bcachefs-sources
update-bcachefs-sources:
git rm -rf --ignore-unmatch libbcachefs
git add include/linux/xxhash.h
cp $(LINUX_DIR)/lib/xxhash.c linux/
git add linux/xxhash.c
- cp $(LINUX_DIR)/kernel/locking/six.c linux/
- git add linux/six.c
- cp $(LINUX_DIR)/include/linux/six.h include/linux/
- git add include/linux/six.h
cp $(LINUX_DIR)/include/linux/list_nulls.h include/linux/
git add include/linux/list_nulls.h
cp $(LINUX_DIR)/include/linux/poison.h include/linux/
# Usage: aflags-y += $(call as-option,-Wa$(comma)-isa=foo,)
as-option = $(call try-run,\
- $(CC) -Werror $(KBUILD_AFLAGS) $(1) -c -x assembler-with-cpp /dev/null -o "$$TMP",$(1),$(2))
+ $(CC) -Werror $(KBUILD_CPPFLAGS) $(KBUILD_AFLAGS) $(1) -c -x assembler-with-cpp /dev/null -o "$$TMP",$(1),$(2))
# as-instr
# Usage: aflags-y += $(call as-instr,instr,option1,option2)
as-instr = $(call try-run,\
- printf "%b\n" "$(1)" | $(CC) -Werror $(KBUILD_AFLAGS) -c -x assembler-with-cpp -o "$$TMP" -,$(2),$(3))
+ printf "%b\n" "$(1)" | $(CC) -Werror $(CLANG_FLAGS) $(KBUILD_AFLAGS) -c -x assembler-with-cpp -o "$$TMP" -,$(2),$(3))
# __cc-option
# Usage: MY_CFLAGS += $(call __cc-option,$(CC),$(MY_CFLAGS),-march=winchip-c6,-march=i586)
# ld-option
# Usage: KBUILD_LDFLAGS += $(call ld-option, -X, -Y)
ld-option = $(call try-run, $(LD) $(KBUILD_LDFLAGS) $(1) -v,$(1),$(2),$(3))
-
-# ld-ifversion
-# Usage: $(call ld-ifversion, -ge, 22252, y)
-ld-ifversion = $(shell [ $(CONFIG_LD_VERSION)0 $(1) $(2)0 ] && echo $(3) || echo $(4))
"\n"
"Commands for operating on files in a bcachefs filesystem:\n"
" setattr Set various per file attributes\n"
+ "\n"
"Debug:\n"
"These commands work on offline, unmounted filesystems\n"
" dump Dump filesystem metadata to a qcow2 image\n"
#endif
" list_journal List contents of journal\n"
"\n"
+ "FUSE:\n"
+ " fusemount Mount a filesystem via FUSE\n"
+ "\n"
"Miscellaneous:\n"
+#ifndef BCACHEFS_NO_RUST
+ " completions Generate shell completions\n"
+#endif
" version Display the version of the invoked bcachefs tool\n");
}
full_cmd = argv[0];
+ /* Are we being called via a symlink? */
+
+ if (strstr(full_cmd, "mkfs"))
+ return cmd_format(argc, argv);
+
+ if (strstr(full_cmd, "fsck"))
+ return cmd_fsck(argc, argv);
+
+#ifdef BCACHEFS_FUSE
+ if (strstr(full_cmd, "mount.fuse"))
+ return cmd_fusemount(argc, argv);
+#endif
+
+#ifndef BCACHEFS_NO_RUST
+ if (strstr(full_cmd, "mount"))
+ return cmd_mount(argc, argv);
+#endif
+
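+ /*
+ * e.g. invoking the mkfs.bcachefs symlink runs `bcachefs format`, and
+ * fsck.bcachefs runs `bcachefs fsck`; mount.bcachefs dispatches to
+ * cmd_mount, while the mount.fuse.bcachefs name is routed to the FUSE
+ * mount implementation first.
+ */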
setvbuf(stdout, NULL, _IOLBF, 0);
char *cmd = pop_cmd(&argc, argv);
if (!strcmp(cmd, "setattr"))
return cmd_setattr(argc, argv);
#ifndef BCACHEFS_NO_RUST
- if (!strcmp(cmd, "mount")) {
- cmd_mount(argc, argv);
- return 0;
- }
+ if (!strcmp(cmd, "mount"))
+ return cmd_mount(argc, argv);
+ if (!strcmp(cmd, "completions"))
+ return cmd_completions(argc, argv);
#endif
#ifdef BCACHEFS_FUSE
-{ lib
-, stdenv
-, pkg-config
-, attr
-, libuuid
-, libsodium
-, keyutils
-, liburcu
-, zlib
-, libaio
-, udev
-, zstd
-, lz4
-, nix-gitignore
-, rustPlatform
-, rustc
-, cargo
- }:
-
+{ lib, stdenv, pkg-config, attr, libuuid, libsodium, keyutils, liburcu, zlib
+, libaio, udev, zstd, lz4, nix-gitignore, rustPlatform, rustc, cargo, }:
let
- src = nix-gitignore.gitignoreSource [] ./. ;
+ src = nix-gitignore.gitignoreSource [ ] ./.;
commit = lib.strings.substring 0 7 (builtins.readFile ./.bcachefs_revision);
version = "git-${commit}";
-
in stdenv.mkDerivation {
inherit src version;
};
};
- makeFlags = [
- "PREFIX=${placeholder "out"}"
- "VERSION=${commit}"
- ];
+ makeFlags = [ "DESTDIR=${placeholder "out"}" "PREFIX=" "VERSION=${commit}" ];
dontStrip = true;
checkPhase = "./bcachefs version";
doCheck = true;
+
+ meta = {
+ mainProgram = "bcachefs";
+ license = lib.licenses.gpl2Only;
+ };
}
switch (opt) {
case 'b':
op.start_btree = read_string_list_or_die(optarg,
- bch2_btree_ids, "btree id");
+ __bch2_btree_ids, "btree id");
op.end_btree = op.start_btree;
break;
case 's':
#include "libbcachefs/bcachefs_ioctl.h"
#include "libbcachefs/errcode.h"
#include "libbcachefs/journal.h"
+#include "libbcachefs/sb-members.h"
#include "libbcachefs/super-io.h"
#include "cmds.h"
#include "libbcachefs.h"
struct bchfs_handle fs = bcache_fs_open(fs_path);
- dev_opts.fd = open_for_format(dev_opts.path, force);
+ int ret = open_for_format(&dev_opts, force);
+ if (ret)
+ die("Error opening %s: %s", dev_opts.path, strerror(-ret));
struct bch_opt_strs fs_opt_strs;
memset(&fs_opt_strs, 0, sizeof(fs_opt_strs));
format_opts,
&dev_opts, 1);
free(sb);
- fsync(dev_opts.fd);
- close(dev_opts.fd);
+ fsync(dev_opts.bdev->bd_buffered_fd);
+ close(dev_opts.bdev->bd_buffered_fd);
bchu_disk_add(fs, dev_opts.path);
return 0;
if (ret)
die("error opening %s: %s", dev_str, bch2_err_str(ret));
- struct bch_member *m = bch2_sb_get_members(sb.sb)->members + sb.sb->dev_idx;
+ struct bch_member *m = bch2_members_v2_get_mut(sb.sb, sb.sb->dev_idx);
SET_BCH_MEMBER_STATE(m, new_state);
char *size_arg = arg_pop();
if (!size_arg)
- size = get_size(dev, dev_fd);
+ size = get_size(dev_fd);
else if (bch2_strtoull_h(size_arg, &size))
die("invalid size");
if (idx >= sb->nr_devices)
die("error reading superblock: dev idx >= sb->nr_devices");
- struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
- if (!mi)
- die("error reading superblock: no member info");
- /* could also just read this out of sysfs... meh */
- struct bch_member *m = mi->members + idx;
- u64 nbuckets = size / le16_to_cpu(m->bucket_size);
-
- if (nbuckets < le64_to_cpu(m->nbuckets))
+ struct bch_member m = bch2_sb_member_get(sb, idx);
+ u64 nbuckets = size / le16_to_cpu(m.bucket_size);
+ if (nbuckets < le64_to_cpu(m.nbuckets))
die("Shrinking not supported yet");
printf("resizing %s to %llu buckets\n", dev, nbuckets);
if (idx >= sb->nr_devices)
die("error reading superblock: dev idx >= sb->nr_devices");
- struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
- if (!mi)
- die("error reading superblock: no member info");
-
- /* could also just read this out of sysfs... meh */
- struct bch_member *m = mi->members + idx;
- u64 nbuckets = size / le16_to_cpu(m->bucket_size);
+ struct bch_member m = bch2_sb_member_get(sb, idx);
+ u64 nbuckets = size / le16_to_cpu(m.bucket_size);
printf("resizing journal on %s to %llu buckets\n", dev, nbuckets);
bchu_disk_resize_journal(fs, idx, nbuckets);
#include <fcntl.h>
+#include <getopt.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include "libbcachefs/btree_iter.h"
#include "libbcachefs/error.h"
#include "libbcachefs/extents.h"
+#include "libbcachefs/sb-members.h"
#include "libbcachefs/super.h"
static void dump_usage(void)
"\n"
"Options:\n"
" -o output Output qcow2 image(s)\n"
- " -f Force; overwrite when needed\n"
- " -j Dump entire journal, not just dirty entries\n"
- " -h Display this help and exit\n"
+ " -f, --force Force; overwrite when needed\n"
+ " --nojournal Don't dump entire journal, just dirty entries\n"
+ " -h, --help Display this help and exit\n"
"Report bugs to <linux-bcachefs@vger.kernel.org>");
}
for (i = 0; i < BTREE_ID_NR; i++) {
const struct bch_extent_ptr *ptr;
struct bkey_ptrs_c ptrs;
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct btree *b;
- bch2_trans_init(&trans, c, 0, 0);
-
- __for_each_btree_node(&trans, iter, i, POS_MIN, 0, 1, 0, b, ret) {
+ __for_each_btree_node(trans, iter, i, POS_MIN, 0, 1, 0, b, ret) {
struct btree_node_iter iter;
struct bkey u;
struct bkey_s_c k;
btree_bytes(c));
}
- bch2_trans_iter_exit(&trans, &iter);
- bch2_trans_exit(&trans);
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
}
qcow2_write_image(ca->disk_sb.bdev->bd_buffered_fd, fd, &data,
int cmd_dump(int argc, char *argv[])
{
+ static const struct option longopts[] = {
+ { "force", no_argument, NULL, 'f' },
+ { "nojournal", no_argument, NULL, 'j' },
+ { "verbose", no_argument, NULL, 'v' },
+ { "help", no_argument, NULL, 'h' },
+ { NULL }
+ };
struct bch_opts opts = bch2_opts_empty();
struct bch_dev *ca;
char *out = NULL;
unsigned i, nr_devices = 0;
- bool force = false, entire_journal = false;
+ bool force = false, entire_journal = true;
int fd, opt;
+ opt_set(opts, read_only, true);
opt_set(opts, nochanges, true);
opt_set(opts, norecovery, true);
opt_set(opts, degraded, true);
opt_set(opts, errors, BCH_ON_ERROR_continue);
opt_set(opts, fix_errors, FSCK_FIX_no);
- while ((opt = getopt(argc, argv, "o:fjvh")) != -1)
+ while ((opt = getopt_long(argc, argv, "o:fvh",
+ longopts, NULL)) != -1)
switch (opt) {
case 'o':
out = optarg;
force = true;
break;
case 'j':
- entire_journal = true;
+ entire_journal = false;
break;
case 'v':
opt_set(opts, verbose, true);
struct format_opts opts = format_opts_default();
struct dev_opts dev_opts = dev_opts_default(), *dev;
bool force = false, no_passphrase = false, quiet = false, initialize = true, verbose = false;
+ bool unconsumed_dev_option = false;
unsigned v;
int opt;
case O_fs_size:
if (bch2_strtoull_h(optarg, &dev_opts.size))
die("invalid filesystem size");
+ unconsumed_dev_option = true;
break;
case O_superblock_size:
if (bch2_strtouint_h(optarg, &opts.superblock_size))
case O_bucket_size:
if (bch2_strtoull_h(optarg, &dev_opts.bucket_size))
die("bad bucket_size %s", optarg);
+ unconsumed_dev_option = true;
break;
case O_label:
case 'l':
dev_opts.label = optarg;
+ unconsumed_dev_option = true;
break;
case O_discard:
dev_opts.discard = true;
+ unconsumed_dev_option = true;
break;
case O_data_allowed:
dev_opts.data_allowed =
read_flag_list_or_die(optarg,
bch2_data_types, "data type");
+ unconsumed_dev_option = true;
break;
case O_durability:
if (kstrtouint(optarg, 10, &dev_opts.durability) ||
dev_opts.durability > BCH_REPLICAS_MAX)
die("invalid durability");
+ unconsumed_dev_option = true;
break;
case O_version:
if (kstrtouint(optarg, 10, &opts.version))
dev_opts.path = optarg;
darray_push(&devices, dev_opts);
dev_opts.size = 0;
+ unconsumed_dev_option = false;
break;
case O_quiet:
case 'q':
break;
}
+ if (unconsumed_dev_option)
+ die("Options for devices apply to subsequent devices; got a device option with no device");
+
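+ /*
+ * Illustrative example: per-device options precede the device they apply
+ * to, so `bcachefs format --label=ssd /dev/sda --label=hdd /dev/sdb`
+ * attaches each label to the device that follows it; a trailing device
+ * option with no device after it is now a fatal error.
+ */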
if (opts.version != bcachefs_metadata_version_current)
initialize = false;
initialize = false;
}
- darray_for_each(devices, dev)
- dev->fd = open_for_format(dev->path, force);
+ darray_for_each(devices, dev) {
+ int ret = open_for_format(dev, force);
+ if (ret)
+ die("Error opening %s: %s", dev_opts.path, strerror(-ret));
+ }
struct bch_sb *sb =
bch2_format(fs_opt_strs,
buf.human_readable_units = true;
- bch2_sb_to_text(&buf, sb, false, 1 << BCH_SB_FIELD_members);
+ bch2_sb_to_text(&buf, sb, false, 1 << BCH_SB_FIELD_members_v2);
printf("%s", buf.buf);
printbuf_exit(&buf);
{ "help", 0, NULL, 'h' },
{ NULL }
};
- unsigned fields = 1 << BCH_SB_FIELD_members;
+ unsigned fields = 0;
bool print_layout = false;
+ bool print_default_fields = true;
int opt;
while ((opt = getopt_long(argc, argv, "f:lh", longopts, NULL)) != -1)
? ~0
: read_flag_list_or_die(optarg,
bch2_sb_fields, "superblock field");
+ print_default_fields = false;
break;
case 'l':
print_layout = true;
if (ret)
die("Error opening %s: %s", dev, bch2_err_str(ret));
+ if (print_default_fields) {
+ fields |= bch2_sb_field_get(sb.sb, members_v2)
+ ? 1 << BCH_SB_FIELD_members_v2
+ : 1 << BCH_SB_FIELD_members_v1;
+ fields |= 1 << BCH_SB_FIELD_errors;
+ }
+
struct printbuf buf = PRINTBUF;
buf.human_readable_units = true;
#include "libbcachefs/error.h"
#include "libbcachefs/fs-common.h"
#include "libbcachefs/inode.h"
-#include "libbcachefs/io.h"
+#include "libbcachefs/io_read.h"
+#include "libbcachefs/io_write.h"
#include "libbcachefs/opts.h"
#include "libbcachefs/super.h"
/* XXX cut and pasted from fsck.c */
#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
-static inline u64 map_root_ino(u64 ino)
+static inline subvol_inum map_root_ino(u64 ino)
{
- return ino == 1 ? 4096 : ino;
+ return (subvol_inum) { 1, ino == 1 ? 4096 : ino };
}
static inline u64 unmap_root_ino(u64 ino)
bch2_fs_stop(c);
}
-static void bcachefs_fuse_lookup(fuse_req_t req, fuse_ino_t dir,
+static void bcachefs_fuse_lookup(fuse_req_t req, fuse_ino_t dir_ino,
const char *name)
{
+ subvol_inum dir = map_root_ino(dir_ino);
struct bch_fs *c = fuse_req_userdata(req);
struct bch_inode_unpacked bi;
struct qstr qstr = QSTR(name);
- u64 inum;
+ subvol_inum inum;
int ret;
fuse_log(FUSE_LOG_DEBUG, "fuse_lookup(dir=%llu name=%s)\n",
- dir, name);
-
- dir = map_root_ino(dir);
+ dir.inum, name);
ret = bch2_inode_find_by_inum(c, dir, &bi);
if (ret) {
struct bch_hash_info hash_info = bch2_hash_info_init(c, &bi);
- inum = bch2_dirent_lookup(c, dir, &hash_info, &qstr);
- if (!inum) {
+ ret = bch2_dirent_lookup(c, dir, &hash_info, &qstr, &inum);
+ if (ret) {
struct fuse_entry_param e = {
.attr_timeout = DBL_MAX,
.entry_timeout = DBL_MAX,
fuse_reply_err(req, -ret);
}
-static void bcachefs_fuse_getattr(fuse_req_t req, fuse_ino_t inum,
+static void bcachefs_fuse_getattr(fuse_req_t req, fuse_ino_t ino,
struct fuse_file_info *fi)
{
+ subvol_inum inum = map_root_ino(ino);
struct bch_fs *c = fuse_req_userdata(req);
struct bch_inode_unpacked bi;
struct stat attr;
- int ret;
- fuse_log(FUSE_LOG_DEBUG, "fuse_getattr(inum=%llu)\n",
- inum);
+ fuse_log(FUSE_LOG_DEBUG, "fuse_getattr(inum=%llu)\n", inum.inum);
- inum = map_root_ino(inum);
-
- ret = bch2_inode_find_by_inum(c, inum, &bi);
+ int ret = bch2_inode_find_by_inum(c, inum, &bi);
if (ret) {
fuse_log(FUSE_LOG_DEBUG, "fuse_getattr error %i\n", ret);
fuse_reply_err(req, -ret);
fuse_reply_attr(req, &attr, DBL_MAX);
}
-static void bcachefs_fuse_setattr(fuse_req_t req, fuse_ino_t inum,
+static void bcachefs_fuse_setattr(fuse_req_t req, fuse_ino_t ino,
struct stat *attr, int to_set,
struct fuse_file_info *fi)
{
struct bch_fs *c = fuse_req_userdata(req);
struct bch_inode_unpacked inode_u;
- struct btree_trans trans;
+ struct btree_trans *trans;
struct btree_iter iter;
u64 now;
int ret;
- fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_setattr(%llu, %x)\n",
- inum, to_set);
+ subvol_inum inum = map_root_ino(ino);
- inum = map_root_ino(inum);
+ fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_setattr(%llu, %x)\n", inum.inum, to_set);
- bch2_trans_init(&trans, c, 0, 0);
+ trans = bch2_trans_get(c);
retry:
- bch2_trans_begin(&trans);
+ bch2_trans_begin(trans);
now = bch2_current_time(c);
- ret = bch2_inode_peek(&trans, &iter, &inode_u, inum, BTREE_ITER_INTENT);
+ ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT);
if (ret)
goto err;
inode_u.bi_mtime = now;
/* TODO: CTIME? */
- ret = bch2_inode_write(&trans, &iter, &inode_u) ?:
- bch2_trans_commit(&trans, NULL, NULL,
+ ret = bch2_inode_write(trans, &iter, &inode_u) ?:
+ bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL);
err:
- bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_iter_exit(trans, &iter);
if (ret == -EINTR)
goto retry;
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
if (!ret) {
*attr = inode_to_stat(c, &inode_u);
}
}
-static int do_create(struct bch_fs *c, u64 dir,
+static int do_create(struct bch_fs *c, subvol_inum dir,
const char *name, mode_t mode, dev_t rdev,
struct bch_inode_unpacked *new_inode)
{
struct qstr qstr = QSTR(name);
struct bch_inode_unpacked dir_u;
-
- dir = map_root_ino(dir);
+ uid_t uid = 0;
+ gid_t gid = 0;
bch2_inode_init_early(c, new_inode);
return bch2_trans_do(c, NULL, NULL, 0,
- bch2_create_trans(&trans,
+ bch2_create_trans(trans,
dir, &dir_u,
new_inode, &qstr,
- 0, 0, mode, rdev, NULL, NULL));
+ uid, gid, mode, rdev, NULL, NULL,
+ (subvol_inum) { 0 }, 0));
}
-static void bcachefs_fuse_mknod(fuse_req_t req, fuse_ino_t dir,
+static void bcachefs_fuse_mknod(fuse_req_t req, fuse_ino_t dir_ino,
const char *name, mode_t mode,
dev_t rdev)
{
+ subvol_inum dir = map_root_ino(dir_ino);
struct bch_fs *c = fuse_req_userdata(req);
struct bch_inode_unpacked new_inode;
int ret;
fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_mknod(%llu, %s, %x, %x)\n",
- dir, name, mode, rdev);
+ dir.inum, name, mode, rdev);
+
ret = do_create(c, dir, name, mode, rdev, &new_inode);
if (ret)
goto err;
bcachefs_fuse_mknod(req, dir, name, mode, 0);
}
-static void bcachefs_fuse_unlink(fuse_req_t req, fuse_ino_t dir,
+static void bcachefs_fuse_unlink(fuse_req_t req, fuse_ino_t dir_ino,
const char *name)
{
struct bch_fs *c = fuse_req_userdata(req);
struct bch_inode_unpacked dir_u, inode_u;
struct qstr qstr = QSTR(name);
- int ret;
+ subvol_inum dir = map_root_ino(dir_ino);
- fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_unlink(%llu, %s)\n", dir, name);
+ fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_unlink(%llu, %s)\n", dir.inum, name);
- dir = map_root_ino(dir);
-
- ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
- bch2_unlink_trans(&trans, dir, &dir_u,
- &inode_u, &qstr));
+ int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_unlink_trans(trans, dir, &dir_u,
+ &inode_u, &qstr, false));
fuse_reply_err(req, -ret);
}
{
fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_rmdir(%llu, %s)\n", dir, name);
- dir = map_root_ino(dir);
-
bcachefs_fuse_unlink(req, dir, name);
}
static void bcachefs_fuse_rename(fuse_req_t req,
- fuse_ino_t src_dir, const char *srcname,
- fuse_ino_t dst_dir, const char *dstname,
+ fuse_ino_t src_dir_ino, const char *srcname,
+ fuse_ino_t dst_dir_ino, const char *dstname,
unsigned flags)
{
struct bch_fs *c = fuse_req_userdata(req);
struct bch_inode_unpacked src_inode_u, dst_inode_u;
struct qstr dst_name = QSTR(dstname);
struct qstr src_name = QSTR(srcname);
+ subvol_inum src_dir = map_root_ino(src_dir_ino);
+ subvol_inum dst_dir = map_root_ino(dst_dir_ino);
int ret;
fuse_log(FUSE_LOG_DEBUG,
"bcachefs_fuse_rename(%llu, %s, %llu, %s, %x)\n",
- src_dir, srcname, dst_dir, dstname, flags);
-
- src_dir = map_root_ino(src_dir);
- dst_dir = map_root_ino(dst_dir);
+ src_dir.inum, srcname, dst_dir.inum, dstname, flags);
/* XXX handle overwrites */
ret = bch2_trans_do(c, NULL, NULL, 0,
- bch2_rename_trans(&trans,
+ bch2_rename_trans(trans,
src_dir, &src_dir_u,
dst_dir, &dst_dir_u,
&src_inode_u, &dst_inode_u,
fuse_reply_err(req, -ret);
}
-static void bcachefs_fuse_link(fuse_req_t req, fuse_ino_t inum,
- fuse_ino_t newparent, const char *newname)
+static void bcachefs_fuse_link(fuse_req_t req, fuse_ino_t ino,
+ fuse_ino_t newparent_ino, const char *newname)
{
struct bch_fs *c = fuse_req_userdata(req);
struct bch_inode_unpacked dir_u, inode_u;
struct qstr qstr = QSTR(newname);
+ subvol_inum newparent = map_root_ino(newparent_ino);
+ subvol_inum inum = map_root_ino(ino);
int ret;
fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_link(%llu, %llu, %s)\n",
- inum, newparent, newname);
-
- newparent = map_root_ino(newparent);
+ inum.inum, newparent.inum, newname);
ret = bch2_trans_do(c, NULL, NULL, 0,
- bch2_link_trans(&trans, newparent,
- inum, &dir_u, &inode_u, &qstr));
+ bch2_link_trans(trans, newparent, &dir_u,
+ inum, &inode_u, &qstr));
if (!ret) {
struct fuse_entry_param e = inode_to_entry(c, &inode_u);
static void userbio_init(struct bio *bio, struct bio_vec *bv,
void *buf, size_t size)
{
- bio_init(bio, bv, 1);
+ bio_init(bio, NULL, bv, 1, 0);
bio->bi_iter.bi_size = size;
bv->bv_page = buf;
bv->bv_len = size;
bv->bv_offset = 0;
}
-static int get_inode_io_opts(struct bch_fs *c, u64 inum,
- struct bch_io_opts *opts)
+static int get_inode_io_opts(struct bch_fs *c, subvol_inum inum, struct bch_io_opts *opts)
{
struct bch_inode_unpacked inode;
if (bch2_inode_find_by_inum(c, inum, &inode))
return -EINVAL;
- *opts = bch2_opts_to_inode_opts(c->opts);
- bch2_io_opts_apply(opts, bch2_inode_opts_get(&inode));
+ bch2_inode_opts_get(opts, c, &inode);
return 0;
}
/*
* Read aligned data.
*/
-static int read_aligned(struct bch_fs *c, fuse_ino_t inum, size_t aligned_size,
+static int read_aligned(struct bch_fs *c, subvol_inum inum, size_t aligned_size,
off_t aligned_offset, void *buf)
{
BUG_ON(aligned_size & (block_bytes(c) - 1));
return -blk_status_to_errno(rbio.bio.bi_status);
}
-static void bcachefs_fuse_read(fuse_req_t req, fuse_ino_t inum,
+static void bcachefs_fuse_read(fuse_req_t req, fuse_ino_t ino,
size_t size, off_t offset,
struct fuse_file_info *fi)
{
+ subvol_inum inum = map_root_ino(ino);
struct bch_fs *c = fuse_req_userdata(req);
fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_read(%llu, %zd, %lld)\n",
free(buf);
}
-static int inode_update_times(struct bch_fs *c, fuse_ino_t inum)
+static int inode_update_times(struct bch_fs *c, subvol_inum inum)
{
- struct btree_trans trans;
+ struct btree_trans *trans;
struct btree_iter iter;
struct bch_inode_unpacked inode_u;
int ret = 0;
u64 now;
- bch2_trans_init(&trans, c, 0, 0);
+ trans = bch2_trans_get(c);
retry:
- bch2_trans_begin(&trans);
+ bch2_trans_begin(trans);
now = bch2_current_time(c);
- ret = bch2_inode_peek(&trans, &iter, &inode_u, inum, BTREE_ITER_INTENT);
+ ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT);
if (ret)
goto err;
inode_u.bi_mtime = now;
inode_u.bi_ctime = now;
- ret = bch2_inode_write(&trans, &iter, &inode_u);
+ ret = bch2_inode_write(trans, &iter, &inode_u);
if (ret)
goto err;
- ret = bch2_trans_commit(&trans, NULL, NULL,
+ ret = bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL);
err:
- bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_iter_exit(trans, &iter);
if (ret == -EINTR)
goto retry;
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
return ret;
}
-static int write_aligned(struct bch_fs *c, fuse_ino_t inum,
+static int write_aligned(struct bch_fs *c, subvol_inum inum,
struct bch_io_opts io_opts, void *buf,
size_t aligned_size, off_t aligned_offset,
off_t new_i_size, size_t *written_out)
op.write_point = writepoint_hashed(0);
op.nr_replicas = io_opts.data_replicas;
op.target = io_opts.foreground_target;
- op.pos = POS(inum, aligned_offset >> 9);
+ op.subvol = inum.subvol;
+ op.pos = POS(inum.inum, aligned_offset >> 9);
op.new_i_size = new_i_size;
userbio_init(&op.wbio.bio, &bv, buf, aligned_size);
return op.error;
}
-static void bcachefs_fuse_write(fuse_req_t req, fuse_ino_t inum,
+static void bcachefs_fuse_write(fuse_req_t req, fuse_ino_t ino,
const char *buf, size_t size,
off_t offset,
struct fuse_file_info *fi)
{
+ subvol_inum inum = map_root_ino(ino);
struct bch_fs *c = fuse_req_userdata(req);
struct bch_io_opts io_opts;
size_t aligned_written;
}
static void bcachefs_fuse_symlink(fuse_req_t req, const char *link,
- fuse_ino_t dir, const char *name)
+ fuse_ino_t dir_ino, const char *name)
{
+ subvol_inum dir = map_root_ino(dir_ino);
struct bch_fs *c = fuse_req_userdata(req);
struct bch_inode_unpacked new_inode;
size_t link_len = strlen(link);
int ret;
fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_symlink(%s, %llu, %s)\n",
- link, dir, name);
-
- dir = map_root_ino(dir);
+ link, dir.inum, name);
ret = do_create(c, dir, name, S_IFLNK|S_IRWXUGO, 0, &new_inode);
if (ret)
goto err;
struct bch_io_opts io_opts;
- ret = get_inode_io_opts(c, new_inode.bi_inum, &io_opts);
+ ret = get_inode_io_opts(c, dir, &io_opts);
if (ret)
goto err;
memset(aligned_buf, 0, align.size);
memcpy(aligned_buf, link, link_len); /* already terminated */
+ subvol_inum inum = (subvol_inum) { dir.subvol, new_inode.bi_inum };
+
size_t aligned_written;
- ret = write_aligned(c, new_inode.bi_inum, io_opts, aligned_buf,
+ ret = write_aligned(c, inum, io_opts, aligned_buf,
align.size, align.start, link_len + 1,
&aligned_written);
free(aligned_buf);
size_t written = align_fix_up_bytes(&align, aligned_written);
BUG_ON(written != link_len + 1); // TODO: handle short
- ret = inode_update_times(c, new_inode.bi_inum);
+ ret = inode_update_times(c, inum);
if (ret)
goto err;
fuse_reply_err(req, -ret);
}
-static void bcachefs_fuse_readlink(fuse_req_t req, fuse_ino_t inum)
+static void bcachefs_fuse_readlink(fuse_req_t req, fuse_ino_t ino)
{
+ subvol_inum inum = map_root_ino(ino);
struct bch_fs *c = fuse_req_userdata(req);
char *buf = NULL;
- fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_readlink(%llu)\n", inum);
+ fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_readlink(%llu)\n", inum.inum);
struct bch_inode_unpacked bi;
int ret = bch2_inode_find_by_inum(c, inum, &bi);
return true;
}
-static void bcachefs_fuse_readdir(fuse_req_t req, fuse_ino_t dir,
+static void bcachefs_fuse_readdir(fuse_req_t req, fuse_ino_t dir_ino,
size_t size, off_t off,
struct fuse_file_info *fi)
{
+ subvol_inum dir = map_root_ino(dir_ino);
struct bch_fs *c = fuse_req_userdata(req);
struct bch_inode_unpacked bi;
char *buf = calloc(size, 1);
int ret = 0;
fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_readdir(dir=%llu, size=%zu, "
- "off=%lld)\n", dir, size, off);
-
- dir = map_root_ino(dir);
+ "off=%lld)\n", dir.inum, size, off);
ret = bch2_inode_find_by_inum(c, dir, &bi);
if (ret)
goto reply;
}
- if (!handle_dots(&ctx, dir))
+ if (!handle_dots(&ctx, dir.inum))
goto reply;
ret = bch2_readdir(c, dir, &ctx.ctx);
}
#endif
-static void bcachefs_fuse_create(fuse_req_t req, fuse_ino_t dir,
+static void bcachefs_fuse_create(fuse_req_t req, fuse_ino_t dir_ino,
const char *name, mode_t mode,
struct fuse_file_info *fi)
{
+ subvol_inum dir = map_root_ino(dir_ino);
struct bch_fs *c = fuse_req_userdata(req);
struct bch_inode_unpacked new_inode;
int ret;
fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_create(%llu, %s, %x)\n",
- dir, name, mode);
+ dir.inum, name, mode);
ret = do_create(c, dir, name, mode, 0, &new_inode);
if (ret)
return;
err:
fuse_reply_err(req, -ret);
-
}
#if 0
}
tokenize_devices(&ctx);
+ struct printbuf fsname = PRINTBUF;
+ prt_printf(&fsname, "fsname=");
+ for (i = 0; i < ctx.nr_devices; ++i) {
+ if (i)
+ prt_str(&fsname, ":");
+ prt_str(&fsname, ctx.devices[i]);
+ }
+
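+ /* fsname= sets the source device string shown for the mount, e.g. "/dev/sda:/dev/sdb" */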
+ fuse_opt_add_arg(&args, "-o");
+ fuse_opt_add_arg(&args, fsname.buf);
+
/* Open bch */
printf("Opening bcachefs filesystem on:\n");
for (i = 0; i < ctx.nr_devices; ++i)
if (IS_ERR(c))
die("Error opening %s: %s", argv[1], bch2_err_str(PTR_ERR(c)));
- struct bch_sb_field_crypt *crypt = bch2_sb_get_crypt(c->disk_sb.sb);
+ struct bch_sb_field_crypt *crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
if (!crypt)
die("Filesystem does not have encryption enabled");
die("error encrypting key");
crypt->key = new_key;
+ bch2_revoke_key(c->disk_sb.sb);
bch2_write_super(c);
bch2_fs_stop(c);
return 0;
if (IS_ERR(c))
die("Error opening %s: %s", argv[1], bch2_err_str(PTR_ERR(c)));
- struct bch_sb_field_crypt *crypt = bch2_sb_get_crypt(c->disk_sb.sb);
+ struct bch_sb_field_crypt *crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
if (!crypt)
die("Filesystem does not have encryption enabled");
#include "libbcachefs/btree_iter.h"
#include "libbcachefs/errcode.h"
#include "libbcachefs/error.h"
+#include "libbcachefs/sb-members.h"
#include "libbcachefs/super.h"
static void kill_btree_node_usage(void)
switch (opt) {
case 'b':
btree_id = read_string_list_or_die(optarg,
- bch2_btree_ids, "btree id");
+ __bch2_btree_ids, "btree id");
break;
case 'l':
if (kstrtouint(optarg, 10, &level) || level >= BTREE_MAX_DEPTH)
if (IS_ERR(c))
die("error opening %s: %s", argv[0], bch2_err_str(PTR_ERR(c)));
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct btree *b;
int ret;
if (ret)
die("error %s from posix_memalign", bch2_err_str(ret));
- bch2_trans_init(&trans, c, 0, 0);
-
- __for_each_btree_node(&trans, iter, btree_id, POS_MIN, 0, level, 0, b, ret) {
+ __for_each_btree_node(trans, iter, btree_id, POS_MIN, 0, level, 0, b, ret) {
if (b->c.level != level)
continue;
bch_err(c, "node at specified index not found");
ret = EXIT_FAILURE;
done:
- bch2_trans_iter_exit(&trans, &iter);
- bch2_trans_exit(&trans);
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
bch2_fs_stop(c);
return ret;
darray_push(&transaction_filter, bbpos_parse(optarg));
break;
case 'k':
- darray_push(&key_filter, read_string_list_or_die(optarg, bch2_btree_ids, "btree id"));
+ darray_push(&key_filter, read_string_list_or_die(optarg, __bch2_btree_ids, "btree id"));
break;
case 'v':
opt_set(opts, verbose, true);
#include "libbcachefs/errcode.h"
#include "libbcachefs/fs-common.h"
#include "libbcachefs/inode.h"
-#include "libbcachefs/io.h"
+#include "libbcachefs/io_write.h"
#include "libbcachefs/replicas.h"
#include "libbcachefs/str_hash.h"
#include "libbcachefs/super.h"
bch2_inode_pack(&packed, inode);
packed.inode.k.p.snapshot = U32_MAX;
ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i,
- NULL, NULL, 0);
+ NULL, 0);
if (ret)
die("error updating inode: %s", bch2_err_str(ret));
}
struct bch_inode_unpacked inode;
int ret = bch2_trans_do(c, NULL, NULL, 0,
- bch2_link_trans(&trans,
+ bch2_link_trans(trans,
(subvol_inum) { 1, parent->bi_inum }, &parent_u,
(subvol_inum) { 1, inum }, &inode, &qstr));
if (ret)
bch2_inode_init_early(c, &new_inode);
int ret = bch2_trans_do(c, NULL, NULL, 0,
- bch2_create_trans(&trans,
+ bch2_create_trans(trans,
(subvol_inum) { 1, parent->bi_inum }, parent,
&new_inode, &qstr,
uid, gid, mode, rdev, NULL, NULL,
struct bch_inode_unpacked inode_u;
int ret = bch2_trans_do(c, NULL, NULL, 0,
- bch2_xattr_set(&trans,
+ bch2_xattr_set(trans,
(subvol_inum) { 1, dst->bi_inum },
&inode_u, &hash_info, attr,
val, val_size, h->flags, 0));
die("error reserving space in new filesystem: %s",
bch2_err_str(ret));
- ret = bch2_btree_insert(c, BTREE_ID_extents, &e->k_i,
- &res, NULL, 0);
+ ret = bch2_btree_insert(c, BTREE_ID_extents, &e->k_i, &res, 0);
if (ret)
die("btree insert error %s", bch2_err_str(ret));
struct dev_opts dev = dev_opts_default();
dev.path = dev_t_to_path(stat.st_dev);
- dev.fd = xopen(dev.path, O_RDWR);
+ dev.bdev = blkdev_get_by_path(dev.path, BLK_OPEN_READ|BLK_OPEN_WRITE, &dev, NULL);
- opt_set(fs_opts, block_size, get_blocksize(dev.path, dev.fd));
+ opt_set(fs_opts, block_size, get_blocksize(dev.bdev->bd_buffered_fd));
char *file_path = mprintf("%s/bcachefs", fs_path);
printf("Creating new filesystem on %s in space reserved at %s\n",
dev.path, file_path);
- bch2_pick_bucket_size(fs_opts, &dev);
+ dev.size = get_size(dev.bdev->bd_buffered_fd);
+ dev.bucket_size = bch2_pick_bucket_size(fs_opts, &dev);
+ dev.nbuckets = dev.size / dev.bucket_size;
+
+ bch2_check_bucket_size(fs_opts, &dev);
u64 bcachefs_inum;
ranges extents = reserve_new_fs_space(file_path,
fs_opts.block_size >> 9,
- get_size(dev.path, dev.fd) / 5,
+ get_size(dev.bdev->bd_buffered_fd) / 5,
&bcachefs_inum, stat.st_dev, force);
find_superblock_space(extents, format_opts, &dev);
int cmd_version(int argc, char *argv[])
{
- printf("bcachefs tool version %s\n", VERSION_STRING);
+ printf("%s\n", VERSION_STRING);
return 0;
}
int cmd_show_super(int argc, char *argv[]);
int cmd_set_option(int argc, char *argv[]);
-#if 0
-int cmd_assemble(int argc, char *argv[]);
-int cmd_incremental(int argc, char *argv[]);
-int cmd_run(int argc, char *argv[]);
-int cmd_stop(int argc, char *argv[]);
-#endif
-
int cmd_fs_usage(int argc, char *argv[]);
int device_usage(void);
int cmd_subvolume_snapshot(int argc, char *argv[]);
int cmd_fusemount(int argc, char *argv[]);
-void cmd_mount(int agc, char *argv[]);
+int cmd_mount(int argc, char *argv[]);
+int cmd_completions(int argc, char *argv[]);
#endif /* _CMDS_H */
{
struct bch_sb_field_crypt *crypt;
- return (crypt = bch2_sb_get_crypt(sb)) &&
+ return (crypt = bch2_sb_field_get(sb, crypt)) &&
bch2_key_is_encrypted(&crypt->key);
}
struct bch_key *passphrase_key,
struct bch_encrypted_key *sb_key)
{
- struct bch_sb_field_crypt *crypt = bch2_sb_get_crypt(sb);
+ struct bch_sb_field_crypt *crypt = bch2_sb_field_get(sb, crypt);
if (!crypt)
die("filesystem is not encrypted");
+bcachefs-tools (24+really1.3.4-1) unstable; urgency=medium
+
+ * New upstream release
+
+ -- Jonathan Carter <jcc@debian.org> Tue, 21 Nov 2023 17:26:13 +0200
+
bcachefs-tools (24+really1.2-1) unstable; urgency=medium
* New upstream release (Closes: #1054613)
-(import
- (
- let lock = builtins.fromJSON (builtins.readFile ./flake.lock); in
- fetchTarball {
- url = "https://github.com/edolstra/flake-compat/archive/${lock.nodes.flake-compat.locked.rev}.tar.gz";
- sha256 = lock.nodes.flake-compat.locked.narHash;
- }
- )
- { src = ./.; }
-).defaultNix
+(import (let lock = builtins.fromJSON (builtins.readFile ./flake.lock);
+in fetchTarball {
+ url =
+ "https://github.com/edolstra/flake-compat/archive/${lock.nodes.flake-compat.locked.rev}.tar.gz";
+ sha256 = lock.nodes.flake-compat.locked.narHash;
+}) { src = ./.; }).defaultNix
"flake-compat": {
"flake": false,
"locked": {
- "lastModified": 1673956053,
- "narHash": "sha256-4gtG9iQuiKITOjNQQeQIpoIB6b16fm+504Ch3sNKLd8=",
+ "lastModified": 1696426674,
+ "narHash": "sha256-kvjfFW7WAETZlt09AgDn1MrtKzP7t90Vf7vypd3OL1U=",
"owner": "edolstra",
"repo": "flake-compat",
- "rev": "35bb57c0c8d8b62bbfd284272c928ceb64ddbde9",
+ "rev": "0f9255e01c2351cc7d116c072cb317785dd33b33",
"type": "github"
},
"original": {
},
"nixpkgs": {
"locked": {
- "lastModified": 1686592866,
- "narHash": "sha256-riGg89eWhXJcPNrQGcSwTEEm7CGxWC06oSX44hajeMw=",
+ "lastModified": 1698924604,
+ "narHash": "sha256-GCFbkl2tj8fEZBZCw3Tc0AkGo0v+YrQlohhEGJ/X4s0=",
"owner": "nixos",
"repo": "nixpkgs",
- "rev": "0eeebd64de89e4163f4d3cf34ffe925a5cf67a05",
+ "rev": "fa804edfb7869c9fb230e174182a8a1a7e512c40",
"type": "github"
},
"original": {
"systems": "systems"
},
"locked": {
- "lastModified": 1685518550,
- "narHash": "sha256-o2d0KcvaXzTrPRIo0kOLV0/QXHhDQ5DTi+OxcjO8xqY=",
+ "lastModified": 1694529238,
+ "narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=",
"owner": "numtide",
"repo": "flake-utils",
- "rev": "a1720a10a6cfe8234c0e93907ffe81be440f4cef",
+ "rev": "ff7b65b44d01cf9ba6a71320833626af21126384",
"type": "github"
},
"original": {
};
outputs = { self, nixpkgs, utils, ... }:
- utils.lib.eachDefaultSystem (system:
+ {
+ overlays.default = final: prev: {
+ bcachefs = final.callPackage ./build.nix { };
+ };
+ } // utils.lib.eachDefaultSystem (system:
let
- pkgs = nixpkgs.legacyPackages.${system};
- bcachefs = pkgs.callPackage ./build.nix {};
+ pkgs = import nixpkgs {
+ inherit system;
+ overlays = [ self.overlays.default ];
+ };
in {
packages = {
- default = bcachefs;
+ inherit (pkgs) bcachefs;
+ default = pkgs.bcachefs;
};
+
+ formatter = pkgs.nixfmt;
+
+ devShells.default = pkgs.callPackage ({ mkShell, rustc, cargo, gnumake
+ , gcc, clang, pkg-config, libuuid, libsodium, keyutils, liburcu, zlib
+ , libaio, zstd, lz4, udev, bcachefs }:
+ mkShell {
+ LIBCLANG_PATH = "${clang.cc.lib}/lib";
+ inherit (bcachefs) nativeBuildInputs buildInputs;
+ }) { };
});
}
+++ /dev/null
-#!/bin/sh
-
-SDIR="$(readlink -f "$0")"
-exec "${SDIR%/*}/bcachefs" fsck "$@"
#define smp_rmb() cmm_smp_rmb()
#define smp_mb() cmm_smp_mb()
#define smp_read_barrier_depends() cmm_smp_read_barrier_depends()
+#define smp_acquire__after_ctrl_dep() cmm_smp_mb()
#else /* C11_ATOMICS */
return __ATOMIC_DEC_RETURN(&v->counter); \
} \
\
+static inline i_type a_type##_dec_return_release(a_type##_t *v) \
+{ \
+ return __ATOMIC_SUB_RETURN_RELEASE(1, &v->counter); \
+} \
+ \
static inline void a_type##_inc(a_type##_t *v) \
{ \
__ATOMIC_INC(&v->counter); \
return atomic64_cmpxchg(v, old, new);
}
+static inline s64 atomic64_sub_return_release(s64 i, atomic64_t *v)
+{
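+ /* full barrier before the RMW op: stronger than, and thus provides, release ordering */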
+ smp_mb__before_atomic();
+ return atomic64_sub_return(i, v);
+}
+
#endif
#endif /* __TOOLS_LINUX_ATOMIC_H */
#include <linux/futex.h>
#include <urcu/futex.h>
+/*
+ * The futex wait op wants an explicit 32-bit address and value. If the bitmap
+ * used for the spinlock is 64-bit, cast down and pass the right 32-bit region
+ * for the in-kernel checks. The value is the copy that has already been read
+ * from the atomic op.
+ *
+ * The futex wake op interprets the value as the number of waiters to wake (up
+ * to INT_MAX), so pass that along directly.
+ */
+static inline void do_futex(int nr, unsigned long *addr, unsigned long v, int futex_flags)
+{
+ u32 *addr32 = (u32 *) addr;
+ u32 *v32 = (u32 *) &v;
+ int shift = 0;
+
+ futex_flags |= FUTEX_PRIVATE_FLAG;
+
+#if BITS_PER_LONG == 64
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ shift = (nr >= 32) ? 1 : 0;
+#else
+ shift = (nr < 32) ? 1 : 0;
+#endif
+#endif
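+ /* e.g. on 64-bit little endian, nr = 40 selects the high u32: shift = 1 */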
+ if (shift) {
+ addr32 += shift;
+ v32 += shift;
+ }
+ /*
+ * The shift to determine the futex address may have cast away a
+ * literal wake count value. The value is capped to INT_MAX and thus
+ * always in the low bytes of v regardless of bit nr. Copy in the wake
+ * count to whatever 32-bit range was selected.
+ */
+ if (futex_flags == FUTEX_WAKE_PRIVATE)
+ *v32 = (u32) v;
+ futex(addr32, futex_flags, *v32, NULL, NULL, 0);
+}
+
static inline void bit_spin_lock(int nr, unsigned long *_addr)
{
- u32 mask, *addr = ((u32 *) _addr) + (nr / 32), v;
+ unsigned long mask;
+ unsigned long *addr = _addr + (nr / BITS_PER_LONG);
+ unsigned long v;
- nr &= 31;
- mask = 1U << nr;
+ nr &= BITS_PER_LONG - 1;
+ mask = 1UL << nr;
while (1) {
v = __atomic_fetch_or(addr, mask, __ATOMIC_ACQUIRE);
if (!(v & mask))
break;
- futex(addr, FUTEX_WAIT|FUTEX_PRIVATE_FLAG, v, NULL, NULL, 0);
+ do_futex(nr, addr, v, FUTEX_WAIT);
}
}
static inline void bit_spin_wake(int nr, unsigned long *_addr)
{
- u32 *addr = ((u32 *) _addr) + (nr / 32);
-
- futex(addr, FUTEX_WAKE|FUTEX_PRIVATE_FLAG, INT_MAX, NULL, NULL, 0);
+ do_futex(nr, _addr, INT_MAX, FUTEX_WAKE);
}
static inline void bit_spin_unlock(int nr, unsigned long *_addr)
{
- u32 mask, *addr = ((u32 *) _addr) + (nr / 32);
+ unsigned long mask;
+ unsigned long *addr = _addr + (nr / BITS_PER_LONG);
- nr &= 31;
- mask = 1U << nr;
+ nr &= BITS_PER_LONG - 1;
+ mask = 1UL << nr;
__atomic_and_fetch(addr, ~mask, __ATOMIC_RELEASE);
- futex(addr, FUTEX_WAKE|FUTEX_PRIVATE_FLAG, INT_MAX, NULL, NULL, 0);
+ do_futex(nr, addr, INT_MAX, FUTEX_WAKE);
}
#endif /* __LINUX_BIT_SPINLOCK_H */
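A minimal usage sketch of the bit-spinlock API above (illustrative only; the
struct and bit number are arbitrary examples, not part of the patch):

struct cached_node {
	unsigned long flags;	/* bit 0 is used as the lock bit */
	int value;
};

static void node_update(struct cached_node *n, int v)
{
	bit_spin_lock(0, &n->flags);	/* sleeps on the futex when contended */
	n->value = v;
	bit_spin_unlock(0, &n->flags);	/* clears the bit and wakes waiters */
}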
#include <linux/kobject.h>
#include <linux/types.h>
+#define MAX_LFS_FILESIZE ((loff_t)LLONG_MAX)
+
#define BIO_MAX_VECS 256U
typedef unsigned fmode_t;
#define MINOR(dev) ((unsigned int) ((dev) & MINORMASK))
#define MKDEV(ma,mi) (((ma) << MINORBITS) | (mi))
-/* file is open for reading */
-#define FMODE_READ ((__force fmode_t)0x1)
-/* file is open for writing */
-#define FMODE_WRITE ((__force fmode_t)0x2)
-/* file is seekable */
-#define FMODE_LSEEK ((__force fmode_t)0x4)
-/* file can be accessed using pread */
-#define FMODE_PREAD ((__force fmode_t)0x8)
-/* file can be accessed using pwrite */
-#define FMODE_PWRITE ((__force fmode_t)0x10)
-/* File is opened for execution with sys_execve / sys_uselib */
-#define FMODE_EXEC ((__force fmode_t)0x20)
-/* File is opened with O_NDELAY (only set for block devices) */
-#define FMODE_NDELAY ((__force fmode_t)0x40)
-/* File is opened with O_EXCL (only set for block devices) */
-#define FMODE_EXCL ((__force fmode_t)0x80)
-/* File is opened using open(.., 3, ..) and is writeable only for ioctls
- (specialy hack for floppy.c) */
-#define FMODE_WRITE_IOCTL ((__force fmode_t)0x100)
-/* 32bit hashes as llseek() offset (for directories) */
-#define FMODE_32BITHASH ((__force fmode_t)0x200)
-/* 64bit hashes as llseek() offset (for directories) */
-#define FMODE_64BITHASH ((__force fmode_t)0x400)
-#define FMODE_BUFFERED ((__force fmode_t)0x800)
+typedef unsigned int __bitwise blk_mode_t;
+
+/* open for reading */
+#define BLK_OPEN_READ ((__force blk_mode_t)(1 << 0))
+/* open for writing */
+#define BLK_OPEN_WRITE ((__force blk_mode_t)(1 << 1))
+/* open exclusively (vs other exclusive openers) */
+#define BLK_OPEN_EXCL ((__force blk_mode_t)(1 << 2))
+/* opened with O_NDELAY */
+#define BLK_OPEN_NDELAY ((__force blk_mode_t)(1 << 3))
+/* open for "writes" only for ioctls (special hack for floppy.c) */
+#define BLK_OPEN_WRITE_IOCTL ((__force blk_mode_t)(1 << 4))
+
+#define BLK_OPEN_BUFFERED ((__force blk_mode_t)(1 << 5))
struct inode {
unsigned long i_ino;
unsigned bdev_logical_block_size(struct block_device *bdev);
sector_t get_capacity(struct gendisk *disk);
-void blkdev_put(struct block_device *bdev, fmode_t mode);
+struct blk_holder_ops {
+ void (*mark_dead)(struct block_device *bdev);
+};
+
+void blkdev_put(struct block_device *bdev, void *holder);
void bdput(struct block_device *bdev);
-struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, void *holder);
+struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode,
+ void *holder, const struct blk_holder_ops *hop);
int lookup_bdev(const char *path, dev_t *);
struct super_block {
struct closure *parent;
atomic_t remaining;
+ bool closure_get_happened;
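+ /* set by closure_get(); closure_sync() only waits if a ref was actually taken */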
#ifdef CONFIG_DEBUG_CLOSURES
#define CLOSURE_MAGIC_DEAD 0xc054dead
*/
static inline void closure_sync(struct closure *cl)
{
- if (closure_nr_remaining(cl) != 1)
+#ifdef CONFIG_DEBUG_CLOSURES
+ BUG_ON(closure_nr_remaining(cl) != 1 && !cl->closure_get_happened);
+#endif
+
+ if (cl->closure_get_happened)
__closure_sync(cl);
}
closure_set_ip(cl);
cl->fn = fn;
cl->wq = wq;
- /* between atomic_dec() in closure_put() */
- smp_mb__before_atomic();
}
static inline void closure_queue(struct closure *cl)
*/
static inline void closure_get(struct closure *cl)
{
+ cl->closure_get_happened = true;
+
#ifdef CONFIG_DEBUG_CLOSURES
BUG_ON((atomic_inc_return(&cl->remaining) &
CLOSURE_REMAINING_MASK) <= 1);
closure_get(parent);
atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
+ cl->closure_get_happened = false;
closure_debug_create(cl);
closure_set_ip(cl);
#define unreachable() __builtin_unreachable()
#define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
#define fallthrough __attribute__((__fallthrough__))
+#define __noreturn __attribute__((__noreturn__))
+
+#ifndef __counted_by
+#define __counted_by(nr)
+#endif
#define ___PASTE(a,b) a##b
#define __PASTE(a,b) ___PASTE(a,b)
size_t, size_t);
/**
- * genradix_iter_peek - get first entry at or below iterator's current
- * position
+ * genradix_iter_peek_prev - get first entry at or below iterator's current
+ * position
* @_iter: a genradix_iter
* @_radix: genradix being iterated over
*
#define unsafe_memcpy(dst, src, bytes, justification) \
memcpy(dst, src, bytes)
+#define DECLARE_FLEX_ARRAY(TYPE, NAME) \
+ __DECLARE_FLEX_ARRAY(TYPE, NAME)
+
#endif
pos; \
pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))
+static inline size_t list_count_nodes(struct list_head *head)
+{
+ struct list_head *pos;
+ size_t count = 0;
+
+ list_for_each(pos, head)
+ count++;
+
+ return count;
+}
+
#endif /* _LIST_LIST_H */
#define kmap_atomic(page) page_address(page)
#define kunmap_atomic(addr) do {} while (0)
+#define kmap_local_page(page) page_address(page)
+#define kunmap_local(addr) do {} while (0)
+
#define PageHighMem(page) false
static const char zero_page[PAGE_SIZE];
#define rcu_access_pointer(p) READ_ONCE(p)
#define kfree_rcu(ptr, rcu_head) kfree(ptr) /* XXX */
-#define kvfree_rcu(ptr) kfree(ptr) /* XXX */
+#define kvfree_rcu_mightsleep(ptr) kfree(ptr) /* XXX */
#define RCU_INIT_POINTER(p, v) WRITE_ONCE(p, v)
#define set_current_state(state_value) \
smp_store_mb(current->state, (state_value))
-#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
+static inline struct task_struct *get_task_struct(struct task_struct *task)
+{
+ atomic_inc(&task->usage);
+ return task;
+}
extern void __put_task_struct(struct task_struct *t);
return ts.tv_sec;
}
+static inline u64 ktime_get_real_ns(void)
+{
+ struct timespec ts;
+
+ clock_gettime(CLOCK_REALTIME, &ts);
+ return timespec_to_ns(&ts);
+}
+
static inline u64 ktime_get_real_seconds(void)
{
struct timespec ts;
+++ /dev/null
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef _LINUX_SIX_H
-#define _LINUX_SIX_H
-
-/**
- * DOC: SIX locks overview
- *
- * Shared/intent/exclusive locks: sleepable read/write locks, like rw semaphores
- * but with an additional state: read/shared, intent, exclusive/write
- *
- * The purpose of the intent state is to allow for greater concurrency on tree
- * structures without deadlocking. In general, a read can't be upgraded to a
- * write lock without deadlocking, so an operation that updates multiple nodes
- * will have to take write locks for the full duration of the operation.
- *
- * But by adding an intent state, which is exclusive with other intent locks but
- * not with readers, we can take intent locks at thte start of the operation,
- * and then take write locks only for the actual update to each individual
- * nodes, without deadlocking.
- *
- * Example usage:
- * six_lock_read(&foo->lock);
- * six_unlock_read(&foo->lock);
- *
- * An intent lock must be held before taking a write lock:
- * six_lock_intent(&foo->lock);
- * six_lock_write(&foo->lock);
- * six_unlock_write(&foo->lock);
- * six_unlock_intent(&foo->lock);
- *
- * Other operations:
- * six_trylock_read()
- * six_trylock_intent()
- * six_trylock_write()
- *
- * six_lock_downgrade() convert from intent to read
- * six_lock_tryupgrade() attempt to convert from read to intent, may fail
- *
- * There are also interfaces that take the lock type as an enum:
- *
- * six_lock_type(&foo->lock, SIX_LOCK_read);
- * six_trylock_convert(&foo->lock, SIX_LOCK_read, SIX_LOCK_intent)
- * six_lock_type(&foo->lock, SIX_LOCK_write);
- * six_unlock_type(&foo->lock, SIX_LOCK_write);
- * six_unlock_type(&foo->lock, SIX_LOCK_intent);
- *
- * Lock sequence numbers - unlock(), relock():
- *
- * Locks embed sequences numbers, which are incremented on write lock/unlock.
- * This allows locks to be dropped and the retaken iff the state they protect
- * hasn't changed; this makes it much easier to avoid holding locks while e.g.
- * doing IO or allocating memory.
- *
- * Example usage:
- * six_lock_read(&foo->lock);
- * u32 seq = six_lock_seq(&foo->lock);
- * six_unlock_read(&foo->lock);
- *
- * some_operation_that_may_block();
- *
- * if (six_relock_read(&foo->lock, seq)) { ... }
- *
- * If the relock operation succeeds, it is as if the lock was never unlocked.
- *
- * Reentrancy:
- *
- * Six locks are not by themselves reentrent, but have counters for both the
- * read and intent states that can be used to provide reentrency by an upper
- * layer that tracks held locks. If a lock is known to already be held in the
- * read or intent state, six_lock_increment() can be used to bump the "lock
- * held in this state" counter, increasing the number of unlock calls that
- * will be required to fully unlock it.
- *
- * Example usage:
- * six_lock_read(&foo->lock);
- * six_lock_increment(&foo->lock, SIX_LOCK_read);
- * six_unlock_read(&foo->lock);
- * six_unlock_read(&foo->lock);
- * foo->lock is now fully unlocked.
- *
- * Since the intent state supercedes read, it's legal to increment the read
- * counter when holding an intent lock, but not the reverse.
- *
- * A lock may only be held once for write: six_lock_increment(.., SIX_LOCK_write)
- * is not legal.
- *
- * should_sleep_fn:
- *
- * There is a six_lock() variant that takes a function pointer that is called
- * immediately prior to schedule() when blocking, and may return an error to
- * abort.
- *
- * One possible use for this feature is when objects being locked are part of
- * a cache and may reused, and lock ordering is based on a property of the
- * object that will change when the object is reused - i.e. logical key order.
- *
- * If looking up an object in the cache may race with object reuse, and lock
- * ordering is required to prevent deadlock, object reuse may change the
- * correct lock order for that object and cause a deadlock. should_sleep_fn
- * can be used to check if the object is still the object we want and avoid
- * this deadlock.
- *
- * Wait list entry interface:
- *
- * There is a six_lock() variant, six_lock_waiter(), that takes a pointer to a
- * wait list entry. By embedding six_lock_waiter into another object, and by
- * traversing lock waitlists, it is then possible for an upper layer to
- * implement full cycle detection for deadlock avoidance.
- *
- * should_sleep_fn should be used for invoking the cycle detector, walking the
- * graph of held locks to check for a deadlock. The upper layer must track
- * held locks for each thread, and each thread's held locks must be reachable
- * from its six_lock_waiter object.
- *
- * six_lock_waiter() will add the wait object to the waitlist re-trying taking
- * the lock, and before calling should_sleep_fn, and the wait object will not
- * be removed from the waitlist until either the lock has been successfully
- * acquired, or we aborted because should_sleep_fn returned an error.
- *
- * Also, six_lock_waiter contains a timestamp, and waiters on a waitlist will
- * have timestamps in strictly ascending order - this is so the timestamp can
- * be used as a cursor for lock graph traverse.
- */
-
-#include <linux/lockdep.h>
-#include <linux/osq_lock.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-
-enum six_lock_type {
- SIX_LOCK_read,
- SIX_LOCK_intent,
- SIX_LOCK_write,
-};
-
-struct six_lock {
- atomic_t state;
- u32 seq;
- unsigned intent_lock_recurse;
- struct task_struct *owner;
- unsigned __percpu *readers;
- struct optimistic_spin_queue osq;
- raw_spinlock_t wait_lock;
- struct list_head wait_list;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- struct lockdep_map dep_map;
-#endif
-};
-
-struct six_lock_waiter {
- struct list_head list;
- struct task_struct *task;
- enum six_lock_type lock_want;
- bool lock_acquired;
- u64 start_time;
-};
-
-typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *);
-
-void six_lock_exit(struct six_lock *lock);
-
-enum six_lock_init_flags {
- SIX_LOCK_INIT_PCPU = 1U << 0,
-};
-
-void __six_lock_init(struct six_lock *lock, const char *name,
- struct lock_class_key *key, enum six_lock_init_flags flags);
-
-/**
- * six_lock_init - initialize a six lock
- * @lock: lock to initialize
- * @flags: optional flags, i.e. SIX_LOCK_INIT_PCPU
- */
-#define six_lock_init(lock, flags) \
-do { \
- static struct lock_class_key __key; \
- \
- __six_lock_init((lock), #lock, &__key, flags); \
-} while (0)
-
-/**
- * six_lock_seq - obtain current lock sequence number
- * @lock: six_lock to obtain sequence number for
- *
- * @lock should be held for read or intent, and not write
- *
- * By saving the lock sequence number, we can unlock @lock and then (typically
- * after some blocking operation) attempt to relock it: the relock will succeed
- * if the sequence number hasn't changed, meaning no write locks have been taken
- * and state corresponding to what @lock protects is still valid.
- */
-static inline u32 six_lock_seq(const struct six_lock *lock)
-{
- return lock->seq;
-}
-
-bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip);
-
-/**
- * six_trylock_type - attempt to take a six lock without blocking
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- *
- * Return: true on success, false on failure.
- */
-static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type)
-{
- return six_trylock_ip(lock, type, _THIS_IP_);
-}
-
-int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type,
- struct six_lock_waiter *wait,
- six_lock_should_sleep_fn should_sleep_fn, void *p,
- unsigned long ip);
-
-/**
- * six_lock_waiter - take a lock, with full waitlist interface
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @wait: pointer to wait object, which will be added to lock's waitlist
- * @should_sleep_fn: callback run after adding to waitlist, immediately prior
- * to scheduling
- * @p: passed through to @should_sleep_fn
- *
- * This is a convenience wrapper around six_lock_ip_waiter(), see that function
- * for full documentation.
- *
- * Return: 0 on success, or the return code from @should_sleep_fn on failure.
- */
-static inline int six_lock_waiter(struct six_lock *lock, enum six_lock_type type,
- struct six_lock_waiter *wait,
- six_lock_should_sleep_fn should_sleep_fn, void *p)
-{
- return six_lock_ip_waiter(lock, type, wait, should_sleep_fn, p, _THIS_IP_);
-}
-
-/**
- * six_lock_ip - take a six lock lock
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @should_sleep_fn: callback run after adding to waitlist, immediately prior
- * to scheduling
- * @p: passed through to @should_sleep_fn
- * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
- *
- * Return: 0 on success, or the return code from @should_sleep_fn on failure.
- */
-static inline int six_lock_ip(struct six_lock *lock, enum six_lock_type type,
- six_lock_should_sleep_fn should_sleep_fn, void *p,
- unsigned long ip)
-{
- struct six_lock_waiter wait;
-
- return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, ip);
-}
-
-/**
- * six_lock_type - take a six lock lock
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @should_sleep_fn: callback run after adding to waitlist, immediately prior
- * to scheduling
- * @p: passed through to @should_sleep_fn
- *
- * Return: 0 on success, or the return code from @should_sleep_fn on failure.
- */
-static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type,
- six_lock_should_sleep_fn should_sleep_fn, void *p)
-{
- struct six_lock_waiter wait;
-
- return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, _THIS_IP_);
-}
-
-bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
- unsigned seq, unsigned long ip);
-
-/**
- * six_relock_type - attempt to re-take a lock that was held previously
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @seq: lock sequence number obtained from six_lock_seq() while lock was
- * held previously
- *
- * Return: true on success, false on failure.
- */
-static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type,
- unsigned seq)
-{
- return six_relock_ip(lock, type, seq, _THIS_IP_);
-}
-
-void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip);
-
-/**
- * six_unlock_type - drop a six lock
- * @lock: lock to unlock
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- *
- * When a lock is held multiple times (because six_lock_incement()) was used),
- * this decrements the 'lock held' counter by one.
- *
- * For example:
- * six_lock_read(&foo->lock); read count 1
- * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2
- * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 1
- * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 0
- */
-static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
-{
- six_unlock_ip(lock, type, _THIS_IP_);
-}
-
-#define __SIX_LOCK(type) \
-static inline bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip)\
-{ \
- return six_trylock_ip(lock, SIX_LOCK_##type, ip); \
-} \
- \
-static inline bool six_trylock_##type(struct six_lock *lock) \
-{ \
- return six_trylock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \
-} \
- \
-static inline int six_lock_ip_waiter_##type(struct six_lock *lock, \
- struct six_lock_waiter *wait, \
- six_lock_should_sleep_fn should_sleep_fn, void *p,\
- unsigned long ip) \
-{ \
- return six_lock_ip_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\
-} \
- \
-static inline int six_lock_ip_##type(struct six_lock *lock, \
- six_lock_should_sleep_fn should_sleep_fn, void *p, \
- unsigned long ip) \
-{ \
- return six_lock_ip(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\
-} \
- \
-static inline bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\
-{ \
- return six_relock_ip(lock, SIX_LOCK_##type, seq, ip); \
-} \
- \
-static inline bool six_relock_##type(struct six_lock *lock, u32 seq) \
-{ \
- return six_relock_ip(lock, SIX_LOCK_##type, seq, _THIS_IP_); \
-} \
- \
-static inline int six_lock_##type(struct six_lock *lock, \
- six_lock_should_sleep_fn fn, void *p)\
-{ \
- return six_lock_ip_##type(lock, fn, p, _THIS_IP_); \
-} \
- \
-static inline void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip) \
-{ \
- six_unlock_ip(lock, SIX_LOCK_##type, ip); \
-} \
- \
-static inline void six_unlock_##type(struct six_lock *lock) \
-{ \
- six_unlock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \
-}
-
-__SIX_LOCK(read)
-__SIX_LOCK(intent)
-__SIX_LOCK(write)
-#undef __SIX_LOCK
-
-void six_lock_downgrade(struct six_lock *);
-bool six_lock_tryupgrade(struct six_lock *);
-bool six_trylock_convert(struct six_lock *, enum six_lock_type,
- enum six_lock_type);
-
-void six_lock_increment(struct six_lock *, enum six_lock_type);
-
-void six_lock_wakeup_all(struct six_lock *);
-
-struct six_lock_count {
- unsigned n[3];
-};
-
-struct six_lock_count six_lock_counts(struct six_lock *);
-void six_lock_readers_add(struct six_lock *, int);
-
-#endif /* _LINUX_SIX_H */
for (i = 0; i < 10; i++) {
if (size) {
- size_t alignment = min(rounddown_pow_of_two(size), (size_t)PAGE_SIZE);
+ size_t alignment = min_t(size_t, PAGE_SIZE,
+ rounddown_pow_of_two(size));
alignment = max(sizeof(void *), alignment);
if (posix_memalign(&p, alignment, size))
p = NULL;
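
The clamp above exists because posix_memalign() fails with EINVAL unless the alignment is a power of two that is also a multiple of sizeof(void *). A minimal standalone sketch of the same clamp, assuming 4 KiB pages; pick_alignment() is a hypothetical stand-in for the inline expression above:

	#include <assert.h>
	#include <stddef.h>

	static size_t pick_alignment(size_t size)
	{
		size_t align = 1;

		/* rounddown_pow_of_two(size), capped at PAGE_SIZE (4096 here): */
		while (align * 2 <= size && align < 4096)
			align *= 2;
		/* posix_memalign() needs at least sizeof(void *): */
		return align < sizeof(void *) ? sizeof(void *) : align;
	}

	int main(void)
	{
		assert(pick_alignment(3)     == sizeof(void *));	/* tiny sizes get raised */
		assert(pick_alignment(3000)  == 2048);	/* rounded down to a power of two */
		assert(pick_alignment(65536) == 4096);	/* capped at the page size */
		return 0;
	}
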
return BCH_MIN_NR_NBUCKETS * bucket_size;
}
-void bch2_pick_bucket_size(struct bch_opts opts, struct dev_opts *dev)
+u64 bch2_pick_bucket_size(struct bch_opts opts, struct dev_opts *dev)
{
- if (!dev->size)
- dev->size = get_size(dev->path, dev->fd);
+ u64 bucket_size;
- if (!dev->bucket_size) {
- if (dev->size < min_size(opts.block_size))
- die("cannot format %s, too small (%llu bytes, min %llu)",
- dev->path, dev->size, min_size(opts.block_size));
+ if (dev->size < min_size(opts.block_size))
+ die("cannot format %s, too small (%llu bytes, min %llu)",
+ dev->path, dev->size, min_size(opts.block_size));
- /* Bucket size must be >= block size: */
- dev->bucket_size = opts.block_size;
+ /* Bucket size must be >= block size: */
+ bucket_size = opts.block_size;
- /* Bucket size must be >= btree node size: */
- if (opt_defined(opts, btree_node_size))
- dev->bucket_size = max_t(unsigned, dev->bucket_size,
- opts.btree_node_size);
+ /* Bucket size must be >= btree node size: */
+ if (opt_defined(opts, btree_node_size))
+ bucket_size = max_t(unsigned, bucket_size,
+ opts.btree_node_size);
- /* Want a bucket size of at least 128k, if possible: */
- dev->bucket_size = max(dev->bucket_size, 128ULL << 10);
+ /* Want a bucket size of at least 128k, if possible: */
+ bucket_size = max(bucket_size, 128ULL << 10);
- if (dev->size >= min_size(dev->bucket_size)) {
- unsigned scale = max(1,
- ilog2(dev->size / min_size(dev->bucket_size)) / 4);
+ if (dev->size >= min_size(bucket_size)) {
+ unsigned scale = max(1,
+ ilog2(dev->size / min_size(bucket_size)) / 4);
- scale = rounddown_pow_of_two(scale);
+ scale = rounddown_pow_of_two(scale);
- /* max bucket size 1 mb */
- dev->bucket_size = min(dev->bucket_size * scale, 1ULL << 20);
- } else {
- do {
- dev->bucket_size /= 2;
- } while (dev->size < min_size(dev->bucket_size));
- }
+	/* max bucket size 1MB */
+ bucket_size = min(bucket_size * scale, 1ULL << 20);
+ } else {
+ do {
+ bucket_size /= 2;
+ } while (dev->size < min_size(bucket_size));
}
- dev->nbuckets = dev->size / dev->bucket_size;
+ return bucket_size;
+}
+void bch2_check_bucket_size(struct bch_opts opts, struct dev_opts *dev)
+{
if (dev->bucket_size < opts.block_size)
die("Bucket size (%llu) cannot be smaller than block size (%u)",
dev->bucket_size, opts.block_size);
{
struct bch_sb_handle sb = { NULL };
struct dev_opts *i;
- struct bch_sb_field_members *mi;
unsigned max_dev_block_size = 0;
unsigned opt_id;
+ u64 min_bucket_size = U64_MAX;
for (i = devs; i < devs + nr_devs; i++)
- max_dev_block_size = max(max_dev_block_size,
- get_blocksize(i->path, i->fd));
+ max_dev_block_size = max(max_dev_block_size, get_blocksize(i->bdev->bd_buffered_fd));
/* calculate block size: */
if (!opt_defined(fs_opts, block_size)) {
die("blocksize too small: %u, must be greater than device blocksize %u",
fs_opts.block_size, max_dev_block_size);
+ /* get device size, if it wasn't specified: */
+ for (i = devs; i < devs + nr_devs; i++)
+ if (!i->size)
+ i->size = get_size(i->bdev->bd_buffered_fd);
+
/* calculate bucket sizes: */
for (i = devs; i < devs + nr_devs; i++)
- bch2_pick_bucket_size(fs_opts, i);
+ min_bucket_size = min(min_bucket_size,
+ i->bucket_size ?: bch2_pick_bucket_size(fs_opts, i));
+
+ for (i = devs; i < devs + nr_devs; i++)
+ if (!i->bucket_size)
+ i->bucket_size = min_bucket_size;
+
+ for (i = devs; i < devs + nr_devs; i++) {
+ i->nbuckets = i->size / i->bucket_size;
+ bch2_check_bucket_size(fs_opts, i);
+ }
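
For a sense of what the heuristic above produces: bucket sizes start at the larger of the block size and 128 KiB, scale up with device size by a power-of-two factor derived from ilog2 of the size ratio until they hit the 1MB cap, and scale down only when the device is too small to hold the minimum bucket count. A standalone sketch follows; BCH_MIN_NR_NBUCKETS == 64 is an assumption here, and the btree_node_size clamp is omitted for brevity:

	#include <assert.h>

	#define MIN_NR_BUCKETS	64ULL	/* assumed value of BCH_MIN_NR_NBUCKETS */

	static unsigned ilog2_u64(unsigned long long n)
	{
		return 63 - __builtin_clzll(n);
	}

	static unsigned long long pick_bucket_size(unsigned long long dev_size,
						   unsigned long long block_size)
	{
		unsigned long long bucket_size = block_size;

		if (bucket_size < (128ULL << 10))	/* want >= 128k, if possible */
			bucket_size = 128ULL << 10;

		if (dev_size >= MIN_NR_BUCKETS * bucket_size) {
			unsigned scale = ilog2_u64(dev_size / (MIN_NR_BUCKETS * bucket_size)) / 4;

			scale = scale ? 1U << ilog2_u64(scale) : 1;	/* rounddown_pow_of_two */
			bucket_size *= scale;
			if (bucket_size > (1ULL << 20))
				bucket_size = 1ULL << 20;	/* 1MB cap */
		} else {
			do {
				bucket_size /= 2;
			} while (dev_size < MIN_NR_BUCKETS * bucket_size);
		}
		return bucket_size;
	}

	int main(void)
	{
		/* 1 TiB device, 4 KiB blocks: scale = ilog2(2^17)/4 = 4, so 512 KiB buckets: */
		assert(pick_bucket_size(1ULL << 40, 4096) == 512ULL << 10);
		return 0;
	}
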
/* calculate btree node size: */
if (!opt_defined(fs_opts, btree_node_size)) {
sb.sb->nr_devices = nr_devs;
if (opts.version == bcachefs_metadata_version_current)
- sb.sb->features[0] |= BCH_SB_FEATURES_ALL;
+ sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
uuid_generate(sb.sb->uuid.b);
sb.sb->time_precision = cpu_to_le32(1);
/* Member info: */
- mi = bch2_sb_resize_members(&sb,
+ struct bch_sb_field_members_v2 *mi =
+ bch2_sb_field_resize(&sb, members_v2,
(sizeof(*mi) + sizeof(struct bch_member) *
nr_devs) / sizeof(u64));
-
+ mi->member_bytes = cpu_to_le16(sizeof(struct bch_member));
for (i = devs; i < devs + nr_devs; i++) {
- struct bch_member *m = mi->members + (i - devs);
+ struct bch_member *m = bch2_members_v2_get_mut(sb.sb, (i - devs));
uuid_generate(m->uuid.b);
m->nbuckets = cpu_to_le64(i->nbuckets);
-	 * Recompute mi and m after each sb modification: its location
+	 * Recompute m after each sb modification: its location
* in memory may have changed due to reallocation.
*/
- mi = bch2_sb_get_members(sb.sb);
- m = mi->members + (i - devs);
-
+ m = bch2_members_v2_get_mut(sb.sb, (i - devs));
SET_BCH_MEMBER_GROUP(m, idx + 1);
}
/* Crypt: */
if (opts.encrypted) {
struct bch_sb_field_crypt *crypt =
- bch2_sb_resize_crypt(&sb, sizeof(*crypt) / sizeof(u64));
+ bch2_sb_field_resize(&sb, crypt, sizeof(*crypt) / sizeof(u64));
bch_sb_crypt_init(sb.sb, crypt, opts.passphrase);
SET_BCH_SB_ENCRYPTION_TYPE(sb.sb, 1);
}
+ bch2_sb_members_cpy_v2_v1(&sb);
+
for (i = devs; i < devs + nr_devs; i++) {
u64 size_sectors = i->size >> 9;
/* Zero start of disk */
static const char zeroes[BCH_SB_SECTOR << 9];
- xpwrite(i->fd, zeroes, BCH_SB_SECTOR << 9, 0,
+ xpwrite(i->bdev->bd_buffered_fd, zeroes, BCH_SB_SECTOR << 9, 0,
"zeroing start of disk");
}
- bch2_super_write(i->fd, sb.sb);
- close(i->fd);
+ bch2_super_write(i->bdev->bd_buffered_fd, sb.sb);
+ close(i->bdev->bd_buffered_fd);
}
return sb.sb;
case BCH_DATA_btree:
case BCH_DATA_user:
printf(" %s:%llu:%llu",
- bch2_btree_ids[e.p.btree_id],
+ bch2_btree_id_str(e.p.btree_id),
e.p.pos.inode,
e.p.pos.offset);
}
}
struct dev_opts {
- int fd;
+ struct block_device *bdev;
char *path;
-	u64			size; /* bytes*/
+	u64			size; /* bytes */
u64 bucket_size; /* bytes */
};
}
-void bch2_pick_bucket_size(struct bch_opts, struct dev_opts *);
+u64 bch2_pick_bucket_size(struct bch_opts, struct dev_opts *);
+void bch2_check_bucket_size(struct bch_opts, struct dev_opts *);
+
struct bch_sb *bch2_format(struct bch_opt_strs,
struct bch_opts,
struct format_opts, struct dev_opts *, size_t);
// SPDX-License-Identifier: GPL-2.0
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
#include "bcachefs.h"
-#include <linux/fs.h>
+#include "acl.h"
+#include "xattr.h"
+
#include <linux/posix_acl.h>
+
+static const char * const acl_types[] = {
+ [ACL_USER_OBJ] = "user_obj",
+ [ACL_USER] = "user",
+ [ACL_GROUP_OBJ] = "group_obj",
+ [ACL_GROUP] = "group",
+ [ACL_MASK] = "mask",
+ [ACL_OTHER] = "other",
+ NULL,
+};
+
+void bch2_acl_to_text(struct printbuf *out, const void *value, size_t size)
+{
+ const void *p, *end = value + size;
+
+ if (!value ||
+ size < sizeof(bch_acl_header) ||
+ ((bch_acl_header *)value)->a_version != cpu_to_le32(BCH_ACL_VERSION))
+ return;
+
+ p = value + sizeof(bch_acl_header);
+ while (p < end) {
+ const bch_acl_entry *in = p;
+ unsigned tag = le16_to_cpu(in->e_tag);
+
+ prt_str(out, acl_types[tag]);
+
+ switch (tag) {
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ p += sizeof(bch_acl_entry_short);
+ break;
+ case ACL_USER:
+ prt_printf(out, " uid %u", le32_to_cpu(in->e_id));
+ p += sizeof(bch_acl_entry);
+ break;
+ case ACL_GROUP:
+ prt_printf(out, " gid %u", le32_to_cpu(in->e_id));
+ p += sizeof(bch_acl_entry);
+ break;
+ }
+
+ prt_printf(out, " %o", le16_to_cpu(in->e_perm));
+
+ if (p != end)
+ prt_char(out, ' ');
+ }
+}
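
The walker above advances by entry size, which follows the POSIX ACL xattr layout: short entries (user_obj, group_obj, mask, other) are a 16-bit tag plus a 16-bit perm, while user and group entries append a 32-bit id. A quick standalone check of the resulting sizes; the 4/8-byte figures are assumptions mirroring that layout (the real structs are bch_acl_entry_short and bch_acl_entry):

	#include <assert.h>
	#include <stddef.h>

	#define ACL_HEADER_SIZE		4	/* __le32 a_version */
	#define ACL_ENTRY_SHORT_SIZE	4	/* __le16 e_tag + __le16 e_perm */
	#define ACL_ENTRY_SIZE		8	/* short entry + __le32 e_id */

	static size_t acl_size(unsigned nr_short, unsigned nr_long)
	{
		return ACL_HEADER_SIZE +
			nr_short * ACL_ENTRY_SHORT_SIZE +
			nr_long  * ACL_ENTRY_SIZE;
	}

	int main(void)
	{
		/* "user::rw-,user:1000:r--,group::r--,mask::r--,other::r--":
		 * four short entries plus one named user = 4 + 16 + 8 */
		assert(acl_size(4, 1) == 28);
		return 0;
	}
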
+
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+
+#include "fs.h"
+
+#include <linux/fs.h>
#include <linux/posix_acl_xattr.h>
#include <linux/sched.h>
#include <linux/slab.h>
-#include "acl.h"
-#include "fs.h"
-#include "xattr.h"
-
static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long)
{
return sizeof(bch_acl_header) +
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0);
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter = { NULL };
struct bkey_s_c_xattr xattr;
struct posix_acl *acl = NULL;
struct bkey_s_c k;
int ret;
-
- bch2_trans_init(&trans, c, 0, 0);
retry:
- bch2_trans_begin(&trans);
+ bch2_trans_begin(trans);
- ret = bch2_hash_lookup(&trans, &iter, bch2_xattr_hash_desc,
+ ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
&hash, inode_inum(inode), &search, 0);
if (ret) {
if (!bch2_err_matches(ret, ENOENT))
}
xattr = bkey_s_c_to_xattr(k);
- acl = bch2_acl_from_disk(&trans, xattr_val(xattr.v),
+ acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
le16_to_cpu(xattr.v->x_val_len));
if (!IS_ERR(acl))
if (bch2_err_matches(PTR_ERR_OR_ZERO(acl), BCH_ERR_transaction_restart))
goto retry;
- bch2_trans_iter_exit(&trans, &iter);
- bch2_trans_exit(&trans);
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
return acl;
}
{
struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter inode_iter = { NULL };
struct bch_inode_unpacked inode_u;
struct posix_acl *acl;
int ret;
mutex_lock(&inode->ei_update_lock);
- bch2_trans_init(&trans, c, 0, 0);
retry:
- bch2_trans_begin(&trans);
+ bch2_trans_begin(trans);
acl = _acl;
- ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode),
+ ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
BTREE_ITER_INTENT);
if (ret)
goto btree_err;
goto btree_err;
}
- ret = bch2_set_acl_trans(&trans, inode_inum(inode), &inode_u, acl, type);
+ ret = bch2_set_acl_trans(trans, inode_inum(inode), &inode_u, acl, type);
if (ret)
goto btree_err;
inode_u.bi_ctime = bch2_current_time(c);
inode_u.bi_mode = mode;
- ret = bch2_inode_write(&trans, &inode_iter, &inode_u) ?:
- bch2_trans_commit(&trans, NULL, NULL, 0);
+ ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
+ bch2_trans_commit(trans, NULL, NULL, 0);
btree_err:
- bch2_trans_iter_exit(&trans, &inode_iter);
+ bch2_trans_iter_exit(trans, &inode_iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
if (unlikely(ret))
goto err;
- bch2_inode_update_after_write(&trans, inode, &inode_u,
+ bch2_inode_update_after_write(trans, inode, &inode_u,
ATTR_CTIME|ATTR_MODE);
set_cached_acl(&inode->v, type, acl);
err:
- bch2_trans_exit(&trans);
mutex_unlock(&inode->ei_update_lock);
+ bch2_trans_put(trans);
return ret;
}
struct btree_iter iter;
struct bkey_s_c_xattr xattr;
struct bkey_i_xattr *new;
- struct posix_acl *acl;
+ struct posix_acl *acl = NULL;
struct bkey_s_c k;
int ret;
return bch2_err_matches(ret, ENOENT) ? 0 : ret;
k = bch2_btree_iter_peek_slot(&iter);
- xattr = bkey_s_c_to_xattr(k);
+ ret = bkey_err(k);
if (ret)
goto err;
+ xattr = bkey_s_c_to_xattr(k);
acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
le16_to_cpu(xattr.v->x_val_len));
struct bch_inode_info;
struct posix_acl;
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
-
#define BCH_ACL_VERSION 0x0001
typedef struct {
__le32 a_version;
} bch_acl_header;
+void bch2_acl_to_text(struct printbuf *, const void *, size_t);
+
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+
struct posix_acl *bch2_get_acl(struct mnt_idmap *, struct dentry *, int);
int bch2_set_acl_trans(struct btree_trans *, subvol_inum,
return DIV_ROUND_UP(bytes, sizeof(u64));
}
-int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_alloc_v1_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
+ int ret = 0;
/* allow for unknown fields */
- if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v)) {
- prt_printf(err, "incorrect value size (%zu < %u)",
- bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v));
- return -BCH_ERR_invalid_bkey;
- }
-
- return 0;
+ bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v), c, err,
+ alloc_v1_val_size_bad,
+ "incorrect value size (%zu < %u)",
+ bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v));
+fsck_err:
+ return ret;
}
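
This hunk is the first of many in this series converting open-coded validity checks to bkey_fsck_err_on(), which is why each *_invalid() function gains an `int ret = 0;` and an `fsck_err:` label. A hedged sketch of the macro's shape, as an assumption (the real definition lives in the fsck error machinery, and the new error-id argument feeds the sb-errors counters):

	/* Sketch only, not the real macro. Relies on the enclosing
	 * 'int ret = 0;' and 'fsck_err:' label, as in the functions above: */
	#define bkey_fsck_err_on(cond, c, err, err_id, ...)		\
	do {								\
		if (cond) {						\
			prt_printf(err, __VA_ARGS__);			\
			/* err_id would also be counted per superblock */\
			ret = -BCH_ERR_invalid_bkey;			\
			goto fsck_err;					\
		}							\
	} while (0)
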
-int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_alloc_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
struct bkey_alloc_unpacked u;
+ int ret = 0;
- if (bch2_alloc_unpack_v2(&u, k)) {
- prt_printf(err, "unpack error");
- return -BCH_ERR_invalid_bkey;
- }
-
- return 0;
+ bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k), c, err,
+ alloc_v2_unpack_error,
+ "unpack error");
+fsck_err:
+ return ret;
}
-int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_alloc_v3_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
struct bkey_alloc_unpacked u;
+ int ret = 0;
- if (bch2_alloc_unpack_v3(&u, k)) {
- prt_printf(err, "unpack error");
- return -BCH_ERR_invalid_bkey;
- }
-
- return 0;
+ bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), c, err,
+ alloc_v2_unpack_error,
+ "unpack error");
+fsck_err:
+ return ret;
}
-int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
- struct printbuf *err)
+int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags, struct printbuf *err)
{
struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
- int rw = flags & WRITE;
-
- if (alloc_v4_u64s(a.v) > bkey_val_u64s(k.k)) {
- prt_printf(err, "bad val size (%u > %lu)",
- alloc_v4_u64s(a.v), bkey_val_u64s(k.k));
- return -BCH_ERR_invalid_bkey;
- }
-
- if (!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) &&
- BCH_ALLOC_V4_NR_BACKPOINTERS(a.v)) {
- prt_printf(err, "invalid backpointers_start");
- return -BCH_ERR_invalid_bkey;
- }
-
- if (rw == WRITE &&
- !(flags & BKEY_INVALID_JOURNAL) &&
- c->curr_recovery_pass > BCH_RECOVERY_PASS_check_btree_backpointers) {
- unsigned i, bp_len = 0;
+ int ret = 0;
- for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++)
- bp_len += alloc_v4_backpointers_c(a.v)[i].bucket_len;
+ bkey_fsck_err_on(alloc_v4_u64s(a.v) > bkey_val_u64s(k.k), c, err,
+ alloc_v4_val_size_bad,
+ "bad val size (%u > %zu)",
+ alloc_v4_u64s(a.v), bkey_val_u64s(k.k));
- if (bp_len > a.v->dirty_sectors) {
- prt_printf(err, "too many backpointers");
- return -BCH_ERR_invalid_bkey;
- }
- }
+ bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) &&
+ BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), c, err,
+ alloc_v4_backpointers_start_bad,
+ "invalid backpointers_start");
- if (rw == WRITE) {
- if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) {
- prt_printf(err, "invalid data type (got %u should be %u)",
- a.v->data_type, alloc_data_type(*a.v, a.v->data_type));
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type, c, err,
+ alloc_key_data_type_bad,
+ "invalid data type (got %u should be %u)",
+ a.v->data_type, alloc_data_type(*a.v, a.v->data_type));
- switch (a.v->data_type) {
- case BCH_DATA_free:
- case BCH_DATA_need_gc_gens:
- case BCH_DATA_need_discard:
- if (a.v->dirty_sectors ||
- a.v->cached_sectors ||
- a.v->stripe) {
- prt_printf(err, "empty data type free but have data");
- return -BCH_ERR_invalid_bkey;
- }
- break;
- case BCH_DATA_sb:
- case BCH_DATA_journal:
- case BCH_DATA_btree:
- case BCH_DATA_user:
- case BCH_DATA_parity:
- if (!a.v->dirty_sectors) {
- prt_printf(err, "data_type %s but dirty_sectors==0",
- bch2_data_types[a.v->data_type]);
- return -BCH_ERR_invalid_bkey;
- }
- break;
- case BCH_DATA_cached:
- if (!a.v->cached_sectors ||
- a.v->dirty_sectors ||
- a.v->stripe) {
- prt_printf(err, "data type inconsistency");
- return -BCH_ERR_invalid_bkey;
- }
-
- if (!a.v->io_time[READ] &&
- c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs) {
- prt_printf(err, "cached bucket with read_time == 0");
- return -BCH_ERR_invalid_bkey;
- }
- break;
- case BCH_DATA_stripe:
- if (!a.v->stripe) {
- prt_printf(err, "data_type %s but stripe==0",
- bch2_data_types[a.v->data_type]);
- return -BCH_ERR_invalid_bkey;
- }
- break;
- }
+ switch (a.v->data_type) {
+ case BCH_DATA_free:
+ case BCH_DATA_need_gc_gens:
+ case BCH_DATA_need_discard:
+ bkey_fsck_err_on(a.v->dirty_sectors ||
+ a.v->cached_sectors ||
+ a.v->stripe, c, err,
+ alloc_key_empty_but_have_data,
+ "empty data type free but have data");
+ break;
+ case BCH_DATA_sb:
+ case BCH_DATA_journal:
+ case BCH_DATA_btree:
+ case BCH_DATA_user:
+ case BCH_DATA_parity:
+ bkey_fsck_err_on(!a.v->dirty_sectors, c, err,
+ alloc_key_dirty_sectors_0,
+ "data_type %s but dirty_sectors==0",
+ bch2_data_types[a.v->data_type]);
+ break;
+ case BCH_DATA_cached:
+ bkey_fsck_err_on(!a.v->cached_sectors ||
+ a.v->dirty_sectors ||
+ a.v->stripe, c, err,
+ alloc_key_cached_inconsistency,
+ "data type inconsistency");
+
+ bkey_fsck_err_on(!a.v->io_time[READ] &&
+ c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs,
+ c, err,
+ alloc_key_cached_but_read_time_zero,
+ "cached bucket with read_time == 0");
+ break;
+ case BCH_DATA_stripe:
+ break;
}
-
- return 0;
-}
-
-static inline u64 swab40(u64 x)
-{
- return (((x & 0x00000000ffULL) << 32)|
- ((x & 0x000000ff00ULL) << 16)|
- ((x & 0x0000ff0000ULL) >> 0)|
- ((x & 0x00ff000000ULL) >> 16)|
- ((x & 0xff00000000ULL) >> 32));
+fsck_err:
+ return ret;
}
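
Restated, the checks above encode one invariant per data type: empty states (free, need_gc_gens, need_discard) must carry no sectors, allocated states must have dirty sectors, and cached buckets must have cached sectors only. A standalone restatement, with enum names abbreviated from the BCH_DATA_* constants:

	#include <assert.h>
	#include <stdbool.h>

	enum data_type { FREE, SB, JOURNAL, BTREE, USER, CACHED, PARITY,
			 STRIPE, NEED_GC_GENS, NEED_DISCARD };

	struct alloc { enum data_type type; unsigned dirty, cached, stripe; };

	static bool alloc_is_consistent(struct alloc a)
	{
		switch (a.type) {
		case FREE:
		case NEED_GC_GENS:
		case NEED_DISCARD:
			return !a.dirty && !a.cached && !a.stripe;
		case SB:
		case JOURNAL:
		case BTREE:
		case USER:
		case PARITY:
			return a.dirty != 0;
		case CACHED:
			return a.cached && !a.dirty && !a.stripe;
		default:
			return true;
		}
	}

	int main(void)
	{
		assert( alloc_is_consistent((struct alloc) { FREE,   0, 0, 0 }));
		assert(!alloc_is_consistent((struct alloc) { FREE,   8, 0, 0 }));
		assert(!alloc_is_consistent((struct alloc) { CACHED, 8, 8, 0 }));
		return 0;
	}
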
void bch2_alloc_v4_swab(struct bkey_s k)
a->io_time[1] = swab64(a->io_time[1]);
a->stripe = swab32(a->stripe);
a->nr_external_backpointers = swab32(a->nr_external_backpointers);
+ a->fragmentation_lru = swab64(a->fragmentation_lru);
bps = alloc_v4_backpointers(a);
for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) {
: 0;
}
-int bch2_bucket_gens_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_bucket_gens_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
- if (bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens)) {
- prt_printf(err, "bad val size (%lu != %zu)",
- bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens));
- return -BCH_ERR_invalid_bkey;
- }
+ int ret = 0;
- return 0;
+ bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens), c, err,
+ bucket_gens_val_size_bad,
+ "bad val size (%zu != %zu)",
+ bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens));
+fsck_err:
+ return ret;
}
void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
int bch2_bucket_gens_init(struct bch_fs *c)
{
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
struct bch_alloc_v4 a;
u8 gen;
int ret;
- bch2_trans_init(&trans, c, 0, 0);
-
- for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+ for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
BTREE_ITER_PREFETCH, k, ret) {
/*
* Not a fsck error because this is checked/repaired by
pos = alloc_gens_pos(iter.pos, &offset);
if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) {
- ret = commit_do(&trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
- __bch2_btree_insert(&trans, BTREE_ID_bucket_gens, &g.k_i, 0));
+ ret = commit_do(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_lazy_rw,
+ bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
if (ret)
break;
have_bucket_gens_key = false;
g.v.gens[offset] = gen;
}
- bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_iter_exit(trans, &iter);
if (have_bucket_gens_key && !ret)
- ret = commit_do(&trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
- __bch2_btree_insert(&trans, BTREE_ID_bucket_gens, &g.k_i, 0));
+ ret = commit_do(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_lazy_rw,
+ bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
if (ret)
bch_err_fn(c, ret);
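
bucket_gens keys pack one 8-bit generation per bucket, so a single key covers a power-of-two run of buckets, and an alloc position maps to a (key offset, slot) pair by shift and mask; that is what the `b & KEY_TYPE_BUCKET_GENS_MASK` indexing below is doing. A sketch of the mapping, assuming KEY_TYPE_BUCKET_GENS_BITS == 8 (256 gens per key):

	#include <assert.h>

	#define GENS_BITS	8	/* assumed KEY_TYPE_BUCKET_GENS_BITS */
	#define GENS_MASK	((1U << GENS_BITS) - 1)

	struct pos { unsigned long long inode, offset; };

	/* alloc key (dev, bucket) -> bucket_gens key plus slot in gens[]: */
	static struct pos alloc_to_gens_pos(struct pos alloc_pos, unsigned *slot)
	{
		*slot = alloc_pos.offset & GENS_MASK;
		return (struct pos) { alloc_pos.inode, alloc_pos.offset >> GENS_BITS };
	}

	int main(void)
	{
		unsigned slot;
		struct pos p = alloc_to_gens_pos((struct pos) { 1, 1000 }, &slot);

		assert(p.inode == 1 && p.offset == 3 && slot == 232);	/* 1000 = 3*256 + 232 */
		return 0;
	}
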
int bch2_alloc_read(struct bch_fs *c)
{
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
struct bch_dev *ca;
int ret;
down_read(&c->gc_lock);
- bch2_trans_init(&trans, c, 0, 0);
if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) {
const struct bch_bucket_gens *g;
u64 b;
- for_each_btree_key(&trans, iter, BTREE_ID_bucket_gens, POS_MIN,
+ for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
BTREE_ITER_PREFETCH, k, ret) {
u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
b++)
*bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK];
}
- bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_iter_exit(trans, &iter);
} else {
struct bch_alloc_v4 a;
- for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+ for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
BTREE_ITER_PREFETCH, k, ret) {
/*
* Not a fsck error because this is checked/repaired by
*bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
}
- bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_iter_exit(trans, &iter);
}
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
up_read(&c->gc_lock);
if (ret)
"incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n"
" for %s",
set ? "setting" : "clearing",
- bch2_btree_ids[btree],
+ bch2_btree_id_str(btree),
iter.pos.inode,
iter.pos.offset,
bch2_bkey_types[old.k->type],
int ret;
if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c,
+ alloc_key_to_missing_dev_bucket,
"alloc key for invalid device:bucket %llu:%llu",
alloc_k.k->p.inode, alloc_k.k->p.offset))
return bch2_btree_delete_at(trans, alloc_iter, 0);
if (k.k->type != discard_key_type &&
(c->opts.reconstruct_alloc ||
- fsck_err(c, "incorrect key in need_discard btree (got %s should be %s)\n"
+ fsck_err(c, need_discard_key_wrong,
+ "incorrect key in need_discard btree (got %s should be %s)\n"
" %s",
bch2_bkey_types[k.k->type],
bch2_bkey_types[discard_key_type],
if (k.k->type != freespace_key_type &&
(c->opts.reconstruct_alloc ||
- fsck_err(c, "incorrect key in freespace btree (got %s should be %s)\n"
+ fsck_err(c, freespace_key_wrong,
+ "incorrect key in freespace btree (got %s should be %s)\n"
" %s",
bch2_bkey_types[k.k->type],
bch2_bkey_types[freespace_key_type],
if (a->gen != alloc_gen(k, gens_offset) &&
(c->opts.reconstruct_alloc ||
- fsck_err(c, "incorrect gen in bucket_gens btree (got %u should be %u)\n"
+ fsck_err(c, bucket_gens_key_wrong,
+ "incorrect gen in bucket_gens btree (got %u should be %u)\n"
" %s",
alloc_gen(k, gens_offset), a->gen,
(printbuf_reset(&buf),
if (k.k->type != KEY_TYPE_set &&
(c->opts.reconstruct_alloc ||
- fsck_err(c, "hole in alloc btree missing in freespace btree\n"
+ fsck_err(c, freespace_hole_missing,
+ "hole in alloc btree missing in freespace btree\n"
" device %llu buckets %llu-%llu",
freespace_iter->pos.inode,
freespace_iter->pos.offset,
for (i = gens_offset; i < gens_end_offset; i++) {
if (fsck_err_on(g.v.gens[i], c,
+ bucket_gens_hole_wrong,
"hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)",
bucket_gens_pos_to_alloc(k.k->p, i).inode,
bucket_gens_pos_to_alloc(k.k->p, i).offset,
}
if (need_update) {
- struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(g));
+ struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
- ret = PTR_ERR_OR_ZERO(k);
+ ret = PTR_ERR_OR_ZERO(u);
if (ret)
goto err;
- memcpy(k, &g, sizeof(g));
+ memcpy(u, &g, sizeof(g));
- ret = bch2_trans_update(trans, bucket_gens_iter, k, 0);
+ ret = bch2_trans_update(trans, bucket_gens_iter, u, 0);
if (ret)
goto err;
}
return ret;
if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c,
+ need_discard_freespace_key_to_invalid_dev_bucket,
"entry in %s btree for nonexistant dev:bucket %llu:%llu",
- bch2_btree_ids[iter->btree_id], pos.inode, pos.offset))
+ bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset))
goto delete;
a = bch2_alloc_to_v4(alloc_k, &a_convert);
if (fsck_err_on(a->data_type != state ||
(state == BCH_DATA_free &&
genbits != alloc_freespace_genbits(*a)), c,
+ need_discard_freespace_key_bad,
"%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
- bch2_btree_ids[iter->btree_id],
+ bch2_btree_id_str(iter->btree_id),
iter->pos.inode,
iter->pos.offset,
a->data_type == state,
ret = bch2_btree_delete_extent_at(trans, iter,
iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW);
+ BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw);
goto out;
}
if (!btree_id_is_extents(iter->btree_id)) {
return __bch2_check_discard_freespace_key(trans, iter);
} else {
- int ret;
+ int ret = 0;
while (!bkey_eq(iter->pos, end) &&
!(ret = btree_trans_too_many_iters(trans) ?:
dev_exists = bch2_dev_exists2(c, k.k->p.inode);
if (!dev_exists) {
if (fsck_err_on(!dev_exists, c,
+ bucket_gens_to_invalid_dev,
"bucket_gens key for invalid device:\n %s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
ret = bch2_btree_delete_at(trans, iter, 0);
ca = bch_dev_bkey_exists(c, k.k->p.inode);
if (fsck_err_on(end <= ca->mi.first_bucket ||
start >= ca->mi.nbuckets, c,
+ bucket_gens_to_invalid_buckets,
"bucket_gens key for invalid buckets:\n %s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
ret = bch2_btree_delete_at(trans, iter, 0);
for (b = start; b < ca->mi.first_bucket; b++)
if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
+ bucket_gens_nonzero_for_invalid_buckets,
"bucket_gens key has nonzero gen for invalid bucket")) {
g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
need_update = true;
for (b = ca->mi.nbuckets; b < end; b++)
if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
+ bucket_gens_nonzero_for_invalid_buckets,
"bucket_gens key has nonzero gen for invalid bucket")) {
g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
need_update = true;
}
if (need_update) {
- struct bkey_i *k;
+ struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
- k = bch2_trans_kmalloc(trans, sizeof(g));
- ret = PTR_ERR_OR_ZERO(k);
+ ret = PTR_ERR_OR_ZERO(u);
if (ret)
goto out;
- memcpy(k, &g, sizeof(g));
- ret = bch2_trans_update(trans, iter, k, 0);
+ memcpy(u, &g, sizeof(g));
+ ret = bch2_trans_update(trans, iter, u, 0);
}
out:
fsck_err:
int bch2_check_alloc_info(struct bch_fs *c)
{
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter;
struct bkey hole;
struct bkey_s_c k;
int ret = 0;
- bch2_trans_init(&trans, c, 0, 0);
-
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN,
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN,
BTREE_ITER_PREFETCH);
- bch2_trans_iter_init(&trans, &discard_iter, BTREE_ID_need_discard, POS_MIN,
+ bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN,
BTREE_ITER_PREFETCH);
- bch2_trans_iter_init(&trans, &freespace_iter, BTREE_ID_freespace, POS_MIN,
+ bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN,
BTREE_ITER_PREFETCH);
- bch2_trans_iter_init(&trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN,
+ bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN,
BTREE_ITER_PREFETCH);
while (1) {
struct bpos next;
- bch2_trans_begin(&trans);
+ bch2_trans_begin(trans);
k = bch2_get_key_or_real_bucket_hole(&iter, &hole);
ret = bkey_err(k);
if (k.k->type) {
next = bpos_nosnap_successor(k.k->p);
- ret = bch2_check_alloc_key(&trans,
+ ret = bch2_check_alloc_key(trans,
k, &iter,
&discard_iter,
&freespace_iter,
} else {
next = k.k->p;
- ret = bch2_check_alloc_hole_freespace(&trans,
+ ret = bch2_check_alloc_hole_freespace(trans,
bkey_start_pos(k.k),
&next,
&freespace_iter) ?:
- bch2_check_alloc_hole_bucket_gens(&trans,
+ bch2_check_alloc_hole_bucket_gens(trans,
bkey_start_pos(k.k),
&next,
&bucket_gens_iter);
goto bkey_err;
}
- ret = bch2_trans_commit(&trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW);
+ ret = bch2_trans_commit(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_lazy_rw);
if (ret)
goto bkey_err;
if (ret)
break;
}
- bch2_trans_iter_exit(&trans, &bucket_gens_iter);
- bch2_trans_iter_exit(&trans, &freespace_iter);
- bch2_trans_iter_exit(&trans, &discard_iter);
- bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_iter_exit(trans, &bucket_gens_iter);
+ bch2_trans_iter_exit(trans, &freespace_iter);
+ bch2_trans_iter_exit(trans, &discard_iter);
+ bch2_trans_iter_exit(trans, &iter);
if (ret < 0)
goto err;
- ret = for_each_btree_key2(&trans, iter,
+ ret = for_each_btree_key2(trans, iter,
BTREE_ID_need_discard, POS_MIN,
BTREE_ITER_PREFETCH, k,
- bch2_check_discard_freespace_key(&trans, &iter, k.k->p)) ?:
- for_each_btree_key2(&trans, iter,
+ bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?:
+ for_each_btree_key2(trans, iter,
BTREE_ID_freespace, POS_MIN,
BTREE_ITER_PREFETCH, k,
- bch2_check_discard_freespace_key(&trans, &iter, k.k->p)) ?:
- for_each_btree_key_commit(&trans, iter,
+ bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?:
+ for_each_btree_key_commit(trans, iter,
BTREE_ID_bucket_gens, POS_MIN,
BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
- bch2_check_bucket_gens_key(&trans, &iter, k));
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
+ bch2_check_bucket_gens_key(trans, &iter, k));
err:
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
if (ret)
bch_err_fn(c, ret);
return ret;
return ret;
if (fsck_err_on(!a->io_time[READ], c,
+ alloc_key_cached_but_read_time_zero,
"cached bucket with read_time 0\n"
" %s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) ||
fsck_err_on(lru_k.k->type != KEY_TYPE_set, c,
+ alloc_key_to_missing_lru_entry,
"missing lru entry\n"
" %s",
(printbuf_reset(&buf),
int ret = 0;
ret = bch2_trans_run(c,
- for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
POS_MIN, BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
- bch2_check_alloc_to_lru_ref(&trans, &iter)));
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
+ bch2_check_alloc_to_lru_ref(trans, &iter)));
if (ret)
bch_err_fn(c, ret);
return ret;
ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_WATERMARK_btree|
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto out;
static void bch2_do_discards_work(struct work_struct *work)
{
struct bch_fs *c = container_of(work, struct bch_fs, discard_work);
- struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0;
struct bpos discard_pos_done = POS_MAX;
int ret;
- bch2_trans_init(&trans, c, 0, 0);
-
/*
* We're doing the commit in bch2_discard_one_bucket instead of using
* for_each_btree_key_commit() so that we can increment counters after
* successful commit:
*/
- ret = for_each_btree_key2(&trans, iter,
- BTREE_ID_need_discard, POS_MIN, 0, k,
- bch2_discard_one_bucket(&trans, &iter, &discard_pos_done,
- &seen,
- &open,
- &need_journal_commit,
- &discarded));
-
- bch2_trans_exit(&trans);
+ ret = bch2_trans_run(c,
+ for_each_btree_key2(trans, iter,
+ BTREE_ID_need_discard, POS_MIN, 0, k,
+ bch2_discard_one_bucket(trans, &iter, &discard_pos_done,
+ &seen,
+ &open,
+ &need_journal_commit,
+ &discarded)));
if (need_journal_commit * 2 > seen)
bch2_journal_flush_async(&c->journal, NULL);
BTREE_TRIGGER_BUCKET_INVALIDATE) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_WATERMARK_btree|
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto out;
{
struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work);
struct bch_dev *ca;
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
unsigned i;
int ret = 0;
- bch2_trans_init(&trans, c, 0, 0);
-
- ret = bch2_btree_write_buffer_flush(&trans);
+ ret = bch2_btree_write_buffer_flush(trans);
if (ret)
goto err;
s64 nr_to_invalidate =
should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
- ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_lru,
+ ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru,
lru_pos(ca->dev_idx, 0, 0),
lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX),
BTREE_ITER_INTENT, k,
- invalidate_one_bucket(&trans, &iter, k, &nr_to_invalidate));
+ invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate));
if (ret < 0) {
percpu_ref_put(&ca->ref);
}
}
err:
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
}
bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
}
-static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
- unsigned long *last_updated)
+int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
+ u64 bucket_start, u64 bucket_end)
{
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
struct bkey hole;
- struct bpos end = POS(ca->dev_idx, ca->mi.nbuckets);
+ struct bpos end = POS(ca->dev_idx, bucket_end);
struct bch_member *m;
+ unsigned long last_updated = jiffies;
int ret;
- bch2_trans_init(&trans, c, 0, 0);
+ BUG_ON(bucket_start > bucket_end);
+ BUG_ON(bucket_end > ca->mi.nbuckets);
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc,
- POS(ca->dev_idx, ca->mi.first_bucket),
- BTREE_ITER_PREFETCH);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+ POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)),
+ BTREE_ITER_PREFETCH);
/*
* Scan the alloc btree for every bucket on @ca, and add buckets to the
* freespace/need_discard/need_gc_gens btrees as needed:
*/
while (1) {
- if (*last_updated + HZ * 10 < jiffies) {
+ if (last_updated + HZ * 10 < jiffies) {
bch_info(ca, "%s: currently at %llu/%llu",
__func__, iter.pos.offset, ca->mi.nbuckets);
- *last_updated = jiffies;
+ last_updated = jiffies;
}
- bch2_trans_begin(&trans);
+ bch2_trans_begin(trans);
if (bkey_ge(iter.pos, end)) {
ret = 0;
struct bch_alloc_v4 a_convert;
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
- ret = bch2_bucket_do_index(&trans, k, a, true) ?:
- bch2_trans_commit(&trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL);
+ ret = bch2_bucket_do_index(trans, k, a, true) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_lazy_rw|
+ BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto bkey_err;
} else {
struct bkey_i *freespace;
- freespace = bch2_trans_kmalloc(&trans, sizeof(*freespace));
+ freespace = bch2_trans_kmalloc(trans, sizeof(*freespace));
ret = PTR_ERR_OR_ZERO(freespace);
if (ret)
goto bkey_err;
freespace->k.p = k.k->p;
freespace->k.size = k.k->size;
- ret = __bch2_btree_insert(&trans, BTREE_ID_freespace, freespace, 0) ?:
- bch2_trans_commit(&trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL);
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_lazy_rw|
+ BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto bkey_err;
break;
}
- bch2_trans_iter_exit(&trans, &iter);
- bch2_trans_exit(&trans);
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
if (ret < 0) {
- bch_err(ca, "error initializing free space: %s", bch2_err_str(ret));
+ bch_err_msg(ca, ret, "initializing free space");
return ret;
}
mutex_lock(&c->sb_lock);
- m = bch2_sb_get_members(c->disk_sb.sb)->members + ca->dev_idx;
+ m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true);
mutex_unlock(&c->sb_lock);
unsigned i;
int ret = 0;
bool doing_init = false;
- unsigned long last_updated = jiffies;
/*
* We can crash during the device add path, so we need to check this on
doing_init = true;
}
- ret = bch2_dev_freespace_init(c, ca, &last_updated);
+ ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
if (ret) {
percpu_ref_put(&ca->ref);
bch_err_fn(c, ret);
closure_wake_up(&c->freelist_wait);
}
+u64 bch2_min_rw_member_capacity(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+ u64 ret = U64_MAX;
+
+ for_each_rw_member(ca, c, i)
+ ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size);
+ return ret;
+}
+
static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
{
struct open_bucket *ob;
int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
-int bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_alloc_v1_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
-int bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_alloc_v2_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
-int bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_alloc_v3_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
-int bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_alloc_v4_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_alloc_v4_swab(struct bkey_s);
void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
.min_val_size = 48, \
})
-int bch2_bucket_gens_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_bucket_gens_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a));
}
+int bch2_dev_freespace_init(struct bch_fs *, struct bch_dev *, u64, u64);
int bch2_fs_freespace_init(struct bch_fs *);
void bch2_recalc_capacity(struct bch_fs *);
+u64 bch2_min_rw_member_capacity(struct bch_fs *);
void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
-#include "io.h"
+#include "io_write.h"
#include "journal.h"
#include "movinggc.h"
#include "nocow_locking.h"
struct bucket_alloc_state *s,
struct closure *cl)
{
- struct btree_iter iter;
- struct bkey_s_c k;
+ struct btree_iter iter, citer;
+ struct bkey_s_c k, ck;
struct open_bucket *ob = NULL;
- u64 alloc_start = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
- u64 alloc_cursor = max(alloc_start, READ_ONCE(ca->alloc_cursor));
+ u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
+ u64 alloc_start = max(first_bucket, READ_ONCE(ca->alloc_cursor));
+ u64 alloc_cursor = alloc_start;
int ret;
+
+ /*
+ * Scan with an uncached iterator to avoid polluting the key cache. An
+ * uncached iter will return a cached key if one exists, but if not
+ * there is no other underlying protection for the associated key cache
+ * slot. To avoid racing bucket allocations, look up the cached key slot
+ * of any likely allocation candidate before attempting to proceed with
+ * the allocation. This provides proper exclusion on the associated
+ * bucket.
+ */
again:
for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
BTREE_ITER_SLOTS, k, ret) {
continue;
a = bch2_alloc_to_v4(k, &a_convert);
-
if (a->data_type != BCH_DATA_free)
continue;
+ /* now check the cached key to serialize concurrent allocs of the bucket */
+ ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_CACHED);
+ ret = bkey_err(ck);
+ if (ret)
+ break;
+
+ a = bch2_alloc_to_v4(ck, &a_convert);
+ if (a->data_type != BCH_DATA_free)
+ goto next;
+
s->buckets_seen++;
ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl);
+next:
+ citer.path->preserve = false;
+ bch2_trans_iter_exit(trans, &citer);
if (ob)
break;
}
bch2_trans_iter_exit(trans, &iter);
+ alloc_cursor = iter.pos.offset;
ca->alloc_cursor = alloc_cursor;
if (!ob && ret)
ob = ERR_PTR(ret);
- if (!ob && alloc_cursor > alloc_start) {
- alloc_cursor = alloc_start;
+ if (!ob && alloc_start > first_bucket) {
+ alloc_cursor = alloc_start = first_bucket;
goto again;
}
}
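
The cursor handling above is a classic wrap-around scan: resume from wherever the previous allocation stopped, and only if nothing turns up between the cursor and the end of the device, rescan once from the first bucket. A toy standalone version of that shape; find_free() and the free_bucket bitmap are hypothetical stand-ins for the btree scan:

	#include <assert.h>

	static int free_bucket[16] = { [12] = 1 };	/* toy freelist: only bucket 12 free */

	static long find_free(unsigned long start, unsigned long end)
	{
		for (unsigned long b = start; b < end; b++)
			if (free_bucket[b])
				return b;
		return -1;
	}

	static long alloc_wraparound(unsigned long first_bucket, unsigned long nbuckets,
				     unsigned long *cursor)
	{
		unsigned long start = *cursor > first_bucket ? *cursor : first_bucket;
		long b = find_free(start, nbuckets);

		if (b < 0 && start > first_bucket)
			b = find_free(first_bucket, start);	/* wrap around once */
		if (b >= 0)
			*cursor = b + 1;	/* next scan resumes past this bucket */
		return b;
	}

	int main(void)
	{
		unsigned long cursor = 14;	/* pretend the previous alloc ended here */

		assert(alloc_wraparound(1, 16, &cursor) == 12 && cursor == 13);
		return 0;
	}
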
/**
- * bch_bucket_alloc - allocate a single bucket from a specific device
+ * bch2_bucket_alloc_trans - allocate a single bucket from a specific device
+ * @trans: transaction object
+ * @ca: device to allocate from
+ * @watermark: how important is this allocation?
+ * @cl: if not NULL, closure to be used to wait if buckets not available
+ * @usage:	the current device usage is also returned here
*
- * Returns index of bucket on success, 0 on failure
+ * Returns: an open_bucket on success, or an ERR_PTR() on failure.
*/
static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
struct bch_dev *ca,
struct open_bucket *ob;
bch2_trans_do(c, NULL, NULL, 0,
- PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, watermark,
+ PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
cl, &usage)));
return ob;
}
struct dev_alloc_list devs_sorted;
struct ec_stripe_head *h;
struct open_bucket *ob;
- struct bch_dev *ca;
unsigned i, ec_idx;
int ret = 0;
}
goto out_put_head;
got_bucket:
- ca = bch_dev_bkey_exists(c, ob->dev);
-
ob->ec_idx = ec_idx;
ob->ec = h->s;
ec_stripe_new_get(h->s, STRIPE_REF_io);
cl = _cl;
goto retry_blocking;
}
-
}
return ret;
return ret < 0 ? ret : 0;
}
+/**
+ * should_drop_bucket - check if this open_bucket should go away
+ * @ob: open_bucket to predicate on
+ * @c: filesystem handle
+ * @ca: if set, we're killing buckets for a particular device
+ * @ec:	if true, we're shutting down erasure coding and killing all ec
+ *	open_buckets; if neither @ca nor @ec is given, every open_bucket
+ *	is dropped
+ * Returns: true if we should kill this open_bucket
+ *
+ * We're killing open_buckets because we're shutting down a device, erasure
+ * coding, or the entire filesystem - check if this open_bucket matches:
+ */
static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
struct bch_dev *ca, bool ec)
{
NULL
};
+static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c,
+ struct write_point *wp)
+{
+ struct open_bucket *ob;
+ unsigned i;
+
+ prt_printf(out, "%lu: ", wp->write_point);
+ prt_human_readable_u64(out, wp->sectors_allocated);
+
+ prt_printf(out, " last wrote: ");
+ bch2_pr_time_units(out, sched_clock() - wp->last_used);
+
+ for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
+ prt_printf(out, " %s: ", bch2_write_point_states[i]);
+ bch2_pr_time_units(out, wp->time[i]);
+ }
+
+ prt_newline(out);
+
+ printbuf_indent_add(out, 2);
+ open_bucket_for_each(c, &wp->ptrs, ob, i)
+ bch2_open_bucket_to_text(out, c, ob);
+ printbuf_indent_sub(out, 2);
+}
+
void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
{
struct write_point *wp;
- unsigned i;
+ prt_str(out, "Foreground write points\n");
for (wp = c->write_points;
wp < c->write_points + ARRAY_SIZE(c->write_points);
- wp++) {
- prt_printf(out, "%lu: ", wp->write_point);
- prt_human_readable_u64(out, wp->sectors_allocated);
+ wp++)
+ bch2_write_point_to_text(out, c, wp);
- prt_printf(out, " last wrote: ");
- bch2_pr_time_units(out, sched_clock() - wp->last_used);
+ prt_str(out, "Copygc write point\n");
+ bch2_write_point_to_text(out, c, &c->copygc_write_point);
- for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
- prt_printf(out, " %s: ", bch2_write_point_states[i]);
- bch2_pr_time_units(out, wp->time[i]);
- }
+ prt_str(out, "Rebalance write point\n");
+ bch2_write_point_to_text(out, c, &c->rebalance_write_point);
- prt_newline(out);
- }
+ prt_str(out, "Btree write point\n");
+ bch2_write_point_to_text(out, c, &c->btree_write_point);
}
#include "bcachefs.h"
#include "alloc_types.h"
#include "extents.h"
-#include "super.h"
+#include "sb-members.h"
#include <linux/hash.h>
struct dev_stripe_state stripe;
u64 sectors_allocated;
- } __attribute__((__aligned__(SMP_CACHE_BYTES)));
+ } __aligned(SMP_CACHE_BYTES);
struct {
struct work_struct index_update_work;
enum write_point_state state;
u64 last_state_change;
u64 time[WRITE_POINT_STATE_NR];
- } __attribute__((__aligned__(SMP_CACHE_BYTES)));
+ } __aligned(SMP_CACHE_BYTES);
};
struct write_point_specifier {
#include "backpointers.h"
#include "btree_cache.h"
#include "btree_update.h"
+#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "error.h"
return false;
}
-int bch2_backpointer_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
struct bpos bucket = bp_pos_to_bucket(c, bp.k->p);
+ int ret = 0;
- if (!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset))) {
- prt_str(err, "backpointer at wrong pos");
- return -BCH_ERR_invalid_bkey;
- }
-
- return 0;
+ bkey_fsck_err_on(!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset)),
+ c, err,
+ backpointer_pos_wrong,
+ "backpointer at wrong pos");
+fsck_err:
+ return ret;
}
void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer *bp)
{
prt_printf(out, "btree=%s l=%u offset=%llu:%u len=%u pos=",
- bch2_btree_ids[bp->btree_id],
+ bch2_btree_id_str(bp->btree_id),
bp->level,
(u64) (bp->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT),
(u32) bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT),
{
struct bkey_s_backpointer bp = bkey_s_to_backpointer(k);
- bp.v->bucket_offset = swab32(bp.v->bucket_offset);
+ bp.v->bucket_offset = swab40(bp.v->bucket_offset);
bp.v->bucket_len = swab32(bp.v->bucket_len);
bch2_bpos_swab(&bp.v->pos);
}
static void backpointer_not_found(struct btree_trans *trans,
struct bpos bp_pos,
struct bch_backpointer bp,
- struct bkey_s_c k,
- const char *thing_it_points_to)
+ struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
+ /*
+ * If we're using the btree write buffer, the backpointer we were
+ * looking at may have already been deleted - failure to find what it
+ * pointed to is not an error:
+ */
if (likely(!bch2_backpointers_no_use_write_buffer))
return;
prt_printf(&buf, "backpointer doesn't match %s it points to:\n ",
- thing_it_points_to);
+ bp.level ? "btree node" : "extent");
prt_printf(&buf, "bucket: ");
bch2_bpos_to_text(&buf, bucket);
prt_printf(&buf, "\n ");
struct bch_backpointer bp,
unsigned iter_flags)
{
- struct bch_fs *c = trans->c;
- struct btree_root *r = bch2_btree_id_root(c, bp.btree_id);
- struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
- struct bkey_s_c k;
-
- bch2_trans_node_iter_init(trans, iter,
- bp.btree_id,
- bp.pos,
- 0,
- min(bp.level, r->level),
- iter_flags);
- k = bch2_btree_iter_peek_slot(iter);
- if (bkey_err(k)) {
- bch2_trans_iter_exit(trans, iter);
- return k;
- }
-
- if (bp.level == r->level + 1)
- k = bkey_i_to_s_c(&r->key);
-
- if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp))
- return k;
-
- bch2_trans_iter_exit(trans, iter);
+ if (likely(!bp.level)) {
+ struct bch_fs *c = trans->c;
+ struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
+ struct bkey_s_c k;
+
+ bch2_trans_node_iter_init(trans, iter,
+ bp.btree_id,
+ bp.pos,
+ 0, 0,
+ iter_flags);
+ k = bch2_btree_iter_peek_slot(iter);
+ if (bkey_err(k)) {
+ bch2_trans_iter_exit(trans, iter);
+ return k;
+ }
- if (unlikely(bch2_backpointers_no_use_write_buffer)) {
- if (bp.level) {
- struct btree *b;
+ if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp))
+ return k;
- /*
- * If a backpointer for a btree node wasn't found, it may be
- * because it was overwritten by a new btree node that hasn't
- * been written out yet - backpointer_get_node() checks for
- * this:
- */
- b = bch2_backpointer_get_node(trans, iter, bp_pos, bp);
- if (!IS_ERR_OR_NULL(b))
- return bkey_i_to_s_c(&b->key);
+ bch2_trans_iter_exit(trans, iter);
+ backpointer_not_found(trans, bp_pos, bp, k);
+ return bkey_s_c_null;
+ } else {
+ struct btree *b = bch2_backpointer_get_node(trans, iter, bp_pos, bp);
+ if (IS_ERR_OR_NULL(b)) {
bch2_trans_iter_exit(trans, iter);
-
- if (IS_ERR(b))
- return bkey_s_c_err(PTR_ERR(b));
- return bkey_s_c_null;
+ return IS_ERR(b) ? bkey_s_c_err(PTR_ERR(b)) : bkey_s_c_null;
}
-
- backpointer_not_found(trans, bp_pos, bp, k, "extent");
+ return bkey_i_to_s_c(&b->key);
}
-
- return bkey_s_c_null;
}
struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
bp.level - 1,
0);
b = bch2_btree_iter_peek_node(iter);
- if (IS_ERR(b))
+ if (IS_ERR_OR_NULL(b))
goto err;
- if (b && extent_matches_bp(c, bp.btree_id, bp.level,
- bkey_i_to_s_c(&b->key),
- bucket, bp))
+ BUG_ON(b->c.level != bp.level - 1);
+
+ if (extent_matches_bp(c, bp.btree_id, bp.level,
+ bkey_i_to_s_c(&b->key),
+ bucket, bp))
return b;
- if (b && btree_node_will_make_reachable(b)) {
+ if (btree_node_will_make_reachable(b)) {
b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
} else {
- backpointer_not_found(trans, bp_pos, bp,
- bkey_i_to_s_c(&b->key), "btree node");
+ backpointer_not_found(trans, bp_pos, bp, bkey_i_to_s_c(&b->key));
b = NULL;
}
err:
{
struct bch_fs *c = trans->c;
struct btree_iter alloc_iter = { NULL };
- struct bch_dev *ca;
struct bkey_s_c alloc_k;
struct printbuf buf = PRINTBUF;
int ret = 0;
if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c,
- "backpointer for mising device:\n%s",
+ backpointer_to_missing_device,
+ "backpointer for missing device:\n%s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
ret = bch2_btree_delete_at(trans, bp_iter, 0);
goto out;
}
- ca = bch_dev_bkey_exists(c, k.k->p.inode);
-
alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc,
bp_pos_to_bucket(c, k.k->p), 0);
ret = bkey_err(alloc_k);
goto out;
if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, c,
+ backpointer_to_missing_alloc,
"backpointer for nonexistent alloc key: %llu:%llu:0\n%s",
alloc_iter.pos.inode, alloc_iter.pos.offset,
(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
int ret;
ret = bch2_trans_run(c,
- for_each_btree_key_commit(&trans, iter,
+ for_each_btree_key_commit(trans, iter,
BTREE_ID_backpointers, POS_MIN, 0, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
- bch2_check_btree_backpointer(&trans, &iter, k)));
+ NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
+ bch2_check_btree_backpointer(trans, &iter, k)));
if (ret)
bch_err_fn(c, ret);
return ret;
return ret;
missing:
prt_printf(&buf, "missing backpointer for btree=%s l=%u ",
- bch2_btree_ids[bp.btree_id], bp.level);
+ bch2_btree_id_str(bp.btree_id), bp.level);
bch2_bkey_val_to_text(&buf, c, orig_k);
prt_printf(&buf, "\nbp pos ");
bch2_bpos_to_text(&buf, bp_iter.pos);
if (c->sb.version_upgrade_complete < bcachefs_metadata_version_backpointers ||
c->opts.reconstruct_alloc ||
- fsck_err(c, "%s", buf.buf))
+ fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf))
ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true);
goto out;
}
static int check_extent_to_backpointers(struct btree_trans *trans,
- struct btree_iter *iter,
+ enum btree_id btree, unsigned level,
struct bpos bucket_start,
struct bpos bucket_end,
- struct bpos_level *last_flushed)
+ struct bpos_level *last_flushed,
+ struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs;
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
- struct bkey_s_c k;
int ret;
- k = bch2_btree_iter_peek_all_levels(iter);
- ret = bkey_err(k);
- if (ret)
- return ret;
- if (!k.k)
- return 0;
-
ptrs = bch2_bkey_ptrs_c(k);
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
struct bpos bucket_pos;
if (p.ptr.cached)
continue;
- bch2_extent_ptr_to_bp(c, iter->btree_id, iter->path->level,
+ bch2_extent_ptr_to_bp(c, btree, level,
k, p, &bucket_pos, &bp);
ret = check_bp_exists(trans, bucket_pos, bp, k,
enum btree_id btree_id,
struct bpos bucket_start,
struct bpos bucket_end,
- struct bpos_level *last_flushed)
+ struct bpos_level *last_flushed,
+ int *level)
{
struct bch_fs *c = trans->c;
- struct btree_root *r = bch2_btree_id_root(c, btree_id);
struct btree_iter iter;
struct btree *b;
struct bkey_s_c k;
- struct bkey_ptrs_c ptrs;
- struct extent_ptr_decoded p;
- const union bch_extent_entry *entry;
int ret;
-
- bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, r->level, 0);
+retry:
+ bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN,
+ 0, bch2_btree_id_root(c, btree_id)->b->c.level, 0);
b = bch2_btree_iter_peek_node(&iter);
ret = PTR_ERR_OR_ZERO(b);
if (ret)
goto err;
- BUG_ON(b != btree_node_root(c, b));
-
- k = bkey_i_to_s_c(&b->key);
- ptrs = bch2_bkey_ptrs_c(k);
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- struct bpos bucket_pos;
- struct bch_backpointer bp;
-
- if (p.ptr.cached)
- continue;
+ if (b != btree_node_root(c, b)) {
+ bch2_trans_iter_exit(trans, &iter);
+ goto retry;
+ }
- bch2_extent_ptr_to_bp(c, iter.btree_id, b->c.level + 1,
- k, p, &bucket_pos, &bp);
+ *level = b->c.level;
- ret = check_bp_exists(trans, bucket_pos, bp, k,
+ k = bkey_i_to_s_c(&b->key);
+ ret = check_extent_to_backpointers(trans, btree_id, b->c.level + 1,
bucket_start, bucket_end,
- last_flushed);
- if (ret)
- goto err;
- }
+ last_flushed, k);
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
struct bch_fs *c = trans->c;
struct btree_iter iter;
enum btree_id btree_id;
- struct bpos_level last_flushed = { UINT_MAX };
+ struct bkey_s_c k;
+ struct bpos_level last_flushed = { UINT_MAX, POS_MIN };
int ret = 0;
for (btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) {
- unsigned depth = btree_type_has_ptrs(btree_id) ? 0 : 1;
-
- bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0,
- depth,
- BTREE_ITER_ALL_LEVELS|
- BTREE_ITER_PREFETCH);
-
- do {
- ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL,
- check_extent_to_backpointers(trans, &iter,
- bucket_start, bucket_end,
- &last_flushed));
- if (ret)
- break;
- } while (!bch2_btree_iter_advance(&iter));
-
- bch2_trans_iter_exit(trans, &iter);
-
- if (ret)
- break;
+ int level, depth = btree_type_has_ptrs(btree_id) ? 0 : 1;
ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL,
+ BCH_TRANS_COMMIT_lazy_rw|
+ BCH_TRANS_COMMIT_no_enospc,
check_btree_root_to_backpointers(trans, btree_id,
bucket_start, bucket_end,
- &last_flushed));
+ &last_flushed, &level));
if (ret)
- break;
+ return ret;
+
+ while (level >= depth) {
+ bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0,
+ level,
+ BTREE_ITER_PREFETCH);
+ for_each_btree_key_continue(trans, iter, BTREE_ITER_PREFETCH, k, ret) {
+ ret = commit_do(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_lazy_rw|
+ BCH_TRANS_COMMIT_no_enospc,
+ check_extent_to_backpointers(trans, btree_id, level,
+ bucket_start, bucket_end,
+ &last_flushed, k));
+ if (ret)
+ break;
+
+ if (bpos_eq(iter.pos, SPOS_MAX))
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret)
+ return ret;
+
+ --level;
+ }
}
- return ret;
+
+ return 0;
}
static struct bpos bucket_pos_to_bp_safe(const struct bch_fs *c,
--btree_nodes;
if (!btree_nodes) {
- *end = alloc_k.k->p;
+ *end = alloc_k.k ? alloc_k.k->p : SPOS_MAX;
break;
}
int bch2_check_extents_to_backpointers(struct bch_fs *c)
{
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct bpos start = POS_MIN, end;
int ret;
- bch2_trans_init(&trans, c, 0, 0);
while (1) {
- ret = bch2_get_alloc_in_memory_pos(&trans, start, &end);
+ ret = bch2_get_alloc_in_memory_pos(trans, start, &end);
if (ret)
break;
printbuf_exit(&buf);
}
- ret = bch2_check_extents_to_backpointers_pass(&trans, start, end);
+ ret = bch2_check_extents_to_backpointers_pass(trans, start, end);
if (ret || bpos_eq(end, SPOS_MAX))
break;
start = bpos_successor(end);
}
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
if (ret)
bch_err_fn(c, ret);
}
if (fsck_err_on(!k.k, c,
- "backpointer for missing extent\n %s",
+ backpointer_to_missing_ptr,
+ "backpointer for missing %s\n %s",
+ bp.v->level ? "btree node" : "extent",
(bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) {
ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p);
goto out;
return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers,
POS_MIN, BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
check_one_backpointer(trans, start, end,
bkey_s_c_to_backpointer(k),
&last_flushed_pos));
int bch2_check_backpointers_to_extents(struct bch_fs *c)
{
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end;
int ret;
- bch2_trans_init(&trans, c, 0, 0);
while (1) {
- ret = bch2_get_btree_in_memory_pos(&trans,
+ ret = bch2_get_btree_in_memory_pos(trans,
(1U << BTREE_ID_extents)|
(1U << BTREE_ID_reflink),
~0,
printbuf_exit(&buf);
}
- ret = bch2_check_backpointers_to_extents_pass(&trans, start, end);
+ ret = bch2_check_backpointers_to_extents_pass(trans, start, end);
if (ret || !bbpos_cmp(end, BBPOS_MAX))
break;
start = bbpos_successor(end);
}
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
if (ret)
bch_err_fn(c, ret);
#include "buckets.h"
#include "super.h"
-int bch2_backpointer_invalid(const struct bch_fs *, struct bkey_s_c k,
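+/* byte-swap a 40-bit integer (backpointers store bucket offsets in 40 bits): */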
+static inline u64 swab40(u64 x)
+{
+ return (((x & 0x00000000ffULL) << 32)|
+ ((x & 0x000000ff00ULL) << 16)|
+ ((x & 0x0000ff0000ULL) >> 0)|
+ ((x & 0x00ff000000ULL) >> 16)|
+ ((x & 0xff00000000ULL) >> 32));
+}
+
+int bch2_backpointer_invalid(struct bch_fs *, struct bkey_s_c k,
enum bkey_invalid_flags, struct printbuf *);
void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *);
void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#ifndef _BCACHEFS_BBPOS_H
#define _BCACHEFS_BBPOS_H
+#include "bbpos_types.h"
#include "bkey_methods.h"
-
-struct bbpos {
- enum btree_id btree;
- struct bpos pos;
-};
-
-static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos)
-{
- return (struct bbpos) { btree, pos };
-}
-
-#define BBPOS_MIN BBPOS(0, POS_MIN)
-#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX)
+#include "btree_cache.h"
static inline int bbpos_cmp(struct bbpos l, struct bbpos r)
{
static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos)
{
- prt_str(out, bch2_btree_ids[pos.btree]);
+ prt_str(out, bch2_btree_id_str(pos.btree));
prt_char(out, ':');
bch2_bpos_to_text(out, pos.pos);
}
#include "nocow_locking_types.h"
#include "opts.h"
#include "recovery_types.h"
+#include "sb-errors_types.h"
#include "seqmutex.h"
#include "util.h"
printk_ratelimited(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
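+/* transaction restarts are expected and retried; don't log them as errors: */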
#define bch_err_fn(_c, _ret) \
- bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret))
-#define bch_err_msg(_c, _ret, _msg) \
- bch_err(_c, "%s(): error " _msg " %s", __func__, bch2_err_str(_ret))
+do { \
+ if (_ret && !bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+ bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\
+} while (0)
+
+#define bch_err_msg(_c, _ret, _msg, ...) \
+do { \
+ if (_ret && !bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+ bch_err(_c, "%s(): error " _msg " %s", __func__, \
+ ##__VA_ARGS__, bch2_err_str(_ret)); \
+} while (0)
#define bch_verbose(c, fmt, ...) \
do { \
#undef BCH_DEBUG_PARAM
#ifndef CONFIG_BCACHEFS_DEBUG
-#define BCH_DEBUG_PARAM(name, description) static const bool bch2_##name;
+#define BCH_DEBUG_PARAM(name, description) static const __maybe_unused bool bch2_##name;
BCH_DEBUG_PARAMS_DEBUG()
#undef BCH_DEBUG_PARAM
#endif
x(journal_flush_write) \
x(journal_noflush_write) \
x(journal_flush_seq) \
- x(blocked_journal) \
+ x(blocked_journal_low_on_space) \
+ x(blocked_journal_low_on_pin) \
+ x(blocked_journal_max_in_flight) \
x(blocked_allocate) \
x(blocked_allocate_open_bucket) \
x(nocow_lock_contended)
#include "buckets_types.h"
#include "buckets_waiting_for_journal_types.h"
#include "clock_types.h"
+#include "disk_groups_types.h"
#include "ec_types.h"
#include "journal_types.h"
#include "keylist_types.h"
GC_PHASE_BTREE_bucket_gens,
GC_PHASE_BTREE_snapshot_trees,
GC_PHASE_BTREE_deleted_inodes,
+ GC_PHASE_BTREE_logged_ops,
+ GC_PHASE_BTREE_rebalance_work,
GC_PHASE_PENDING_DELETE,
};
* Committed by bch2_write_super() -> bch_fs_mi_update()
*/
struct bch_member_cpu mi;
+ atomic64_t errors[BCH_MEMBER_ERROR_NR];
+
__uuid_t uuid;
char name[BDEVNAME_SIZE];
BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */
BCH_FS_NEED_ANOTHER_GC,
- BCH_FS_HAVE_DELETED_SNAPSHOTS,
+ BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS,
/* errors: */
BCH_FS_ERROR,
u64 start;
u64 end;
bool dirty;
- } entries[0];
+ } entries[];
};
struct journal_keys {
size_t size;
};
-struct btree_path_buf {
- struct btree_path *path;
+struct btree_trans_buf {
+ struct btree_trans *trans;
};
#define REPLICAS_DELTA_LIST_MAX (1U << 16)
struct snapshot_table __rcu *snapshots;
size_t snapshot_table_size;
struct mutex snapshot_table_lock;
+ struct rw_semaphore snapshot_create_lock;
struct work_struct snapshot_delete_work;
struct work_struct snapshot_wait_for_pagecache_and_delete_work;
/* btree_iter.c: */
struct seqmutex btree_trans_lock;
struct list_head btree_trans_list;
- mempool_t btree_paths_pool;
+ mempool_t btree_trans_pool;
mempool_t btree_trans_mem_pool;
- struct btree_path_buf __percpu *btree_paths_bufs;
+ struct btree_trans_buf __percpu *btree_trans_bufs;
struct srcu_struct btree_trans_barrier;
bool btree_trans_barrier_initialized;
struct list_head moving_context_list;
struct mutex moving_context_lock;
- struct list_head data_progress_list;
- struct mutex data_progress_lock;
-
/* REBALANCE */
struct bch_fs_rebalance rebalance;
struct bio_set dio_read_bioset;
struct bio_set nocow_flush_bioset;
- /* ERRORS */
- struct list_head fsck_errors;
- struct mutex fsck_error_lock;
- bool fsck_alloc_err;
-
/* QUOTAS */
struct bch_memquota_type quotas[QTYP_NR];
enum bch_recovery_pass curr_recovery_pass;
/* bitmap of explicitly enabled recovery passes: */
u64 recovery_passes_explicit;
+ u64 recovery_passes_complete;
/* DEBUG JUNK */
struct dentry *fs_debug_dir;
struct bch2_time_stats times[BCH_TIME_STAT_NR];
struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR];
+
+ /* ERRORS */
+ struct list_head fsck_error_msgs;
+ struct mutex fsck_error_msgs_lock;
+ bool fsck_alloc_msgs_err;
+
+ bch_sb_errors_cpu fsck_error_counts;
+ struct mutex fsck_error_counts_lock;
};
extern struct wait_queue_head bch2_read_only_wait;
return dev < c->sb.nr_devices && c->devs[dev];
}
-/*
- * For when we need to rewind recovery passes and run a pass we skipped:
- */
-static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c,
- enum bch_recovery_pass pass)
-{
- c->recovery_passes_explicit |= BIT_ULL(pass);
-
- if (c->curr_recovery_pass >= pass) {
- c->curr_recovery_pass = pass;
- return -BCH_ERR_restart_recovery;
- } else {
- return 0;
- }
-}
-
#define BKEY_PADDED_ONSTACK(key, pad) \
struct { struct bkey_i key; __u64 key ## _pad[pad]; }
#endif
#define BITMASK(name, type, field, offset, end) \
-static const unsigned name##_OFFSET = offset; \
-static const unsigned name##_BITS = (end - offset); \
+static const __maybe_unused unsigned name##_OFFSET = offset; \
+static const __maybe_unused unsigned name##_BITS = (end - offset); \
\
static inline __u64 name(const type *k) \
{ \
}
#define LE_BITMASK(_bits, name, type, field, offset, end) \
-static const unsigned name##_OFFSET = offset; \
-static const unsigned name##_BITS = (end - offset); \
-static const __u##_bits name##_MAX = (1ULL << (end - offset)) - 1; \
+static const __maybe_unused unsigned name##_OFFSET = offset; \
+static const __maybe_unused unsigned name##_BITS = (end - offset); \
+static const __maybe_unused __u##_bits name##_MAX = (1ULL << (end - offset)) - 1;\
\
static inline __u64 name(const type *k) \
{ \
x(backpointer, 28) \
x(inode_v3, 29) \
x(bucket_gens, 30) \
- x(snapshot_tree, 31)
+ x(snapshot_tree, 31) \
+ x(logged_op_truncate, 32) \
+ x(logged_op_finsert, 33)
enum bch_bkey_type {
#define x(name, nr) KEY_TYPE_##name = nr,
#endif
};
-struct bch_extent_reservation {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:6,
- unused:22,
- replicas:4,
- generation:32;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 generation:32,
- replicas:4,
- unused:22,
- type:6;
-#endif
-};
-
struct bch_extent_rebalance {
#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:7,
- unused:33,
- compression:8,
+ __u64 type:6,
+ unused:34,
+ compression:8, /* enum bch_compression_opt */
target:16;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 target:16,
compression:8,
- unused:33,
- type:7;
+ unused:34,
+ type:6;
#endif
};
__le64 bi_hash_seed;
__le32 bi_flags;
__le16 bi_mode;
- __u8 fields[0];
+ __u8 fields[];
} __packed __aligned(8);
struct bch_inode_v2 {
__le64 bi_hash_seed;
__le64 bi_flags;
__le16 bi_mode;
- __u8 fields[0];
+ __u8 fields[];
} __packed __aligned(8);
struct bch_inode_v3 {
__le64 bi_sectors;
__le64 bi_size;
__le64 bi_version;
- __u8 fields[0];
+ __u8 fields[];
} __packed __aligned(8);
#define INODEv3_FIELDS_START_INITIAL 6
Inode_opt_nr,
};
-enum {
- /*
- * User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL
- * flags)
- */
- __BCH_INODE_SYNC = 0,
- __BCH_INODE_IMMUTABLE = 1,
- __BCH_INODE_APPEND = 2,
- __BCH_INODE_NODUMP = 3,
- __BCH_INODE_NOATIME = 4,
-
- __BCH_INODE_I_SIZE_DIRTY = 5,
- __BCH_INODE_I_SECTORS_DIRTY = 6,
- __BCH_INODE_UNLINKED = 7,
- __BCH_INODE_BACKPTR_UNTRUSTED = 8,
-
- /* bits 20+ reserved for packed fields below: */
-};
-
-#define BCH_INODE_SYNC (1 << __BCH_INODE_SYNC)
-#define BCH_INODE_IMMUTABLE (1 << __BCH_INODE_IMMUTABLE)
-#define BCH_INODE_APPEND (1 << __BCH_INODE_APPEND)
-#define BCH_INODE_NODUMP (1 << __BCH_INODE_NODUMP)
-#define BCH_INODE_NOATIME (1 << __BCH_INODE_NOATIME)
-#define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY)
-#define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY)
-#define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED)
-#define BCH_INODE_BACKPTR_UNTRUSTED (1 << __BCH_INODE_BACKPTR_UNTRUSTED)
+#define BCH_INODE_FLAGS() \
+ x(sync, 0) \
+ x(immutable, 1) \
+ x(append, 2) \
+ x(nodump, 3) \
+ x(noatime, 4) \
+ x(i_size_dirty, 5) \
+ x(i_sectors_dirty, 6) \
+ x(unlinked, 7) \
+ x(backptr_untrusted, 8)
+
+/* bits 20+ reserved for packed fields below: */
+
+enum bch_inode_flags {
+#define x(t, n) BCH_INODE_##t = 1U << n,
+ BCH_INODE_FLAGS()
+#undef x
+};
+
+enum __bch_inode_flags {
+#define x(t, n) __BCH_INODE_##t = n,
+ BCH_INODE_FLAGS()
+#undef x
+};
LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24);
LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
#define DT_SUBVOL 16
#define BCH_DT_MAX 17
-#define BCH_NAME_MAX ((unsigned) (U8_MAX * sizeof(__u64) - \
- sizeof(struct bkey) - \
- offsetof(struct bch_dirent, d_name)))
+#define BCH_NAME_MAX 512
/* Xattrs */
struct bch_val v;
__le64 refcount;
union bch_extent_entry start[0];
- __u64 _data[0];
+ __u64 _data[];
} __packed __aligned(8);
struct bch_indirect_inline_data {
struct bch_val v;
__le64 refcount;
- u8 data[0];
+ u8 data[];
};
/* Inline data */
struct bch_inline_data {
struct bch_val v;
- u8 data[0];
+ u8 data[];
};
/* Subvolumes: */
__le32 flags;
__le32 snapshot;
__le64 inode;
+ /*
+ * Snapshot subvolumes form a tree, separate from the snapshot nodes
+ * tree - if this subvolume is a snapshot, this is the ID of the
+ * subvolume it was created from:
+ */
__le32 parent;
__le32 pad;
bch_le128 otime;
__le32 parent;
__le32 children[2];
__le32 subvol;
+ /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */
__le32 tree;
__le32 depth;
__le32 skip[3];
#define LRU_ID_STRIPES (1U << 16)
+/* Logged operations btree: */
+
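+/*
+ * Operations that span multiple transactions (truncate, finsert) are recorded
+ * here so that recovery can complete them after an unclean shutdown:
+ */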
+struct bch_logged_op_truncate {
+ struct bch_val v;
+ __le32 subvol;
+ __le32 pad;
+ __le64 inum;
+ __le64 new_i_size;
+};
+
+enum logged_op_finsert_state {
+ LOGGED_OP_FINSERT_start,
+ LOGGED_OP_FINSERT_shift_extents,
+ LOGGED_OP_FINSERT_finish,
+};
+
+struct bch_logged_op_finsert {
+ struct bch_val v;
+ __u8 state;
+ __u8 pad[3];
+ __le32 subvol;
+ __le64 inum;
+ __le64 dst_offset;
+ __le64 src_offset;
+ __le64 pos;
+};
+
/* Optional/variable size superblock sections: */
struct bch_sb_field {
#define BCH_SB_FIELDS() \
x(journal, 0) \
- x(members, 1) \
+ x(members_v1, 1) \
x(crypt, 2) \
x(replicas_v0, 3) \
x(quota, 4) \
x(replicas, 7) \
x(journal_seq_blacklist, 8) \
x(journal_v2, 9) \
- x(counters, 10)
+ x(counters, 10) \
+ x(members_v2, 11) \
+ x(errors, 12)
enum bch_sb_field_type {
#define x(f, nr) BCH_SB_FIELD_##f = nr,
struct bch_sb_field_journal {
struct bch_sb_field field;
- __le64 buckets[0];
+ __le64 buckets[];
};
struct bch_sb_field_journal_v2 {
struct bch_sb_field_journal_v2_entry {
__le64 start;
__le64 nr;
- } d[0];
+ } d[];
};
-/* BCH_SB_FIELD_members: */
+/* BCH_SB_FIELD_members_v1: */
#define BCH_MIN_NR_NBUCKETS (1 << 6)
+#define BCH_IOPS_MEASUREMENTS() \
+ x(seqread, 0) \
+ x(seqwrite, 1) \
+ x(randread, 2) \
+ x(randwrite, 3)
+
+enum bch_iops_measurement {
+#define x(t, n) BCH_IOPS_##t = n,
+ BCH_IOPS_MEASUREMENTS()
+#undef x
+ BCH_IOPS_NR
+};
+
+#define BCH_MEMBER_ERROR_TYPES() \
+ x(read, 0) \
+ x(write, 1) \
+ x(checksum, 2)
+
+enum bch_member_error_type {
+#define x(t, n) BCH_MEMBER_ERROR_##t = n,
+ BCH_MEMBER_ERROR_TYPES()
+#undef x
+ BCH_MEMBER_ERROR_NR
+};
+
struct bch_member {
__uuid_t uuid;
__le64 nbuckets; /* device size */
__le32 pad;
__le64 last_mount; /* time_t */
- __le64 flags[2];
+ __le64 flags;
+ __le32 iops[4];
+ __le64 errors[BCH_MEMBER_ERROR_NR];
+ __le64 errors_at_reset[BCH_MEMBER_ERROR_NR];
+ __le64 errors_reset_time;
};
-LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4)
+#define BCH_MEMBER_V1_BYTES 56
+
+LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4)
/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */
-LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15)
-LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20)
-LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28)
-LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags[0], 28, 30)
+LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15)
+LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags, 15, 20)
+LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28)
+LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30)
LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED,
- struct bch_member, flags[0], 30, 31)
+ struct bch_member, flags, 30, 31)
#if 0
LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20);
BCH_MEMBER_STATE_NR
};
-struct bch_sb_field_members {
+struct bch_sb_field_members_v1 {
+ struct bch_sb_field field;
+	struct bch_member	_members[]; /* members are now variable size */
+};
+
+struct bch_sb_field_members_v2 {
struct bch_sb_field field;
- struct bch_member members[0];
+	__le16		member_bytes;	/* size of a single member entry */
+ u8 pad[6];
+ struct bch_member _members[];
};
/* BCH_SB_FIELD_crypt: */
struct bch_replicas_entry_v0 {
__u8 data_type;
__u8 nr_devs;
- __u8 devs[0];
+ __u8 devs[];
} __packed;
struct bch_sb_field_replicas_v0 {
struct bch_sb_field field;
- struct bch_replicas_entry_v0 entries[0];
+ struct bch_replicas_entry_v0 entries[];
} __packed __aligned(8);
struct bch_replicas_entry {
__u8 data_type;
__u8 nr_devs;
__u8 nr_required;
- __u8 devs[0];
+ __u8 devs[];
} __packed;
#define replicas_entry_bytes(_i) \
struct bch_sb_field_replicas {
struct bch_sb_field field;
- struct bch_replicas_entry entries[0];
+ struct bch_replicas_entry entries[];
} __packed __aligned(8);
/* BCH_SB_FIELD_quota: */
struct bch_sb_field_disk_groups {
struct bch_sb_field field;
- struct bch_disk_group entries[0];
+ struct bch_disk_group entries[];
} __packed __aligned(8);
/* BCH_SB_FIELD_counters */
struct bch_sb_field_counters {
struct bch_sb_field field;
- __le64 d[0];
+ __le64 d[];
};
/*
__u8 type; /* designates what this jset holds */
__u8 pad[3];
- union {
- struct bkey_i start[0];
- __u64 _data[0];
- };
+ struct bkey_i start[0];
+ __u64 _data[];
};
struct bch_sb_field_clean {
__le16 _write_clock;
__le64 journal_seq;
- union {
- struct jset_entry start[0];
- __u64 _data[0];
- };
+ struct jset_entry start[0];
+ __u64 _data[];
};
struct journal_seq_blacklist_entry {
struct bch_sb_field_journal_seq_blacklist {
struct bch_sb_field field;
+ struct journal_seq_blacklist_entry start[];
+};
- union {
- struct journal_seq_blacklist_entry start[0];
- __u64 _data[0];
- };
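+/*
+ * Persistent counts of fsck errors: @v packs the error ID and occurrence count
+ * (see the bitmasks below), alongside the time the error was last seen:
+ */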
+struct bch_sb_field_errors {
+ struct bch_sb_field field;
+ struct bch_sb_field_error_entry {
+ __le64 v;
+ __le64 last_error_time;
+ } entries[];
};
+LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16);
+LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64);
+
/* Superblock: */
/*
x(snapshot_skiplists, BCH_VERSION(1, 1), \
BIT_ULL(BCH_RECOVERY_PASS_check_snapshots)) \
x(deleted_inodes, BCH_VERSION(1, 2), \
- BIT_ULL(BCH_RECOVERY_PASS_check_inodes))
+ BIT_ULL(BCH_RECOVERY_PASS_check_inodes)) \
+ x(rebalance_work, BCH_VERSION(1, 3), \
+ BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
bcachefs_metadata_version_max
};
-static const unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_major_minor;
+static const __maybe_unused
+unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_rebalance_work;
#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1)
struct bch_sb_layout layout;
- union {
- struct bch_sb_field start[0];
- __le64 _data[0];
- };
+ struct bch_sb_field start[0];
+ __le64 _data[];
} __packed __aligned(8);
/*
BCH_CSUM_NR
};
-static const unsigned bch_crc_bytes[] = {
+static const __maybe_unused unsigned bch_crc_bytes[] = {
[BCH_CSUM_none] = 0,
[BCH_CSUM_crc32c_nonzero] = 4,
[BCH_CSUM_crc32c] = 4,
__le64 last_seq;
- union {
- struct jset_entry start[0];
- __u64 _data[0];
- };
+ struct jset_entry start[0];
+ __u64 _data[];
} __packed __aligned(8);
LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4);
enum btree_id_flags {
BTREE_ID_EXTENTS = BIT(0),
BTREE_ID_SNAPSHOTS = BIT(1),
- BTREE_ID_DATA = BIT(2),
+ BTREE_ID_SNAPSHOT_FIELD = BIT(2),
+ BTREE_ID_DATA = BIT(3),
};
#define BCH_BTREE_IDS() \
BIT_ULL(KEY_TYPE_bucket_gens)) \
x(snapshot_trees, 15, 0, \
BIT_ULL(KEY_TYPE_snapshot_tree)) \
- x(deleted_inodes, 16, BTREE_ID_SNAPSHOTS, \
- BIT_ULL(KEY_TYPE_set))
+ x(deleted_inodes, 16, BTREE_ID_SNAPSHOT_FIELD, \
+ BIT_ULL(KEY_TYPE_set)) \
+ x(logged_ops, 17, 0, \
+ BIT_ULL(KEY_TYPE_logged_op_truncate)| \
+ BIT_ULL(KEY_TYPE_logged_op_finsert)) \
+ x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \
+ BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie))
enum btree_id {
#define x(name, nr, ...) BTREE_ID_##name = nr,
__le16 version;
__le16 u64s; /* count of d[] in u64s */
- union {
- struct bkey_packed start[0];
- __u64 _data[0];
- };
+ struct bkey_packed start[0];
+ __u64 _data[];
} __packed __aligned(8);
LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4);
#include "bset.h"
#include "util.h"
-#undef EBUG_ON
-
-#ifdef DEBUG_BKEYS
-#define EBUG_ON(cond) BUG_ON(cond)
-#else
-#define EBUG_ON(cond)
-#endif
-
const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT;
void bch2_bkey_packed_to_binary_text(struct printbuf *out,
struct bkey_packed *k)
{
EBUG_ON(state->p < k->_data);
- EBUG_ON(state->p >= k->_data + state->format->key_u64s);
+ EBUG_ON(state->p >= (u64 *) k->_data + state->format->key_u64s);
*state->p = state->w;
}
return v + offset;
}
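+/*
+ * Write @v into the packed bitstream for @field, spilling into the next word
+ * when the field straddles a word boundary:
+ */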
+__always_inline
+static void __set_inc_field(struct pack_state *state, unsigned field, u64 v)
+{
+ unsigned bits = state->format->bits_per_field[field];
+
+ if (bits) {
+ if (bits > state->bits) {
+ bits -= state->bits;
+ /* avoid shift by 64 if bits is 64 - bits is never 0 here: */
+ state->w |= (v >> 1) >> (bits - 1);
+
+ *state->p = state->w;
+ state->p = next_word(state->p);
+ state->w = 0;
+ state->bits = 64;
+ }
+
+ state->bits -= bits;
+ state->w |= v << state->bits;
+ }
+}
+
__always_inline
static bool set_inc_field(struct pack_state *state, unsigned field, u64 v)
{
if (fls64(v) > bits)
return false;
- if (bits > state->bits) {
- bits -= state->bits;
- /* avoid shift by 64 if bits is 0 - bits is never 64 here: */
- state->w |= (v >> 1) >> (bits - 1);
-
- *state->p = state->w;
- state->p = next_word(state->p);
- state->w = 0;
- state->bits = 64;
- }
-
- state->bits -= bits;
- state->w |= v << state->bits;
-
+ __set_inc_field(state, field, v);
return true;
}
/**
* bch2_bkey_pack_key -- pack just the key, not the value
+ * @out: packed result
+ * @in: key to pack
+ * @format: format of packed result
+ *
+ * Returns: true on success, false on failure
*/
bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
- const struct bkey_format *format)
+ const struct bkey_format *format)
{
struct pack_state state = pack_state_init(format, out);
u64 *w = out->_data;
/**
* bch2_bkey_unpack -- unpack the key and the value
+ * @b: btree node of @src key (for packed format)
+ * @dst: unpacked result
+ * @src: packed input
*/
void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst,
- const struct bkey_packed *src)
+ const struct bkey_packed *src)
{
__bkey_unpack_key(b, &dst->k, src);
/**
* bch2_bkey_pack -- pack the key and the value
+ * @dst: packed result
+ * @src: unpacked input
+ * @format: format of packed result
+ *
+ * Returns: true on success, false on failure
*/
-bool bch2_bkey_pack(struct bkey_packed *out, const struct bkey_i *in,
- const struct bkey_format *format)
+bool bch2_bkey_pack(struct bkey_packed *dst, const struct bkey_i *src,
+ const struct bkey_format *format)
{
struct bkey_packed tmp;
- if (!bch2_bkey_pack_key(&tmp, &in->k, format))
+ if (!bch2_bkey_pack_key(&tmp, &src->k, format))
return false;
- memmove_u64s((u64 *) out + format->key_u64s,
- &in->v,
- bkey_val_u64s(&in->k));
- memcpy_u64s_small(out, &tmp, format->key_u64s);
+ memmove_u64s((u64 *) dst + format->key_u64s,
+ &src->v,
+ bkey_val_u64s(&src->k));
+ memcpy_u64s_small(dst, &tmp, format->key_u64s);
return true;
}
ret = false;
}
- if (bits > state->bits) {
- bits -= state->bits;
- state->w |= (v >> 1) >> (bits - 1);
-
- *state->p = state->w;
- state->p = next_word(state->p);
- state->w = 0;
- state->bits = 64;
- }
-
- state->bits -= bits;
- state->w |= v << state->bits;
-
+ __set_inc_field(state, field, v);
return ret;
}
return false;
}
+
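+/*
+ * Returns true if any field, once its field_offset is added, could encode a
+ * value larger than the unpacked (in-memory) representation can hold:
+ */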
+static bool bkey_format_has_too_big_fields(const struct bkey_format *f)
+{
+ for (unsigned i = 0; i < f->nr_fields; i++) {
+ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+ u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
+ u64 packed_max = f->bits_per_field[i]
+ ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1))
+ : 0;
+ u64 field_offset = le64_to_cpu(f->field_offset[i]);
+
+ if (packed_max + field_offset < packed_max ||
+ packed_max + field_offset > unpacked_max)
+ return true;
+ }
+
+ return false;
+}
#endif
/*
BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0);
BUG_ON(bkey_packed_successor(&successor, b, *out) &&
- bkey_cmp_left_packed(b, &successor, &orig) < 0);
+ bkey_cmp_left_packed(b, &successor, &orig) < 0 &&
+ !bkey_format_has_too_big_fields(f));
}
#endif
/* allow for extent merging: */
if (ret.bits_per_field[BKEY_FIELD_SIZE]) {
- ret.bits_per_field[BKEY_FIELD_SIZE] += 4;
- bits += 4;
+ unsigned b = min(4U, 32U - ret.bits_per_field[BKEY_FIELD_SIZE]);
+
+ ret.bits_per_field[BKEY_FIELD_SIZE] += b;
+ bits += b;
}
ret.key_u64s = DIV_ROUND_UP(bits, 64);
}
}
- EBUG_ON(bch2_bkey_format_validate(&ret));
+#ifdef CONFIG_BCACHEFS_DEBUG
+ {
+ struct printbuf buf = PRINTBUF;
+
+ BUG_ON(bch2_bkey_format_invalid(NULL, &ret, 0, &buf));
+ printbuf_exit(&buf);
+ }
+#endif
return ret;
}
-const char *bch2_bkey_format_validate(struct bkey_format *f)
+int bch2_bkey_format_invalid(struct bch_fs *c,
+ struct bkey_format *f,
+ enum bkey_invalid_flags flags,
+ struct printbuf *err)
{
unsigned i, bits = KEY_PACKED_BITS_START;
- if (f->nr_fields != BKEY_NR_FIELDS)
- return "incorrect number of fields";
+ if (f->nr_fields != BKEY_NR_FIELDS) {
+ prt_printf(err, "incorrect number of fields: got %u, should be %u",
+ f->nr_fields, BKEY_NR_FIELDS);
+ return -BCH_ERR_invalid;
+ }
/*
* Verify that the packed format can't represent fields larger than the
* unpacked format:
*/
for (i = 0; i < f->nr_fields; i++) {
- unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
- u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
- u64 packed_max = f->bits_per_field[i]
- ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1))
- : 0;
- u64 field_offset = le64_to_cpu(f->field_offset[i]);
-
- if (packed_max + field_offset < packed_max ||
- packed_max + field_offset > unpacked_max)
- return "field too large";
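+		/*
+		 * Formats from filesystems older than the snapshot version may
+		 * fail this check; only enforce it on newer versions:
+		 */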
+ if (!c || c->sb.version_min >= bcachefs_metadata_version_snapshot) {
+ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+ u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
+ u64 packed_max = f->bits_per_field[i]
+ ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1))
+ : 0;
+ u64 field_offset = le64_to_cpu(f->field_offset[i]);
+
+ if (packed_max + field_offset < packed_max ||
+ packed_max + field_offset > unpacked_max) {
+ prt_printf(err, "field %u too large: %llu + %llu > %llu",
+ i, packed_max, field_offset, unpacked_max);
+ return -BCH_ERR_invalid;
+ }
+ }
bits += f->bits_per_field[i];
}
- if (f->key_u64s != DIV_ROUND_UP(bits, 64))
- return "incorrect key_u64s";
+ if (f->key_u64s != DIV_ROUND_UP(bits, 64)) {
+ prt_printf(err, "incorrect key_u64s: got %u, should be %u",
+ f->key_u64s, DIV_ROUND_UP(bits, 64));
+ return -BCH_ERR_invalid;
+ }
- return NULL;
+ return 0;
+}
+
+void bch2_bkey_format_to_text(struct printbuf *out, const struct bkey_format *f)
+{
+ prt_printf(out, "u64s %u fields ", f->key_u64s);
+
+ for (unsigned i = 0; i < ARRAY_SIZE(f->bits_per_field); i++) {
+ if (i)
+ prt_str(out, ", ");
+ prt_printf(out, "%u:%llu",
+ f->bits_per_field[i],
+ le64_to_cpu(f->field_offset[i]));
+ }
}
/*
#include "util.h"
#include "vstructs.h"
+enum bkey_invalid_flags {
+ BKEY_INVALID_WRITE = (1U << 0),
+ BKEY_INVALID_COMMIT = (1U << 1),
+ BKEY_INVALID_JOURNAL = (1U << 2),
+};
+
#if 0
/*
static inline struct bkey_i *bkey_next(struct bkey_i *k)
{
- return (struct bkey_i *) (k->_data + k->k.u64s);
+ return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s);
}
#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s)
#define bkey_lr_packed(_l, _r) \
((_l)->format + ((_r)->format << 1))
-#define bkey_copy(_dst, _src) \
-do { \
- BUILD_BUG_ON(!type_is(_dst, struct bkey_i *) && \
- !type_is(_dst, struct bkey_packed *)); \
- BUILD_BUG_ON(!type_is(_src, struct bkey_i *) && \
- !type_is(_src, struct bkey_packed *)); \
- EBUG_ON((u64 *) (_dst) > (u64 *) (_src) && \
- (u64 *) (_dst) < (u64 *) (_src) + \
- ((struct bkey *) (_src))->u64s); \
- \
- memcpy_u64s_small((_dst), (_src), \
- ((struct bkey *) (_src))->u64s); \
-} while (0)
+static inline void bkey_p_copy(struct bkey_packed *dst, const struct bkey_packed *src)
+{
+ memcpy_u64s_small(dst, src, src->u64s);
+}
+
+static inline void bkey_copy(struct bkey_i *dst, const struct bkey_i *src)
+{
+ memcpy_u64s_small(dst, src, src->k.u64s);
+}
struct btree;
}
#define bkeyp_val(_format, _k) \
- ((struct bch_val *) ((_k)->_data + bkeyp_key_u64s(_format, _k)))
+ ((struct bch_val *) ((u64 *) (_k)->_data + bkeyp_key_u64s(_format, _k)))
extern const struct bkey_format bch2_bkey_format_current;
#error edit for your odd byteorder.
#endif
-#define high_word(f, k) ((k)->_data + high_word_offset(f))
+#define high_word(f, k) ((u64 *) (k)->_data + high_word_offset(f))
#define next_word(p) nth_word(p, 1)
#define prev_word(p) nth_word(p, -1)
void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos);
struct bkey_format bch2_bkey_format_done(struct bkey_format_state *);
-const char *bch2_bkey_format_validate(struct bkey_format *);
+int bch2_bkey_format_invalid(struct bch_fs *, struct bkey_format *,
+ enum bkey_invalid_flags, struct printbuf *);
+void bch2_bkey_format_to_text(struct printbuf *, const struct bkey_format *);
#endif /* _BCACHEFS_BKEY_H */
#include "bcachefs.h"
#include "backpointers.h"
#include "bkey_methods.h"
+#include "btree_cache.h"
#include "btree_types.h"
#include "alloc_background.h"
#include "dirent.h"
#include "error.h"
#include "extents.h"
#include "inode.h"
+#include "io_misc.h"
#include "lru.h"
#include "quota.h"
#include "reflink.h"
+#include "snapshot.h"
#include "subvolume.h"
#include "xattr.h"
NULL
};
-static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k,
- unsigned flags, struct printbuf *err)
+static int deleted_key_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags, struct printbuf *err)
{
return 0;
}
.key_invalid = deleted_key_invalid, \
})
-static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k,
- unsigned flags, struct printbuf *err)
+static int empty_val_key_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags, struct printbuf *err)
{
- if (bkey_val_bytes(k.k)) {
- prt_printf(err, "incorrect value size (%zu != 0)",
- bkey_val_bytes(k.k));
- return -BCH_ERR_invalid_bkey;
- }
-
- return 0;
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_val_bytes(k.k), c, err,
+ bkey_val_size_nonzero,
+ "incorrect value size (%zu != 0)",
+ bkey_val_bytes(k.k));
+fsck_err:
+ return ret;
}
#define bch2_bkey_ops_error ((struct bkey_ops) { \
.key_invalid = empty_val_key_invalid, \
})
-static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k,
- unsigned flags, struct printbuf *err)
+static int key_type_cookie_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags, struct printbuf *err)
{
return 0;
}
.key_invalid = empty_val_key_invalid, \
})
-static int key_type_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k,
- unsigned flags, struct printbuf *err)
+static int key_type_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags, struct printbuf *err)
{
return 0;
}
.val_to_text = key_type_inline_data_to_text, \
})
-static int key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k,
- unsigned flags, struct printbuf *err)
-{
- if (bkey_val_bytes(k.k)) {
- prt_printf(err, "incorrect value size (%zu != %zu)",
- bkey_val_bytes(k.k), sizeof(struct bch_cookie));
- return -BCH_ERR_invalid_bkey;
- }
-
- return 0;
-}
-
static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
{
bch2_key_resize(l.k, l.k->size + r.k->size);
}
#define bch2_bkey_ops_set ((struct bkey_ops) { \
- .key_invalid = key_type_set_invalid, \
+ .key_invalid = empty_val_key_invalid, \
.key_merge = key_type_set_merge, \
})
};
const struct bkey_ops bch2_bkey_null_ops = {
- .min_val_size = U8_MAX,
};
int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k,
struct printbuf *err)
{
const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
+ int ret = 0;
- if (bkey_val_bytes(k.k) < ops->min_val_size) {
- prt_printf(err, "bad val size (%zu < %u)",
- bkey_val_bytes(k.k), ops->min_val_size);
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(bkey_val_bytes(k.k) < ops->min_val_size, c, err,
+ bkey_val_size_too_small,
+ "bad val size (%zu < %u)",
+ bkey_val_bytes(k.k), ops->min_val_size);
if (!ops->key_invalid)
return 0;
- return ops->key_invalid(c, k, flags, err);
+ ret = ops->key_invalid(c, k, flags, err);
+fsck_err:
+ return ret;
}
static u64 bch2_key_types_allowed[] = {
-#define x(name, nr, flags, keys) [BKEY_TYPE_##name] = BIT_ULL(KEY_TYPE_deleted)|keys,
- BCH_BTREE_IDS()
-#undef x
[BKEY_TYPE_btree] =
BIT_ULL(KEY_TYPE_deleted)|
BIT_ULL(KEY_TYPE_btree_ptr)|
BIT_ULL(KEY_TYPE_btree_ptr_v2),
+#define x(name, nr, flags, keys) [BKEY_TYPE_##name] = BIT_ULL(KEY_TYPE_deleted)|keys,
+ BCH_BTREE_IDS()
+#undef x
};
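+/* a btree_node_type is its btree_id + 1, with 0 reserved for interior nodes: */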
+const char *bch2_btree_node_type_str(enum btree_node_type type)
+{
+ return type == BKEY_TYPE_btree ? "internal btree node" : bch2_btree_id_str(type - 1);
+}
+
int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
enum btree_node_type type,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
- if (k.k->u64s < BKEY_U64s) {
- prt_printf(err, "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s);
- return -BCH_ERR_invalid_bkey;
- }
+ int ret = 0;
- if (flags & BKEY_INVALID_COMMIT &&
- !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type))) {
- prt_printf(err, "invalid key type for btree %s (%s)",
- bch2_btree_ids[type], bch2_bkey_types[k.k->type]);
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(k.k->u64s < BKEY_U64s, c, err,
+ bkey_u64s_too_small,
+ "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s);
- if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
- if (k.k->size == 0) {
- prt_printf(err, "size == 0");
- return -BCH_ERR_invalid_bkey;
- }
+ if (type >= BKEY_TYPE_NR)
+ return 0;
- if (k.k->size > k.k->p.offset) {
- prt_printf(err, "size greater than offset (%u > %llu)",
- k.k->size, k.k->p.offset);
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) &&
+ !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err,
+ bkey_invalid_type_for_btree,
+ "invalid key type for btree %s (%s)",
+ bch2_btree_node_type_str(type), bch2_bkey_types[k.k->type]);
+
+ if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
+ bkey_fsck_err_on(k.k->size == 0, c, err,
+ bkey_extent_size_zero,
+ "size == 0");
+
+ bkey_fsck_err_on(k.k->size > k.k->p.offset, c, err,
+ bkey_extent_size_greater_than_offset,
+ "size greater than offset (%u > %llu)",
+ k.k->size, k.k->p.offset);
} else {
- if (k.k->size) {
- prt_printf(err, "size != 0");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(k.k->size, c, err,
+ bkey_size_nonzero,
+ "size != 0");
}
if (type != BKEY_TYPE_btree) {
- if (!btree_type_has_snapshots((enum btree_id) type) &&
- k.k->p.snapshot) {
- prt_printf(err, "nonzero snapshot");
- return -BCH_ERR_invalid_bkey;
- }
-
- if (btree_type_has_snapshots((enum btree_id) type) &&
- !k.k->p.snapshot) {
- prt_printf(err, "snapshot == 0");
- return -BCH_ERR_invalid_bkey;
+ enum btree_id btree = type - 1;
+
+ if (btree_type_has_snapshots(btree)) {
+ bkey_fsck_err_on(!k.k->p.snapshot, c, err,
+ bkey_snapshot_zero,
+ "snapshot == 0");
+ } else if (!btree_type_has_snapshot_field(btree)) {
+ bkey_fsck_err_on(k.k->p.snapshot, c, err,
+ bkey_snapshot_nonzero,
+ "nonzero snapshot");
+ } else {
+ /*
+ * btree uses snapshot field but it's not required to be
+ * nonzero
+ */
}
- if (bkey_eq(k.k->p, POS_MAX)) {
- prt_printf(err, "key at POS_MAX");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX), c, err,
+ bkey_at_pos_max,
+ "key at POS_MAX");
}
-
- return 0;
+fsck_err:
+ return ret;
}
int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
bch2_bkey_val_invalid(c, k, flags, err);
}
-int bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k,
- struct printbuf *err)
+int bch2_bkey_in_btree_node(struct bch_fs *c, struct btree *b,
+ struct bkey_s_c k, struct printbuf *err)
{
- if (bpos_lt(k.k->p, b->data->min_key)) {
- prt_printf(err, "key before start of btree node");
- return -BCH_ERR_invalid_bkey;
- }
+ int ret = 0;
- if (bpos_gt(k.k->p, b->data->max_key)) {
- prt_printf(err, "key past end of btree node");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(bpos_lt(k.k->p, b->data->min_key), c, err,
+ bkey_before_start_of_btree_node,
+ "key before start of btree node");
- return 0;
+ bkey_fsck_err_on(bpos_gt(k.k->p, b->data->max_key), c, err,
+ bkey_after_end_of_btree_node,
+ "key past end of btree node");
+fsck_err:
+ return ret;
}
void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)
{
const struct bkey_ops *ops;
struct bkey uk;
- struct bkey_s u;
unsigned nr_compat = 5;
int i;
}
break;
- case 4:
+ case 4: {
+ struct bkey_s u;
+
if (!bkey_packed(k)) {
u = bkey_i_to_s(packed_to_bkey(k));
} else {
if (ops->compat)
ops->compat(btree_id, version, big_endian, write, u);
break;
+ }
default:
BUG();
}
extern const char * const bch2_bkey_types[];
extern const struct bkey_ops bch2_bkey_null_ops;
-enum bkey_invalid_flags {
- BKEY_INVALID_WRITE = (1U << 0),
- BKEY_INVALID_COMMIT = (1U << 1),
- BKEY_INVALID_JOURNAL = (1U << 2),
-};
-
/*
 * key_invalid: checks validity of @k, returns 0 if good or a negative error code if bad. If
* invalid, entire key will be deleted.
* being read or written; more aggressive checks can be enabled when rw == WRITE.
*/
struct bkey_ops {
- int (*key_invalid)(const struct bch_fs *c, struct bkey_s_c k,
+ int (*key_invalid)(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags, struct printbuf *err);
void (*val_to_text)(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
enum bkey_invalid_flags, struct printbuf *);
int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type,
enum bkey_invalid_flags, struct printbuf *);
-int bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c, struct printbuf *);
+int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *,
+ struct bkey_s_c, struct printbuf *);
void bch2_bpos_to_text(struct printbuf *, struct bpos);
void bch2_bkey_to_text(struct printbuf *, const struct bkey *);
enum btree_update_flags {
__BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE = __BTREE_ITER_FLAGS_END,
__BTREE_UPDATE_NOJOURNAL,
- __BTREE_UPDATE_PREJOURNAL,
__BTREE_UPDATE_KEY_CACHE_RECLAIM,
__BTREE_TRIGGER_NORUN, /* Don't run triggers at all */
#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)
#define BTREE_UPDATE_NOJOURNAL (1U << __BTREE_UPDATE_NOJOURNAL)
-#define BTREE_UPDATE_PREJOURNAL (1U << __BTREE_UPDATE_PREJOURNAL)
#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM)
#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN)
#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE)
#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC)
-#define BTREE_TRIGGER_WANTS_OLD_AND_NEW \
- ((1U << KEY_TYPE_alloc)| \
- (1U << KEY_TYPE_alloc_v2)| \
- (1U << KEY_TYPE_alloc_v3)| \
- (1U << KEY_TYPE_alloc_v4)| \
- (1U << KEY_TYPE_stripe)| \
- (1U << KEY_TYPE_inode)| \
- (1U << KEY_TYPE_inode_v2)| \
- (1U << KEY_TYPE_snapshot))
-
static inline int bch2_trans_mark_key(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_i *new,
while ((k = sort_iter_peek(iter))) {
if (!bkey_deleted(k) &&
!should_drop_next_key(iter)) {
- bkey_copy(out, k);
+ bkey_p_copy(out, k);
btree_keys_account_key_add(&nr, 0, out);
out = bkey_p_next(out);
}
continue;
if (!transform)
- bkey_copy(out, in);
+ bkey_p_copy(out, in);
else if (bch2_bkey_transform(out_f, out, bkey_packed(in)
? in_f : &bch2_bkey_format_current, in))
out->format = KEY_FORMAT_LOCAL_BTREE;
memcpy_u64s_small(out, in, bkeyp_key_u64s(f, in));
set_bkeyp_val_u64s(f, out, 0);
} else {
- bkey_copy(out, in);
+ bkey_p_copy(out, in);
}
out->needs_whiteout |= needs_whiteout;
out = bkey_p_next(out);
struct sort_iter_set {
struct bkey_packed *k, *end;
- } data[MAX_BSETS + 1];
+ } data[];
};
-static inline void sort_iter_init(struct sort_iter *iter, struct btree *b)
+static inline void sort_iter_init(struct sort_iter *iter, struct btree *b, unsigned size)
{
iter->b = b;
iter->used = 0;
- iter->size = ARRAY_SIZE(iter->data);
+ iter->size = size;
+}
+
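+/* a sort_iter with inline storage for the common, on-stack case: */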
+struct sort_iter_stack {
+ struct sort_iter iter;
+ struct sort_iter_set sets[MAX_BSETS + 1];
+};
+
+static inline void sort_iter_stack_init(struct sort_iter_stack *iter, struct btree *b)
+{
+ sort_iter_init(&iter->iter, b, ARRAY_SIZE(iter->sets));
}
static inline void sort_iter_add(struct sort_iter *iter,
printk(KERN_ERR "iter was:");
btree_node_iter_for_each(_iter, set) {
- struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
- struct bset_tree *t = bch2_bkey_to_bset(b, k);
+ struct bkey_packed *k2 = __btree_node_offset_to_key(b, set->k);
+ struct bset_tree *t = bch2_bkey_to_bset(b, k2);
printk(" [%zi %zi]", t - b->set,
- k->_data - bset(b, t)->_data);
+ k2->_data - bset(b, t)->_data);
}
panic("\n");
}
{
struct bset_tree *t = bch2_bkey_to_bset(b, where);
struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where);
- struct bkey_packed *next = (void *) (where->_data + clobber_u64s);
+ struct bkey_packed *next = (void *) ((u64 *) where->_data + clobber_u64s);
struct printbuf buf1 = PRINTBUF;
struct printbuf buf2 = PRINTBUF;
#if 0
}
struct ro_aux_tree {
- struct bkey_float f[0];
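+	/* a C99 flexible array can't be a struct's only member, hence the dummy: */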
+ u8 nothing[0];
+ struct bkey_float f[];
};
struct rw_aux_tree {
{
unsigned prev_u64s = ro_aux_tree_prev(b, t)[j];
- return (void *) (tree_to_bkey(b, t, j)->_data - prev_u64s);
+ return (void *) ((u64 *) tree_to_bkey(b, t, j)->_data - prev_u64s);
}
static struct rw_aux_tree *rw_aux_tree(const struct btree *b,
btree_keys_account_key_add(&b->nr, t - b->set, src);
if (src->u64s != clobber_u64s) {
- u64 *src_p = where->_data + clobber_u64s;
- u64 *dst_p = where->_data + src->u64s;
+ u64 *src_p = (u64 *) where->_data + clobber_u64s;
+ u64 *dst_p = (u64 *) where->_data + src->u64s;
EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) <
(int) clobber_u64s - src->u64s);
unsigned clobber_u64s)
{
struct bset_tree *t = bset_tree_last(b);
- u64 *src_p = where->_data + clobber_u64s;
+ u64 *src_p = (u64 *) where->_data + clobber_u64s;
u64 *dst_p = where->_data;
bch2_bset_verify_rw_aux_tree(b, t);
case BSET_RO_AUX_TREE:
return bset_search_tree(b, t, search, lossy_packed_search);
default:
- unreachable();
+ BUG();
}
}
}
/**
- * bch_btree_node_iter_init - initialize a btree node iterator, starting from a
+ * bch2_btree_node_iter_init - initialize a btree node iterator, starting from a
* given position
*
+ * @iter: iterator to initialize
+ * @b: btree node to search
+ * @search: search key
+ *
* Main entry point to the lookup code for individual btree nodes:
*
* NOTE:
bc->shrink.scan_objects = bch2_btree_cache_scan;
bc->shrink.to_text = bch2_btree_cache_shrinker_to_text;
bc->shrink.seeks = 4;
- ret = register_shrinker(&bc->shrink, "%s/btree_cache", c->name);
+ ret = register_shrinker(&bc->shrink, "%s-btree_cache", c->name);
if (ret)
goto err;
six_unlock_intent(&b->c.lock);
/* Unlock before doing IO: */
- if (trans && sync)
+ if (path && sync)
bch2_trans_unlock_noassert(trans);
bch2_btree_node_read(c, b, sync);
"btree node header doesn't match ptr\n"
"btree %s level %u\n"
"ptr: ",
- bch2_btree_ids[b->c.btree_id], b->c.level);
+ bch2_btree_id_str(b->c.btree_id), b->c.level);
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
prt_printf(&buf, "\nheader: btree %s level %llu\n"
"min ",
- bch2_btree_ids[BTREE_NODE_ID(b->data)],
+ bch2_btree_id_str(BTREE_NODE_ID(b->data)),
BTREE_NODE_LEVEL(b->data));
bch2_bpos_to_text(&buf, b->data->min_key);
}
if (unlikely(need_relock)) {
- int ret = bch2_trans_relock(trans) ?:
+ ret = bch2_trans_relock(trans) ?:
bch2_btree_path_relock_intent(trans, path);
if (ret) {
six_unlock_type(&b->c.lock, lock_type);
}
/**
- * bch_btree_node_get - find a btree node in the cache and lock it, reading it
+ * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
* in from disk if necessary.
*
+ * @trans: btree transaction object
+ * @path: btree_path being traversed
+ * @k: pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
+ * @level: level of btree node being looked up (0 == leaf node)
+ * @lock_type: SIX_LOCK_read or SIX_LOCK_intent
+ * @trace_ip: ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
+ *
 * The btree node will have either a read or an intent lock held, depending on
 * @lock_type.
+ *
+ * Returns: btree node or ERR_PTR()
*/
struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
const struct bkey_i *k, unsigned level,
}
if (unlikely(btree_node_read_in_flight(b))) {
- u32 seq = six_lock_seq(&b->c.lock);
-
six_unlock_type(&b->c.lock, lock_type);
- bch2_trans_unlock(trans);
-
- bch2_btree_node_wait_on_read(b);
-
- /*
- * should_be_locked is not set on this path yet, so we need to
- * relock it specifically:
- */
- if (trans) {
- int ret = bch2_trans_relock(trans) ?:
- bch2_btree_path_relock_intent(trans, path);
- if (ret) {
- BUG_ON(!trans->restarted);
- return ERR_PTR(ret);
- }
- }
-
- if (!six_relock_type(&b->c.lock, lock_type, seq))
- return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
+ return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
}
prefetch(b->aux_data);
six_unlock_intent(&b->c.lock);
}
-void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
- const struct btree *b)
+const char *bch2_btree_id_str(enum btree_id btree)
+{
+ return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)";
+}
+
+void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
+{
+ prt_printf(out, "%s level %u/%u\n ",
+ bch2_btree_id_str(b->c.btree_id),
+ b->c.level,
+ bch2_btree_id_root(c, b->c.btree_id)->level);
+ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+}
+
+void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
- const struct bkey_format *f = &b->format;
struct bset_stats stats;
memset(&stats, 0, sizeof(stats));
prt_printf(out, ":\n"
" ptrs: ");
bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+ prt_newline(out);
+
+ prt_printf(out,
+ " format: ");
+ bch2_bkey_format_to_text(out, &b->format);
- prt_printf(out, "\n"
- " format: u64s %u fields %u %u %u %u %u\n"
+ prt_printf(out,
" unpack fn len: %u\n"
" bytes used %zu/%zu (%zu%% full)\n"
" sib u64s: %u, %u (merge threshold %u)\n"
" nr unpacked keys %u\n"
" floats %zu\n"
" failed unpacked %zu\n",
- f->key_u64s,
- f->bits_per_field[0],
- f->bits_per_field[1],
- f->bits_per_field[2],
- f->bits_per_field[3],
- f->bits_per_field[4],
b->unpack_fn_len,
b->nr.live_u64s * sizeof(u64),
btree_bytes(c) - sizeof(struct btree_node),
return bch2_btree_id_root(c, b->c.btree_id)->b;
}
-void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *,
- const struct btree *);
+const char *bch2_btree_id_str(enum btree_id);
+void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
+void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *);
#endif /* _BCACHEFS_BTREE_CACHE_H */
#include "alloc_foreground.h"
#include "bkey_methods.h"
#include "bkey_buf.h"
+#include "btree_journal_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update_interior.h"
static bool should_restart_for_topology_repair(struct bch_fs *c)
{
return c->opts.fix_errors != FSCK_FIX_no &&
- !(c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology));
+ !(c->recovery_passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology));
}
static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k));
if (__fsck_err(c,
- FSCK_CAN_FIX|
- FSCK_CAN_IGNORE|
- FSCK_NO_RATELIMIT,
- "btree node with incorrect min_key at btree %s level %u:\n"
- " prev %s\n"
- " cur %s",
- bch2_btree_ids[b->c.btree_id], b->c.level,
- buf1.buf, buf2.buf) &&
- should_restart_for_topology_repair(c)) {
+ FSCK_CAN_FIX|
+ FSCK_CAN_IGNORE|
+ FSCK_NO_RATELIMIT,
+ btree_node_topology_bad_min_key,
+ "btree node with incorrect min_key at btree %s level %u:\n"
+ " prev %s\n"
+ " cur %s",
+ bch2_btree_id_str(b->c.btree_id), b->c.level,
+ buf1.buf, buf2.buf) && should_restart_for_topology_repair(c)) {
bch_info(c, "Halting mark and sweep to start topology repair pass");
ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
goto err;
bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k));
bch2_bpos_to_text(&buf2, node_end);
- if (__fsck_err(c,
- FSCK_CAN_FIX|
- FSCK_CAN_IGNORE|
- FSCK_NO_RATELIMIT,
+ if (__fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE|FSCK_NO_RATELIMIT,
+ btree_node_topology_bad_max_key,
"btree node with incorrect max_key at btree %s level %u:\n"
" %s\n"
" expected %s",
- bch2_btree_ids[b->c.btree_id], b->c.level,
+ bch2_btree_id_str(b->c.btree_id), b->c.level,
buf1.buf, buf2.buf) &&
should_restart_for_topology_repair(c)) {
bch_info(c, "Halting mark and sweep to start topology repair pass");
if (mustfix_fsck_err_on(bpos_ge(prev->data->min_key,
cur->data->min_key), c,
+ btree_node_topology_overwritten_by_next_node,
"btree node overwritten by next node at btree %s level %u:\n"
" node %s\n"
" next %s",
- bch2_btree_ids[b->c.btree_id], b->c.level,
+ bch2_btree_id_str(b->c.btree_id), b->c.level,
buf1.buf, buf2.buf)) {
ret = DROP_PREV_NODE;
goto out;
if (mustfix_fsck_err_on(!bpos_eq(prev->key.k.p,
bpos_predecessor(cur->data->min_key)), c,
+ btree_node_topology_bad_max_key,
"btree node with incorrect max_key at btree %s level %u:\n"
" node %s\n"
" next %s",
- bch2_btree_ids[b->c.btree_id], b->c.level,
+ bch2_btree_id_str(b->c.btree_id), b->c.level,
buf1.buf, buf2.buf))
ret = set_node_max(c, prev,
bpos_predecessor(cur->data->min_key));
if (mustfix_fsck_err_on(bpos_ge(expected_start,
cur->data->max_key), c,
+ btree_node_topology_overwritten_by_prev_node,
"btree node overwritten by prev node at btree %s level %u:\n"
" prev %s\n"
" node %s",
- bch2_btree_ids[b->c.btree_id], b->c.level,
+ bch2_btree_id_str(b->c.btree_id), b->c.level,
buf1.buf, buf2.buf)) {
ret = DROP_THIS_NODE;
goto out;
}
if (mustfix_fsck_err_on(!bpos_eq(expected_start, cur->data->min_key), c,
+ btree_node_topology_bad_min_key,
"btree node with incorrect min_key at btree %s level %u:\n"
" prev %s\n"
" node %s",
- bch2_btree_ids[b->c.btree_id], b->c.level,
+ bch2_btree_id_str(b->c.btree_id), b->c.level,
buf1.buf, buf2.buf))
ret = set_node_min(c, cur, expected_start);
}
bch2_bpos_to_text(&buf2, b->key.k.p);
if (mustfix_fsck_err_on(!bpos_eq(child->key.k.p, b->key.k.p), c,
+ btree_node_topology_bad_max_key,
"btree node with incorrect max_key at btree %s level %u:\n"
" %s\n"
" expected %s",
- bch2_btree_ids[b->c.btree_id], b->c.level,
+ bch2_btree_id_str(b->c.btree_id), b->c.level,
buf1.buf, buf2.buf)) {
ret = set_node_max(c, child, b->key.k.p);
if (ret)
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
if (mustfix_fsck_err_on(ret == -EIO, c,
+ btree_node_unreadable,
"Topology repair: unreadable btree node at btree %s level %u:\n"
" %s",
- bch2_btree_ids[b->c.btree_id],
+ bch2_btree_id_str(b->c.btree_id),
b->c.level - 1,
buf.buf)) {
bch2_btree_node_evict(trans, cur_k.k);
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
if (mustfix_fsck_err_on(!have_child, c,
+ btree_node_topology_interior_node_empty,
"empty interior btree node at btree %s level %u\n"
" %s",
- bch2_btree_ids[b->c.btree_id],
+ bch2_btree_id_str(b->c.btree_id),
b->c.level, buf.buf))
ret = DROP_THIS_NODE;
err:
int bch2_check_topology(struct bch_fs *c)
{
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree *b;
unsigned i;
int ret = 0;
- bch2_trans_init(&trans, c, 0, 0);
-
- for (i = 0; i < btree_id_nr_alive(c)&& !ret; i++) {
+ for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
if (!r->alive)
if (btree_node_fake(b))
continue;
- btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
- ret = bch2_btree_repair_topology_recurse(&trans, b);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
+ ret = bch2_btree_repair_topology_recurse(trans, b);
six_unlock_read(&b->c.lock);
if (ret == DROP_THIS_NODE) {
}
}
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
return ret;
}
struct bkey_s_c *k)
{
struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k);
- const union bch_extent_entry *entry;
+ struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(*k);
+ const union bch_extent_entry *entry_c;
struct extent_ptr_decoded p = { 0 };
bool do_update = false;
struct printbuf buf = PRINTBUF;
* XXX
* use check_bucket_ref here
*/
- bkey_for_each_ptr_decode(k->k, ptrs, p, entry) {
+ bkey_for_each_ptr_decode(k->k, ptrs_c, p, entry_c) {
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
- enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr);
+ enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry_c->ptr);
if (!g->gen_valid &&
(c->opts.reconstruct_alloc ||
- fsck_err(c, "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
+ fsck_err(c, ptr_to_missing_alloc_key,
+ "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
bch2_data_types[ptr_data_type(k->k, &p.ptr)],
if (gen_cmp(p.ptr.gen, g->gen) > 0 &&
(c->opts.reconstruct_alloc ||
- fsck_err(c, "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
+ fsck_err(c, ptr_gen_newer_than_bucket_gen,
+ "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
bch2_data_types[ptr_data_type(k->k, &p.ptr)],
if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX &&
(c->opts.reconstruct_alloc ||
- fsck_err(c, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
+ fsck_err(c, ptr_gen_newer_than_bucket_gen,
+ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
bch2_data_types[ptr_data_type(k->k, &p.ptr)],
if (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0 &&
(c->opts.reconstruct_alloc ||
- fsck_err(c, "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
+ fsck_err(c, stale_dirty_ptr,
+ "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
bch2_data_types[ptr_data_type(k->k, &p.ptr)],
if (fsck_err_on(bucket_data_type(g->data_type) &&
bucket_data_type(g->data_type) != data_type, c,
+ ptr_bucket_data_type_mismatch,
"bucket %u:%zu different types of data in same bucket: %s, %s\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx);
if (fsck_err_on(!m || !m->alive, c,
+ ptr_to_missing_stripe,
"pointer to nonexistent stripe %llu\n"
"while marking %s",
(u64) p.ec.idx,
do_update = true;
if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c,
+ ptr_to_incorrect_stripe,
"pointer does not match stripe %llu\n"
"while marking %s",
(u64) p.ec.idx,
goto err;
if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c,
+ bkey_version_in_future,
"key version number higher than recorded: %llu > %llu",
k->k->version.lo,
atomic64_read(&c->key_version)))
FSCK_CAN_FIX|
FSCK_CAN_IGNORE|
FSCK_NO_RATELIMIT,
+ btree_node_read_error,
"Unreadable btree node at btree %s level %u:\n"
" %s",
- bch2_btree_ids[b->c.btree_id],
+ bch2_btree_id_str(b->c.btree_id),
b->c.level - 1,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) &&
printbuf_reset(&buf);
bch2_bpos_to_text(&buf, b->data->min_key);
if (mustfix_fsck_err_on(!bpos_eq(b->data->min_key, POS_MIN), c,
+ btree_root_bad_min_key,
"btree root with incorrect min_key: %s", buf.buf)) {
bch_err(c, "repair unimplemented");
ret = -BCH_ERR_fsck_repair_unimplemented;
printbuf_reset(&buf);
bch2_bpos_to_text(&buf, b->data->max_key);
if (mustfix_fsck_err_on(!bpos_eq(b->data->max_key, SPOS_MAX), c,
+ btree_root_bad_max_key,
"btree root with incorrect max_key: %s", buf.buf)) {
bch_err(c, "repair unimplemented");
ret = -BCH_ERR_fsck_repair_unimplemented;
static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
{
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
enum btree_id ids[BTREE_ID_NR];
unsigned i;
int ret = 0;
- bch2_trans_init(&trans, c, 0, 0);
-
if (initial)
- trans.is_initial_gc = true;
+ trans->is_initial_gc = true;
for (i = 0; i < BTREE_ID_NR; i++)
ids[i] = i;
for (i = 0; i < BTREE_ID_NR && !ret; i++)
ret = initial
- ? bch2_gc_btree_init(&trans, ids[i], metadata_only)
- : bch2_gc_btree(&trans, ids[i], initial, metadata_only);
+ ? bch2_gc_btree_init(trans, ids[i], metadata_only)
+ : bch2_gc_btree(trans, ids[i], initial, metadata_only);
for (i = BTREE_ID_NR; i < btree_id_nr_alive(c) && !ret; i++) {
if (!bch2_btree_id_root(c, i)->alive)
continue;
ret = initial
- ? bch2_gc_btree_init(&trans, i, metadata_only)
- : bch2_gc_btree(&trans, i, initial, metadata_only);
+ ? bch2_gc_btree_init(trans, i, metadata_only)
+ : bch2_gc_btree(trans, i, initial, metadata_only);
}
if (ret < 0)
bch_err_fn(c, ret);
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
return ret;
}
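This hunk shows the conversion pattern that recurs throughout this patch: an
on-stack struct btree_trans set up with bch2_trans_init() and torn down with
bch2_trans_exit() becomes a pointer obtained from bch2_trans_get() and
released with bch2_trans_put(). A minimal sketch of the before/after shape
(the surrounding logic is illustrative, not any one call site):

    /* before: transaction object on the caller's stack */
    struct btree_trans trans;

    bch2_trans_init(&trans, c, 0, 0);
    /* ... work with &trans ... */
    bch2_trans_exit(&trans);

    /* after: transaction object allocated/recycled by the filesystem */
    struct btree_trans *trans = bch2_trans_get(c);

    /* ... work with trans ... */
    bch2_trans_put(trans);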
percpu_down_write(&c->mark_lock);
-#define copy_field(_f, _msg, ...) \
+#define copy_field(_err, _f, _msg, ...) \
if (dst->_f != src->_f && \
(!verify || \
- fsck_err(c, _msg ": got %llu, should be %llu" \
+ fsck_err(c, _err, _msg ": got %llu, should be %llu" \
, ##__VA_ARGS__, dst->_f, src->_f))) \
dst->_f = src->_f
-#define copy_stripe_field(_f, _msg, ...) \
- if (dst->_f != src->_f && \
- (!verify || \
- fsck_err(c, "stripe %zu has wrong "_msg \
- ": got %u, should be %u", \
- iter.pos, ##__VA_ARGS__, \
- dst->_f, src->_f))) \
- dst->_f = src->_f
-#define copy_dev_field(_f, _msg, ...) \
- copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__)
-#define copy_fs_field(_f, _msg, ...) \
- copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__)
+#define copy_dev_field(_err, _f, _msg, ...) \
+ copy_field(_err, _f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__)
+#define copy_fs_field(_err, _f, _msg, ...) \
+ copy_field(_err, _f, "fs has wrong " _msg, ##__VA_ARGS__)
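With the new leading _err argument, each detected mismatch now increments a
named superblock error counter in addition to emitting an fsck message. As a
sketch, copy_fs_field(fs_usage_hidden_wrong, hidden, "hidden") from a later
hunk expands roughly to:

    if (dst->hidden != src->hidden &&
        (!verify ||
         fsck_err(c, fs_usage_hidden_wrong,
                  "fs has wrong hidden: got %llu, should be %llu",
                  dst->hidden, src->hidden)))
            dst->hidden = src->hidden;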
for (i = 0; i < ARRAY_SIZE(c->usage); i++)
bch2_fs_usage_acc_to_base(c, i);
bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc,
dev_usage_u64s());
- copy_dev_field(buckets_ec, "buckets_ec");
-
for (i = 0; i < BCH_DATA_NR; i++) {
- copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]);
- copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]);
- copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]);
+ copy_dev_field(dev_usage_buckets_wrong,
+ d[i].buckets, "%s buckets", bch2_data_types[i]);
+ copy_dev_field(dev_usage_sectors_wrong,
+ d[i].sectors, "%s sectors", bch2_data_types[i]);
+ copy_dev_field(dev_usage_fragmented_wrong,
+ d[i].fragmented, "%s fragmented", bch2_data_types[i]);
}
- };
+
+ copy_dev_field(dev_usage_buckets_ec_wrong,
+ buckets_ec, "buckets_ec");
+ }
{
unsigned nr = fs_usage_u64s(c);
struct bch_fs_usage *src = (void *)
bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr);
- copy_fs_field(hidden, "hidden");
- copy_fs_field(btree, "btree");
+ copy_fs_field(fs_usage_hidden_wrong,
+ hidden, "hidden");
+ copy_fs_field(fs_usage_btree_wrong,
+ btree, "btree");
if (!metadata_only) {
- copy_fs_field(data, "data");
- copy_fs_field(cached, "cached");
- copy_fs_field(reserved, "reserved");
- copy_fs_field(nr_inodes,"nr_inodes");
+ copy_fs_field(fs_usage_data_wrong,
+ data, "data");
+ copy_fs_field(fs_usage_cached_wrong,
+ cached, "cached");
+ copy_fs_field(fs_usage_reserved_wrong,
+ reserved, "reserved");
+ copy_fs_field(fs_usage_nr_inodes_wrong,
+			      nr_inodes, "nr_inodes");
for (i = 0; i < BCH_REPLICAS_MAX; i++)
- copy_fs_field(persistent_reserved[i],
+ copy_fs_field(fs_usage_persistent_reserved_wrong,
+ persistent_reserved[i],
"persistent_reserved[%i]", i);
}
printbuf_reset(&buf);
bch2_replicas_entry_to_text(&buf, e);
- copy_fs_field(replicas[i], "%s", buf.buf);
+ copy_fs_field(fs_usage_replicas_wrong,
+ replicas[i], "%s", buf.buf);
}
}
if (c->opts.reconstruct_alloc ||
fsck_err_on(new.data_type != gc.data_type, c,
+ alloc_key_data_type_wrong,
"bucket %llu:%llu gen %u has wrong data_type"
": got %s, should be %s",
iter->pos.inode, iter->pos.offset,
bch2_data_types[gc.data_type]))
new.data_type = gc.data_type;
-#define copy_bucket_field(_f) \
+#define copy_bucket_field(_errtype, _f) \
if (c->opts.reconstruct_alloc || \
- fsck_err_on(new._f != gc._f, c, \
+ fsck_err_on(new._f != gc._f, c, _errtype, \
"bucket %llu:%llu gen %u data type %s has wrong " #_f \
": got %u, should be %u", \
iter->pos.inode, iter->pos.offset, \
new._f, gc._f)) \
new._f = gc._f; \
- copy_bucket_field(gen);
- copy_bucket_field(dirty_sectors);
- copy_bucket_field(cached_sectors);
- copy_bucket_field(stripe_redundancy);
- copy_bucket_field(stripe);
+ copy_bucket_field(alloc_key_gen_wrong,
+ gen);
+ copy_bucket_field(alloc_key_dirty_sectors_wrong,
+ dirty_sectors);
+ copy_bucket_field(alloc_key_cached_sectors_wrong,
+ cached_sectors);
+ copy_bucket_field(alloc_key_stripe_wrong,
+ stripe);
+ copy_bucket_field(alloc_key_stripe_redundancy_wrong,
+ stripe_redundancy);
#undef copy_bucket_field
if (!bch2_alloc_v4_cmp(*old, new))
static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
{
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
struct bch_dev *ca;
unsigned i;
int ret = 0;
- bch2_trans_init(&trans, c, 0, 0);
-
for_each_member_device(ca, c, i) {
- ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc,
+ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
POS(ca->dev_idx, ca->mi.first_bucket),
BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW,
- bch2_alloc_write_key(&trans, &iter, k, metadata_only));
+ NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
+ bch2_alloc_write_key(trans, &iter, k, metadata_only));
if (ret < 0) {
- bch_err(c, "error writing alloc info: %s", bch2_err_str(ret));
+ bch_err_fn(c, ret);
percpu_ref_put(&ca->ref);
break;
}
}
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
return ret < 0 ? ret : 0;
}
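Note the transaction-commit flag renames threaded through these conversions
(old name on the left, new name as used above and below):

    BTREE_INSERT_LAZY_RW         -> BCH_TRANS_COMMIT_lazy_rw
    BTREE_INSERT_NOFAIL          -> BCH_TRANS_COMMIT_no_enospc
    BTREE_INSERT_JOURNAL_RECLAIM -> BCH_TRANS_COMMIT_journal_reclaim
    BTREE_INSERT_NOCHECK_RW      -> BCH_TRANS_COMMIT_no_check_rw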
static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
{
struct bch_dev *ca;
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
struct bucket *g;
if (!buckets) {
percpu_ref_put(&ca->ref);
bch_err(c, "error allocating ca->buckets[gc]");
- return -BCH_ERR_ENOMEM_gc_alloc_start;
+ ret = -BCH_ERR_ENOMEM_gc_alloc_start;
+ goto err;
}
buckets->first_bucket = ca->mi.first_bucket;
buckets->nbuckets = ca->mi.nbuckets;
rcu_assign_pointer(ca->buckets_gc, buckets);
- };
-
- bch2_trans_init(&trans, c, 0, 0);
+ }
- for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+ for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
BTREE_ITER_PREFETCH, k, ret) {
ca = bch_dev_bkey_exists(c, k.k->p.inode);
g = gc_bucket(ca, k.k->p.offset);
g->stripe_redundancy = a->stripe_redundancy;
}
}
- bch2_trans_iter_exit(&trans, &iter);
-
- bch2_trans_exit(&trans);
-
+ bch2_trans_iter_exit(trans, &iter);
+err:
+ bch2_trans_put(trans);
if (ret)
- bch_err(c, "error reading alloc info at gc start: %s", bch2_err_str(ret));
-
+ bch_err_fn(c, ret);
return ret;
}
g->dirty_sectors = 0;
g->cached_sectors = 0;
}
- };
+ }
}
static int bch2_gc_write_reflink_key(struct btree_trans *trans,
}
if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
+ reflink_v_refcount_wrong,
"reflink key has wrong refcount:\n"
" %s\n"
" should be %u",
static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only)
{
- struct btree_trans trans;
+ struct btree_trans *trans;
struct btree_iter iter;
struct bkey_s_c k;
size_t idx = 0;
if (metadata_only)
return 0;
- bch2_trans_init(&trans, c, 0, 0);
+ trans = bch2_trans_get(c);
- ret = for_each_btree_key_commit(&trans, iter,
+ ret = for_each_btree_key_commit(trans, iter,
BTREE_ID_reflink, POS_MIN,
BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_NOFAIL,
- bch2_gc_write_reflink_key(&trans, &iter, k, &idx));
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_gc_write_reflink_key(trans, &iter, k, &idx));
c->reflink_gc_nr = 0;
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
return ret;
}
static int bch2_gc_reflink_start(struct bch_fs *c,
bool metadata_only)
{
- struct btree_trans trans;
+ struct btree_trans *trans;
struct btree_iter iter;
struct bkey_s_c k;
struct reflink_gc *r;
if (metadata_only)
return 0;
- bch2_trans_init(&trans, c, 0, 0);
+ trans = bch2_trans_get(c);
c->reflink_gc_nr = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
+ for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN,
BTREE_ITER_PREFETCH, k, ret) {
const __le64 *refcount = bkey_refcount_c(k);
r->size = k.k->size;
r->refcount = 0;
}
- bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_iter_exit(trans, &iter);
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
return ret;
}
if (bad)
bch2_bkey_val_to_text(&buf, c, k);
- if (fsck_err_on(bad, c, "%s", buf.buf)) {
+ if (fsck_err_on(bad, c, stripe_sector_count_wrong,
+ "%s", buf.buf)) {
struct bkey_i_stripe *new;
new = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only)
{
- struct btree_trans trans;
+ struct btree_trans *trans;
struct btree_iter iter;
struct bkey_s_c k;
int ret = 0;
if (metadata_only)
return 0;
- bch2_trans_init(&trans, c, 0, 0);
+ trans = bch2_trans_get(c);
- ret = for_each_btree_key_commit(&trans, iter,
+ ret = for_each_btree_key_commit(trans, iter,
BTREE_ID_stripes, POS_MIN,
BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_NOFAIL,
- bch2_gc_write_stripes_key(&trans, &iter, k));
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_gc_write_stripes_key(trans, &iter, k));
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
return ret;
}
/**
* bch2_gc - walk _all_ references to buckets, and recompute them:
*
+ * @c: filesystem object
+ * @initial: are we in recovery?
+ * @metadata_only: are we just checking metadata references, or everything?
+ *
+ * Returns: 0 on success, or standard errcode on failure
+ *
* Order matters here:
* - Concurrent GC relies on the fact that we have a total ordering for
* everything that GC walks - see gc_will_visit_node(),
int bch2_gc_gens(struct bch_fs *c)
{
- struct btree_trans trans;
+ struct btree_trans *trans;
struct btree_iter iter;
struct bkey_s_c k;
struct bch_dev *ca;
trace_and_count(c, gc_gens_start, c);
down_read(&c->gc_lock);
- bch2_trans_init(&trans, c, 0, 0);
+ trans = bch2_trans_get(c);
for_each_member_device(ca, c, i) {
- struct bucket_gens *gens;
+ struct bucket_gens *gens = bucket_gens(ca);
BUG_ON(ca->oldest_gen);
- ca->oldest_gen = kvmalloc(ca->mi.nbuckets, GFP_KERNEL);
+ ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL);
if (!ca->oldest_gen) {
percpu_ref_put(&ca->ref);
ret = -BCH_ERR_ENOMEM_gc_gens;
goto err;
}
- gens = bucket_gens(ca);
-
for (b = gens->first_bucket;
b < gens->nbuckets; b++)
ca->oldest_gen[b] = gens->b[b];
for (i = 0; i < BTREE_ID_NR; i++)
if (btree_type_has_ptrs(i)) {
- struct btree_iter iter;
- struct bkey_s_c k;
-
c->gc_gens_btree = i;
c->gc_gens_pos = POS_MIN;
- ret = for_each_btree_key_commit(&trans, iter, i,
+
+ ret = for_each_btree_key_commit(trans, iter, i,
POS_MIN,
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
k,
NULL, NULL,
- BTREE_INSERT_NOFAIL,
- gc_btree_gens_key(&trans, &iter, k));
+ BCH_TRANS_COMMIT_no_enospc,
+ gc_btree_gens_key(trans, &iter, k));
if (ret && !bch2_err_matches(ret, EROFS))
- bch_err(c, "error recalculating oldest_gen: %s", bch2_err_str(ret));
+ bch_err_fn(c, ret);
if (ret)
goto err;
}
- ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc,
+ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
POS_MIN,
BTREE_ITER_PREFETCH,
k,
NULL, NULL,
- BTREE_INSERT_NOFAIL,
- bch2_alloc_write_oldest_gen(&trans, &iter, k));
+ BCH_TRANS_COMMIT_no_enospc,
+ bch2_alloc_write_oldest_gen(trans, &iter, k));
if (ret && !bch2_err_matches(ret, EROFS))
- bch_err(c, "error writing oldest_gen: %s", bch2_err_str(ret));
+ bch_err_fn(c, ret);
if (ret)
goto err;
ca->oldest_gen = NULL;
}
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
up_read(&c->gc_lock);
mutex_unlock(&c->gc_gens_lock);
return ret;
ret = bch2_gc_gens(c);
#endif
if (ret < 0)
- bch_err(c, "btree gc failed: %s", bch2_err_str(ret));
+ bch_err_fn(c, ret);
debug_check_no_locks_held();
}
p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name);
if (IS_ERR(p)) {
- bch_err(c, "error creating gc thread: %s", bch2_err_str(PTR_ERR(p)));
+ bch_err_fn(c, PTR_ERR(p));
return PTR_ERR(p);
}
#ifndef _BCACHEFS_BTREE_GC_H
#define _BCACHEFS_BTREE_GC_H
+#include "bkey.h"
#include "btree_types.h"
int bch2_check_topology(struct bch_fs *);
#include "debug.h"
#include "error.h"
#include "extents.h"
-#include "io.h"
+#include "io_write.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
+#include "recovery.h"
#include "super-io.h"
#include "trace.h"
vpfree(p, size);
}
-static void *btree_bounce_alloc_noprof(struct bch_fs *c, size_t size,
- bool *used_mempool)
+static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
+ bool *used_mempool)
{
unsigned flags = memalloc_nofs_save();
void *p;
BUG_ON(size > btree_bytes(c));
*used_mempool = false;
- p = vpmalloc_noprof(size, __GFP_NOWARN|GFP_NOWAIT);
+ p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
if (!p) {
*used_mempool = true;
p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
memalloc_nofs_restore(flags);
return p;
}
-#define btree_bounce_alloc(_c, _size, _used_mempool) \
- alloc_hooks(btree_bounce_alloc_noprof(_c, _size, _used_mempool))
static void sort_bkey_ptrs(const struct btree *bt,
struct bkey_packed **ptrs, unsigned nr)
k = new_whiteouts;
while (ptrs != ptrs_end) {
- bkey_copy(k, *ptrs);
+ bkey_p_copy(k, *ptrs);
k = bkey_p_next(k);
ptrs++;
}
n = bkey_p_next(k);
if (!bkey_deleted(k)) {
- bkey_copy(out, k);
+ bkey_p_copy(out, k);
out = bkey_p_next(out);
} else {
BUG_ON(k->needs_whiteout);
bool filter_whiteouts)
{
struct btree_node *out;
- struct sort_iter sort_iter;
+ struct sort_iter_stack sort_iter;
struct bset_tree *t;
struct bset *start_bset = bset(b, &b->set[start_idx]);
bool used_mempool = false;
bool sorting_entire_node = start_idx == 0 &&
end_idx == b->nsets;
- sort_iter_init(&sort_iter, b);
+ sort_iter_stack_init(&sort_iter, b);
for (t = b->set + start_idx;
t < b->set + end_idx;
t++) {
u64s += le16_to_cpu(bset(b, t)->u64s);
- sort_iter_add(&sort_iter,
+ sort_iter_add(&sort_iter.iter,
btree_bkey_first(b, t),
btree_bkey_last(b, t));
}
start_time = local_clock();
- u64s = bch2_sort_keys(out->keys.start, &sort_iter, filter_whiteouts);
+ u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter, filter_whiteouts);
out->keys.u64s = cpu_to_le16(u64s);
start_bset->journal_seq = cpu_to_le64(seq);
if (sorting_entire_node) {
- unsigned u64s = le16_to_cpu(out->keys.u64s);
+ u64s = le16_to_cpu(out->keys.u64s);
BUG_ON(bytes != btree_bytes(c));
bch2_verify_btree_nr_keys(dst);
}
-#define SORT_CRIT (4096 / sizeof(u64))
-
/*
* We're about to add another bset to the btree node, so if there's currently
* too many bsets - sort some of them together:
bch2_trans_node_reinit_iter(trans, b);
}
-static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c,
- struct btree *b)
-{
- prt_printf(out, "%s level %u/%u\n ",
- bch2_btree_ids[b->c.btree_id],
- b->c.level,
- bch2_btree_id_root(c, b->c.btree_id)->level);
- bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
-}
-
static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
struct bch_dev *ca,
struct btree *b, struct bset *i,
if (ca)
prt_printf(out, "on %s ", ca->name);
prt_printf(out, "at btree ");
- btree_pos_to_text(out, c, b);
+ bch2_btree_pos_to_text(out, c, b);
prt_printf(out, "\n node offset %u", b->written);
if (i)
prt_str(out, ": ");
}
-enum btree_err_type {
- /*
- * We can repair this locally, and we're after the checksum check so
- * there's no need to try another replica:
- */
- BTREE_ERR_FIXABLE,
- /*
- * We can repair this if we have to, but we should try reading another
- * replica if we can:
- */
- BTREE_ERR_WANT_RETRY,
- /*
- * Read another replica if we have one, otherwise consider the whole
- * node bad:
- */
- BTREE_ERR_MUST_RETRY,
- BTREE_ERR_BAD_NODE,
- BTREE_ERR_INCOMPATIBLE,
-};
-
-enum btree_validate_ret {
- BTREE_RETRY_READ = 64,
-};
-
-static int __btree_err(enum btree_err_type type,
+__printf(9, 10)
+static int __btree_err(int ret,
struct bch_fs *c,
struct bch_dev *ca,
struct btree *b,
struct bset *i,
int write,
bool have_retry,
+ enum bch_sb_error_id err_type,
const char *fmt, ...)
{
struct printbuf out = PRINTBUF;
va_list args;
- int ret = -BCH_ERR_fsck_fix;
btree_err_msg(&out, c, ca, b, i, b->written, write);
goto out;
}
- if (!have_retry && type == BTREE_ERR_WANT_RETRY)
- type = BTREE_ERR_FIXABLE;
- if (!have_retry && type == BTREE_ERR_MUST_RETRY)
- type = BTREE_ERR_BAD_NODE;
+ if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry)
+ ret = -BCH_ERR_btree_node_read_err_fixable;
+ if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry)
+ ret = -BCH_ERR_btree_node_read_err_bad_node;
+
+ if (ret != -BCH_ERR_btree_node_read_err_fixable)
+ bch2_sb_error_count(c, err_type);
- switch (type) {
- case BTREE_ERR_FIXABLE:
- mustfix_fsck_err(c, "%s", out.buf);
+ switch (ret) {
+ case -BCH_ERR_btree_node_read_err_fixable:
+ ret = bch2_fsck_err(c, FSCK_CAN_FIX, err_type, "%s", out.buf);
+ if (ret != -BCH_ERR_fsck_fix &&
+ ret != -BCH_ERR_fsck_ignore)
+ goto fsck_err;
ret = -BCH_ERR_fsck_fix;
break;
- case BTREE_ERR_WANT_RETRY:
- case BTREE_ERR_MUST_RETRY:
+ case -BCH_ERR_btree_node_read_err_want_retry:
+ case -BCH_ERR_btree_node_read_err_must_retry:
bch2_print_string_as_lines(KERN_ERR, out.buf);
- ret = BTREE_RETRY_READ;
break;
- case BTREE_ERR_BAD_NODE:
+ case -BCH_ERR_btree_node_read_err_bad_node:
bch2_print_string_as_lines(KERN_ERR, out.buf);
bch2_topology_error(c);
ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: -EIO;
break;
- case BTREE_ERR_INCOMPATIBLE:
+ case -BCH_ERR_btree_node_read_err_incompatible:
bch2_print_string_as_lines(KERN_ERR, out.buf);
ret = -BCH_ERR_fsck_errors_not_fixed;
break;
return ret;
}
-#define btree_err(type, c, ca, b, i, msg, ...) \
+#define btree_err(type, c, ca, b, i, _err_type, msg, ...) \
({ \
- int _ret = __btree_err(type, c, ca, b, i, write, have_retry, msg, ##__VA_ARGS__);\
+ int _ret = __btree_err(type, c, ca, b, i, write, have_retry, \
+ BCH_FSCK_ERR_##_err_type, \
+ msg, ##__VA_ARGS__); \
\
- if (_ret != -BCH_ERR_fsck_fix) \
+ if (_ret != -BCH_ERR_fsck_fix) { \
+ ret = _ret; \
goto fsck_err; \
+ } \
+ \
*saw_error = true; \
})
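btree_err() is a statement expression: for anything other than a fixable
error that fsck elected to fix, it now stores the code in the enclosing
function's ret and jumps to its fsck_err label. A sketch of the minimal
caller shape (write, have_retry and saw_error must also be in scope, as in
the validate functions below):

    int ret = 0;

    btree_err_on(cond,
                 -BCH_ERR_btree_node_read_err_fixable,
                 c, ca, b, i,
                 bset_empty,        /* macro prepends BCH_FSCK_ERR_ */
                 "empty bset");
    return ret;
fsck_err:
    return ret;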
void bch2_btree_node_drop_keys_outside_node(struct btree *b)
{
struct bset_tree *t;
- struct bkey_s_c k;
- struct bkey unpacked;
- struct btree_node_iter iter;
for_each_bset(b, t) {
struct bset *i = bset(b, t);
bch2_bset_set_no_aux_tree(b, b->set);
bch2_btree_build_aux_trees(b);
+ struct bkey_s_c k;
+ struct bkey unpacked;
+ struct btree_node_iter iter;
for_each_btree_node_key_unpack(b, k, &iter, &unpacked) {
BUG_ON(bpos_lt(k.k->p, b->data->min_key));
BUG_ON(bpos_gt(k.k->p, b->data->max_key));
int write, bool have_retry, bool *saw_error)
{
unsigned version = le16_to_cpu(i->version);
- const char *err;
struct printbuf buf1 = PRINTBUF;
struct printbuf buf2 = PRINTBUF;
int ret = 0;
btree_err_on(!bch2_version_compatible(version),
- BTREE_ERR_INCOMPATIBLE, c, ca, b, i,
+ -BCH_ERR_btree_node_read_err_incompatible,
+ c, ca, b, i,
+ btree_node_unsupported_version,
"unsupported bset version %u.%u",
BCH_VERSION_MAJOR(version),
BCH_VERSION_MINOR(version));
if (btree_err_on(version < c->sb.version_min,
- BTREE_ERR_FIXABLE, c, NULL, b, i,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i,
+ btree_node_bset_older_than_sb_min,
"bset version %u older than superblock version_min %u",
version, c->sb.version_min)) {
mutex_lock(&c->sb_lock);
if (btree_err_on(BCH_VERSION_MAJOR(version) >
BCH_VERSION_MAJOR(c->sb.version),
- BTREE_ERR_FIXABLE, c, NULL, b, i,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i,
+ btree_node_bset_newer_than_sb,
"bset version %u newer than superblock version %u",
version, c->sb.version)) {
mutex_lock(&c->sb_lock);
}
btree_err_on(BSET_SEPARATE_WHITEOUTS(i),
- BTREE_ERR_INCOMPATIBLE, c, ca, b, i,
+ -BCH_ERR_btree_node_read_err_incompatible,
+ c, ca, b, i,
+ btree_node_unsupported_version,
"BSET_SEPARATE_WHITEOUTS no longer supported");
if (btree_err_on(offset + sectors > btree_sectors(c),
- BTREE_ERR_FIXABLE, c, ca, b, i,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, ca, b, i,
+ bset_past_end_of_btree_node,
"bset past end of btree node")) {
i->u64s = 0;
ret = 0;
}
btree_err_on(offset && !i->u64s,
- BTREE_ERR_FIXABLE, c, ca, b, i,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, ca, b, i,
+ bset_empty,
"empty bset");
- btree_err_on(BSET_OFFSET(i) &&
- BSET_OFFSET(i) != offset,
- BTREE_ERR_WANT_RETRY, c, ca, b, i,
+ btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset,
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, i,
+ bset_wrong_sector_offset,
"bset at wrong sector offset");
if (!offset) {
/* XXX endianness */
btree_err_on(bp->seq != bn->keys.seq,
- BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, NULL,
+ bset_bad_seq,
"incorrect sequence number (wrong btree node)");
}
btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id,
- BTREE_ERR_MUST_RETRY, c, ca, b, i,
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, i,
+ btree_node_bad_btree,
"incorrect btree id");
btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level,
- BTREE_ERR_MUST_RETRY, c, ca, b, i,
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, i,
+ btree_node_bad_level,
"incorrect level");
if (!write)
}
btree_err_on(!bpos_eq(b->data->min_key, bp->min_key),
- BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, NULL,
+ btree_node_bad_min_key,
"incorrect min_key: got %s should be %s",
(printbuf_reset(&buf1),
bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf),
}
btree_err_on(!bpos_eq(bn->max_key, b->key.k.p),
- BTREE_ERR_MUST_RETRY, c, ca, b, i,
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, i,
+ btree_node_bad_max_key,
"incorrect max key %s",
(printbuf_reset(&buf1),
bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf));
compat_btree_node(b->c.level, b->c.btree_id, version,
BSET_BIG_ENDIAN(i), write, bn);
- err = bch2_bkey_format_validate(&bn->format);
- btree_err_on(err,
- BTREE_ERR_BAD_NODE, c, ca, b, i,
- "invalid bkey format: %s", err);
+ btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1),
+ -BCH_ERR_btree_node_read_err_bad_node,
+ c, ca, b, i,
+ btree_node_bad_format,
+ "invalid bkey format: %s\n %s", buf1.buf,
+ (printbuf_reset(&buf2),
+ bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf));
+ printbuf_reset(&buf1);
compat_bformat(b->c.level, b->c.btree_id, version,
BSET_BIG_ENDIAN(i), write,
struct printbuf *err)
{
return __bch2_bkey_invalid(c, k, btree_node_type(b), READ, err) ?:
- (!updated_range ? bch2_bkey_in_btree_node(b, k, err) : 0) ?:
+ (!updated_range ? bch2_bkey_in_btree_node(c, b, k, err) : 0) ?:
(rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0);
}
struct bkey tmp;
if (btree_err_on(bkey_p_next(k) > vstruct_last(i),
- BTREE_ERR_FIXABLE, c, NULL, b, i,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i,
+ btree_node_bkey_past_bset_end,
"key extends past end of bset")) {
i->u64s = cpu_to_le16((u64 *) k - i->_data);
break;
}
if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
- BTREE_ERR_FIXABLE, c, NULL, b, i,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i,
+ btree_node_bkey_bad_format,
"invalid bkey format %u", k->format)) {
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
memmove_u64s_down(k, bkey_p_next(k),
printbuf_reset(&buf);
if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) {
printbuf_reset(&buf);
- prt_printf(&buf, "invalid bkey: ");
bset_key_invalid(c, b, u.s_c, updated_range, write, &buf);
prt_printf(&buf, "\n ");
bch2_bkey_val_to_text(&buf, c, u.s_c);
- btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf);
+ btree_err(-BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i,
+ btree_node_bad_bkey,
+ "invalid bkey: %s", buf.buf);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
memmove_u64s_down(k, bkey_p_next(k),
bch2_dump_bset(c, b, i, 0);
- if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf)) {
+ if (btree_err(-BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i,
+ btree_node_bkey_out_of_order,
+ "%s", buf.buf)) {
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
memmove_u64s_down(k, bkey_p_next(k),
(u64 *) vstruct_end(i) - (u64 *) k);
bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
unsigned u64s;
- unsigned blacklisted_written, nonblacklisted_written = 0;
unsigned ptr_written = btree_ptr_sectors_written(&b->key);
struct printbuf buf = PRINTBUF;
int ret = 0, retry_read = 0, write = READ;
b->written = 0;
iter = mempool_alloc(&c->fill_iter, GFP_NOFS);
- sort_iter_init(iter, b);
- iter->size = (btree_blocks(c) + 1) * 2;
+ sort_iter_init(iter, b, (btree_blocks(c) + 1) * 2);
if (bch2_meta_read_fault("btree"))
- btree_err(BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
+ btree_err(-BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, NULL,
+ btree_node_fault_injected,
"dynamic fault");
btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
- BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, NULL,
+ btree_node_bad_magic,
"bad magic: want %llx, got %llx",
bset_magic(c), le64_to_cpu(b->data->magic));
- btree_err_on(!b->data->keys.seq,
- BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
- "bad btree header: seq 0");
-
if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
struct bch_btree_ptr_v2 *bp =
&bkey_i_to_btree_ptr_v2(&b->key)->v;
btree_err_on(b->data->keys.seq != bp->seq,
- BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, NULL,
+ btree_node_bad_seq,
"got wrong btree node (seq %llx want %llx)",
b->data->keys.seq, bp->seq);
+ } else {
+ btree_err_on(!b->data->keys.seq,
+ -BCH_ERR_btree_node_read_err_must_retry,
+ c, ca, b, NULL,
+ btree_node_bad_seq,
+ "bad btree header: seq 0");
}
while (b->written < (ptr_written ?: btree_sectors(c))) {
unsigned sectors;
struct nonce nonce;
- struct bch_csum csum;
bool first = !b->written;
+ bool csum_bad;
if (!b->written) {
i = &b->data->keys;
btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
- BTREE_ERR_WANT_RETRY, c, ca, b, i,
- "unknown checksum type %llu",
- BSET_CSUM_TYPE(i));
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, i,
+ bset_unknown_csum,
+ "unknown checksum type %llu", BSET_CSUM_TYPE(i));
nonce = btree_nonce(i, b->written << 9);
- csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
- btree_err_on(bch2_crc_cmp(csum, b->data->csum),
- BTREE_ERR_WANT_RETRY, c, ca, b, i,
+ csum_bad = bch2_crc_cmp(b->data->csum,
+ csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data));
+ if (csum_bad)
+ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+
+ btree_err_on(csum_bad,
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, i,
+ bset_bad_csum,
"invalid checksum");
ret = bset_encrypt(c, i, b->written << 9);
btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
!BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
- BTREE_ERR_INCOMPATIBLE, c, NULL, b, NULL,
+ -BCH_ERR_btree_node_read_err_incompatible,
+ c, NULL, b, NULL,
+ btree_node_unsupported_version,
"btree node does not have NEW_EXTENT_OVERWRITE set");
sectors = vstruct_sectors(b->data, c->block_bits);
break;
btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
- BTREE_ERR_WANT_RETRY, c, ca, b, i,
- "unknown checksum type %llu",
- BSET_CSUM_TYPE(i));
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, i,
+ bset_unknown_csum,
+ "unknown checksum type %llu", BSET_CSUM_TYPE(i));
nonce = btree_nonce(i, b->written << 9);
- csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
-
- btree_err_on(bch2_crc_cmp(csum, bne->csum),
- BTREE_ERR_WANT_RETRY, c, ca, b, i,
+ csum_bad = bch2_crc_cmp(bne->csum,
+ csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne));
+ if (csum_bad)
+ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+
+ btree_err_on(csum_bad,
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, i,
+ bset_bad_csum,
"invalid checksum");
ret = bset_encrypt(c, i, b->written << 9);
true);
btree_err_on(blacklisted && first,
- BTREE_ERR_FIXABLE, c, ca, b, i,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, ca, b, i,
+ bset_blacklisted_journal_seq,
"first btree node bset has blacklisted journal seq (%llu)",
le64_to_cpu(i->journal_seq));
btree_err_on(blacklisted && ptr_written,
- BTREE_ERR_FIXABLE, c, ca, b, i,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, ca, b, i,
+ first_bset_blacklisted_journal_seq,
"found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u",
le64_to_cpu(i->journal_seq),
b->written, b->written + sectors, ptr_written);
sort_iter_add(iter,
vstruct_idx(i, 0),
vstruct_last(i));
-
- nonblacklisted_written = b->written;
}
if (ptr_written) {
btree_err_on(b->written < ptr_written,
- BTREE_ERR_WANT_RETRY, c, ca, b, NULL,
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, NULL,
+ btree_node_data_missing,
"btree node data missing: expected %u sectors, found %u",
ptr_written, b->written);
} else {
!bch2_journal_seq_is_blacklisted(c,
le64_to_cpu(bne->keys.journal_seq),
true),
- BTREE_ERR_WANT_RETRY, c, ca, b, NULL,
+ -BCH_ERR_btree_node_read_err_want_retry,
+ c, ca, b, NULL,
+ btree_node_bset_after_end,
"found bset signature after last bset");
-
- /*
- * Blacklisted bsets are those that were written after the most recent
- * (flush) journal write. Since there wasn't a flush, they may not have
- * made it to all devices - which means we shouldn't write new bsets
- * after them, as that could leave a gap and then reads from that device
- * wouldn't find all the bsets in that btree node - which means it's
- * important that we start writing new bsets after the most recent _non_
- * blacklisted bset:
- */
- blacklisted_written = b->written;
- b->written = nonblacklisted_written;
}
sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool);
prt_printf(&buf, "\n ");
bch2_bkey_val_to_text(&buf, c, u.s_c);
- btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf);
+ btree_err(-BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i,
+ btree_node_bad_bkey,
+ "%s", buf.buf);
btree_keys_account_key_drop(&b->nr, 0, k);
btree_node_reset_sib_u64s(b);
bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bch_dev *ca2 = bch_dev_bkey_exists(c, ptr->dev);
- if (ca->mi.state != BCH_MEMBER_STATE_rw)
+ if (ca2->mi.state != BCH_MEMBER_STATE_rw)
set_btree_node_need_rewrite(b);
}
printbuf_exit(&buf);
return retry_read;
fsck_err:
- if (ret == BTREE_RETRY_READ)
+ if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
+ ret == -BCH_ERR_btree_node_read_err_must_retry)
retry_read = 1;
else
set_btree_node_read_error(b);
}
start:
printbuf_reset(&buf);
- btree_pos_to_text(&buf, c, b);
- bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s",
+ bch2_btree_pos_to_text(&buf, c, b);
+ bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
+ "btree read error %s for %s",
bch2_blk_status_to_str(bio->bi_status), buf.buf);
if (rb->have_ioref)
percpu_ref_put(&ca->io_ref);
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
rb->start_time);
bio_put(&rb->bio);
- printbuf_exit(&buf);
if (saw_error && !btree_node_read_error(b)) {
- struct printbuf buf = PRINTBUF;
-
+ printbuf_reset(&buf);
bch2_bpos_to_text(&buf, b->key.k.p);
bch_info(c, "%s: rewriting btree node at btree=%s level=%u %s due to error",
- __func__, bch2_btree_ids[b->c.btree_id], b->c.level, buf.buf);
- printbuf_exit(&buf);
+ __func__, bch2_btree_id_str(b->c.btree_id), b->c.level, buf.buf);
bch2_btree_node_rewrite_async(c, b);
}
+ printbuf_exit(&buf);
clear_btree_node_read_in_flight(b);
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
}
}
written2 = btree_node_sectors_written(c, ra->buf[i]);
- if (btree_err_on(written2 != written, BTREE_ERR_FIXABLE, c, NULL, b, NULL,
+ if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, NULL,
+ btree_node_replicas_sectors_written_mismatch,
"btree node sectors written mismatch: %u != %u",
written, written2) ||
btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]),
- BTREE_ERR_FIXABLE, c, NULL, b, NULL,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, NULL,
+ btree_node_bset_after_end,
"found bset signature after last bset") ||
btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9),
- BTREE_ERR_FIXABLE, c, NULL, b, NULL,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, NULL,
+ btree_node_replicas_data_mismatch,
"btree node replicas content mismatch"))
dump_bset_maps = true;
struct printbuf buf = PRINTBUF;
prt_str(&buf, "btree node read error: no device to read from\n at ");
- btree_pos_to_text(&buf, c, b);
+ bch2_btree_pos_to_text(&buf, c, b);
bch_err(c, "%s", buf.buf);
if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
const struct bkey_i *k, unsigned level)
{
- return bch2_trans_run(c, __bch2_btree_root_read(&trans, id, k, level));
-
+ return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level));
}
void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
static void btree_node_write_done(struct bch_fs *c, struct btree *b)
{
- struct btree_trans trans;
-
- bch2_trans_init(&trans, c, 0, 0);
+ struct btree_trans *trans = bch2_trans_get(c);
- btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
__btree_node_write_done(c, b);
six_unlock_read(&b->c.lock);
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
}
static void btree_node_write_work(struct work_struct *work)
}
} else {
ret = bch2_trans_do(c, NULL, NULL, 0,
- bch2_btree_node_update_key_get_iter(&trans, b, &wbio->key,
+ bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
BCH_WATERMARK_reclaim|
- BTREE_INSERT_JOURNAL_RECLAIM|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_NOCHECK_RW,
+ BCH_TRANS_COMMIT_journal_reclaim|
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_no_check_rw,
!wbio->wbio.failed.nr));
if (ret)
goto err;
if (wbio->have_ioref)
bch2_latency_acct(ca, wbio->submit_time, WRITE);
- if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write error: %s",
+ if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
+ "btree write error: %s",
bch2_blk_status_to_str(bio->bi_status)) ||
bch2_meta_write_fault("btree")) {
spin_lock_irqsave(&c->btree_write_error_lock, flags);
struct bset *i;
struct btree_node *bn = NULL;
struct btree_node_entry *bne = NULL;
- struct sort_iter sort_iter;
+ struct sort_iter_stack sort_iter;
struct nonce nonce;
unsigned bytes_to_write, sectors_to_write, bytes, u64s;
u64 seq = 0;
bch2_sort_whiteouts(c, b);
- sort_iter_init(&sort_iter, b);
+ sort_iter_stack_init(&sort_iter, b);
bytes = !b->written
? sizeof(struct btree_node)
continue;
bytes += le16_to_cpu(i->u64s) * sizeof(u64);
- sort_iter_add(&sort_iter,
+ sort_iter_add(&sort_iter.iter,
btree_bkey_first(b, t),
btree_bkey_last(b, t));
seq = max(seq, le64_to_cpu(i->journal_seq));
i->journal_seq = cpu_to_le64(seq);
i->u64s = 0;
- sort_iter_add(&sort_iter,
+ sort_iter_add(&sort_iter.iter,
unwritten_whiteouts_start(c, b),
unwritten_whiteouts_end(c, b));
SET_BSET_SEPARATE_WHITEOUTS(i, false);
b->whiteout_u64s = 0;
- u64s = bch2_sort_keys(i->start, &sort_iter, false);
+ u64s = bch2_sort_keys(i->start, &sort_iter.iter, false);
le16_add_cpu(&i->u64s, u64s);
BUG_ON(!b->written && i->u64s != b->data->keys.u64s);
#include "btree_locking.h"
#include "checksum.h"
#include "extents.h"
-#include "io_types.h"
+#include "io_write_types.h"
struct bch_fs;
struct btree_write;
__BTREE_WRITE_ONLY_IF_NEED = BTREE_WRITE_TYPE_BITS,
__BTREE_WRITE_ALREADY_STARTED,
};
-#define BTREE_WRITE_ONLY_IF_NEED (1U << __BTREE_WRITE_ONLY_IF_NEED )
-#define BTREE_WRITE_ALREADY_STARTED (1U << __BTREE_WRITE_ALREADY_STARTED)
+#define BTREE_WRITE_ONLY_IF_NEED BIT(__BTREE_WRITE_ONLY_IF_NEED)
+#define BTREE_WRITE_ALREADY_STARTED BIT(__BTREE_WRITE_ALREADY_STARTED)
void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned);
void bch2_btree_node_write(struct bch_fs *, struct btree *,
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_iter.h"
+#include "btree_journal_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "error.h"
#include "extents.h"
#include "journal.h"
-#include "recovery.h"
#include "replicas.h"
-#include "subvolume.h"
+#include "snapshot.h"
#include "trace.h"
#include <linux/random.h>
BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
- !btree_type_has_snapshots(iter->btree_id));
+ !btree_type_has_snapshot_field(iter->btree_id));
if (iter->update_path)
bch2_btree_path_verify(trans, iter->update_path);
bch2_bpos_to_text(&buf, pos);
panic("not locked: %s %s%s\n",
- bch2_btree_ids[id], buf.buf,
+ bch2_btree_id_str(id), buf.buf,
key_cache ? " cached" : "");
}
if (!bch2_btree_node_iter_end(node_iter) &&
iter_current_key_modified &&
b->c.level) {
- struct bset_tree *t;
struct bkey_packed *k, *k2, *p;
k = bch2_btree_node_iter_peek_all(node_iter, b);
if (t != BTREE_NODE_UNLOCKED) {
btree_node_unlock(trans, path, b->c.level);
six_lock_increment(&b->c.lock, (enum six_lock_type) t);
- mark_btree_node_locked(trans, path, b->c.level, (enum six_lock_type) t);
+ mark_btree_node_locked(trans, path, b->c.level, t);
}
bch2_btree_path_level_init(trans, path, b);
for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
path->l[i].b = NULL;
- mark_btree_node_locked(trans, path, path->level, lock_type);
+ mark_btree_node_locked(trans, path, path->level,
+ (enum btree_node_locked_type) lock_type);
bch2_btree_path_level_init(trans, path, b);
return 0;
}
if (btree_node_read_locked(path, level + 1))
btree_node_unlock(trans, path, level + 1);
- mark_btree_node_locked(trans, path, level, lock_type);
+ mark_btree_node_locked(trans, path, level,
+ (enum btree_node_locked_type) lock_type);
path->level = level;
bch2_btree_path_level_init(trans, path, b);
/*
* We used to assert that all paths had been traversed here
* (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since
- * path->Should_be_locked is not set yet, we we might have unlocked and
+ * path->should_be_locked is not set yet, we might have unlocked and
* then failed to relock a path - that's fine.
*/
err:
if (unlikely(ret))
goto out;
+ if (unlikely(!trans->srcu_held))
+ bch2_trans_srcu_lock(trans);
+
/*
* Ensure we obey path->should_be_locked: if it's set, we can't unlock
* and re-traverse the path without a transaction restart:
struct btree_path *path, struct bpos new_pos,
bool intent, unsigned long ip, int cmp)
{
- unsigned level = path->level;
-
bch2_trans_verify_not_in_restart(trans);
EBUG_ON(!path->ref);
goto out;
}
- level = btree_path_up_until_good_node(trans, path, cmp);
+ unsigned level = btree_path_up_until_good_node(trans, path, cmp);
if (btree_path_node(path, level)) {
struct btree_path_level *l = &path->l[level];
__bch2_path_free(trans, path);
}
-void bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
+void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
{
panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
trans->restart_count, restart_count,
(void *) trans->last_begin_ip);
}
-void bch2_trans_in_restart_error(struct btree_trans *trans)
+void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
{
panic("in transaction restart: %s, last restarted by %pS\n",
bch2_err_str(trans->restarted),
struct bkey_s_c old = { &i->old_k, i->old_v };
prt_printf(buf, "update: btree=%s cached=%u %pS",
- bch2_btree_ids[i->btree_id],
+ bch2_btree_id_str(i->btree_id),
i->cached,
(void *) i->ip_allocated);
prt_newline(buf);
trans_for_each_wb_update(trans, wb) {
prt_printf(buf, "update: btree=%s wb=1 %pS",
- bch2_btree_ids[wb->btree],
+ bch2_btree_id_str(wb->btree),
(void *) i->ip_allocated);
prt_newline(buf);
path->idx, path->ref, path->intent_ref,
path->preserve ? 'P' : ' ',
path->should_be_locked ? 'S' : ' ',
- bch2_btree_ids[path->btree_id],
+ bch2_btree_id_str(path->btree_id),
path->level);
bch2_bpos_to_text(out, path->pos);
static noinline void btree_path_overflow(struct btree_trans *trans)
{
bch2_dump_trans_paths_updates(trans);
- panic("trans path oveflow\n");
+ panic("trans path overflow\n");
}
static inline struct btree_path *btree_path_alloc(struct btree_trans *trans,
path->ref = 0;
path->intent_ref = 0;
path->nodes_locked = 0;
+ path->alloc_seq++;
btree_path_list_add(trans, pos, path);
trans->paths_sorted = false;
locks_want = min(locks_want, BTREE_MAX_DEPTH);
if (locks_want > path->locks_want)
- bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want);
+ bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL);
return path;
}
inline bool bch2_btree_iter_advance(struct btree_iter *iter)
{
- if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) {
- struct bpos pos = iter->k.p;
- bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
- ? bpos_eq(pos, SPOS_MAX)
- : bkey_eq(pos, SPOS_MAX));
-
- if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
- pos = bkey_successor(iter, pos);
- bch2_btree_iter_set_pos(iter, pos);
- return ret;
- } else {
- if (!btree_path_node(iter->path, iter->path->level))
- return true;
+ struct bpos pos = iter->k.p;
+ bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
+ ? bpos_eq(pos, SPOS_MAX)
+ : bkey_eq(pos, SPOS_MAX));
- iter->advanced = true;
- return false;
- }
+ if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+ pos = bkey_successor(iter, pos);
+ bch2_btree_iter_set_pos(iter, pos);
+ return ret;
}
inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
}
/**
- * bch2_btree_iter_peek: returns first key greater than or equal to iterator's
- * current position
+ * bch2_btree_iter_peek_upto() - returns first key greater than or equal to
+ * iterator's current position
+ * @iter: iterator to peek from
+ * @end: search limit: returns keys less than or equal to @end
+ *
+ * Returns: key if found, or an error extractable with bkey_err().
*/
struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end)
{
struct bpos iter_pos;
int ret;
- EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX));
if (iter->update_path) {
}
/**
- * bch2_btree_iter_peek_all_levels: returns the first key greater than or equal
- * to iterator's current position, returning keys from every level of the btree.
- * For keys at different levels of the btree that compare equal, the key from
- * the lower level (leaf) is returned first.
- */
-struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter)
-{
- struct btree_trans *trans = iter->trans;
- struct bkey_s_c k;
- int ret;
-
- EBUG_ON(iter->path->cached);
- bch2_btree_iter_verify(iter);
- BUG_ON(iter->path->level < iter->min_depth);
- BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
- EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS));
-
- while (1) {
- iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos,
- iter->flags & BTREE_ITER_INTENT,
- btree_iter_ip_allocated(iter));
-
- ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
- if (unlikely(ret)) {
- /* ensure that iter->k is consistent with iter->pos: */
- bch2_btree_iter_set_pos(iter, iter->pos);
- k = bkey_s_c_err(ret);
- goto out_no_locked;
- }
-
- /* Already at end? */
- if (!btree_path_node(iter->path, iter->path->level)) {
- k = bkey_s_c_null;
- goto out_no_locked;
- }
-
- k = btree_path_level_peek_all(trans->c,
- &iter->path->l[iter->path->level], &iter->k);
-
- /* Check if we should go up to the parent node: */
- if (!k.k ||
- (iter->advanced &&
- bpos_eq(path_l(iter->path)->b->key.k.p, iter->pos))) {
- iter->pos = path_l(iter->path)->b->key.k.p;
- btree_path_set_level_up(trans, iter->path);
- iter->advanced = false;
- continue;
- }
-
- /*
- * Check if we should go back down to a leaf:
- * If we're not in a leaf node, we only return the current key
- * if it exactly matches iter->pos - otherwise we first have to
- * go back to the leaf:
- */
- if (iter->path->level != iter->min_depth &&
- (iter->advanced ||
- !k.k ||
- !bpos_eq(iter->pos, k.k->p))) {
- btree_path_set_level_down(trans, iter->path, iter->min_depth);
- iter->pos = bpos_successor(iter->pos);
- iter->advanced = false;
- continue;
- }
-
- /* Check if we should go to the next key: */
- if (iter->path->level == iter->min_depth &&
- iter->advanced &&
- k.k &&
- bpos_eq(iter->pos, k.k->p)) {
- iter->pos = bpos_successor(iter->pos);
- iter->advanced = false;
- continue;
- }
-
- if (iter->advanced &&
- iter->path->level == iter->min_depth &&
- !bpos_eq(k.k->p, iter->pos))
- iter->advanced = false;
-
- BUG_ON(iter->advanced);
- BUG_ON(!k.k);
- break;
- }
-
- iter->pos = k.k->p;
- btree_path_set_should_be_locked(iter->path);
-out_no_locked:
- bch2_btree_iter_verify(iter);
-
- return k;
-}
-
-/**
- * bch2_btree_iter_next: returns first key greater than iterator's current
+ * bch2_btree_iter_next() - returns first key greater than iterator's current
* position
+ * @iter: iterator to peek from
+ *
+ * Returns: key if found, or an error extractable with bkey_err().
*/
struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
{
}
/**
- * bch2_btree_iter_peek_prev: returns first key less than or equal to
+ * bch2_btree_iter_peek_prev() - returns first key less than or equal to
* iterator's current position
+ * @iter: iterator to peek from
+ *
+ * Returns: key if found, or an error extractable with bkey_err().
*/
struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
{
}
/**
- * bch2_btree_iter_prev: returns first key less than iterator's current
+ * bch2_btree_iter_prev() - returns first key less than iterator's current
* position
+ * @iter: iterator to peek from
+ *
+ * Returns: key if found, or an error extractable with bkey_err().
*/
struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
{
bch2_btree_iter_verify(iter);
bch2_btree_iter_verify_entry_exit(iter);
- EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE));
/* extents can't span inode numbers: */
void bch2_trans_iter_init_outlined(struct btree_trans *trans,
struct btree_iter *iter,
- unsigned btree_id, struct bpos pos,
+ enum btree_id btree_id, struct bpos pos,
unsigned flags)
{
bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
unsigned depth,
unsigned flags)
{
- flags |= BTREE_ITER_NOT_EXTENTS;
- flags |= __BTREE_ITER_ALL_SNAPSHOTS;
- flags |= BTREE_ITER_ALL_SNAPSHOTS;
+ flags |= BTREE_ITER_NOT_EXTENTS;
+ flags |= __BTREE_ITER_ALL_SNAPSHOTS;
+ flags |= BTREE_ITER_ALL_SNAPSHOTS;
bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth,
__bch2_btree_iter_flags(trans, btree_id, flags),
return p;
}
-static noinline void bch2_trans_reset_srcu_lock(struct btree_trans *trans)
+static inline void check_srcu_held_too_long(struct btree_trans *trans)
{
- struct bch_fs *c = trans->c;
- struct btree_path *path;
+ WARN(trans->srcu_held && time_after(jiffies, trans->srcu_lock_time + HZ * 10),
+ "btree trans held srcu lock (delaying memory reclaim) for %lu seconds",
+ (jiffies - trans->srcu_lock_time) / HZ);
+}
- trans_for_each_path(trans, path)
- if (path->cached && !btree_node_locked(path, 0))
- path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset);
+void bch2_trans_srcu_unlock(struct btree_trans *trans)
+{
+ if (trans->srcu_held) {
+ struct bch_fs *c = trans->c;
+ struct btree_path *path;
- srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
- trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
- trans->srcu_lock_time = jiffies;
+ trans_for_each_path(trans, path)
+ if (path->cached && !btree_node_locked(path, 0))
+ path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset);
+
+ check_srcu_held_too_long(trans);
+ srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
+ trans->srcu_held = false;
+ }
+}
+
+void bch2_trans_srcu_lock(struct btree_trans *trans)
+{
+ if (!trans->srcu_held) {
+ trans->srcu_idx = srcu_read_lock(&trans->c->btree_trans_barrier);
+ trans->srcu_lock_time = jiffies;
+ trans->srcu_held = true;
+ }
}
/**
 * bch2_trans_begin() - reset a transaction after an interrupted attempt
* @trans: transaction to reset
*
+ * Returns: current restart counter, to be used with trans_was_restarted()
+ *
 * While iterating over nodes or updating nodes an attempt to lock a btree node
 * may return BCH_ERR_transaction_restart when the trylock fails. When this
 * occurs, bch2_trans_begin() should be called and the transaction retried.
}
trans->last_begin_time = now;
- if (unlikely(time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10))))
- bch2_trans_reset_srcu_lock(trans);
+ if (unlikely(trans->srcu_held &&
+ time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10))))
+ bch2_trans_srcu_unlock(trans);
trans->last_begin_ip = _RET_IP_;
if (trans->restarted) {
return trans->restart_count;
}
-static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c)
+static struct btree_trans *bch2_trans_alloc(struct bch_fs *c)
{
- size_t paths_bytes = sizeof(struct btree_path) * BTREE_ITER_MAX;
- size_t updates_bytes = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX;
- void *p = NULL;
+ struct btree_trans *trans;
- BUG_ON(trans->used_mempool);
+ if (IS_ENABLED(__KERNEL__)) {
+ trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL);
+ if (trans)
+ return trans;
+ }
-#ifdef __KERNEL__
- p = this_cpu_xchg(c->btree_paths_bufs->path, NULL);
-#endif
- if (!p)
- p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS);
+ trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
/*
- * paths need to be zeroed, bch2_check_for_deadlock looks at paths in
- * other threads
+ * paths need to be zeroed, bch2_check_for_deadlock looks at
+ * paths in other threads
*/
-
- trans->paths = p; p += paths_bytes;
- trans->updates = p; p += updates_bytes;
+ memset(&trans->paths, 0, sizeof(trans->paths));
+ return trans;
}
const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
return i;
}
-void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_idx)
+struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
__acquires(&c->btree_trans_barrier)
{
+ struct btree_trans *trans;
struct btree_transaction_stats *s;
bch2_assert_btree_nodes_not_locked();
+ trans = bch2_trans_alloc(c);
+
memset(trans, 0, sizeof(*trans));
trans->c = c;
trans->fn = fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns)
!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
closure_init_stack(&trans->ref);
- bch2_trans_alloc_paths(trans, c);
-
s = btree_trans_stats(trans);
if (s && s->max_mem) {
unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
trans->wb_updates_size = s->wb_updates_size;
}
- trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
+ trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
trans->srcu_lock_time = jiffies;
+ trans->srcu_held = true;
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
struct btree_trans *pos;
list_add_done:
seqmutex_unlock(&c->btree_trans_lock);
}
+
+ return trans;
}
static void check_btree_paths_leaked(struct btree_trans *trans)
trans_for_each_path(trans, path)
if (path->ref)
printk(KERN_ERR " btree %s %pS\n",
- bch2_btree_ids[path->btree_id],
+ bch2_btree_id_str(path->btree_id),
(void *) path->ip_allocated);
/* Be noisy about this: */
bch2_fatal_error(c);
#endif
}
-void bch2_trans_exit(struct btree_trans *trans)
+void bch2_trans_put(struct btree_trans *trans)
__releases(&c->btree_trans_barrier)
{
struct btree_insert_entry *i;
check_btree_paths_leaked(trans);
- srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
-
- bch2_journal_preres_put(&c->journal, &trans->journal_preres);
+ if (trans->srcu_held) {
+ check_srcu_held_too_long(trans);
+ srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
+ }
kfree(trans->extra_journal_entries.data);
else
kfree(trans->mem);
-#ifdef __KERNEL__
- /*
- * Userspace doesn't have a real percpu implementation:
- */
- trans->paths = this_cpu_xchg(c->btree_paths_bufs->path, trans->paths);
-#endif
-
- if (trans->paths)
- mempool_free(trans->paths, &c->btree_paths_pool);
-
- trans->mem = (void *) 0x1;
- trans->paths = (void *) 0x1;
+ /* Userspace doesn't have a real percpu implementation: */
+ if (IS_ENABLED(__KERNEL__))
+ trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans);
+ if (trans)
+ mempool_free(trans, &c->btree_trans_pool);
}
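bch2_trans_get()/bch2_trans_put() now recycle a whole transaction object per
CPU via this_cpu_xchg(), with the mempool as fallback, instead of swapping
only the paths buffer. The hand-off in sketch form:

    /* get: take the per-CPU cached trans, leaving NULL behind */
    trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL);
    if (!trans)
            trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);

    /* put: stash ours back; free whichever object got displaced */
    trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans);
    if (trans)
            mempool_free(trans, &c->btree_trans_pool);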
static void __maybe_unused
prt_tab(out);
prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b',
- b->level, bch2_btree_ids[b->btree_id]);
+ b->level, bch2_btree_id_str(b->btree_id));
bch2_bpos_to_text(out, btree_node_pos(b));
prt_tab(out);
path->idx,
path->cached ? 'c' : 'b',
path->level,
- bch2_btree_ids[path->btree_id]);
+ bch2_btree_id_str(path->btree_id));
bch2_bpos_to_text(out, path->pos);
prt_newline(out);
void bch2_fs_btree_iter_exit(struct bch_fs *c)
{
struct btree_transaction_stats *s;
+ struct btree_trans *trans;
+ int cpu;
+
+ trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list);
+ if (trans)
+ panic("%s leaked btree_trans\n", trans->fn);
+
+ if (c->btree_trans_bufs)
+ for_each_possible_cpu(cpu)
+ kfree(per_cpu_ptr(c->btree_trans_bufs, cpu)->trans);
+ free_percpu(c->btree_trans_bufs);
for (s = c->btree_transaction_stats;
s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
if (c->btree_trans_barrier_initialized)
cleanup_srcu_struct(&c->btree_trans_barrier);
mempool_exit(&c->btree_trans_mem_pool);
- mempool_exit(&c->btree_paths_pool);
+ mempool_exit(&c->btree_trans_pool);
}
int bch2_fs_btree_iter_init(struct bch_fs *c)
{
struct btree_transaction_stats *s;
- unsigned nr = BTREE_ITER_MAX;
int ret;
for (s = c->btree_transaction_stats;
INIT_LIST_HEAD(&c->btree_trans_list);
seqmutex_init(&c->btree_trans_lock);
- ret = mempool_init_kmalloc_pool(&c->btree_paths_pool, 1,
- sizeof(struct btree_path) * nr +
- sizeof(struct btree_insert_entry) * nr) ?:
+ c->btree_trans_bufs = alloc_percpu(struct btree_trans_buf);
+ if (!c->btree_trans_bufs)
+ return -ENOMEM;
+
+ ret = mempool_init_kmalloc_pool(&c->btree_trans_pool, 1,
+ sizeof(struct btree_trans)) ?:
mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
BTREE_TRANS_MEM_MAX) ?:
init_srcu_struct(&c->btree_trans_barrier);
unsigned, unsigned, unsigned, unsigned long);
struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *);
+/*
+ * bch2_btree_path_peek_slot() for a cached iterator might return a key in a
+ * different snapshot:
+ */
+static inline struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u)
+{
+ struct bkey_s_c k = bch2_btree_path_peek_slot(path, u);
+
+ if (k.k && bpos_eq(path->pos, k.k->p))
+ return k;
+
+ bkey_init(u);
+ u->p = path->pos;
+ return (struct bkey_s_c) { u, NULL };
+}
+
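A caller that must not observe a key from a different snapshot checks for
the synthesized whiteout, e.g. (illustrative only):

    struct bkey u;
    struct bkey_s_c k = bch2_btree_path_peek_slot_exact(path, &u);

    if (bkey_deleted(k.k)) {
            /* nothing at exactly path->pos in this snapshot */
    }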
struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *,
struct btree_iter *, struct bpos);
int bch2_trans_relock(struct btree_trans *);
int bch2_trans_relock_notrace(struct btree_trans *);
void bch2_trans_unlock(struct btree_trans *);
+void bch2_trans_unlock_long(struct btree_trans *);
bool bch2_trans_locked(struct btree_trans *);
-static inline bool trans_was_restarted(struct btree_trans *trans, u32 restart_count)
+static inline int trans_was_restarted(struct btree_trans *trans, u32 restart_count)
{
- return restart_count != trans->restart_count;
+ return restart_count != trans->restart_count
+ ? -BCH_ERR_transaction_restart_nested
+ : 0;
}
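Returning an int here lets callers fold the nested-restart signal straight
into their error return; a sketch of the new convention, where some_work()
is hypothetical:

    u32 restart_count = trans->restart_count;

    ret = some_work(trans);

    /* 0 if nothing restarted, else -BCH_ERR_transaction_restart_nested */
    return ret ?: trans_was_restarted(trans, restart_count);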
-void bch2_trans_restart_error(struct btree_trans *, u32);
+void __noreturn bch2_trans_restart_error(struct btree_trans *, u32);
static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans,
u32 restart_count)
bch2_trans_restart_error(trans, restart_count);
}
-void bch2_trans_in_restart_error(struct btree_trans *);
+void __noreturn bch2_trans_in_restart_error(struct btree_trans *);
static inline void bch2_trans_verify_not_in_restart(struct btree_trans *trans)
{
struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos);
struct bkey_s_c bch2_btree_iter_next(struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *);
-
static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
{
return bch2_btree_iter_peek_upto(iter, SPOS_MAX);
unsigned btree_id,
unsigned flags)
{
- if (flags & BTREE_ITER_ALL_LEVELS)
- flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS;
-
if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) &&
- btree_node_type_is_extents(btree_id))
+ btree_id_is_extents(btree_id))
flags |= BTREE_ITER_IS_EXTENTS;
if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
- !btree_type_has_snapshots(btree_id))
+ !btree_type_has_snapshot_field(btree_id))
flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) &&
}
void bch2_trans_iter_init_outlined(struct btree_trans *, struct btree_iter *,
- unsigned, struct bpos, unsigned);
+ enum btree_id, struct bpos, unsigned);
static inline void bch2_trans_iter_init(struct btree_trans *trans,
struct btree_iter *iter,
__bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, \
KEY_TYPE_##_type, sizeof(*_val), _val)
+void bch2_trans_srcu_unlock(struct btree_trans *);
+void bch2_trans_srcu_lock(struct btree_trans *);
+
u32 bch2_trans_begin(struct btree_trans *);
/*
static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter,
unsigned flags)
{
- BUG_ON(flags & BTREE_ITER_ALL_LEVELS);
-
return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) :
bch2_btree_iter_peek_prev(iter);
}
static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter,
unsigned flags)
{
- return flags & BTREE_ITER_ALL_LEVELS ? bch2_btree_iter_peek_all_levels(iter) :
- flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) :
+ return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) :
bch2_btree_iter_peek(iter);
}
#define lockrestart_do(_trans, _do) \
({ \
u32 _restart_count; \
- int _ret; \
+ int _ret2; \
\
do { \
_restart_count = bch2_trans_begin(_trans); \
- _ret = (_do); \
- } while (bch2_err_matches(_ret, BCH_ERR_transaction_restart)); \
+ _ret2 = (_do); \
+ } while (bch2_err_matches(_ret2, BCH_ERR_transaction_restart)); \
\
- if (!_ret) \
+ if (!_ret2) \
bch2_trans_verify_not_restarted(_trans, _restart_count);\
\
- _ret; \
+ _ret2; \
})
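/*
 * Example (sketch): retrying a single lookup until it stops returning a
 * transaction restart; bkey_err() pulls the error, if any, out of the
 * returned key:
 *
 *	ret = lockrestart_do(trans,
 *		bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
 */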
/*
#define nested_lockrestart_do(_trans, _do) \
({ \
u32 _restart_count, _orig_restart_count; \
- int _ret; \
+ int _ret2; \
\
_restart_count = _orig_restart_count = (_trans)->restart_count; \
\
- while (bch2_err_matches(_ret = (_do), BCH_ERR_transaction_restart))\
+ while (bch2_err_matches(_ret2 = (_do), BCH_ERR_transaction_restart))\
_restart_count = bch2_trans_begin(_trans); \
\
- if (!_ret) \
+ if (!_ret2) \
bch2_trans_verify_not_restarted(_trans, _restart_count);\
\
- if (!_ret && trans_was_restarted(_trans, _orig_restart_count)) \
- _ret = -BCH_ERR_transaction_restart_nested; \
- \
- _ret; \
+ _ret2 ?: trans_was_restarted(_trans, _restart_count); \
})
#define for_each_btree_key2(_trans, _iter, _btree_id, \
_start, _flags, _k, _do) \
({ \
- int _ret = 0; \
+ int _ret3 = 0; \
\
bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
(_start), (_flags)); \
while (1) { \
u32 _restart_count = bch2_trans_begin(_trans); \
\
- _ret = 0; \
+ _ret3 = 0; \
(_k) = bch2_btree_iter_peek_type(&(_iter), (_flags)); \
if (!(_k).k) \
break; \
\
- _ret = bkey_err(_k) ?: (_do); \
- if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+ _ret3 = bkey_err(_k) ?: (_do); \
+ if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\
continue; \
- if (_ret) \
+ if (_ret3) \
break; \
bch2_trans_verify_not_restarted(_trans, _restart_count);\
if (!bch2_btree_iter_advance(&(_iter))) \
} \
\
bch2_trans_iter_exit((_trans), &(_iter)); \
- _ret; \
+ _ret3; \
})
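/*
 * Example (sketch; check_key() is a stand-in for any per-key helper):
 *
 *	ret = for_each_btree_key2(trans, iter, BTREE_ID_extents,
 *				  POS_MIN, BTREE_ITER_PREFETCH, k,
 *		check_key(trans, &iter, k));
 */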
#define for_each_btree_key2_upto(_trans, _iter, _btree_id, \
_start, _end, _flags, _k, _do) \
({ \
- int _ret = 0; \
+ int _ret3 = 0; \
\
bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
(_start), (_flags)); \
while (1) { \
u32 _restart_count = bch2_trans_begin(_trans); \
\
- _ret = 0; \
+ _ret3 = 0; \
(_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, (_flags));\
if (!(_k).k) \
break; \
\
- _ret = bkey_err(_k) ?: (_do); \
- if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+ _ret3 = bkey_err(_k) ?: (_do); \
+ if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\
continue; \
- if (_ret) \
+ if (_ret3) \
break; \
bch2_trans_verify_not_restarted(_trans, _restart_count);\
if (!bch2_btree_iter_advance(&(_iter))) \
} \
\
bch2_trans_iter_exit((_trans), &(_iter)); \
- _ret; \
+ _ret3; \
})
#define for_each_btree_key_reverse(_trans, _iter, _btree_id, \
_start, _flags, _k, _do) \
({ \
- int _ret = 0; \
+ int _ret3 = 0; \
\
bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
(_start), (_flags)); \
u32 _restart_count = bch2_trans_begin(_trans); \
(_k) = bch2_btree_iter_peek_prev_type(&(_iter), (_flags));\
if (!(_k).k) { \
- _ret = 0; \
+ _ret3 = 0; \
break; \
} \
\
- _ret = bkey_err(_k) ?: (_do); \
- if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+ _ret3 = bkey_err(_k) ?: (_do); \
+ if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\
continue; \
- if (_ret) \
+ if (_ret3) \
break; \
bch2_trans_verify_not_restarted(_trans, _restart_count);\
if (!bch2_btree_iter_rewind(&(_iter))) \
} \
\
bch2_trans_iter_exit((_trans), &(_iter)); \
- _ret; \
+ _ret3; \
})
#define for_each_btree_key_commit(_trans, _iter, _btree_id, \
void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
void bch2_dump_trans_updates(struct btree_trans *);
void bch2_dump_trans_paths_updates(struct btree_trans *);
-void __bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned);
-void bch2_trans_exit(struct btree_trans *);
+
+struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned);
+void bch2_trans_put(struct btree_trans *);
extern const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
unsigned bch2_trans_get_fn_idx(const char *);
-#define bch2_trans_init(_trans, _c, _nr_iters, _mem) \
-do { \
+#define bch2_trans_get(_c) \
+({ \
static unsigned trans_fn_idx; \
\
if (unlikely(!trans_fn_idx)) \
trans_fn_idx = bch2_trans_get_fn_idx(__func__); \
- \
- __bch2_trans_init(_trans, _c, trans_fn_idx); \
-} while (0)
+ __bch2_trans_get(_c, trans_fn_idx); \
+})
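+/*
+ * Typical usage (sketch; do_stuff() is a stand-in), now that transactions
+ * are allocated out of btree_trans_pool instead of living on the stack:
+ *
+ *	struct btree_trans *trans = bch2_trans_get(c);
+ *	ret = do_stuff(trans);
+ *	bch2_trans_put(trans);
+ */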
void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *);
ck->btree_trans_barrier_seq =
start_poll_synchronize_srcu(&c->btree_trans_barrier);
- if (ck->c.lock.readers)
+ if (ck->c.lock.readers) {
list_move_tail(&ck->list, &bc->freed_pcpu);
- else
+ bc->nr_freed_pcpu++;
+ } else {
list_move_tail(&ck->list, &bc->freed_nonpcpu);
+ bc->nr_freed_nonpcpu++;
+ }
atomic_long_inc(&bc->nr_freed);
kfree(ck->k);
{
struct bkey_cached *pos;
+ bc->nr_freed_nonpcpu++;
+
list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
pos->btree_trans_barrier_seq)) {
#else
mutex_lock(&bc->lock);
list_move_tail(&ck->list, &bc->freed_nonpcpu);
+ bc->nr_freed_nonpcpu++;
mutex_unlock(&bc->lock);
#endif
} else {
f->nr < ARRAY_SIZE(f->objs) / 2) {
ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
list_del_init(&ck->list);
+ bc->nr_freed_nonpcpu--;
f->objs[f->nr++] = ck;
}
if (!list_empty(&bc->freed_nonpcpu)) {
ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
list_del_init(&ck->list);
+ bc->nr_freed_nonpcpu--;
}
mutex_unlock(&bc->lock);
#endif
}
if (ck) {
- int ret;
-
ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_);
if (unlikely(ret)) {
bkey_cached_move_to_freelist(bc, ck);
path->l[0].b = (void *) ck;
path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
- mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
ret = bch2_btree_node_lock_write(trans, path, &ck->c);
if (unlikely(ret)) {
ck = bkey_cached_reuse(bc);
if (unlikely(!ck)) {
bch_err(c, "error allocating memory for key cache item, btree %s",
- bch2_btree_ids[path->btree_id]);
+ bch2_btree_id_str(path->btree_id));
return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create);
}
- mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
}
ck->c.level = 0;
new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
if (!new_k) {
bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
- bch2_btree_ids[ck->key.btree_id], new_u64s);
+ bch2_btree_id_str(ck->key.btree_id), new_u64s);
ret = -BCH_ERR_ENOMEM_btree_key_cache_fill;
goto err;
}
if (!ck)
goto retry;
- mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
path->locks_want = 1;
} else {
enum six_lock_type lock_want = __btree_lock_want(path, 0);
goto retry;
}
- mark_btree_node_locked(trans, path, 0, lock_want);
+ mark_btree_node_locked(trans, path, 0,
+ (enum btree_node_locked_type) lock_want);
}
path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
* path->uptodate yet:
*/
if (!path->locks_want &&
- !__bch2_btree_path_upgrade(trans, path, 1)) {
+ !__bch2_btree_path_upgrade(trans, path, 1, NULL)) {
trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_);
ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade);
goto err;
goto retry;
}
- mark_btree_node_locked(trans, path, 0, lock_want);
+ mark_btree_node_locked(trans, path, 0,
+ (enum btree_node_locked_type) lock_want);
}
path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
BTREE_TRIGGER_NORUN) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL|
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc|
(ck->journal.seq == journal_last_seq(j)
? BCH_WATERMARK_reclaim
: 0)|
goto out;
bch2_journal_pin_drop(j, &ck->journal);
- bch2_journal_preres_put(j, &ck->res);
BUG_ON(!btree_node_locked(c_iter.path, 0));
struct bkey_cached *ck =
container_of(pin, struct bkey_cached, journal);
struct bkey_cached_key key;
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
int ret = 0;
- bch2_trans_init(&trans, c, 0, 0);
-
- btree_node_lock_nopath_nofail(&trans, &ck->c, SIX_LOCK_read);
+ btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read);
key = ck->key;
if (ck->journal.seq != seq ||
}
six_unlock_read(&ck->c.lock);
- ret = commit_do(&trans, NULL, NULL, 0,
- btree_key_cache_flush_pos(&trans, key, seq,
- BTREE_INSERT_JOURNAL_RECLAIM, false));
+ ret = commit_do(trans, NULL, NULL, 0,
+ btree_key_cache_flush_pos(trans, key, seq,
+ BCH_TRANS_COMMIT_journal_reclaim, false));
unlock:
srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
return ret;
}
BUG_ON(insert->k.u64s > ck->u64s);
- if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
- int difference;
-
- BUG_ON(jset_u64s(insert->k.u64s) > trans->journal_preres.u64s);
-
- difference = jset_u64s(insert->k.u64s) - ck->res.u64s;
- if (difference > 0) {
- trans->journal_preres.u64s -= difference;
- ck->res.u64s += difference;
- }
- }
-
bkey_copy(ck->k, insert);
ck->valid = true;
* Newest freed entries are at the end of the list - once we hit one
* that's too new to be freed, we can bail out:
*/
+ scanned += bc->nr_freed_nonpcpu;
+
list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
ck->btree_trans_barrier_seq))
six_lock_exit(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
atomic_long_dec(&bc->nr_freed);
- scanned++;
freed++;
+ bc->nr_freed_nonpcpu--;
}
if (scanned >= nr)
goto out;
+ scanned += bc->nr_freed_pcpu;
+
list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
ck->btree_trans_barrier_seq))
six_lock_exit(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
atomic_long_dec(&bc->nr_freed);
- scanned++;
freed++;
+ bc->nr_freed_pcpu--;
}
if (scanned >= nr)
}
#endif
+ BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu);
+ BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu);
+
list_splice(&bc->freed_pcpu, &items);
list_splice(&bc->freed_nonpcpu, &items);
cond_resched();
bch2_journal_pin_drop(&c->journal, &ck->journal);
- bch2_journal_preres_put(&c->journal, &ck->res);
list_del(&ck->list);
kfree(ck->k);
bc->shrink.count_objects = bch2_btree_key_cache_count;
bc->shrink.scan_objects = bch2_btree_key_cache_scan;
bc->shrink.to_text = bch2_btree_key_cache_shrinker_to_text;
- if (register_shrinker(&bc->shrink, "%s/btree_key_cache", c->name))
+ if (register_shrinker(&bc->shrink, "%s-btree_key_cache", c->name))
return -BCH_ERR_ENOMEM_fs_btree_cache_init;
return 0;
}
void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
{
- prt_printf(out, "nr_freed:\t%zu", atomic_long_read(&c->nr_freed));
+ prt_printf(out, "nr_freed:\t%lu", atomic_long_read(&c->nr_freed));
prt_newline(out);
prt_printf(out, "nr_keys:\t%lu", atomic_long_read(&c->nr_keys));
prt_newline(out);
static inline bool btree_path_get_locks(struct btree_trans *trans,
struct btree_path *path,
- bool upgrade)
+ bool upgrade,
+ struct get_locks_fail *f)
{
unsigned l = path->level;
int fail_idx = -1;
if (!(upgrade
? bch2_btree_node_upgrade(trans, path, l)
- : bch2_btree_node_relock(trans, path, l)))
- fail_idx = l;
+ : bch2_btree_node_relock(trans, path, l))) {
+ fail_idx = l;
+
+ if (f) {
+ f->l = l;
+ f->b = path->l[l].b;
+ }
+ }
l++;
} while (l < path->locks_want);
bool bch2_btree_path_relock_norestart(struct btree_trans *trans,
struct btree_path *path, unsigned long trace_ip)
{
- return btree_path_get_locks(trans, path, false);
+ struct get_locks_fail f;
+
+ return btree_path_get_locks(trans, path, false, &f);
}
int __bch2_btree_path_relock(struct btree_trans *trans,
bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans,
struct btree_path *path,
- unsigned new_locks_want)
+ unsigned new_locks_want,
+ struct get_locks_fail *f)
{
EBUG_ON(path->locks_want >= new_locks_want);
path->locks_want = new_locks_want;
- return btree_path_get_locks(trans, path, true);
+ return btree_path_get_locks(trans, path, true, f);
}
bool __bch2_btree_path_upgrade(struct btree_trans *trans,
struct btree_path *path,
- unsigned new_locks_want)
+ unsigned new_locks_want,
+ struct get_locks_fail *f)
{
struct btree_path *linked;
- if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want))
+ if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f))
return true;
/*
linked->btree_id == path->btree_id &&
linked->locks_want < new_locks_want) {
linked->locks_want = new_locks_want;
- btree_path_get_locks(trans, linked, true);
+ btree_path_get_locks(trans, linked, true, NULL);
}
return false;
struct btree_path *path,
unsigned new_locks_want)
{
- unsigned l;
+ unsigned l, old_locks_want = path->locks_want;
+
+ if (trans->restarted)
+ return;
EBUG_ON(path->locks_want < new_locks_want);
}
bch2_btree_path_verify_locks(path);
+
+ path->downgrade_seq++;
+ trace_path_downgrade(trans, _RET_IP_, path, old_locks_want);
}
/* Btree transaction locking: */
{
struct btree_path *path;
+ if (trans->restarted)
+ return;
+
trans_for_each_path(trans, path)
bch2_btree_path_downgrade(trans, path);
}
bch2_assert_btree_nodes_not_locked();
}
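+
+/*
+ * Like bch2_trans_unlock(), but for callers about to block for a long time:
+ * this also drops the SRCU read lock, so that key cache reclaim (which waits
+ * on an SRCU barrier) isn't held up while we sleep.
+ */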
+void bch2_trans_unlock_long(struct btree_trans *trans)
+{
+ bch2_trans_unlock(trans);
+ bch2_trans_srcu_unlock(trans);
+}
+
bool bch2_trans_locked(struct btree_trans *trans)
{
struct btree_path *path;
* updating the iterator state
*/
-#include <linux/six.h>
-
#include "btree_iter.h"
+#include "six.h"
void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags);
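/*
 * Note: the BTREE_NODE_*_LOCKED values are defined to match the corresponding
 * six_lock_type values, so callers that still have a six_lock_type in hand
 * (as in the key cache paths) can cast it to btree_node_locked_type safely.
 */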
static inline void mark_btree_node_locked(struct btree_trans *trans,
struct btree_path *path,
unsigned level,
- enum six_lock_type type)
+ enum btree_node_locked_type type)
{
mark_btree_node_locked_noreset(path, level, (enum btree_node_locked_type) type);
#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
/* upgrade */
+
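+/*
+ * Filled in by btree_path_get_locks() on upgrade failure with the level and
+ * node that couldn't be locked, so e.g. the trans_restart_upgrade tracepoint
+ * can report where the upgrade failed:
+ */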
+struct get_locks_fail {
+ unsigned l;
+ struct btree *b;
+};
+
bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *,
- struct btree_path *, unsigned);
+ struct btree_path *, unsigned,
+ struct get_locks_fail *);
+
bool __bch2_btree_path_upgrade(struct btree_trans *,
- struct btree_path *, unsigned);
+ struct btree_path *, unsigned,
+ struct get_locks_fail *);
static inline int bch2_btree_path_upgrade(struct btree_trans *trans,
struct btree_path *path,
unsigned new_locks_want)
{
+ struct get_locks_fail f;
unsigned old_locks_want = path->locks_want;
new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
if (path->locks_want < new_locks_want
- ? __bch2_btree_path_upgrade(trans, path, new_locks_want)
+ ? __bch2_btree_path_upgrade(trans, path, new_locks_want, &f)
: path->uptodate == BTREE_ITER_UPTODATE)
return 0;
trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path,
- old_locks_want, new_locks_want);
+ old_locks_want, new_locks_want, &f);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade);
}
#include <linux/list.h>
#include <linux/rhashtable.h>
-#include <linux/six.h>
-//#include "bkey_methods.h"
+#include "btree_key_cache_types.h"
#include "buckets_types.h"
#include "darray.h"
#include "errcode.h"
#include "journal_types.h"
#include "replicas_types.h"
+#include "six.h"
struct open_bucket;
struct btree_update;
/*
* Iterate over all possible positions, synthesizing deleted keys for holes:
*/
-static const u16 BTREE_ITER_SLOTS = 1 << 0;
-static const u16 BTREE_ITER_ALL_LEVELS = 1 << 1;
+static const __maybe_unused u16 BTREE_ITER_SLOTS = 1 << 0;
/*
* Indicates that intent locks should be taken on leaf nodes, because we expect
* to be doing updates:
*/
-static const u16 BTREE_ITER_INTENT = 1 << 2;
+static const __maybe_unused u16 BTREE_ITER_INTENT = 1 << 1;
/*
* Causes the btree iterator code to prefetch additional btree nodes from disk:
*/
-static const u16 BTREE_ITER_PREFETCH = 1 << 3;
+static const __maybe_unused u16 BTREE_ITER_PREFETCH = 1 << 2;
/*
* Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
* @pos or the first key strictly greater than @pos
*/
-static const u16 BTREE_ITER_IS_EXTENTS = 1 << 4;
-static const u16 BTREE_ITER_NOT_EXTENTS = 1 << 5;
-static const u16 BTREE_ITER_CACHED = 1 << 6;
-static const u16 BTREE_ITER_WITH_KEY_CACHE = 1 << 7;
-static const u16 BTREE_ITER_WITH_UPDATES = 1 << 8;
-static const u16 BTREE_ITER_WITH_JOURNAL = 1 << 9;
-static const u16 __BTREE_ITER_ALL_SNAPSHOTS = 1 << 10;
-static const u16 BTREE_ITER_ALL_SNAPSHOTS = 1 << 11;
-static const u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 12;
-static const u16 BTREE_ITER_NOPRESERVE = 1 << 13;
-static const u16 BTREE_ITER_CACHED_NOFILL = 1 << 14;
-static const u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 15;
-#define __BTREE_ITER_FLAGS_END 16
+static const __maybe_unused u16 BTREE_ITER_IS_EXTENTS = 1 << 3;
+static const __maybe_unused u16 BTREE_ITER_NOT_EXTENTS = 1 << 4;
+static const __maybe_unused u16 BTREE_ITER_CACHED = 1 << 5;
+static const __maybe_unused u16 BTREE_ITER_WITH_KEY_CACHE = 1 << 6;
+static const __maybe_unused u16 BTREE_ITER_WITH_UPDATES = 1 << 7;
+static const __maybe_unused u16 BTREE_ITER_WITH_JOURNAL = 1 << 8;
+static const __maybe_unused u16 __BTREE_ITER_ALL_SNAPSHOTS = 1 << 9;
+static const __maybe_unused u16 BTREE_ITER_ALL_SNAPSHOTS = 1 << 10;
+static const __maybe_unused u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 11;
+static const __maybe_unused u16 BTREE_ITER_NOPRESERVE = 1 << 12;
+static const __maybe_unused u16 BTREE_ITER_CACHED_NOFILL = 1 << 13;
+static const __maybe_unused u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 14;
+#define __BTREE_ITER_FLAGS_END 15
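+/*
+ * The __maybe_unused annotations keep -Wunused-const-variable quiet in
+ * translation units that include this header without referencing every flag.
+ */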
enum btree_path_uptodate {
BTREE_ITER_UPTODATE = 0,
u8 sorted_idx;
u8 ref;
u8 intent_ref;
+ u32 alloc_seq;
+ u32 downgrade_seq;
/* btree_iter_copy starts here: */
struct bpos pos;
#endif
};
-struct btree_key_cache_freelist {
- struct bkey_cached *objs[16];
- unsigned nr;
-};
-
-struct btree_key_cache {
- struct mutex lock;
- struct rhashtable table;
- bool table_init_done;
- struct list_head freed_pcpu;
- struct list_head freed_nonpcpu;
- struct shrinker shrink;
- unsigned shrink_iter;
- struct btree_key_cache_freelist __percpu *pcpu_freed;
-
- atomic_long_t nr_freed;
- atomic_long_t nr_keys;
- atomic_long_t nr_dirty;
-};
-
-struct bkey_cached_key {
- u32 btree_id;
- struct bpos pos;
-} __packed __aligned(4);
-
#define BKEY_CACHED_ACCESSED 0
#define BKEY_CACHED_DIRTY 1
struct rhash_head hash;
struct list_head list;
- struct journal_preres res;
struct journal_entry_pin journal;
u64 seq;
u8 old_btree_u64s;
struct bkey_i *k;
struct btree_path *path;
- u64 seq;
/* key being overwritten: */
struct bkey old_k;
const struct bch_val *old_v;
u8 nr_updates;
u8 nr_wb_updates;
u8 wb_updates_size;
+ bool srcu_held:1;
bool used_mempool:1;
bool in_traverse_all:1;
bool paths_sorted:1;
bool journal_replay_not_finished:1;
bool is_initial_gc:1;
bool notrace_relock_fail:1;
+ bool write_locked:1;
enum bch_errcode restarted:16;
u32 restart_count;
unsigned long last_begin_ip;
void *mem;
u8 sorted[BTREE_ITER_MAX + 8];
- struct btree_path *paths;
- struct btree_insert_entry *updates;
+ struct btree_path paths[BTREE_ITER_MAX];
+ struct btree_insert_entry updates[BTREE_ITER_MAX];
struct btree_write_buffered_key *wb_updates;
/* update path: */
struct journal_entry_pin *journal_pin;
struct journal_res journal_res;
- struct journal_preres journal_preres;
u64 *journal_seq;
struct disk_reservation *disk_res;
unsigned journal_u64s;
- unsigned journal_preres_u64s;
struct replicas_delta_list *fs_usage_deltas;
};
}
enum btree_node_type {
-#define x(kwd, val, ...) BKEY_TYPE_##kwd = val,
+ BKEY_TYPE_btree,
+#define x(kwd, val, ...) BKEY_TYPE_##kwd = val + 1,
BCH_BTREE_IDS()
#undef x
- BKEY_TYPE_btree,
+ BKEY_TYPE_NR
};
/* Type of a key in btree @id at level @level: */
static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id)
{
- return level ? BKEY_TYPE_btree : (enum btree_node_type) id;
+ return level ? BKEY_TYPE_btree : (unsigned) id + 1;
}
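/*
 * Worked example (sketch): BKEY_TYPE_btree is now 0 and leaf key types are
 * shifted up by one, so for BTREE_ID_extents (btree id 0):
 *
 *	__btree_node_type(1, BTREE_ID_extents) == BKEY_TYPE_btree   (0)
 *	__btree_node_type(0, BTREE_ID_extents) == BKEY_TYPE_extents (1)
 */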
/* Type of keys @b contains: */
return __btree_node_type(b->c.level, b->c.btree_id);
}
+const char *bch2_btree_node_type_str(enum btree_node_type);
+
#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \
- (BIT(BKEY_TYPE_extents)| \
- BIT(BKEY_TYPE_alloc)| \
- BIT(BKEY_TYPE_inodes)| \
- BIT(BKEY_TYPE_stripes)| \
- BIT(BKEY_TYPE_reflink)| \
- BIT(BKEY_TYPE_btree))
+ (BIT_ULL(BKEY_TYPE_extents)| \
+ BIT_ULL(BKEY_TYPE_alloc)| \
+ BIT_ULL(BKEY_TYPE_inodes)| \
+ BIT_ULL(BKEY_TYPE_stripes)| \
+ BIT_ULL(BKEY_TYPE_reflink)| \
+ BIT_ULL(BKEY_TYPE_btree))
#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \
- (BIT(BKEY_TYPE_alloc)| \
- BIT(BKEY_TYPE_inodes)| \
- BIT(BKEY_TYPE_stripes)| \
- BIT(BKEY_TYPE_snapshots))
+ (BIT_ULL(BKEY_TYPE_alloc)| \
+ BIT_ULL(BKEY_TYPE_inodes)| \
+ BIT_ULL(BKEY_TYPE_stripes)| \
+ BIT_ULL(BKEY_TYPE_snapshots))
#define BTREE_NODE_TYPE_HAS_TRIGGERS \
(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \
static inline bool btree_node_type_needs_gc(enum btree_node_type type)
{
- return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type);
+ return BTREE_NODE_TYPE_HAS_TRIGGERS & BIT_ULL(type);
}
static inline bool btree_node_type_is_extents(enum btree_node_type type)
{
const unsigned mask = 0
-#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << nr)
+#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << (nr + 1))
BCH_BTREE_IDS()
#undef x
;
static inline bool btree_id_is_extents(enum btree_id btree)
{
- return btree_node_type_is_extents((enum btree_node_type) btree);
+ return btree_node_type_is_extents(__btree_node_type(0, btree));
}
static inline bool btree_type_has_snapshots(enum btree_id id)
return (1U << id) & mask;
}
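+/*
+ * Unlike btree_type_has_snapshots(), this also covers btrees whose keys carry
+ * a snapshot field (BTREE_ID_SNAPSHOT_FIELD) without being snapshotted
+ * themselves:
+ */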
+static inline bool btree_type_has_snapshot_field(enum btree_id id)
+{
+ const unsigned mask = 0
+#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << nr)
+ BCH_BTREE_IDS()
+#undef x
+ ;
+
+ return (1U << id) & mask;
+}
+
static inline bool btree_type_has_ptrs(enum btree_id id)
{
const unsigned mask = 0
#include "btree_iter.h"
#include "journal.h"
-#include "journal.h"
struct bch_fs;
struct btree;
void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *,
struct bkey_i *, u64);
-enum btree_insert_flags {
+#define BCH_TRANS_COMMIT_FLAGS() \
+ x(no_enospc, "don't check for enospc") \
+ x(no_check_rw, "don't attempt to take a ref on c->writes") \
+ x(lazy_rw, "go read-write if we haven't yet - only for use in recovery") \
+ x(no_journal_res, "don't take a journal reservation, instead " \
+ "pin journal entry referred to by trans->journal_res.seq") \
+ x(journal_reclaim, "operation required for journal reclaim; may return error " \
+ "instead of deadlocking if BCH_WATERMARK_reclaim not specified")\
+
+enum __bch_trans_commit_flags {
/* First bits for bch_watermark: */
- __BTREE_INSERT_NOFAIL = BCH_WATERMARK_BITS,
- __BTREE_INSERT_NOCHECK_RW,
- __BTREE_INSERT_LAZY_RW,
- __BTREE_INSERT_JOURNAL_REPLAY,
- __BTREE_INSERT_JOURNAL_RECLAIM,
- __BTREE_INSERT_NOWAIT,
- __BTREE_INSERT_GC_LOCK_HELD,
- __BCH_HASH_SET_MUST_CREATE,
- __BCH_HASH_SET_MUST_REPLACE,
+ __BCH_TRANS_COMMIT_FLAGS_START = BCH_WATERMARK_BITS,
+#define x(n, ...) __BCH_TRANS_COMMIT_##n,
+ BCH_TRANS_COMMIT_FLAGS()
+#undef x
};
-/* Don't check for -ENOSPC: */
-#define BTREE_INSERT_NOFAIL BIT(__BTREE_INSERT_NOFAIL)
-
-#define BTREE_INSERT_NOCHECK_RW BIT(__BTREE_INSERT_NOCHECK_RW)
-#define BTREE_INSERT_LAZY_RW BIT(__BTREE_INSERT_LAZY_RW)
-
-/* Insert is for journal replay - don't get journal reservations: */
-#define BTREE_INSERT_JOURNAL_REPLAY BIT(__BTREE_INSERT_JOURNAL_REPLAY)
-
-/* Insert is being called from journal reclaim path: */
-#define BTREE_INSERT_JOURNAL_RECLAIM BIT(__BTREE_INSERT_JOURNAL_RECLAIM)
-
-/* Don't block on allocation failure (for new btree nodes: */
-#define BTREE_INSERT_NOWAIT BIT(__BTREE_INSERT_NOWAIT)
-#define BTREE_INSERT_GC_LOCK_HELD BIT(__BTREE_INSERT_GC_LOCK_HELD)
-
-#define BCH_HASH_SET_MUST_CREATE BIT(__BCH_HASH_SET_MUST_CREATE)
-#define BCH_HASH_SET_MUST_REPLACE BIT(__BCH_HASH_SET_MUST_REPLACE)
+enum bch_trans_commit_flags {
+#define x(n, ...) BCH_TRANS_COMMIT_##n = BIT(__BCH_TRANS_COMMIT_##n),
+ BCH_TRANS_COMMIT_FLAGS()
+#undef x
+};
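+/*
+ * Sketch of what the two x-macro expansions produce, for the first entry:
+ *
+ *	__BCH_TRANS_COMMIT_no_enospc,	(bit index, above the watermark bits)
+ *	BCH_TRANS_COMMIT_no_enospc = BIT(__BCH_TRANS_COMMIT_no_enospc),
+ */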
int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *,
unsigned, unsigned);
int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned);
int bch2_btree_delete_at_buffered(struct btree_trans *, enum btree_id, struct bpos);
+int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned);
int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id,
struct bkey_i *, enum btree_update_flags);
-int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *,
+int bch2_btree_insert_trans(struct btree_trans *, enum btree_id, struct bkey_i *,
enum btree_update_flags);
int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
- struct disk_reservation *, u64 *, int flags);
+ struct disk_reservation *, int flags);
int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id,
struct bpos, struct bpos, unsigned, u64 *);
struct btree_trans_commit_hook *);
int __bch2_trans_commit(struct btree_trans *, unsigned);
-int bch2_fs_log_msg(struct bch_fs *, const char *, ...);
-int bch2_journal_log_msg(struct bch_fs *, const char *, ...);
+__printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...);
+__printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...);
/**
* bch2_trans_commit - insert keys at given iterator positions
nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
(_journal_seq), (_flags)))
-#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \
-({ \
- struct btree_trans trans; \
- int _ret; \
- \
- bch2_trans_init(&trans, (_c), 0, 0); \
- _ret = commit_do(&trans, _disk_res, _journal_seq, _flags, _do); \
- bch2_trans_exit(&trans); \
- \
- _ret; \
-})
-
#define bch2_trans_run(_c, _do) \
({ \
- struct btree_trans trans; \
- int _ret; \
- \
- bch2_trans_init(&trans, (_c), 0, 0); \
- _ret = (_do); \
- bch2_trans_exit(&trans); \
- \
+ struct btree_trans *trans = bch2_trans_get(_c); \
+ int _ret = (_do); \
+ bch2_trans_put(trans); \
_ret; \
})
+#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \
+ bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do))
+
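+/*
+ * Example (sketch; do_stuff() is a stand-in): bch2_trans_run() names its
+ * transaction "trans", so the expression passed in refers to it directly:
+ *
+ *	ret = bch2_trans_run(c, do_stuff(trans));
+ */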
#define trans_for_each_update(_trans, _i) \
for ((_i) = (_trans)->updates; \
(_i) < (_trans)->updates + (_trans)->nr_updates; \
{
struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter,
btree_id, pos, flags|BTREE_ITER_INTENT, type);
- struct bkey_i *ret = unlikely(IS_ERR(k.k))
+ struct bkey_i *ret = IS_ERR(k.k)
? ERR_CAST(k.k)
: __bch2_bkey_make_mut_noupdate(trans, k, 0, min_bytes);
- if (unlikely(IS_ERR(ret)))
+ if (IS_ERR(ret))
bch2_trans_iter_exit(trans, iter);
return ret;
}
#include "bkey_methods.h"
#include "btree_cache.h"
#include "btree_gc.h"
+#include "btree_journal_iter.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
-#include "recovery.h"
#include "replicas.h"
#include "super-io.h"
#include "trace.h"
}
/**
- * btree_node_format_fits - check if we could rewrite node with a new format
+ * bch2_btree_node_format_fits - check if we could rewrite node with a new format
*
- * This assumes all keys can pack with the new format -- it just checks if
- * the re-packed keys would fit inside the node itself.
+ * @c: filesystem handle
+ * @b: btree node to rewrite
+ * @new_f: bkey format to translate keys to
+ *
+ * Returns: true if all re-packed keys will be able to fit in a new node.
+ *
+ * Assumes all keys will successfully pack with the new format.
*/
bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
struct bkey_format *new_f)
struct write_point *wp;
struct btree *b;
BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
- struct open_buckets ob = { .nr = 0 };
+ struct open_buckets obs = { .nr = 0 };
struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
unsigned nr_reserve = watermark > BCH_WATERMARK_reclaim
struct btree_alloc *a =
&c->btree_reserve_cache[--c->btree_reserve_cache_nr];
- ob = a->ob;
+ obs = a->ob;
bkey_copy(&tmp.k, &a->k);
mutex_unlock(&c->btree_reserve_cache_lock);
goto mem_alloc;
bkey_btree_ptr_v2_init(&tmp.k);
bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false);
- bch2_open_bucket_get(c, wp, &ob);
+ bch2_open_bucket_get(c, wp, &obs);
bch2_alloc_sectors_done(c, wp);
mem_alloc:
b = bch2_btree_node_mem_alloc(trans, interior_node);
BUG_ON(b->ob.nr);
bkey_copy(&b->key, &tmp.k);
- b->ob = ob;
+ b->ob = obs;
return b;
}
/*
* Protects reaping from the btree node cache and using the btree node
* open bucket reserve:
- *
- * BTREE_INSERT_NOWAIT only applies to btree node allocation, not
- * blocking on this lock:
*/
ret = bch2_btree_cache_cannibalize_lock(c, cl);
if (ret)
struct prealloc_nodes *p = as->prealloc_nodes + interior;
while (p->nr < nr_nodes[interior]) {
- b = __bch2_btree_node_alloc(trans, &as->disk_res,
- flags & BTREE_INSERT_NOWAIT ? NULL : cl,
- interior, flags);
+ b = __bch2_btree_node_alloc(trans, &as->disk_res, cl,
+ interior, flags);
if (IS_ERR(b)) {
ret = PTR_ERR(b);
goto err;
up_read(&c->gc_lock);
as->took_gc_lock = false;
- bch2_journal_preres_put(&c->journal, &as->journal_preres);
-
bch2_journal_pin_drop(&c->journal, &as->journal);
bch2_journal_pin_flush(&c->journal, &as->journal);
bch2_disk_reservation_put(c, &as->disk_res);
{
struct bch_fs *c = as->c;
struct btree *b;
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
u64 journal_seq = 0;
unsigned i;
int ret;
- bch2_trans_init(&trans, c, 0, 512);
/*
* If we're already in an error state, it might be because a btree node
* was never written, and we might be trying to free that same btree
b = as->old_nodes[i];
- btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
seq = b->data ? b->data->keys.seq : 0;
six_unlock_read(&b->c.lock);
* journal reclaim does btree updates when flushing bkey_cached entries,
* which may require allocations as well.
*/
- ret = commit_do(&trans, &as->disk_res, &journal_seq,
+ ret = commit_do(trans, &as->disk_res, &journal_seq,
BCH_WATERMARK_reclaim|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_JOURNAL_RECLAIM,
- btree_update_nodes_written_trans(&trans, as));
- bch2_trans_unlock(&trans);
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_journal_reclaim,
+ btree_update_nodes_written_trans(trans, as));
+ bch2_trans_unlock(trans);
bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
"%s(): error %s", __func__, bch2_err_str(ret));
struct btree_path *path;
b = as->b;
- path = get_unlocked_mut_path(&trans, as->btree_id, b->c.level, b->key.k.p);
+ path = get_unlocked_mut_path(trans, as->btree_id, b->c.level, b->key.k.p);
/*
* @b is the node we did the final insert into:
*
* we may rarely end up with a locked path besides the one we
* have here:
*/
- bch2_trans_unlock(&trans);
- btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_intent);
- mark_btree_node_locked(&trans, path, b->c.level, SIX_LOCK_intent);
+ bch2_trans_unlock(trans);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path, b->c.level, BTREE_NODE_INTENT_LOCKED);
path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
path->l[b->c.level].b = b;
- bch2_btree_node_lock_write_nofail(&trans, path, &b->c);
+ bch2_btree_node_lock_write_nofail(trans, path, &b->c);
mutex_lock(&c->btree_interior_update_lock);
* btree_interior_update_lock:
*/
if (as->b == b) {
- struct bset *i = btree_bset_last(b);
-
BUG_ON(!b->c.level);
BUG_ON(!btree_node_dirty(b));
if (!ret) {
- i->journal_seq = cpu_to_le64(
+ struct bset *last = btree_bset_last(b);
+
+ last->journal_seq = cpu_to_le64(
max(journal_seq,
- le64_to_cpu(i->journal_seq)));
+ le64_to_cpu(last->journal_seq)));
bch2_btree_add_journal_pin(c, b, journal_seq);
} else {
six_unlock_write(&b->c.lock);
btree_node_write_if_need(c, b, SIX_LOCK_intent);
- btree_node_unlock(&trans, path, b->c.level);
- bch2_path_put(&trans, path, true);
+ btree_node_unlock(trans, path, b->c.level);
+ bch2_path_put(trans, path, true);
}
bch2_journal_pin_drop(&c->journal, &as->journal);
- bch2_journal_preres_put(&c->journal, &as->journal_preres);
-
mutex_lock(&c->btree_interior_update_lock);
for (i = 0; i < as->nr_new_nodes; i++) {
b = as->new_nodes[i];
for (i = 0; i < as->nr_new_nodes; i++) {
b = as->new_nodes[i];
- btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
btree_node_write_if_need(c, b, SIX_LOCK_read);
six_unlock_read(&b->c.lock);
}
for (i = 0; i < as->nr_open_buckets; i++)
bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]);
- bch2_btree_update_free(as, &trans);
- bch2_trans_exit(&trans);
+ bch2_btree_update_free(as, trans);
+ bch2_trans_put(trans);
}
static void btree_interior_update_work(struct work_struct *work)
mutex_unlock(&c->btree_interior_update_lock);
}
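+
+/*
+ * bch2_journal_pin_copy() now takes an explicit flush function instead of
+ * NULL; the pins copied here are released by the interior update machinery
+ * itself, so the flush below is a no-op stub.
+ */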
+static int bch2_update_reparent_journal_pin_flush(struct journal *j,
+ struct journal_entry_pin *_pin, u64 seq)
+{
+ return 0;
+}
+
static void btree_update_reparent(struct btree_update *as,
struct btree_update *child)
{
child->b = NULL;
child->mode = BTREE_INTERIOR_UPDATING_AS;
- bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL);
+ bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal,
+ bch2_update_reparent_journal_pin_flush);
}
static void btree_update_updated_root(struct btree_update *as, struct btree *b)
b->ob.v[--b->ob.nr];
}
+static int bch2_btree_update_will_free_node_journal_pin_flush(struct journal *j,
+ struct journal_entry_pin *_pin, u64 seq)
+{
+ return 0;
+}
+
/*
* @b is being split/rewritten: it may have pointers to not-yet-written btree
* nodes and thus outstanding btree_updates - redirect @b's
* when the new nodes are persistent and reachable on disk:
*/
w = btree_current_write(b);
- bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
+ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
+ bch2_btree_update_will_free_node_journal_pin_flush);
bch2_journal_pin_drop(&c->journal, &w->journal);
w = btree_prev_write(b);
- bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
+ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
+ bch2_btree_update_will_free_node_journal_pin_flush);
bch2_journal_pin_drop(&c->journal, &w->journal);
mutex_unlock(&c->btree_interior_update_lock);
struct bch_fs *c = trans->c;
struct btree_update *as;
u64 start_time = local_clock();
- int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
+ int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc)
? BCH_DISK_RESERVATION_NOFAIL : 0;
unsigned nr_nodes[2] = { 0, 0 };
unsigned update_level = level;
enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
- unsigned journal_flags = 0;
int ret = 0;
u32 restart_count = trans->restart_count;
flags &= ~BCH_WATERMARK_MASK;
flags |= watermark;
- if (flags & BTREE_INSERT_JOURNAL_RECLAIM)
- journal_flags |= JOURNAL_RES_GET_NONBLOCK;
- journal_flags |= watermark;
-
while (1) {
nr_nodes[!!update_level] += 1 + split;
update_level++;
split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
}
- if (flags & BTREE_INSERT_GC_LOCK_HELD)
- lockdep_assert_held(&c->gc_lock);
- else if (!down_read_trylock(&c->gc_lock)) {
+ if (!down_read_trylock(&c->gc_lock)) {
ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0));
if (ret) {
up_read(&c->gc_lock);
as->c = c;
as->start_time = start_time;
as->mode = BTREE_INTERIOR_NO_UPDATE;
- as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD);
+ as->took_gc_lock = true;
as->btree_id = path->btree_id;
as->update_level = update_level;
INIT_LIST_HEAD(&as->list);
if (ret)
goto err;
- ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
- BTREE_UPDATE_JOURNAL_RES,
- journal_flags|JOURNAL_RES_GET_NONBLOCK);
- if (ret) {
- if (flags & BTREE_INSERT_JOURNAL_RECLAIM) {
- ret = -BCH_ERR_journal_reclaim_would_deadlock;
- goto err;
- }
-
- ret = drop_locks_do(trans,
- bch2_journal_preres_get(&c->journal, &as->journal_preres,
- BTREE_UPDATE_JOURNAL_RES,
- journal_flags));
- if (ret == -BCH_ERR_journal_preres_get_blocked) {
- trace_and_count(c, trans_restart_journal_preres_get, trans, _RET_IP_, journal_flags);
- ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get);
- }
- if (ret)
- goto err;
- }
-
ret = bch2_disk_reservation_get(c, &as->disk_res,
(nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
c->opts.metadata_replicas,
* flag
*/
if (bch2_err_matches(ret, ENOSPC) &&
- (flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
+ (flags & BCH_TRANS_COMMIT_journal_reclaim) &&
watermark != BCH_WATERMARK_reclaim) {
ret = -BCH_ERR_journal_reclaim_would_deadlock;
goto err;
bch2_recalc_btree_reserve(c);
}
-/**
- * bch_btree_set_root - update the root in memory and on disk
- *
- * To ensure forward progress, the current task must not be holding any
- * btree node write locks. However, you must hold an intent lock on the
- * old root.
- *
- * Note: This allocates a journal entry but doesn't add any keys to
- * it. All the btree roots are part of every journal write, so there
- * is nothing new to be done. This just guarantees that there is a
- * journal write.
- */
static void bch2_btree_set_root(struct btree_update *as,
struct btree_trans *trans,
struct btree_path *path,
if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
btree_node_type(b), WRITE, &buf) ?:
- bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf)) {
+ bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf)) {
printbuf_reset(&buf);
prt_printf(&buf, "inserting invalid bkey\n ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
prt_printf(&buf, "\n ");
bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
btree_node_type(b), WRITE, &buf);
- bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf);
+ bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf);
bch2_fs_inconsistent(c, "%s", buf.buf);
dump_stack();
;
while (!bch2_keylist_empty(keys)) {
- struct bkey_i *k = bch2_keylist_front(keys);
+ insert = bch2_keylist_front(keys);
- if (bpos_gt(k->k.p, b->key.k.p))
+ if (bpos_gt(insert->k.p, b->key.k.p))
break;
- bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, k);
+ bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert);
bch2_keylist_pop_front(keys);
}
}
path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p);
six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, path1, n1->c.level, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
bch2_btree_path_level_init(trans, path1, n1);
path2 = get_unlocked_mut_path(trans, path->btree_id, n2->c.level, n2->key.k.p);
six_lock_increment(&n2->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, path2, n2->c.level, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path2, n2->c.level, BTREE_NODE_INTENT_LOCKED);
bch2_btree_path_level_init(trans, path2, n2);
/*
path2->locks_want++;
BUG_ON(btree_node_locked(path2, n3->c.level));
six_lock_increment(&n3->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, path2, n3->c.level, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path2, n3->c.level, BTREE_NODE_INTENT_LOCKED);
bch2_btree_path_level_init(trans, path2, n3);
n3->sib_u64s[0] = U16_MAX;
path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p);
six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, path1, n1->c.level, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
bch2_btree_path_level_init(trans, path1, n1);
if (parent)
}
/**
- * bch_btree_insert_node - insert bkeys into a given btree node
+ * bch2_btree_insert_node - insert bkeys into a given btree node
*
- * @iter: btree iterator
+ * @as: btree_update object
+ * @trans: btree_trans object
+ * @path: path that points to current node
+ * @b: node to insert keys into
* @keys: list of keys to insert
- * @hook: insert callback
- * @persistent: if not null, @persistent will wait on journal write
+ * @flags: transaction commit flags
+ *
+ * Returns: 0 on success, typically transaction restart error on failure
*
* Inserts as many keys as it can into a given btree node, splitting it if full.
* If a split occurred, this function will return early. This can only happen
parent = btree_node_parent(path, b);
as = bch2_btree_update_start(trans, path, level, false,
- BTREE_INSERT_NOFAIL|flags);
+ BCH_TRANS_COMMIT_no_enospc|flags);
ret = PTR_ERR_OR_ZERO(as);
if (ret)
goto err;
new_path = get_unlocked_mut_path(trans, path->btree_id, n->c.level, n->key.k.p);
six_lock_increment(&n->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, new_path, n->c.level, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
bch2_btree_path_level_init(trans, new_path, n);
bkey_init(&delete.k);
goto out;
}
-/**
- * bch_btree_node_rewrite - Rewrite/move a btree node
- */
int bch2_btree_node_rewrite(struct btree_trans *trans,
struct btree_iter *iter,
struct btree *b,
struct btree_update *as;
int ret;
- flags |= BTREE_INSERT_NOFAIL;
+ flags |= BCH_TRANS_COMMIT_no_enospc;
parent = btree_node_parent(iter->path, b);
as = bch2_btree_update_start(trans, iter->path, b->c.level,
new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p);
six_lock_increment(&n->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, new_path, n->c.level, SIX_LOCK_intent);
+ mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
bch2_btree_path_level_init(trans, new_path, n);
trace_and_count(c, btree_node_rewrite, c, b);
out:
if (new_path)
bch2_path_put(trans, new_path, true);
- bch2_btree_path_downgrade(trans, iter->path);
+ bch2_trans_downgrade(trans);
return ret;
err:
bch2_btree_node_free_never_used(as, trans, n);
int ret;
ret = bch2_trans_do(c, NULL, NULL, 0,
- async_btree_node_rewrite_trans(&trans, a));
+ async_btree_node_rewrite_trans(trans, a));
if (ret)
- bch_err(c, "%s: error %s", __func__, bch2_err_str(ret));
+ bch_err_fn(c, ret);
bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
kfree(a);
}
ret = bch2_fs_read_write_early(c);
if (ret) {
- bch_err(c, "%s: error going read-write: %s",
- __func__, bch2_err_str(ret));
+ bch_err_msg(c, ret, "going read-write");
kfree(a);
return;
}
void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
{
- bch2_trans_run(c, __bch2_btree_root_alloc(&trans, id));
+ bch2_trans_run(c, __bch2_btree_root_alloc(trans, id));
}
void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
as,
as->mode,
as->nodes_written,
- atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK,
+ closure_nr_remaining(&as->cl),
as->journal.seq);
mutex_unlock(&c->btree_interior_update_lock);
}
r->level = entry->level;
r->alive = true;
- bkey_copy(&r->key, &entry->start[0]);
+ bkey_copy(&r->key, (struct bkey_i *) entry->start);
mutex_unlock(&c->btree_root_lock);
}
struct jset_entry *
bch2_btree_roots_to_journal_entries(struct bch_fs *c,
- struct jset_entry *start,
- struct jset_entry *end)
+ struct jset_entry *end,
+ unsigned long skip)
{
- struct jset_entry *entry;
- unsigned long have = 0;
unsigned i;
- for (entry = start; entry < end; entry = vstruct_next(entry))
- if (entry->type == BCH_JSET_ENTRY_btree_root)
- __set_bit(entry->btree_id, &have);
-
mutex_lock(&c->btree_root_lock);
for (i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
- if (r->alive && !test_bit(i, &have)) {
+ if (r->alive && !test_bit(i, &skip)) {
journal_entry_set(end, BCH_JSET_ENTRY_btree_root,
i, r->level, &r->key, r->key.k.u64s);
end = vstruct_next(end);
unsigned update_level;
struct disk_reservation disk_res;
- struct journal_preres journal_preres;
/*
* BTREE_INTERIOR_UPDATING_NODE:
struct btree_node_entry *bne = max(write_block(b),
(void *) btree_bkey_last(b, bset_tree_last(b)));
ssize_t remaining_space =
- __bch_btree_u64s_remaining(c, b, &bne->keys.start[0]);
+ __bch_btree_u64s_remaining(c, b, bne->keys.start);
if (unlikely(bset_written(b, bset(b, t)))) {
if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
k.needs_whiteout = true;
b->whiteout_u64s += k.u64s;
- bkey_copy(unwritten_whiteouts_start(c, b), &k);
+ bkey_p_copy(unwritten_whiteouts_start(c, b), &k);
}
/*
void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *);
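/*
 * The last argument is now a bitmask of btree IDs to skip: callers state
 * which roots are already present instead of passing a jset_entry range to
 * be scanned.
 */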
struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *,
- struct jset_entry *, struct jset_entry *);
+ struct jset_entry *, unsigned long);
void bch2_do_pending_node_rewrites(struct bch_fs *);
void bch2_free_pending_node_rewrites(struct bch_fs *);
+++ /dev/null
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_gc.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_key_cache.h"
-#include "btree_locking.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "debug.h"
-#include "errcode.h"
-#include "error.h"
-#include "extent_update.h"
-#include "journal.h"
-#include "journal_reclaim.h"
-#include "keylist.h"
-#include "recovery.h"
-#include "subvolume.h"
-#include "replicas.h"
-#include "trace.h"
-
-#include <linux/prefetch.h>
-#include <linux/sort.h>
-
-/*
- * bch2_btree_path_peek_slot() for a cached iterator might return a key in a
- * different snapshot:
- */
-static struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u)
-{
- struct bkey_s_c k = bch2_btree_path_peek_slot(path, u);
-
- if (k.k && bpos_eq(path->pos, k.k->p))
- return k;
-
- bkey_init(u);
- u->p = path->pos;
- return (struct bkey_s_c) { u, NULL };
-}
-
-static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- struct bch_fs *c = trans->c;
- struct bkey u;
- struct bkey_s_c k = bch2_btree_path_peek_slot_exact(i->path, &u);
-
- if (unlikely(trans->journal_replay_not_finished)) {
- struct bkey_i *j_k =
- bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p);
-
- if (j_k)
- k = bkey_i_to_s_c(j_k);
- }
-
- u = *k.k;
- u.needs_whiteout = i->old_k.needs_whiteout;
-
- BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey)));
- BUG_ON(i->old_v != k.v);
-#endif
-}
-
-static int __must_check
-bch2_trans_update_by_path(struct btree_trans *, struct btree_path *,
- struct bkey_i *, enum btree_update_flags,
- unsigned long ip);
-
-static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
- const struct btree_insert_entry *r)
-{
- return cmp_int(l->btree_id, r->btree_id) ?:
- cmp_int(l->cached, r->cached) ?:
- -cmp_int(l->level, r->level) ?:
- bpos_cmp(l->k->k.p, r->k->k.p);
-}
-
-static inline struct btree_path_level *insert_l(struct btree_insert_entry *i)
-{
- return i->path->l + i->level;
-}
-
-static inline bool same_leaf_as_prev(struct btree_trans *trans,
- struct btree_insert_entry *i)
-{
- return i != trans->updates &&
- insert_l(&i[0])->b == insert_l(&i[-1])->b;
-}
-
-static inline bool same_leaf_as_next(struct btree_trans *trans,
- struct btree_insert_entry *i)
-{
- return i + 1 < trans->updates + trans->nr_updates &&
- insert_l(&i[0])->b == insert_l(&i[1])->b;
-}
-
-inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b)
-{
- struct bch_fs *c = trans->c;
-
- if (unlikely(btree_node_just_written(b)) &&
- bch2_btree_post_write_cleanup(c, b))
- bch2_trans_node_reinit_iter(trans, b);
-
- /*
- * If the last bset has been written, or if it's gotten too big - start
- * a new bset to insert into:
- */
- if (want_new_bset(c, b))
- bch2_btree_init_next(trans, b);
-}
-
-/* Inserting into a given leaf node (last stage of insert): */
-
-/* Handle overwrites and do insert, for non extents: */
-bool bch2_btree_bset_insert_key(struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b,
- struct btree_node_iter *node_iter,
- struct bkey_i *insert)
-{
- struct bkey_packed *k;
- unsigned clobber_u64s = 0, new_u64s = 0;
-
- EBUG_ON(btree_node_just_written(b));
- EBUG_ON(bset_written(b, btree_bset_last(b)));
- EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
- EBUG_ON(bpos_lt(insert->k.p, b->data->min_key));
- EBUG_ON(bpos_gt(insert->k.p, b->data->max_key));
- EBUG_ON(insert->k.u64s >
- bch_btree_keys_u64s_remaining(trans->c, b));
-
- k = bch2_btree_node_iter_peek_all(node_iter, b);
- if (k && bkey_cmp_left_packed(b, k, &insert->k.p))
- k = NULL;
-
- /* @k is the key being overwritten/deleted, if any: */
- EBUG_ON(k && bkey_deleted(k));
-
- /* Deleting, but not found? nothing to do: */
- if (bkey_deleted(&insert->k) && !k)
- return false;
-
- if (bkey_deleted(&insert->k)) {
- /* Deleting: */
- btree_account_key_drop(b, k);
- k->type = KEY_TYPE_deleted;
-
- if (k->needs_whiteout)
- push_whiteout(trans->c, b, insert->k.p);
- k->needs_whiteout = false;
-
- if (k >= btree_bset_last(b)->start) {
- clobber_u64s = k->u64s;
- bch2_bset_delete(b, k, clobber_u64s);
- goto fix_iter;
- } else {
- bch2_btree_path_fix_key_modified(trans, b, k);
- }
-
- return true;
- }
-
- if (k) {
- /* Overwriting: */
- btree_account_key_drop(b, k);
- k->type = KEY_TYPE_deleted;
-
- insert->k.needs_whiteout = k->needs_whiteout;
- k->needs_whiteout = false;
-
- if (k >= btree_bset_last(b)->start) {
- clobber_u64s = k->u64s;
- goto overwrite;
- } else {
- bch2_btree_path_fix_key_modified(trans, b, k);
- }
- }
-
- k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b));
-overwrite:
- bch2_bset_insert(b, node_iter, k, insert, clobber_u64s);
- new_u64s = k->u64s;
-fix_iter:
- if (clobber_u64s != new_u64s)
- bch2_btree_node_iter_fix(trans, path, b, node_iter, k,
- clobber_u64s, new_u64s);
- return true;
-}
-
-static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
- unsigned i, u64 seq)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct btree_write *w = container_of(pin, struct btree_write, journal);
- struct btree *b = container_of(w, struct btree, writes[i]);
- struct btree_trans trans;
- unsigned long old, new, v;
- unsigned idx = w - b->writes;
-
- bch2_trans_init(&trans, c, 0, 0);
-
- btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
- v = READ_ONCE(b->flags);
-
- do {
- old = new = v;
-
- if (!(old & (1 << BTREE_NODE_dirty)) ||
- !!(old & (1 << BTREE_NODE_write_idx)) != idx ||
- w->journal.seq != seq)
- break;
-
- new &= ~BTREE_WRITE_TYPE_MASK;
- new |= BTREE_WRITE_journal_reclaim;
- new |= 1 << BTREE_NODE_need_write;
- } while ((v = cmpxchg(&b->flags, old, new)) != old);
-
- btree_node_write_if_need(c, b, SIX_LOCK_read);
- six_unlock_read(&b->c.lock);
-
- bch2_trans_exit(&trans);
- return 0;
-}
-
-int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
-{
- return __btree_node_flush(j, pin, 0, seq);
-}
-
-int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
-{
- return __btree_node_flush(j, pin, 1, seq);
-}
-
-inline void bch2_btree_add_journal_pin(struct bch_fs *c,
- struct btree *b, u64 seq)
-{
- struct btree_write *w = btree_current_write(b);
-
- bch2_journal_pin_add(&c->journal, seq, &w->journal,
- btree_node_write_idx(b) == 0
- ? bch2_btree_node_flush0
- : bch2_btree_node_flush1);
-}
-
-/**
- * btree_insert_key - insert a key one key into a leaf node
- */
-inline void bch2_btree_insert_key_leaf(struct btree_trans *trans,
- struct btree_path *path,
- struct bkey_i *insert,
- u64 journal_seq)
-{
- struct bch_fs *c = trans->c;
- struct btree *b = path_l(path)->b;
- struct bset_tree *t = bset_tree_last(b);
- struct bset *i = bset(b, t);
- int old_u64s = bset_u64s(t);
- int old_live_u64s = b->nr.live_u64s;
- int live_u64s_added, u64s_added;
-
- if (unlikely(!bch2_btree_bset_insert_key(trans, path, b,
- &path_l(path)->iter, insert)))
- return;
-
- i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq)));
-
- bch2_btree_add_journal_pin(c, b, journal_seq);
-
- if (unlikely(!btree_node_dirty(b))) {
- EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
- set_btree_node_dirty_acct(c, b);
- }
-
- live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
- u64s_added = (int) bset_u64s(t) - old_u64s;
-
- if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
- b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
- if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
- b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
-
- if (u64s_added > live_u64s_added &&
- bch2_maybe_compact_whiteouts(c, b))
- bch2_trans_node_reinit_iter(trans, b);
-}
-
-/* Cached btree updates: */
-
-/* Normal update interface: */
-
-static inline void btree_insert_entry_checks(struct btree_trans *trans,
- struct btree_insert_entry *i)
-{
- BUG_ON(!bpos_eq(i->k->k.p, i->path->pos));
- BUG_ON(i->cached != i->path->cached);
- BUG_ON(i->level != i->path->level);
- BUG_ON(i->btree_id != i->path->btree_id);
- EBUG_ON(!i->level &&
- !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) &&
- test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) &&
- i->k->k.p.snapshot &&
- bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot));
-}
-
-static noinline int
-bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned flags,
- unsigned long trace_ip)
-{
- return drop_locks_do(trans,
- bch2_journal_preres_get(&trans->c->journal,
- &trans->journal_preres,
- trans->journal_preres_u64s,
- (flags & BCH_WATERMARK_MASK)));
-}
-
-static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
- unsigned flags)
-{
- return bch2_journal_res_get(&trans->c->journal, &trans->journal_res,
- trans->journal_u64s, flags);
-}
-
-#define JSET_ENTRY_LOG_U64s 4
-
-static noinline void journal_transaction_name(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
- struct journal *j = &c->journal;
- struct jset_entry *entry =
- bch2_journal_add_entry(j, &trans->journal_res,
- BCH_JSET_ENTRY_log, 0, 0,
- JSET_ENTRY_LOG_U64s);
- struct jset_entry_log *l =
- container_of(entry, struct jset_entry_log, entry);
-
- strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64));
-}
-
-static inline int btree_key_can_insert(struct btree_trans *trans,
- struct btree *b, unsigned u64s)
-{
- struct bch_fs *c = trans->c;
-
- if (!bch2_btree_node_insert_fits(c, b, u64s))
- return -BCH_ERR_btree_insert_btree_node_full;
-
- return 0;
-}
-
-static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags,
- struct btree_path *path, unsigned u64s)
-{
- struct bch_fs *c = trans->c;
- struct bkey_cached *ck = (void *) path->l[0].b;
- struct btree_insert_entry *i;
- unsigned new_u64s;
- struct bkey_i *new_k;
-
- EBUG_ON(path->level);
-
- if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
- bch2_btree_key_cache_must_wait(c) &&
- !(flags & BTREE_INSERT_JOURNAL_RECLAIM))
- return -BCH_ERR_btree_insert_need_journal_reclaim;
-
- /*
- * bch2_varint_decode can read past the end of the buffer by at most 7
- * bytes (it won't be used):
- */
- u64s += 1;
-
- if (u64s <= ck->u64s)
- return 0;
-
- new_u64s = roundup_pow_of_two(u64s);
- new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS);
- if (!new_k) {
- bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
- bch2_btree_ids[path->btree_id], new_u64s);
- return -BCH_ERR_ENOMEM_btree_key_cache_insert;
- }
-
- trans_for_each_update(trans, i)
- if (i->old_v == &ck->k->v)
- i->old_v = &new_k->v;
-
- ck->u64s = new_u64s;
- ck->k = new_k;
- return 0;
-}
-
-/* Triggers: */
-
-static int run_one_mem_trigger(struct btree_trans *trans,
- struct btree_insert_entry *i,
- unsigned flags)
-{
- struct bkey_s_c old = { &i->old_k, i->old_v };
- struct bkey_i *new = i->k;
- const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
- const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
- int ret;
-
- verify_update_old_key(trans, i);
-
- if (unlikely(flags & BTREE_TRIGGER_NORUN))
- return 0;
-
- if (!btree_node_type_needs_gc((enum btree_node_type) i->btree_id))
- return 0;
-
- if (old_ops->atomic_trigger == new_ops->atomic_trigger &&
- ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
- ret = bch2_mark_key(trans, i->btree_id, i->level,
- old, bkey_i_to_s_c(new),
- BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
- } else {
- struct bkey _deleted = KEY(0, 0, 0);
- struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL };
-
- _deleted.p = i->path->pos;
-
- ret = bch2_mark_key(trans, i->btree_id, i->level,
- deleted, bkey_i_to_s_c(new),
- BTREE_TRIGGER_INSERT|flags) ?:
- bch2_mark_key(trans, i->btree_id, i->level,
- old, deleted,
- BTREE_TRIGGER_OVERWRITE|flags);
- }
-
- return ret;
-}
-
-static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i,
- bool overwrite)
-{
- /*
- * Transactional triggers create new btree_insert_entries, so we can't
- * pass them a pointer to a btree_insert_entry, that memory is going to
- * move:
- */
- struct bkey old_k = i->old_k;
- struct bkey_s_c old = { &old_k, i->old_v };
- const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
- const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
-
- verify_update_old_key(trans, i);
-
- if ((i->flags & BTREE_TRIGGER_NORUN) ||
- !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
- return 0;
-
- if (!i->insert_trigger_run &&
- !i->overwrite_trigger_run &&
- old_ops->trans_trigger == new_ops->trans_trigger &&
- ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
- i->overwrite_trigger_run = true;
- i->insert_trigger_run = true;
- return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k,
- BTREE_TRIGGER_INSERT|
- BTREE_TRIGGER_OVERWRITE|
- i->flags) ?: 1;
- } else if (overwrite && !i->overwrite_trigger_run) {
- i->overwrite_trigger_run = true;
- return bch2_trans_mark_old(trans, i->btree_id, i->level, old, i->flags) ?: 1;
- } else if (!overwrite && !i->insert_trigger_run) {
- i->insert_trigger_run = true;
- return bch2_trans_mark_new(trans, i->btree_id, i->level, i->k, i->flags) ?: 1;
- } else {
- return 0;
- }
-}
-
-static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
- struct btree_insert_entry *btree_id_start)
-{
- struct btree_insert_entry *i;
- bool trans_trigger_run;
- int ret, overwrite;
-
- for (overwrite = 1; overwrite >= 0; --overwrite) {
-
- /*
- * Running triggers will append more updates to the list of updates as
- * we're walking it:
- */
- do {
- trans_trigger_run = false;
-
- for (i = btree_id_start;
- i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
- i++) {
- if (i->btree_id != btree_id)
- continue;
-
- ret = run_one_trans_trigger(trans, i, overwrite);
- if (ret < 0)
- return ret;
- if (ret)
- trans_trigger_run = true;
- }
- } while (trans_trigger_run);
- }
-
- return 0;
-}
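
/*
 * Editorial sketch, not part of the patch: the loop above is a fixpoint
 * iteration, because running a trigger may append further updates to the
 * very array being walked. A minimal standalone version of the pattern,
 * with all names hypothetical:
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_ENTRIES	16

struct entry { int depth; bool done; };

static struct entry entries[MAX_ENTRIES];
static unsigned nr_entries;

/* Processing an entry may append a new one, like a trigger queueing updates: */
static bool process(unsigned idx)
{
	if (entries[idx].done)
		return false;
	entries[idx].done = true;

	if (entries[idx].depth < 2 && nr_entries < MAX_ENTRIES)
		entries[nr_entries++] =
			(struct entry) { .depth = entries[idx].depth + 1 };
	return true;
}

int main(void)
{
	bool progress;

	entries[nr_entries++] = (struct entry) { 0 };

	do {			/* keep passing over the (growing) array */
		progress = false;
		for (unsigned i = 0; i < nr_entries; i++)
			progress |= process(i);
	} while (progress);	/* ...until a pass runs no more triggers */

	printf("settled with %u entries\n", nr_entries);
	return 0;
}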
-
-static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
-{
- struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates;
- unsigned btree_id = 0;
- int ret = 0;
-
-	/*
- * For a given btree, this algorithm runs insert triggers before
- * overwrite triggers: this is so that when extents are being moved
- * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before
- * they are re-added.
- */
- for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
- if (btree_id == BTREE_ID_alloc)
- continue;
-
- while (btree_id_start < trans->updates + trans->nr_updates &&
- btree_id_start->btree_id < btree_id)
- btree_id_start++;
-
- ret = run_btree_triggers(trans, btree_id, btree_id_start);
- if (ret)
- return ret;
- }
-
- trans_for_each_update(trans, i) {
- if (i->btree_id > BTREE_ID_alloc)
- break;
- if (i->btree_id == BTREE_ID_alloc) {
- ret = run_btree_triggers(trans, BTREE_ID_alloc, i);
- if (ret)
- return ret;
- break;
- }
- }
-
-#ifdef CONFIG_BCACHEFS_DEBUG
- trans_for_each_update(trans, i)
- BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) &&
- (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) &&
- (!i->insert_trigger_run || !i->overwrite_trigger_run));
-#endif
- return 0;
-}
-
-static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
- struct btree_insert_entry *i;
- int ret = 0;
-
- trans_for_each_update(trans, i) {
- /*
- * XXX: synchronization of cached update triggers with gc
- * XXX: synchronization of interior node updates with gc
- */
- BUG_ON(i->cached || i->level);
-
- if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) {
- ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC);
- if (ret)
- break;
- }
- }
-
- return ret;
-}
-
-static inline int
-bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
- struct btree_insert_entry **stopped_at,
- unsigned long trace_ip)
-{
- struct bch_fs *c = trans->c;
- struct btree_insert_entry *i;
- struct btree_write_buffered_key *wb;
- struct btree_trans_commit_hook *h;
- unsigned u64s = 0;
- bool marking = false;
- int ret;
-
- if (race_fault()) {
- trace_and_count(c, trans_restart_fault_inject, trans, trace_ip);
- return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject);
- }
-
- /*
- * Check if the insert will fit in the leaf node with the write lock
- * held, otherwise another thread could write the node changing the
- * amount of space available:
- */
-
- prefetch(&trans->c->journal.flags);
-
- trans_for_each_update(trans, i) {
- /* Multiple inserts might go to same leaf: */
- if (!same_leaf_as_prev(trans, i))
- u64s = 0;
-
- u64s += i->k->k.u64s;
- ret = !i->cached
- ? btree_key_can_insert(trans, insert_l(i)->b, u64s)
- : btree_key_can_insert_cached(trans, flags, i->path, u64s);
- if (ret) {
- *stopped_at = i;
- return ret;
- }
-
- if (btree_node_type_needs_gc(i->bkey_type))
- marking = true;
- }
-
- if (trans->nr_wb_updates &&
- trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size)
- return -BCH_ERR_btree_insert_need_flush_buffer;
-
- /*
- * Don't get journal reservation until after we know insert will
- * succeed:
- */
- if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
- ret = bch2_trans_journal_res_get(trans,
- (flags & BCH_WATERMARK_MASK)|
- JOURNAL_RES_GET_NONBLOCK);
- if (ret)
- return ret;
-
- if (unlikely(trans->journal_transaction_names))
- journal_transaction_name(trans);
- } else {
- trans->journal_res.seq = c->journal.replay_journal_seq;
- }
-
- /*
- * Not allowed to fail after we've gotten our journal reservation - we
- * have to use it:
- */
-
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
- !(flags & BTREE_INSERT_JOURNAL_REPLAY)) {
- if (bch2_journal_seq_verify)
- trans_for_each_update(trans, i)
- i->k->k.version.lo = trans->journal_res.seq;
- else if (bch2_inject_invalid_keys)
- trans_for_each_update(trans, i)
- i->k->k.version = MAX_VERSION;
- }
-
- if (trans->fs_usage_deltas &&
- bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas))
- return -BCH_ERR_btree_insert_need_mark_replicas;
-
- if (trans->nr_wb_updates) {
- EBUG_ON(flags & BTREE_INSERT_JOURNAL_REPLAY);
-
- ret = bch2_btree_insert_keys_write_buffer(trans);
- if (ret)
- goto revert_fs_usage;
- }
-
- h = trans->hooks;
- while (h) {
- ret = h->fn(trans, h);
- if (ret)
- goto revert_fs_usage;
- h = h->next;
- }
-
- trans_for_each_update(trans, i)
- if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) {
- ret = run_one_mem_trigger(trans, i, i->flags);
- if (ret)
- goto fatal_err;
- }
-
- if (unlikely(c->gc_pos.phase)) {
- ret = bch2_trans_commit_run_gc_triggers(trans);
- if (ret)
- goto fatal_err;
- }
-
- if (unlikely(trans->extra_journal_entries.nr)) {
- memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res),
- trans->extra_journal_entries.data,
- trans->extra_journal_entries.nr);
-
- trans->journal_res.offset += trans->extra_journal_entries.nr;
- trans->journal_res.u64s -= trans->extra_journal_entries.nr;
- }
-
- if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
- struct journal *j = &c->journal;
- struct jset_entry *entry;
-
- trans_for_each_update(trans, i) {
- if (i->key_cache_already_flushed)
- continue;
-
- if (i->flags & BTREE_UPDATE_NOJOURNAL)
- continue;
-
- verify_update_old_key(trans, i);
-
- if (trans->journal_transaction_names) {
- entry = bch2_journal_add_entry(j, &trans->journal_res,
- BCH_JSET_ENTRY_overwrite,
- i->btree_id, i->level,
- i->old_k.u64s);
- bkey_reassemble(&entry->start[0],
- (struct bkey_s_c) { &i->old_k, i->old_v });
- }
-
- entry = bch2_journal_add_entry(j, &trans->journal_res,
- BCH_JSET_ENTRY_btree_keys,
- i->btree_id, i->level,
- i->k->k.u64s);
- bkey_copy(&entry->start[0], i->k);
- }
-
- trans_for_each_wb_update(trans, wb) {
- entry = bch2_journal_add_entry(j, &trans->journal_res,
- BCH_JSET_ENTRY_btree_keys,
- wb->btree, 0,
- wb->k.k.u64s);
- bkey_copy(&entry->start[0], &wb->k);
- }
-
- if (trans->journal_seq)
- *trans->journal_seq = trans->journal_res.seq;
- }
-
- trans_for_each_update(trans, i) {
- i->k->k.needs_whiteout = false;
-
- if (!i->cached) {
- u64 seq = trans->journal_res.seq;
-
- if (i->flags & BTREE_UPDATE_PREJOURNAL)
- seq = i->seq;
-
- bch2_btree_insert_key_leaf(trans, i->path, i->k, seq);
- } else if (!i->key_cache_already_flushed)
- bch2_btree_insert_key_cached(trans, flags, i);
- else {
- bch2_btree_key_cache_drop(trans, i->path);
- btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE);
- }
- }
-
- return 0;
-fatal_err:
- bch2_fatal_error(c);
-revert_fs_usage:
- if (trans->fs_usage_deltas)
- bch2_trans_fs_usage_revert(trans, trans->fs_usage_deltas);
- return ret;
-}
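
/*
 * Editorial sketch, not part of the patch: the function above follows a
 * strict two-phase shape - every fallible step (node fits, journal
 * reservation, fs usage) runs before anything irreversible is applied.
 * A hypothetical skeleton of that ordering:
 */
static int validate(void)		{ return 0; }	/* fallible checks */
static int reserve_resources(void)	{ return 0; }	/* e.g. journal res */
static void apply_changes(void)		{ }		/* irreversible work */

static int two_phase_commit(void)
{
	int ret;

	ret = validate();
	if (ret)
		return ret;		/* nothing taken, nothing to undo */

	ret = reserve_resources();
	if (ret)
		return ret;		/* reservation was never granted */

	apply_changes();		/* no error paths past this point */
	return 0;
}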
-
-static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i)
-{
- while (--i >= trans->updates) {
- if (same_leaf_as_prev(trans, i))
- continue;
-
- bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b);
- }
-
- trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
-}
-
-static inline int trans_lock_write(struct btree_trans *trans)
-{
- struct btree_insert_entry *i;
-
- trans_for_each_update(trans, i) {
- if (same_leaf_as_prev(trans, i))
- continue;
-
- if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c))
- return trans_lock_write_fail(trans, i);
-
- if (!i->cached)
- bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
- }
-
- return 0;
-}
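
/*
 * Editorial sketch, not part of the patch: trans_lock_write() together with
 * trans_lock_write_fail() implements all-or-nothing lock acquisition - on
 * the first trylock failure, every lock already taken is dropped in reverse
 * order. A standalone version using pthreads, all names hypothetical:
 */
#include <pthread.h>

#define NR_LOCKS	4

static pthread_mutex_t locks[NR_LOCKS] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

static int lock_all_or_none(void)
{
	int i;

	for (i = 0; i < NR_LOCKS; i++)
		if (pthread_mutex_trylock(&locks[i])) {
			while (--i >= 0)	/* unwind, newest first */
				pthread_mutex_unlock(&locks[i]);
			return -1;		/* caller restarts, as with
						 * transaction_restart errors */
		}
	return 0;
}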
-
-static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
-{
- struct btree_insert_entry *i;
- struct btree_write_buffered_key *wb;
-
- trans_for_each_update(trans, i)
- bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p);
-
- trans_for_each_wb_update(trans, wb)
- bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p);
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, unsigned flags,
- struct btree_insert_entry *i,
- struct printbuf *err)
-{
- struct bch_fs *c = trans->c;
- int rw = (flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE;
-
- printbuf_reset(err);
- prt_printf(err, "invalid bkey on insert from %s -> %ps",
- trans->fn, (void *) i->ip_allocated);
- prt_newline(err);
- printbuf_indent_add(err, 2);
-
- bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k));
- prt_newline(err);
-
- bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
- i->bkey_type, rw, err);
- bch2_print_string_as_lines(KERN_ERR, err->buf);
-
- bch2_inconsistent_error(c);
- bch2_dump_trans_updates(trans);
- printbuf_exit(err);
-
- return -EINVAL;
-}
-#endif
-
-/*
- * Get journal reservation, take write locks, and attempt to do btree update(s):
- */
-static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags,
- struct btree_insert_entry **stopped_at,
- unsigned long trace_ip)
-{
- struct bch_fs *c = trans->c;
- struct btree_insert_entry *i;
- int ret = 0, u64s_delta = 0;
-
-#ifdef CONFIG_BCACHEFS_DEBUG
- trans_for_each_update(trans, i) {
- struct printbuf buf = PRINTBUF;
- enum bkey_invalid_flags invalid_flags = 0;
-
- if (!(flags & BTREE_INSERT_JOURNAL_REPLAY))
- invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT;
-
- if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
- i->bkey_type, invalid_flags, &buf)))
- ret = bch2_trans_commit_bkey_invalid(trans, flags, i, &buf);
- btree_insert_entry_checks(trans, i);
- printbuf_exit(&buf);
-
- if (ret)
- return ret;
- }
-#endif
-
- trans_for_each_update(trans, i) {
- if (i->cached)
- continue;
-
- u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0;
- u64s_delta -= i->old_btree_u64s;
-
- if (!same_leaf_as_next(trans, i)) {
- if (u64s_delta <= 0) {
- ret = bch2_foreground_maybe_merge(trans, i->path,
- i->level, flags);
- if (unlikely(ret))
- return ret;
- }
-
- u64s_delta = 0;
- }
- }
-
- ret = bch2_journal_preres_get(&c->journal,
- &trans->journal_preres, trans->journal_preres_u64s,
- (flags & BCH_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK);
- if (unlikely(ret == -BCH_ERR_journal_preres_get_blocked))
- ret = bch2_trans_journal_preres_get_cold(trans, flags, trace_ip);
- if (unlikely(ret))
- return ret;
-
- ret = trans_lock_write(trans);
- if (unlikely(ret))
- return ret;
-
- ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip);
-
- if (!ret && unlikely(trans->journal_replay_not_finished))
- bch2_drop_overwrites_from_journal(trans);
-
- trans_for_each_update(trans, i)
- if (!same_leaf_as_prev(trans, i))
- bch2_btree_node_unlock_write_inlined(trans, i->path,
- insert_l(i)->b);
-
- if (!ret && trans->journal_pin)
- bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
- trans->journal_pin, NULL);
-
- /*
- * Drop journal reservation after dropping write locks, since dropping
- * the journal reservation may kick off a journal write:
- */
- bch2_journal_res_put(&c->journal, &trans->journal_res);
-
- if (unlikely(ret))
- return ret;
-
- bch2_trans_downgrade(trans);
-
- return 0;
-}
-
-static int journal_reclaim_wait_done(struct bch_fs *c)
-{
- int ret = bch2_journal_error(&c->journal) ?:
- !bch2_btree_key_cache_must_wait(c);
-
- if (!ret)
- journal_reclaim_kick(&c->journal);
- return ret;
-}
-
-static noinline
-int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
- struct btree_insert_entry *i,
- int ret, unsigned long trace_ip)
-{
- struct bch_fs *c = trans->c;
-
- switch (ret) {
- case -BCH_ERR_btree_insert_btree_node_full:
- ret = bch2_btree_split_leaf(trans, i->path, flags);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path);
- break;
- case -BCH_ERR_btree_insert_need_mark_replicas:
- ret = drop_locks_do(trans,
- bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas));
- break;
- case -BCH_ERR_journal_res_get_blocked:
- /*
- * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
- * flag
- */
- if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
- (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) {
- ret = -BCH_ERR_journal_reclaim_would_deadlock;
- break;
- }
-
- ret = drop_locks_do(trans,
- bch2_trans_journal_res_get(trans,
- (flags & BCH_WATERMARK_MASK)|
- JOURNAL_RES_GET_CHECK));
- break;
- case -BCH_ERR_btree_insert_need_journal_reclaim:
- bch2_trans_unlock(trans);
-
- trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip);
-
- wait_event_freezable(c->journal.reclaim_wait,
- (ret = journal_reclaim_wait_done(c)));
- if (ret < 0)
- break;
-
- ret = bch2_trans_relock(trans);
- break;
- case -BCH_ERR_btree_insert_need_flush_buffer: {
- struct btree_write_buffer *wb = &c->btree_write_buffer;
-
- ret = 0;
-
- if (wb->state.nr > wb->size * 3 / 4) {
- bch2_trans_unlock(trans);
- mutex_lock(&wb->flush_lock);
-
- if (wb->state.nr > wb->size * 3 / 4) {
- bch2_trans_begin(trans);
- ret = __bch2_btree_write_buffer_flush(trans,
- flags|BTREE_INSERT_NOCHECK_RW, true);
- if (!ret) {
- trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
- ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
- }
- } else {
- mutex_unlock(&wb->flush_lock);
- ret = bch2_trans_relock(trans);
- }
- }
- break;
- }
- default:
- BUG_ON(ret >= 0);
- break;
- }
-
- BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);
-
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) &&
- !(flags & BTREE_INSERT_NOWAIT) &&
- (flags & BTREE_INSERT_NOFAIL), c,
- "%s: incorrectly got %s\n", __func__, bch2_err_str(ret));
-
- return ret;
-}
-
-static noinline int
-bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags)
-{
- struct bch_fs *c = trans->c;
- int ret;
-
- if (likely(!(flags & BTREE_INSERT_LAZY_RW)) ||
- test_bit(BCH_FS_STARTED, &c->flags))
- return -BCH_ERR_erofs_trans_commit;
-
- ret = drop_locks_do(trans, bch2_fs_read_write_early(c));
- if (ret)
- return ret;
-
- bch2_write_ref_get(c, BCH_WRITE_REF_trans);
- return 0;
-}
-
-/*
- * This is for updates done in the early part of fsck - btree_gc - before we've
- * gone RW. we only add the new key to the list of keys for journal replay to
- * do.
- */
-static noinline int
-do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
-{
- struct bch_fs *c = trans->c;
- struct btree_insert_entry *i;
- int ret = 0;
-
- trans_for_each_update(trans, i) {
- ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k);
- if (ret)
- break;
- }
-
- return ret;
-}
-
-int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct btree_insert_entry *i = NULL;
- struct btree_write_buffered_key *wb;
- unsigned u64s;
- int ret = 0;
-
- if (!trans->nr_updates &&
- !trans->nr_wb_updates &&
- !trans->extra_journal_entries.nr)
- goto out_reset;
-
- if (flags & BTREE_INSERT_GC_LOCK_HELD)
- lockdep_assert_held(&c->gc_lock);
-
- ret = bch2_trans_commit_run_triggers(trans);
- if (ret)
- goto out_reset;
-
- if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) {
- ret = do_bch2_trans_commit_to_journal_replay(trans);
- goto out_reset;
- }
-
- if (!(flags & BTREE_INSERT_NOCHECK_RW) &&
- unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) {
- ret = bch2_trans_commit_get_rw_cold(trans, flags);
- if (ret)
- goto out_reset;
- }
-
- if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 &&
- mutex_trylock(&c->btree_write_buffer.flush_lock)) {
- bch2_trans_begin(trans);
- bch2_trans_unlock(trans);
-
- ret = __bch2_btree_write_buffer_flush(trans,
- flags|BTREE_INSERT_NOCHECK_RW, true);
- if (!ret) {
- trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
- ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
- }
- goto out;
- }
-
- EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
-
- memset(&trans->journal_preres, 0, sizeof(trans->journal_preres));
-
- trans->journal_u64s = trans->extra_journal_entries.nr;
- trans->journal_preres_u64s = 0;
-
- trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
-
- if (trans->journal_transaction_names)
- trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
-
- trans_for_each_update(trans, i) {
- EBUG_ON(!i->path->should_be_locked);
-
- ret = bch2_btree_path_upgrade(trans, i->path, i->level + 1);
- if (unlikely(ret))
- goto out;
-
- EBUG_ON(!btree_node_intent_locked(i->path, i->level));
-
- if (i->key_cache_already_flushed)
- continue;
-
- /* we're going to journal the key being updated: */
- u64s = jset_u64s(i->k->k.u64s);
- if (i->cached &&
- likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY)))
- trans->journal_preres_u64s += u64s;
-
- if (i->flags & BTREE_UPDATE_NOJOURNAL)
- continue;
-
- trans->journal_u64s += u64s;
-
- /* and we're also going to log the overwrite: */
- if (trans->journal_transaction_names)
- trans->journal_u64s += jset_u64s(i->old_k.u64s);
- }
-
- trans_for_each_wb_update(trans, wb)
- trans->journal_u64s += jset_u64s(wb->k.k.u64s);
-
- if (trans->extra_journal_res) {
- ret = bch2_disk_reservation_add(c, trans->disk_res,
- trans->extra_journal_res,
- (flags & BTREE_INSERT_NOFAIL)
- ? BCH_DISK_RESERVATION_NOFAIL : 0);
- if (ret)
- goto err;
- }
-retry:
- bch2_trans_verify_not_in_restart(trans);
- memset(&trans->journal_res, 0, sizeof(trans->journal_res));
-
- ret = do_bch2_trans_commit(trans, flags, &i, _RET_IP_);
-
- /* make sure we didn't drop or screw up locks: */
- bch2_trans_verify_locks(trans);
-
- if (ret)
- goto err;
-
- trace_and_count(c, transaction_commit, trans, _RET_IP_);
-out:
- bch2_journal_preres_put(&c->journal, &trans->journal_preres);
-
- if (likely(!(flags & BTREE_INSERT_NOCHECK_RW)))
- bch2_write_ref_put(c, BCH_WRITE_REF_trans);
-out_reset:
- bch2_trans_reset_updates(trans);
-
- return ret;
-err:
- ret = bch2_trans_commit_error(trans, flags, i, ret, _RET_IP_);
- if (ret)
- goto out;
-
- goto retry;
-}
-
-static noinline int __check_pos_snapshot_overwritten(struct btree_trans *trans,
- enum btree_id id,
- struct bpos pos)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- bch2_trans_iter_init(trans, &iter, id, pos,
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_ALL_SNAPSHOTS);
- while (1) {
- k = bch2_btree_iter_prev(&iter);
- ret = bkey_err(k);
- if (ret)
- break;
-
- if (!k.k)
- break;
-
- if (!bkey_eq(pos, k.k->p))
- break;
-
- if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
- ret = 1;
- break;
- }
- }
- bch2_trans_iter_exit(trans, &iter);
-
- return ret;
-}
-
-static inline int check_pos_snapshot_overwritten(struct btree_trans *trans,
- enum btree_id id,
- struct bpos pos)
-{
- if (!btree_type_has_snapshots(id) ||
- bch2_snapshot_is_leaf(trans->c, pos.snapshot))
- return 0;
-
- return __check_pos_snapshot_overwritten(trans, id, pos);
-}
-
-static noinline int extent_front_merge(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k,
- struct bkey_i **insert,
- enum btree_update_flags flags)
-{
- struct bch_fs *c = trans->c;
- struct bkey_i *update;
- int ret;
-
- update = bch2_bkey_make_mut_noupdate(trans, k);
- ret = PTR_ERR_OR_ZERO(update);
- if (ret)
- return ret;
-
- if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert)))
- return 0;
-
- ret = check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p) ?:
- check_pos_snapshot_overwritten(trans, iter->btree_id, (*insert)->k.p);
- if (ret < 0)
- return ret;
- if (ret)
- return 0;
-
- ret = bch2_btree_delete_at(trans, iter, flags);
- if (ret)
- return ret;
-
- *insert = update;
- return 0;
-}
-
-static noinline int extent_back_merge(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i *insert,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- int ret;
-
- ret = check_pos_snapshot_overwritten(trans, iter->btree_id, insert->k.p) ?:
- check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p);
- if (ret < 0)
- return ret;
- if (ret)
- return 0;
-
- bch2_bkey_merge(c, bkey_i_to_s(insert), k);
- return 0;
-}
-
-/*
- * When deleting, check if we need to emit a whiteout (because we're overwriting
- * something in an ancestor snapshot)
- */
-static int need_whiteout_for_snapshot(struct btree_trans *trans,
- enum btree_id btree_id, struct bpos pos)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- u32 snapshot = pos.snapshot;
- int ret;
-
- if (!bch2_snapshot_parent(trans->c, pos.snapshot))
- return 0;
-
- pos.snapshot++;
-
- for_each_btree_key_norestart(trans, iter, btree_id, pos,
- BTREE_ITER_ALL_SNAPSHOTS|
- BTREE_ITER_NOPRESERVE, k, ret) {
- if (!bkey_eq(k.k->p, pos))
- break;
-
- if (bch2_snapshot_is_ancestor(trans->c, snapshot,
- k.k->p.snapshot)) {
- ret = !bkey_whiteout(k.k);
- break;
- }
- }
- bch2_trans_iter_exit(trans, &iter);
-
- return ret;
-}
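
/*
 * Editorial sketch, not part of the patch: the whiteout decision hinges on
 * snapshot ancestry - deleting a key that is still visible from an ancestor
 * snapshot must leave a whiteout, or lookups in the child would fall through
 * to the ancestor's key. A toy ancestry test over a parent array:
 */
#include <stdbool.h>
#include <stdio.h>

static const unsigned parent[] = { 0, 0, 1, 1, 2 };	/* 0 is the root */

static bool is_ancestor(unsigned id, unsigned ancestor)
{
	while (id && id != ancestor)
		id = parent[id];
	return id == ancestor;
}

int main(void)
{
	printf("%d\n", is_ancestor(4, 1));	/* 4 -> 2 -> 1: yes */
	printf("%d\n", is_ancestor(3, 2));	/* 3 -> 1 -> 0: no */
	return 0;
}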
-
-int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
- enum btree_id id,
- struct bpos old_pos,
- struct bpos new_pos)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter old_iter, new_iter = { NULL };
- struct bkey_s_c old_k, new_k;
- snapshot_id_list s;
- struct bkey_i *update;
- int ret;
-
- if (!bch2_snapshot_has_children(c, old_pos.snapshot))
- return 0;
-
- darray_init(&s);
-
- bch2_trans_iter_init(trans, &old_iter, id, old_pos,
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_ALL_SNAPSHOTS);
- while ((old_k = bch2_btree_iter_prev(&old_iter)).k &&
- !(ret = bkey_err(old_k)) &&
- bkey_eq(old_pos, old_k.k->p)) {
- struct bpos whiteout_pos =
-			SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);
-
- if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) ||
- snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot))
- continue;
-
- new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos,
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_INTENT);
- ret = bkey_err(new_k);
- if (ret)
- break;
-
- if (new_k.k->type == KEY_TYPE_deleted) {
- update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
- ret = PTR_ERR_OR_ZERO(update);
- if (ret)
- break;
-
- bkey_init(&update->k);
- update->k.p = whiteout_pos;
- update->k.type = KEY_TYPE_whiteout;
-
- ret = bch2_trans_update(trans, &new_iter, update,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
- }
- bch2_trans_iter_exit(trans, &new_iter);
-
- ret = snapshot_list_add(c, &s, old_k.k->p.snapshot);
- if (ret)
- break;
- }
- bch2_trans_iter_exit(trans, &new_iter);
- bch2_trans_iter_exit(trans, &old_iter);
- darray_exit(&s);
-
- return ret;
-}
-
-int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
- struct btree_iter *iter,
- enum btree_update_flags flags,
- struct bkey_s_c old,
- struct bkey_s_c new)
-{
- enum btree_id btree_id = iter->btree_id;
- struct bkey_i *update;
- struct bpos new_start = bkey_start_pos(new.k);
- bool front_split = bkey_lt(bkey_start_pos(old.k), new_start);
- bool back_split = bkey_gt(old.k->p, new.k->p);
- int ret = 0, compressed_sectors;
-
- /*
- * If we're going to be splitting a compressed extent, note it
- * so that __bch2_trans_commit() can increase our disk
- * reservation:
- */
- if (((front_split && back_split) ||
- ((front_split || back_split) && old.k->p.snapshot != new.k->p.snapshot)) &&
- (compressed_sectors = bch2_bkey_sectors_compressed(old)))
- trans->extra_journal_res += compressed_sectors;
-
- if (front_split) {
- update = bch2_bkey_make_mut_noupdate(trans, old);
- if ((ret = PTR_ERR_OR_ZERO(update)))
- return ret;
-
- bch2_cut_back(new_start, update);
-
- ret = bch2_insert_snapshot_whiteouts(trans, btree_id,
- old.k->p, update->k.p) ?:
- bch2_btree_insert_nonextent(trans, btree_id, update,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
- if (ret)
- return ret;
- }
-
- /* If we're overwriting in a different snapshot - middle split: */
- if (old.k->p.snapshot != new.k->p.snapshot &&
- (front_split || back_split)) {
- update = bch2_bkey_make_mut_noupdate(trans, old);
- if ((ret = PTR_ERR_OR_ZERO(update)))
- return ret;
-
- bch2_cut_front(new_start, update);
- bch2_cut_back(new.k->p, update);
-
- ret = bch2_insert_snapshot_whiteouts(trans, btree_id,
- old.k->p, update->k.p) ?:
- bch2_btree_insert_nonextent(trans, btree_id, update,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
- if (ret)
- return ret;
- }
-
- if (bkey_le(old.k->p, new.k->p)) {
- update = bch2_trans_kmalloc(trans, sizeof(*update));
- if ((ret = PTR_ERR_OR_ZERO(update)))
- return ret;
-
- bkey_init(&update->k);
- update->k.p = old.k->p;
- update->k.p.snapshot = new.k->p.snapshot;
-
- if (new.k->p.snapshot != old.k->p.snapshot) {
- update->k.type = KEY_TYPE_whiteout;
- } else if (btree_type_has_snapshots(btree_id)) {
- ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p);
- if (ret < 0)
- return ret;
- if (ret)
- update->k.type = KEY_TYPE_whiteout;
- }
-
- ret = bch2_btree_insert_nonextent(trans, btree_id, update,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
- if (ret)
- return ret;
- }
-
- if (back_split) {
- update = bch2_bkey_make_mut_noupdate(trans, old);
- if ((ret = PTR_ERR_OR_ZERO(update)))
- return ret;
-
- bch2_cut_front(new.k->p, update);
-
- ret = bch2_trans_update_by_path(trans, iter->path, update,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
- flags, _RET_IP_);
- if (ret)
- return ret;
- }
-
- return 0;
-}
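
/*
 * Editorial sketch, not part of the patch: the split cases above come from
 * simple interval arithmetic on half-open extents. Given an existing extent
 * [old_start, old_end) overwritten by [new_start, new_end), the old key may
 * stick out at the front, at the back, both, or neither:
 */
#include <stdio.h>

static void classify(unsigned os, unsigned oe, unsigned ns, unsigned ne)
{
	int front_split = os < ns;	/* old sticks out before new */
	int back_split  = oe > ne;	/* old sticks out after new */

	printf("[%u,%u) vs [%u,%u): front=%d back=%d\n",
	       os, oe, ns, ne, front_split, back_split);
}

int main(void)
{
	classify(0, 10, 2, 8);	/* both splits */
	classify(0, 10, 2, 12);	/* front split only */
	classify(2, 10, 0, 8);	/* back split only */
	classify(2, 8, 0, 10);	/* fully covered: no split */
	return 0;
}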
-
-static int bch2_trans_update_extent(struct btree_trans *trans,
- struct btree_iter *orig_iter,
- struct bkey_i *insert,
- enum btree_update_flags flags)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- enum btree_id btree_id = orig_iter->btree_id;
- int ret = 0;
-
- bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k),
- BTREE_ITER_INTENT|
- BTREE_ITER_WITH_UPDATES|
- BTREE_ITER_NOT_EXTENTS);
- k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
- if ((ret = bkey_err(k)))
- goto err;
- if (!k.k)
- goto out;
-
- if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) {
- if (bch2_bkey_maybe_mergable(k.k, &insert->k)) {
- ret = extent_front_merge(trans, &iter, k, &insert, flags);
- if (ret)
- goto err;
- }
-
- goto next;
- }
-
- while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) {
- bool done = bkey_lt(insert->k.p, k.k->p);
-
- ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert));
- if (ret)
- goto err;
-
- if (done)
- goto out;
-next:
- bch2_btree_iter_advance(&iter);
- k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
- if ((ret = bkey_err(k)))
- goto err;
- if (!k.k)
- goto out;
- }
-
- if (bch2_bkey_maybe_mergable(&insert->k, k.k)) {
- ret = extent_back_merge(trans, &iter, insert, k);
- if (ret)
- goto err;
- }
-out:
- if (!bkey_deleted(&insert->k))
- ret = bch2_btree_insert_nonextent(trans, btree_id, insert, flags);
-err:
- bch2_trans_iter_exit(trans, &iter);
-
- return ret;
-}
-
-static noinline int flush_new_cached_update(struct btree_trans *trans,
- struct btree_path *path,
- struct btree_insert_entry *i,
- enum btree_update_flags flags,
- unsigned long ip)
-{
- struct btree_path *btree_path;
- struct bkey k;
- int ret;
-
- btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
- BTREE_ITER_INTENT, _THIS_IP_);
- ret = bch2_btree_path_traverse(trans, btree_path, 0);
- if (ret)
- goto out;
-
- /*
- * The old key in the insert entry might actually refer to an existing
- * key in the btree that has been deleted from cache and not yet
- * flushed. Check for this and skip the flush so we don't run triggers
- * against a stale key.
- */
- bch2_btree_path_peek_slot_exact(btree_path, &k);
- if (!bkey_deleted(&k))
- goto out;
-
- i->key_cache_already_flushed = true;
- i->flags |= BTREE_TRIGGER_NORUN;
-
- btree_path_set_should_be_locked(btree_path);
- ret = bch2_trans_update_by_path(trans, btree_path, i->k, flags, ip);
-out:
- bch2_path_put(trans, btree_path, true);
- return ret;
-}
-
-static int __must_check
-bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
- struct bkey_i *k, enum btree_update_flags flags,
- unsigned long ip)
-{
- struct bch_fs *c = trans->c;
- struct btree_insert_entry *i, n;
- u64 seq = 0;
- int cmp;
-
- EBUG_ON(!path->should_be_locked);
- EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX);
- EBUG_ON(!bpos_eq(k->k.p, path->pos));
-
- /*
- * The transaction journal res hasn't been allocated at this point.
- * That occurs at commit time. Reuse the seq field to pass in the seq
- * of a prejournaled key.
- */
- if (flags & BTREE_UPDATE_PREJOURNAL)
- seq = trans->journal_res.seq;
-
- n = (struct btree_insert_entry) {
- .flags = flags,
- .bkey_type = __btree_node_type(path->level, path->btree_id),
- .btree_id = path->btree_id,
- .level = path->level,
- .cached = path->cached,
- .path = path,
- .k = k,
- .seq = seq,
- .ip_allocated = ip,
- };
-
-#ifdef CONFIG_BCACHEFS_DEBUG
- trans_for_each_update(trans, i)
- BUG_ON(i != trans->updates &&
- btree_insert_entry_cmp(i - 1, i) >= 0);
-#endif
-
- /*
- * Pending updates are kept sorted: first, find position of new update,
- * then delete/trim any updates the new update overwrites:
- */
- trans_for_each_update(trans, i) {
- cmp = btree_insert_entry_cmp(&n, i);
- if (cmp <= 0)
- break;
- }
-
- if (!cmp && i < trans->updates + trans->nr_updates) {
- EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run);
-
- bch2_path_put(trans, i->path, true);
- i->flags = n.flags;
- i->cached = n.cached;
- i->k = n.k;
- i->path = n.path;
- i->seq = n.seq;
- i->ip_allocated = n.ip_allocated;
- } else {
- array_insert_item(trans->updates, trans->nr_updates,
- i - trans->updates, n);
-
- i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v;
- i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0;
-
- if (unlikely(trans->journal_replay_not_finished)) {
- struct bkey_i *j_k =
- bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p);
-
- if (j_k) {
- i->old_k = j_k->k;
- i->old_v = &j_k->v;
- }
- }
- }
-
- __btree_path_get(i->path, true);
-
- /*
- * If a key is present in the key cache, it must also exist in the
- * btree - this is necessary for cache coherency. When iterating over
- * a btree that's cached in the key cache, the btree iter code checks
- * the key cache - but the key has to exist in the btree for that to
- * work:
- */
- if (path->cached && bkey_deleted(&i->old_k))
- return flush_new_cached_update(trans, path, i, flags, ip);
-
- return 0;
-}
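
/*
 * Editorial sketch, not part of the patch: the update list is kept sorted,
 * and a new update either replaces an exact match in place or is inserted at
 * its sorted position, shifting the tail up. The same pattern over a plain
 * int array:
 */
#include <stdio.h>
#include <string.h>

#define MAX_KEYS	8

static int keys[MAX_KEYS];
static unsigned nr_keys;

static void insert_or_replace(int k)
{
	unsigned i;

	for (i = 0; i < nr_keys; i++)	/* find sorted position */
		if (keys[i] >= k)
			break;

	if (i < nr_keys && keys[i] == k)
		return;			/* exact match: nothing to shift */

	memmove(&keys[i + 1], &keys[i], (nr_keys - i) * sizeof(keys[0]));
	keys[i] = k;
	nr_keys++;
}

int main(void)
{
	int input[] = { 5, 1, 3, 3, 2 };

	for (unsigned i = 0; i < sizeof(input) / sizeof(input[0]); i++)
		insert_or_replace(input[i]);

	for (unsigned i = 0; i < nr_keys; i++)
		printf("%d ", keys[i]);	/* prints: 1 2 3 5 */
	printf("\n");
	return 0;
}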
-
-int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_i *k, enum btree_update_flags flags)
-{
- struct btree_path *path = iter->update_path ?: iter->path;
- struct bkey_cached *ck;
- int ret;
-
- if (iter->flags & BTREE_ITER_IS_EXTENTS)
- return bch2_trans_update_extent(trans, iter, k, flags);
-
- if (bkey_deleted(&k->k) &&
- !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
- (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) {
- ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p);
- if (unlikely(ret < 0))
- return ret;
-
- if (ret)
- k->k.type = KEY_TYPE_whiteout;
- }
-
- /*
- * Ensure that updates to cached btrees go to the key cache:
- */
- if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
- !path->cached &&
- !path->level &&
- btree_id_cached(trans->c, path->btree_id)) {
- if (!iter->key_cache_path ||
- !iter->key_cache_path->should_be_locked ||
- !bpos_eq(iter->key_cache_path->pos, k->k.p)) {
- if (!iter->key_cache_path)
- iter->key_cache_path =
- bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
- BTREE_ITER_INTENT|
- BTREE_ITER_CACHED, _THIS_IP_);
-
- iter->key_cache_path =
- bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos,
- iter->flags & BTREE_ITER_INTENT,
- _THIS_IP_);
-
- ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
- BTREE_ITER_CACHED);
- if (unlikely(ret))
- return ret;
-
- ck = (void *) iter->key_cache_path->l[0].b;
-
- if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
- }
-
- btree_path_set_should_be_locked(iter->key_cache_path);
- }
-
- path = iter->key_cache_path;
- }
-
- return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_);
-}
-
-/*
- * Add a transaction update for a key that has already been journaled.
- */
-int __must_check bch2_trans_update_seq(struct btree_trans *trans, u64 seq,
- struct btree_iter *iter, struct bkey_i *k,
- enum btree_update_flags flags)
-{
- trans->journal_res.seq = seq;
- return bch2_trans_update(trans, iter, k, flags|BTREE_UPDATE_NOJOURNAL|
- BTREE_UPDATE_PREJOURNAL);
-}
-
-int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
- enum btree_id btree,
- struct bkey_i *k)
-{
- struct btree_write_buffered_key *i;
- int ret;
-
- EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size);
- EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
-
- trans_for_each_wb_update(trans, i) {
- if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) {
- bkey_copy(&i->k, k);
- return 0;
- }
- }
-
- if (!trans->wb_updates ||
- trans->nr_wb_updates == trans->wb_updates_size) {
- struct btree_write_buffered_key *u;
-
- if (trans->nr_wb_updates == trans->wb_updates_size) {
- struct btree_transaction_stats *s = btree_trans_stats(trans);
-
- BUG_ON(trans->wb_updates_size > U8_MAX / 2);
- trans->wb_updates_size = max(1, trans->wb_updates_size * 2);
- if (s)
- s->wb_updates_size = trans->wb_updates_size;
- }
-
- u = bch2_trans_kmalloc_nomemzero(trans,
- trans->wb_updates_size *
- sizeof(struct btree_write_buffered_key));
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- return ret;
-
- if (trans->nr_wb_updates)
- memcpy(u, trans->wb_updates, trans->nr_wb_updates *
- sizeof(struct btree_write_buffered_key));
- trans->wb_updates = u;
- }
-
- trans->wb_updates[trans->nr_wb_updates] = (struct btree_write_buffered_key) {
- .btree = btree,
- };
-
- bkey_copy(&trans->wb_updates[trans->nr_wb_updates].k, k);
- trans->nr_wb_updates++;
-
- return 0;
-}
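
/*
 * Editorial sketch, not part of the patch: wb_updates grows by doubling,
 * the standard way to get amortized O(1) appends. The bare pattern with
 * malloc, all names hypothetical:
 */
#include <stdlib.h>
#include <string.h>

struct buf {
	int		*data;
	unsigned	nr, size;
};

static int buf_push(struct buf *b, int v)
{
	if (b->nr == b->size) {
		unsigned new_size = b->size ? b->size * 2 : 1;
		int *n = malloc(new_size * sizeof(*n));

		if (!n)
			return -1;
		if (b->nr)
			memcpy(n, b->data, b->nr * sizeof(*n));
		free(b->data);
		b->data = n;
		b->size = new_size;
	}

	b->data[b->nr++] = v;
	return 0;
}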
-
-int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
- enum btree_id btree, struct bpos end)
-{
- struct bkey_s_c k;
- int ret = 0;
-
- bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_INTENT);
- k = bch2_btree_iter_prev(iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- bch2_btree_iter_advance(iter);
- k = bch2_btree_iter_peek_slot(iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- BUG_ON(k.k->type != KEY_TYPE_deleted);
-
- if (bkey_gt(k.k->p, end)) {
- ret = -BCH_ERR_ENOSPC_btree_slot;
- goto err;
- }
-
- return 0;
-err:
- bch2_trans_iter_exit(trans, iter);
- return ret;
-}
-
-void bch2_trans_commit_hook(struct btree_trans *trans,
- struct btree_trans_commit_hook *h)
-{
- h->next = trans->hooks;
- trans->hooks = h;
-}
-
-int bch2_btree_insert_nonextent(struct btree_trans *trans,
- enum btree_id btree, struct bkey_i *k,
- enum btree_update_flags flags)
-{
- struct btree_iter iter;
- int ret;
-
- bch2_trans_iter_init(trans, &iter, btree, k->k.p,
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_INTENT);
- ret = bch2_btree_iter_traverse(&iter) ?:
- bch2_trans_update(trans, &iter, k, flags);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int __bch2_btree_insert(struct btree_trans *trans, enum btree_id id,
- struct bkey_i *k, enum btree_update_flags flags)
-{
- struct btree_iter iter;
- int ret;
-
- bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k),
- BTREE_ITER_CACHED|
- BTREE_ITER_INTENT);
- ret = bch2_btree_iter_traverse(&iter) ?:
- bch2_trans_update(trans, &iter, k, flags);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/**
- * bch2_btree_insert - insert a key into a given btree
- * @c:			pointer to struct bch_fs
- * @id:			btree to insert into
- * @k:			key to insert
- * @disk_res:		disk reservation, or NULL
- * @journal_seq:	if non-NULL, filled in with the commit's journal sequence number
- * @flags:		transaction commit flags
- */
-int bch2_btree_insert(struct bch_fs *c, enum btree_id id,
- struct bkey_i *k,
- struct disk_reservation *disk_res,
- u64 *journal_seq, int flags)
-{
- return bch2_trans_do(c, disk_res, journal_seq, flags,
- __bch2_btree_insert(&trans, id, k, 0));
-}
-
-int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter,
- unsigned len, unsigned update_flags)
-{
- struct bkey_i *k;
-
- k = bch2_trans_kmalloc(trans, sizeof(*k));
- if (IS_ERR(k))
- return PTR_ERR(k);
-
- bkey_init(&k->k);
- k->k.p = iter->pos;
- bch2_key_resize(&k->k, len);
- return bch2_trans_update(trans, iter, k, update_flags);
-}
-
-int bch2_btree_delete_at(struct btree_trans *trans,
- struct btree_iter *iter, unsigned update_flags)
-{
- return bch2_btree_delete_extent_at(trans, iter, 0, update_flags);
-}
-
-int bch2_btree_delete_at_buffered(struct btree_trans *trans,
- enum btree_id btree, struct bpos pos)
-{
- struct bkey_i *k;
-
- k = bch2_trans_kmalloc(trans, sizeof(*k));
- if (IS_ERR(k))
- return PTR_ERR(k);
-
- bkey_init(&k->k);
- k->k.p = pos;
- return bch2_trans_update_buffered(trans, btree, k);
-}
-
-int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
- struct bpos start, struct bpos end,
- unsigned update_flags,
- u64 *journal_seq)
-{
- u32 restart_count = trans->restart_count;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT);
- while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) {
- struct disk_reservation disk_res =
- bch2_disk_reservation_init(trans->c, 0);
- struct bkey_i delete;
-
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- bkey_init(&delete.k);
-
- /*
- * This could probably be more efficient for extents:
- */
-
- /*
- * For extents, iter.pos won't necessarily be the same as
- * bkey_start_pos(k.k) (for non extents they always will be the
- * same). It's important that we delete starting from iter.pos
- * because the range we want to delete could start in the middle
- * of k.
- *
- * (bch2_btree_iter_peek() does guarantee that iter.pos >=
- * bkey_start_pos(k.k)).
- */
- delete.k.p = iter.pos;
-
- if (iter.flags & BTREE_ITER_IS_EXTENTS)
- bch2_key_resize(&delete.k,
- bpos_min(end, k.k->p).offset -
- iter.pos.offset);
-
- ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?:
- bch2_trans_commit(trans, &disk_res, journal_seq,
- BTREE_INSERT_NOFAIL);
- bch2_disk_reservation_put(trans->c, &disk_res);
-err:
- /*
- * the bch2_trans_begin() call is in a weird place because we
- * need to call it after every transaction commit, to avoid path
- * overflow, but don't want to call it if the delete operation
- * is a no-op and we have no work to do:
- */
- bch2_trans_begin(trans);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- ret = 0;
- if (ret)
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
-
- if (!ret && trans_was_restarted(trans, restart_count))
- ret = -BCH_ERR_transaction_restart_nested;
- return ret;
-}
-
-/*
- * bch2_btree_delete_range - delete everything within a given range
- *
- * Range is a half open interval - [start, end)
- */
-int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
- struct bpos start, struct bpos end,
- unsigned update_flags,
- u64 *journal_seq)
-{
- int ret = bch2_trans_run(c,
- bch2_btree_delete_range_trans(&trans, id, start, end,
- update_flags, journal_seq));
- if (ret == -BCH_ERR_transaction_restart_nested)
- ret = 0;
- return ret;
-}
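
/*
 * Editorial sketch, not part of the patch: because the interval is half-open,
 * deleting everything belonging to one inode is just [POS(inum, 0),
 * POS(inum + 1, 0)). A hypothetical call, illustrative only - not how the
 * filesystem itself removes an inode's extents:
 *
 *	ret = bch2_btree_delete_range(c, BTREE_ID_extents,
 *				      POS(inum, 0), POS(inum + 1, 0),
 *				      0, NULL);
 */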
-
-int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
- struct bpos pos, bool set)
-{
- struct bkey_i *k;
- int ret = 0;
-
- k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k));
- ret = PTR_ERR_OR_ZERO(k);
- if (unlikely(ret))
- return ret;
-
- bkey_init(&k->k);
- k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
- k->k.p = pos;
-
- return bch2_trans_update_buffered(trans, btree, k);
-}
-
-static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args)
-{
- struct printbuf buf = PRINTBUF;
- struct jset_entry_log *l;
- unsigned u64s;
- int ret;
-
- prt_vprintf(&buf, fmt, args);
- ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
- if (ret)
- goto err;
-
- u64s = DIV_ROUND_UP(buf.pos, sizeof(u64));
-
- ret = darray_make_room(entries, jset_u64s(u64s));
- if (ret)
- goto err;
-
- l = (void *) &darray_top(*entries);
- l->entry.u64s = cpu_to_le16(u64s);
- l->entry.btree_id = 0;
- l->entry.level = 1;
- l->entry.type = BCH_JSET_ENTRY_log;
- l->entry.pad[0] = 0;
- l->entry.pad[1] = 0;
- l->entry.pad[2] = 0;
- memcpy(l->d, buf.buf, buf.pos);
- while (buf.pos & 7)
- l->d[buf.pos++] = '\0';
-
- entries->nr += jset_u64s(u64s);
-err:
- printbuf_exit(&buf);
- return ret;
-}
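
/*
 * Editorial sketch, not part of the patch: a log entry's payload is measured
 * in u64s, so the message length is rounded up and the tail zero-padded, as
 * the loop above does. The arithmetic in isolation:
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	const char *msg = "hello journal";	/* 13 bytes */
	size_t u64s = DIV_ROUND_UP(strlen(msg), sizeof(uint64_t));
	char buf[2 * sizeof(uint64_t)] = { 0 };	/* 16 bytes, zeroed */

	memcpy(buf, msg, strlen(msg));	/* bytes 13..15 remain '\0' padding */
	printf("%zu bytes -> %zu u64s (%zu bytes)\n",
	       strlen(msg), u64s, u64s * sizeof(uint64_t));
	return 0;
}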
-
-static int
-__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
- va_list args)
-{
- int ret;
-
- if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) {
- ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args);
- } else {
- ret = bch2_trans_do(c, NULL, NULL,
- BTREE_INSERT_LAZY_RW|commit_flags,
- __bch2_trans_log_msg(&trans.extra_journal_entries, fmt, args));
- }
-
- return ret;
-}
-
-int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...)
-{
- va_list args;
- int ret;
-
- va_start(args, fmt);
- ret = __bch2_fs_log_msg(c, 0, fmt, args);
- va_end(args);
- return ret;
-}
-
-/*
- * Use for logging messages during recovery to enable reserved space and avoid
- * blocking.
- */
-int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...)
-{
- va_list args;
- int ret;
-
- va_start(args, fmt);
- ret = __bch2_fs_log_msg(c, BCH_WATERMARK_reclaim, fmt, args);
- va_end(args);
- return ret;
-}
#include <linux/sort.h>
+static int bch2_btree_write_buffer_journal_flush(struct journal *,
+ struct journal_entry_pin *, u64);
+
static int btree_write_buffered_key_cmp(const void *_l, const void *_r)
{
const struct btree_write_buffered_key *l = _l;
if (ret)
return ret;
+ /*
+ * We can't clone a path that has write locks: unshare it now, before
+ * set_pos and traverse():
+ */
+ if (iter->path->ref > 1)
+ iter->path = __bch2_btree_path_make_mut(trans, iter->path, true, _THIS_IP_);
+
path = iter->path;
if (!*write_locked) {
bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq);
(*fast)++;
-
- if (path->ref > 1) {
- /*
- * We can't clone a path that has write locks: if the path is
- * shared, unlock before set_pos(), traverse():
- */
- bch2_btree_node_unlock_write(trans, path, path->l[0].b);
- *write_locked = false;
- }
return 0;
trans_commit:
- return bch2_trans_update_seq(trans, wb->journal_seq, iter, &wb->k, 0) ?:
+ trans->journal_res.seq = wb->journal_seq;
+
+ return bch2_trans_update(trans, iter, &wb->k,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, NULL, NULL,
commit_flags|
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_RECLAIM);
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_no_journal_res|
+ BCH_TRANS_COMMIT_journal_reclaim);
}
static union btree_write_buffer_state btree_write_buffer_switch(struct btree_write_buffer *wb)
bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k),
BTREE_ITER_CACHED|BTREE_ITER_INTENT);
+ trans->journal_res.seq = wb->journal_seq;
+
ret = bch2_btree_iter_traverse(&iter) ?:
- bch2_trans_update_seq(trans, wb->journal_seq, &iter, &wb->k, 0);
+ bch2_trans_update(trans, &iter, &wb->k,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
if (!locked && !mutex_trylock(&wb->flush_lock))
return 0;
- bch2_journal_pin_copy(j, &pin, &wb->journal_pin, NULL);
+ bch2_journal_pin_copy(j, &pin, &wb->journal_pin,
+ bch2_btree_write_buffer_journal_flush);
bch2_journal_pin_drop(j, &wb->journal_pin);
s = btree_write_buffer_switch(wb);
* However, since we're not flushing in the order they appear in the
* journal we won't be able to drop our journal pin until everything is
* flushed - which means this could deadlock the journal if we weren't
- * passing BTREE_INSERT_JOURNAL_RECLAIM. This causes the update to fail
+ * passing BCH_TRANS_COMMIT_journal_reclaim. This causes the update to fail
* if it would block taking a journal reservation.
*
* If that happens, simply skip the key so we can optimistically insert
if (!iter.path || iter.path->btree_id != i->btree) {
bch2_trans_iter_exit(trans, &iter);
- bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p, BTREE_ITER_INTENT);
+ bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p,
+ BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS);
}
bch2_btree_iter_set_pos(&iter, i->k.k.p);
if (!i->journal_seq)
continue;
- if (i->journal_seq > pin.seq) {
- struct journal_entry_pin pin2;
-
- memset(&pin2, 0, sizeof(pin2));
-
- bch2_journal_pin_add(j, i->journal_seq, &pin2, NULL);
- bch2_journal_pin_drop(j, &pin);
- bch2_journal_pin_copy(j, &pin, &pin2, NULL);
- bch2_journal_pin_drop(j, &pin2);
- }
+ bch2_journal_pin_update(j, i->journal_seq, &pin,
+ bch2_btree_write_buffer_journal_flush);
ret = commit_do(trans, NULL, NULL,
commit_flags|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_RECLAIM,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_no_journal_res|
+ BCH_TRANS_COMMIT_journal_reclaim,
btree_write_buffered_insert(trans, i));
if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)))
break;
mutex_lock(&wb->flush_lock);
return bch2_trans_run(c,
- __bch2_btree_write_buffer_flush(&trans, BTREE_INSERT_NOCHECK_RW, true));
+ __bch2_btree_write_buffer_flush(trans, BCH_TRANS_COMMIT_no_check_rw, true));
}
static inline u64 btree_write_buffer_ref(int idx)
struct printbuf buf = PRINTBUF;
percpu_down_read(&c->mark_lock);
- buf.atomic++;
idx = bch2_replicas_entry_idx(c, r);
if (idx < 0 &&
- fsck_err(c, "no replicas entry\n"
- " while marking %s",
+ fsck_err(c, ptr_to_missing_replicas_entry,
+ "no replicas entry\n while marking %s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
percpu_up_read(&c->mark_lock);
ret = bch2_mark_replicas(c, r);
d = trans->fs_usage_deltas;
n = (void *) d->d + d->used;
n->delta = sectors;
- memcpy((void *) n + offsetof(struct replicas_delta, r),
- r, replicas_entry_bytes(r));
+ unsafe_memcpy((void *) n + offsetof(struct replicas_delta, r),
+ r, replicas_entry_bytes(r),
+			      "flexible array member embedded in struct with padding");
bch2_replicas_entry_sort(&n->r);
d->used += b;
return 0;
struct bch_fs *c = trans->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
- u16 bucket_sectors = !ptr->cached
+ u32 bucket_sectors = !ptr->cached
? dirty_sectors
: cached_sectors;
struct printbuf buf = PRINTBUF;
if (gen_after(ptr->gen, b_gen)) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen,
"bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ BCH_FSCK_ERR_ptr_too_stale,
"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
if (b_gen != ptr->gen && !ptr->cached) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ BCH_FSCK_ERR_stale_dirty_ptr,
"bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
ptr_data_type &&
bucket_data_type != ptr_data_type) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ BCH_FSCK_ERR_ptr_bucket_data_type_mismatch,
"bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
goto err;
}
- if ((unsigned) (bucket_sectors + sectors) > U32_MAX) {
+ if ((u64) bucket_sectors + sectors > U32_MAX) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
- "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n"
+ BCH_FSCK_ERR_bucket_sector_count_overflow,
+ "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
bch2_data_types[bucket_data_type ?: ptr_data_type],
	/* XXX doesn't handle deletion */
percpu_down_read(&c->mark_lock);
- buf.atomic++;
g = PTR_GC_BUCKET(ca, ptr);
if (g->dirty_sectors ||
return 0;
}
-int bch2_mark_extent(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
+static int __mark_extent(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
{
u64 journal_seq = trans->journal_res.seq;
struct bch_fs *c = trans->c;
- struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
return 0;
}
+int bch2_mark_extent(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
+{
+ return mem_trigger_run_overwrite_then_insert(__mark_extent, trans, btree_id, level, old, new, flags);
+}
+
int bch2_mark_stripe(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_s_c new,
return 0;
}
-int bch2_mark_reservation(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
+static int __mark_reservation(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
{
struct bch_fs *c = trans->c;
- struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
struct bch_fs_usage *fs_usage;
unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
s64 sectors = (s64) k.k->size;
return 0;
}
+int bch2_mark_reservation(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
+{
+ return mem_trigger_run_overwrite_then_insert(__mark_reservation, trans, btree_id, level, old, new, flags);
+}
+
static s64 __bch2_mark_reflink_p(struct btree_trans *trans,
struct bkey_s_c_reflink_p p,
u64 start, u64 end,
*idx = r->offset;
return 0;
not_found:
- if (fsck_err(c, "pointer to missing indirect extent\n"
+ if (fsck_err(c, reflink_p_to_missing_reflink_v,
+ "pointer to missing indirect extent\n"
" %s\n"
" missing range %llu-%llu",
(bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf),
new->k.p = bkey_start_pos(p.k);
new->k.p.offset += *idx - start;
bch2_key_resize(&new->k, next_idx - *idx);
- ret = __bch2_btree_insert(trans, BTREE_ID_extents, &new->k_i,
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i,
BTREE_TRIGGER_NORUN);
}
return ret;
}
-int bch2_mark_reflink_p(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
+static int __mark_reflink_p(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
{
struct bch_fs *c = trans->c;
- struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
struct reflink_gc *ref;
size_t l, r, m;
return ret;
}
+int bch2_mark_reflink_p(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
+{
+ return mem_trigger_run_overwrite_then_insert(__mark_reflink_p, trans, btree_id, level, old, new, flags);
+}
+
void bch2_trans_fs_usage_revert(struct btree_trans *trans,
struct replicas_delta_list *deltas)
{
struct bch_fs *c = trans->c;
static int warned_disk_usage = 0;
bool warn = false;
- unsigned disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
- struct replicas_delta *d = deltas->d, *d2;
+ u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
+ struct replicas_delta *d, *d2;
struct replicas_delta *top = (void *) deltas->d + deltas->used;
struct bch_fs_usage *dst;
s64 added = 0, should_not_have_added;
if (unlikely(warn) && !xchg(&warned_disk_usage, 1))
bch2_trans_inconsistent(trans,
- "disk usage increased %lli more than %u sectors reserved)",
+		"disk usage increased %lli more than %llu sectors reserved",
should_not_have_added, disk_res_sectors);
return 0;
need_mark:
return ret;
}
-int bch2_trans_mark_extent(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_i *new,
- unsigned flags)
+static int __trans_mark_extent(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
{
struct bch_fs *c = trans->c;
- struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
- ? old
- : bkey_i_to_s_c(new);
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
return ret;
}
+int bch2_trans_mark_extent(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_i *new,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ int mod = (int) bch2_bkey_needs_rebalance(c, bkey_i_to_s_c(new)) -
+ (int) bch2_bkey_needs_rebalance(c, old);
+
+ if (mod) {
+ int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new->k.p, mod > 0);
+ if (ret)
+ return ret;
+ }
+
+ return trigger_run_overwrite_then_insert(__trans_mark_extent, trans, btree_id, level, old, new, flags);
+}
+
static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
struct bkey_s_c_stripe s,
unsigned idx, bool deleting)
return ret;
}
-int bch2_trans_mark_reservation(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old,
- struct bkey_i *new,
- unsigned flags)
+static int __trans_mark_reservation(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
{
- struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
- ? old
- : bkey_i_to_s_c(new);
unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
s64 sectors = (s64) k.k->size;
struct replicas_delta_list *d;
return 0;
}
-static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
+int bch2_trans_mark_reservation(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old,
+ struct bkey_i *new,
+ unsigned flags)
+{
+ return trigger_run_overwrite_then_insert(__trans_mark_reservation, trans, btree_id, level, old, new, flags);
+}
+
+static int trans_mark_reflink_p_segment(struct btree_trans *trans,
struct bkey_s_c_reflink_p p,
u64 *idx, unsigned flags)
{
return ret;
}
-int bch2_trans_mark_reflink_p(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old,
- struct bkey_i *new,
- unsigned flags)
+static int __trans_mark_reflink_p(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
{
- struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
- ? old
- : bkey_i_to_s_c(new);
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
u64 idx, end_idx;
int ret = 0;
- if (flags & BTREE_TRIGGER_INSERT) {
- struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
-
- v->front_pad = v->back_pad = 0;
- }
-
idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad);
end_idx = le64_to_cpu(p.v->idx) + p.k->size +
le32_to_cpu(p.v->back_pad);
while (idx < end_idx && !ret)
- ret = __bch2_trans_mark_reflink_p(trans, p, &idx, flags);
-
+ ret = trans_mark_reflink_p_segment(trans, p, &idx, flags);
return ret;
}
+int bch2_trans_mark_reflink_p(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old,
+ struct bkey_i *new,
+ unsigned flags)
+{
+ if (flags & BTREE_TRIGGER_INSERT) {
+ struct bch_reflink_p *v = &bkey_i_to_reflink_p(new)->v;
+
+ v->front_pad = v->back_pad = 0;
+ }
+
+ return trigger_run_overwrite_then_insert(__trans_mark_reflink_p, trans, btree_id, level, old, new, flags);
+}
+
static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
struct bch_dev *ca, size_t b,
enum bch_data_type type,
if (a->v.data_type && type && a->v.data_type != type) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ BCH_FSCK_ERR_bucket_metadata_type_mismatch,
"bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
iter.pos.inode, iter.pos.offset, a->v.gen,
bch2_data_types[a->v.data_type],
bch2_data_types[type],
bch2_data_types[type]);
ret = -EIO;
- goto out;
+ goto err;
}
- a->v.data_type = type;
- a->v.dirty_sectors = sectors;
-
- ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
- if (ret)
- goto out;
-out:
+ if (a->v.data_type != type ||
+ a->v.dirty_sectors != sectors) {
+ a->v.data_type = type;
+ a->v.dirty_sectors = sectors;
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+ }
+err:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca)
{
- int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(&trans, ca));
+ int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(trans, ca));
+
if (ret)
bch_err_fn(c, ret);
return ret;
}
+int bch2_trans_mark_dev_sbs(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ for_each_online_member(ca, c, i) {
+ int ret = bch2_trans_mark_dev_sb(c, ca);
+ if (ret) {
+ percpu_ref_put(&ca->ref);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
/* Disk reservations: */
#define SECTORS_CACHE 1024
#include "buckets_types.h"
#include "extents.h"
-#include "super.h"
+#include "sb-members.h"
+
+static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s)
+{
+ return div_u64(s, ca->mi.bucket_size);
+}
+
+static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b)
+{
+ return ((sector_t) b) * ca->mi.bucket_size;
+}
+
+static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
+{
+ u32 remainder;
+
+ div_u64_rem(s, ca->mi.bucket_size, &remainder);
+ return remainder;
+}
+
+static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s,
+ u32 *offset)
+{
+ return div_u64_rem(s, ca->mi.bucket_size, offset);
+}
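
The helpers above are plain divide/modulo on the per-device bucket size. A quick standalone sanity check of the split-and-recombine arithmetic (bucket size and sector number invented for illustration):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	const uint64_t bucket_size = 128;	/* sectors per bucket (made up) */
	uint64_t s = 300;
	uint64_t bucket = s / bucket_size;	/* sector_to_bucket() */
	uint64_t offset = s % bucket_size;	/* bucket_remainder() */

	assert(bucket == 2 && offset == 44);
	assert(bucket * bucket_size + offset == s);	/* bucket_to_sector() inverts the split */
	return 0;
}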
#define for_each_bucket(_b, _buckets) \
for (_b = (_buckets)->b + (_buckets)->first_bucket; \
_b < (_buckets)->b + (_buckets)->nbuckets; _b++)
+/*
+ * Ugly hack alert:
+ *
+ * We need to cram a spinlock in a single byte, because that's what we have left
+ * in struct bucket, and we care about the size of these - during fsck, we need
+ * in-memory state for every single bucket on every device.
+ *
+ * We used to do
+ * while (xchg(&b->lock, 1)) cpu_relax();
+ * but, it turns out not all architectures support xchg on a single byte.
+ *
+ * So now we use bit_spin_lock(), with fun games since we can't burn a whole
+ * ulong for this - we just need to make sure the lock bit always ends up in the
+ * first byte.
+ */
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define BUCKET_LOCK_BITNR 0
+#else
+#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1)
+#endif
+
+union ulong_byte_assert {
+ ulong ulong;
+ u8 byte;
+};
+
static inline void bucket_unlock(struct bucket *b)
{
- smp_store_release(&b->lock, 0);
+ BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte);
+
+ clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &b->lock);
+ wake_up_bit((void *) &b->lock, BUCKET_LOCK_BITNR);
}
static inline void bucket_lock(struct bucket *b)
{
- while (xchg(&b->lock, 1))
- cpu_relax();
+ wait_on_bit_lock((void *) &b->lock, BUCKET_LOCK_BITNR,
+ TASK_UNINTERRUPTIBLE);
}
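
A standalone illustration of the endianness trick above, assuming a GCC/Clang hosted compiler (for __BYTE_ORDER__): the chosen bit number must place the lock bit in byte 0, because that one byte is all struct bucket has to spare. This mirrors the BUILD_BUG_ON union assertion:

#include <assert.h>
#include <limits.h>
#include <stdio.h>

int main(void)
{
	union { unsigned long ul; unsigned char byte[sizeof(unsigned long)]; } u;

#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	unsigned bitnr = 0;
#else
	unsigned bitnr = sizeof(unsigned long) * CHAR_BIT - 1;
#endif
	u.ul = 1UL << bitnr;
	assert(u.byte[0]);	/* the lock bit lives in the only byte we own */
	printf("lock bit %u lands in the first byte\n", bitnr);
	return 0;
}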
static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca)
switch (watermark) {
case BCH_WATERMARK_NR:
- unreachable();
+ BUG();
case BCH_WATERMARK_stripe:
reserved += ca->mi.nbuckets >> 6;
fallthrough;
int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
+#define mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\
+({ \
+ int ret = 0; \
+ \
+ if (_old.k->type) \
+ ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_INSERT); \
+ if (!ret && _new.k->type) \
+ ret = _fn(_trans, _btree_id, _level, _new, _flags & ~BTREE_TRIGGER_OVERWRITE); \
+ ret; \
+})
+
+#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags) \
+ mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, bkey_i_to_s_c(_new), _flags)
+
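A minimal standalone sketch (not bcachefs code; flag values and names are placeholders) of what the macro expands to: the single-key helper runs once for the old key with the insert flag masked off, then once for the new key with the overwrite flag masked off, stopping on the first error. The real macro additionally skips absent keys (type 0):

#include <stdio.h>

#define TRIGGER_INSERT    (1U << 0)	/* placeholder flag values */
#define TRIGGER_OVERWRITE (1U << 1)

static int mark_one(const char *which, unsigned flags)
{
	printf("%s: insert=%u overwrite=%u\n", which,
	       !!(flags & TRIGGER_INSERT), !!(flags & TRIGGER_OVERWRITE));
	return 0;
}

int main(void)
{
	unsigned flags = TRIGGER_INSERT|TRIGGER_OVERWRITE;
	int ret;

	/* old key first: the call sees only the overwrite */
	ret = mark_one("old", flags & ~TRIGGER_INSERT);
	/* then the new key: the call sees only the insert */
	if (!ret)
		ret = mark_one("new", flags & ~TRIGGER_OVERWRITE);
	return ret;
}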
void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *);
int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *,
size_t, enum bch_data_type, unsigned);
int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *);
+int bch2_trans_mark_dev_sbs(struct bch_fs *);
+
+static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b)
+{
+ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
+ u64 b_offset = bucket_to_sector(ca, b);
+ u64 b_end = bucket_to_sector(ca, b + 1);
+ unsigned i;
+
+ if (!b)
+ return true;
+
+ for (i = 0; i < layout->nr_superblocks; i++) {
+ u64 offset = le64_to_cpu(layout->sb_offset[i]);
+ u64 end = offset + (1 << layout->sb_max_size_bits);
+
+ if (!(offset >= b_end || end <= b_offset))
+ return true;
+ }
+
+ return false;
+}
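
The superblock test above is the standard half-open interval overlap check; a standalone sketch with made-up sector ranges:

#include <assert.h>
#include <stdbool.h>

/* [a0, a1) and [b0, b1) overlap iff neither lies entirely past the other */
static bool ranges_overlap(unsigned long long a0, unsigned long long a1,
			   unsigned long long b0, unsigned long long b1)
{
	return !(b0 >= a1 || b1 <= a0);
}

int main(void)
{
	assert( ranges_overlap(0, 8, 4, 12));	/* superblock spans into bucket */
	assert(!ranges_overlap(0, 8, 8, 16));	/* touching half-open ends: no overlap */
	return 0;
}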
/* disk reservations: */
b->t = n;
kvfree(t);
- pr_debug("took %zu rehashes, table at %zu/%zu elements",
+ pr_debug("took %zu rehashes, table at %zu/%lu elements",
nr_rehashes, nr_elements, 1UL << b->t->bits);
out:
mutex_unlock(&b->lock);
devs[i] = strndup_user((const char __user *)(unsigned long)
user_devs[i],
PATH_MAX);
- if (!devs[i]) {
- ret = -ENOMEM;
+ ret = PTR_ERR_OR_ZERO(devs[i]);
+ if (ret)
goto err;
- }
}
c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty());
return -EINVAL;
path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
- if (!path)
- return -ENOMEM;
+ ret = PTR_ERR_OR_ZERO(path);
+ if (ret)
+ return ret;
err = bch2_fs_open_incremental(path);
kfree(path);
static long bch2_ioctl_query_uuid(struct bch_fs *c,
struct bch_ioctl_query_uuid __user *user_arg)
{
- return copy_to_user(&user_arg->uuid,
- &c->sb.user_uuid,
- sizeof(c->sb.user_uuid));
+ if (copy_to_user(&user_arg->uuid, &c->sb.user_uuid,
+ sizeof(c->sb.user_uuid)))
+ return -EFAULT;
+ return 0;
}
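
The same fix recurs throughout this file: copy_to_user() returns the number of bytes it could not copy, not an errno, so returning its result directly leaks a positive count to userspace. A standalone sketch of the corrected convention, with a stubbed copier (names and the fail_at knob are invented for illustration):

#include <assert.h>
#include <string.h>

#define EFAULT 14

/* stand-in for copy_to_user(): returns bytes left uncopied */
static unsigned long fake_copy_to_user(void *dst, const void *src,
				       unsigned long n, unsigned long fail_at)
{
	unsigned long done = n < fail_at ? n : fail_at;

	memcpy(dst, src, done);
	return n - done;
}

static int query_uuid(void *dst, const void *src, unsigned long n,
		      unsigned long fail_at)
{
	if (fake_copy_to_user(dst, src, n, fail_at))
		return -EFAULT;	/* map a partial copy to a proper errno */
	return 0;
}

int main(void)
{
	char src[16] = "uuid", dst[16];

	assert(query_uuid(dst, src, sizeof(src), sizeof(src)) == 0);
	assert(query_uuid(dst, src, sizeof(src), 4) == -EFAULT);
	return 0;
}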
#if 0
return -EINVAL;
path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
- if (!path)
- return -ENOMEM;
+ ret = PTR_ERR_OR_ZERO(path);
+ if (ret)
+ return ret;
ret = bch2_dev_add(c, path);
kfree(path);
return -EINVAL;
path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
- if (!path)
- return -ENOMEM;
+ ret = PTR_ERR_OR_ZERO(path);
+ if (ret)
+ return ret;
ret = bch2_dev_online(c, path);
kfree(path);
struct bch_ioctl_data_event e = {
.type = BCH_DATA_EVENT_PROGRESS,
.p.data_type = ctx->stats.data_type,
- .p.btree_id = ctx->stats.btree_id,
- .p.pos = ctx->stats.pos,
+ .p.btree_id = ctx->stats.pos.btree,
+ .p.pos = ctx->stats.pos.pos,
.p.sectors_done = atomic64_read(&ctx->stats.sectors_seen),
.p.sectors_total = bch2_fs_usage_read_short(c).used,
};
if (len < sizeof(e))
return -EINVAL;
- return copy_to_user(buf, &e, sizeof(e)) ?: sizeof(e);
+ if (copy_to_user(buf, &e, sizeof(e)))
+ return -EFAULT;
+
+ return sizeof(e);
}
static const struct file_operations bcachefs_data_ops = {
if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes))
return -EFAULT;
- arg = kzalloc(sizeof(*arg) + replica_entries_bytes, GFP_KERNEL);
+ arg = kzalloc(size_add(sizeof(*arg), replica_entries_bytes), GFP_KERNEL);
if (!arg)
return -ENOMEM;
percpu_up_read(&c->mark_lock);
kfree(src);
- if (!ret)
- ret = copy_to_user(user_arg, arg,
- sizeof(*arg) + arg->replica_entries_bytes);
+ if (ret)
+ goto err;
+ if (copy_to_user(user_arg, arg,
+ sizeof(*arg) + arg->replica_entries_bytes))
+ ret = -EFAULT;
err:
kfree(arg);
return ret;
percpu_ref_put(&ca->ref);
- return copy_to_user(user_arg, &arg, sizeof(arg));
+ if (copy_to_user(user_arg, &arg, sizeof(arg)))
+ return -EFAULT;
+
+ return 0;
}
static long bch2_ioctl_read_super(struct bch_fs *c,
goto err;
}
- ret = copy_to_user((void __user *)(unsigned long)arg.sb,
- sb, vstruct_bytes(sb));
+ if (copy_to_user((void __user *)(unsigned long)arg.sb, sb,
+ vstruct_bytes(sb)))
+ ret = -EFAULT;
err:
if (!IS_ERR_OR_NULL(ca))
percpu_ref_put(&ca->ref);
arg.pad)
return -EINVAL;
+ if (arg.nbuckets > U32_MAX)
+ return -EINVAL;
+
ca = bch2_device_lookup(c, arg.dev, arg.flags);
if (IS_ERR(ca))
return PTR_ERR(ca);
static inline long bch2_fs_ioctl(struct bch_fs *c,
unsigned cmd, void __user * arg)
{
- return -ENOSYS;
+ return -ENOTTY;
}
static inline void bch2_fs_chardev_exit(struct bch_fs *c) {}
for (i = 0; i < pages; i++) {
unsigned offset = offset_in_page(buf);
- unsigned pg_len = min(len, PAGE_SIZE - offset);
+ unsigned pg_len = min_t(size_t, len, PAGE_SIZE - offset);
sg_set_page(sg + i, vmalloc_to_page(buf), pg_len, offset);
buf += pg_len;
crypto_alloc_sync_skcipher("chacha20", 0, 0);
int ret;
- if (!chacha20) {
- pr_err("error requesting chacha20 module: %li", PTR_ERR(chacha20));
- return PTR_ERR(chacha20);
+ ret = PTR_ERR_OR_ZERO(chacha20);
+ if (ret) {
+ pr_err("error requesting chacha20 cipher: %s", bch2_err_str(ret));
+ return ret;
}
ret = crypto_skcipher_setkey(&chacha20->base,
(void *) key, sizeof(*key));
if (ret) {
- pr_err("crypto_skcipher_setkey() error: %i", ret);
+ pr_err("error from crypto_skcipher_setkey(): %s", bch2_err_str(ret));
goto err;
}
#ifdef CONFIG_HIGHMEM
__bio_for_each_segment(bv, bio, *iter, *iter) {
- void *p = kmap_atomic(bv.bv_page) + bv.bv_offset;
+ void *p = kmap_local_page(bv.bv_page) + bv.bv_offset;
+
bch2_checksum_update(&state, p, bv.bv_len);
- kunmap_atomic(p);
+ kunmap_local(p);
}
#else
__bio_for_each_bvec(bv, bio, *iter, *iter)
#ifdef CONFIG_HIGHMEM
__bio_for_each_segment(bv, bio, *iter, *iter) {
- void *p = kmap_atomic(bv.bv_page) + bv.bv_offset;
+ void *p = kmap_local_page(bv.bv_page) + bv.bv_offset;
crypto_shash_update(desc, p, bv.bv_len);
- kunmap_atomic(p);
+ kunmap_local(p);
}
#else
__bio_for_each_bvec(bv, bio, *iter, *iter)
state.type = type;
bch2_checksum_init(&state);
- state.seed = a.lo;
+ state.seed = le64_to_cpu(a.lo);
BUG_ON(!bch2_checksum_mergeable(type));
while (b_len) {
- unsigned b = min_t(unsigned, b_len, PAGE_SIZE);
+ unsigned page_len = min_t(unsigned, b_len, PAGE_SIZE);
bch2_checksum_update(&state,
- page_address(ZERO_PAGE(0)), b);
- b_len -= b;
+ page_address(ZERO_PAGE(0)), page_len);
+ b_len -= page_len;
}
- a.lo = bch2_checksum_final(&state);
+ a.lo = cpu_to_le64(bch2_checksum_final(&state));
a.lo ^= b.lo;
a.hi ^= b.hi;
return a;
unsigned csum_type;
struct bch_csum csum;
} splits[3] = {
- { crc_a, len_a, new_csum_type },
- { crc_b, len_b, new_csum_type },
- { NULL, bio_sectors(bio) - len_a - len_b, new_csum_type },
+ { crc_a, len_a, new_csum_type, { 0 } },
+ { crc_b, len_b, new_csum_type, { 0 } },
+ { NULL, bio_sectors(bio) - len_a - len_b, new_csum_type, { 0 } },
}, *i;
bool mergeable = crc_old.csum_type == new_csum_type &&
bch2_checksum_mergeable(new_csum_type);
merged = bch2_checksum_bio(c, crc_old.csum_type,
extent_nonce(version, crc_old), bio);
- if (bch2_crc_cmp(merged, crc_old.csum)) {
- bch_err(c, "checksum error in bch2_rechecksum_bio() (memory corruption or bug?)\n"
+ if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) {
+ bch_err(c, "checksum error in %s() (memory corruption or bug?)\n"
"expected %0llx:%0llx got %0llx:%0llx (old type %s new type %s)",
+ __func__,
crc_old.csum.hi,
crc_old.csum.lo,
merged.hi,
return 0;
}
+/* BCH_SB_FIELD_crypt: */
+
+static int bch2_sb_crypt_validate(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
+
+ if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) {
+ prt_printf(err, "wrong size (got %zu should be %zu)",
+ vstruct_bytes(&crypt->field), sizeof(*crypt));
+ return -BCH_ERR_invalid_sb_crypt;
+ }
+
+ if (BCH_CRYPT_KDF_TYPE(crypt)) {
+ prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt));
+ return -BCH_ERR_invalid_sb_crypt;
+ }
+
+ return 0;
+}
+
+static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
+
+ prt_printf(out, "KFD: %llu", BCH_CRYPT_KDF_TYPE(crypt));
+ prt_newline(out);
+ prt_printf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt));
+ prt_newline(out);
+ prt_printf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt));
+ prt_newline(out);
+ prt_printf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt));
+ prt_newline(out);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
+ .validate = bch2_sb_crypt_validate,
+ .to_text = bch2_sb_crypt_to_text,
+};
+
#ifdef __KERNEL__
static int __bch2_request_key(char *key_description, struct bch_key *key)
{
{
key_serial_t key_id;
+ key_id = request_key("user", key_description, NULL,
+ KEY_SPEC_SESSION_KEYRING);
+ if (key_id >= 0)
+ goto got_key;
+
key_id = request_key("user", key_description, NULL,
KEY_SPEC_USER_KEYRING);
- if (key_id < 0)
- return -errno;
+ if (key_id >= 0)
+ goto got_key;
+
+ key_id = request_key("user", key_description, NULL,
+ KEY_SPEC_USER_SESSION_KEYRING);
+ if (key_id >= 0)
+ goto got_key;
+
+ return -errno;
+got_key:
if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key))
return -1;
return 0;
}
+
+#include "../crypto.h"
#endif
int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
ret = __bch2_request_key(key_description.buf, key);
printbuf_exit(&key_description);
+
+#ifndef __KERNEL__
+ if (ret) {
+ char *passphrase = read_passphrase("Enter passphrase: ");
+ struct bch_encrypted_key sb_key;
+
+ bch2_passphrase_check(sb, passphrase,
+ key, &sb_key);
+ ret = 0;
+ }
+#endif
+
+ /* stash with memfd, pass memfd fd to mount */
+
return ret;
}
+#ifndef __KERNEL__
+int bch2_revoke_key(struct bch_sb *sb)
+{
+ key_serial_t key_id;
+ struct printbuf key_description = PRINTBUF;
+
+ prt_printf(&key_description, "bcachefs:");
+ pr_uuid(&key_description, sb->user_uuid.b);
+
+ key_id = request_key("user", key_description.buf, NULL, KEY_SPEC_USER_KEYRING);
+ printbuf_exit(&key_description);
+ if (key_id < 0)
+ return errno;
+
+ keyctl_revoke(key_id);
+
+ return 0;
+}
+#endif
+
int bch2_decrypt_sb_key(struct bch_fs *c,
struct bch_sb_field_crypt *crypt,
struct bch_key *key)
/* decrypt real key: */
ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c),
- &sb_key, sizeof(sb_key));
+ &sb_key, sizeof(sb_key));
if (ret)
goto err;
mutex_lock(&c->sb_lock);
- crypt = bch2_sb_get_crypt(c->disk_sb.sb);
+ crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
if (!crypt)
goto out;
if (ret)
goto out;
- crypt->key.magic = BCH_KEY_MAGIC;
+ crypt->key.magic = cpu_to_le64(BCH_KEY_MAGIC);
crypt->key.key = key;
SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0);
mutex_lock(&c->sb_lock);
/* Do we already have an encryption key? */
- if (bch2_sb_get_crypt(c->disk_sb.sb))
+ if (bch2_sb_field_get(c->disk_sb.sb, crypt))
goto err;
ret = bch2_alloc_ciphers(c);
if (ret)
goto err;
- key.magic = BCH_KEY_MAGIC;
+ key.magic = cpu_to_le64(BCH_KEY_MAGIC);
get_random_bytes(&key.key, sizeof(key.key));
if (keyed) {
if (ret)
goto err;
- crypt = bch2_sb_resize_crypt(&c->disk_sb, sizeof(*crypt) / sizeof(u64));
+ crypt = bch2_sb_field_resize(&c->disk_sb, crypt,
+ sizeof(*crypt) / sizeof(u64));
if (!crypt) {
ret = -BCH_ERR_ENOSPC_sb_crypt;
goto err;
goto out;
}
- crypt = bch2_sb_get_crypt(c->disk_sb.sb);
+ crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
if (!crypt)
goto out;
*/
#define csum_vstruct(_c, _type, _nonce, _i) \
({ \
- const void *start = ((const void *) (_i)) + sizeof((_i)->csum); \
- const void *end = vstruct_end(_i); \
+ const void *_start = ((const void *) (_i)) + sizeof((_i)->csum);\
\
- bch2_checksum(_c, _type, _nonce, start, end - start); \
+ bch2_checksum(_c, _type, _nonce, _start, vstruct_end(_i) - _start);\
})
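
Renaming the macro locals to _start matters because statement-expression macros can capture caller variables of the same name. A toy reproduction, assuming a GNU C compiler (statement expressions and these macro names are illustrative, not bcachefs code):

#include <assert.h>

/* a local named like a caller's variable captures references to it in the
 * other macro arguments */
#define BAD_SUM(_a, _b)  ({ int tmp  = (_a); tmp  + (_b); })
#define GOOD_SUM(_a, _b) ({ int _tmp = (_a); _tmp + (_b); })

int main(void)
{
	int tmp = 10;

	assert(BAD_SUM(1, tmp)  == 2);	/* wanted 1 + 10; `tmp` was captured */
	assert(GOOD_SUM(1, tmp) == 11);	/* underscored local avoids the trap */
	return 0;
}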
int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
int bch2_request_key(struct bch_sb *, struct bch_key *);
+#ifndef __KERNEL__
+int bch2_revoke_key(struct bch_sb *);
+#endif
int bch2_encrypt(struct bch_fs *, unsigned, struct nonce,
void *data, size_t);
: 0;
}
+extern const struct bch_sb_field_ops bch_sb_field_ops_crypt;
+
int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *,
struct bch_key *);
#include "checksum.h"
#include "compress.h"
#include "extents.h"
-#include "io.h"
#include "super-io.h"
#include <linux/lz4.h>
static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
{
size_t decompress_workspace_size = 0;
- bool decompress_workspace_needed;
ZSTD_parameters params = zstd_get_params(zstd_max_clevel(),
c->opts.encoded_extent_max);
struct {
size_t decompress_workspace;
} compression_types[] = {
{ BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4,
- max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS) },
+ max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS),
+ 0 },
{ BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip,
zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
zlib_inflate_workspacesize(), },
if (!(features & (1 << i->feature)))
continue;
- if (i->decompress_workspace)
- decompress_workspace_needed = true;
-
if (mempool_initialized(&c->compress_workspace[i->type]))
continue;
static u64 compression_opt_to_feature(unsigned v)
{
unsigned type = bch2_compression_decode(v).type;
- return 1ULL << bch2_compression_opt_to_feature[type];
+
+ return BIT_ULL(bch2_compression_opt_to_feature[type]);
}
int bch2_fs_compress_init(struct bch_fs *c)
return ret;
}
+void bch2_compression_opt_to_text(struct printbuf *out, u64 v)
+{
+ struct bch_compression_opt opt = bch2_compression_decode(v);
+
+ if (opt.type < BCH_COMPRESSION_OPT_NR)
+ prt_str(out, bch2_compression_opts[opt.type]);
+ else
+ prt_printf(out, "(unknown compression opt %u)", opt.type);
+ if (opt.level)
+ prt_printf(out, ":%u", opt.level);
+}
+
void bch2_opt_compression_to_text(struct printbuf *out,
struct bch_fs *c,
struct bch_sb *sb,
u64 v)
{
- struct bch_compression_opt opt = bch2_compression_decode(v);
+ return bch2_compression_opt_to_text(out, v);
+}
- prt_str(out, bch2_compression_opts[opt.type]);
- if (opt.level)
- prt_printf(out, ":%u", opt.level);
+int bch2_opt_compression_validate(u64 v, struct printbuf *err)
+{
+ if (!bch2_compression_opt_valid(v)) {
+ prt_printf(err, "invalid compression opt %llu", v);
+ return -BCH_ERR_invalid_sb_opt_compression;
+ }
+
+ return 0;
}
#include "extents_types.h"
+static const unsigned __bch2_compression_opt_to_type[] = {
+#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t,
+ BCH_COMPRESSION_OPTS()
+#undef x
+};
+
struct bch_compression_opt {
u8 type:4,
level:4;
};
-static inline struct bch_compression_opt bch2_compression_decode(unsigned v)
+static inline struct bch_compression_opt __bch2_compression_decode(unsigned v)
{
return (struct bch_compression_opt) {
.type = v & 15,
};
}
+static inline bool bch2_compression_opt_valid(unsigned v)
+{
+ struct bch_compression_opt opt = __bch2_compression_decode(v);
+
+ return opt.type < ARRAY_SIZE(__bch2_compression_opt_to_type) && !(!opt.type && opt.level);
+}
+
+static inline struct bch_compression_opt bch2_compression_decode(unsigned v)
+{
+ return bch2_compression_opt_valid(v)
+ ? __bch2_compression_decode(v)
+ : (struct bch_compression_opt) { 0 };
+}
+
static inline unsigned bch2_compression_encode(struct bch_compression_opt opt)
{
return opt.type|(opt.level << 4);
}
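
A standalone round-trip check of the 4-bit type / 4-bit level packing above, with placeholder values (assumption: type 3 / level 7 standing in for something like zstd:7):

#include <assert.h>

struct copt { unsigned type:4, level:4; };

static unsigned encode(struct copt o)	{ return o.type | (o.level << 4); }
static struct copt decode(unsigned v)
{
	return (struct copt) { .type = v & 15, .level = v >> 4 };
}

int main(void)
{
	struct copt o = { .type = 3, .level = 7 };
	unsigned v = encode(o);

	assert(decode(v).type == 3 && decode(v).level == 7);
	return 0;
}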
-static const unsigned __bch2_compression_opt_to_type[] = {
-#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t,
- BCH_COMPRESSION_OPTS()
-#undef x
-};
-
static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v)
{
return __bch2_compression_opt_to_type[bch2_compression_decode(v).type];
void bch2_fs_compress_exit(struct bch_fs *);
int bch2_fs_compress_init(struct bch_fs *);
+void bch2_compression_opt_to_text(struct printbuf *, u64);
+
int bch2_opt_compression_parse(struct bch_fs *, const char *, u64 *, struct printbuf *);
void bch2_opt_compression_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
+int bch2_opt_compression_validate(u64, struct printbuf *);
#define bch2_opt_compression (struct bch_opt_fn) { \
- .parse = bch2_opt_compression_parse, \
- .to_text = bch2_opt_compression_to_text, \
+ .parse = bch2_opt_compression_parse, \
+ .to_text = bch2_opt_compression_to_text, \
+ .validate = bch2_opt_compression_validate, \
}
#endif /* _BCACHEFS_COMPRESS_H */
prt_tab(out);
prt_printf(out, "%llu", le64_to_cpu(ctrs->d[i]));
prt_newline(out);
- };
+ }
};
int bch2_sb_counters_to_cpu(struct bch_fs *c)
{
- struct bch_sb_field_counters *ctrs = bch2_sb_get_counters(c->disk_sb.sb);
+ struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters);
unsigned int i;
unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
u64 val = 0;
int bch2_sb_counters_from_cpu(struct bch_fs *c)
{
- struct bch_sb_field_counters *ctrs = bch2_sb_get_counters(c->disk_sb.sb);
+ struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters);
struct bch_sb_field_counters *ret;
unsigned int i;
unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
if (nr < BCH_COUNTER_NR) {
- ret = bch2_sb_resize_counters(&c->disk_sb,
+ ret = bch2_sb_field_resize(&c->disk_sb, counters,
sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR);
if (ret) {
* Inspired by CCAN's darray
*/
-#include "util.h"
#include <linux/slab.h>
#define DARRAY(type) \
typedef DARRAY(void) darray_void;
-static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more, gfp_t gfp)
+int __bch2_darray_resize(darray_void *, size_t, size_t, gfp_t);
+
+static inline int __darray_resize(darray_void *d, size_t element_size,
+ size_t new_size, gfp_t gfp)
{
- if (d->nr + more > d->size) {
- size_t new_size = roundup_pow_of_two(d->nr + more);
- void *data = krealloc_array(d->data, new_size, t_size, gfp);
+ return unlikely(new_size > d->size)
+ ? __bch2_darray_resize(d, element_size, new_size, gfp)
+ : 0;
+}
- if (!data)
- return -ENOMEM;
+#define darray_resize_gfp(_d, _new_size, _gfp) \
+ __darray_resize((darray_void *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp)
- d->data = data;
- d->size = new_size;
- }
+#define darray_resize(_d, _new_size) \
+ darray_resize_gfp(_d, _new_size, GFP_KERNEL)
- return 0;
+static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more, gfp_t gfp)
+{
+ return __darray_resize(d, t_size, d->nr + more, gfp);
}
#define darray_make_room_gfp(_d, _more, _gfp) \
#define darray_make_room(_d, _more) \
darray_make_room_gfp(_d, _more, GFP_KERNEL)
+#define darray_room(_d) ((_d).size - (_d).nr)
+
#define darray_top(_d) ((_d).data[(_d).nr])
#define darray_push_gfp(_d, _item, _gfp) \
_ret; \
})
+#define darray_remove_item(_d, _pos) \
+ array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data)
+
#define darray_for_each(_d, _i) \
for (_i = (_d).data; _i < (_d).data + (_d).nr; _i++)
+#define darray_for_each_reverse(_d, _i) \
+ for (_i = (_d).data + (_d).nr - 1; _i >= (_d).data; --_i)
+
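A minimal userspace sketch of the darray pattern, using simplified stand-ins rather than the macros above (power-of-two growth in make_room, then push and iterate):

#include <assert.h>
#include <stdlib.h>

struct darray_int { int *data; size_t nr, size; };

static int make_room(struct darray_int *d, size_t more)
{
	if (d->nr + more > d->size) {
		size_t new_size = d->size ? d->size * 2 : 8;

		while (new_size < d->nr + more)
			new_size *= 2;
		int *p = realloc(d->data, new_size * sizeof(*p));
		if (!p)
			return -1;
		d->data = p;
		d->size = new_size;
	}
	return 0;
}

int main(void)
{
	struct darray_int d = { 0 };

	for (int i = 0; i < 100; i++) {
		assert(!make_room(&d, 1));
		d.data[d.nr++] = i;			/* darray_push */
	}
	for (int *i = d.data; i < d.data + d.nr; i++)	/* darray_for_each */
		assert(*i == i - d.data);
	free(d.data);
	return 0;
}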
#define darray_init(_d) \
do { \
(_d)->data = NULL; \
#include "ec.h"
#include "error.h"
#include "extents.h"
-#include "io.h"
+#include "io_write.h"
#include "keylist.h"
#include "move.h"
#include "nocow_locking.h"
+#include "rebalance.h"
#include "subvolume.h"
#include "trace.h"
if (insert) {
i = 0;
bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) {
- struct bkey_s new_s;
- new_s.k = (void *) new.k;
- new_s.v = (void *) new.v;
-
if (((1U << i) & m->data_opts.rewrite_ptrs) &&
(ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
!ptr->cached)
if (((1U << i) & m->data_opts.rewrite_ptrs) &&
(ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
!ptr->cached) {
- bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr);
- /*
- * See comment below:
bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr);
- */
rewrites_found |= 1U << i;
}
i++;
if (!p.ptr.cached &&
durability - ptr_durability >= m->op.opts.data_replicas) {
durability -= ptr_durability;
- bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), &entry->ptr);
- /*
- * Currently, we're dropping unneeded replicas
- * instead of marking them as cached, since
- * cached data in stripe buckets prevents them
- * from being reused:
+
bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr);
- */
goto restart_drop_extra_replicas;
}
}
next_pos = insert->k.p;
+ /*
+ * Check for nonce offset inconsistency:
+ * This is debug code - we've been seeing this bug rarely, and
+ * it's been hard to reproduce, so this should give us some more
+ * information when it does occur:
+ */
+ struct printbuf err = PRINTBUF;
+ int invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), 0, &err);
+ printbuf_exit(&err);
+
+ if (invalid) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "about to insert invalid key in data update path");
+ prt_str(&buf, "\nold: ");
+ bch2_bkey_val_to_text(&buf, c, old);
+ prt_str(&buf, "\nk: ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_str(&buf, "\nnew: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+
+ bch2_fatal_error(c);
+ goto out;
+ }
+
ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, bkey_start_pos(&insert->k)) ?:
bch2_insert_snapshot_whiteouts(trans, m->btree_id,
- k.k->p, insert->k.p);
- if (ret)
- goto err;
-
- ret = bch2_trans_update(trans, &iter, insert,
+ k.k->p, insert->k.p) ?:
+ bch2_bkey_set_needs_rebalance(c, insert,
+ op->opts.background_target,
+ op->opts.background_compression) ?:
+ bch2_trans_update(trans, &iter, insert,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, &op->res,
NULL,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL|
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc|
m->data_opts.btree_insert_flags);
if (!ret) {
bch2_btree_iter_set_pos(&iter, next_pos);
}
continue;
nowork:
- if (m->ctxt && m->ctxt->stats) {
+ if (m->stats) {
BUG_ON(k.k->p.offset <= iter.pos.offset);
- atomic64_inc(&m->ctxt->stats->keys_raced);
+ atomic64_inc(&m->stats->keys_raced);
atomic64_add(k.k->p.offset - iter.pos.offset,
- &m->ctxt->stats->sectors_raced);
+ &m->stats->sectors_raced);
}
this_cpu_inc(c->counters[BCH_COUNTER_move_extent_fail]);
int bch2_data_update_index_update(struct bch_write_op *op)
{
- return bch2_trans_run(op->c, __bch2_data_update_index_update(&trans, op));
+ return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op));
}
void bch2_data_update_read_done(struct data_update *m,
break;
}
- if ((atomic_read(&cl.remaining) & CLOSURE_REMAINING_MASK) != 1) {
+ if (closure_nr_remaining(&cl) != 1) {
bch2_trans_unlock(trans);
closure_sync(&cl);
}
bch2_bkey_buf_reassemble(&m->k, c, k);
m->btree_id = btree_id;
m->data_opts = data_opts;
+ m->ctxt = ctxt;
+ m->stats = ctxt ? ctxt->stats : NULL;
bch2_write_op_init(&m->op, c, io_opts);
m->op.pos = bkey_start_pos(k.k);
if (c->opts.nocow_enabled) {
if (ctxt) {
- move_ctxt_wait_event(ctxt, trans,
+ move_ctxt_wait_event(ctxt,
(locked = bch2_bucket_nocow_trylock(&c->nocow_locks,
PTR_BUCKET_POS(c, &p.ptr), 0)) ||
!atomic_read(&ctxt->read_sectors));
#define _BCACHEFS_DATA_UPDATE_H
#include "bkey_buf.h"
-#include "io_types.h"
+#include "io_write_types.h"
struct moving_context;
struct bkey_buf k;
struct data_update_opts data_opts;
struct moving_context *ctxt;
+ struct bch_move_stats *stats;
struct bch_write_op op;
};
#include "extents.h"
#include "fsck.h"
#include "inode.h"
-#include "io.h"
#include "super.h"
#include <linux/console.h>
BUG_ON(b->nsets != 1);
for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_p_next(k))
- if (k->type == KEY_TYPE_btree_ptr_v2) {
- struct bch_btree_ptr_v2 *v = (void *) bkeyp_val(&b->format, k);
- v->mem_ptr = 0;
- }
+ if (k->type == KEY_TYPE_btree_ptr_v2)
+ ((struct bch_btree_ptr_v2 *) bkeyp_val(&b->format, k))->mem_ptr = 0;
v = c->verify_data;
bkey_copy(&v->key, &b->key);
{
if (i->buf.pos) {
size_t bytes = min_t(size_t, i->buf.pos, i->size);
- int err = copy_to_user(i->ubuf, i->buf.buf, bytes);
+ int copied = bytes - copy_to_user(i->ubuf, i->buf.buf, bytes);
- if (err)
- return err;
+ i->ret += copied;
+ i->ubuf += copied;
+ i->size -= copied;
+ i->buf.pos -= copied;
+ memmove(i->buf.buf, i->buf.buf + copied, i->buf.pos);
- i->ret += bytes;
- i->ubuf += bytes;
- i->size -= bytes;
- i->buf.pos -= bytes;
- memmove(i->buf.buf, i->buf.buf + bytes, i->buf.pos);
+ if (copied != bytes)
+ return -EFAULT;
}
return i->size ? 0 : i->ret;
size_t size, loff_t *ppos)
{
struct dump_iter *i = file->private_data;
- struct btree_trans trans;
+ struct btree_trans *trans;
struct btree_iter iter;
struct bkey_s_c k;
ssize_t ret;
if (ret)
return ret;
- bch2_trans_init(&trans, i->c, 0, 0);
- ret = for_each_btree_key2(&trans, iter, i->id, i->from,
+ trans = bch2_trans_get(i->c);
+ ret = for_each_btree_key2(trans, iter, i->id, i->from,
BTREE_ITER_PREFETCH|
BTREE_ITER_ALL_SNAPSHOTS, k, ({
bch2_bkey_val_to_text(&i->buf, i->c, k);
prt_newline(&i->buf);
- drop_locks_do(&trans, flush_buf(i));
+ drop_locks_do(trans, flush_buf(i));
}));
i->from = iter.pos;
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
if (!ret)
ret = flush_buf(i);
size_t size, loff_t *ppos)
{
struct dump_iter *i = file->private_data;
- struct btree_trans trans;
+ struct btree_trans *trans;
struct btree_iter iter;
struct btree *b;
ssize_t ret;
if (bpos_eq(SPOS_MAX, i->from))
return i->ret;
- bch2_trans_init(&trans, i->c, 0, 0);
+ trans = bch2_trans_get(i->c);
retry:
- bch2_trans_begin(&trans);
+ bch2_trans_begin(trans);
- for_each_btree_node(&trans, iter, i->id, i->from, 0, b, ret) {
+ for_each_btree_node(trans, iter, i->id, i->from, 0, b, ret) {
bch2_btree_node_to_text(&i->buf, i->c, b);
i->from = !bpos_eq(SPOS_MAX, b->key.k.p)
? bpos_successor(b->key.k.p)
: b->key.k.p;
- ret = drop_locks_do(&trans, flush_buf(i));
+ ret = drop_locks_do(trans, flush_buf(i));
if (ret)
break;
}
- bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_iter_exit(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
if (!ret)
ret = flush_buf(i);
size_t size, loff_t *ppos)
{
struct dump_iter *i = file->private_data;
- struct btree_trans trans;
+ struct btree_trans *trans;
struct btree_iter iter;
struct bkey_s_c k;
ssize_t ret;
if (ret)
return ret;
- bch2_trans_init(&trans, i->c, 0, 0);
+ trans = bch2_trans_get(i->c);
- ret = for_each_btree_key2(&trans, iter, i->id, i->from,
+ ret = for_each_btree_key2(trans, iter, i->id, i->from,
BTREE_ITER_PREFETCH|
BTREE_ITER_ALL_SNAPSHOTS, k, ({
struct btree_path_level *l = &iter.path->l[0];
}
bch2_bfloat_to_text(&i->buf, l->b, _k);
- drop_locks_do(&trans, flush_buf(i));
+ drop_locks_do(trans, flush_buf(i));
}));
i->from = iter.pos;
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
if (!ret)
ret = flush_buf(i);
prt_printf(out, "%px btree=%s l=%u ",
b,
- bch2_btree_ids[b->c.btree_id],
+ bch2_btree_id_str(b->c.btree_id),
b->c.level);
prt_newline(out);
bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
bd++) {
bd->id = bd - c->btree_debug;
- debugfs_create_file(bch2_btree_ids[bd->id],
+ debugfs_create_file(bch2_btree_id_str(bd->id),
0400, c->btree_debug_dir, bd,
&btree_debug_ops);
snprintf(name, sizeof(name), "%s-formats",
- bch2_btree_ids[bd->id]);
+ bch2_btree_id_str(bd->id));
debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
&btree_format_debug_ops);
snprintf(name, sizeof(name), "%s-bfloat-failed",
- bch2_btree_ids[bd->id]);
+ bch2_btree_id_str(bd->id));
debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
&bfloat_failed_debug_ops);
#include <linux/dcache.h>
-unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
+static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
{
- unsigned len = bkey_val_bytes(d.k) -
- offsetof(struct bch_dirent, d_name);
+ unsigned bkey_u64s = bkey_val_u64s(d.k);
+ unsigned bkey_bytes = bkey_u64s * sizeof(u64);
+ u64 last_u64 = ((u64*)d.v)[bkey_u64s - 1];
+#if CPU_BIG_ENDIAN
+ unsigned trailing_nuls = last_u64 ? __builtin_ctzll(last_u64) / 8 : 64 / 8;
+#else
+ unsigned trailing_nuls = last_u64 ? __builtin_clzll(last_u64) / 8 : 64 / 8;
+#endif
+
+ return bkey_bytes -
+ offsetof(struct bch_dirent, d_name) -
+ trailing_nuls;
+}
- return strnlen(d.v->d_name, len);
+struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d)
+{
+ return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d));
}
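
A standalone check of the trailing-NUL computation for the little-endian branch, assuming a little-endian host: names are padded with NULs up to a u64 boundary, and on LE that padding occupies the most significant bytes of the last u64, so __builtin_clzll counts it. The example name is invented:

#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	/* "abc" in one u64 slot: 3 name bytes, 5 NUL padding bytes on top */
	uint64_t last = 0;
	memcpy(&last, "abc", 3);

	unsigned trailing_nuls = last ? __builtin_clzll(last) / 8 : 8;

	assert(trailing_nuls == 5);
	assert(8 - trailing_nuls == strlen("abc"));
#endif
	return 0;
}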
static u64 bch2_dirent_hash(const struct bch_hash_info *info,
static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
{
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
- struct qstr name = QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d));
+ struct qstr name = bch2_dirent_get_name(d);
return bch2_dirent_hash(info, &name);
}
static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r)
{
struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
- int len = bch2_dirent_name_bytes(l);
- const struct qstr *r = _r;
+ const struct qstr l_name = bch2_dirent_get_name(l);
+ const struct qstr *r_name = _r;
- return len - r->len ?: memcmp(l.v->d_name, r->name, len);
+ return l_name.len - r_name->len ?: memcmp(l_name.name, r_name->name, l_name.len);
}
static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
{
struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r);
- int l_len = bch2_dirent_name_bytes(l);
- int r_len = bch2_dirent_name_bytes(r);
+ const struct qstr l_name = bch2_dirent_get_name(l);
+ const struct qstr r_name = bch2_dirent_get_name(r);
- return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len);
+ return l_name.len - r_name.len ?: memcmp(l_name.name, r_name.name, l_name.len);
}
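
The `len diff ?: memcmp` idiom above uses the GNU `?:` extension (use the left operand if nonzero, else the right). A toy version, assuming GCC/Clang:

#include <assert.h>
#include <string.h>

static int name_cmp(const char *a, int a_len, const char *b, int b_len)
{
	/* shorter name sorts first; equal lengths fall through to memcmp */
	return a_len - b_len ?: memcmp(a, b, a_len);
}

int main(void)
{
	assert(name_cmp("ab", 2, "abc", 3) < 0);
	assert(name_cmp("abd", 3, "abc", 3) > 0);
	assert(name_cmp("abc", 3, "abc", 3) == 0);
	return 0;
}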
static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k)
.is_visible = dirent_is_visible,
};
-int bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
- unsigned len;
-
- len = bch2_dirent_name_bytes(d);
- if (!len) {
- prt_printf(err, "empty name");
- return -BCH_ERR_invalid_bkey;
- }
-
- if (bkey_val_u64s(k.k) > dirent_val_u64s(len)) {
- prt_printf(err, "value too big (%zu > %u)",
- bkey_val_u64s(k.k), dirent_val_u64s(len));
- return -BCH_ERR_invalid_bkey;
- }
-
- if (len > BCH_NAME_MAX) {
- prt_printf(err, "dirent name too big (%u > %u)",
- len, BCH_NAME_MAX);
- return -BCH_ERR_invalid_bkey;
- }
-
- if (len == 1 && !memcmp(d.v->d_name, ".", 1)) {
- prt_printf(err, "invalid name");
- return -BCH_ERR_invalid_bkey;
- }
-
- if (len == 2 && !memcmp(d.v->d_name, "..", 2)) {
- prt_printf(err, "invalid name");
- return -BCH_ERR_invalid_bkey;
- }
+ struct qstr d_name = bch2_dirent_get_name(d);
+ int ret = 0;
- if (memchr(d.v->d_name, '/', len)) {
- prt_printf(err, "invalid name");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(!d_name.len, c, err,
+ dirent_empty_name,
+ "empty name");
- if (d.v->d_type != DT_SUBVOL &&
- le64_to_cpu(d.v->d_inum) == d.k->p.inode) {
- prt_printf(err, "dirent points to own directory");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len), c, err,
+ dirent_val_too_big,
+ "value too big (%zu > %u)",
+ bkey_val_u64s(k.k), dirent_val_u64s(d_name.len));
- return 0;
+ /*
+ * Check new keys don't exceed the max length
+ * (older keys may be larger.)
+ */
+ bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) && d_name.len > BCH_NAME_MAX, c, err,
+ dirent_name_too_long,
+ "dirent name too big (%u > %u)",
+ d_name.len, BCH_NAME_MAX);
+
+ bkey_fsck_err_on(d_name.len != strnlen(d_name.name, d_name.len), c, err,
+ dirent_name_embedded_nul,
+ "dirent has stray data after name's NUL");
+
+ bkey_fsck_err_on((d_name.len == 1 && !memcmp(d_name.name, ".", 1)) ||
+ (d_name.len == 2 && !memcmp(d_name.name, "..", 2)), c, err,
+ dirent_name_dot_or_dotdot,
+ "invalid name");
+
+ bkey_fsck_err_on(memchr(d_name.name, '/', d_name.len), c, err,
+ dirent_name_has_slash,
+ "name with /");
+
+ bkey_fsck_err_on(d.v->d_type != DT_SUBVOL &&
+ le64_to_cpu(d.v->d_inum) == d.k->p.inode, c, err,
+ dirent_to_itself,
+ "dirent points to own directory");
+fsck_err:
+ return ret;
}
void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+ struct qstr d_name = bch2_dirent_get_name(d);
prt_printf(out, "%.*s -> %llu type %s",
- bch2_dirent_name_bytes(d),
- d.v->d_name,
+ d_name.len,
+ d_name.name,
d.v->d_type != DT_SUBVOL
? le64_to_cpu(d.v->d_inum)
: le32_to_cpu(d.v->d_child_subvol),
int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
const struct bch_hash_info *hash_info,
u8 type, const struct qstr *name, u64 dst_inum,
- u64 *dir_offset, int flags)
+ u64 *dir_offset,
+ bch_str_hash_flags_t str_hash_flags)
{
struct bkey_i_dirent *dirent;
int ret;
return ret;
ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
- dir, &dirent->k_i, flags);
+ dir, &dirent->k_i, str_hash_flags);
*dir_offset = dirent->k.p.offset;
return ret;
const struct bch_hash_info *hash_info,
const struct qstr *name, subvol_inum *inum)
{
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
int ret;
-
- bch2_trans_init(&trans, c, 0, 0);
retry:
- bch2_trans_begin(&trans);
+ bch2_trans_begin(trans);
- ret = __bch2_dirent_lookup_trans(&trans, &iter, dir, hash_info,
+ ret = __bch2_dirent_lookup_trans(trans, &iter, dir, hash_info,
name, inum, 0);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
if (!ret)
- bch2_trans_iter_exit(&trans, &iter);
- bch2_trans_exit(&trans);
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
return ret;
}
int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
{
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
struct bkey_s_c_dirent dirent;
subvol_inum target;
u32 snapshot;
struct bkey_buf sk;
+ struct qstr name;
int ret;
bch2_bkey_buf_init(&sk);
- bch2_trans_init(&trans, c, 0, 0);
retry:
- bch2_trans_begin(&trans);
+ bch2_trans_begin(trans);
- ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
goto err;
- for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_dirents,
+ for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents,
SPOS(inum.inum, ctx->pos, snapshot),
POS(inum.inum, U64_MAX), 0, k, ret) {
if (k.k->type != KEY_TYPE_dirent)
dirent = bkey_s_c_to_dirent(k);
- ret = bch2_dirent_read_target(&trans, inum, dirent, &target);
+ ret = bch2_dirent_read_target(trans, inum, dirent, &target);
if (ret < 0)
break;
if (ret)
/* dir_emit() can fault and block: */
bch2_bkey_buf_reassemble(&sk, c, k);
dirent = bkey_i_to_s_c_dirent(sk.k);
- bch2_trans_unlock(&trans);
+ bch2_trans_unlock(trans);
+
+ name = bch2_dirent_get_name(dirent);
ctx->pos = dirent.k->p.offset;
- if (!dir_emit(ctx, dirent.v->d_name,
- bch2_dirent_name_bytes(dirent),
+ if (!dir_emit(ctx, name.name,
+ name.len,
target.inum,
vfs_d_type(dirent.v->d_type)))
break;
* read_target looks up subvolumes, we can overflow paths if the
* directory has many subvolumes in it
*/
- ret = btree_trans_too_many_iters(&trans);
+ ret = btree_trans_too_many_iters(trans);
if (ret)
break;
}
- bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_iter_exit(trans, &iter);
err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
bch2_bkey_buf_exit(&sk, c);
return ret;
enum bkey_invalid_flags;
extern const struct bch_hash_desc bch2_dirent_hash_desc;
-int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_dirent_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
struct bch_hash_info;
struct bch_inode_info;
-unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent);
+struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d);
static inline unsigned dirent_val_u64s(unsigned len)
{
int bch2_dirent_create(struct btree_trans *, subvol_inum,
const struct bch_hash_info *, u8,
- const struct qstr *, u64, u64 *, int);
+ const struct qstr *, u64, u64 *,
+ bch_str_hash_flags_t);
static inline unsigned vfs_d_type(unsigned type)
{
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "disk_groups.h"
+#include "sb-members.h"
#include "super-io.h"
#include <linux/sort.h>
struct bch_sb_field_disk_groups *groups =
field_to_type(f, disk_groups);
struct bch_disk_group *g, *sorted = NULL;
- struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
unsigned nr_groups = disk_groups_nr(groups);
unsigned i, len;
int ret = 0;
for (i = 0; i < sb->nr_devices; i++) {
- struct bch_member *m = mi->members + i;
- unsigned g;
+ struct bch_member m = bch2_sb_member_get(sb, i);
+ unsigned group_id;
- if (!BCH_MEMBER_GROUP(m))
+ if (!BCH_MEMBER_GROUP(&m))
continue;
- g = BCH_MEMBER_GROUP(m) - 1;
+ group_id = BCH_MEMBER_GROUP(&m) - 1;
- if (g >= nr_groups) {
+ if (group_id >= nr_groups) {
prt_printf(err, "disk %u has invalid label %u (have %u)",
- i, g, nr_groups);
+ i, group_id, nr_groups);
return -BCH_ERR_invalid_sb_disk_groups;
}
- if (BCH_GROUP_DELETED(&groups->entries[g])) {
- prt_printf(err, "disk %u has deleted label %u", i, g);
+ if (BCH_GROUP_DELETED(&groups->entries[group_id])) {
+ prt_printf(err, "disk %u has deleted label %u", i, group_id);
return -BCH_ERR_invalid_sb_disk_groups;
}
}
int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
{
- struct bch_sb_field_members *mi;
struct bch_sb_field_disk_groups *groups;
struct bch_disk_groups_cpu *cpu_g, *old_g;
unsigned i, g, nr_groups;
lockdep_assert_held(&c->sb_lock);
- mi = bch2_sb_get_members(c->disk_sb.sb);
- groups = bch2_sb_get_disk_groups(c->disk_sb.sb);
+ groups = bch2_sb_field_get(c->disk_sb.sb, disk_groups);
nr_groups = disk_groups_nr(groups);
if (!groups)
return 0;
- cpu_g = kzalloc(sizeof(*cpu_g) +
- sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL);
+ cpu_g = kzalloc(struct_size(cpu_g, entries, nr_groups), GFP_KERNEL);
if (!cpu_g)
return -BCH_ERR_ENOMEM_disk_groups_to_cpu;
dst->deleted = BCH_GROUP_DELETED(src);
dst->parent = BCH_GROUP_PARENT(src);
+ memcpy(dst->label, src->label, sizeof(dst->label));
}
for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
- struct bch_member *m = mi->members + i;
- struct bch_disk_group_cpu *dst =
- &cpu_g->entries[BCH_MEMBER_GROUP(m)];
+ struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, i);
+ struct bch_disk_group_cpu *dst;
- if (!bch2_member_exists(m))
+ if (!bch2_member_exists(&m))
continue;
- g = BCH_MEMBER_GROUP(m);
+ g = BCH_MEMBER_GROUP(&m);
while (g) {
dst = &cpu_g->entries[g - 1];
__set_bit(i, dst->devs.d);
const char *name, unsigned namelen)
{
struct bch_sb_field_disk_groups *groups =
- bch2_sb_get_disk_groups(sb->sb);
+ bch2_sb_field_get(sb->sb, disk_groups);
unsigned i, nr_groups = disk_groups_nr(groups);
struct bch_disk_group *g;
sizeof(struct bch_disk_group) * (nr_groups + 1)) /
sizeof(u64);
- groups = bch2_sb_resize_disk_groups(sb, u64s);
+ groups = bch2_sb_field_resize(sb, disk_groups, u64s);
if (!groups)
return -BCH_ERR_ENOSPC_disk_label_add;
int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name)
{
struct bch_sb_field_disk_groups *groups =
- bch2_sb_get_disk_groups(sb->sb);
+ bch2_sb_field_get(sb->sb, disk_groups);
int v = -1;
do {
if (*next == '.')
next++;
- groups = bch2_sb_get_disk_groups(sb->sb);
+ groups = bch2_sb_field_get(sb->sb, disk_groups);
v = __bch2_disk_group_find(groups, parent, name, len);
if (v < 0)
return v;
}
-void bch2_disk_path_to_text(struct printbuf *out, struct bch_sb *sb, unsigned v)
+void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
+{
+ struct bch_disk_groups_cpu *groups;
+ struct bch_disk_group_cpu *g;
+ unsigned nr = 0;
+ u16 path[32];
+
+ out->atomic++;
+ rcu_read_lock();
+ groups = rcu_dereference(c->disk_groups);
+ if (!groups)
+ goto invalid;
+
+ while (1) {
+ if (nr == ARRAY_SIZE(path))
+ goto invalid;
+
+ if (v >= groups->nr)
+ goto invalid;
+
+ g = groups->entries + v;
+
+ if (g->deleted)
+ goto invalid;
+
+ path[nr++] = v;
+
+ if (!g->parent)
+ break;
+
+ v = g->parent - 1;
+ }
+
+ while (nr) {
+ v = path[--nr];
+ g = groups->entries + v;
+
+ prt_printf(out, "%.*s", (int) sizeof(g->label), g->label);
+ if (nr)
+ prt_printf(out, ".");
+ }
+out:
+ rcu_read_unlock();
+ out->atomic--;
+ return;
+invalid:
+ prt_printf(out, "invalid label %u", v);
+ goto out;
+}
+
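A standalone sketch of the print-in-reverse technique above: parent links run leaf to root, so indices are pushed onto a bounded stack and emitted backwards to produce a root-to-leaf dotted path. The label table is invented:

#include <stdio.h>

struct node { const char *label; unsigned parent; /* 0 = root, else idx + 1 */ };

static const struct node nodes[] = {
	{ "ssd",  0 },
	{ "fast", 1 },	/* parent: ssd */
	{ "nvme", 2 },	/* parent: ssd.fast */
};

int main(void)
{
	unsigned v = 2, nr = 0;
	unsigned short path[32];

	while (1) {	/* walk leaf -> root, stacking indices */
		path[nr++] = v;
		if (!nodes[v].parent || nr == 32)
			break;
		v = nodes[v].parent - 1;
	}

	while (nr) {	/* pop to print root -> leaf */
		v = path[--nr];
		printf("%s%s", nodes[v].label, nr ? "." : "\n");
	}
	return 0;	/* prints: ssd.fast.nvme */
}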
+void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
{
struct bch_sb_field_disk_groups *groups =
- bch2_sb_get_disk_groups(sb);
+ bch2_sb_field_get(sb, disk_groups);
struct bch_disk_group *g;
unsigned nr = 0;
u16 path[32];
if (ret)
return ret;
- mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
+ mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
SET_BCH_MEMBER_GROUP(mi, v + 1);
return 0;
}
return -EINVAL;
}
-void bch2_opt_target_to_text(struct printbuf *out,
- struct bch_fs *c,
- struct bch_sb *sb,
- u64 v)
+void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
{
struct target t = target_decode(v);
case TARGET_NULL:
prt_printf(out, "none");
break;
- case TARGET_DEV:
- if (c) {
- struct bch_dev *ca;
-
- rcu_read_lock();
- ca = t.dev < c->sb.nr_devices
- ? rcu_dereference(c->devs[t.dev])
- : NULL;
-
- if (ca && percpu_ref_tryget(&ca->io_ref)) {
- prt_printf(out, "/dev/%pg", ca->disk_sb.bdev);
- percpu_ref_put(&ca->io_ref);
- } else if (ca) {
- prt_printf(out, "offline device %u", t.dev);
- } else {
- prt_printf(out, "invalid device %u", t.dev);
- }
-
- rcu_read_unlock();
+ case TARGET_DEV: {
+ struct bch_dev *ca;
+
+ out->atomic++;
+ rcu_read_lock();
+ ca = t.dev < c->sb.nr_devices
+ ? rcu_dereference(c->devs[t.dev])
+ : NULL;
+
+ if (ca && percpu_ref_tryget(&ca->io_ref)) {
+ prt_printf(out, "/dev/%pg", ca->disk_sb.bdev);
+ percpu_ref_put(&ca->io_ref);
+ } else if (ca) {
+ prt_printf(out, "offline device %u", t.dev);
} else {
- struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
- struct bch_member *m = mi->members + t.dev;
-
- if (bch2_dev_exists(sb, mi, t.dev)) {
- prt_printf(out, "Device ");
- pr_uuid(out, m->uuid.b);
- prt_printf(out, " (%u)", t.dev);
- } else {
- prt_printf(out, "Bad device %u", t.dev);
- }
+ prt_printf(out, "invalid device %u", t.dev);
}
+
+ rcu_read_unlock();
+ out->atomic--;
break;
+ }
case TARGET_GROUP:
- if (c) {
- mutex_lock(&c->sb_lock);
- bch2_disk_path_to_text(out, c->disk_sb.sb, t.group);
- mutex_unlock(&c->sb_lock);
+ bch2_disk_path_to_text(out, c, t.group);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
+{
+ struct target t = target_decode(v);
+
+ switch (t.type) {
+ case TARGET_NULL:
+ prt_printf(out, "none");
+ break;
+ case TARGET_DEV: {
+ struct bch_member m = bch2_sb_member_get(sb, t.dev);
+
+ if (bch2_dev_exists(sb, t.dev)) {
+ prt_printf(out, "Device ");
+ pr_uuid(out, m.uuid.b);
+ prt_printf(out, " (%u)", t.dev);
} else {
- bch2_disk_path_to_text(out, sb, t.group);
+ prt_printf(out, "Bad device %u", t.dev);
}
break;
+ }
+ case TARGET_GROUP:
+ bch2_disk_path_to_text_sb(out, sb, t.group);
+ break;
default:
BUG();
}
}
+
+void bch2_opt_target_to_text(struct printbuf *out,
+ struct bch_fs *c,
+ struct bch_sb *sb,
+ u64 v)
+{
+ if (c)
+ bch2_target_to_text(out, c, v);
+ else
+ bch2_target_to_text_sb(out, sb, v);
+}
#ifndef _BCACHEFS_DISK_GROUPS_H
#define _BCACHEFS_DISK_GROUPS_H
+#include "disk_groups_types.h"
+
extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups;
static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups)
/* Exported for userspace bcachefs-tools: */
int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);
-void bch2_disk_path_to_text(struct printbuf *, struct bch_sb *, unsigned);
+void bch2_disk_path_to_text(struct printbuf *, struct bch_fs *, unsigned);
+void bch2_disk_path_to_text_sb(struct printbuf *, struct bch_sb *, unsigned);
+
+void bch2_target_to_text(struct printbuf *out, struct bch_fs *, unsigned);
int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *, struct printbuf *);
void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
#include "btree_update.h"
#include "btree_write_buffer.h"
#include "buckets.h"
+#include "checksum.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
-#include "io.h"
+#include "io_read.h"
#include "keylist.h"
#include "recovery.h"
#include "replicas.h"
/* Stripes btree keys: */
-int bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_stripe_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+ int ret = 0;
- if (bkey_eq(k.k->p, POS_MIN)) {
- prt_printf(err, "stripe at POS_MIN");
- return -BCH_ERR_invalid_bkey;
- }
-
- if (k.k->p.inode) {
- prt_printf(err, "nonzero inode field");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) ||
+ bpos_gt(k.k->p, POS(0, U32_MAX)), c, err,
+ stripe_pos_bad,
+ "stripe at bad pos");
- if (bkey_val_u64s(k.k) < stripe_val_u64s(s)) {
- prt_printf(err, "incorrect value size (%zu < %u)",
- bkey_val_u64s(k.k), stripe_val_u64s(s));
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), c, err,
+ stripe_val_size_bad,
+ "incorrect value size (%zu < %u)",
+ bkey_val_u64s(k.k), stripe_val_u64s(s));
- return bch2_bkey_ptrs_invalid(c, k, flags, err);
+ ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
+fsck_err:
+ return ret;
}
void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset);
if (i < nr_data)
prt_printf(out, "#%u", stripe_blockcount_get(s, i));
+ prt_printf(out, " gen %u", ptr->gen);
if (ptr_stale(ca, ptr))
prt_printf(out, " stale");
}
struct bch_csum got = ec_block_checksum(buf, i, offset);
if (bch2_crc_cmp(want, got)) {
- struct printbuf buf2 = PRINTBUF;
+ struct printbuf err = PRINTBUF;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, v->ptrs[i].dev);
+
+ prt_printf(&err, "stripe checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)\n",
+ want.hi, want.lo,
+ got.hi, got.lo,
+ bch2_csum_types[v->csum_type]);
+ prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i);
+ bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
+ bch_err_ratelimited(ca, "%s", err.buf);
+ printbuf_exit(&err);
- bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&buf->key));
-
- bch_err_ratelimited(c,
- "stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s",
- (void *) _RET_IP_, i, j, v->csum_type,
- want.lo, got.lo, buf2.buf);
- printbuf_exit(&buf2);
clear_bit(i, buf->valid);
+
+ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
break;
}
struct bch_dev *ca = ec_bio->ca;
struct closure *cl = bio->bi_private;
- if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s error: %s",
+ if (bch2_dev_io_err_on(bio->bi_status, ca,
+ bio_data_dir(bio)
+ ? BCH_MEMBER_ERROR_write
+ : BCH_MEMBER_ERROR_read,
+ "erasure coding %s error: %s",
bio_data_dir(bio) ? "write" : "read",
bch2_blk_status_to_str(bio->bi_status)))
clear_bit(ec_bio->idx, ec_bio->buf->valid);
return ret;
}
-static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe)
-{
- return bch2_trans_run(c, get_stripe_key_trans(&trans, idx, stripe));
-}
-
/* recovery read path: */
-int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
+int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
{
+ struct bch_fs *c = trans->c;
struct ec_stripe_buf *buf;
struct closure cl;
struct bch_stripe *v;
if (!buf)
return -BCH_ERR_ENOMEM_ec_read_extent;
- ret = get_stripe_key(c, rbio->pick.ec.idx, buf);
+ ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
if (ret) {
bch_err_ratelimited(c,
"error doing reconstruct read: error %i looking up stripe", ret);
{
struct bch_fs *c =
container_of(work, struct bch_fs, ec_stripe_delete_work);
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
int ret;
u64 idx;
- bch2_trans_init(&trans, c, 0, 0);
-
while (1) {
mutex_lock(&c->ec_stripes_heap_lock);
idx = stripe_idx_to_delete(c);
if (!idx)
break;
- ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL,
- ec_stripe_delete(&trans, idx));
+ ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ ec_stripe_delete(trans, idx));
if (ret) {
bch_err_fn(c, ret);
break;
}
}
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
}
while (1) {
ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL,
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc,
ec_stripe_update_extent(trans, bucket_pos, bucket.gen,
s, &bp_pos));
if (ret)
static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
{
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
int ret = 0;
- bch2_trans_init(&trans, c, 0, 0);
-
- ret = bch2_btree_write_buffer_flush(&trans);
+ ret = bch2_btree_write_buffer_flush(trans);
if (ret)
goto err;
for (i = 0; i < nr_data; i++) {
- ret = ec_stripe_update_bucket(&trans, s, i);
+ ret = ec_stripe_update_bucket(trans, s, i);
if (ret)
break;
}
err:
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
return ret;
}
}
ret = bch2_trans_do(c, &s->res, NULL,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL,
- ec_stripe_key_update(&trans,
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc,
+ ec_stripe_key_update(trans,
bkey_i_to_stripe(&s->new_stripe.key),
!s->have_existing_stripe));
if (ret) {
ret = ec_stripe_update_extents(c, &s->new_stripe);
if (ret) {
- bch_err(c, "error creating stripe: error updating pointers: %s",
- bch2_err_str(ret));
+ bch_err_msg(c, ret, "creating stripe: error updating pointers");
goto err;
}
err:
h->nr_active_devs++;
rcu_read_unlock();
+
+ /*
+ * If we only have redundancy + 1 devices, we're better off with just
+ * replication:
+ */
+ if (h->nr_active_devs < h->redundancy + 2)
+ bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?",
+ h->nr_active_devs, h->redundancy + 2);
+
list_add(&h->list, &c->ec_stripe_head_list);
return h;
}
h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark);
found:
+ if (!IS_ERR_OR_NULL(h) &&
+ h->nr_active_devs < h->redundancy + 2) {
+ mutex_unlock(&h->lock);
+ h = NULL;
+ }
mutex_unlock(&c->ec_stripe_head_lock);
return h;
}
int ret;
h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark);
- if (!h)
- bch_err(c, "no stripe head");
if (IS_ERR_OR_NULL(h))
return h;
int bch2_stripes_read(struct bch_fs *c)
{
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
const struct bch_stripe *s;
unsigned i;
int ret;
- bch2_trans_init(&trans, c, 0, 0);
-
- for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN,
+ for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
BTREE_ITER_PREFETCH, k, ret) {
if (k.k->type != KEY_TYPE_stripe)
continue;
bch2_stripes_heap_insert(c, m, k.k->p.offset);
}
- bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_iter_exit(trans, &iter);
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
if (ret)
bch_err_fn(c, ret);
enum bkey_invalid_flags;
-int bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_stripe_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
struct ec_stripe_new *s;
};
-int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *);
+int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *);
void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
bch2_ec_do_stripe_creates(c);
break;
default:
- unreachable();
+ BUG();
}
}
NULL
};
-#define BCH_ERR_0 0
-
static unsigned bch2_errcode_parents[] = {
#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class,
BCH_ERRCODES()
return -err;
}
+
+const char *bch2_blk_status_to_str(blk_status_t status)
+{
+ if (status == BLK_STS_REMOVED)
+ return "device removed";
+ return blk_status_to_str(status);
+}
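+
+/*
+ * Hedged usage sketch: I/O completion paths can report the bcachefs-private
+ * BLK_STS_REMOVED (defined in errcode.h below) like any other block status:
+ *
+ *	bch_err(c, "read error: %s", bch2_blk_status_to_str(bio->bi_status));
+ */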
#define _BCACHEFS_ERRCODE_H
#define BCH_ERRCODES() \
+ x(ERANGE, ERANGE_option_too_small) \
+ x(ERANGE, ERANGE_option_too_big) \
x(ENOMEM, ENOMEM_stripe_buf) \
x(ENOMEM, ENOMEM_replicas_table) \
x(ENOMEM, ENOMEM_cpu_replicas) \
x(ENOMEM, ENOMEM_fsck_add_nlink) \
x(ENOMEM, ENOMEM_journal_key_insert) \
x(ENOMEM, ENOMEM_journal_keys_sort) \
- x(ENOMEM, ENOMEM_journal_replay) \
x(ENOMEM, ENOMEM_read_superblock_clean) \
x(ENOMEM, ENOMEM_fs_alloc) \
x(ENOMEM, ENOMEM_fs_name_alloc) \
x(ENOSPC, ENOSPC_sb_quota) \
x(ENOSPC, ENOSPC_sb_replicas) \
x(ENOSPC, ENOSPC_sb_members) \
+ x(ENOSPC, ENOSPC_sb_members_v2) \
x(ENOSPC, ENOSPC_sb_crypt) \
x(ENOSPC, ENOSPC_btree_slot) \
x(ENOSPC, ENOSPC_snapshot_tree) \
x(ENOENT, ENOENT_str_hash_set_must_replace) \
x(ENOENT, ENOENT_inode) \
x(ENOENT, ENOENT_not_subvol) \
+ x(ENOENT, ENOENT_not_directory) \
x(ENOENT, ENOENT_directory_dead) \
x(ENOENT, ENOENT_subvolume) \
x(ENOENT, ENOENT_snapshot_tree) \
x(BCH_ERR_invalid_sb, invalid_sb_crypt) \
x(BCH_ERR_invalid_sb, invalid_sb_clean) \
x(BCH_ERR_invalid_sb, invalid_sb_quota) \
+ x(BCH_ERR_invalid_sb, invalid_sb_errors) \
+ x(BCH_ERR_invalid_sb, invalid_sb_opt_compression) \
x(BCH_ERR_invalid, invalid_bkey) \
x(BCH_ERR_operation_blocked, nocow_lock_blocked) \
+ x(EIO, btree_node_read_err) \
+ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \
+ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \
+ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \
+ x(BCH_ERR_btree_node_read_err, btree_node_read_err_bad_node) \
+ x(BCH_ERR_btree_node_read_err, btree_node_read_err_incompatible) \
+ x(0, nopromote) \
+ x(BCH_ERR_nopromote, nopromote_may_not) \
+ x(BCH_ERR_nopromote, nopromote_already_promoted) \
+ x(BCH_ERR_nopromote, nopromote_unwritten) \
+ x(BCH_ERR_nopromote, nopromote_congested) \
+ x(BCH_ERR_nopromote, nopromote_in_flight) \
+ x(BCH_ERR_nopromote, nopromote_enomem)
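+
+/*
+ * Illustrative note: the x(class, err) entries form a tree, and a private
+ * error code matches any of its ancestors via bch2_err_matches(), so e.g.
+ *
+ *	if (bch2_err_matches(ret, BCH_ERR_nopromote))
+ *		return NULL;	// promotion is best-effort, not a hard error
+ *
+ * catches every nopromote_* subcode added above.
+ */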
enum bch_errcode {
BCH_ERR_START = 2048,
return err < 0 ? __bch2_err_class(err) : err;
}
+#define BLK_STS_REMOVED ((__force blk_status_t)128)
+
+const char *bch2_blk_status_to_str(blk_status_t);
+
#endif /* _BCACHEFS_ERRCODE_H */
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "error.h"
-#include "io.h"
#include "super.h"
#define FSCK_ERR_RATELIMIT_NR 10
up_write(&c->state_lock);
}
-void bch2_io_error(struct bch_dev *ca)
+void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type)
{
+ atomic64_inc(&ca->errors[type]);
//queue_work(system_long_wq, &ca->io_error_work);
}
if (test_bit(BCH_FS_FSCK_DONE, &c->flags))
return NULL;
- list_for_each_entry(s, &c->fsck_errors, list)
+ list_for_each_entry(s, &c->fsck_error_msgs, list)
if (s->fmt == fmt) {
/*
* move it to the head of the list: repeated fsck errors
* are common
*/
- list_move(&s->list, &c->fsck_errors);
+ list_move(&s->list, &c->fsck_error_msgs);
return s;
}
s = kzalloc(sizeof(*s), GFP_NOFS);
if (!s) {
- if (!c->fsck_alloc_err)
+ if (!c->fsck_alloc_msgs_err)
bch_err(c, "kmalloc err, cannot ratelimit fsck errs");
- c->fsck_alloc_err = true;
+ c->fsck_alloc_msgs_err = true;
return NULL;
}
INIT_LIST_HEAD(&s->list);
s->fmt = fmt;
- list_add(&s->list, &c->fsck_errors);
+ list_add(&s->list, &c->fsck_error_msgs);
return s;
}
-int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...)
+int bch2_fsck_err(struct bch_fs *c,
+ enum bch_fsck_flags flags,
+ enum bch_sb_error_id err,
+ const char *fmt, ...)
{
struct fsck_err_state *s = NULL;
va_list args;
struct printbuf buf = PRINTBUF, *out = &buf;
int ret = -BCH_ERR_fsck_ignore;
+ bch2_sb_error_count(c, err);
+
va_start(args, fmt);
prt_vprintf(out, fmt, args);
va_end(args);
- mutex_lock(&c->fsck_error_lock);
+ mutex_lock(&c->fsck_error_msgs_lock);
s = fsck_err_get(c, fmt);
if (s) {
/*
*/
if (s->last_msg && !strcmp(buf.buf, s->last_msg)) {
ret = s->ret;
- mutex_unlock(&c->fsck_error_lock);
+ mutex_unlock(&c->fsck_error_msgs_lock);
printbuf_exit(&buf);
return ret;
}
if (s)
s->ret = ret;
- mutex_unlock(&c->fsck_error_lock);
+ mutex_unlock(&c->fsck_error_msgs_lock);
printbuf_exit(&buf);
{
struct fsck_err_state *s, *n;
- mutex_lock(&c->fsck_error_lock);
+ mutex_lock(&c->fsck_error_msgs_lock);
- list_for_each_entry_safe(s, n, &c->fsck_errors, list) {
+ list_for_each_entry_safe(s, n, &c->fsck_error_msgs, list) {
if (s->ratelimited && s->last_msg)
bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg);
kfree(s);
}
- mutex_unlock(&c->fsck_error_lock);
+ mutex_unlock(&c->fsck_error_msgs_lock);
}
#include <linux/list.h>
#include <linux/printk.h>
+#include "sb-errors.h"
struct bch_dev;
struct bch_fs;
char *last_msg;
};
-#define FSCK_CAN_FIX (1 << 0)
-#define FSCK_CAN_IGNORE (1 << 1)
-#define FSCK_NEED_FSCK (1 << 2)
-#define FSCK_NO_RATELIMIT (1 << 3)
+enum bch_fsck_flags {
+ FSCK_CAN_FIX = 1 << 0,
+ FSCK_CAN_IGNORE = 1 << 1,
+ FSCK_NEED_FSCK = 1 << 2,
+ FSCK_NO_RATELIMIT = 1 << 3,
+};
+
+#define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err)
-__printf(3, 4) __cold
-int bch2_fsck_err(struct bch_fs *, unsigned, const char *, ...);
+__printf(4, 5) __cold
+int bch2_fsck_err(struct bch_fs *,
+ enum bch_fsck_flags,
+ enum bch_sb_error_id,
+ const char *, ...);
void bch2_flush_fsck_errs(struct bch_fs *);
-#define __fsck_err(c, _flags, msg, ...) \
+#define __fsck_err(c, _flags, _err_type, ...) \
({ \
- int _ret = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__); \
+ int _ret = bch2_fsck_err(c, _flags, BCH_FSCK_ERR_##_err_type, \
+ __VA_ARGS__); \
\
if (_ret != -BCH_ERR_fsck_fix && \
_ret != -BCH_ERR_fsck_ignore) { \
/* XXX: mark in superblock that filesystem contains errors, if we ignore: */
-#define __fsck_err_on(cond, c, _flags, ...) \
- (unlikely(cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false)
+#define __fsck_err_on(cond, c, _flags, _err_type, ...) \
+ (unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false)
+
+#define need_fsck_err_on(cond, c, _err_type, ...) \
+ __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__)
+
+#define need_fsck_err(c, _err_type, ...) \
+ __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__)
+
+#define mustfix_fsck_err(c, _err_type, ...) \
+ __fsck_err(c, FSCK_CAN_FIX, _err_type, __VA_ARGS__)
+
+#define mustfix_fsck_err_on(cond, c, _err_type, ...) \
+ __fsck_err_on(cond, c, FSCK_CAN_FIX, _err_type, __VA_ARGS__)
-#define need_fsck_err_on(cond, c, ...) \
- __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)
+#define fsck_err(c, _err_type, ...) \
+ __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
-#define need_fsck_err(c, ...) \
- __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)
+#define fsck_err_on(cond, c, _err_type, ...) \
+ __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
-#define mustfix_fsck_err(c, ...) \
- __fsck_err(c, FSCK_CAN_FIX, ##__VA_ARGS__)
+static inline void bch2_bkey_fsck_err(struct bch_fs *c,
+ struct printbuf *err_msg,
+ enum bch_sb_error_id err_type,
+ const char *fmt, ...)
+{
+ va_list args;
-#define mustfix_fsck_err_on(cond, c, ...) \
- __fsck_err_on(cond, c, FSCK_CAN_FIX, ##__VA_ARGS__)
+ va_start(args, fmt);
+ prt_vprintf(err_msg, fmt, args);
+ va_end(args);
-#define fsck_err(c, ...) \
- __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__)
+}
-#define fsck_err_on(cond, c, ...) \
- __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__)
+#define bkey_fsck_err(c, _err_msg, _err_type, ...) \
+do { \
+ prt_printf(_err_msg, __VA_ARGS__); \
+ bch2_sb_error_count(c, BCH_FSCK_ERR_##_err_type); \
+ ret = -BCH_ERR_invalid_bkey; \
+ goto fsck_err; \
+} while (0)
+
+#define bkey_fsck_err_on(cond, ...) \
+do { \
+ if (unlikely(cond)) \
+ bkey_fsck_err(__VA_ARGS__); \
+} while (0)
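+
+/*
+ * Usage sketch (the pattern the *_invalid functions in extents.c below
+ * adopt): callers declare 'int ret = 0' and an fsck_err label; on failure
+ * bkey_fsck_err() bumps the superblock error count, sets ret to
+ * -BCH_ERR_invalid_bkey and jumps there:
+ *
+ *	int ret = 0;
+ *
+ *	bkey_fsck_err_on(bkey_val_u64s(k.k) > max, c, err,
+ *			 some_err_id,	// pastes to BCH_FSCK_ERR_some_err_id
+ *			 "value too big (%zu > %u)", bkey_val_u64s(k.k), max);
+ * fsck_err:
+ *	return ret;
+ */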
/*
* Fatal errors: these don't indicate a bug, but we can't continue running in RW
void bch2_io_error_work(struct work_struct *);
/* Does the error handling without logging a message */
-void bch2_io_error(struct bch_dev *);
+void bch2_io_error(struct bch_dev *, enum bch_member_error_type);
-#define bch2_dev_io_err_on(cond, ca, ...) \
+#define bch2_dev_io_err_on(cond, ca, _type, ...) \
({ \
bool _ret = (cond); \
\
if (_ret) { \
bch_err_dev_ratelimited(ca, __VA_ARGS__); \
- bch2_io_error(ca); \
+ bch2_io_error(ca, _type); \
} \
_ret; \
})
-#define bch2_dev_inum_io_err_on(cond, ca, ...) \
+#define bch2_dev_inum_io_err_on(cond, ca, _type, ...) \
({ \
bool _ret = (cond); \
\
if (_ret) { \
bch_err_inum_offset_ratelimited(ca, __VA_ARGS__); \
- bch2_io_error(ca); \
+ bch2_io_error(ca, _type); \
} \
_ret; \
})
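
/*
 * Hedged usage sketch: callers now name the error class so the per-device
 * counters (ca->errors[], incremented in bch2_io_error() above) can be
 * bumped, e.g.
 *
 *	bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
 *			   "data write error: %s",
 *			   bch2_blk_status_to_str(bio->bi_status));
 */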
#include "btree_iter.h"
#include "buckets.h"
#include "checksum.h"
+#include "compress.h"
#include "debug.h"
#include "disk_groups.h"
#include "error.h"
/* KEY_TYPE_btree_ptr: */
-int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_btree_ptr_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
- if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) {
- prt_printf(err, "value too big (%zu > %u)",
- bkey_val_u64s(k.k), BCH_REPLICAS_MAX);
- return -BCH_ERR_invalid_bkey;
- }
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX, c, err,
+ btree_ptr_val_too_big,
+ "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX);
- return bch2_bkey_ptrs_invalid(c, k, flags, err);
+ ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
+fsck_err:
+ return ret;
}
void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
bch2_bkey_ptrs_to_text(out, c, k);
}
-int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
- if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) {
- prt_printf(err, "value too big (%zu > %zu)",
- bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
- return -BCH_ERR_invalid_bkey;
- }
+ int ret = 0;
+
+ bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, c, err,
+ btree_ptr_v2_val_too_big,
+ "value too big (%zu > %zu)",
+ bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
- return bch2_bkey_ptrs_invalid(c, k, flags, err);
+ ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
+fsck_err:
+ return ret;
}
void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
/* KEY_TYPE_reservation: */
-int bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_reservation_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
+ int ret = 0;
- if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) {
- prt_printf(err, "invalid nr_replicas (%u)",
- r.v->nr_replicas);
- return -BCH_ERR_invalid_bkey;
- }
-
- return 0;
+ bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX, c, err,
+ reservation_key_nr_replicas_invalid,
+ "invalid nr_replicas (%u)", r.v->nr_replicas);
+fsck_err:
+ return ret;
}
void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
switch (type) {
case BCH_EXTENT_ENTRY_crc32:
set_common_fields(dst->crc32, src);
- memcpy(&dst->crc32.csum, &src.csum.lo, sizeof(dst->crc32.csum));
+ dst->crc32.csum = (u32 __force) *((__le32 *) &src.csum.lo);
break;
case BCH_EXTENT_ENTRY_crc64:
set_common_fields(dst->crc64, src);
dst->crc64.nonce = src.nonce;
- dst->crc64.csum_lo = src.csum.lo;
- dst->crc64.csum_hi = *((__le16 *) &src.csum.hi);
+ dst->crc64.csum_lo = (u64 __force) src.csum.lo;
+ dst->crc64.csum_hi = (u64 __force) *((__le16 *) &src.csum.hi);
break;
case BCH_EXTENT_ENTRY_crc128:
set_common_fields(dst->crc128, src);
return i;
}
-static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
-{
- union bch_extent_entry *next = extent_entry_next(entry);
-
- /* stripes have ptrs, but their layout doesn't work with this code */
- BUG_ON(k.k->type == KEY_TYPE_stripe);
-
- memmove_u64s_down(entry, next,
- (u64 *) bkey_val_end(k) - (u64 *) next);
- k.k->u64s -= (u64 *) next - (u64 *) entry;
-}
-
/*
* Returns pointer to the next entry after the one being dropped:
*/
bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1)
bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
- if (p1.ptr.dev == p2.ptr.dev &&
- p1.ptr.gen == p2.ptr.gen &&
- (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
- (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
- return true;
+ if (p1.ptr.dev == p2.ptr.dev &&
+ p1.ptr.gen == p2.ptr.gen &&
+ (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
+ (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
+ return true;
return false;
} else {
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
- struct bch_extent_crc_unpacked crc;
- const struct bch_extent_ptr *ptr;
- const struct bch_extent_stripe_ptr *ec;
- struct bch_dev *ca;
bool first = true;
if (c)
prt_printf(out, " ");
switch (__extent_entry_type(entry)) {
- case BCH_EXTENT_ENTRY_ptr:
- ptr = entry_to_ptr(entry);
- ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
+ case BCH_EXTENT_ENTRY_ptr: {
+ const struct bch_extent_ptr *ptr = entry_to_ptr(entry);
+ struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
? bch_dev_bkey_exists(c, ptr->dev)
: NULL;
prt_printf(out, " stale");
}
break;
+ }
case BCH_EXTENT_ENTRY_crc32:
case BCH_EXTENT_ENTRY_crc64:
- case BCH_EXTENT_ENTRY_crc128:
- crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
+ case BCH_EXTENT_ENTRY_crc128: {
+ struct bch_extent_crc_unpacked crc =
+ bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s",
crc.compressed_size,
bch2_csum_types[crc.csum_type],
bch2_compression_types[crc.compression_type]);
break;
- case BCH_EXTENT_ENTRY_stripe_ptr:
- ec = &entry->stripe_ptr;
+ }
+ case BCH_EXTENT_ENTRY_stripe_ptr: {
+ const struct bch_extent_stripe_ptr *ec = &entry->stripe_ptr;
prt_printf(out, "ec: idx %llu block %u",
(u64) ec->idx, ec->block);
break;
+ }
+ case BCH_EXTENT_ENTRY_rebalance: {
+ const struct bch_extent_rebalance *r = &entry->rebalance;
+
+ prt_str(out, "rebalance: target ");
+ if (c)
+ bch2_target_to_text(out, c, r->target);
+ else
+ prt_printf(out, "%u", r->target);
+ prt_str(out, " compression ");
+ bch2_compression_opt_to_text(out, r->compression);
+ break;
+ }
default:
prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
return;
}
}
-static int extent_ptr_invalid(const struct bch_fs *c,
+static int extent_ptr_invalid(struct bch_fs *c,
struct bkey_s_c k,
+ enum bkey_invalid_flags flags,
const struct bch_extent_ptr *ptr,
unsigned size_ondisk,
bool metadata,
u64 bucket;
u32 bucket_offset;
struct bch_dev *ca;
+ int ret = 0;
if (!bch2_dev_exists2(c, ptr->dev)) {
- prt_printf(err, "pointer to invalid device (%u)", ptr->dev);
- return -BCH_ERR_invalid_bkey;
+ /*
+ * If we're in the write path this key might have already been
+ * overwritten, and we could be seeing a device that doesn't
+ * exist anymore due to racing with device removal:
+ */
+ if (flags & BKEY_INVALID_WRITE)
+ return 0;
+
+ bkey_fsck_err(c, err, ptr_to_invalid_device,
+ "pointer to invalid device (%u)", ptr->dev);
}
ca = bch_dev_bkey_exists(c, ptr->dev);
bkey_for_each_ptr(ptrs, ptr2)
- if (ptr != ptr2 && ptr->dev == ptr2->dev) {
- prt_printf(err, "multiple pointers to same device (%u)", ptr->dev);
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, c, err,
+ ptr_to_duplicate_device,
+ "multiple pointers to same device (%u)", ptr->dev);
bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset);
- if (bucket >= ca->mi.nbuckets) {
- prt_printf(err, "pointer past last bucket (%llu > %llu)",
- bucket, ca->mi.nbuckets);
- return -BCH_ERR_invalid_bkey;
- }
-
- if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) {
- prt_printf(err, "pointer before first bucket (%llu < %u)",
- bucket, ca->mi.first_bucket);
- return -BCH_ERR_invalid_bkey;
- }
-
- if (bucket_offset + size_ondisk > ca->mi.bucket_size) {
- prt_printf(err, "pointer spans multiple buckets (%u + %u > %u)",
+ bkey_fsck_err_on(bucket >= ca->mi.nbuckets, c, err,
+ ptr_after_last_bucket,
+ "pointer past last bucket (%llu > %llu)", bucket, ca->mi.nbuckets);
+ bkey_fsck_err_on(ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket), c, err,
+ ptr_before_first_bucket,
+ "pointer before first bucket (%llu < %u)", bucket, ca->mi.first_bucket);
+ bkey_fsck_err_on(bucket_offset + size_ondisk > ca->mi.bucket_size, c, err,
+ ptr_spans_multiple_buckets,
+ "pointer spans multiple buckets (%u + %u > %u)",
bucket_offset, size_ondisk, ca->mi.bucket_size);
- return -BCH_ERR_invalid_bkey;
- }
-
- return 0;
+fsck_err:
+ return ret;
}
-int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
unsigned size_ondisk = k.k->size;
unsigned nonce = UINT_MAX;
unsigned nr_ptrs = 0;
- bool unwritten = false, have_ec = false, crc_since_last_ptr = false;
- int ret;
+ bool have_written = false, have_unwritten = false, have_ec = false, crc_since_last_ptr = false;
+ int ret = 0;
if (bkey_is_btree_ptr(k.k))
size_ondisk = btree_sectors(c);
bkey_extent_entry_for_each(ptrs, entry) {
- if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) {
- prt_printf(err, "invalid extent entry type (got %u, max %u)",
- __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX);
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX, c, err,
+ extent_ptrs_invalid_entry,
+ "invalid extent entry type (got %u, max %u)",
+ __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX);
- if (bkey_is_btree_ptr(k.k) &&
- !extent_entry_is_ptr(entry)) {
- prt_printf(err, "has non ptr field");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(bkey_is_btree_ptr(k.k) &&
+ !extent_entry_is_ptr(entry), c, err,
+ btree_ptr_has_non_ptr,
+ "has non ptr field");
switch (extent_entry_type(entry)) {
case BCH_EXTENT_ENTRY_ptr:
- ret = extent_ptr_invalid(c, k, &entry->ptr, size_ondisk,
- false, err);
+ ret = extent_ptr_invalid(c, k, flags, &entry->ptr,
+ size_ondisk, false, err);
if (ret)
return ret;
- if (nr_ptrs && unwritten != entry->ptr.unwritten) {
- prt_printf(err, "extent with unwritten and written ptrs");
- return -BCH_ERR_invalid_bkey;
- }
-
- if (k.k->type != KEY_TYPE_extent && entry->ptr.unwritten) {
- prt_printf(err, "has unwritten ptrs");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(entry->ptr.cached && have_ec, c, err,
+ ptr_cached_and_erasure_coded,
+ "cached, erasure coded ptr");
- if (entry->ptr.cached && have_ec) {
- prt_printf(err, "cached, erasure coded ptr");
- return -BCH_ERR_invalid_bkey;
- }
+ if (!entry->ptr.unwritten)
+ have_written = true;
+ else
+ have_unwritten = true;
- unwritten = entry->ptr.unwritten;
have_ec = false;
crc_since_last_ptr = false;
nr_ptrs++;
case BCH_EXTENT_ENTRY_crc128:
crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
- if (crc.offset + crc.live_size >
- crc.uncompressed_size) {
- prt_printf(err, "checksum offset + key size > uncompressed size");
- return -BCH_ERR_invalid_bkey;
- }
-
- size_ondisk = crc.compressed_size;
-
- if (!bch2_checksum_type_valid(c, crc.csum_type)) {
- prt_printf(err, "invalid checksum type");
- return -BCH_ERR_invalid_bkey;
- }
-
- if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) {
- prt_printf(err, "invalid compression type");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, c, err,
+ ptr_crc_uncompressed_size_too_small,
+ "checksum offset + key size > uncompressed size");
+ bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), c, err,
+ ptr_crc_csum_type_unknown,
+ "invalid checksum type");
+ bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR, c, err,
+ ptr_crc_compression_type_unknown,
+ "invalid compression type");
if (bch2_csum_type_is_encryption(crc.csum_type)) {
if (nonce == UINT_MAX)
nonce = crc.offset + crc.nonce;
- else if (nonce != crc.offset + crc.nonce) {
- prt_printf(err, "incorrect nonce");
- return -BCH_ERR_invalid_bkey;
- }
+ else if (nonce != crc.offset + crc.nonce)
+ bkey_fsck_err(c, err, ptr_crc_nonce_mismatch,
+ "incorrect nonce");
}
- if (crc_since_last_ptr) {
- prt_printf(err, "redundant crc entry");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(crc_since_last_ptr, c, err,
+ ptr_crc_redundant,
+ "redundant crc entry");
crc_since_last_ptr = true;
+
+ bkey_fsck_err_on(crc_is_encoded(crc) &&
+ (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) &&
+ (flags & (BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT)), c, err,
+ ptr_crc_uncompressed_size_too_big,
+ "too large encoded extent");
+
+ size_ondisk = crc.compressed_size;
break;
case BCH_EXTENT_ENTRY_stripe_ptr:
- if (have_ec) {
- prt_printf(err, "redundant stripe entry");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(have_ec, c, err,
+ ptr_stripe_redundant,
+ "redundant stripe entry");
have_ec = true;
break;
- case BCH_EXTENT_ENTRY_rebalance:
+ case BCH_EXTENT_ENTRY_rebalance: {
+ const struct bch_extent_rebalance *r = &entry->rebalance;
+
+ if (!bch2_compression_opt_valid(r->compression)) {
+ struct bch_compression_opt opt = __bch2_compression_decode(r->compression);
+ prt_printf(err, "invalid compression opt %u:%u",
+ opt.type, opt.level);
+ return -BCH_ERR_invalid_bkey;
+ }
break;
}
+ }
}
- if (!nr_ptrs) {
- prt_str(err, "no ptrs");
- return -BCH_ERR_invalid_bkey;
- }
-
- if (nr_ptrs >= BCH_BKEY_PTRS_MAX) {
- prt_str(err, "too many ptrs");
- return -BCH_ERR_invalid_bkey;
- }
-
- if (crc_since_last_ptr) {
- prt_printf(err, "redundant crc entry");
- return -BCH_ERR_invalid_bkey;
- }
-
- if (have_ec) {
- prt_printf(err, "redundant stripe entry");
- return -BCH_ERR_invalid_bkey;
- }
-
- return 0;
+ bkey_fsck_err_on(!nr_ptrs, c, err,
+ extent_ptrs_no_ptrs,
+ "no ptrs");
+ bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX, c, err,
+ extent_ptrs_too_many_ptrs,
+ "too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX);
+ bkey_fsck_err_on(have_written && have_unwritten, c, err,
+ extent_ptrs_written_and_unwritten,
+ "extent with unwritten and written ptrs");
+ bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten, c, err,
+ extent_ptrs_unwritten,
+ "has unwritten ptrs");
+ bkey_fsck_err_on(crc_since_last_ptr, c, err,
+ extent_ptrs_redundant_crc,
+ "redundant crc entry");
+ bkey_fsck_err_on(have_ec, c, err,
+ extent_ptrs_redundant_stripe,
+ "redundant stripe entry");
+fsck_err:
+ return ret;
}
void bch2_ptr_swab(struct bkey_s k)
}
}
+const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+
+ bkey_extent_entry_for_each(ptrs, entry)
+ if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance)
+ return &entry->rebalance;
+
+ return NULL;
+}
+
+unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k,
+ unsigned target, unsigned compression)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ unsigned rewrite_ptrs = 0;
+
+ if (compression) {
+ unsigned compression_type = bch2_compression_opt_to_type(compression);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned i = 0;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) {
+ rewrite_ptrs = 0;
+ goto incompressible;
+ }
+
+ if (!p.ptr.cached && p.crc.compression_type != compression_type)
+ rewrite_ptrs |= 1U << i;
+ i++;
+ }
+ }
+incompressible:
+ if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) {
+ const struct bch_extent_ptr *ptr;
+ unsigned i = 0;
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, target))
+ rewrite_ptrs |= 1U << i;
+ i++;
+ }
+ }
+
+ return rewrite_ptrs;
+}
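+
+/*
+ * Usage note: the return value is a bitmask with bit i set when the i'th
+ * pointer should be rewritten to satisfy the target/compression options;
+ * zero means the key already conforms:
+ *
+ *	if (bch2_bkey_ptrs_need_rebalance(c, k, target, compression))
+ *		// hand the extent to the rebalance/data-move path (sketch)
+ */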
+
+bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k)
+{
+ const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
+
+ /*
+ * If it's an indirect extent, we don't delete the rebalance entry when
+ * done so that we know what options were applied - check if it still
+ * needs work done:
+ */
+ if (r &&
+ k.k->type == KEY_TYPE_reflink_v &&
+ !bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression))
+ r = NULL;
+
+ return r != NULL;
+}
+
+int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k,
+ unsigned target, unsigned compression)
+{
+ struct bkey_s k = bkey_i_to_s(_k);
+ struct bch_extent_rebalance *r;
+ bool needs_rebalance;
+
+ if (!bkey_extent_is_direct_data(k.k))
+ return 0;
+
+ /* get existing rebalance entry: */
+ r = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);
+ if (r) {
+ if (k.k->type == KEY_TYPE_reflink_v) {
+ /*
+ * indirect extents: existing options take precedence,
+ * so that we don't move extents back and forth if
+ * they're referenced by different inodes with different
+ * options:
+ */
+ if (r->target)
+ target = r->target;
+ if (r->compression)
+ compression = r->compression;
+ }
+
+ r->target = target;
+ r->compression = compression;
+ }
+
+ needs_rebalance = bch2_bkey_ptrs_need_rebalance(c, k.s_c, target, compression);
+
+ if (needs_rebalance && !r) {
+ union bch_extent_entry *new = bkey_val_end(k);
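+
+		/* assumes the caller allocated _k with room for this entry */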
+
+ new->rebalance.type = 1U << BCH_EXTENT_ENTRY_rebalance;
+ new->rebalance.compression = compression;
+ new->rebalance.target = target;
+ new->rebalance.unused = 0;
+ k.k->u64s += extent_entry_u64s(new);
+ } else if (!needs_rebalance && r && k.k->type != KEY_TYPE_reflink_v) {
+ /*
+ * For indirect extents, don't delete the rebalance entry when
+ * we're finished so that we know we specifically moved it or
+ * compressed it to its current location/compression type
+ */
+ extent_entry_drop(k, (union bch_extent_entry *) r);
+ }
+
+ return 0;
+}
+
/* Generic extent code: */
int bch2_cut_front_s(struct bpos where, struct bkey_s k)
memcpy_u64s_small(dst, new, extent_entry_u64s(new));
}
+static inline void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
+{
+ union bch_extent_entry *next = extent_entry_next(entry);
+
+ /* stripes have ptrs, but their layout doesn't work with this code */
+ BUG_ON(k.k->type == KEY_TYPE_stripe);
+
+ memmove_u64s_down(entry, next,
+ (u64 *) bkey_val_end(k) - (u64 *) next);
+ k.k->u64s -= (u64 *) next - (u64 *) entry;
+}
+
static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
{
return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
common_fields(crc->crc32),
};
- memcpy(&ret.csum.lo, &crc->crc32.csum, sizeof(crc->crc32.csum));
+ *((__le32 *) &ret.csum.lo) = (__le32 __force) crc->crc32.csum;
return ret;
}
case BCH_EXTENT_ENTRY_crc64: {
.csum.lo = (__force __le64) crc->crc64.csum_lo,
};
- u16 hi = crc->crc64.csum_hi;
- memcpy(&ret.csum.hi, &hi, sizeof(hi));
+ *((__le16 *) &ret.csum.hi) = (__le16 __force) crc->crc64.csum_hi;
+
return ret;
}
case BCH_EXTENT_ENTRY_crc128: {
crc.compression_type != BCH_COMPRESSION_TYPE_incompressible);
}
+static inline bool crc_is_encoded(struct bch_extent_crc_unpacked crc)
+{
+ return crc.csum_type != BCH_CSUM_none || crc_is_compressed(crc);
+}
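+
+/*
+ * An encoded extent (checksummed and/or compressed) has to be read and
+ * verified/decompressed as a unit; bch2_bkey_ptrs_invalid() uses this to
+ * cap crc.uncompressed_size at opts.encoded_extent_max for new keys.
+ */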
+
/* bkey_ptrs: generically over any key type that has ptrs */
struct bkey_ptrs_c {
/* KEY_TYPE_btree_ptr: */
-int bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_btree_ptr_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
-int bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_btree_ptr_v2_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
/* KEY_TYPE_reservation: */
-int bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_reservation_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
case KEY_TYPE_reflink_v:
case KEY_TYPE_inline_data:
case KEY_TYPE_indirect_inline_data:
+ case KEY_TYPE_error:
return true;
default:
return false;
static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr ptr)
{
+ struct bch_extent_ptr *dest;
+
EBUG_ON(bch2_bkey_has_device(bkey_i_to_s(k), ptr.dev));
switch (k->k.type) {
EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX);
ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
-
- memcpy((void *) &k->v + bkey_val_bytes(&k->k),
- &ptr,
- sizeof(ptr));
+ dest = (struct bch_extent_ptr *)((void *) &k->v + bkey_val_bytes(&k->k));
+ *dest = ptr;
k->k.u64s++;
break;
default:
bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
-int bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_bkey_ptrs_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_ptr_swab(struct bkey_s);
+const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c);
+unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c,
+ unsigned, unsigned);
+bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c);
+
+int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *,
+ unsigned, unsigned);
+
/* Generic extent code: */
enum bch_extent_overlap {
k->size = new_size;
}
-/*
- * In extent_sort_fix_overlapping(), insert_fixup_extent(),
- * extent_merge_inline() - we're modifying keys in place that are packed. To do
- * that we have to unpack the key, modify the unpacked key - then this
- * copies/repacks the unpacked to the original as necessary.
- */
-static inline void extent_save(struct btree *b, struct bkey_packed *dst,
- struct bkey *src)
-{
- struct bkey_format *f = &b->format;
- struct bkey_i *dst_unpacked;
-
- if ((dst_unpacked = packed_to_bkey(dst)))
- dst_unpacked->k = *src;
- else
- BUG_ON(!bch2_bkey_pack_key(dst, src, f));
-}
-
#endif /* _BCACHEFS_EXTENTS_H */
bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
if (flags & BCH_CREATE_TMPFILE)
- new_inode->bi_flags |= BCH_INODE_UNLINKED;
+ new_inode->bi_flags |= BCH_INODE_unlinked;
ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu);
if (ret)
#include "extent_update.h"
#include "fs.h"
#include "fs-io.h"
+#include "fs-io-buffered.h"
+#include "fs-io-pagecache.h"
#include "fsck.h"
#include "inode.h"
#include "journal.h"
-#include "io.h"
+#include "io_misc.h"
#include "keylist.h"
#include "quota.h"
#include "reflink.h"
#include <linux/sched/signal.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/uio.h>
-#include <linux/writeback.h>
#include <trace/events/writeback.h>
-static int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool);
-
-struct folio_vec {
- struct folio *fv_folio;
- size_t fv_offset;
- size_t fv_len;
-};
-
-static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv)
-{
-
- struct folio *folio = page_folio(bv.bv_page);
- size_t offset = (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) +
- bv.bv_offset;
- size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len);
-
- return (struct folio_vec) {
- .fv_folio = folio,
- .fv_offset = offset,
- .fv_len = len,
- };
-}
-
-static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio,
- struct bvec_iter iter)
-{
- return biovec_to_foliovec(bio_iter_iovec(bio, iter));
-}
-
-#define __bio_for_each_folio(bvl, bio, iter, start) \
- for (iter = (start); \
- (iter).bi_size && \
- ((bvl = bio_iter_iovec_folio((bio), (iter))), 1); \
- bio_advance_iter_single((bio), &(iter), (bvl).fv_len))
-
-/**
- * bio_for_each_folio - iterate over folios within a bio
- *
- * Like other non-_all versions, this iterates over what bio->bi_iter currently
- * points to. This version is for drivers, where the bio may have previously
- * been split or cloned.
- */
-#define bio_for_each_folio(bvl, bio, iter) \
- __bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter)
-
-/*
- * Use u64 for the end pos and sector helpers because if the folio covers the
- * max supported range of the mapping, the start offset of the next folio
- * overflows loff_t. This breaks much of the range based processing in the
- * buffered write path.
- */
-static inline u64 folio_end_pos(struct folio *folio)
-{
- return folio_pos(folio) + folio_size(folio);
-}
-
-static inline size_t folio_sectors(struct folio *folio)
-{
- return PAGE_SECTORS << folio_order(folio);
-}
-
-static inline loff_t folio_sector(struct folio *folio)
-{
- return folio_pos(folio) >> 9;
-}
-
-static inline u64 folio_end_sector(struct folio *folio)
-{
- return folio_end_pos(folio) >> 9;
-}
-
-typedef DARRAY(struct folio *) folios;
-
-static int filemap_get_contig_folios_d(struct address_space *mapping,
- loff_t start, u64 end,
- int fgp_flags, gfp_t gfp,
- folios *folios)
-{
- struct folio *f;
- u64 pos = start;
- int ret = 0;
-
- while (pos < end) {
- if ((u64) pos >= (u64) start + (1ULL << 20))
- fgp_flags &= ~FGP_CREAT;
-
- ret = darray_make_room_gfp(folios, 1, gfp & GFP_KERNEL);
- if (ret)
- break;
-
- f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp);
- if (IS_ERR_OR_NULL(f))
- break;
-
- BUG_ON(folios->nr && folio_pos(f) != pos);
-
- pos = folio_end_pos(f);
- darray_push(folios, f);
- }
-
- if (!folios->nr && !ret && (fgp_flags & FGP_CREAT))
- ret = -ENOMEM;
-
- return folios->nr ? 0 : ret;
-}
-
struct nocow_flush {
struct closure *cl;
struct bch_dev *ca;
bio_put(&bio->bio);
}
-static void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct closure *cl)
+void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct closure *cl)
{
struct nocow_flush *bio;
struct bch_dev *ca;
struct nocow_flush, bio);
bio->cl = cl;
bio->ca = ca;
- bio->bio.bi_end_io = nocow_flush_endio;
- closure_bio_submit(&bio->bio, cl);
- }
-}
-
-static int bch2_inode_flush_nocow_writes(struct bch_fs *c,
- struct bch_inode_info *inode)
-{
- struct closure cl;
-
- closure_init_stack(&cl);
- bch2_inode_flush_nocow_writes_async(c, inode, &cl);
- closure_sync(&cl);
-
- return 0;
-}
-
-static inline bool bio_full(struct bio *bio, unsigned len)
-{
- if (bio->bi_vcnt >= bio->bi_max_vecs)
- return true;
- if (bio->bi_iter.bi_size > UINT_MAX - len)
- return true;
- return false;
-}
-
-static inline struct address_space *faults_disabled_mapping(void)
-{
- return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL);
-}
-
-static inline void set_fdm_dropped_locks(void)
-{
- current->faults_disabled_mapping =
- (void *) (((unsigned long) current->faults_disabled_mapping)|1);
-}
-
-static inline bool fdm_dropped_locks(void)
-{
- return ((unsigned long) current->faults_disabled_mapping) & 1;
-}
-
-struct quota_res {
- u64 sectors;
-};
-
-struct bch_writepage_io {
- struct bch_inode_info *inode;
-
- /* must be last: */
- struct bch_write_op op;
-};
-
-struct dio_write {
- struct kiocb *req;
- struct address_space *mapping;
- struct bch_inode_info *inode;
- struct mm_struct *mm;
- unsigned loop:1,
- extending:1,
- sync:1,
- flush:1,
- free_iov:1;
- struct quota_res quota_res;
- u64 written;
-
- struct iov_iter iter;
- struct iovec inline_vecs[2];
-
- /* must be last: */
- struct bch_write_op op;
-};
-
-struct dio_read {
- struct closure cl;
- struct kiocb *req;
- long ret;
- bool should_dirty;
- struct bch_read_bio rbio;
-};
-
-/* pagecache_block must be held */
-static noinline int write_invalidate_inode_pages_range(struct address_space *mapping,
- loff_t start, loff_t end)
-{
- int ret;
-
- /*
- * XXX: the way this is currently implemented, we can spin if a process
- * is continually redirtying a specific page
- */
- do {
- if (!mapping->nrpages)
- return 0;
-
- ret = filemap_write_and_wait_range(mapping, start, end);
- if (ret)
- break;
-
- if (!mapping->nrpages)
- return 0;
-
- ret = invalidate_inode_pages2_range(mapping,
- start >> PAGE_SHIFT,
- end >> PAGE_SHIFT);
- } while (ret == -EBUSY);
-
- return ret;
-}
-
-/* quotas */
-
-#ifdef CONFIG_BCACHEFS_QUOTA
-
-static void __bch2_quota_reservation_put(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct quota_res *res)
-{
- BUG_ON(res->sectors > inode->ei_quota_reserved);
-
- bch2_quota_acct(c, inode->ei_qid, Q_SPC,
- -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC);
- inode->ei_quota_reserved -= res->sectors;
- res->sectors = 0;
-}
-
-static void bch2_quota_reservation_put(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct quota_res *res)
-{
- if (res->sectors) {
- mutex_lock(&inode->ei_quota_lock);
- __bch2_quota_reservation_put(c, inode, res);
- mutex_unlock(&inode->ei_quota_lock);
- }
-}
-
-static int bch2_quota_reservation_add(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct quota_res *res,
- u64 sectors,
- bool check_enospc)
-{
- int ret;
-
- if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags))
- return 0;
-
- mutex_lock(&inode->ei_quota_lock);
- ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors,
- check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK);
- if (likely(!ret)) {
- inode->ei_quota_reserved += sectors;
- res->sectors += sectors;
- }
- mutex_unlock(&inode->ei_quota_lock);
-
- return ret;
-}
-
-#else
-
-static void __bch2_quota_reservation_put(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct quota_res *res) {}
-
-static void bch2_quota_reservation_put(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct quota_res *res) {}
-
-static int bch2_quota_reservation_add(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct quota_res *res,
- unsigned sectors,
- bool check_enospc)
-{
- return 0;
-}
-
-#endif
-
-/* i_size updates: */
-
-struct inode_new_size {
- loff_t new_size;
- u64 now;
- unsigned fields;
-};
-
-static int inode_set_size(struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
-{
- struct inode_new_size *s = p;
-
- bi->bi_size = s->new_size;
- if (s->fields & ATTR_ATIME)
- bi->bi_atime = s->now;
- if (s->fields & ATTR_MTIME)
- bi->bi_mtime = s->now;
- if (s->fields & ATTR_CTIME)
- bi->bi_ctime = s->now;
-
- return 0;
-}
-
-int __must_check bch2_write_inode_size(struct bch_fs *c,
- struct bch_inode_info *inode,
- loff_t new_size, unsigned fields)
-{
- struct inode_new_size s = {
- .new_size = new_size,
- .now = bch2_current_time(c),
- .fields = fields,
- };
-
- return bch2_write_inode(c, inode, inode_set_size, &s, fields);
-}
-
-static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
- struct quota_res *quota_res, s64 sectors)
-{
- bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c,
- "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)",
- inode->v.i_ino, (u64) inode->v.i_blocks, sectors,
- inode->ei_inode.bi_sectors);
- inode->v.i_blocks += sectors;
-
-#ifdef CONFIG_BCACHEFS_QUOTA
- if (quota_res &&
- !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) &&
- sectors > 0) {
- BUG_ON(sectors > quota_res->sectors);
- BUG_ON(sectors > inode->ei_quota_reserved);
-
- quota_res->sectors -= sectors;
- inode->ei_quota_reserved -= sectors;
- } else {
- bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN);
- }
-#endif
-}
-
-static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
- struct quota_res *quota_res, s64 sectors)
-{
- if (sectors) {
- mutex_lock(&inode->ei_quota_lock);
- __i_sectors_acct(c, inode, quota_res, sectors);
- mutex_unlock(&inode->ei_quota_lock);
- }
-}
-
-/* page state: */
-
-/* stored in page->private: */
-
-#define BCH_FOLIO_SECTOR_STATE() \
- x(unallocated) \
- x(reserved) \
- x(dirty) \
- x(dirty_reserved) \
- x(allocated)
-
-enum bch_folio_sector_state {
-#define x(n) SECTOR_##n,
- BCH_FOLIO_SECTOR_STATE()
-#undef x
-};
-
-static const char * const bch2_folio_sector_states[] = {
-#define x(n) #n,
- BCH_FOLIO_SECTOR_STATE()
-#undef x
- NULL
-};
-
-static inline enum bch_folio_sector_state
-folio_sector_dirty(enum bch_folio_sector_state state)
-{
- switch (state) {
- case SECTOR_unallocated:
- return SECTOR_dirty;
- case SECTOR_reserved:
- return SECTOR_dirty_reserved;
- default:
- return state;
- }
-}
-
-static inline enum bch_folio_sector_state
-folio_sector_undirty(enum bch_folio_sector_state state)
-{
- switch (state) {
- case SECTOR_dirty:
- return SECTOR_unallocated;
- case SECTOR_dirty_reserved:
- return SECTOR_reserved;
- default:
- return state;
- }
-}
-
-static inline enum bch_folio_sector_state
-folio_sector_reserve(enum bch_folio_sector_state state)
-{
- switch (state) {
- case SECTOR_unallocated:
- return SECTOR_reserved;
- case SECTOR_dirty:
- return SECTOR_dirty_reserved;
- default:
- return state;
- }
-}
-
-struct bch_folio_sector {
- /* Uncompressed, fully allocated replicas (or on disk reservation): */
- unsigned nr_replicas:4;
-
- /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */
- unsigned replicas_reserved:4;
-
- /* i_sectors: */
- enum bch_folio_sector_state state:8;
-};
-
-struct bch_folio {
- spinlock_t lock;
- atomic_t write_count;
- /*
- * Is the sector state up to date with the btree?
- * (Not the data itself)
- */
- bool uptodate;
- struct bch_folio_sector s[];
-};
-
-static inline void folio_sector_set(struct folio *folio,
- struct bch_folio *s,
- unsigned i, unsigned n)
-{
- s->s[i].state = n;
-}
-
-/* file offset (to folio offset) to bch_folio_sector index */
-static inline int folio_pos_to_s(struct folio *folio, loff_t pos)
-{
- u64 f_offset = pos - folio_pos(folio);
- BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio));
- return f_offset >> SECTOR_SHIFT;
-}
-
-static inline struct bch_folio *__bch2_folio(struct folio *folio)
-{
- return folio_has_private(folio)
- ? (struct bch_folio *) folio_get_private(folio)
- : NULL;
-}
-
-static inline struct bch_folio *bch2_folio(struct folio *folio)
-{
- EBUG_ON(!folio_test_locked(folio));
-
- return __bch2_folio(folio);
-}
-
-/* for newly allocated folios: */
-static void __bch2_folio_release(struct folio *folio)
-{
- kfree(folio_detach_private(folio));
-}
-
-static void bch2_folio_release(struct folio *folio)
-{
- EBUG_ON(!folio_test_locked(folio));
- __bch2_folio_release(folio);
-}
-
-/* for newly allocated folios: */
-static struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp)
-{
- struct bch_folio *s;
-
- s = kzalloc(sizeof(*s) +
- sizeof(struct bch_folio_sector) *
- folio_sectors(folio), gfp);
- if (!s)
- return NULL;
-
- spin_lock_init(&s->lock);
- folio_attach_private(folio, s);
- return s;
-}
-
-static struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp)
-{
- return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp);
-}
-
-static unsigned bkey_to_sector_state(struct bkey_s_c k)
-{
- if (bkey_extent_is_reservation(k))
- return SECTOR_reserved;
- if (bkey_extent_is_allocation(k.k))
- return SECTOR_allocated;
- return SECTOR_unallocated;
-}
-
-static void __bch2_folio_set(struct folio *folio,
- unsigned pg_offset, unsigned pg_len,
- unsigned nr_ptrs, unsigned state)
-{
- struct bch_folio *s = bch2_folio(folio);
- unsigned i, sectors = folio_sectors(folio);
-
- BUG_ON(pg_offset >= sectors);
- BUG_ON(pg_offset + pg_len > sectors);
-
- spin_lock(&s->lock);
-
- for (i = pg_offset; i < pg_offset + pg_len; i++) {
- s->s[i].nr_replicas = nr_ptrs;
- folio_sector_set(folio, s, i, state);
- }
-
- if (i == sectors)
- s->uptodate = true;
-
- spin_unlock(&s->lock);
-}
-
-/*
- * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the
- * extents btree:
- */
-static int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
- struct folio **folios, unsigned nr_folios)
-{
- struct btree_trans trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_folio *s;
- u64 offset = folio_sector(folios[0]);
- unsigned folio_idx;
- u32 snapshot;
- bool need_set = false;
- int ret;
-
- for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) {
- s = bch2_folio_create(folios[folio_idx], GFP_KERNEL);
- if (!s)
- return -ENOMEM;
-
- need_set |= !s->uptodate;
- }
-
- if (!need_set)
- return 0;
-
- folio_idx = 0;
- bch2_trans_init(&trans, c, 0, 0);
-retry:
- bch2_trans_begin(&trans);
-
- ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
- for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
- SPOS(inum.inum, offset, snapshot),
- BTREE_ITER_SLOTS, k, ret) {
- unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
- unsigned state = bkey_to_sector_state(k);
-
- while (folio_idx < nr_folios) {
- struct folio *folio = folios[folio_idx];
- u64 folio_start = folio_sector(folio);
- u64 folio_end = folio_end_sector(folio);
- unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - folio_start;
- unsigned folio_len = min(k.k->p.offset, folio_end) - folio_offset - folio_start;
-
- BUG_ON(k.k->p.offset < folio_start);
- BUG_ON(bkey_start_offset(k.k) > folio_end);
-
- if (!bch2_folio(folio)->uptodate)
- __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state);
-
- if (k.k->p.offset < folio_end)
- break;
- folio_idx++;
- }
-
- if (folio_idx == nr_folios)
- break;
- }
-
- offset = iter.pos.offset;
- bch2_trans_iter_exit(&trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
- bch2_trans_exit(&trans);
-
- return ret;
-}
-
-static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k)
-{
- struct bvec_iter iter;
- struct folio_vec fv;
- unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
- ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
- unsigned state = bkey_to_sector_state(k);
-
- bio_for_each_folio(fv, bio, iter)
- __bch2_folio_set(fv.fv_folio,
- fv.fv_offset >> 9,
- fv.fv_len >> 9,
- nr_ptrs, state);
-}
-
-static void mark_pagecache_unallocated(struct bch_inode_info *inode,
- u64 start, u64 end)
-{
- pgoff_t index = start >> PAGE_SECTORS_SHIFT;
- pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
- struct folio_batch fbatch;
- unsigned i, j;
-
- if (end <= start)
- return;
-
- folio_batch_init(&fbatch);
-
- while (filemap_get_folios(inode->v.i_mapping,
- &index, end_index, &fbatch)) {
- for (i = 0; i < folio_batch_count(&fbatch); i++) {
- struct folio *folio = fbatch.folios[i];
- u64 folio_start = folio_sector(folio);
- u64 folio_end = folio_end_sector(folio);
- unsigned folio_offset = max(start, folio_start) - folio_start;
- unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
- struct bch_folio *s;
-
- BUG_ON(end <= folio_start);
-
- folio_lock(folio);
- s = bch2_folio(folio);
-
- if (s) {
- spin_lock(&s->lock);
- for (j = folio_offset; j < folio_offset + folio_len; j++)
- s->s[j].nr_replicas = 0;
- spin_unlock(&s->lock);
- }
-
- folio_unlock(folio);
- }
- folio_batch_release(&fbatch);
- cond_resched();
- }
-}
-
-static void mark_pagecache_reserved(struct bch_inode_info *inode,
- u64 start, u64 end)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- pgoff_t index = start >> PAGE_SECTORS_SHIFT;
- pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
- struct folio_batch fbatch;
- s64 i_sectors_delta = 0;
- unsigned i, j;
-
- if (end <= start)
- return;
-
- folio_batch_init(&fbatch);
-
- while (filemap_get_folios(inode->v.i_mapping,
- &index, end_index, &fbatch)) {
- for (i = 0; i < folio_batch_count(&fbatch); i++) {
- struct folio *folio = fbatch.folios[i];
- u64 folio_start = folio_sector(folio);
- u64 folio_end = folio_end_sector(folio);
- unsigned folio_offset = max(start, folio_start) - folio_start;
- unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
- struct bch_folio *s;
-
- BUG_ON(end <= folio_start);
-
- folio_lock(folio);
- s = bch2_folio(folio);
-
- if (s) {
- spin_lock(&s->lock);
- for (j = folio_offset; j < folio_offset + folio_len; j++) {
- i_sectors_delta -= s->s[j].state == SECTOR_dirty;
- folio_sector_set(folio, s, j, folio_sector_reserve(s->s[j].state));
- }
- spin_unlock(&s->lock);
- }
-
- folio_unlock(folio);
- }
- folio_batch_release(&fbatch);
- cond_resched();
- }
-
- i_sectors_acct(c, inode, NULL, i_sectors_delta);
-}
-
-static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode)
-{
- /* XXX: this should not be open coded */
- return inode->ei_inode.bi_data_replicas
- ? inode->ei_inode.bi_data_replicas - 1
- : c->opts.data_replicas;
-}
-
-static inline unsigned sectors_to_reserve(struct bch_folio_sector *s,
- unsigned nr_replicas)
-{
- return max(0, (int) nr_replicas -
- s->nr_replicas -
- s->replicas_reserved);
-}
-
-static int bch2_get_folio_disk_reservation(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct folio *folio, bool check_enospc)
-{
- struct bch_folio *s = bch2_folio_create(folio, 0);
- unsigned nr_replicas = inode_nr_replicas(c, inode);
- struct disk_reservation disk_res = { 0 };
- unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0;
- int ret;
-
- if (!s)
- return -ENOMEM;
-
- for (i = 0; i < sectors; i++)
- disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas);
-
- if (!disk_res_sectors)
- return 0;
-
- ret = bch2_disk_reservation_get(c, &disk_res,
- disk_res_sectors, 1,
- !check_enospc
- ? BCH_DISK_RESERVATION_NOFAIL
- : 0);
- if (unlikely(ret))
- return ret;
-
- for (i = 0; i < sectors; i++)
- s->s[i].replicas_reserved +=
- sectors_to_reserve(&s->s[i], nr_replicas);
-
- return 0;
-}
-
-struct bch2_folio_reservation {
- struct disk_reservation disk;
- struct quota_res quota;
-};
-
-static void bch2_folio_reservation_init(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct bch2_folio_reservation *res)
-{
- memset(res, 0, sizeof(*res));
-
- res->disk.nr_replicas = inode_nr_replicas(c, inode);
-}
-
-static void bch2_folio_reservation_put(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct bch2_folio_reservation *res)
-{
- bch2_disk_reservation_put(c, &res->disk);
- bch2_quota_reservation_put(c, inode, &res->quota);
-}
-
-static int bch2_folio_reservation_get(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct folio *folio,
- struct bch2_folio_reservation *res,
- unsigned offset, unsigned len)
-{
- struct bch_folio *s = bch2_folio_create(folio, 0);
- unsigned i, disk_sectors = 0, quota_sectors = 0;
- int ret;
-
- if (!s)
- return -ENOMEM;
-
- BUG_ON(!s->uptodate);
-
- for (i = round_down(offset, block_bytes(c)) >> 9;
- i < round_up(offset + len, block_bytes(c)) >> 9;
- i++) {
- disk_sectors += sectors_to_reserve(&s->s[i],
- res->disk.nr_replicas);
- quota_sectors += s->s[i].state == SECTOR_unallocated;
- }
-
- if (disk_sectors) {
- ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0);
- if (unlikely(ret))
- return ret;
- }
-
- if (quota_sectors) {
- ret = bch2_quota_reservation_add(c, inode, &res->quota,
- quota_sectors, true);
- if (unlikely(ret)) {
- struct disk_reservation tmp = {
- .sectors = disk_sectors
- };
-
- bch2_disk_reservation_put(c, &tmp);
- res->disk.sectors -= disk_sectors;
- return ret;
- }
- }
-
- return 0;
-}
-
-static void bch2_clear_folio_bits(struct folio *folio)
-{
- struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_folio *s = bch2_folio(folio);
- struct disk_reservation disk_res = { 0 };
- int i, sectors = folio_sectors(folio), dirty_sectors = 0;
-
- if (!s)
- return;
-
- EBUG_ON(!folio_test_locked(folio));
- EBUG_ON(folio_test_writeback(folio));
-
- for (i = 0; i < sectors; i++) {
- disk_res.sectors += s->s[i].replicas_reserved;
- s->s[i].replicas_reserved = 0;
-
- dirty_sectors -= s->s[i].state == SECTOR_dirty;
- folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state));
- }
-
- bch2_disk_reservation_put(c, &disk_res);
-
- i_sectors_acct(c, inode, NULL, dirty_sectors);
-
- bch2_folio_release(folio);
-}
-
-static void bch2_set_folio_dirty(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct folio *folio,
- struct bch2_folio_reservation *res,
- unsigned offset, unsigned len)
-{
- struct bch_folio *s = bch2_folio(folio);
- unsigned i, dirty_sectors = 0;
-
- WARN_ON((u64) folio_pos(folio) + offset + len >
- round_up((u64) i_size_read(&inode->v), block_bytes(c)));
-
- BUG_ON(!s->uptodate);
-
- spin_lock(&s->lock);
-
- for (i = round_down(offset, block_bytes(c)) >> 9;
- i < round_up(offset + len, block_bytes(c)) >> 9;
- i++) {
- unsigned sectors = sectors_to_reserve(&s->s[i],
- res->disk.nr_replicas);
-
- /*
- * This can happen if we race with the error path in
- * bch2_writepage_io_done():
- */
- sectors = min_t(unsigned, sectors, res->disk.sectors);
-
- s->s[i].replicas_reserved += sectors;
- res->disk.sectors -= sectors;
-
- dirty_sectors += s->s[i].state == SECTOR_unallocated;
-
- folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state));
- }
-
- spin_unlock(&s->lock);
-
- i_sectors_acct(c, inode, &res->quota, dirty_sectors);
-
- if (!folio_test_dirty(folio))
- filemap_dirty_folio(inode->v.i_mapping, folio);
-}
-
-vm_fault_t bch2_page_fault(struct vm_fault *vmf)
-{
- struct file *file = vmf->vma->vm_file;
- struct address_space *mapping = file->f_mapping;
- struct address_space *fdm = faults_disabled_mapping();
- struct bch_inode_info *inode = file_bch_inode(file);
- vm_fault_t ret;
-
- if (fdm == mapping)
- return VM_FAULT_SIGBUS;
-
- /* Lock ordering: */
- if (fdm > mapping) {
- struct bch_inode_info *fdm_host = to_bch_ei(fdm->host);
-
- if (bch2_pagecache_add_tryget(inode))
- goto got_lock;
-
- bch2_pagecache_block_put(fdm_host);
-
- bch2_pagecache_add_get(inode);
- bch2_pagecache_add_put(inode);
-
- bch2_pagecache_block_get(fdm_host);
-
- /* Signal that lock has been dropped: */
- set_fdm_dropped_locks();
- return VM_FAULT_SIGBUS;
- }
-
- bch2_pagecache_add_get(inode);
-got_lock:
- ret = filemap_fault(vmf);
- bch2_pagecache_add_put(inode);
-
- return ret;
-}
-
-vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
-{
- struct folio *folio = page_folio(vmf->page);
- struct file *file = vmf->vma->vm_file;
- struct bch_inode_info *inode = file_bch_inode(file);
- struct address_space *mapping = file->f_mapping;
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch2_folio_reservation res;
- unsigned len;
- loff_t isize;
- vm_fault_t ret;
-
- bch2_folio_reservation_init(c, inode, &res);
-
- sb_start_pagefault(inode->v.i_sb);
- file_update_time(file);
-
- /*
- * Not strictly necessary, but helps avoid dio writes livelocking in
- * write_invalidate_inode_pages_range() - can drop this if/when we get
- * a write_invalidate_inode_pages_range() that works without dropping
- * page lock before invalidating page
- */
- bch2_pagecache_add_get(inode);
-
- folio_lock(folio);
- isize = i_size_read(&inode->v);
-
- if (folio->mapping != mapping || folio_pos(folio) >= isize) {
- folio_unlock(folio);
- ret = VM_FAULT_NOPAGE;
- goto out;
- }
-
- len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio));
-
- if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?:
- bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) {
- folio_unlock(folio);
- ret = VM_FAULT_SIGBUS;
- goto out;
- }
-
- bch2_set_folio_dirty(c, inode, folio, &res, 0, len);
- bch2_folio_reservation_put(c, inode, &res);
-
- folio_wait_stable(folio);
- ret = VM_FAULT_LOCKED;
-out:
- bch2_pagecache_add_put(inode);
- sb_end_pagefault(inode->v.i_sb);
-
- return ret;
-}
-
-void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length)
-{
- if (offset || length < folio_size(folio))
- return;
-
- bch2_clear_folio_bits(folio);
-}
-
-bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask)
-{
- if (folio_test_dirty(folio) || folio_test_writeback(folio))
- return false;
-
- bch2_clear_folio_bits(folio);
- return true;
-}
-
-/* readpage(s): */
-
-static void bch2_readpages_end_io(struct bio *bio)
-{
- struct folio_iter fi;
-
- bio_for_each_folio_all(fi, bio) {
- if (!bio->bi_status) {
- folio_mark_uptodate(fi.folio);
- } else {
- folio_clear_uptodate(fi.folio);
- folio_set_error(fi.folio);
- }
- folio_unlock(fi.folio);
- }
-
- bio_put(bio);
-}
-
-struct readpages_iter {
- struct address_space *mapping;
- unsigned idx;
- folios folios;
-};
-
-static int readpages_iter_init(struct readpages_iter *iter,
- struct readahead_control *ractl)
-{
- struct folio **fi;
- int ret;
-
- memset(iter, 0, sizeof(*iter));
-
- iter->mapping = ractl->mapping;
-
- ret = filemap_get_contig_folios_d(iter->mapping,
- ractl->_index << PAGE_SHIFT,
- (ractl->_index + ractl->_nr_pages) << PAGE_SHIFT,
- 0, mapping_gfp_mask(iter->mapping),
- &iter->folios);
- if (ret)
- return ret;
-
- darray_for_each(iter->folios, fi) {
- ractl->_nr_pages -= 1U << folio_order(*fi);
- __bch2_folio_create(*fi, __GFP_NOFAIL|GFP_KERNEL);
- folio_put(*fi);
- folio_put(*fi);
- }
-
- return 0;
-}
-
-static inline struct folio *readpage_iter_peek(struct readpages_iter *iter)
-{
- if (iter->idx >= iter->folios.nr)
- return NULL;
- return iter->folios.data[iter->idx];
-}
-
-static inline void readpage_iter_advance(struct readpages_iter *iter)
-{
- iter->idx++;
-}
-
-static bool extent_partial_reads_expensive(struct bkey_s_c k)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- struct bch_extent_crc_unpacked crc;
- const union bch_extent_entry *i;
-
- bkey_for_each_crc(k.k, ptrs, crc, i)
- if (crc.csum_type || crc.compression_type)
- return true;
- return false;
-}
-
-static int readpage_bio_extend(struct btree_trans *trans,
- struct readpages_iter *iter,
- struct bio *bio,
- unsigned sectors_this_extent,
- bool get_more)
-{
- /* Don't hold btree locks while allocating memory: */
- bch2_trans_unlock(trans);
-
- while (bio_sectors(bio) < sectors_this_extent &&
- bio->bi_vcnt < bio->bi_max_vecs) {
- struct folio *folio = readpage_iter_peek(iter);
- int ret;
-
- if (folio) {
- readpage_iter_advance(iter);
- } else {
- pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT;
-
- if (!get_more)
- break;
-
- folio = xa_load(&iter->mapping->i_pages, folio_offset);
- if (folio && !xa_is_value(folio))
- break;
-
- folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0);
- if (!folio)
- break;
-
- if (!__bch2_folio_create(folio, GFP_KERNEL)) {
- folio_put(folio);
- break;
- }
-
- ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL);
- if (ret) {
- __bch2_folio_release(folio);
- folio_put(folio);
- break;
- }
-
- folio_put(folio);
- }
-
- BUG_ON(folio_sector(folio) != bio_end_sector(bio));
-
- BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0));
- }
-
- return bch2_trans_relock(trans);
-}
-
-static void bchfs_read(struct btree_trans *trans,
- struct bch_read_bio *rbio,
- subvol_inum inum,
- struct readpages_iter *readpages_iter)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_buf sk;
- int flags = BCH_READ_RETRY_IF_STALE|
- BCH_READ_MAY_PROMOTE;
- u32 snapshot;
- int ret = 0;
-
- rbio->c = c;
- rbio->start_time = local_clock();
- rbio->subvol = inum.subvol;
-
- bch2_bkey_buf_init(&sk);
-retry:
- bch2_trans_begin(trans);
- iter = (struct btree_iter) { NULL };
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot),
- BTREE_ITER_SLOTS);
- while (1) {
- struct bkey_s_c k;
- unsigned bytes, sectors, offset_into_extent;
- enum btree_id data_btree = BTREE_ID_extents;
-
- /*
- * read_extent -> io_time_reset may cause a transaction restart
- * without returning an error, we need to check for that here:
- */
- ret = bch2_trans_relock(trans);
- if (ret)
- break;
-
- bch2_btree_iter_set_pos(&iter,
- POS(inum.inum, rbio->bio.bi_iter.bi_sector));
-
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
- if (ret)
- break;
-
- offset_into_extent = iter.pos.offset -
- bkey_start_offset(k.k);
- sectors = k.k->size - offset_into_extent;
-
- bch2_bkey_buf_reassemble(&sk, c, k);
-
- ret = bch2_read_indirect_extent(trans, &data_btree,
- &offset_into_extent, &sk);
- if (ret)
- break;
-
- k = bkey_i_to_s_c(sk.k);
-
- sectors = min(sectors, k.k->size - offset_into_extent);
-
- if (readpages_iter) {
- ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors,
- extent_partial_reads_expensive(k));
- if (ret)
- break;
- }
-
- bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
- swap(rbio->bio.bi_iter.bi_size, bytes);
-
- if (rbio->bio.bi_iter.bi_size == bytes)
- flags |= BCH_READ_LAST_FRAGMENT;
-
- bch2_bio_page_state_set(&rbio->bio, k);
-
- bch2_read_extent(trans, rbio, iter.pos,
- data_btree, k, offset_into_extent, flags);
-
- if (flags & BCH_READ_LAST_FRAGMENT)
- break;
-
- swap(rbio->bio.bi_iter.bi_size, bytes);
- bio_advance(&rbio->bio, bytes);
-
- ret = btree_trans_too_many_iters(trans);
- if (ret)
- break;
- }
-err:
- bch2_trans_iter_exit(trans, &iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- if (ret) {
- bch_err_inum_offset_ratelimited(c,
- iter.pos.inode,
- iter.pos.offset << 9,
- "read error %i from btree lookup", ret);
- rbio->bio.bi_status = BLK_STS_IOERR;
- bio_endio(&rbio->bio);
- }
-
- bch2_bkey_buf_exit(&sk, c);
-}
-
-void bch2_readahead(struct readahead_control *ractl)
-{
- struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_io_opts opts;
- struct btree_trans trans;
- struct folio *folio;
- struct readpages_iter readpages_iter;
- int ret;
-
- bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
- ret = readpages_iter_init(&readpages_iter, ractl);
- BUG_ON(ret);
-
- bch2_trans_init(&trans, c, 0, 0);
-
- bch2_pagecache_add_get(inode);
-
- while ((folio = readpage_iter_peek(&readpages_iter))) {
- unsigned n = min_t(unsigned,
- readpages_iter.folios.nr -
- readpages_iter.idx,
- BIO_MAX_VECS);
- struct bch_read_bio *rbio =
- rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ,
- GFP_KERNEL, &c->bio_read),
- opts);
-
- readpage_iter_advance(&readpages_iter);
-
- rbio->bio.bi_iter.bi_sector = folio_sector(folio);
- rbio->bio.bi_end_io = bch2_readpages_end_io;
- BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
-
- bchfs_read(&trans, rbio, inode_inum(inode),
- &readpages_iter);
- bch2_trans_unlock(&trans);
- }
-
- bch2_pagecache_add_put(inode);
-
- bch2_trans_exit(&trans);
- darray_exit(&readpages_iter.folios);
-}
-
-static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio,
- subvol_inum inum, struct folio *folio)
-{
- struct btree_trans trans;
-
- bch2_folio_create(folio, __GFP_NOFAIL);
-
- rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
- rbio->bio.bi_iter.bi_sector = folio_sector(folio);
- BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
-
- bch2_trans_init(&trans, c, 0, 0);
- bchfs_read(&trans, rbio, inum, NULL);
- bch2_trans_exit(&trans);
-}
-
-static void bch2_read_single_folio_end_io(struct bio *bio)
-{
- complete(bio->bi_private);
-}
-
-static int bch2_read_single_folio(struct folio *folio,
- struct address_space *mapping)
-{
- struct bch_inode_info *inode = to_bch_ei(mapping->host);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_read_bio *rbio;
- struct bch_io_opts opts;
- int ret;
- DECLARE_COMPLETION_ONSTACK(done);
-
- bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
- rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read),
- opts);
- rbio->bio.bi_private = &done;
- rbio->bio.bi_end_io = bch2_read_single_folio_end_io;
-
- __bchfs_readfolio(c, rbio, inode_inum(inode), folio);
- wait_for_completion(&done);
-
- ret = blk_status_to_errno(rbio->bio.bi_status);
- bio_put(&rbio->bio);
-
- if (ret < 0)
- return ret;
-
- folio_mark_uptodate(folio);
- return 0;
-}
-
-int bch2_read_folio(struct file *file, struct folio *folio)
-{
- int ret;
-
- ret = bch2_read_single_folio(folio, folio->mapping);
- folio_unlock(folio);
- return bch2_err_class(ret);
-}
-
-/* writepages: */
-
-struct bch_writepage_state {
- struct bch_writepage_io *io;
- struct bch_io_opts opts;
- struct bch_folio_sector *tmp;
- unsigned tmp_sectors;
-};
-
-static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c,
- struct bch_inode_info *inode)
-{
- struct bch_writepage_state ret = { 0 };
-
- bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode);
- return ret;
-}
-
-static void bch2_writepage_io_done(struct bch_write_op *op)
-{
- struct bch_writepage_io *io =
- container_of(op, struct bch_writepage_io, op);
- struct bch_fs *c = io->op.c;
- struct bio *bio = &io->op.wbio.bio;
- struct folio_iter fi;
- unsigned i;
-
- if (io->op.error) {
- set_bit(EI_INODE_ERROR, &io->inode->ei_flags);
-
- bio_for_each_folio_all(fi, bio) {
- struct bch_folio *s;
-
- folio_set_error(fi.folio);
- mapping_set_error(fi.folio->mapping, -EIO);
-
- s = __bch2_folio(fi.folio);
- spin_lock(&s->lock);
- for (i = 0; i < folio_sectors(fi.folio); i++)
- s->s[i].nr_replicas = 0;
- spin_unlock(&s->lock);
- }
- }
-
- if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
- bio_for_each_folio_all(fi, bio) {
- struct bch_folio *s;
-
- s = __bch2_folio(fi.folio);
- spin_lock(&s->lock);
- for (i = 0; i < folio_sectors(fi.folio); i++)
- s->s[i].nr_replicas = 0;
- spin_unlock(&s->lock);
- }
- }
-
- /*
- * racing with fallocate can cause us to add fewer sectors than
- * expected - but we shouldn't add more sectors than expected:
- */
- WARN_ON_ONCE(io->op.i_sectors_delta > 0);
-
- /*
- * (error (due to going RO) halfway through a page can screw that up
- * slightly)
- * XXX wtf?
- BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS);
- */
-
- /*
- * PageWriteback is effectively our ref on the inode - fixup i_blocks
- * before calling end_page_writeback:
- */
- i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
-
- bio_for_each_folio_all(fi, bio) {
- struct bch_folio *s = __bch2_folio(fi.folio);
-
- if (atomic_dec_and_test(&s->write_count))
- folio_end_writeback(fi.folio);
- }
-
- bio_put(&io->op.wbio.bio);
-}
-
-static void bch2_writepage_do_io(struct bch_writepage_state *w)
-{
- struct bch_writepage_io *io = w->io;
-
- w->io = NULL;
- closure_call(&io->op.cl, bch2_write, NULL, NULL);
-}
-
-/*
- * Get a bch_writepage_io and add @page to it - appending to an existing one if
- * possible, else allocating a new one:
- */
-static void bch2_writepage_io_alloc(struct bch_fs *c,
- struct writeback_control *wbc,
- struct bch_writepage_state *w,
- struct bch_inode_info *inode,
- u64 sector,
- unsigned nr_replicas)
-{
- struct bch_write_op *op;
-
- w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS,
- REQ_OP_WRITE,
- GFP_KERNEL,
- &c->writepage_bioset),
- struct bch_writepage_io, op.wbio.bio);
-
- w->io->inode = inode;
- op = &w->io->op;
- bch2_write_op_init(op, c, w->opts);
- op->target = w->opts.foreground_target;
- op->nr_replicas = nr_replicas;
- op->res.nr_replicas = nr_replicas;
- op->write_point = writepoint_hashed(inode->ei_last_dirtied);
- op->subvol = inode->ei_subvol;
- op->pos = POS(inode->v.i_ino, sector);
- op->end_io = bch2_writepage_io_done;
- op->devs_need_flush = &inode->ei_devs_need_flush;
- op->wbio.bio.bi_iter.bi_sector = sector;
- op->wbio.bio.bi_opf = wbc_to_write_flags(wbc);
-}
-
-static int __bch2_writepage(struct folio *folio,
- struct writeback_control *wbc,
- void *data)
-{
- struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_writepage_state *w = data;
- struct bch_folio *s;
- unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX;
- loff_t i_size = i_size_read(&inode->v);
- int ret;
-
- EBUG_ON(!folio_test_uptodate(folio));
-
- /* Is the folio fully inside i_size? */
- if (folio_end_pos(folio) <= i_size)
- goto do_io;
-
- /* Is the folio fully outside i_size? (truncate in progress) */
- if (folio_pos(folio) >= i_size) {
- folio_unlock(folio);
- return 0;
- }
-
- /*
- * The folio straddles i_size. It must be zeroed out on each and every
- * writepage invocation because it may be mmapped. "A file is mapped
- * in multiples of the folio size. For a file that is not a multiple of
- * the folio size, the remaining memory is zeroed when mapped, and
- * writes to that region are not written out to the file."
- */
- folio_zero_segment(folio,
- i_size - folio_pos(folio),
- folio_size(folio));
-do_io:
- f_sectors = folio_sectors(folio);
- s = bch2_folio(folio);
-
- if (f_sectors > w->tmp_sectors) {
- kfree(w->tmp);
- w->tmp = kzalloc(sizeof(struct bch_folio_sector) *
- f_sectors, __GFP_NOFAIL);
- w->tmp_sectors = f_sectors;
- }
-
- /*
- * Things get really hairy with errors during writeback:
- */
- ret = bch2_get_folio_disk_reservation(c, inode, folio, false);
- BUG_ON(ret);
-
- /* Before unlocking the page, get copy of reservations: */
- spin_lock(&s->lock);
- memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors);
-
- for (i = 0; i < f_sectors; i++) {
- if (s->s[i].state < SECTOR_dirty)
- continue;
-
- nr_replicas_this_write =
- min_t(unsigned, nr_replicas_this_write,
- s->s[i].nr_replicas +
- s->s[i].replicas_reserved);
- }
-
- for (i = 0; i < f_sectors; i++) {
- if (s->s[i].state < SECTOR_dirty)
- continue;
-
- s->s[i].nr_replicas = w->opts.compression
- ? 0 : nr_replicas_this_write;
-
- s->s[i].replicas_reserved = 0;
- folio_sector_set(folio, s, i, SECTOR_allocated);
- }
- spin_unlock(&s->lock);
-
- BUG_ON(atomic_read(&s->write_count));
- atomic_set(&s->write_count, 1);
-
- BUG_ON(folio_test_writeback(folio));
- folio_start_writeback(folio);
-
- folio_unlock(folio);
-
- offset = 0;
- while (1) {
- unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0;
- u64 sector;
-
- while (offset < f_sectors &&
- w->tmp[offset].state < SECTOR_dirty)
- offset++;
-
- if (offset == f_sectors)
- break;
-
- while (offset + sectors < f_sectors &&
- w->tmp[offset + sectors].state >= SECTOR_dirty) {
- reserved_sectors += w->tmp[offset + sectors].replicas_reserved;
- dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty;
- sectors++;
- }
- BUG_ON(!sectors);
-
- sector = folio_sector(folio) + offset;
-
- if (w->io &&
- (w->io->op.res.nr_replicas != nr_replicas_this_write ||
- bio_full(&w->io->op.wbio.bio, sectors << 9) ||
- w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >=
- (BIO_MAX_VECS * PAGE_SIZE) ||
- bio_end_sector(&w->io->op.wbio.bio) != sector))
- bch2_writepage_do_io(w);
-
- if (!w->io)
- bch2_writepage_io_alloc(c, wbc, w, inode, sector,
- nr_replicas_this_write);
-
- atomic_inc(&s->write_count);
-
- BUG_ON(inode != w->io->inode);
- BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio,
- sectors << 9, offset << 9));
-
- /* Check for writing past i_size: */
- WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) >
- round_up(i_size, block_bytes(c)) &&
- !test_bit(BCH_FS_EMERGENCY_RO, &c->flags),
- "writing past i_size: %llu > %llu (unrounded %llu)\n",
- bio_end_sector(&w->io->op.wbio.bio) << 9,
- round_up(i_size, block_bytes(c)),
- i_size);
-
- w->io->op.res.sectors += reserved_sectors;
- w->io->op.i_sectors_delta -= dirty_sectors;
- w->io->op.new_i_size = i_size;
-
- offset += sectors;
- }
-
- if (atomic_dec_and_test(&s->write_count))
- folio_end_writeback(folio);
-
- return 0;
-}
-
-int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
-{
- struct bch_fs *c = mapping->host->i_sb->s_fs_info;
- struct bch_writepage_state w =
- bch_writepage_state_init(c, to_bch_ei(mapping->host));
- struct blk_plug plug;
- int ret;
-
- blk_start_plug(&plug);
- ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w);
- if (w.io)
- bch2_writepage_do_io(&w);
- blk_finish_plug(&plug);
- kfree(w.tmp);
- return bch2_err_class(ret);
-}
-
-/* buffered writes: */
-
-int bch2_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct page **pagep, void **fsdata)
-{
- struct bch_inode_info *inode = to_bch_ei(mapping->host);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch2_folio_reservation *res;
- struct folio *folio;
- unsigned offset;
- int ret = -ENOMEM;
-
- res = kmalloc(sizeof(*res), GFP_KERNEL);
- if (!res)
- return -ENOMEM;
-
- bch2_folio_reservation_init(c, inode, res);
- *fsdata = res;
-
- bch2_pagecache_add_get(inode);
-
- folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT,
- FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE,
- mapping_gfp_mask(mapping));
- if (IS_ERR_OR_NULL(folio))
- goto err_unlock;
-
- if (folio_test_uptodate(folio))
- goto out;
-
- offset = pos - folio_pos(folio);
- len = min_t(size_t, len, folio_end_pos(folio) - pos);
-
- /* If we're writing entire folio, don't need to read it in first: */
- if (!offset && len == folio_size(folio))
- goto out;
-
- if (!offset && pos + len >= inode->v.i_size) {
- folio_zero_segment(folio, len, folio_size(folio));
- flush_dcache_folio(folio);
- goto out;
- }
-
- if (folio_pos(folio) >= inode->v.i_size) {
- folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio));
- flush_dcache_folio(folio);
- goto out;
- }
-readpage:
- ret = bch2_read_single_folio(folio, mapping);
- if (ret)
- goto err;
-out:
- ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
- if (ret)
- goto err;
-
- ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len);
- if (ret) {
- if (!folio_test_uptodate(folio)) {
- /*
- * If the folio hasn't been read in, we won't know if we
- * actually need a reservation - we don't actually need
- * to read here, we just need to check if the folio is
- * fully backed by uncompressed data:
- */
- goto readpage;
- }
-
- goto err;
- }
-
- *pagep = &folio->page;
- return 0;
-err:
- folio_unlock(folio);
- folio_put(folio);
- *pagep = NULL;
-err_unlock:
- bch2_pagecache_add_put(inode);
- kfree(res);
- *fsdata = NULL;
- return bch2_err_class(ret);
-}
-
-int bch2_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- struct bch_inode_info *inode = to_bch_ei(mapping->host);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch2_folio_reservation *res = fsdata;
- struct folio *folio = page_folio(page);
- unsigned offset = pos - folio_pos(folio);
-
- lockdep_assert_held(&inode->v.i_rwsem);
- BUG_ON(offset + copied > folio_size(folio));
-
- if (unlikely(copied < len && !folio_test_uptodate(folio))) {
- /*
- * The folio needs to be read in, but that would destroy
- * our partial write - simplest thing is to just force
- * userspace to redo the write:
- */
- folio_zero_range(folio, 0, folio_size(folio));
- flush_dcache_folio(folio);
- copied = 0;
- }
-
- spin_lock(&inode->v.i_lock);
- if (pos + copied > inode->v.i_size)
- i_size_write(&inode->v, pos + copied);
- spin_unlock(&inode->v.i_lock);
-
- if (copied) {
- if (!folio_test_uptodate(folio))
- folio_mark_uptodate(folio);
-
- bch2_set_folio_dirty(c, inode, folio, res, offset, copied);
-
- inode->ei_last_dirtied = (unsigned long) current;
- }
-
- folio_unlock(folio);
- folio_put(folio);
- bch2_pagecache_add_put(inode);
-
- bch2_folio_reservation_put(c, inode, res);
- kfree(res);
-
- return copied;
-}
-
-static noinline void folios_trunc(folios *folios, struct folio **fi)
-{
- while (folios->data + folios->nr > fi) {
- struct folio *f = darray_pop(folios);
-
- folio_unlock(f);
- folio_put(f);
- }
-}
-
-static int __bch2_buffered_write(struct bch_inode_info *inode,
- struct address_space *mapping,
- struct iov_iter *iter,
- loff_t pos, unsigned len)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch2_folio_reservation res;
- folios folios;
- struct folio **fi, *f;
- unsigned copied = 0, f_offset;
- u64 end = pos + len, f_pos;
- loff_t last_folio_pos = inode->v.i_size;
- int ret = 0;
-
- BUG_ON(!len);
-
- bch2_folio_reservation_init(c, inode, &res);
- darray_init(&folios);
-
- ret = filemap_get_contig_folios_d(mapping, pos, end,
- FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT,
- mapping_gfp_mask(mapping),
- &folios);
- if (ret)
- goto out;
-
- BUG_ON(!folios.nr);
-
- f = darray_first(folios);
- if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
- ret = bch2_read_single_folio(f, mapping);
- if (ret)
- goto out;
- }
-
- f = darray_last(folios);
- end = min(end, folio_end_pos(f));
- last_folio_pos = folio_pos(f);
- if (end != folio_end_pos(f) && !folio_test_uptodate(f)) {
- if (end >= inode->v.i_size) {
- folio_zero_range(f, 0, folio_size(f));
- } else {
- ret = bch2_read_single_folio(f, mapping);
- if (ret)
- goto out;
- }
- }
-
- ret = bch2_folio_set(c, inode_inum(inode), folios.data, folios.nr);
- if (ret)
- goto out;
-
- f_pos = pos;
- f_offset = pos - folio_pos(darray_first(folios));
- darray_for_each(folios, fi) {
- struct folio *f = *fi;
- u64 f_len = min(end, folio_end_pos(f)) - f_pos;
-
- /*
- * XXX: per POSIX and fstests generic/275, on -ENOSPC we're
- * supposed to write as much as we have disk space for.
- *
- * On failure here we should still write out a partial page if
- * we aren't completely out of disk space - we don't do that
- * yet:
- */
- ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len);
- if (unlikely(ret)) {
- folios_trunc(&folios, fi);
- if (!folios.nr)
- goto out;
-
- end = min(end, folio_end_pos(darray_last(folios)));
- break;
- }
-
- f_pos = folio_end_pos(f);
- f_offset = 0;
- }
-
- if (mapping_writably_mapped(mapping))
- darray_for_each(folios, fi)
- flush_dcache_folio(*fi);
-
- f_pos = pos;
- f_offset = pos - folio_pos(darray_first(folios));
- darray_for_each(folios, fi) {
- struct folio *f = *fi;
- u64 f_len = min(end, folio_end_pos(f)) - f_pos;
- unsigned f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter);
-
- if (!f_copied) {
- folios_trunc(&folios, fi);
- break;
- }
-
- if (!folio_test_uptodate(f) &&
- f_copied != folio_size(f) &&
- pos + copied + f_copied < inode->v.i_size) {
- folio_zero_range(f, 0, folio_size(f));
- folios_trunc(&folios, fi);
- break;
- }
-
- flush_dcache_folio(f);
- copied += f_copied;
-
- if (f_copied != f_len) {
- folios_trunc(&folios, fi + 1);
- break;
- }
-
- f_pos = folio_end_pos(f);
- f_offset = 0;
- }
-
- if (!copied)
- goto out;
-
- end = pos + copied;
-
- spin_lock(&inode->v.i_lock);
- if (end > inode->v.i_size)
- i_size_write(&inode->v, end);
- spin_unlock(&inode->v.i_lock);
-
- f_pos = pos;
- f_offset = pos - folio_pos(darray_first(folios));
- darray_for_each(folios, fi) {
- struct folio *f = *fi;
- u64 f_len = min(end, folio_end_pos(f)) - f_pos;
-
- if (!folio_test_uptodate(f))
- folio_mark_uptodate(f);
-
- bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len);
-
- f_pos = folio_end_pos(f);
- f_offset = 0;
- }
-
- inode->ei_last_dirtied = (unsigned long) current;
-out:
- darray_for_each(folios, fi) {
- folio_unlock(*fi);
- folio_put(*fi);
- }
-
- /*
- * If the last folio added to the mapping starts beyond current EOF, we
- * performed a short write but left around at least one post-EOF folio.
- * Clean up the mapping before we return.
- */
- if (last_folio_pos >= inode->v.i_size)
- truncate_pagecache(&inode->v, inode->v.i_size);
-
- darray_exit(&folios);
- bch2_folio_reservation_put(c, inode, &res);
-
- return copied ?: ret;
-}
-
-static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
-{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct bch_inode_info *inode = file_bch_inode(file);
- loff_t pos = iocb->ki_pos;
- ssize_t written = 0;
- int ret = 0;
-
- bch2_pagecache_add_get(inode);
-
- do {
- unsigned offset = pos & (PAGE_SIZE - 1);
- unsigned bytes = iov_iter_count(iter);
-again:
- /*
- * Bring in the user page that we will copy from _first_.
- * Otherwise there's a nasty deadlock on copying from the
- * same page as we're writing to, without it being marked
- * up-to-date.
- *
- * Not only is this an optimisation, but it is also required
- * to check that the address is actually valid, when atomic
- * usercopies are used, below.
- */
- if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
- bytes = min_t(unsigned long, iov_iter_count(iter),
- PAGE_SIZE - offset);
-
- if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
- ret = -EFAULT;
- break;
- }
- }
-
- if (unlikely(fatal_signal_pending(current))) {
- ret = -EINTR;
- break;
- }
-
- ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
- if (unlikely(ret < 0))
- break;
-
- cond_resched();
-
- if (unlikely(ret == 0)) {
- /*
- * If we were unable to copy any data at all, we must
- * fall back to a single segment length write.
- *
- * If we didn't fallback here, we could livelock
- * because not all segments in the iov can be copied at
- * once without a pagefault.
- */
- bytes = min_t(unsigned long, PAGE_SIZE - offset,
- iov_iter_single_seg_count(iter));
- goto again;
- }
- pos += ret;
- written += ret;
- ret = 0;
-
- balance_dirty_pages_ratelimited(mapping);
- } while (iov_iter_count(iter));
-
- bch2_pagecache_add_put(inode);
-
- return written ? written : ret;
-}
-
-/* O_DIRECT reads */
-
-static void bio_check_or_release(struct bio *bio, bool check_dirty)
-{
- if (check_dirty) {
- bio_check_pages_dirty(bio);
- } else {
- bio_release_pages(bio, false);
- bio_put(bio);
- }
-}
-
-static void bch2_dio_read_complete(struct closure *cl)
-{
- struct dio_read *dio = container_of(cl, struct dio_read, cl);
-
- dio->req->ki_complete(dio->req, dio->ret);
- bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
-}
-
-static void bch2_direct_IO_read_endio(struct bio *bio)
-{
- struct dio_read *dio = bio->bi_private;
-
- if (bio->bi_status)
- dio->ret = blk_status_to_errno(bio->bi_status);
-
- closure_put(&dio->cl);
-}
-
-static void bch2_direct_IO_read_split_endio(struct bio *bio)
-{
- struct dio_read *dio = bio->bi_private;
- bool should_dirty = dio->should_dirty;
-
- bch2_direct_IO_read_endio(bio);
- bio_check_or_release(bio, should_dirty);
-}
-
-static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
-{
- struct file *file = req->ki_filp;
- struct bch_inode_info *inode = file_bch_inode(file);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct bch_io_opts opts;
- struct dio_read *dio;
- struct bio *bio;
- loff_t offset = req->ki_pos;
- bool sync = is_sync_kiocb(req);
- size_t shorten;
- ssize_t ret;
-
- bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
- if ((offset|iter->count) & (block_bytes(c) - 1))
- return -EINVAL;
-
- ret = min_t(loff_t, iter->count,
- max_t(loff_t, 0, i_size_read(&inode->v) - offset));
-
- if (!ret)
- return ret;
-
- shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c));
- iter->count -= shorten;
-
- bio = bio_alloc_bioset(NULL,
- bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
- REQ_OP_READ,
- GFP_KERNEL,
- &c->dio_read_bioset);
-
- bio->bi_end_io = bch2_direct_IO_read_endio;
-
- dio = container_of(bio, struct dio_read, rbio.bio);
- closure_init(&dio->cl, NULL);
-
- /*
- * this is a _really_ horrible hack just to avoid an atomic sub at the
- * end:
- */
- if (!sync) {
- set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL);
- atomic_set(&dio->cl.remaining,
- CLOSURE_REMAINING_INITIALIZER -
- CLOSURE_RUNNING +
- CLOSURE_DESTRUCTOR);
- } else {
- atomic_set(&dio->cl.remaining,
- CLOSURE_REMAINING_INITIALIZER + 1);
- }
-
- dio->req = req;
- dio->ret = ret;
- /*
- * This is one of the sketchier things I've encountered: we have to skip
- * the dirtying of requests that are internal from the kernel (i.e. from
- * loopback), because we'll deadlock on page_lock.
- */
- dio->should_dirty = iter_is_iovec(iter);
-
- goto start;
- while (iter->count) {
- bio = bio_alloc_bioset(NULL,
- bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
- REQ_OP_READ,
- GFP_KERNEL,
- &c->bio_read);
- bio->bi_end_io = bch2_direct_IO_read_split_endio;
-start:
- bio->bi_opf = REQ_OP_READ|REQ_SYNC;
- bio->bi_iter.bi_sector = offset >> 9;
- bio->bi_private = dio;
-
- ret = bio_iov_iter_get_pages(bio, iter);
- if (ret < 0) {
- /* XXX: fault inject this path */
- bio->bi_status = BLK_STS_RESOURCE;
- bio_endio(bio);
- break;
- }
-
- offset += bio->bi_iter.bi_size;
-
- if (dio->should_dirty)
- bio_set_pages_dirty(bio);
-
- if (iter->count)
- closure_get(&dio->cl);
-
- bch2_read(c, rbio_init(bio, opts), inode_inum(inode));
- }
-
- iter->count += shorten;
-
- if (sync) {
- closure_sync(&dio->cl);
- closure_debug_destroy(&dio->cl);
- ret = dio->ret;
- bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
- return ret;
- } else {
- return -EIOCBQUEUED;
- }
-}
-
-ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
-{
- struct file *file = iocb->ki_filp;
- struct bch_inode_info *inode = file_bch_inode(file);
- struct address_space *mapping = file->f_mapping;
- size_t count = iov_iter_count(iter);
- ssize_t ret;
-
- if (!count)
- return 0; /* skip atime */
-
- if (iocb->ki_flags & IOCB_DIRECT) {
- struct blk_plug plug;
-
- if (unlikely(mapping->nrpages)) {
- ret = filemap_write_and_wait_range(mapping,
- iocb->ki_pos,
- iocb->ki_pos + count - 1);
- if (ret < 0)
- goto out;
- }
-
- file_accessed(file);
-
- blk_start_plug(&plug);
- ret = bch2_direct_IO_read(iocb, iter);
- blk_finish_plug(&plug);
-
- if (ret >= 0)
- iocb->ki_pos += ret;
- } else {
- bch2_pagecache_add_get(inode);
- ret = generic_file_read_iter(iocb, iter);
- bch2_pagecache_add_put(inode);
- }
-out:
- return bch2_err_class(ret);
-}
-
-/* O_DIRECT writes */
-
-static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum,
- u64 offset, u64 size,
- unsigned nr_replicas, bool compressed)
-{
- struct btree_trans trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- u64 end = offset + size;
- u32 snapshot;
- bool ret = true;
- int err;
-
- bch2_trans_init(&trans, c, 0, 0);
-retry:
- bch2_trans_begin(&trans);
-
- err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
- if (err)
- goto err;
-
- for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
- SPOS(inum.inum, offset, snapshot),
- BTREE_ITER_SLOTS, k, err) {
- if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end)))
- break;
-
- if (k.k->p.snapshot != snapshot ||
- nr_replicas > bch2_bkey_replicas(c, k) ||
- (!compressed && bch2_bkey_sectors_compressed(k))) {
- ret = false;
- break;
- }
- }
-
- offset = iter.pos.offset;
- bch2_trans_iter_exit(&trans, &iter);
-err:
- if (bch2_err_matches(err, BCH_ERR_transaction_restart))
- goto retry;
- bch2_trans_exit(&trans);
-
- return err ? false : ret;
-}
-
-static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio)
-{
- struct bch_fs *c = dio->op.c;
- struct bch_inode_info *inode = dio->inode;
- struct bio *bio = &dio->op.wbio.bio;
-
- return bch2_check_range_allocated(c, inode_inum(inode),
- dio->op.pos.offset, bio_sectors(bio),
- dio->op.opts.data_replicas,
- dio->op.opts.compression != 0);
-}
-
-static void bch2_dio_write_loop_async(struct bch_write_op *);
-static __always_inline long bch2_dio_write_done(struct dio_write *dio);
-
-/*
- * We're going to return -EIOCBQUEUED, but we haven't finished consuming the
- * iov_iter yet, so we need to stash a copy of the iovec: it might be on the
- * caller's stack, we're not guaranteed that it will live for the duration of
- * the IO:
- */
-static noinline int bch2_dio_write_copy_iov(struct dio_write *dio)
-{
- struct iovec *iov = dio->inline_vecs;
-
- /*
- * iov_iter has a single embedded iovec - nothing to do:
- */
- if (iter_is_ubuf(&dio->iter))
- return 0;
-
- /*
- * We don't currently handle non-iovec iov_iters here - return an error,
- * and we'll fall back to doing the IO synchronously:
- */
- if (!iter_is_iovec(&dio->iter))
- return -1;
-
- if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
- iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov),
- GFP_KERNEL);
- if (unlikely(!iov))
- return -ENOMEM;
-
- dio->free_iov = true;
- }
-
- memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov));
- dio->iter.__iov = iov;
- return 0;
-}
-
-static void bch2_dio_write_flush_done(struct closure *cl)
-{
- struct dio_write *dio = container_of(cl, struct dio_write, op.cl);
- struct bch_fs *c = dio->op.c;
-
- closure_debug_destroy(cl);
-
- dio->op.error = bch2_journal_error(&c->journal);
-
- bch2_dio_write_done(dio);
-}
-
-static noinline void bch2_dio_write_flush(struct dio_write *dio)
-{
- struct bch_fs *c = dio->op.c;
- struct bch_inode_unpacked inode;
- int ret;
-
- dio->flush = 0;
-
- closure_init(&dio->op.cl, NULL);
-
- if (!dio->op.error) {
- ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode);
- if (ret) {
- dio->op.error = ret;
- } else {
- bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, &dio->op.cl);
- bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl);
- }
- }
-
- if (dio->sync) {
- closure_sync(&dio->op.cl);
- closure_debug_destroy(&dio->op.cl);
- } else {
- continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL);
- }
-}
-
-static __always_inline long bch2_dio_write_done(struct dio_write *dio)
-{
- struct kiocb *req = dio->req;
- struct bch_inode_info *inode = dio->inode;
- bool sync = dio->sync;
- long ret;
-
- if (unlikely(dio->flush)) {
- bch2_dio_write_flush(dio);
- if (!sync)
- return -EIOCBQUEUED;
- }
-
- bch2_pagecache_block_put(inode);
-
- if (dio->free_iov)
- kfree(dio->iter.__iov);
-
- ret = dio->op.error ?: ((long) dio->written << 9);
- bio_put(&dio->op.wbio.bio);
-
- /* inode->i_dio_count is our ref on inode and thus bch_fs */
- inode_dio_end(&inode->v);
-
- if (ret < 0)
- ret = bch2_err_class(ret);
-
- if (!sync) {
- req->ki_complete(req, ret);
- ret = -EIOCBQUEUED;
- }
- return ret;
-}
-
-static __always_inline void bch2_dio_write_end(struct dio_write *dio)
-{
- struct bch_fs *c = dio->op.c;
- struct kiocb *req = dio->req;
- struct bch_inode_info *inode = dio->inode;
- struct bio *bio = &dio->op.wbio.bio;
-
- req->ki_pos += (u64) dio->op.written << 9;
- dio->written += dio->op.written;
-
- if (dio->extending) {
- spin_lock(&inode->v.i_lock);
- if (req->ki_pos > inode->v.i_size)
- i_size_write(&inode->v, req->ki_pos);
- spin_unlock(&inode->v.i_lock);
- }
-
- if (dio->op.i_sectors_delta || dio->quota_res.sectors) {
- mutex_lock(&inode->ei_quota_lock);
- __i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta);
- __bch2_quota_reservation_put(c, inode, &dio->quota_res);
- mutex_unlock(&inode->ei_quota_lock);
- }
-
- bio_release_pages(bio, false);
-
- if (unlikely(dio->op.error))
- set_bit(EI_INODE_ERROR, &inode->ei_flags);
-}
-
-static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
-{
- struct bch_fs *c = dio->op.c;
- struct kiocb *req = dio->req;
- struct address_space *mapping = dio->mapping;
- struct bch_inode_info *inode = dio->inode;
- struct bch_io_opts opts;
- struct bio *bio = &dio->op.wbio.bio;
- unsigned unaligned, iter_count;
- bool sync = dio->sync, dropped_locks;
- long ret;
-
- bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
- while (1) {
- iter_count = dio->iter.count;
-
- EBUG_ON(current->faults_disabled_mapping);
- current->faults_disabled_mapping = mapping;
-
- ret = bio_iov_iter_get_pages(bio, &dio->iter);
-
- dropped_locks = fdm_dropped_locks();
-
- current->faults_disabled_mapping = NULL;
-
- /*
- * If the fault handler returned an error but also signalled
- * that it dropped & retook ei_pagecache_lock, we just need to
- * re-shoot down the page cache and retry:
- */
- if (dropped_locks && ret)
- ret = 0;
-
- if (unlikely(ret < 0))
- goto err;
-
- if (unlikely(dropped_locks)) {
- ret = write_invalidate_inode_pages_range(mapping,
- req->ki_pos,
- req->ki_pos + iter_count - 1);
- if (unlikely(ret))
- goto err;
-
- if (!bio->bi_iter.bi_size)
- continue;
- }
-
- unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1);
- bio->bi_iter.bi_size -= unaligned;
- iov_iter_revert(&dio->iter, unaligned);
-
- if (!bio->bi_iter.bi_size) {
- /*
- * bio_iov_iter_get_pages was only able to get <
- * blocksize worth of pages:
- */
- ret = -EFAULT;
- goto err;
- }
-
- bch2_write_op_init(&dio->op, c, opts);
- dio->op.end_io = sync
- ? NULL
- : bch2_dio_write_loop_async;
- dio->op.target = dio->op.opts.foreground_target;
- dio->op.write_point = writepoint_hashed((unsigned long) current);
- dio->op.nr_replicas = dio->op.opts.data_replicas;
- dio->op.subvol = inode->ei_subvol;
- dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
- dio->op.devs_need_flush = &inode->ei_devs_need_flush;
-
- if (sync)
- dio->op.flags |= BCH_WRITE_SYNC;
- dio->op.flags |= BCH_WRITE_CHECK_ENOSPC;
-
- ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
- bio_sectors(bio), true);
- if (unlikely(ret))
- goto err;
-
- ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio),
- dio->op.opts.data_replicas, 0);
- if (unlikely(ret) &&
- !bch2_dio_write_check_allocated(dio))
- goto err;
-
- task_io_account_write(bio->bi_iter.bi_size);
-
- if (unlikely(dio->iter.count) &&
- !dio->sync &&
- !dio->loop &&
- bch2_dio_write_copy_iov(dio))
- dio->sync = sync = true;
-
- dio->loop = true;
- closure_call(&dio->op.cl, bch2_write, NULL, NULL);
-
- if (!sync)
- return -EIOCBQUEUED;
-
- bch2_dio_write_end(dio);
-
- if (likely(!dio->iter.count) || dio->op.error)
- break;
-
- bio_reset(bio, NULL, REQ_OP_WRITE);
+ bio->bio.bi_end_io = nocow_flush_endio;
+ closure_bio_submit(&bio->bio, cl);
}
-out:
- return bch2_dio_write_done(dio);
-err:
- dio->op.error = ret;
-
- bio_release_pages(bio, false);
-
- bch2_quota_reservation_put(c, inode, &dio->quota_res);
- goto out;
}
-static noinline __cold void bch2_dio_write_continue(struct dio_write *dio)
+static int bch2_inode_flush_nocow_writes(struct bch_fs *c,
+ struct bch_inode_info *inode)
{
- struct mm_struct *mm = dio->mm;
+ struct closure cl;
- bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE);
+ closure_init_stack(&cl);
+ bch2_inode_flush_nocow_writes_async(c, inode, &cl);
+ closure_sync(&cl);
- if (mm)
- kthread_use_mm(mm);
- bch2_dio_write_loop(dio);
- if (mm)
- kthread_unuse_mm(mm);
+ return 0;
}
-static void bch2_dio_write_loop_async(struct bch_write_op *op)
-{
- struct dio_write *dio = container_of(op, struct dio_write, op);
-
- bch2_dio_write_end(dio);
+/* i_size updates: */
- if (likely(!dio->iter.count) || dio->op.error)
- bch2_dio_write_done(dio);
- else
- bch2_dio_write_continue(dio);
-}
+struct inode_new_size {
+ loff_t new_size;
+ u64 now;
+ unsigned fields;
+};
-static noinline
-ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
+static int inode_set_size(struct btree_trans *trans,
+ struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ void *p)
{
- struct file *file = req->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct bch_inode_info *inode = file_bch_inode(file);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct dio_write *dio;
- struct bio *bio;
- bool locked = true, extending;
- ssize_t ret;
-
- prefetch(&c->opts);
- prefetch((void *) &c->opts + 64);
- prefetch(&inode->ei_inode);
- prefetch((void *) &inode->ei_inode + 64);
-
- inode_lock(&inode->v);
-
- ret = generic_write_checks(req, iter);
- if (unlikely(ret <= 0))
- goto err;
-
- ret = file_remove_privs(file);
- if (unlikely(ret))
- goto err;
-
- ret = file_update_time(file);
- if (unlikely(ret))
- goto err;
-
- if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1)))
- goto err;
-
- inode_dio_begin(&inode->v);
- bch2_pagecache_block_get(inode);
-
- extending = req->ki_pos + iter->count > inode->v.i_size;
- if (!extending) {
- inode_unlock(&inode->v);
- locked = false;
- }
+ struct inode_new_size *s = p;
- bio = bio_alloc_bioset(NULL,
- bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
- REQ_OP_WRITE,
- GFP_KERNEL,
- &c->dio_write_bioset);
- dio = container_of(bio, struct dio_write, op.wbio.bio);
- dio->req = req;
- dio->mapping = mapping;
- dio->inode = inode;
- dio->mm = current->mm;
- dio->loop = false;
- dio->extending = extending;
- dio->sync = is_sync_kiocb(req) || extending;
- dio->flush = iocb_is_dsync(req) && !c->opts.journal_flush_disabled;
- dio->free_iov = false;
- dio->quota_res.sectors = 0;
- dio->written = 0;
- dio->iter = *iter;
- dio->op.c = c;
-
- if (unlikely(mapping->nrpages)) {
- ret = write_invalidate_inode_pages_range(mapping,
- req->ki_pos,
- req->ki_pos + iter->count - 1);
- if (unlikely(ret))
- goto err_put_bio;
- }
+ bi->bi_size = s->new_size;
+ if (s->fields & ATTR_ATIME)
+ bi->bi_atime = s->now;
+ if (s->fields & ATTR_MTIME)
+ bi->bi_mtime = s->now;
+ if (s->fields & ATTR_CTIME)
+ bi->bi_ctime = s->now;
- ret = bch2_dio_write_loop(dio);
-err:
- if (locked)
- inode_unlock(&inode->v);
- return ret;
-err_put_bio:
- bch2_pagecache_block_put(inode);
- bio_put(bio);
- inode_dio_end(&inode->v);
- goto err;
+ return 0;
}
-ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
+int __must_check bch2_write_inode_size(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ loff_t new_size, unsigned fields)
{
- struct file *file = iocb->ki_filp;
- struct bch_inode_info *inode = file_bch_inode(file);
- ssize_t ret;
-
- if (iocb->ki_flags & IOCB_DIRECT) {
- ret = bch2_direct_write(iocb, from);
- goto out;
- }
-
- /* We can write back this queue in page reclaim */
- current->backing_dev_info = inode_to_bdi(&inode->v);
- inode_lock(&inode->v);
-
- ret = generic_write_checks(iocb, from);
- if (ret <= 0)
- goto unlock;
+ struct inode_new_size s = {
+ .new_size = new_size,
+ .now = bch2_current_time(c),
+ .fields = fields,
+ };
- ret = file_remove_privs(file);
- if (ret)
- goto unlock;
+ return bch2_write_inode(c, inode, inode_set_size, &s, fields);
+}
- ret = file_update_time(file);
- if (ret)
- goto unlock;
+void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
+ struct quota_res *quota_res, s64 sectors)
+{
+ bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c,
+ "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)",
+ inode->v.i_ino, (u64) inode->v.i_blocks, sectors,
+ inode->ei_inode.bi_sectors);
+ inode->v.i_blocks += sectors;
- ret = bch2_buffered_write(iocb, from);
- if (likely(ret > 0))
- iocb->ki_pos += ret;
-unlock:
- inode_unlock(&inode->v);
- current->backing_dev_info = NULL;
+#ifdef CONFIG_BCACHEFS_QUOTA
+ if (quota_res &&
+ !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) &&
+ sectors > 0) {
+ BUG_ON(sectors > quota_res->sectors);
+ BUG_ON(sectors > inode->ei_quota_reserved);
- if (ret > 0)
- ret = generic_write_sync(iocb, ret);
-out:
- return bch2_err_class(ret);
+ quota_res->sectors -= sectors;
+ inode->ei_quota_reserved -= sectors;
+ } else {
+ bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN);
+ }
+#endif
}
/* fsync: */
struct bpos start,
struct bpos end)
{
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
int ret = 0;
-
- bch2_trans_init(&trans, c, 0, 0);
retry:
- bch2_trans_begin(&trans);
+ bch2_trans_begin(trans);
- ret = bch2_subvolume_get_snapshot(&trans, subvol, &start.snapshot);
+ ret = bch2_subvolume_get_snapshot(trans, subvol, &start.snapshot);
if (ret)
goto err;
- for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_extents, start, end, 0, k, ret)
+ for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, start, end, 0, k, ret)
if (bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k)) {
ret = 1;
break;
}
start = iter.pos;
- bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_iter_exit(trans, &iter);
err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
return ret;
}
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct address_space *mapping = inode->v.i_mapping;
struct bch_folio *s;
- unsigned start_offset = start & (PAGE_SIZE - 1);
- unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1;
+ unsigned start_offset;
+ unsigned end_offset;
unsigned i;
struct folio *folio;
s64 i_sectors_delta = 0;
folio = __filemap_get_folio(mapping, index,
FGP_LOCK|FGP_CREAT, GFP_KERNEL);
- if (unlikely(IS_ERR_OR_NULL(folio))) {
+ if (IS_ERR_OR_NULL(folio)) {
ret = -ENOMEM;
goto out;
}
s->s[i].nr_replicas = 0;
i_sectors_delta -= s->s[i].state == SECTOR_dirty;
- folio_sector_set(folio, s, i, SECTOR_unallocated);
+ bch2_folio_sector_set(folio, s, i, SECTOR_unallocated);
}
- i_sectors_acct(c, inode, NULL, i_sectors_delta);
+ bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
/*
* Caller needs to know whether this folio will be written out by
return bch2_setattr_nonsize(idmap, inode, iattr);
}
-static int bch2_truncate_finish_fn(struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
-{
- bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
- return 0;
-}
-
-static int bch2_truncate_start_fn(struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi, void *p)
-{
- u64 *new_i_size = p;
-
- bi->bi_flags |= BCH_INODE_I_SIZE_DIRTY;
- bi->bi_size = *new_i_size;
- return 0;
-}
-
-int bch2_truncate(struct mnt_idmap *idmap,
+int bchfs_truncate(struct mnt_idmap *idmap,
struct bch_inode_info *inode, struct iattr *iattr)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct address_space *mapping = inode->v.i_mapping;
struct bch_inode_unpacked inode_u;
- u64 new_i_size = iattr->ia_size;
s64 i_sectors_delta = 0;
int ret = 0;
if (unlikely(ret < 0))
goto err;
+ truncate_setsize(&inode->v, iattr->ia_size);
+
/*
* When extending, we're going to write the new i_size to disk
* immediately so we need to flush anything above the current on disk
if (ret)
goto err;
- mutex_lock(&inode->ei_update_lock);
- ret = bch2_write_inode(c, inode, bch2_truncate_start_fn,
- &new_i_size, 0);
- mutex_unlock(&inode->ei_update_lock);
+ ret = bch2_truncate(c, inode_inum(inode), iattr->ia_size, &i_sectors_delta);
+ bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
- if (unlikely(ret))
+ if (unlikely(ret)) {
+ /*
+ * If we error here, VFS caches are now inconsistent with btree
+ */
+ set_bit(EI_INODE_ERROR, &inode->ei_flags);
goto err;
-
- truncate_setsize(&inode->v, iattr->ia_size);
-
- ret = bch2_fpunch(c, inode_inum(inode),
- round_up(iattr->ia_size, block_bytes(c)) >> 9,
- U64_MAX, &i_sectors_delta);
- i_sectors_acct(c, inode, NULL, i_sectors_delta);
+ }
bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks &&
!bch2_journal_error(&c->journal), c,
"inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)",
inode->v.i_ino, (u64) inode->v.i_blocks,
inode->ei_inode.bi_sectors);
- if (unlikely(ret))
- goto err;
-
- mutex_lock(&inode->ei_update_lock);
- ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, 0);
- mutex_unlock(&inode->ei_update_lock);
ret = bch2_setattr_nonsize(idmap, inode, iattr);
err:
/* fallocate: */
-static int inode_update_times_fn(struct bch_inode_info *inode,
+static int inode_update_times_fn(struct btree_trans *trans,
+ struct bch_inode_info *inode,
struct bch_inode_unpacked *bi, void *p)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
ret = bch2_fpunch(c, inode_inum(inode),
block_start >> 9, block_end >> 9,
&i_sectors_delta);
- i_sectors_acct(c, inode, NULL, i_sectors_delta);
+ bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
}
mutex_lock(&inode->ei_update_lock);
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct address_space *mapping = inode->v.i_mapping;
- struct bkey_buf copy;
- struct btree_trans trans;
- struct btree_iter src, dst, del;
- loff_t shift, new_size;
- u64 src_start;
+ s64 i_sectors_delta = 0;
int ret = 0;
if ((offset | len) & (block_bytes(c) - 1))
return -EINVAL;
if (insert) {
- if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len)
- return -EFBIG;
-
if (offset >= inode->v.i_size)
return -EINVAL;
-
- src_start = U64_MAX;
- shift = len;
} else {
if (offset + len >= inode->v.i_size)
return -EINVAL;
-
- src_start = offset + len;
- shift = -len;
}
- new_size = inode->v.i_size + shift;
-
- ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX);
+ ret = bch2_write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX);
if (ret)
return ret;
- if (insert) {
- i_size_write(&inode->v, new_size);
- mutex_lock(&inode->ei_update_lock);
- ret = bch2_write_inode_size(c, inode, new_size,
- ATTR_MTIME|ATTR_CTIME);
- mutex_unlock(&inode->ei_update_lock);
- } else {
- s64 i_sectors_delta = 0;
-
- ret = bch2_fpunch(c, inode_inum(inode),
- offset >> 9, (offset + len) >> 9,
- &i_sectors_delta);
- i_sectors_acct(c, inode, NULL, i_sectors_delta);
-
- if (ret)
- return ret;
- }
-
-	bch2_bkey_buf_init(&copy);
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
- bch2_trans_iter_init(&trans, &src, BTREE_ID_extents,
- POS(inode->v.i_ino, src_start >> 9),
- BTREE_ITER_INTENT);
- bch2_trans_copy_iter(&dst, &src);
- bch2_trans_copy_iter(&del, &src);
-
- while (ret == 0 ||
- bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
- struct disk_reservation disk_res =
- bch2_disk_reservation_init(c, 0);
- struct bkey_i delete;
- struct bkey_s_c k;
- struct bpos next_pos;
- struct bpos move_pos = POS(inode->v.i_ino, offset >> 9);
- struct bpos atomic_end;
- unsigned trigger_flags = 0;
- u32 snapshot;
-
- bch2_trans_begin(&trans);
-
- ret = bch2_subvolume_get_snapshot(&trans,
- inode->ei_subvol, &snapshot);
- if (ret)
- continue;
-
- bch2_btree_iter_set_snapshot(&src, snapshot);
- bch2_btree_iter_set_snapshot(&dst, snapshot);
- bch2_btree_iter_set_snapshot(&del, snapshot);
-
- bch2_trans_begin(&trans);
-
- k = insert
- ? bch2_btree_iter_peek_prev(&src)
- : bch2_btree_iter_peek_upto(&src, POS(inode->v.i_ino, U64_MAX));
- if ((ret = bkey_err(k)))
- continue;
-
- if (!k.k || k.k->p.inode != inode->v.i_ino)
- break;
-
- if (insert &&
- bkey_le(k.k->p, POS(inode->v.i_ino, offset >> 9)))
- break;
-reassemble:
-	bch2_bkey_buf_reassemble(&copy, c, k);
-
- if (insert &&
- bkey_lt(bkey_start_pos(k.k), move_pos))
- bch2_cut_front(move_pos, copy.k);
-
- copy.k->k.p.offset += shift >> 9;
-	bch2_btree_iter_set_pos(&dst, bkey_start_pos(&copy.k->k));
-
- ret = bch2_extent_atomic_end(&trans, &dst, copy.k, &atomic_end);
- if (ret)
- continue;
-
- if (!bkey_eq(atomic_end, copy.k->k.p)) {
- if (insert) {
- move_pos = atomic_end;
- move_pos.offset -= shift >> 9;
- goto reassemble;
- } else {
- bch2_cut_back(atomic_end, copy.k);
- }
- }
-
- bkey_init(&delete.k);
- delete.k.p = copy.k->k.p;
- delete.k.size = copy.k->k.size;
- delete.k.p.offset -= shift >> 9;
- bch2_btree_iter_set_pos(&del, bkey_start_pos(&delete.k));
-
- next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p;
-
- if (copy.k->k.size != k.k->size) {
- /* We might end up splitting compressed extents: */
- unsigned nr_ptrs =
- bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k));
-
- ret = bch2_disk_reservation_get(c, &disk_res,
- copy.k->k.size, nr_ptrs,
- BCH_DISK_RESERVATION_NOFAIL);
- BUG_ON(ret);
- }
-
- ret = bch2_btree_iter_traverse(&del) ?:
- bch2_trans_update(&trans, &del, &delete, trigger_flags) ?:
- bch2_trans_update(&trans, &dst, copy.k, trigger_flags) ?:
- bch2_trans_commit(&trans, &disk_res, NULL,
- BTREE_INSERT_NOFAIL);
- bch2_disk_reservation_put(c, &disk_res);
-
- if (!ret)
- bch2_btree_iter_set_pos(&src, next_pos);
- }
- bch2_trans_iter_exit(&trans, &del);
- bch2_trans_iter_exit(&trans, &dst);
- bch2_trans_iter_exit(&trans, &src);
- bch2_trans_exit(&trans);
-	bch2_bkey_buf_exit(&copy, c);
+ if (insert)
+ i_size_write(&inode->v, inode->v.i_size + len);
- if (ret)
- return ret;
+ ret = bch2_fcollapse_finsert(c, inode_inum(inode), offset >> 9, len >> 9,
+ insert, &i_sectors_delta);
+ if (!ret && !insert)
+ i_size_write(&inode->v, inode->v.i_size - len);
+ bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
- mutex_lock(&inode->ei_update_lock);
- if (!insert) {
- i_size_write(&inode->v, new_size);
- ret = bch2_write_inode_size(c, inode, new_size,
- ATTR_MTIME|ATTR_CTIME);
- } else {
- /* We need an inode update to update bi_journal_seq for fsync: */
- ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
- ATTR_MTIME|ATTR_CTIME);
- }
- mutex_unlock(&inode->ei_update_lock);
return ret;
}
u64 start_sector, u64 end_sector)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bpos end_pos = POS(inode->v.i_ino, end_sector);
struct bch_io_opts opts;
int ret = 0;
bch2_inode_opts_get(&opts, c, &inode->ei_inode);
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512);
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
POS(inode->v.i_ino, start_sector),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
u64 hole_start, hole_end;
u32 snapshot;
- bch2_trans_begin(&trans);
+ bch2_trans_begin(trans);
- ret = bch2_subvolume_get_snapshot(&trans,
+ ret = bch2_subvolume_get_snapshot(trans,
inode->ei_subvol, &snapshot);
if (ret)
goto bkey_err;
}
if (!(mode & FALLOC_FL_ZERO_RANGE)) {
+ /*
+ * Lock ordering - can't be holding btree locks while
+ * blocking on a folio lock:
+ */
if (bch2_clamp_data_hole(&inode->v,
&hole_start,
&hole_end,
opts.data_replicas, true))
- ret = drop_locks_do(&trans,
+ ret = drop_locks_do(trans,
(bch2_clamp_data_hole(&inode->v,
&hole_start,
&hole_end,
goto bkey_err;
}
- ret = bch2_extent_fallocate(&trans, inode_inum(inode), &iter,
+ ret = bch2_extent_fallocate(trans, inode_inum(inode), &iter,
sectors, opts, &i_sectors_delta,
writepoint_hashed((unsigned long) current));
if (ret)
goto bkey_err;
-		i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
+		bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
- drop_locks_do(&trans,
- (mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0));
+ drop_locks_do(trans,
+ (bch2_mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0));
bkey_err:
	bch2_quota_reservation_put(c, inode, &quota_res);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
struct quota_res quota_res = { 0 };
s64 i_sectors_delta = 0;
- bch2_fpunch_at(&trans, &iter, inode_inum(inode),
+ bch2_fpunch_at(trans, &iter, inode_inum(inode),
end_sector, &i_sectors_delta);
-		i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
+		bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
	bch2_quota_reservation_put(c, inode, &quota_res);
}
- bch2_trans_iter_exit(&trans, &iter);
- bch2_trans_exit(&trans);
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
return ret;
}
u64 start, u64 end)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
u32 snapshot;
u64 sectors = end - start;
u64 pos = start;
int ret;
-
- bch2_trans_init(&trans, c, 0, 0);
retry:
- bch2_trans_begin(&trans);
+ bch2_trans_begin(trans);
- ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot);
+ ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot);
if (ret)
goto err;
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
SPOS(inode->v.i_ino, pos, snapshot), 0);
- while (!(ret = btree_trans_too_many_iters(&trans)) &&
+ while (!(ret = btree_trans_too_many_iters(trans)) &&
(k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k &&
!(ret = bkey_err(k))) {
if (bkey_extent_is_allocation(k.k)) {
bch2_btree_iter_advance(&iter);
}
pos = iter.pos.offset;
- bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_iter_exit(trans, &iter);
err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
- bch2_trans_exit(&trans);
-
- if (ret)
- return ret;
+ bch2_trans_put(trans);
- return bch2_quota_reservation_add(c, inode, res, sectors, true);
+ return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true);
}
loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
aligned_len = round_up((u64) len, block_bytes(c));
- ret = write_invalidate_inode_pages_range(dst->v.i_mapping,
+ ret = bch2_write_invalidate_inode_pages_range(dst->v.i_mapping,
pos_dst, pos_dst + len - 1);
if (ret)
goto err;
file_update_time(file_dst);
- mark_pagecache_unallocated(src, pos_src >> 9,
+ bch2_mark_pagecache_unallocated(src, pos_src >> 9,
(pos_src + aligned_len) >> 9);
ret = bch2_remap_range(c,
*/
ret = min((u64) ret << 9, (u64) len);
-	i_sectors_acct(c, dst, &quota_res, i_sectors_delta);
+	bch2_i_sectors_acct(c, dst, &quota_res, i_sectors_delta);
spin_lock(&dst->v.i_lock);
if (pos_dst + ret > dst->v.i_size)
/* fseek: */
-static int folio_data_offset(struct folio *folio, loff_t pos,
- unsigned min_replicas)
-{
- struct bch_folio *s = bch2_folio(folio);
- unsigned i, sectors = folio_sectors(folio);
-
- if (s)
- for (i = folio_pos_to_s(folio, pos); i < sectors; i++)
- if (s->s[i].state >= SECTOR_dirty &&
- s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas)
- return i << SECTOR_SHIFT;
-
- return -1;
-}
-
-static loff_t bch2_seek_pagecache_data(struct inode *vinode,
- loff_t start_offset,
- loff_t end_offset,
- unsigned min_replicas,
- bool nonblock)
-{
- struct folio_batch fbatch;
- pgoff_t start_index = start_offset >> PAGE_SHIFT;
- pgoff_t end_index = end_offset >> PAGE_SHIFT;
- pgoff_t index = start_index;
- unsigned i;
- loff_t ret;
- int offset;
-
- folio_batch_init(&fbatch);
-
- while (filemap_get_folios(vinode->i_mapping,
- &index, end_index, &fbatch)) {
- for (i = 0; i < folio_batch_count(&fbatch); i++) {
- struct folio *folio = fbatch.folios[i];
-
- if (!nonblock) {
- folio_lock(folio);
- } else if (!folio_trylock(folio)) {
- folio_batch_release(&fbatch);
- return -EAGAIN;
- }
-
- offset = folio_data_offset(folio,
- max(folio_pos(folio), start_offset),
- min_replicas);
- if (offset >= 0) {
- ret = clamp(folio_pos(folio) + offset,
- start_offset, end_offset);
- folio_unlock(folio);
- folio_batch_release(&fbatch);
- return ret;
- }
- folio_unlock(folio);
- }
- folio_batch_release(&fbatch);
- cond_resched();
- }
-
- return end_offset;
-}
-
static loff_t bch2_seek_data(struct file *file, u64 offset)
{
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_trans trans;
+ struct btree_trans *trans;
struct btree_iter iter;
struct bkey_s_c k;
subvol_inum inum = inode_inum(inode);
if (offset >= isize)
return -ENXIO;
- bch2_trans_init(&trans, c, 0, 0);
+ trans = bch2_trans_get(c);
retry:
- bch2_trans_begin(&trans);
+ bch2_trans_begin(trans);
- ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
goto err;
- for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_extents,
+ for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents,
SPOS(inode->v.i_ino, offset >> 9, snapshot),
POS(inode->v.i_ino, U64_MAX),
0, k, ret) {
} else if (k.k->p.offset >> 9 > isize)
break;
}
- bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_iter_exit(trans, &iter);
err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
if (ret)
return ret;
return vfs_setpos(file, next_data, MAX_LFS_FILESIZE);
}
-static int folio_hole_offset(struct address_space *mapping, loff_t *offset,
- unsigned min_replicas, bool nonblock)
-{
- struct folio *folio;
- struct bch_folio *s;
- unsigned i, sectors;
- bool ret = true;
-
- folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT,
- !nonblock ? FGP_LOCK : 0, 0);
- if (IS_ERR_OR_NULL(folio))
- return true;
-
- if (nonblock && !folio_trylock(folio)) {
- folio_put(folio);
- return -EAGAIN;
- }
-
- s = bch2_folio(folio);
- if (!s)
- goto unlock;
-
- sectors = folio_sectors(folio);
- for (i = folio_pos_to_s(folio, *offset); i < sectors; i++)
- if (s->s[i].state < SECTOR_dirty ||
- s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) {
- *offset = max(*offset,
- folio_pos(folio) + (i << SECTOR_SHIFT));
- goto unlock;
- }
-
- *offset = folio_end_pos(folio);
- ret = false;
-unlock:
- folio_unlock(folio);
- folio_put(folio);
- return ret;
-}
-
-static loff_t bch2_seek_pagecache_hole(struct inode *vinode,
- loff_t start_offset,
- loff_t end_offset,
- unsigned min_replicas,
- bool nonblock)
-{
- struct address_space *mapping = vinode->i_mapping;
- loff_t offset = start_offset;
-
- while (offset < end_offset &&
- !folio_hole_offset(mapping, &offset, min_replicas, nonblock))
- ;
-
- return min(offset, end_offset);
-}
-
-static int bch2_clamp_data_hole(struct inode *inode,
- u64 *hole_start,
- u64 *hole_end,
- unsigned min_replicas,
- bool nonblock)
-{
- loff_t ret;
-
- ret = bch2_seek_pagecache_hole(inode,
- *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
- if (ret < 0)
- return ret;
-
- *hole_start = ret;
-
- if (*hole_start == *hole_end)
- return 0;
-
- ret = bch2_seek_pagecache_data(inode,
- *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
- if (ret < 0)
- return ret;
-
- *hole_end = ret;
- return 0;
-}
-
static loff_t bch2_seek_hole(struct file *file, u64 offset)
{
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_trans trans;
+ struct btree_trans *trans;
struct btree_iter iter;
struct bkey_s_c k;
subvol_inum inum = inode_inum(inode);
if (offset >= isize)
return -ENXIO;
- bch2_trans_init(&trans, c, 0, 0);
+ trans = bch2_trans_get(c);
retry:
- bch2_trans_begin(&trans);
+ bch2_trans_begin(trans);
- ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
goto err;
- for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
SPOS(inode->v.i_ino, offset >> 9, snapshot),
BTREE_ITER_SLOTS, k, ret) {
if (k.k->p.inode != inode->v.i_ino) {
offset = max(offset, bkey_start_offset(k.k) << 9);
}
}
- bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_iter_exit(trans, &iter);
err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
if (ret)
return ret;
void bch2_fs_fsio_exit(struct bch_fs *c)
{
bioset_exit(&c->nocow_flush_bioset);
- bioset_exit(&c->dio_write_bioset);
- bioset_exit(&c->dio_read_bioset);
- bioset_exit(&c->writepage_bioset);
}
int bch2_fs_fsio_init(struct bch_fs *c)
{
- if (bioset_init(&c->writepage_bioset,
- 4, offsetof(struct bch_writepage_io, op.wbio.bio),
- BIOSET_NEED_BVECS))
- return -BCH_ERR_ENOMEM_writepage_bioset_init;
-
- if (bioset_init(&c->dio_read_bioset,
- 4, offsetof(struct dio_read, rbio.bio),
- BIOSET_NEED_BVECS))
- return -BCH_ERR_ENOMEM_dio_read_bioset_init;
-
- if (bioset_init(&c->dio_write_bioset,
- 4, offsetof(struct dio_write, op.wbio.bio),
- BIOSET_NEED_BVECS))
- return -BCH_ERR_ENOMEM_dio_write_bioset_init;
-
if (bioset_init(&c->nocow_flush_bioset,
1, offsetof(struct nocow_flush, bio), 0))
return -BCH_ERR_ENOMEM_nocow_flush_bioset_init;
#ifndef NO_BCACHEFS_FS
#include "buckets.h"
-#include "io_types.h"
+#include "fs.h"
+#include "io_write_types.h"
+#include "quota.h"
#include <linux/uio.h>
-struct quota_res;
+struct folio_vec {
+ struct folio *fv_folio;
+ size_t fv_offset;
+ size_t fv_len;
+};
+
+static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv)
+{
+ struct folio *folio = page_folio(bv.bv_page);
+ size_t offset = (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) +
+ bv.bv_offset;
+ size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len);
+
+ return (struct folio_vec) {
+ .fv_folio = folio,
+ .fv_offset = offset,
+ .fv_len = len,
+ };
+}
+
+static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio,
+ struct bvec_iter iter)
+{
+ return biovec_to_foliovec(bio_iter_iovec(bio, iter));
+}
+
+#define __bio_for_each_folio(bvl, bio, iter, start) \
+ for (iter = (start); \
+ (iter).bi_size && \
+ ((bvl = bio_iter_iovec_folio((bio), (iter))), 1); \
+ bio_advance_iter_single((bio), &(iter), (bvl).fv_len))
+
+/**
+ * bio_for_each_folio - iterate over folios within a bio
+ * @bvl: struct folio_vec loop variable
+ * @bio: bio to iterate over
+ * @iter: struct bvec_iter cursor to iterate with
+ *
+ * Like other non-_all versions, this iterates over what bio->bi_iter currently
+ * points to. This version is for drivers, where the bio may have previously
+ * been split or cloned.
+ */
+#define bio_for_each_folio(bvl, bio, iter) \
+ __bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter)
+
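As a usage sketch (not from the patch), a hypothetical helper built on the iterator above might look like this; count_bio_folios() and its semantics are illustrative only:

/*
 * Usage sketch only, not from the patch: a hypothetical helper that
 * counts how many folio segments a bio's remaining payload spans,
 * using bio_for_each_folio() as defined above.
 */
static inline unsigned count_bio_folios(struct bio *bio)
{
	struct folio_vec fv;
	struct bvec_iter iter;
	unsigned nr = 0;

	/* each step advances the cursor by fv.fv_len bytes */
	bio_for_each_folio(fv, bio, iter)
		nr++;

	return nr;
}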
+struct quota_res {
+ u64 sectors;
+};
+
+#ifdef CONFIG_BCACHEFS_QUOTA
+
+static inline void __bch2_quota_reservation_put(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct quota_res *res)
+{
+ BUG_ON(res->sectors > inode->ei_quota_reserved);
+
+ bch2_quota_acct(c, inode->ei_qid, Q_SPC,
+ -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC);
+ inode->ei_quota_reserved -= res->sectors;
+ res->sectors = 0;
+}
+
+static inline void bch2_quota_reservation_put(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct quota_res *res)
+{
+ if (res->sectors) {
+ mutex_lock(&inode->ei_quota_lock);
+ __bch2_quota_reservation_put(c, inode, res);
+ mutex_unlock(&inode->ei_quota_lock);
+ }
+}
+
+static inline int bch2_quota_reservation_add(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct quota_res *res,
+ u64 sectors,
+ bool check_enospc)
+{
+ int ret;
+
+ if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags))
+ return 0;
+
+ mutex_lock(&inode->ei_quota_lock);
+ ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors,
+ check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK);
+ if (likely(!ret)) {
+ inode->ei_quota_reserved += sectors;
+ res->sectors += sectors;
+ }
+ mutex_unlock(&inode->ei_quota_lock);
+
+ return ret;
+}
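A sketch of the intended calling pattern for these helpers (not from the patch; example_reserve_and_write() is a hypothetical caller): reserve quota up front, do the work, then unconditionally release on the way out — anything actually consumed has already been moved out of the reservation by the sector-accounting path.

/*
 * Usage sketch only, not from the patch. Callers reserve quota before
 * allocating and release whatever is still held afterwards; sectors
 * consumed by the allocation are charged against @quota_res elsewhere.
 */
static int example_reserve_and_write(struct bch_fs *c,
				     struct bch_inode_info *inode,
				     u64 sectors)
{
	struct quota_res quota_res = { 0 };
	int ret;

	ret = bch2_quota_reservation_add(c, inode, &quota_res, sectors, true);
	if (ret)
		return ret;

	/* ... allocate and write, charging against &quota_res ... */

	bch2_quota_reservation_put(c, inode, &quota_res);
	return 0;
}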
-int __must_check bch2_write_inode_size(struct bch_fs *,
- struct bch_inode_info *,
- loff_t, unsigned);
+#else
+
+static inline void __bch2_quota_reservation_put(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct quota_res *res) {}
-int bch2_read_folio(struct file *, struct folio *);
+static inline void bch2_quota_reservation_put(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct quota_res *res) {}
-int bch2_writepages(struct address_space *, struct writeback_control *);
-void bch2_readahead(struct readahead_control *);
+static inline int bch2_quota_reservation_add(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct quota_res *res,
+ unsigned sectors,
+ bool check_enospc)
+{
+ return 0;
+}
-int bch2_write_begin(struct file *, struct address_space *, loff_t,
- unsigned, struct page **, void **);
-int bch2_write_end(struct file *, struct address_space *, loff_t,
- unsigned, unsigned, struct page *, void *);
+#endif
-ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *);
-ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *);
+void __bch2_i_sectors_acct(struct bch_fs *, struct bch_inode_info *,
+ struct quota_res *, s64);
+
+static inline void bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
+ struct quota_res *quota_res, s64 sectors)
+{
+ if (sectors) {
+ mutex_lock(&inode->ei_quota_lock);
+ __bch2_i_sectors_acct(c, inode, quota_res, sectors);
+ mutex_unlock(&inode->ei_quota_lock);
+ }
+}
+
+static inline struct address_space *faults_disabled_mapping(void)
+{
+ return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL);
+}
+
+static inline void set_fdm_dropped_locks(void)
+{
+ current->faults_disabled_mapping =
+ (void *) (((unsigned long) current->faults_disabled_mapping)|1);
+}
+
+static inline bool fdm_dropped_locks(void)
+{
+ return ((unsigned long) current->faults_disabled_mapping) & 1;
+}
+
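The three helpers above pack a flag into bit 0 of current->faults_disabled_mapping: a mapping pointer is at least word-aligned, so the low bit is free to record "locks were dropped" without widening the field. A self-contained userspace model of the same low-bit tagging trick (all names hypothetical):

#include <assert.h>
#include <stdint.h>

/* Low-bit pointer tagging: aligned pointers never have bit 0 set, so it
 * can carry a boolean alongside the pointer itself. */
static void *tag_set(void *p)   { return (void *) ((uintptr_t) p | 1); }
static void *tag_clear(void *p) { return (void *) ((uintptr_t) p & ~(uintptr_t) 1); }
static int   tag_test(void *p)  { return (uintptr_t) p & 1; }

int main(void)
{
	int x;
	void *p = &x;

	assert(!tag_test(p));
	p = tag_set(p);		/* set the flag, keep the pointer */
	assert(tag_test(p));
	assert(tag_clear(p) == &x);	/* pointer recoverable intact */
	return 0;
}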
+void bch2_inode_flush_nocow_writes_async(struct bch_fs *,
+ struct bch_inode_info *, struct closure *);
+
+int __must_check bch2_write_inode_size(struct bch_fs *,
+ struct bch_inode_info *,
+ loff_t, unsigned);
int bch2_fsync(struct file *, loff_t, loff_t, int);
-int bch2_truncate(struct mnt_idmap *,
+int bchfs_truncate(struct mnt_idmap *,
struct bch_inode_info *, struct iattr *);
long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);
loff_t bch2_llseek(struct file *, loff_t, int);
-vm_fault_t bch2_page_fault(struct vm_fault *);
-vm_fault_t bch2_page_mkwrite(struct vm_fault *);
-void bch2_invalidate_folio(struct folio *, size_t, size_t);
-bool bch2_release_folio(struct folio *, gfp_t);
-
void bch2_fs_fsio_exit(struct bch_fs *);
int bch2_fs_fsio_init(struct bch_fs *);
#else
bool projinherit;
};
-static int bch2_inode_flags_set(struct bch_inode_info *inode,
+static int bch2_inode_flags_set(struct btree_trans *trans,
+ struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
void *p)
{
unsigned newflags = s->flags;
unsigned oldflags = bi->bi_flags & s->mask;
- if (((newflags ^ oldflags) & (BCH_INODE_APPEND|BCH_INODE_IMMUTABLE)) &&
+ if (((newflags ^ oldflags) & (BCH_INODE_append|BCH_INODE_immutable)) &&
!capable(CAP_LINUX_IMMUTABLE))
return -EPERM;
if (!S_ISREG(bi->bi_mode) &&
!S_ISDIR(bi->bi_mode) &&
- (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags)
+ (newflags & (BCH_INODE_nodump|BCH_INODE_noatime)) != newflags)
return -EINVAL;
if (s->set_projinherit) {
fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ];
- return copy_to_user(arg, &fa, sizeof(fa));
+ if (copy_to_user(arg, &fa, sizeof(fa)))
+ return -EFAULT;
+
+ return 0;
}
-static int fssetxattr_inode_update_fn(struct bch_inode_info *inode,
+static int fssetxattr_inode_update_fn(struct btree_trans *trans,
+ struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
void *p)
{
bi->bi_project = s->projid;
}
- return bch2_inode_flags_set(inode, bi, p);
+ return bch2_inode_flags_set(trans, inode, bi, p);
}
static int bch2_ioc_fssetxattr(struct bch_fs *c,
return ret;
}
-static int bch2_reinherit_attrs_fn(struct bch_inode_info *inode,
+static int bch2_reinherit_attrs_fn(struct btree_trans *trans,
+ struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
void *p)
{
return ret;
}
-static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
- struct bch_ioctl_subvolume arg)
+static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
+ struct bch_ioctl_subvolume arg)
{
struct inode *dir;
struct bch_inode_info *inode;
return error;
}
+static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
+ struct bch_ioctl_subvolume arg)
+{
+ down_write(&c->snapshot_create_lock);
+ long ret = __bch2_ioctl_subvolume_create(c, filp, arg);
+ up_write(&c->snapshot_create_lock);
+
+ return ret;
+}
+
static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
struct bch_ioctl_subvolume arg)
{
+ struct filename *name;
struct path path;
struct inode *dir;
+ struct dentry *victim;
int ret = 0;
if (arg.flags)
return -EINVAL;
- ret = user_path_at(arg.dirfd,
- (const char __user *)(unsigned long)arg.dst_ptr,
- LOOKUP_FOLLOW, &path);
- if (ret)
- return ret;
+ name = getname((const char __user *)(unsigned long)arg.dst_ptr);
+ victim = filename_path_locked(arg.dirfd, name, &path);
+ putname(name);
+ if (IS_ERR(victim))
+ return PTR_ERR(victim);
- if (path.dentry->d_sb->s_fs_info != c) {
+ if (victim->d_sb->s_fs_info != c) {
ret = -EXDEV;
goto err;
}
- dir = path.dentry->d_parent->d_inode;
-
- ret = __bch2_unlink(dir, path.dentry, true);
- if (ret)
- goto err;
-
- fsnotify_rmdir(dir, path.dentry);
- d_delete(path.dentry);
+ dir = d_inode(path.dentry);
+ ret = __bch2_unlink(dir, victim, true);
+ if (!ret) {
+ fsnotify_rmdir(dir, victim);
+ d_delete(victim);
+ }
+ inode_unlock(dir);
err:
+ dput(victim);
path_put(&path);
return ret;
}
/* Inode flags: */
/* bcachefs inode flags -> vfs inode flags: */
-static const unsigned bch_flags_to_vfs[] = {
- [__BCH_INODE_SYNC] = S_SYNC,
- [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE,
- [__BCH_INODE_APPEND] = S_APPEND,
- [__BCH_INODE_NOATIME] = S_NOATIME,
+static const __maybe_unused unsigned bch_flags_to_vfs[] = {
+ [__BCH_INODE_sync] = S_SYNC,
+ [__BCH_INODE_immutable] = S_IMMUTABLE,
+ [__BCH_INODE_append] = S_APPEND,
+ [__BCH_INODE_noatime] = S_NOATIME,
};
/* bcachefs inode flags -> FS_IOC_GETFLAGS: */
-static const unsigned bch_flags_to_uflags[] = {
- [__BCH_INODE_SYNC] = FS_SYNC_FL,
- [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL,
- [__BCH_INODE_APPEND] = FS_APPEND_FL,
- [__BCH_INODE_NODUMP] = FS_NODUMP_FL,
- [__BCH_INODE_NOATIME] = FS_NOATIME_FL,
+static const __maybe_unused unsigned bch_flags_to_uflags[] = {
+ [__BCH_INODE_sync] = FS_SYNC_FL,
+ [__BCH_INODE_immutable] = FS_IMMUTABLE_FL,
+ [__BCH_INODE_append] = FS_APPEND_FL,
+ [__BCH_INODE_nodump] = FS_NODUMP_FL,
+ [__BCH_INODE_noatime] = FS_NOATIME_FL,
};
/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */
-static const unsigned bch_flags_to_xflags[] = {
- [__BCH_INODE_SYNC] = FS_XFLAG_SYNC,
- [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE,
- [__BCH_INODE_APPEND] = FS_XFLAG_APPEND,
- [__BCH_INODE_NODUMP] = FS_XFLAG_NODUMP,
- [__BCH_INODE_NOATIME] = FS_XFLAG_NOATIME,
+static const __maybe_unused unsigned bch_flags_to_xflags[] = {
+ [__BCH_INODE_sync] = FS_XFLAG_SYNC,
+ [__BCH_INODE_immutable] = FS_XFLAG_IMMUTABLE,
+ [__BCH_INODE_append] = FS_XFLAG_APPEND,
+ [__BCH_INODE_nodump] = FS_XFLAG_NODUMP,
+ [__BCH_INODE_noatime] = FS_XFLAG_NOATIME,
//[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT;
};
#include "fs-common.h"
#include "fs-io.h"
#include "fs-ioctl.h"
+#include "fs-io-buffered.h"
+#include "fs-io-direct.h"
+#include "fs-io-pagecache.h"
#include "fsck.h"
#include "inode.h"
-#include "io.h"
+#include "io_read.h"
#include "journal.h"
#include "keylist.h"
#include "quota.h"
+#include "snapshot.h"
#include "super.h"
#include "xattr.h"
inode->v.i_mode = bi->bi_mode;
if (fields & ATTR_ATIME)
- inode->v.i_atime = bch2_time_to_timespec(c, bi->bi_atime);
+ inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime));
if (fields & ATTR_MTIME)
- inode->v.i_mtime = bch2_time_to_timespec(c, bi->bi_mtime);
+ inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime));
if (fields & ATTR_CTIME)
- inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime);
+ inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime));
inode->ei_inode = *bi;
inode_set_fn set,
void *p, unsigned fields)
{
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter = { NULL };
struct bch_inode_unpacked inode_u;
int ret;
-
- bch2_trans_init(&trans, c, 0, 512);
retry:
- bch2_trans_begin(&trans);
+ bch2_trans_begin(trans);
- ret = bch2_inode_peek(&trans, &iter, &inode_u, inode_inum(inode),
+ ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode),
BTREE_ITER_INTENT) ?:
- (set ? set(inode, &inode_u, p) : 0) ?:
- bch2_inode_write(&trans, &iter, &inode_u) ?:
- bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+ (set ? set(trans, inode, &inode_u, p) : 0) ?:
+ bch2_inode_write(trans, &iter, &inode_u) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
/*
* the btree node lock protects inode->ei_inode, not ei_update_lock;
* this is important for inode updates via bchfs_write_index_update
*/
if (!ret)
- bch2_inode_update_after_write(&trans, inode, &inode_u, fields);
+ bch2_inode_update_after_write(trans, inode, &inode_u, fields);
- bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_iter_exit(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
inode_inum(inode).subvol,
inode_inum(inode).inum);
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
return ret < 0 ? ret : 0;
}
{
struct bch_inode_unpacked inode_u;
struct bch_inode_info *inode;
- struct btree_trans trans;
+ struct btree_trans *trans;
struct bch_subvolume subvol;
int ret;
if (!(inode->v.i_state & I_NEW))
return &inode->v;
- bch2_trans_init(&trans, c, 8, 0);
- ret = lockrestart_do(&trans,
- bch2_subvolume_get(&trans, inum.subvol, true, 0, &subvol) ?:
- bch2_inode_find_by_inum_trans(&trans, inum, &inode_u));
+ trans = bch2_trans_get(c);
+ ret = lockrestart_do(trans,
+ bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
+ bch2_inode_find_by_inum_trans(trans, inum, &inode_u));
if (!ret)
- bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol);
- bch2_trans_exit(&trans);
+ bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
+ bch2_trans_put(trans);
if (ret) {
iget_failed(&inode->v);
- return ERR_PTR(ret);
+ return ERR_PTR(bch2_err_class(ret));
}
mutex_lock(&c->vfs_inodes_lock);
unsigned flags)
{
struct bch_fs *c = dir->v.i_sb->s_fs_info;
- struct btree_trans trans;
+ struct btree_trans *trans;
struct bch_inode_unpacked dir_u;
struct bch_inode_info *inode, *old;
struct bch_inode_unpacked inode_u;
if (!(flags & BCH_CREATE_TMPFILE))
mutex_lock(&dir->ei_update_lock);
- bch2_trans_init(&trans, c, 8,
- 2048 + (!(flags & BCH_CREATE_TMPFILE)
- ? dentry->d_name.len : 0));
+ trans = bch2_trans_get(c);
retry:
- bch2_trans_begin(&trans);
+ bch2_trans_begin(trans);
- ret = bch2_create_trans(&trans,
+ ret = bch2_create_trans(trans,
inode_inum(dir), &dir_u, &inode_u,
!(flags & BCH_CREATE_TMPFILE)
? &dentry->d_name : NULL,
inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
inum.inum = inode_u.bi_inum;
- ret = bch2_subvolume_get(&trans, inum.subvol, true,
+ ret = bch2_subvolume_get(trans, inum.subvol, true,
BTREE_ITER_WITH_UPDATES, &subvol) ?:
- bch2_trans_commit(&trans, NULL, &journal_seq, 0);
+ bch2_trans_commit(trans, NULL, &journal_seq, 0);
if (unlikely(ret)) {
bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
KEY_TYPE_QUOTA_WARN);
}
if (!(flags & BCH_CREATE_TMPFILE)) {
- bch2_inode_update_after_write(&trans, dir, &dir_u,
+ bch2_inode_update_after_write(trans, dir, &dir_u,
ATTR_MTIME|ATTR_CTIME);
mutex_unlock(&dir->ei_update_lock);
}
bch2_iget5_set(&inode->v, &inum);
- bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol);
+ bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
unlock_new_inode(&inode->v);
}
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
err:
posix_acl_release(default_acl);
posix_acl_release(acl);
if (!(flags & BCH_CREATE_TMPFILE))
mutex_unlock(&dir->ei_update_lock);
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
make_bad_inode(&inode->v);
iput(&inode->v);
inode = ERR_PTR(ret);
struct bch_inode_info *dir,
struct dentry *dentry)
{
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct bch_inode_unpacked dir_u, inode_u;
int ret;
mutex_lock(&inode->ei_update_lock);
- bch2_trans_init(&trans, c, 4, 1024);
- ret = commit_do(&trans, NULL, NULL, 0,
- bch2_link_trans(&trans,
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_link_trans(trans,
inode_inum(dir), &dir_u,
inode_inum(inode), &inode_u,
&dentry->d_name));
if (likely(!ret)) {
- bch2_inode_update_after_write(&trans, dir, &dir_u,
+ bch2_inode_update_after_write(trans, dir, &dir_u,
ATTR_MTIME|ATTR_CTIME);
- bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME);
+ bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME);
}
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
mutex_unlock(&inode->ei_update_lock);
return ret;
}
struct bch_inode_info *dir = to_bch_ei(vdir);
struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
struct bch_inode_unpacked dir_u, inode_u;
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
int ret;
bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
- bch2_trans_init(&trans, c, 4, 1024);
- ret = commit_do(&trans, NULL, NULL,
- BTREE_INSERT_NOFAIL,
- bch2_unlink_trans(&trans,
+ ret = commit_do(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc,
+ bch2_unlink_trans(trans,
inode_inum(dir), &dir_u,
&inode_u, &dentry->d_name,
deleting_snapshot));
if (unlikely(ret))
goto err;
- bch2_inode_update_after_write(&trans, dir, &dir_u,
+ bch2_inode_update_after_write(trans, dir, &dir_u,
ATTR_MTIME|ATTR_CTIME);
- bch2_inode_update_after_write(&trans, inode, &inode_u,
+ bch2_inode_update_after_write(trans, inode, &inode_u,
ATTR_MTIME);
if (inode_u.bi_subvol) {
set_nlink(&inode->v, 0);
}
err:
- bch2_trans_exit(&trans);
bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
+ bch2_trans_put(trans);
return ret;
}
struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
struct bch_inode_unpacked dst_dir_u, src_dir_u;
struct bch_inode_unpacked src_inode_u, dst_inode_u;
- struct btree_trans trans;
+ struct btree_trans *trans;
enum bch_rename_mode mode = flags & RENAME_EXCHANGE
? BCH_RENAME_EXCHANGE
: dst_dentry->d_inode
return ret;
}
- bch2_trans_init(&trans, c, 8, 2048);
+ trans = bch2_trans_get(c);
bch2_lock_inodes(INODE_UPDATE_LOCK,
src_dir,
goto err;
}
- ret = commit_do(&trans, NULL, NULL, 0,
- bch2_rename_trans(&trans,
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_rename_trans(trans,
inode_inum(src_dir), &src_dir_u,
inode_inum(dst_dir), &dst_dir_u,
&src_inode_u,
BUG_ON(dst_inode &&
dst_inode->v.i_ino != dst_inode_u.bi_inum);
- bch2_inode_update_after_write(&trans, src_dir, &src_dir_u,
+ bch2_inode_update_after_write(trans, src_dir, &src_dir_u,
ATTR_MTIME|ATTR_CTIME);
if (src_dir != dst_dir)
- bch2_inode_update_after_write(&trans, dst_dir, &dst_dir_u,
+ bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u,
ATTR_MTIME|ATTR_CTIME);
- bch2_inode_update_after_write(&trans, src_inode, &src_inode_u,
+ bch2_inode_update_after_write(trans, src_inode, &src_inode_u,
ATTR_CTIME);
if (dst_inode)
- bch2_inode_update_after_write(&trans, dst_inode, &dst_inode_u,
+ bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u,
ATTR_CTIME);
err:
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
bch2_fs_quota_transfer(c, src_inode,
bch_qid(&src_inode->ei_inode),
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_qid qid;
- struct btree_trans trans;
+ struct btree_trans *trans;
struct btree_iter inode_iter = { NULL };
struct bch_inode_unpacked inode_u;
struct posix_acl *acl = NULL;
if (ret)
goto err;
- bch2_trans_init(&trans, c, 0, 0);
+ trans = bch2_trans_get(c);
retry:
- bch2_trans_begin(&trans);
+ bch2_trans_begin(trans);
kfree(acl);
acl = NULL;
- ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode),
+ ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
BTREE_ITER_INTENT);
if (ret)
goto btree_err;
bch2_setattr_copy(idmap, inode, &inode_u, attr);
if (attr->ia_valid & ATTR_MODE) {
- ret = bch2_acl_chmod(&trans, inode_inum(inode), &inode_u,
+ ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u,
inode_u.bi_mode, &acl);
if (ret)
goto btree_err;
}
- ret = bch2_inode_write(&trans, &inode_iter, &inode_u) ?:
- bch2_trans_commit(&trans, NULL, NULL,
- BTREE_INSERT_NOFAIL);
+ ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc);
btree_err:
- bch2_trans_iter_exit(&trans, &inode_iter);
+ bch2_trans_iter_exit(trans, &inode_iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
if (unlikely(ret))
goto err_trans;
- bch2_inode_update_after_write(&trans, inode, &inode_u, attr->ia_valid);
+ bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid);
if (acl)
set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
err_trans:
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
err:
mutex_unlock(&inode->ei_update_lock);
stat->gid = inode->v.i_gid;
stat->rdev = inode->v.i_rdev;
stat->size = i_size_read(&inode->v);
- stat->atime = inode->v.i_atime;
- stat->mtime = inode->v.i_mtime;
- stat->ctime = inode->v.i_ctime;
+ stat->atime = inode_get_atime(&inode->v);
+ stat->mtime = inode_get_mtime(&inode->v);
+ stat->ctime = inode_get_ctime(&inode->v);
stat->blksize = block_bytes(c);
stat->blocks = inode->v.i_blocks;
stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
}
- if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE)
+ if (inode->ei_inode.bi_flags & BCH_INODE_immutable)
stat->attributes |= STATX_ATTR_IMMUTABLE;
stat->attributes_mask |= STATX_ATTR_IMMUTABLE;
- if (inode->ei_inode.bi_flags & BCH_INODE_APPEND)
+ if (inode->ei_inode.bi_flags & BCH_INODE_append)
stat->attributes |= STATX_ATTR_APPEND;
stat->attributes_mask |= STATX_ATTR_APPEND;
- if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP)
+ if (inode->ei_inode.bi_flags & BCH_INODE_nodump)
stat->attributes |= STATX_ATTR_NODUMP;
stat->attributes_mask |= STATX_ATTR_NODUMP;
return ret;
return iattr->ia_valid & ATTR_SIZE
- ? bch2_truncate(idmap, inode, iattr)
+ ? bchfs_truncate(idmap, inode, iattr)
: bch2_setattr_nonsize(idmap, inode, iattr);
}
{
struct bch_fs *c = vinode->i_sb->s_fs_info;
struct bch_inode_info *ei = to_bch_ei(vinode);
- struct btree_trans trans;
+ struct btree_trans *trans;
struct btree_iter iter;
struct bkey_s_c k;
struct bkey_buf cur, prev;
bch2_bkey_buf_init(&cur);
bch2_bkey_buf_init(&prev);
- bch2_trans_init(&trans, c, 0, 0);
+ trans = bch2_trans_get(c);
retry:
- bch2_trans_begin(&trans);
+ bch2_trans_begin(trans);
- ret = bch2_subvolume_get_snapshot(&trans, ei->ei_subvol, &snapshot);
+ ret = bch2_subvolume_get_snapshot(trans, ei->ei_subvol, &snapshot);
if (ret)
goto err;
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
SPOS(ei->v.i_ino, start, snapshot), 0);
- while (!(ret = btree_trans_too_many_iters(&trans)) &&
+ while (!(ret = btree_trans_too_many_iters(trans)) &&
(k = bch2_btree_iter_peek_upto(&iter, end)).k &&
!(ret = bkey_err(k))) {
enum btree_id data_btree = BTREE_ID_extents;
bch2_bkey_buf_reassemble(&cur, c, k);
- ret = bch2_read_indirect_extent(&trans, &data_btree,
+ ret = bch2_read_indirect_extent(trans, &data_btree,
&offset_into_extent, &cur);
if (ret)
break;
cur.k->k.p.offset += cur.k->k.size;
if (have_extent) {
- bch2_trans_unlock(&trans);
+ bch2_trans_unlock(trans);
ret = bch2_fill_extent(c, info,
bkey_i_to_s_c(prev.k), 0);
if (ret)
POS(iter.pos.inode, iter.pos.offset + sectors));
}
start = iter.pos.offset;
- bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_iter_exit(trans, &iter);
err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
if (!ret && have_extent) {
- bch2_trans_unlock(&trans);
+ bch2_trans_unlock(trans);
ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
FIEMAP_EXTENT_LAST);
}
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
bch2_bkey_buf_exit(&cur, c);
bch2_bkey_buf_exit(&prev, c);
return ret < 0 ? ret : 0;
{
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ int ret;
if (!dir_emit_dots(file, ctx))
return 0;
- return bch2_readdir(c, inode_inum(inode), ctx);
+ ret = bch2_readdir(c, inode_inum(inode), ctx);
+ if (ret)
+ bch_err_fn(c, ret);
+
+ return bch2_err_class(ret);
}
static const struct file_operations bch_file_operations = {
.inum = inode->ei_inode.bi_dir,
};
- if (!parent_inum.inum)
- return NULL;
-
return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum));
}
struct bch_inode_info *inode = to_bch_ei(child->d_inode);
struct bch_inode_info *dir = to_bch_ei(parent->d_inode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_trans trans;
+ struct btree_trans *trans;
struct btree_iter iter1;
struct btree_iter iter2;
struct bkey_s_c k;
struct bch_inode_unpacked inode_u;
subvol_inum target;
u32 snapshot;
- unsigned name_len;
+ struct qstr dirent_name;
+ unsigned name_len = 0;
int ret;
if (!S_ISDIR(dir->v.i_mode))
return -EINVAL;
- bch2_trans_init(&trans, c, 0, 0);
+ trans = bch2_trans_get(c);
- bch2_trans_iter_init(&trans, &iter1, BTREE_ID_dirents,
+ bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents,
POS(dir->ei_inode.bi_inum, 0), 0);
- bch2_trans_iter_init(&trans, &iter2, BTREE_ID_dirents,
+ bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents,
POS(dir->ei_inode.bi_inum, 0), 0);
retry:
- bch2_trans_begin(&trans);
+ bch2_trans_begin(trans);
- ret = bch2_subvolume_get_snapshot(&trans, dir->ei_subvol, &snapshot);
+ ret = bch2_subvolume_get_snapshot(trans, dir->ei_subvol, &snapshot);
if (ret)
goto err;
bch2_btree_iter_set_snapshot(&iter1, snapshot);
bch2_btree_iter_set_snapshot(&iter2, snapshot);
- ret = bch2_inode_find_by_inum_trans(&trans, inode_inum(inode), &inode_u);
+ ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u);
if (ret)
goto err;
}
d = bkey_s_c_to_dirent(k);
- ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target);
+ ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
if (ret > 0)
ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
if (ret)
continue;
d = bkey_s_c_to_dirent(k);
- ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target);
+ ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
if (ret < 0)
break;
if (ret)
ret = -ENOENT;
goto err;
found:
- name_len = min_t(unsigned, bch2_dirent_name_bytes(d), NAME_MAX);
+ dirent_name = bch2_dirent_get_name(d);
- memcpy(name, d.v->d_name, name_len);
+ name_len = min_t(unsigned, dirent_name.len, NAME_MAX);
+ memcpy(name, dirent_name.name, name_len);
name[name_len] = '\0';
err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
- bch2_trans_iter_exit(&trans, &iter1);
- bch2_trans_iter_exit(&trans, &iter2);
- bch2_trans_exit(&trans);
+ bch2_trans_iter_exit(trans, &iter1);
+ bch2_trans_iter_exit(trans, &iter2);
+ bch2_trans_put(trans);
return ret;
}
call_rcu(&vinode->i_rcu, bch2_i_callback);
}
-static int inode_update_times_fn(struct bch_inode_info *inode,
+static int inode_update_times_fn(struct btree_trans *trans,
+ struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
void *p)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- bi->bi_atime = timespec_to_bch2_time(c, inode->v.i_atime);
- bi->bi_mtime = timespec_to_bch2_time(c, inode->v.i_mtime);
- bi->bi_ctime = timespec_to_bch2_time(c, inode->v.i_ctime);
+ bi->bi_atime = timespec_to_bch2_time(c, inode_get_atime(&inode->v));
+ bi->bi_mtime = timespec_to_bch2_time(c, inode_get_mtime(&inode->v));
+ bi->bi_ctime = timespec_to_bch2_time(c, inode_get_ctime(&inode->v));
return 0;
}
static char **split_devs(const char *_dev_name, unsigned *nr)
{
char *dev_name = NULL, **devs = NULL, *s;
- size_t i, nr_devs = 0;
+ size_t i = 0, nr_devs = 0;
dev_name = kstrdup(_dev_name, GFP_KERNEL);
if (!dev_name)
return NULL;
}
- for (i = 0, s = dev_name;
- s;
- (s = strchr(s, ':')) && (*s++ = '\0'))
+ while ((s = strsep(&dev_name, ":")))
devs[i++] = s;
*nr = nr_devs;
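For reference (not from the patch): strsep() consumes its input in place, NUL-terminating each field and returning NULL once the string is exhausted, which is why the open-coded strchr() loop could be replaced above. A standalone model:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char *dup = strdup("/dev/sda:/dev/sdb:/dev/sdc");
	char *p = dup, *s;

	/* strsep() advances p past each ':' and terminates the field */
	while ((s = strsep(&p, ":")))
		printf("device: %s\n", s);

	free(dup);
	return 0;
}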
up_write(&c->state_lock);
}
- if (opts.errors >= 0)
+ if (opt_defined(opts, errors))
c->opts.errors = opts.errors;
err:
return bch2_err_class(ret);
__bch2_fs_stop(c);
}
+/*
+ * bcachefs doesn't currently integrate with the VFS's intwrite freeze
+ * protection, but its internal write references serve the same purpose, so we
+ * reuse the read-only transition code to perform the quiesce. The caveat is
+ * that we don't currently have the ability to block tasks that want a write
+ * reference while the superblock is frozen. This is fine for now, but we
+ * should either add blocking support or find a way to integrate
+ * sb_start_intwrite() and friends.
+ */
+static int bch2_freeze(struct super_block *sb)
+{
+ struct bch_fs *c = sb->s_fs_info;
+
+ down_write(&c->state_lock);
+ bch2_fs_read_only(c);
+ up_write(&c->state_lock);
+ return 0;
+}
+
+static int bch2_unfreeze(struct super_block *sb)
+{
+ struct bch_fs *c = sb->s_fs_info;
+ int ret;
+
+ down_write(&c->state_lock);
+ ret = bch2_fs_read_write(c);
+ up_write(&c->state_lock);
+ return ret;
+}
+
static const struct super_operations bch_super_operations = {
.alloc_inode = bch2_alloc_inode,
.destroy_inode = bch2_destroy_inode,
.show_options = bch2_show_options,
.remount_fs = bch2_remount,
.put_super = bch2_put_super,
-#if 0
.freeze_fs = bch2_freeze,
.unfreeze_fs = bch2_unfreeze,
-#endif
};
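For context (not from the patch): the VFS invokes the freeze_fs/unfreeze_fs methods enabled above when userspace freezes the filesystem, e.g. via the FIFREEZE/FITHAW ioctls. A minimal userspace sketch exercising them (requires CAP_SYS_ADMIN):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>	/* FIFREEZE, FITHAW */
#include <unistd.h>

/* Freeze and immediately thaw a mounted filesystem; the kernel routes
 * these ioctls through the superblock's freeze_fs/unfreeze_fs hooks. */
int main(int argc, char **argv)
{
	int fd, arg = 0;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <mountpoint>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || ioctl(fd, FIFREEZE, &arg) || ioctl(fd, FITHAW, &arg)) {
		perror("freeze/thaw");
		return 1;
	}
	close(fd);
	return 0;
}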
static int bch2_set_super(struct super_block *s, void *data)
vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
ret = PTR_ERR_OR_ZERO(vinode);
if (ret) {
- bch_err(c, "error mounting: error getting root inode: %s", bch2_err_str(ret));
+ bch_err_msg(c, ret, "mounting: error getting root inode");
goto err_put_super;
}
struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum);
/* returns 0 if we want to do the update, or error is passed up */
-typedef int (*inode_set_fn)(struct bch_inode_info *,
+typedef int (*inode_set_fn)(struct btree_trans *,
+ struct bch_inode_info *,
struct bch_inode_unpacked *, void *);
void bch2_inode_update_after_write(struct btree_trans *,
#else
-#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) do {} while (0)
+#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) ({ do {} while (0); })
static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
snapshot_id_list *s) {}
#include "bcachefs.h"
#include "bkey_buf.h"
+#include "btree_cache.h"
#include "btree_update.h"
#include "buckets.h"
#include "darray.h"
#include "fsck.h"
#include "inode.h"
#include "keylist.h"
-#include "subvolume.h"
+#include "recovery.h"
+#include "snapshot.h"
#include "super.h"
#include "xattr.h"
if (!ret)
*subvol = le32_to_cpu(s.subvol);
else if (bch2_err_matches(ret, ENOENT))
- bch_err(trans->c, "snapshot %u not fonud", snapshot);
+ bch_err(trans->c, "snapshot %u not found", snapshot);
return ret;
}
ret = bch2_inode_unpack(k, inode);
err:
- if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
- bch_err(trans->c, "error fetching inode %llu: %s",
- inode_nr, bch2_err_str(ret));
+ bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
if (!ret)
*snapshot = iter.pos.snapshot;
err:
- if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
- bch_err(trans->c, "error fetching inode %llu:%u: %s",
- inode_nr, *snapshot, bch2_err_str(ret));
+ bch_err_msg(trans->c, ret, "fetching inode %llu:%u", inode_nr, *snapshot);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
}
-static int write_inode(struct btree_trans *trans,
- struct bch_inode_unpacked *inode,
- u32 snapshot)
+static int fsck_write_inode(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode,
+ u32 snapshot)
{
int ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_lazy_rw,
__write_inode(trans, inode, snapshot));
if (ret)
- bch_err(trans->c, "error in fsck: error updating inode: %s",
- bch2_err_str(ret));
+ bch_err_fn(trans->c, ret);
return ret;
}
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
bch2_trans_iter_exit(trans, &iter);
err:
- if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
goto create_lostfound;
}
- if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
- bch_err(c, "error looking up lost+found: %s", bch2_err_str(ret));
+ bch_err_fn(c, ret);
if (ret)
return ret;
if (d_type != DT_DIR) {
bch_err(c, "error looking up lost+found: not a directory");
- return ret;
+ return -BCH_ERR_ENOENT_not_directory;
}
/*
lostfound, &lostfound_str,
0, 0, S_IFDIR|0700, 0, NULL, NULL,
(subvol_inum) { }, 0);
- if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
- bch_err(c, "error creating lost+found: %s", bch2_err_str(ret));
+ bch_err_msg(c, ret, "creating lost+found");
return ret;
}
u32 inode_snapshot)
{
int ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL,
+ BCH_TRANS_COMMIT_lazy_rw|
+ BCH_TRANS_COMMIT_no_enospc,
__reattach_inode(trans, inode, inode_snapshot));
- if (ret) {
- bch_err(trans->c, "error reattaching inode %llu: %s",
- inode->bi_inum, bch2_err_str(ret));
- return ret;
- }
-
+ bch_err_msg(trans->c, ret, "reattaching inode %llu", inode->bi_inum);
return ret;
}
memset(s, 0, sizeof(*s));
}
+static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
+{
+ struct snapshots_seen_entry *i, n = {
+ .id = id,
+ .equiv = bch2_snapshot_equiv(c, id),
+ };
+ int ret = 0;
+
+ darray_for_each(s->ids, i) {
+ if (i->id == id)
+ return 0;
+ if (i->id > id)
+ break;
+ }
+
+ ret = darray_insert_item(&s->ids, i - s->ids.data, n);
+ if (ret)
+ bch_err(c, "error reallocating snapshots_seen table (size %zu)",
+ s->ids.size);
+ return ret;
+}
+
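snapshots_seen_add_inorder() keeps s->ids sorted by id: it scans for the first entry with a larger id, bails out early on a duplicate, and inserts at that position. A self-contained userspace model of the same scan-then-insert shape (plain array instead of a darray; all names hypothetical, bounds checks omitted):

#include <stdio.h>
#include <string.h>

#define MAX_IDS 16

/* Insert id into a sorted array, skipping duplicates; returns new count. */
static unsigned insert_inorder(unsigned *ids, unsigned nr, unsigned id)
{
	unsigned i;

	for (i = 0; i < nr; i++) {
		if (ids[i] == id)
			return nr;	/* already present */
		if (ids[i] > id)
			break;		/* insert position found */
	}

	memmove(&ids[i + 1], &ids[i], (nr - i) * sizeof(*ids));
	ids[i] = id;
	return nr + 1;
}

int main(void)
{
	unsigned ids[MAX_IDS], nr = 0;

	nr = insert_inorder(ids, nr, 4);
	nr = insert_inorder(ids, nr, 1);
	nr = insert_inorder(ids, nr, 4);	/* duplicate: ignored */

	for (unsigned i = 0; i < nr; i++)
		printf("%u ", ids[i]);		/* prints: 1 4 */
	printf("\n");
	return 0;
}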
static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
enum btree_id btree_id, struct bpos pos)
{
if (i->equiv == n.equiv) {
bch_err(c, "snapshot deletion did not finish:\n"
" duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n",
- bch2_btree_ids[btree_id],
+ bch2_btree_id_str(btree_id),
pos.inode, pos.offset,
i->id, n.id, n.equiv);
+ set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags);
return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots);
}
}
 * key_visible_in_snapshot - returns true if @id is a descendant of @ancestor,
* and @ancestor hasn't been overwritten in @seen
*
- * That is, returns whether key in @ancestor snapshot is visible in @id snapshot
+ * @c: filesystem handle
+ * @seen: list of snapshot ids already seen at current position
+ * @id: descendant snapshot id
+ * @ancestor: ancestor snapshot id
+ *
+ * Returns: whether key in @ancestor snapshot is visible in @id snapshot
*/
static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
u32 id, u32 ancestor)
* snapshot id @dst, test whether there is some snapshot in which @dst is
* visible.
*
- * This assumes we're visiting @src keys in natural key order.
+ * @c: filesystem handle
+ * @s: list of snapshot IDs already seen at @src
+ * @src: snapshot ID of src key
+ * @dst: snapshot ID of dst key
+ * Returns: true if there is some snapshot in which @dst is visible
*
- * @s - list of snapshot IDs already seen at @src
- * @src - snapshot ID of src key
- * @dst - snapshot ID of dst key
+ * Assumes we're visiting @src keys in natural key order
*/
-static int ref_visible(struct bch_fs *c, struct snapshots_seen *s,
- u32 src, u32 dst)
+static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
+ u32 src, u32 dst)
{
return dst <= src
? key_visible_in_snapshot(c, s, dst, src)
w->first_this_inode = true;
- if (trans_was_restarted(trans, restart_count))
- return -BCH_ERR_transaction_restart_nested;
-
- return 0;
+ return trans_was_restarted(trans, restart_count);
}
static struct inode_walker_entry *
int ret = 0;
if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c,
- "key in missing snapshot: %s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ bkey_in_missing_snapshot,
+ "key in missing snapshot: %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
ret = bch2_btree_delete_at(trans, iter,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1;
fsck_err:
BCH_HASH_SET_MUST_CREATE,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW);
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_lazy_rw);
}
static int hash_check_key(struct btree_trans *trans,
if (fsck_err_on(k.k->type == desc.key_type &&
!desc.cmp_bkey(k, hash_k), c,
+ hash_table_key_duplicate,
"duplicate hash table keys:\n%s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, hash_k),
printbuf_exit(&buf);
return ret;
bad_hash:
- if (fsck_err(c, "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
- bch2_btree_ids[desc.btree_id], hash_k.k->p.inode, hash_k.k->p.offset, hash,
+ if (fsck_err(c, hash_table_key_wrong_offset,
+ "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
+ bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
- if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
- bch_err(c, "hash_redo_key err %s", bch2_err_str(ret));
+ bch_err_fn(c, ret);
if (ret)
return ret;
ret = -BCH_ERR_transaction_restart_nested;
if (ret)
goto err;
- /*
- * if snapshot id isn't a leaf node, skip it - deletion in
- * particular is not atomic, so on the internal snapshot nodes
- * we can see inodes marked for deletion after a clean shutdown
- */
- if (bch2_snapshot_is_internal_node(c, k.k->p.snapshot))
- return 0;
-
if (!bkey_is_inode(k.k))
return 0;
BUG_ON(bch2_inode_unpack(k, &u));
if (!full &&
- !(u.bi_flags & (BCH_INODE_I_SIZE_DIRTY|
- BCH_INODE_I_SECTORS_DIRTY|
- BCH_INODE_UNLINKED)))
+ !(u.bi_flags & (BCH_INODE_i_size_dirty|
+ BCH_INODE_i_sectors_dirty|
+ BCH_INODE_unlinked)))
return 0;
if (prev->bi_inum != u.bi_inum)
*prev = u;
if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed ||
- inode_d_type(prev) != inode_d_type(&u), c,
+ inode_d_type(prev) != inode_d_type(&u),
+ c, inode_snapshot_mismatch,
"inodes in different snapshots don't match")) {
bch_err(c, "repair not implemented yet");
return -EINVAL;
}
- if (u.bi_flags & BCH_INODE_UNLINKED &&
+ if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) &&
+ bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) {
+ struct bpos new_min_pos;
+
+ ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos);
+ if (ret)
+ goto err;
+
+ u.bi_flags &= ~(BCH_INODE_i_size_dirty|BCH_INODE_unlinked);
+
+ ret = __write_inode(trans, &u, iter->pos.snapshot);
+ bch_err_msg(c, ret, "in fsck updating inode");
+ if (ret)
+ return ret;
+
+ if (!bpos_eq(new_min_pos, POS_MIN))
+ bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos));
+ return 0;
+ }
+
+ if (u.bi_flags & BCH_INODE_unlinked &&
(!c->sb.clean ||
- fsck_err(c, "filesystem marked clean, but inode %llu unlinked",
+ fsck_err(c, inode_unlinked_but_clean,
+ "filesystem marked clean, but inode %llu unlinked",
u.bi_inum))) {
bch2_trans_unlock(trans);
bch2_fs_lazy_rw(c);
ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
- if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
- bch_err(c, "error in fsck: error while deleting inode: %s",
- bch2_err_str(ret));
+ bch_err_msg(c, ret, "in fsck deleting inode");
return ret;
}
- if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY &&
+ if (u.bi_flags & BCH_INODE_i_size_dirty &&
(!c->sb.clean ||
- fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty",
+ fsck_err(c, inode_i_size_dirty_but_clean,
+ "filesystem marked clean, but inode %llu has i_size dirty",
u.bi_inum))) {
bch_verbose(c, "truncating inode %llu", u.bi_inum);
iter->pos.snapshot),
POS(u.bi_inum, U64_MAX),
0, NULL);
- if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
- bch_err(c, "error in fsck: error truncating inode: %s",
- bch2_err_str(ret));
+ bch_err_msg(c, ret, "in fsck truncating inode");
if (ret)
return ret;
* We truncated without our normal sector accounting hook, just
* make sure we recalculate it:
*/
- u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY;
+ u.bi_flags |= BCH_INODE_i_sectors_dirty;
- u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
+ u.bi_flags &= ~BCH_INODE_i_size_dirty;
do_update = true;
}
- if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY &&
+ if (u.bi_flags & BCH_INODE_i_sectors_dirty &&
(!c->sb.clean ||
- fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty",
+ fsck_err(c, inode_i_sectors_dirty_but_clean,
+ "filesystem marked clean, but inode %llu has i_sectors dirty",
u.bi_inum))) {
s64 sectors;
sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
if (sectors < 0) {
- bch_err(c, "error in fsck: error recounting inode sectors: %s",
- bch2_err_str(sectors));
+ bch_err_msg(c, sectors, "in fsck recounting inode sectors");
return sectors;
}
u.bi_sectors = sectors;
- u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY;
+ u.bi_flags &= ~BCH_INODE_i_sectors_dirty;
do_update = true;
}
- if (u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) {
+ if (u.bi_flags & BCH_INODE_backptr_untrusted) {
u.bi_dir = 0;
u.bi_dir_offset = 0;
- u.bi_flags &= ~BCH_INODE_BACKPTR_UNTRUSTED;
+ u.bi_flags &= ~BCH_INODE_backptr_untrusted;
do_update = true;
}
if (do_update) {
ret = __write_inode(trans, &u, iter->pos.snapshot);
+ bch_err_msg(c, ret, "in fsck updating inode");
if (ret)
- bch_err(c, "error in fsck: error updating inode: %s",
- bch2_err_str(ret));
+ return ret;
}
err:
fsck_err:
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
int bch2_check_inodes(struct bch_fs *c)
{
bool full = c->opts.fsck;
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bch_inode_unpacked prev = { 0 };
struct snapshots_seen s;
int ret;
snapshots_seen_init(&s);
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes,
+ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
POS_MIN,
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
- check_inode(&trans, &iter, k, &prev, &s, full));
+ NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
+ check_inode(trans, &iter, k, &prev, &s, full));
- bch2_trans_exit(&trans);
snapshots_seen_exit(&s);
- if (ret)
- bch_err_fn(c, ret);
+ bch2_trans_put(trans);
+ bch_err_fn(c, ret);
return ret;
}
: le64_to_cpu(d.v->d_inum) == inode->bi_inum;
}
-static int inode_backpointer_exists(struct btree_trans *trans,
- struct bch_inode_unpacked *inode,
- u32 snapshot)
-{
- struct btree_iter iter;
- struct bkey_s_c_dirent d;
- int ret;
-
- d = dirent_get_by_pos(trans, &iter,
- SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot));
- ret = bkey_err(d);
- if (ret)
- return bch2_err_matches(ret, ENOENT) ? 0 : ret;
-
- ret = dirent_points_to_inode(d, inode);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
{
struct bch_fs *c = trans->c;
return -BCH_ERR_internal_fsck_err;
}
- if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c,
- "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
- w->last_pos.inode, i->snapshot,
- i->inode.bi_sectors, i->count)) {
+ if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
+ c, inode_i_sectors_wrong,
+ "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
+ w->last_pos.inode, i->snapshot,
+ i->inode.bi_sectors, i->count)) {
i->inode.bi_sectors = i->count;
- ret = write_inode(trans, &i->inode, i->snapshot);
+ ret = fsck_write_inode(trans, &i->inode, i->snapshot);
if (ret)
break;
}
}
fsck_err:
- if (ret)
- bch_err_fn(c, ret);
- if (!ret && trans_was_restarted(trans, restart_count))
- ret = -BCH_ERR_transaction_restart_nested;
- return ret;
+ bch_err_fn(c, ret);
+ return ret ?: trans_was_restarted(trans, restart_count);
}
struct extent_end {
static int overlapping_extents_found(struct btree_trans *trans,
enum btree_id btree,
- struct bpos pos1, struct bkey pos2,
+ struct bpos pos1, struct snapshots_seen *pos1_seen,
+ struct bkey pos2,
bool *fixed,
struct extent_end *extent_end)
{
prt_printf(&buf, "\n overwriting %s extent",
pos1.snapshot >= pos2.p.snapshot ? "first" : "second");
- if (fsck_err(c, "overlapping extents%s", buf.buf)) {
+ if (fsck_err(c, extent_overlapping,
+ "overlapping extents%s", buf.buf)) {
struct btree_iter *old_iter = &iter1;
struct disk_reservation res = { 0 };
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
k1, k2) ?:
bch2_trans_commit(trans, &res, NULL,
- BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc);
bch2_disk_reservation_put(c, &res);
if (ret)
*fixed = true;
- if (pos1.snapshot == pos2.p.snapshot)
+ if (pos1.snapshot == pos2.p.snapshot) {
+ /*
+ * We overwrote the first extent, and did the overwrite
+ * in the same snapshot:
+ */
extent_end->offset = bkey_start_offset(&pos2);
- else
+ } else if (pos1.snapshot > pos2.p.snapshot) {
+ /*
+ * We overwrote the first extent in pos2's snapshot:
+ */
+ ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
+ } else {
+ /*
+ * We overwrote the second extent - restart
+ * check_extent() from the top:
+ */
ret = -BCH_ERR_transaction_restart_nested;
+ }
}
fsck_err:
err:
SPOS(iter->pos.inode,
i->offset,
i->snapshot),
+ &i->seen,
*k.k, fixed, i);
if (ret)
goto err;
return ret;
}
+static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ struct bch_extent_crc_unpacked crc;
+ const union bch_extent_entry *i;
+ unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;
+
+ bkey_for_each_crc(k.k, ptrs, crc, i)
+ if (crc_is_encoded(crc) &&
+ crc.uncompressed_size > encoded_extent_max_sectors) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch_err(c, "overbig encoded extent, please report this:\n %s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ return 0;
+}
+
static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_s_c k,
struct inode_walker *inode,
goto err;
if (k.k->type != KEY_TYPE_whiteout) {
- if (fsck_err_on(!i, c,
+ if (fsck_err_on(!i, c, extent_in_missing_inode,
"extent in missing inode:\n %s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
if (fsck_err_on(i &&
!S_ISREG(i->inode.bi_mode) &&
- !S_ISLNK(i->inode.bi_mode), c,
+ !S_ISLNK(i->inode.bi_mode),
+ c, extent_in_non_reg_inode,
"extent in non regular inode mode %o:\n %s",
i->inode.bi_mode,
(printbuf_reset(&buf),
continue;
if (k.k->type != KEY_TYPE_whiteout) {
- if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
+ if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) &&
k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
- !bkey_extent_is_reservation(k), c,
+ !bkey_extent_is_reservation(k),
+ c, extent_past_end_of_inode,
"extent type past end of inode %llu:%u, i_size %llu\n %s",
i->inode.bi_inum, i->snapshot, i->inode.bi_size,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
err:
fsck_err:
printbuf_exit(&buf);
-
- if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
delete:
ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
{
struct inode_walker w = inode_walker_init();
struct snapshots_seen s;
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
struct extent_ends extent_ends;
snapshots_seen_init(&s);
extent_ends_init(&extent_ends);
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096);
- ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_extents,
+ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
POS(BCACHEFS_ROOT_INO, 0),
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
&res, NULL,
- BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({
+ BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc, ({
bch2_disk_reservation_put(c, &res);
- check_extent(&trans, &iter, k, &w, &s, &extent_ends);
+ check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
+ check_extent_overbig(trans, &iter, k);
})) ?:
- check_i_sectors(&trans, &w);
+ check_i_sectors(trans, &w);
bch2_disk_reservation_put(c, &res);
extent_ends_exit(&extent_ends);
inode_walker_exit(&w);
- bch2_trans_exit(&trans);
snapshots_seen_exit(&s);
+ bch2_trans_put(trans);
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+int bch2_check_indirect_extents(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct disk_reservation res = { 0 };
+ int ret = 0;
+
+ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
+ POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ &res, NULL,
+ BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc, ({
+ bch2_disk_reservation_put(c, &res);
+ check_extent_overbig(trans, &iter, k);
+ }));
+
+ bch2_disk_reservation_put(c, &res);
+ bch2_trans_put(trans);
+
+ bch_err_fn(c, ret);
return ret;
}
continue;
}
- if (fsck_err_on(i->inode.bi_nlink != i->count, c,
+ if (fsck_err_on(i->inode.bi_nlink != i->count,
+ c, inode_dir_wrong_nlink,
"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
i->inode.bi_nlink = i->count;
- ret = write_inode(trans, &i->inode, i->snapshot);
+ ret = fsck_write_inode(trans, &i->inode, i->snapshot);
if (ret)
break;
}
}
fsck_err:
- if (ret)
- bch_err_fn(c, ret);
- if (!ret && trans_was_restarted(trans, restart_count))
- ret = -BCH_ERR_transaction_restart_nested;
- return ret;
+ bch_err_fn(c, ret);
+ return ret ?: trans_was_restarted(trans, restart_count);
}
static int check_dirent_target(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct bkey_i_dirent *n;
- bool backpointer_exists = true;
struct printbuf buf = PRINTBUF;
+ struct btree_iter bp_iter = { NULL };
int ret = 0;
if (!target->bi_dir &&
}
if (!inode_points_to_dirent(target, d)) {
- ret = inode_backpointer_exists(trans, target, d.k->p.snapshot);
- if (ret < 0)
+ struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
+ SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
+ ret = bkey_err(bp_dirent);
+ if (ret && !bch2_err_matches(ret, ENOENT))
goto err;
- backpointer_exists = ret;
+ bool backpointer_exists = !ret;
ret = 0;
- if (fsck_err_on(S_ISDIR(target->bi_mode) &&
- backpointer_exists, c,
- "directory %llu with multiple links",
- target->bi_inum)) {
+ bch2_bkey_val_to_text(&buf, c, d.s_c);
+ prt_newline(&buf);
+ if (backpointer_exists)
+ bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
+
+ if (fsck_err_on(S_ISDIR(target->bi_mode) && backpointer_exists,
+ c, inode_dir_multiple_links,
+ "directory %llu:%u with multiple links\n%s",
+ target->bi_inum, target_snapshot, buf.buf)) {
ret = __remove_dirent(trans, d.k->p);
goto out;
}
- if (fsck_err_on(backpointer_exists &&
- !target->bi_nlink, c,
- "inode %llu type %s has multiple links but i_nlink 0",
- target->bi_inum, bch2_d_types[d.v->d_type])) {
+ /*
+ * hardlinked file with nlink 0:
+ * We're just adjusting nlink here so check_nlinks() will pick
+ * it up; it ignores inodes with nlink 0
+ */
+ if (fsck_err_on(backpointer_exists && !target->bi_nlink,
+ c, inode_multiple_links_but_nlink_0,
+ "inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
+ target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
target->bi_nlink++;
- target->bi_flags &= ~BCH_INODE_UNLINKED;
+ target->bi_flags &= ~BCH_INODE_unlinked;
ret = __write_inode(trans, target, target_snapshot);
if (ret)
goto err;
}
- if (fsck_err_on(!backpointer_exists, c,
+ if (fsck_err_on(!backpointer_exists,
+ c, inode_wrong_backpointer,
"inode %llu:%u has wrong backpointer:\n"
"got %llu:%llu\n"
"should be %llu:%llu",
}
}
- if (fsck_err_on(d.v->d_type != inode_d_type(target), c,
+ if (fsck_err_on(d.v->d_type != inode_d_type(target),
+ c, dirent_d_type_wrong,
"incorrect d_type: got %s, should be %s:\n%s",
bch2_d_type_str(d.v->d_type),
bch2_d_type_str(inode_d_type(target)),
if (d.v->d_type == DT_SUBVOL &&
target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) &&
(c->sb.version < bcachefs_metadata_version_subvol_dirent ||
- fsck_err(c, "dirent has wrong d_parent_subvol field: got %u, should be %u",
+ fsck_err(c, dirent_d_parent_subvol_wrong,
+ "dirent has wrong d_parent_subvol field: got %u, should be %u",
le32_to_cpu(d.v->d_parent_subvol),
target->bi_parent_subvol))) {
n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
out:
err:
fsck_err:
+ bch2_trans_iter_exit(trans, &bp_iter);
printbuf_exit(&buf);
-
- if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
*hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
dir->first_this_inode = false;
- if (fsck_err_on(!i, c,
+ if (fsck_err_on(!i, c, dirent_in_missing_dir_inode,
"dirent in nonexisting directory:\n%s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
if (!i)
goto out;
- if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c,
+ if (fsck_err_on(!S_ISDIR(i->inode.bi_mode),
+ c, dirent_in_non_dir_inode,
"dirent in non directory inode type %s:\n%s",
bch2_d_type_str(inode_d_type(&i->inode)),
(printbuf_reset(&buf),
if (ret && !bch2_err_matches(ret, ENOENT))
goto err;
- if (fsck_err_on(ret, c,
+ if (fsck_err_on(ret, c, dirent_to_missing_subvol,
"dirent points to missing subvolume %u",
le32_to_cpu(d.v->d_child_subvol))) {
ret = __remove_dirent(trans, d.k->p);
if (ret && !bch2_err_matches(ret, ENOENT))
goto err;
- if (fsck_err_on(ret, c,
+ if (fsck_err_on(ret, c, subvol_to_missing_root,
"subvolume %u points to missing subvolume root %llu",
target_subvol,
target_inum)) {
goto err;
}
- if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c,
+ if (fsck_err_on(subvol_root.bi_subvol != target_subvol,
+ c, subvol_root_wrong_bi_subvol,
"subvol root %llu has wrong bi_subvol field: got %u, should be %u",
target_inum,
subvol_root.bi_subvol, target_subvol)) {
if (ret)
goto err;
- if (fsck_err_on(!target->inodes.nr, c,
+ if (fsck_err_on(!target->inodes.nr,
+ c, dirent_to_missing_inode,
"dirent points to missing inode: (equiv %u)\n%s",
equiv.snapshot,
(printbuf_reset(&buf),
err:
fsck_err:
printbuf_exit(&buf);
-
- if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
struct inode_walker target = inode_walker_init();
struct snapshots_seen s;
struct bch_hash_info hash_info;
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
int ret = 0;
snapshots_seen_init(&s);
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_dirents,
+ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
POS(BCACHEFS_ROOT_INO, 0),
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
k,
NULL, NULL,
- BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
- check_dirent(&trans, &iter, k, &hash_info, &dir, &target, &s));
+ BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
+ check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s));
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
snapshots_seen_exit(&s);
inode_walker_exit(&dir);
inode_walker_exit(&target);
-
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
*hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode);
inode->first_this_inode = false;
- if (fsck_err_on(!i, c,
+ if (fsck_err_on(!i, c, xattr_in_missing_inode,
"xattr for missing inode %llu",
k.k->p.inode))
return bch2_btree_delete_at(trans, iter, 0);
ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
fsck_err:
- if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
{
struct inode_walker inode = inode_walker_init();
struct bch_hash_info hash_info;
- struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
int ret = 0;
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-
- ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs,
+ ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
POS(BCACHEFS_ROOT_INO, 0),
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
k,
NULL, NULL,
- BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
- check_xattr(&trans, &iter, k, &hash_info, &inode));
-
- bch2_trans_exit(&trans);
-
- if (ret)
- bch_err_fn(c, ret);
+ BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
+ check_xattr(trans, &iter, k, &hash_info, &inode)));
+ bch_err_fn(c, ret);
return ret;
}
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
- if (mustfix_fsck_err_on(ret, c, "root subvol missing")) {
+ if (mustfix_fsck_err_on(ret, c, root_subvol_missing,
+ "root subvol missing")) {
struct bkey_i_subvolume root_subvol;
snapshot = U32_MAX;
root_subvol.v.snapshot = cpu_to_le32(snapshot);
root_subvol.v.inode = cpu_to_le64(inum);
ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
- __bch2_btree_insert(trans, BTREE_ID_subvolumes,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_lazy_rw,
+ bch2_btree_insert_trans(trans, BTREE_ID_subvolumes,
&root_subvol.k_i, 0));
- if (ret) {
- bch_err(c, "error writing root subvol: %s", bch2_err_str(ret));
+ bch_err_msg(c, ret, "writing root subvol");
+ if (ret)
goto err;
- }
}
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
- if (mustfix_fsck_err_on(ret, c, "root directory missing") ||
- mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), c,
+ if (mustfix_fsck_err_on(ret, c, root_dir_missing,
+ "root directory missing") ||
+ mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
+ c, root_inode_not_dir,
"root inode not a directory")) {
bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
0, NULL);
root_inode.bi_inum = inum;
ret = __write_inode(trans, &root_inode, snapshot);
- if (ret)
- bch_err(c, "error writing root inode: %s", bch2_err_str(ret));
+ bch_err_msg(c, ret, "writing root inode");
}
err:
fsck_err:
int ret;
ret = bch2_trans_do(c, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
- check_root_trans(&trans));
-
- if (ret)
- bch_err_fn(c, ret);
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_lazy_rw,
+ check_root_trans(trans));
+ bch_err_fn(c, ret);
return ret;
}
}
if (bch2_err_matches(ret, ENOENT)) {
- if (fsck_err(c, "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu",
+ if (fsck_err(c, inode_unreachable,
+ "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu",
inode->bi_inum, snapshot,
bch2_d_type_str(inode_d_type(inode)),
inode->bi_nlink,
pr_err("%llu:%u", i->inum, i->snapshot);
pr_err("%llu:%u", inode->bi_inum, snapshot);
- if (!fsck_err(c, "directory structure loop"))
+ if (!fsck_err(c, dir_loop,
+ "directory structure loop"))
return 0;
ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_lazy_rw,
remove_backpointer(trans, inode));
if (ret) {
bch_err(c, "error removing dirent: %i", ret);
}
}
fsck_err:
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
*/
int bch2_check_directory_structure(struct bch_fs *c)
{
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
struct bch_inode_unpacked u;
pathbuf path = { 0, };
int ret;
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-
- for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
+ for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN,
BTREE_ITER_INTENT|
BTREE_ITER_PREFETCH|
BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
break;
}
- if (u.bi_flags & BCH_INODE_UNLINKED)
+ if (u.bi_flags & BCH_INODE_unlinked)
continue;
- ret = check_path(&trans, &path, &u, iter.pos.snapshot);
+ ret = check_path(trans, &path, &u, iter.pos.snapshot);
if (ret)
break;
}
- bch2_trans_iter_exit(&trans, &iter);
- bch2_trans_exit(&trans);
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
darray_exit(&path);
-
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
-/* check_nlink pass: */
-
struct nlink_table {
size_t nr;
size_t size;
const struct nlink *l = _l;
const struct nlink *r = _r;
- return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot);
+ return cmp_int(l->inum, r->inum);
}
static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
struct nlink_table *t,
u64 start, u64 *end)
{
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
struct bch_inode_unpacked u;
int ret = 0;
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-
- for_each_btree_key(&trans, iter, BTREE_ID_inodes,
+ for_each_btree_key(trans, iter, BTREE_ID_inodes,
POS(0, start),
BTREE_ITER_INTENT|
BTREE_ITER_PREFETCH|
}
}
- bch2_trans_iter_exit(&trans, &iter);
- bch2_trans_exit(&trans);
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
if (ret)
bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
u64 range_start, u64 range_end)
{
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct snapshots_seen s;
struct btree_iter iter;
struct bkey_s_c k;
snapshots_seen_init(&s);
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-
- for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN,
+ for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
BTREE_ITER_INTENT|
BTREE_ITER_PREFETCH|
BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
break;
}
}
- bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_iter_exit(trans, &iter);
if (ret)
bch_err(c, "error in fsck: btree error %i while walking dirents", ret);
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
snapshots_seen_exit(&s);
return ret;
}
link = &links->d[++*idx];
}
- if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c,
+ if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
+ c, inode_wrong_nlink,
"inode %llu type %s has wrong i_nlink (%u, should be %u)",
u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
bch2_inode_nlink_get(&u), link->count)) {
struct nlink_table *links,
u64 range_start, u64 range_end)
{
- struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
size_t idx = 0;
int ret = 0;
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-
- ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes,
- POS(0, range_start),
- BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
- check_nlinks_update_inode(&trans, &iter, k, links, &idx, range_end));
-
- bch2_trans_exit(&trans);
-
+ ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
+ POS(0, range_start),
+ BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
+ check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
if (ret < 0) {
- bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
+ bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
return ret;
}
} while (next_iter_range_start != U64_MAX);
kvfree(links.d);
-
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
return 0;
ret = bch2_trans_run(c,
- for_each_btree_key_commit(&trans, iter,
+ for_each_btree_key_commit(trans, iter,
BTREE_ID_extents, POS_MIN,
BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|
BTREE_ITER_ALL_SNAPSHOTS, k,
- NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
- fix_reflink_p_key(&trans, &iter, k)));
-
- if (ret)
- bch_err_fn(c, ret);
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
+ fix_reflink_p_key(trans, &iter, k)));
+ bch_err_fn(c, ret);
return ret;
}
int bch2_check_inodes(struct bch_fs *);
int bch2_check_extents(struct bch_fs *);
+int bch2_check_indirect_extents(struct bch_fs *);
int bch2_check_dirents(struct bch_fs *);
int bch2_check_xattrs(struct bch_fs *);
int bch2_check_root(struct bch_fs *);
#include "bkey_methods.h"
#include "btree_update.h"
#include "buckets.h"
+#include "compress.h"
#include "error.h"
#include "extents.h"
#include "extent_update.h"
#include "inode.h"
#include "str_hash.h"
+#include "snapshot.h"
#include "subvolume.h"
#include "varint.h"
#include <asm/unaligned.h>
-const char * const bch2_inode_opts[] = {
#define x(name, ...) #name,
+const char * const bch2_inode_opts[] = {
BCH_INODE_OPTS()
-#undef x
NULL,
};
+static const char * const bch2_inode_flag_strs[] = {
+ BCH_INODE_FLAGS()
+ NULL
+};
+#undef x
+
static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
static int inode_decode_field(const u8 *in, const u8 *end,
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
struct bch_inode_unpacked unpacked;
- int ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i),
- &unpacked);
+ ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), &unpacked);
BUG_ON(ret);
BUG_ON(unpacked.bi_inum != inode->bi_inum);
BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed);
return bch2_inode_unpack_slowpath(k, unpacked);
}
-int bch2_inode_peek(struct btree_trans *trans,
+static int bch2_inode_peek_nowarn(struct btree_trans *trans,
struct btree_iter *iter,
struct bch_inode_unpacked *inode,
subvol_inum inum, unsigned flags)
return ret;
}
-int bch2_inode_write(struct btree_trans *trans,
+int bch2_inode_peek(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode,
+ subvol_inum inum, unsigned flags)
+{
+ int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags);
+ bch_err_msg(trans->c, ret, "looking up inum %u:%llu:", inum.subvol, inum.inum);
+ return ret;
+}
+
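+/*
+ * Like bch2_inode_write(), but allows passing btree update flags through
+ * to bch2_trans_update() (e.g. BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE):
+ */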
+int bch2_inode_write_flags(struct btree_trans *trans,
struct btree_iter *iter,
- struct bch_inode_unpacked *inode)
+ struct bch_inode_unpacked *inode,
+ enum btree_update_flags flags)
{
struct bkey_inode_buf *inode_p;
bch2_inode_pack_inlined(inode_p, inode);
inode_p->inode.k.p.snapshot = iter->snapshot;
- return bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
+ return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags);
}
struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k)
return &inode_p->inode.k_i;
}
-static int __bch2_inode_invalid(struct bkey_s_c k, struct printbuf *err)
+static int __bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k, struct printbuf *err)
{
struct bch_inode_unpacked unpacked;
+ int ret = 0;
- if (k.k->p.inode) {
- prt_printf(err, "nonzero k.p.inode");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(k.k->p.inode, c, err,
+ inode_pos_inode_nonzero,
+ "nonzero k.p.inode");
- if (k.k->p.offset < BLOCKDEV_INODE_MAX) {
- prt_printf(err, "fs inode in blockdev range");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX, c, err,
+ inode_pos_blockdev_range,
+ "fs inode in blockdev range");
- if (bch2_inode_unpack(k, &unpacked)) {
- prt_printf(err, "invalid variable length fields");
- return -BCH_ERR_invalid_bkey;
- }
-
- if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) {
- prt_printf(err, "invalid data checksum type (%u >= %u",
- unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1);
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked), c, err,
+ inode_unpack_error,
+ "invalid variable length fields");
- if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) {
- prt_printf(err, "invalid data checksum type (%u >= %u)",
- unpacked.bi_compression, BCH_COMPRESSION_OPT_NR + 1);
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1, c, err,
+ inode_checksum_type_invalid,
+ "invalid data checksum type (%u >= %u",
+ unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1);
- if ((unpacked.bi_flags & BCH_INODE_UNLINKED) &&
- unpacked.bi_nlink != 0) {
- prt_printf(err, "flagged as unlinked but bi_nlink != 0");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(unpacked.bi_compression &&
+ !bch2_compression_opt_valid(unpacked.bi_compression - 1), c, err,
+ inode_compression_type_invalid,
+ "invalid compression opt %u", unpacked.bi_compression - 1);
- if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) {
- prt_printf(err, "subvolume root but not a directory");
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on((unpacked.bi_flags & BCH_INODE_unlinked) &&
+ unpacked.bi_nlink != 0, c, err,
+ inode_unlinked_but_nlink_nonzero,
+ "flagged as unlinked but bi_nlink != 0");
- return 0;
+ bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode), c, err,
+ inode_subvol_root_but_not_dir,
+ "subvolume root but not a directory");
+fsck_err:
+ return ret;
}
-int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
+ int ret = 0;
- if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) {
- prt_printf(err, "invalid str hash type (%llu >= %u)",
- INODE_STR_HASH(inode.v), BCH_STR_HASH_NR);
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
+ inode_str_hash_invalid,
+ "invalid str hash type (%llu >= %u)",
+ INODE_STR_HASH(inode.v), BCH_STR_HASH_NR);
- return __bch2_inode_invalid(k, err);
+ ret = __bch2_inode_invalid(c, k, err);
+fsck_err:
+ return ret;
}
-int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_inode_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
+ int ret = 0;
- if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR) {
- prt_printf(err, "invalid str hash type (%llu >= %u)",
- INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR);
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
+ inode_str_hash_invalid,
+ "invalid str hash type (%llu >= %u)",
+ INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR);
- return __bch2_inode_invalid(k, err);
+ ret = __bch2_inode_invalid(c, k, err);
+fsck_err:
+ return ret;
}
-int bch2_inode_v3_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_inode_v3_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
+ int ret = 0;
- if (INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL ||
- INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k)) {
- prt_printf(err, "invalid fields_start (got %llu, min %u max %zu)",
- INODEv3_FIELDS_START(inode.v),
- INODEv3_FIELDS_START_INITIAL,
- bkey_val_u64s(inode.k));
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL ||
+ INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k), c, err,
+ inode_v3_fields_start_bad,
+ "invalid fields_start (got %llu, min %u max %zu)",
+ INODEv3_FIELDS_START(inode.v),
+ INODEv3_FIELDS_START_INITIAL,
+ bkey_val_u64s(inode.k));
- if (INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR) {
- prt_printf(err, "invalid str hash type (%llu >= %u)",
- INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR);
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
+ inode_str_hash_invalid,
+ "invalid str hash type (%llu >= %u)",
+ INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR);
- return __bch2_inode_invalid(k, err);
+ ret = __bch2_inode_invalid(c, k, err);
+fsck_err:
+ return ret;
}
static void __bch2_inode_unpacked_to_text(struct printbuf *out,
struct bch_inode_unpacked *inode)
{
- prt_printf(out, "mode %o flags %x journal_seq %llu bi_size %llu bi_sectors %llu bi_version %llu",
- inode->bi_mode, inode->bi_flags,
+ prt_printf(out, "mode=%o ", inode->bi_mode);
+
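+ /* print named flag bits, then the raw flags word: */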
+ prt_str(out, "flags=");
+ prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1));
+ prt_printf(out, " (%x)", inode->bi_flags);
+
+ prt_printf(out, " journal_seq=%llu bi_size=%llu bi_sectors=%llu bi_version=%llu",
inode->bi_journal_seq,
inode->bi_size,
inode->bi_sectors,
inode->bi_version);
#define x(_name, _bits) \
- prt_printf(out, " "#_name " %llu", (u64) inode->_name);
+ prt_printf(out, " "#_name "=%llu", (u64) inode->_name);
BCH_INODE_FIELDS_v3()
#undef x
}
__bch2_inode_unpacked_to_text(out, &inode);
}
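+/*
+ * Flags accessor that handles all inode key versions; v1 inodes store a
+ * 32-bit flags field, v2/v3 store 64 bits:
+ */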
-static inline bool bkey_is_deleted_inode(struct bkey_s_c k)
+static inline u64 bkey_inode_flags(struct bkey_s_c k)
{
switch (k.k->type) {
case KEY_TYPE_inode:
- return bkey_s_c_to_inode(k).v->bi_flags &
- cpu_to_le32(BCH_INODE_UNLINKED);
+ return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags);
case KEY_TYPE_inode_v2:
- return bkey_s_c_to_inode_v2(k).v->bi_flags &
- cpu_to_le32(BCH_INODE_UNLINKED);
+ return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags);
case KEY_TYPE_inode_v3:
- return bkey_s_c_to_inode_v3(k).v->bi_flags &
- cpu_to_le64(BCH_INODE_UNLINKED);
+ return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags);
default:
- return false;
+ return 0;
}
}
+static inline bool bkey_is_deleted_inode(struct bkey_s_c k)
+{
+ return bkey_inode_flags(k) & BCH_INODE_unlinked;
+}
+
int bch2_trans_mark_inode(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old,
return 0;
}
-int bch2_inode_generation_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_inode_generation_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
- if (k.k->p.inode) {
- prt_printf(err, "nonzero k.p.inode");
- return -BCH_ERR_invalid_bkey;
- }
+ int ret = 0;
- return 0;
+ bkey_fsck_err_on(k.k->p.inode, c, err,
+ inode_pos_inode_nonzero,
+ "nonzero k.p.inode");
+fsck_err:
+ return ret;
}
void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c,
struct btree_iter iter;
struct bkey_s_c k;
struct bkey_i delete;
+ struct bpos end = POS(inum.inum, U64_MAX);
u32 snapshot;
int ret = 0;
* extent iterator:
*/
bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0),
- BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS);
+ BTREE_ITER_INTENT);
while (1) {
bch2_trans_begin(trans);
bch2_btree_iter_set_snapshot(&iter, snapshot);
- k = bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX));
+ k = bch2_btree_iter_peek_upto(&iter, end);
ret = bkey_err(k);
if (ret)
goto err;
bkey_init(&delete.k);
delete.k.p = iter.pos;
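+ /*
+ * In extents btrees, deletions must be sized to the extent being
+ * deleted - emit a whiteout covering it, clamped to end:
+ */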
+ if (iter.flags & BTREE_ITER_IS_EXTENTS)
+ bch2_key_resize(&delete.k,
+ bpos_min(end, k.k->p).offset -
+ iter.pos.offset);
+
ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
err:
if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
break;
int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
{
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter = { NULL };
struct bkey_i_inode_generation delete;
struct bch_inode_unpacked inode_u;
u32 snapshot;
int ret;
- bch2_trans_init(&trans, c, 0, 1024);
-
/*
* If this was a directory, there shouldn't be any real dirents left -
* but there could be whiteouts (from hash collisions) that we should
* XXX: the dirent could ideally would delete whiteouts when they're no
* longer needed
*/
- ret = bch2_inode_delete_keys(&trans, inum, BTREE_ID_extents) ?:
- bch2_inode_delete_keys(&trans, inum, BTREE_ID_xattrs) ?:
- bch2_inode_delete_keys(&trans, inum, BTREE_ID_dirents);
+ ret = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?:
+ bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?:
+ bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents);
if (ret)
goto err;
retry:
- bch2_trans_begin(&trans);
+ bch2_trans_begin(trans);
- ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
goto err;
- k = bch2_bkey_get_iter(&trans, &iter, BTREE_ID_inodes,
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
SPOS(0, inum.inum, snapshot),
BTREE_ITER_INTENT|BTREE_ITER_CACHED);
ret = bkey_err(k);
goto err;
if (!bkey_is_inode(k.k)) {
- bch2_fs_inconsistent(trans.c,
+ bch2_fs_inconsistent(c,
"inode %llu:%u not found when deleting",
inum.inum, snapshot);
ret = -EIO;
delete.k.p = iter.pos;
delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
- ret = bch2_trans_update(&trans, &iter, &delete.k_i, 0) ?:
- bch2_trans_commit(&trans, NULL, NULL,
- BTREE_INSERT_NOFAIL);
+ ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc);
err:
- bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_iter_exit(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
+ return ret;
+}
+
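+/*
+ * Like bch2_inode_find_by_inum_trans(), but uses bch2_inode_peek_nowarn()
+ * so lookup failures aren't logged:
+ */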
+int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans,
+ subvol_inum inum,
+ struct bch_inode_unpacked *inode)
+{
+ struct btree_iter iter;
+ int ret;
+
+ ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0);
+ if (!ret)
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
struct bch_inode_unpacked *inode)
{
return bch2_trans_do(c, NULL, NULL, 0,
- bch2_inode_find_by_inum_trans(&trans, inum, inode));
+ bch2_inode_find_by_inum_trans(trans, inum, inode));
}
int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
{
- if (bi->bi_flags & BCH_INODE_UNLINKED)
- bi->bi_flags &= ~BCH_INODE_UNLINKED;
+ if (bi->bi_flags & BCH_INODE_unlinked)
+ bi->bi_flags &= ~BCH_INODE_unlinked;
else {
if (bi->bi_nlink == U32_MAX)
return -EINVAL;
void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi)
{
- if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_UNLINKED)) {
+ if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_unlinked)) {
bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero",
bi->bi_inum);
return;
}
- if (bi->bi_flags & BCH_INODE_UNLINKED) {
+ if (bi->bi_flags & BCH_INODE_unlinked) {
bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum);
return;
}
if (bi->bi_nlink)
bi->bi_nlink--;
else
- bi->bi_flags |= BCH_INODE_UNLINKED;
+ bi->bi_flags |= BCH_INODE_unlinked;
}
struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode)
opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0;
}
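+/*
+ * Look up an inode by subvol:inum and compute its effective IO options:
+ */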
+int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts)
+{
+ struct bch_inode_unpacked inode;
+ int ret = lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, &inode));
+
+ if (ret)
+ return ret;
+
+ bch2_inode_opts_get(opts, trans->c, &inode);
+ return 0;
+}
+
int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
{
struct bch_fs *c = trans->c;
ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
err:
bch2_trans_iter_exit(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
return ret ?: -BCH_ERR_transaction_restart_nested;
}
-static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos)
+static int may_delete_deleted_inode(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos pos,
+ bool *need_another_pass)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter;
+ struct btree_iter inode_iter;
struct bkey_s_c k;
struct bch_inode_unpacked inode;
int ret;
- if (bch2_snapshot_is_internal_node(c, pos.snapshot))
- return 0;
-
- if (!fsck_err_on(c->sb.clean, c,
- "filesystem marked as clean but have deleted inode %llu:%u",
- pos.offset, pos.snapshot))
- return 0;
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, pos, BTREE_ITER_CACHED);
+ k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_CACHED);
ret = bkey_err(k);
if (ret)
return ret;
ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
if (fsck_err_on(!bkey_is_inode(k.k), c,
+ deleted_inode_missing,
"nonexistent inode %llu:%u in deleted_inodes btree",
pos.offset, pos.snapshot))
goto delete;
ret = bch2_inode_unpack(k, &inode);
if (ret)
- goto err;
+ goto out;
+
+ if (fsck_err_on(S_ISDIR(inode.bi_mode), c,
+ deleted_inode_is_dir,
+ "directory %llu:%u in deleted_inodes btree",
+ pos.offset, pos.snapshot))
+ goto delete;
- if (fsck_err_on(!(inode.bi_flags & BCH_INODE_UNLINKED), c,
+ if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked), c,
+ deleted_inode_not_unlinked,
"non-deleted inode %llu:%u in deleted_inodes btree",
pos.offset, pos.snapshot))
goto delete;
- return 1;
-err:
+ if (c->sb.clean &&
+ !fsck_err(c,
+ deleted_inode_but_clean,
+ "filesystem marked as clean but have deleted inode %llu:%u",
+ pos.offset, pos.snapshot)) {
+ ret = 0;
+ goto out;
+ }
+
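+ /*
+ * Interior snapshot node: this inode is visible in multiple
+ * snapshot leaves, so first copy it out to each leaf, then clear
+ * the unlinked flag on this interior version:
+ */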
+ if (bch2_snapshot_is_internal_node(c, pos.snapshot)) {
+ struct bpos new_min_pos;
+
+ ret = bch2_propagate_key_to_snapshot_leaves(trans, inode_iter.btree_id, k, &new_min_pos);
+ if (ret)
+ goto out;
+
+ inode.bi_flags &= ~BCH_INODE_unlinked;
+
+ ret = bch2_inode_write_flags(trans, &inode_iter, &inode,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ bch_err_msg(c, ret, "clearing inode unlinked flag");
+ if (ret)
+ goto out;
+
+ /*
+ * We'll need another write buffer flush to pick up the new
+ * unlinked inodes in the snapshot leaves:
+ */
+ *need_another_pass = true;
+ goto out;
+ }
+
+ ret = 1;
+out:
fsck_err:
+ bch2_trans_iter_exit(trans, &inode_iter);
return ret;
delete:
- return bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false);
+ ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false);
+ goto out;
}
int bch2_delete_dead_inodes(struct bch_fs *c)
{
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
+ bool need_another_pass;
int ret;
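+ /*
+ * may_delete_deleted_inode() sets need_another_pass when it has
+ * pushed unlinked inodes down to snapshot leaves; loop until none
+ * remain:
+ */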
+again:
+ need_another_pass = false;
- bch2_trans_init(&trans, c, 0, 0);
-
- ret = bch2_btree_write_buffer_flush_sync(&trans);
+ ret = bch2_btree_write_buffer_flush_sync(trans);
if (ret)
goto err;
* but we can't retry because the btree write buffer won't have been
* flushed and we'd spin:
*/
- for_each_btree_key(&trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
+ for_each_btree_key(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
- ret = lockrestart_do(&trans, may_delete_deleted_inode(&trans, k.k->p));
+ ret = commit_do(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_lazy_rw,
+ may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass));
if (ret < 0)
break;
if (ret) {
if (!test_bit(BCH_FS_RW, &c->flags)) {
- bch2_trans_unlock(&trans);
+ bch2_trans_unlock(trans);
bch2_fs_lazy_rw(c);
}
- ret = bch2_inode_rm_snapshot(&trans, k.k->p.offset, k.k->p.snapshot);
+ bch_verbose(c, "deleting unlinked inode %llu:%u", k.k->p.offset, k.k->p.snapshot);
+
+ ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot);
if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
break;
}
}
- bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (!ret && need_another_pass)
+ goto again;
err:
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
return ret;
}
#define _BCACHEFS_INODE_H
#include "bkey.h"
+#include "bkey_methods.h"
#include "opts.h"
enum bkey_invalid_flags;
extern const char * const bch2_inode_opts[];
-int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_inode_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
-int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_inode_v2_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
-int bch2_inode_v3_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_inode_v3_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
k->type == KEY_TYPE_inode_v3;
}
-int bch2_inode_generation_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_inode_generation_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
int bch2_inode_peek(struct btree_trans *, struct btree_iter *,
struct bch_inode_unpacked *, subvol_inum, unsigned);
-int bch2_inode_write(struct btree_trans *, struct btree_iter *,
- struct bch_inode_unpacked *);
+
+int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *,
+ struct bch_inode_unpacked *, enum btree_update_flags);
+
+static inline int bch2_inode_write(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode)
+{
+ return bch2_inode_write_flags(trans, iter, inode, 0);
+}
void bch2_inode_init_early(struct bch_fs *,
struct bch_inode_unpacked *);
int bch2_inode_rm(struct bch_fs *, subvol_inum);
+int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *,
+ subvol_inum,
+ struct bch_inode_unpacked *);
int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum,
struct bch_inode_unpacked *);
int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum,
static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi)
{
- return bi->bi_flags & BCH_INODE_UNLINKED
+ return bi->bi_flags & BCH_INODE_unlinked
? 0
: bi->bi_nlink + nlink_bias(bi->bi_mode);
}
{
if (nlink) {
bi->bi_nlink = nlink - nlink_bias(bi->bi_mode);
- bi->bi_flags &= ~BCH_INODE_UNLINKED;
+ bi->bi_flags &= ~BCH_INODE_unlinked;
} else {
bi->bi_nlink = 0;
- bi->bi_flags |= BCH_INODE_UNLINKED;
+ bi->bi_flags |= BCH_INODE_unlinked;
}
}
struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *);
void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *,
struct bch_inode_unpacked *);
+int bch2_inum_opts_get(struct btree_trans *, subvol_inum, struct bch_io_opts *);
int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32);
int bch2_delete_dead_inodes(struct bch_fs *);
+++ /dev/null
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Some low level IO code, and hacks for various block layer limitations
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "bkey_buf.h"
-#include "bset.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "compress.h"
-#include "clock.h"
-#include "data_update.h"
-#include "debug.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "error.h"
-#include "extent_update.h"
-#include "inode.h"
-#include "io.h"
-#include "journal.h"
-#include "keylist.h"
-#include "move.h"
-#include "nocow_locking.h"
-#include "rebalance.h"
-#include "subvolume.h"
-#include "super.h"
-#include "super-io.h"
-#include "trace.h"
-
-#include <linux/blkdev.h>
-#include <linux/prefetch.h>
-#include <linux/random.h>
-#include <linux/sched/mm.h>
-
-const char *bch2_blk_status_to_str(blk_status_t status)
-{
- if (status == BLK_STS_REMOVED)
- return "device removed";
- return blk_status_to_str(status);
-}
-
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-
-static bool bch2_target_congested(struct bch_fs *c, u16 target)
-{
- const struct bch_devs_mask *devs;
- unsigned d, nr = 0, total = 0;
- u64 now = local_clock(), last;
- s64 congested;
- struct bch_dev *ca;
-
- if (!target)
- return false;
-
- rcu_read_lock();
- devs = bch2_target_to_mask(c, target) ?:
- &c->rw_devs[BCH_DATA_user];
-
- for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
- ca = rcu_dereference(c->devs[d]);
- if (!ca)
- continue;
-
- congested = atomic_read(&ca->congested);
- last = READ_ONCE(ca->congested_last);
- if (time_after64(now, last))
- congested -= (now - last) >> 12;
-
- total += max(congested, 0LL);
- nr++;
- }
- rcu_read_unlock();
-
- return bch2_rand_range(nr * CONGESTED_MAX) < total;
-}
-
-static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
- u64 now, int rw)
-{
- u64 latency_capable =
- ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
- /* ideally we'd be taking into account the device's variance here: */
- u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
- s64 latency_over = io_latency - latency_threshold;
-
- if (latency_threshold && latency_over > 0) {
- /*
- * bump up congested by approximately latency_over * 4 /
- * latency_threshold - we don't need much accuracy here so don't
- * bother with the divide:
- */
- if (atomic_read(&ca->congested) < CONGESTED_MAX)
- atomic_add(latency_over >>
- max_t(int, ilog2(latency_threshold) - 2, 0),
- &ca->congested);
-
- ca->congested_last = now;
- } else if (atomic_read(&ca->congested) > 0) {
- atomic_dec(&ca->congested);
- }
-}
-
-void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
-{
- atomic64_t *latency = &ca->cur_latency[rw];
- u64 now = local_clock();
- u64 io_latency = time_after64(now, submit_time)
- ? now - submit_time
- : 0;
- u64 old, new, v = atomic64_read(latency);
-
- do {
- old = v;
-
- /*
- * If the io latency was reasonably close to the current
- * latency, skip doing the update and atomic operation - most of
- * the time:
- */
- if (abs((int) (old - io_latency)) < (old >> 1) &&
- now & ~(~0U << 5))
- break;
-
- new = ewma_add(old, io_latency, 5);
- } while ((v = atomic64_cmpxchg(latency, old, new)) != old);
-
- bch2_congested_acct(ca, io_latency, now, rw);
-
- __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
-}
-
-#else
-
-static bool bch2_target_congested(struct bch_fs *c, u16 target)
-{
- return false;
-}
-
-#endif
-
-/* Allocate, free from mempool: */
-
-void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
-{
- struct bvec_iter_all iter;
- struct bio_vec *bv;
-
- bio_for_each_segment_all(bv, bio, iter)
- if (bv->bv_page != ZERO_PAGE(0))
- mempool_free(bv->bv_page, &c->bio_bounce_pages);
- bio->bi_vcnt = 0;
-}
-
-static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
-{
- struct page *page;
-
- if (likely(!*using_mempool)) {
- page = alloc_page(GFP_NOFS);
- if (unlikely(!page)) {
- mutex_lock(&c->bio_bounce_pages_lock);
- *using_mempool = true;
- goto pool_alloc;
-
- }
- } else {
-pool_alloc:
- page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS);
- }
-
- return page;
-}
-
-void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
- size_t size)
-{
- bool using_mempool = false;
-
- while (size) {
- struct page *page = __bio_alloc_page_pool(c, &using_mempool);
- unsigned len = min_t(size_t, PAGE_SIZE, size);
-
- BUG_ON(!bio_add_page(bio, page, len, 0));
- size -= len;
- }
-
- if (using_mempool)
- mutex_unlock(&c->bio_bounce_pages_lock);
-}
-
-/* Extent update path: */
-
-int bch2_sum_sector_overwrites(struct btree_trans *trans,
- struct btree_iter *extent_iter,
- struct bkey_i *new,
- bool *usage_increasing,
- s64 *i_sectors_delta,
- s64 *disk_sectors_delta)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c old;
- unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new));
- bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new));
- int ret = 0;
-
- *usage_increasing = false;
- *i_sectors_delta = 0;
- *disk_sectors_delta = 0;
-
- bch2_trans_copy_iter(&iter, extent_iter);
-
- for_each_btree_key_upto_continue_norestart(iter,
- new->k.p, BTREE_ITER_SLOTS, old, ret) {
- s64 sectors = min(new->k.p.offset, old.k->p.offset) -
- max(bkey_start_offset(&new->k),
- bkey_start_offset(old.k));
-
- *i_sectors_delta += sectors *
- (bkey_extent_is_allocation(&new->k) -
- bkey_extent_is_allocation(old.k));
-
- *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new));
- *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot
- ? sectors * bch2_bkey_nr_ptrs_fully_allocated(old)
- : 0;
-
- if (!*usage_increasing &&
- (new->k.p.snapshot != old.k->p.snapshot ||
- new_replicas > bch2_bkey_replicas(c, old) ||
- (!new_compressed && bch2_bkey_sectors_compressed(old))))
- *usage_increasing = true;
-
- if (bkey_ge(old.k->p, new->k.p))
- break;
- }
-
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
- struct btree_iter *extent_iter,
- u64 new_i_size,
- s64 i_sectors_delta)
-{
- struct btree_iter iter;
- struct bkey_i *k;
- struct bkey_i_inode_v3 *inode;
- unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL;
- int ret;
-
- k = bch2_bkey_get_mut_noupdate(trans, &iter, BTREE_ID_inodes,
- SPOS(0,
- extent_iter->pos.inode,
- extent_iter->snapshot),
- BTREE_ITER_CACHED);
- ret = PTR_ERR_OR_ZERO(k);
- if (unlikely(ret))
- return ret;
-
- if (unlikely(k->k.type != KEY_TYPE_inode_v3)) {
- k = bch2_inode_to_v3(trans, k);
- ret = PTR_ERR_OR_ZERO(k);
- if (unlikely(ret))
- goto err;
- }
-
- inode = bkey_i_to_inode_v3(k);
-
- if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_I_SIZE_DIRTY) &&
- new_i_size > le64_to_cpu(inode->v.bi_size)) {
- inode->v.bi_size = cpu_to_le64(new_i_size);
- inode_update_flags = 0;
- }
-
- if (i_sectors_delta) {
- le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta);
- inode_update_flags = 0;
- }
-
- if (inode->k.p.snapshot != iter.snapshot) {
- inode->k.p.snapshot = iter.snapshot;
- inode_update_flags = 0;
- }
-
- ret = bch2_trans_update(trans, &iter, &inode->k_i,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
- inode_update_flags);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_extent_update(struct btree_trans *trans,
- subvol_inum inum,
- struct btree_iter *iter,
- struct bkey_i *k,
- struct disk_reservation *disk_res,
- u64 new_i_size,
- s64 *i_sectors_delta_total,
- bool check_enospc)
-{
- struct bpos next_pos;
- bool usage_increasing;
- s64 i_sectors_delta = 0, disk_sectors_delta = 0;
- int ret;
-
- /*
- * This traverses us the iterator without changing iter->path->pos to
- * search_key() (which is pos + 1 for extents): we want there to be a
- * path already traversed at iter->pos because
- * bch2_trans_extent_update() will use it to attempt extent merging
- */
- ret = __bch2_btree_iter_traverse(iter);
- if (ret)
- return ret;
-
- ret = bch2_extent_trim_atomic(trans, iter, k);
- if (ret)
- return ret;
-
- next_pos = k->k.p;
-
- ret = bch2_sum_sector_overwrites(trans, iter, k,
- &usage_increasing,
- &i_sectors_delta,
- &disk_sectors_delta);
- if (ret)
- return ret;
-
- if (disk_res &&
- disk_sectors_delta > (s64) disk_res->sectors) {
- ret = bch2_disk_reservation_add(trans->c, disk_res,
- disk_sectors_delta - disk_res->sectors,
- !check_enospc || !usage_increasing
- ? BCH_DISK_RESERVATION_NOFAIL : 0);
- if (ret)
- return ret;
- }
-
- /*
- * Note:
- * We always have to do an inode update - even when i_size/i_sectors
- * aren't changing - for fsync to work properly; fsync relies on
- * inode->bi_journal_seq which is updated by the trigger code:
- */
- ret = bch2_extent_update_i_size_sectors(trans, iter,
- min(k->k.p.offset << 9, new_i_size),
- i_sectors_delta) ?:
- bch2_trans_update(trans, iter, k, 0) ?:
- bch2_trans_commit(trans, disk_res, NULL,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL);
- if (unlikely(ret))
- return ret;
-
- if (i_sectors_delta_total)
- *i_sectors_delta_total += i_sectors_delta;
- bch2_btree_iter_set_pos(iter, next_pos);
- return 0;
-}
-
-/* Overwrites whatever was present with zeroes: */
-int bch2_extent_fallocate(struct btree_trans *trans,
- subvol_inum inum,
- struct btree_iter *iter,
- unsigned sectors,
- struct bch_io_opts opts,
- s64 *i_sectors_delta,
- struct write_point_specifier write_point)
-{
- struct bch_fs *c = trans->c;
- struct disk_reservation disk_res = { 0 };
- struct closure cl;
- struct open_buckets open_buckets;
- struct bkey_s_c k;
- struct bkey_buf old, new;
- unsigned sectors_allocated;
- bool have_reservation = false;
- bool unwritten = opts.nocow &&
- c->sb.version >= bcachefs_metadata_version_unwritten_extents;
- int ret;
-
- bch2_bkey_buf_init(&old);
- bch2_bkey_buf_init(&new);
- closure_init_stack(&cl);
- open_buckets.nr = 0;
-retry:
- sectors_allocated = 0;
-
- k = bch2_btree_iter_peek_slot(iter);
- ret = bkey_err(k);
- if (ret)
- return ret;
-
- sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset);
-
- if (!have_reservation) {
- unsigned new_replicas =
- max(0, (int) opts.data_replicas -
- (int) bch2_bkey_nr_ptrs_fully_allocated(k));
- /*
- * Get a disk reservation before (in the nocow case) calling
- * into the allocator:
- */
- ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0);
- if (unlikely(ret))
- goto out;
-
- bch2_bkey_buf_reassemble(&old, c, k);
- }
-
- if (have_reservation) {
- if (!bch2_extents_match(k, bkey_i_to_s_c(old.k)))
- goto out;
-
- bch2_key_resize(&new.k->k, sectors);
- } else if (!unwritten) {
- struct bkey_i_reservation *reservation;
-
- bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64));
- reservation = bkey_reservation_init(new.k);
- reservation->k.p = iter->pos;
- bch2_key_resize(&reservation->k, sectors);
- reservation->v.nr_replicas = opts.data_replicas;
- } else {
- struct bkey_i_extent *e;
- struct bch_devs_list devs_have;
- struct write_point *wp;
- struct bch_extent_ptr *ptr;
-
- devs_have.nr = 0;
-
- bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX);
-
- e = bkey_extent_init(new.k);
- e->k.p = iter->pos;
-
- ret = bch2_alloc_sectors_start_trans(trans,
- opts.foreground_target,
- false,
- write_point,
- &devs_have,
- opts.data_replicas,
- opts.data_replicas,
- BCH_WATERMARK_normal, 0, &cl, &wp);
- if (ret) {
- bch2_trans_unlock(trans);
- closure_sync(&cl);
- if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
- goto retry;
- return ret;
- }
-
- sectors = min(sectors, wp->sectors_free);
- sectors_allocated = sectors;
-
- bch2_key_resize(&e->k, sectors);
-
- bch2_open_bucket_get(c, wp, &open_buckets);
- bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
- bch2_alloc_sectors_done(c, wp);
-
- extent_for_each_ptr(extent_i_to_s(e), ptr)
- ptr->unwritten = true;
- }
-
- have_reservation = true;
-
- ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res,
- 0, i_sectors_delta, true);
-out:
- if ((atomic_read(&cl.remaining) & CLOSURE_REMAINING_MASK) != 1) {
- bch2_trans_unlock(trans);
- closure_sync(&cl);
- }
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
- bch2_trans_begin(trans);
- goto retry;
- }
-
- if (!ret && sectors_allocated)
- bch2_increment_clock(c, sectors_allocated, WRITE);
-
- bch2_open_buckets_put(c, &open_buckets);
- bch2_disk_reservation_put(c, &disk_res);
- bch2_bkey_buf_exit(&new, c);
- bch2_bkey_buf_exit(&old, c);
-
- return ret;
-}
-
-/*
- * Returns -BCH_ERR_transacton_restart if we had to drop locks:
- */
-int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
- subvol_inum inum, u64 end,
- s64 *i_sectors_delta)
-{
- struct bch_fs *c = trans->c;
- unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
- struct bpos end_pos = POS(inum.inum, end);
- struct bkey_s_c k;
- int ret = 0, ret2 = 0;
- u32 snapshot;
-
- while (!ret ||
- bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
- struct disk_reservation disk_res =
- bch2_disk_reservation_init(c, 0);
- struct bkey_i delete;
-
- if (ret)
- ret2 = ret;
-
- bch2_trans_begin(trans);
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- continue;
-
- bch2_btree_iter_set_snapshot(iter, snapshot);
-
- /*
- * peek_upto() doesn't have ideal semantics for extents:
- */
- k = bch2_btree_iter_peek_upto(iter, end_pos);
- if (!k.k)
- break;
-
- ret = bkey_err(k);
- if (ret)
- continue;
-
- bkey_init(&delete.k);
- delete.k.p = iter->pos;
-
- /* create the biggest key we can */
- bch2_key_resize(&delete.k, max_sectors);
- bch2_cut_back(end_pos, &delete);
-
- ret = bch2_extent_update(trans, inum, iter, &delete,
- &disk_res, 0, i_sectors_delta, false);
- bch2_disk_reservation_put(c, &disk_res);
- }
-
- return ret ?: ret2;
-}
-
-int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,
- s64 *i_sectors_delta)
-{
- struct btree_trans trans;
- struct btree_iter iter;
- int ret;
-
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
- POS(inum.inum, start),
- BTREE_ITER_INTENT);
-
- ret = bch2_fpunch_at(&trans, &iter, inum, end, i_sectors_delta);
-
- bch2_trans_iter_exit(&trans, &iter);
- bch2_trans_exit(&trans);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- ret = 0;
-
- return ret;
-}
-
-static int bch2_write_index_default(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct bkey_buf sk;
- struct keylist *keys = &op->insert_keys;
- struct bkey_i *k = bch2_keylist_front(keys);
- struct btree_trans trans;
- struct btree_iter iter;
- subvol_inum inum = {
- .subvol = op->subvol,
- .inum = k->k.p.inode,
- };
- int ret;
-
- BUG_ON(!inum.subvol);
-
- bch2_bkey_buf_init(&sk);
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
-
- do {
- bch2_trans_begin(&trans);
-
- k = bch2_keylist_front(keys);
- bch2_bkey_buf_copy(&sk, c, k);
-
- ret = bch2_subvolume_get_snapshot(&trans, inum.subvol,
- &sk.k->k.p.snapshot);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- break;
-
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
- bkey_start_pos(&sk.k->k),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-
- ret = bch2_extent_update(&trans, inum, &iter, sk.k,
- &op->res,
- op->new_i_size, &op->i_sectors_delta,
- op->flags & BCH_WRITE_CHECK_ENOSPC);
- bch2_trans_iter_exit(&trans, &iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- break;
-
- if (bkey_ge(iter.pos, k->k.p))
- bch2_keylist_pop_front(&op->insert_keys);
- else
- bch2_cut_front(iter.pos, k);
- } while (!bch2_keylist_empty(keys));
-
- bch2_trans_exit(&trans);
- bch2_bkey_buf_exit(&sk, c);
-
- return ret;
-}
-
-/* Writes */
-
-void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
- enum bch_data_type type,
- const struct bkey_i *k,
- bool nocow)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
- const struct bch_extent_ptr *ptr;
- struct bch_write_bio *n;
- struct bch_dev *ca;
-
- BUG_ON(c->opts.nochanges);
-
- bkey_for_each_ptr(ptrs, ptr) {
- BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
- !c->devs[ptr->dev]);
-
- ca = bch_dev_bkey_exists(c, ptr->dev);
-
- if (to_entry(ptr + 1) < ptrs.end) {
- n = to_wbio(bio_alloc_clone(NULL, &wbio->bio,
- GFP_NOFS, &ca->replica_set));
-
- n->bio.bi_end_io = wbio->bio.bi_end_io;
- n->bio.bi_private = wbio->bio.bi_private;
- n->parent = wbio;
- n->split = true;
- n->bounce = false;
- n->put_bio = true;
- n->bio.bi_opf = wbio->bio.bi_opf;
- bio_inc_remaining(&wbio->bio);
- } else {
- n = wbio;
- n->split = false;
- }
-
- n->c = c;
- n->dev = ptr->dev;
- n->have_ioref = nocow || bch2_dev_get_ioref(ca,
- type == BCH_DATA_btree ? READ : WRITE);
- n->nocow = nocow;
- n->submit_time = local_clock();
- n->inode_offset = bkey_start_offset(&k->k);
- n->bio.bi_iter.bi_sector = ptr->offset;
-
- if (likely(n->have_ioref)) {
- this_cpu_add(ca->io_done->sectors[WRITE][type],
- bio_sectors(&n->bio));
-
- bio_set_dev(&n->bio, ca->disk_sb.bdev);
-
- if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) {
- bio_endio(&n->bio);
- continue;
- }
-
- submit_bio(&n->bio);
- } else {
- n->bio.bi_status = BLK_STS_REMOVED;
- bio_endio(&n->bio);
- }
- }
-}
-
-static void __bch2_write(struct bch_write_op *);
-
-static void bch2_write_done(struct closure *cl)
-{
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct bch_fs *c = op->c;
-
- bch2_disk_reservation_put(c, &op->res);
- if (!(op->flags & BCH_WRITE_MOVE))
- bch2_write_ref_put(c, BCH_WRITE_REF_write);
- bch2_keylist_free(&op->insert_keys, op->inline_keys);
-
- bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
-
- EBUG_ON(cl->parent);
- closure_debug_destroy(cl);
- if (op->end_io)
- op->end_io(op);
-}
-
-static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
-{
- struct keylist *keys = &op->insert_keys;
- struct bch_extent_ptr *ptr;
- struct bkey_i *src, *dst = keys->keys, *n;
-
- for (src = keys->keys; src != keys->top; src = n) {
- n = bkey_next(src);
-
- if (bkey_extent_is_direct_data(&src->k)) {
- bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr,
- test_bit(ptr->dev, op->failed.d));
-
- if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src)))
- return -EIO;
- }
-
- if (dst != src)
- memmove_u64s_down(dst, src, src->k.u64s);
- dst = bkey_next(dst);
- }
-
- keys->top = dst;
- return 0;
-}
-
-/**
- * bch_write_index - after a write, update index to point to new data
- */
-static void __bch2_write_index(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct keylist *keys = &op->insert_keys;
- struct bkey_i *k;
- unsigned dev;
- int ret = 0;
-
- if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
- ret = bch2_write_drop_io_error_ptrs(op);
- if (ret)
- goto err;
- }
-
- /*
- * probably not the ideal place to hook this in, but I don't
- * particularly want to plumb io_opts all the way through the btree
- * update stack right now
- */
- for_each_keylist_key(keys, k)
- bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);
-
- if (!bch2_keylist_empty(keys)) {
- u64 sectors_start = keylist_sectors(keys);
-
- ret = !(op->flags & BCH_WRITE_MOVE)
- ? bch2_write_index_default(op)
- : bch2_data_update_index_update(op);
-
- BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
- BUG_ON(keylist_sectors(keys) && !ret);
-
- op->written += sectors_start - keylist_sectors(keys);
-
- if (ret && !bch2_err_matches(ret, EROFS)) {
- struct bkey_i *k = bch2_keylist_front(&op->insert_keys);
-
- bch_err_inum_offset_ratelimited(c,
- k->k.p.inode, k->k.p.offset << 9,
- "write error while doing btree update: %s",
- bch2_err_str(ret));
- }
-
- if (ret)
- goto err;
- }
-out:
- /* If some a bucket wasn't written, we can't erasure code it: */
- for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
- bch2_open_bucket_write_error(c, &op->open_buckets, dev);
-
- bch2_open_buckets_put(c, &op->open_buckets);
- return;
-err:
- keys->top = keys->keys;
- op->error = ret;
- op->flags |= BCH_WRITE_DONE;
- goto out;
-}
-
-static inline void __wp_update_state(struct write_point *wp, enum write_point_state state)
-{
- if (state != wp->state) {
- u64 now = ktime_get_ns();
-
- if (wp->last_state_change &&
- time_after64(now, wp->last_state_change))
- wp->time[wp->state] += now - wp->last_state_change;
- wp->state = state;
- wp->last_state_change = now;
- }
-}
-
-static inline void wp_update_state(struct write_point *wp, bool running)
-{
- enum write_point_state state;
-
- state = running ? WRITE_POINT_running :
- !list_empty(&wp->writes) ? WRITE_POINT_waiting_io
- : WRITE_POINT_stopped;
-
- __wp_update_state(wp, state);
-}
-
-static void bch2_write_index(struct closure *cl)
-{
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct write_point *wp = op->wp;
- struct workqueue_struct *wq = index_update_wq(op);
- unsigned long flags;
-
- if ((op->flags & BCH_WRITE_DONE) &&
- (op->flags & BCH_WRITE_MOVE))
- bch2_bio_free_pages_pool(op->c, &op->wbio.bio);
-
- spin_lock_irqsave(&wp->writes_lock, flags);
- if (wp->state == WRITE_POINT_waiting_io)
- __wp_update_state(wp, WRITE_POINT_waiting_work);
- list_add_tail(&op->wp_list, &wp->writes);
- spin_unlock_irqrestore (&wp->writes_lock, flags);
-
- queue_work(wq, &wp->index_update_work);
-}
-
-static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp)
-{
- op->wp = wp;
-
- if (wp->state == WRITE_POINT_stopped) {
- spin_lock_irq(&wp->writes_lock);
- __wp_update_state(wp, WRITE_POINT_waiting_io);
- spin_unlock_irq(&wp->writes_lock);
- }
-}
-
-void bch2_write_point_do_index_updates(struct work_struct *work)
-{
- struct write_point *wp =
- container_of(work, struct write_point, index_update_work);
- struct bch_write_op *op;
-
- while (1) {
- spin_lock_irq(&wp->writes_lock);
- op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list);
- if (op)
- list_del(&op->wp_list);
- wp_update_state(wp, op != NULL);
- spin_unlock_irq(&wp->writes_lock);
-
- if (!op)
- break;
-
- op->flags |= BCH_WRITE_IN_WORKER;
-
- __bch2_write_index(op);
-
- if (!(op->flags & BCH_WRITE_DONE))
- __bch2_write(op);
- else
- bch2_write_done(&op->cl);
- }
-}
-
-static void bch2_write_endio(struct bio *bio)
-{
- struct closure *cl = bio->bi_private;
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct bch_write_bio *wbio = to_wbio(bio);
- struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
- struct bch_fs *c = wbio->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
-
- if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
- op->pos.inode,
- wbio->inode_offset << 9,
- "data write error: %s",
- bch2_blk_status_to_str(bio->bi_status))) {
- set_bit(wbio->dev, op->failed.d);
- op->flags |= BCH_WRITE_IO_ERROR;
- }
-
- if (wbio->nocow)
- set_bit(wbio->dev, op->devs_need_flush->d);
-
- if (wbio->have_ioref) {
- bch2_latency_acct(ca, wbio->submit_time, WRITE);
- percpu_ref_put(&ca->io_ref);
- }
-
- if (wbio->bounce)
- bch2_bio_free_pages_pool(c, bio);
-
- if (wbio->put_bio)
- bio_put(bio);
-
- if (parent)
- bio_endio(&parent->bio);
- else
- closure_put(cl);
-}
-
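-/*
- * Initialize an extent key at the tail of op->insert_keys for the data just
- * written, adding checksum/compression info and pointers to the buckets that
- * were written to:
- */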
-static void init_append_extent(struct bch_write_op *op,
- struct write_point *wp,
- struct bversion version,
- struct bch_extent_crc_unpacked crc)
-{
- struct bkey_i_extent *e;
-
- op->pos.offset += crc.uncompressed_size;
-
- e = bkey_extent_init(op->insert_keys.top);
- e->k.p = op->pos;
- e->k.size = crc.uncompressed_size;
- e->k.version = version;
-
- if (crc.csum_type ||
- crc.compression_type ||
- crc.nonce)
- bch2_extent_crc_append(&e->k_i, crc);
-
- bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size,
- op->flags & BCH_WRITE_CACHED);
-
- bch2_keylist_push(&op->insert_keys);
-}
-
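-/*
- * Allocate a bounce bio for the output of compression/checksumming: if @buf
- * is non NULL the bio is mapped over the erasure coding buffer, otherwise
- * pages are allocated (preferably from the bounce pool).
- */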
-static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
- struct write_point *wp,
- struct bio *src,
- bool *page_alloc_failed,
- void *buf)
-{
- struct bch_write_bio *wbio;
- struct bio *bio;
- unsigned output_available =
- min(wp->sectors_free << 9, src->bi_iter.bi_size);
- unsigned pages = DIV_ROUND_UP(output_available +
- (buf
- ? ((unsigned long) buf & (PAGE_SIZE - 1))
- : 0), PAGE_SIZE);
-
- pages = min(pages, BIO_MAX_VECS);
-
- bio = bio_alloc_bioset(NULL, pages, 0,
- GFP_NOFS, &c->bio_write);
- wbio = wbio_init(bio);
- wbio->put_bio = true;
- /* copy WRITE_SYNC flag */
- wbio->bio.bi_opf = src->bi_opf;
-
- if (buf) {
- bch2_bio_map(bio, buf, output_available);
- return bio;
- }
-
- wbio->bounce = true;
-
- /*
- * We can't use mempool for more than c->sb.encoded_extent_max
- * worth of pages, but we'd like to allocate more if we can:
- */
- bch2_bio_alloc_pages_pool(c, bio,
- min_t(unsigned, output_available,
- c->opts.encoded_extent_max));
-
- if (bio->bi_iter.bi_size < output_available)
- *page_alloc_failed =
- bch2_bio_alloc_pages(bio,
- output_available -
- bio->bi_iter.bi_size,
- GFP_NOFS) != 0;
-
- return bio;
-}
-
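-/*
- * Recompute the checksum of data already in the write bio with a different
- * checksum type, then narrow the bio and crc to just the live portion of the
- * extent:
- */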
-static int bch2_write_rechecksum(struct bch_fs *c,
- struct bch_write_op *op,
- unsigned new_csum_type)
-{
- struct bio *bio = &op->wbio.bio;
- struct bch_extent_crc_unpacked new_crc;
- int ret;
-
- /* bch2_rechecksum_bio() can't encrypt or decrypt data: */
-
- if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
- bch2_csum_type_is_encryption(new_csum_type))
- new_csum_type = op->crc.csum_type;
-
- ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
- NULL, &new_crc,
- op->crc.offset, op->crc.live_size,
- new_csum_type);
- if (ret)
- return ret;
-
- bio_advance(bio, op->crc.offset << 9);
- bio->bi_iter.bi_size = op->crc.live_size << 9;
- op->crc = new_crc;
- return 0;
-}
-
-static int bch2_write_decrypt(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct nonce nonce = extent_nonce(op->version, op->crc);
- struct bch_csum csum;
- int ret;
-
- if (!bch2_csum_type_is_encryption(op->crc.csum_type))
- return 0;
-
- /*
- * If we need to decrypt data in the write path, we'll no longer be able
- * to verify the existing checksum (poly1305 mac, in this case) after
- * it's decrypted - this is the last point we'll be able to reverify the
- * checksum:
- */
- csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
- if (bch2_crc_cmp(op->crc.csum, csum))
- return -EIO;
-
- ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
- op->crc.csum_type = 0;
- op->crc.csum = (struct bch_csum) { 0, 0 };
- return ret;
-}
-
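-/*
- * Decide what to do with data that is already encoded (checksummed and/or
- * compressed): it may be writeable as-is, or may first need to be decrypted,
- * decompressed or rechecksummed.
- */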
-static enum prep_encoded_ret {
- PREP_ENCODED_OK,
- PREP_ENCODED_ERR,
- PREP_ENCODED_CHECKSUM_ERR,
- PREP_ENCODED_DO_WRITE,
-} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
-{
- struct bch_fs *c = op->c;
- struct bio *bio = &op->wbio.bio;
-
- if (!(op->flags & BCH_WRITE_DATA_ENCODED))
- return PREP_ENCODED_OK;
-
- BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
-
- /* Can we just write the entire extent as is? */
- if (op->crc.uncompressed_size == op->crc.live_size &&
- op->crc.compressed_size <= wp->sectors_free &&
- (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) ||
- op->incompressible)) {
- if (!crc_is_compressed(op->crc) &&
- op->csum_type != op->crc.csum_type &&
- bch2_write_rechecksum(c, op, op->csum_type) &&
- !c->opts.no_data_io)
- return PREP_ENCODED_CHECKSUM_ERR;
-
- return PREP_ENCODED_DO_WRITE;
- }
-
- /*
- * If the data is compressed and we couldn't write the entire extent as
- * is, we have to decompress it:
- */
- if (crc_is_compressed(op->crc)) {
- struct bch_csum csum;
-
- if (bch2_write_decrypt(op))
- return PREP_ENCODED_CHECKSUM_ERR;
-
- /* Last point we can still verify checksum: */
- csum = bch2_checksum_bio(c, op->crc.csum_type,
- extent_nonce(op->version, op->crc),
- bio);
- if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
- return PREP_ENCODED_CHECKSUM_ERR;
-
- if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
- return PREP_ENCODED_ERR;
- }
-
- /*
- * No longer have compressed data after this point - data might be
- * encrypted:
- */
-
- /*
- * If the data is checksummed and we're only writing a subset,
- * rechecksum and adjust bio to point to currently live data:
- */
- if ((op->crc.live_size != op->crc.uncompressed_size ||
- op->crc.csum_type != op->csum_type) &&
- bch2_write_rechecksum(c, op, op->csum_type) &&
- !c->opts.no_data_io)
- return PREP_ENCODED_CHECKSUM_ERR;
-
- /*
- * If we want to compress the data, it has to be decrypted:
- */
- if ((op->compression_opt ||
- bch2_csum_type_is_encryption(op->crc.csum_type) !=
- bch2_csum_type_is_encryption(op->csum_type)) &&
- bch2_write_decrypt(op))
- return PREP_ENCODED_CHECKSUM_ERR;
-
- return PREP_ENCODED_OK;
-}
-
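-/*
- * Write out as much of the remaining data as fits in the space allocated at
- * @wp, compressing/checksumming/encrypting as we go and appending a key for
- * each extent generated. Returns > 0 if there's more data to write once this
- * allocation is used up, 0 if done, or a negative error code.
- */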
-static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
- struct bio **_dst)
-{
- struct bch_fs *c = op->c;
- struct bio *src = &op->wbio.bio, *dst = src;
- struct bvec_iter saved_iter;
- void *ec_buf;
- unsigned total_output = 0, total_input = 0;
- bool bounce = false;
- bool page_alloc_failed = false;
- int ret, more = 0;
-
- BUG_ON(!bio_sectors(src));
-
- ec_buf = bch2_writepoint_ec_buf(c, wp);
-
- switch (bch2_write_prep_encoded_data(op, wp)) {
- case PREP_ENCODED_OK:
- break;
- case PREP_ENCODED_ERR:
- ret = -EIO;
- goto err;
- case PREP_ENCODED_CHECKSUM_ERR:
- goto csum_err;
- case PREP_ENCODED_DO_WRITE:
- /* XXX look for bug here */
- if (ec_buf) {
- dst = bch2_write_bio_alloc(c, wp, src,
- &page_alloc_failed,
- ec_buf);
- bio_copy_data(dst, src);
- bounce = true;
- }
- init_append_extent(op, wp, op->version, op->crc);
- goto do_write;
- }
-
- if (ec_buf ||
- op->compression_opt ||
- (op->csum_type &&
- !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
- (bch2_csum_type_is_encryption(op->csum_type) &&
- !(op->flags & BCH_WRITE_PAGES_OWNED))) {
- dst = bch2_write_bio_alloc(c, wp, src,
- &page_alloc_failed,
- ec_buf);
- bounce = true;
- }
-
- saved_iter = dst->bi_iter;
-
- do {
- struct bch_extent_crc_unpacked crc = { 0 };
- struct bversion version = op->version;
- size_t dst_len, src_len;
-
- if (page_alloc_failed &&
- dst->bi_iter.bi_size < (wp->sectors_free << 9) &&
- dst->bi_iter.bi_size < c->opts.encoded_extent_max)
- break;
-
- BUG_ON(op->compression_opt &&
- (op->flags & BCH_WRITE_DATA_ENCODED) &&
- bch2_csum_type_is_encryption(op->crc.csum_type));
- BUG_ON(op->compression_opt && !bounce);
-
- crc.compression_type = op->incompressible
- ? BCH_COMPRESSION_TYPE_incompressible
- : op->compression_opt
- ? bch2_bio_compress(c, dst, &dst_len, src, &src_len,
- op->compression_opt)
- : 0;
- if (!crc_is_compressed(crc)) {
- dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
- dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);
-
- if (op->csum_type)
- dst_len = min_t(unsigned, dst_len,
- c->opts.encoded_extent_max);
-
- if (bounce) {
- swap(dst->bi_iter.bi_size, dst_len);
- bio_copy_data(dst, src);
- swap(dst->bi_iter.bi_size, dst_len);
- }
-
- src_len = dst_len;
- }
-
- BUG_ON(!src_len || !dst_len);
-
- if (bch2_csum_type_is_encryption(op->csum_type)) {
- if (bversion_zero(version)) {
- version.lo = atomic64_inc_return(&c->key_version);
- } else {
- crc.nonce = op->nonce;
- op->nonce += src_len >> 9;
- }
- }
-
- if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
- !crc_is_compressed(crc) &&
- bch2_csum_type_is_encryption(op->crc.csum_type) ==
- bch2_csum_type_is_encryption(op->csum_type)) {
- u8 compression_type = crc.compression_type;
- u16 nonce = crc.nonce;
- /*
- * Note: when we're using rechecksum(), we need to be
- * checksumming @src because it has all the data our
- * existing checksum covers - if we bounced (because we
- * were trying to compress), @dst will only have the
- * part of the data the new checksum will cover.
- *
- * But normally we want to be checksumming post bounce,
- * because part of the reason for bouncing is so the
- * data can't be modified (by userspace) while it's in
- * flight.
- */
- if (bch2_rechecksum_bio(c, src, version, op->crc,
- &crc, &op->crc,
- src_len >> 9,
- bio_sectors(src) - (src_len >> 9),
- op->csum_type))
- goto csum_err;
- /*
-			 * bch2_rechecksum_bio() sets compression_type on crc from
-			 * op->crc; this isn't always correct, as sometimes we're
-			 * changing an extent from uncompressed to incompressible.
- */
- crc.compression_type = compression_type;
- crc.nonce = nonce;
- } else {
- if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
- bch2_rechecksum_bio(c, src, version, op->crc,
- NULL, &op->crc,
- src_len >> 9,
- bio_sectors(src) - (src_len >> 9),
- op->crc.csum_type))
- goto csum_err;
-
- crc.compressed_size = dst_len >> 9;
- crc.uncompressed_size = src_len >> 9;
- crc.live_size = src_len >> 9;
-
- swap(dst->bi_iter.bi_size, dst_len);
- ret = bch2_encrypt_bio(c, op->csum_type,
- extent_nonce(version, crc), dst);
- if (ret)
- goto err;
-
- crc.csum = bch2_checksum_bio(c, op->csum_type,
- extent_nonce(version, crc), dst);
- crc.csum_type = op->csum_type;
- swap(dst->bi_iter.bi_size, dst_len);
- }
-
- init_append_extent(op, wp, version, crc);
-
- if (dst != src)
- bio_advance(dst, dst_len);
- bio_advance(src, src_len);
- total_output += dst_len;
- total_input += src_len;
- } while (dst->bi_iter.bi_size &&
- src->bi_iter.bi_size &&
- wp->sectors_free &&
- !bch2_keylist_realloc(&op->insert_keys,
- op->inline_keys,
- ARRAY_SIZE(op->inline_keys),
- BKEY_EXTENT_U64s_MAX));
-
- more = src->bi_iter.bi_size != 0;
-
- dst->bi_iter = saved_iter;
-
- if (dst == src && more) {
- BUG_ON(total_output != total_input);
-
- dst = bio_split(src, total_input >> 9,
- GFP_NOFS, &c->bio_write);
- wbio_init(dst)->put_bio = true;
- /* copy WRITE_SYNC flag */
- dst->bi_opf = src->bi_opf;
- }
-
- dst->bi_iter.bi_size = total_output;
-do_write:
- *_dst = dst;
- return more;
-csum_err:
- bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)");
- ret = -EIO;
-err:
- if (to_wbio(dst)->bounce)
- bch2_bio_free_pages_pool(c, dst);
- if (to_wbio(dst)->put_bio)
- bio_put(dst);
-
- return ret;
-}
-
-static bool bch2_extent_is_writeable(struct bch_write_op *op,
- struct bkey_s_c k)
-{
- struct bch_fs *c = op->c;
- struct bkey_s_c_extent e;
- struct extent_ptr_decoded p;
- const union bch_extent_entry *entry;
- unsigned replicas = 0;
-
- if (k.k->type != KEY_TYPE_extent)
- return false;
-
- e = bkey_s_c_to_extent(k);
- extent_for_each_ptr_decode(e, p, entry) {
- if (p.crc.csum_type ||
- crc_is_compressed(p.crc) ||
- p.has_ec)
- return false;
-
- replicas += bch2_extent_ptr_durability(c, &p);
- }
-
- return replicas >= op->opts.data_replicas;
-}
-
-static inline void bch2_nocow_write_unlock(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- const struct bch_extent_ptr *ptr;
- struct bkey_i *k;
-
- for_each_keylist_key(&op->insert_keys, k) {
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
-
- bkey_for_each_ptr(ptrs, ptr)
- bch2_bucket_nocow_unlock(&c->nocow_locks,
- PTR_BUCKET_POS(c, ptr),
- BUCKET_NOCOW_LOCK_UPDATE);
- }
-}
-
-static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i *orig,
- struct bkey_s_c k,
- u64 new_i_size)
-{
- struct bkey_i *new;
- struct bkey_ptrs ptrs;
- struct bch_extent_ptr *ptr;
- int ret;
-
- if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) {
- /* trace this */
- return 0;
- }
-
- new = bch2_bkey_make_mut_noupdate(trans, k);
- ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- return ret;
-
- bch2_cut_front(bkey_start_pos(&orig->k), new);
- bch2_cut_back(orig->k.p, new);
-
- ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
- bkey_for_each_ptr(ptrs, ptr)
- ptr->unwritten = 0;
-
- /*
- * Note that we're not calling bch2_subvol_get_snapshot() in this path -
- * that was done when we kicked off the write, and here it's important
- * that we update the extent that we wrote to - even if a snapshot has
- * since been created. The write is still outstanding, so we're ok
- * w.r.t. snapshot atomicity:
- */
- return bch2_extent_update_i_size_sectors(trans, iter,
- min(new->k.p.offset << 9, new_i_size), 0) ?:
- bch2_trans_update(trans, iter, new,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
-}
-
-static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct btree_trans trans;
- struct btree_iter iter;
- struct bkey_i *orig;
- struct bkey_s_c k;
- int ret;
-
- bch2_trans_init(&trans, c, 0, 0);
-
- for_each_keylist_key(&op->insert_keys, orig) {
- ret = for_each_btree_key_upto_commit(&trans, iter, BTREE_ID_extents,
- bkey_start_pos(&orig->k), orig->k.p,
- BTREE_ITER_INTENT, k,
- NULL, NULL, BTREE_INSERT_NOFAIL, ({
- bch2_nocow_write_convert_one_unwritten(&trans, &iter, orig, k, op->new_i_size);
- }));
-
- if (ret && !bch2_err_matches(ret, EROFS)) {
- struct bkey_i *k = bch2_keylist_front(&op->insert_keys);
-
- bch_err_inum_offset_ratelimited(c,
- k->k.p.inode, k->k.p.offset << 9,
- "write error while doing btree update: %s",
- bch2_err_str(ret));
- }
-
- if (ret) {
- op->error = ret;
- break;
- }
- }
-
- bch2_trans_exit(&trans);
-}
-
-static void __bch2_nocow_write_done(struct bch_write_op *op)
-{
- bch2_nocow_write_unlock(op);
-
- if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
- op->error = -EIO;
- } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN))
- bch2_nocow_write_convert_unwritten(op);
-}
-
-static void bch2_nocow_write_done(struct closure *cl)
-{
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
-
- __bch2_nocow_write_done(op);
- bch2_write_done(cl);
-}
-
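-/*
- * Nocow write path: instead of allocating new space, write to the existing
- * extent's buckets in place. We take nocow locks on those buckets to keep
- * the data from being moved out from under us, and fall back to the normal
- * COW path if the extent isn't writeable in place or a bucket gen is stale.
- */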
-static void bch2_nocow_write(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct btree_trans trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_ptrs_c ptrs;
- const struct bch_extent_ptr *ptr;
- struct {
- struct bpos b;
- unsigned gen;
- struct nocow_lock_bucket *l;
- } buckets[BCH_REPLICAS_MAX];
- unsigned nr_buckets = 0;
- u32 snapshot;
- int ret, i;
-
- if (op->flags & BCH_WRITE_MOVE)
- return;
-
- bch2_trans_init(&trans, c, 0, 0);
-retry:
- bch2_trans_begin(&trans);
-
- ret = bch2_subvolume_get_snapshot(&trans, op->subvol, &snapshot);
- if (unlikely(ret))
- goto err;
-
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
- SPOS(op->pos.inode, op->pos.offset, snapshot),
- BTREE_ITER_SLOTS);
- while (1) {
- struct bio *bio = &op->wbio.bio;
-
- nr_buckets = 0;
-
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
- if (ret)
- break;
-
- /* fall back to normal cow write path? */
- if (unlikely(k.k->p.snapshot != snapshot ||
- !bch2_extent_is_writeable(op, k)))
- break;
-
- if (bch2_keylist_realloc(&op->insert_keys,
- op->inline_keys,
- ARRAY_SIZE(op->inline_keys),
- k.k->u64s))
- break;
-
- /* Get iorefs before dropping btree locks: */
- ptrs = bch2_bkey_ptrs_c(k);
- bkey_for_each_ptr(ptrs, ptr) {
- buckets[nr_buckets].b = PTR_BUCKET_POS(c, ptr);
- buckets[nr_buckets].gen = ptr->gen;
- buckets[nr_buckets].l =
- bucket_nocow_lock(&c->nocow_locks,
- bucket_to_u64(buckets[nr_buckets].b));
-
- prefetch(buckets[nr_buckets].l);
-
- if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE)))
- goto err_get_ioref;
-
- nr_buckets++;
-
- if (ptr->unwritten)
- op->flags |= BCH_WRITE_CONVERT_UNWRITTEN;
- }
-
- /* Unlock before taking nocow locks, doing IO: */
- bkey_reassemble(op->insert_keys.top, k);
- bch2_trans_unlock(&trans);
-
- bch2_cut_front(op->pos, op->insert_keys.top);
- if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN)
- bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);
-
- for (i = 0; i < nr_buckets; i++) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, buckets[i].b.inode);
- struct nocow_lock_bucket *l = buckets[i].l;
- bool stale;
-
- __bch2_bucket_nocow_lock(&c->nocow_locks, l,
- bucket_to_u64(buckets[i].b),
- BUCKET_NOCOW_LOCK_UPDATE);
-
- rcu_read_lock();
- stale = gen_after(*bucket_gen(ca, buckets[i].b.offset), buckets[i].gen);
- rcu_read_unlock();
-
- if (unlikely(stale))
- goto err_bucket_stale;
- }
-
- bio = &op->wbio.bio;
- if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) {
- bio = bio_split(bio, k.k->p.offset - op->pos.offset,
- GFP_KERNEL, &c->bio_write);
- wbio_init(bio)->put_bio = true;
- bio->bi_opf = op->wbio.bio.bi_opf;
- } else {
- op->flags |= BCH_WRITE_DONE;
- }
-
- op->pos.offset += bio_sectors(bio);
- op->written += bio_sectors(bio);
-
- bio->bi_end_io = bch2_write_endio;
- bio->bi_private = &op->cl;
- bio->bi_opf |= REQ_OP_WRITE;
- closure_get(&op->cl);
- bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
- op->insert_keys.top, true);
-
- bch2_keylist_push(&op->insert_keys);
- if (op->flags & BCH_WRITE_DONE)
- break;
- bch2_btree_iter_advance(&iter);
- }
-out:
- bch2_trans_iter_exit(&trans, &iter);
-err:
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
-
- if (ret) {
- bch_err_inum_offset_ratelimited(c,
- op->pos.inode,
- op->pos.offset << 9,
- "%s: btree lookup error %s",
- __func__, bch2_err_str(ret));
- op->error = ret;
- op->flags |= BCH_WRITE_DONE;
- }
-
- bch2_trans_exit(&trans);
-
-	/* fall back to cow write path? */
- if (!(op->flags & BCH_WRITE_DONE)) {
- closure_sync(&op->cl);
- __bch2_nocow_write_done(op);
- op->insert_keys.top = op->insert_keys.keys;
- } else if (op->flags & BCH_WRITE_SYNC) {
- closure_sync(&op->cl);
- bch2_nocow_write_done(&op->cl);
- } else {
- /*
- * XXX
-		 * this needs to run in process context because ei_quota_lock
-		 * is a mutex
- */
- continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op));
- }
- return;
-err_get_ioref:
- for (i = 0; i < nr_buckets; i++)
- percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref);
-
- /* Fall back to COW path: */
- goto out;
-err_bucket_stale:
- while (--i >= 0)
- bch2_bucket_nocow_unlock(&c->nocow_locks,
- buckets[i].b,
- BUCKET_NOCOW_LOCK_UPDATE);
- for (i = 0; i < nr_buckets; i++)
- percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref);
-
- /* We can retry this: */
- ret = -BCH_ERR_transaction_restart;
- goto out;
-}
-
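-/*
- * Core write loop: repeatedly allocate space at a write point and write out
- * as much data as it has room for. Depending on BCH_WRITE_SYNC and whether
- * all the IO could be submitted at once, we either block here until the
- * write completes or continue asynchronously via the write point's worker.
- */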
-static void __bch2_write(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct write_point *wp = NULL;
- struct bio *bio = NULL;
- unsigned nofs_flags;
- int ret;
-
- nofs_flags = memalloc_nofs_save();
-
- if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) {
- bch2_nocow_write(op);
- if (op->flags & BCH_WRITE_DONE)
- goto out_nofs_restore;
- }
-again:
- memset(&op->failed, 0, sizeof(op->failed));
-
- do {
- struct bkey_i *key_to_write;
- unsigned key_to_write_offset = op->insert_keys.top_p -
- op->insert_keys.keys_p;
-
- /* +1 for possible cache device: */
- if (op->open_buckets.nr + op->nr_replicas + 1 >
- ARRAY_SIZE(op->open_buckets.v))
- break;
-
- if (bch2_keylist_realloc(&op->insert_keys,
- op->inline_keys,
- ARRAY_SIZE(op->inline_keys),
- BKEY_EXTENT_U64s_MAX))
- break;
-
- /*
- * The copygc thread is now global, which means it's no longer
- * freeing up space on specific disks, which means that
- * allocations for specific disks may hang arbitrarily long:
- */
- ret = bch2_trans_do(c, NULL, NULL, 0,
- bch2_alloc_sectors_start_trans(&trans,
- op->target,
- op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
- op->write_point,
- &op->devs_have,
- op->nr_replicas,
- op->nr_replicas_required,
- op->watermark,
- op->flags,
- (op->flags & (BCH_WRITE_ALLOC_NOWAIT|
- BCH_WRITE_ONLY_SPECIFIED_DEVS))
- ? NULL : &op->cl, &wp));
- if (unlikely(ret)) {
- if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
- break;
-
- goto err;
- }
-
- EBUG_ON(!wp);
-
- bch2_open_bucket_get(c, wp, &op->open_buckets);
- ret = bch2_write_extent(op, wp, &bio);
-
- bch2_alloc_sectors_done_inlined(c, wp);
-err:
- if (ret <= 0) {
- op->flags |= BCH_WRITE_DONE;
-
- if (ret < 0) {
- op->error = ret;
- break;
- }
- }
-
- bio->bi_end_io = bch2_write_endio;
- bio->bi_private = &op->cl;
- bio->bi_opf |= REQ_OP_WRITE;
-
- closure_get(bio->bi_private);
-
- key_to_write = (void *) (op->insert_keys.keys_p +
- key_to_write_offset);
-
- bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
- key_to_write, false);
- } while (ret);
-
- /*
- * Sync or no?
- *
-	 * If we're running asynchronously, we may still want to block
- * synchronously here if we weren't able to submit all of the IO at
- * once, as that signals backpressure to the caller.
- */
- if ((op->flags & BCH_WRITE_SYNC) ||
- (!(op->flags & BCH_WRITE_DONE) &&
- !(op->flags & BCH_WRITE_IN_WORKER))) {
- closure_sync(&op->cl);
- __bch2_write_index(op);
-
- if (!(op->flags & BCH_WRITE_DONE))
- goto again;
- bch2_write_done(&op->cl);
- } else {
- bch2_write_queue(op, wp);
- continue_at(&op->cl, bch2_write_index, NULL);
- }
-out_nofs_restore:
- memalloc_nofs_restore(nofs_flags);
-}
-
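-/*
- * Writes small enough to be stored inline in the btree key skip the data
- * write entirely: copy the data into an inline_data key, pad the value out
- * to a multiple of 8 bytes, and proceed directly to the btree update.
- */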
-static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
-{
- struct bio *bio = &op->wbio.bio;
- struct bvec_iter iter;
- struct bkey_i_inline_data *id;
- unsigned sectors;
- int ret;
-
- op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
- op->flags |= BCH_WRITE_DONE;
-
- bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);
-
- ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
- ARRAY_SIZE(op->inline_keys),
- BKEY_U64s + DIV_ROUND_UP(data_len, 8));
- if (ret) {
- op->error = ret;
- goto err;
- }
-
- sectors = bio_sectors(bio);
- op->pos.offset += sectors;
-
- id = bkey_inline_data_init(op->insert_keys.top);
- id->k.p = op->pos;
- id->k.version = op->version;
- id->k.size = sectors;
-
- iter = bio->bi_iter;
- iter.bi_size = data_len;
- memcpy_from_bio(id->v.data, bio, iter);
-
- while (data_len & 7)
- id->v.data[data_len++] = '\0';
- set_bkey_val_bytes(&id->k, data_len);
- bch2_keylist_push(&op->insert_keys);
-
- __bch2_write_index(op);
-err:
- bch2_write_done(&op->cl);
-}
-
-/**
- * bch2_write - handle a write to a cache device or flash only volume
- *
- * This is the starting point for any data to end up in a cache device; it could
- * be from a normal write, or a writeback write, or a write to a flash only
- * volume - it's also used by the moving garbage collector to compact data in
- * mostly empty buckets.
- *
- * It first writes the data to the cache, creating a list of keys to be inserted
- * (if the data won't fit in a single open bucket, there will be multiple keys);
- * after the data is written the keys are journalled, and after they've been
- * added to the next journal write they're inserted into the btree.
- *
- * If op->discard is true, instead of inserting the data it invalidates the
- * region of the cache represented by op->bio and op->inode.
- */
-void bch2_write(struct closure *cl)
-{
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct bio *bio = &op->wbio.bio;
- struct bch_fs *c = op->c;
- unsigned data_len;
-
- EBUG_ON(op->cl.parent);
- BUG_ON(!op->nr_replicas);
- BUG_ON(!op->write_point.v);
- BUG_ON(bkey_eq(op->pos, POS_MAX));
-
- op->start_time = local_clock();
- bch2_keylist_init(&op->insert_keys, op->inline_keys);
- wbio_init(bio)->put_bio = false;
-
- if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) {
- bch_err_inum_offset_ratelimited(c,
- op->pos.inode,
- op->pos.offset << 9,
- "misaligned write");
- op->error = -EIO;
- goto err;
- }
-
- if (c->opts.nochanges) {
- op->error = -BCH_ERR_erofs_no_writes;
- goto err;
- }
-
- if (!(op->flags & BCH_WRITE_MOVE) &&
- !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) {
- op->error = -BCH_ERR_erofs_no_writes;
- goto err;
- }
-
- this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio));
- bch2_increment_clock(c, bio_sectors(bio), WRITE);
-
- data_len = min_t(u64, bio->bi_iter.bi_size,
- op->new_i_size - (op->pos.offset << 9));
-
- if (c->opts.inline_data &&
- data_len <= min(block_bytes(c) / 2, 1024U)) {
- bch2_write_data_inline(op, data_len);
- return;
- }
-
- __bch2_write(op);
- return;
-err:
- bch2_disk_reservation_put(c, &op->res);
-
- closure_debug_destroy(&op->cl);
- if (op->end_io)
- op->end_io(op);
-}
-
-static const char * const bch2_write_flags[] = {
-#define x(f) #f,
- BCH_WRITE_FLAGS()
-#undef x
- NULL
-};
-
-void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
-{
- prt_str(out, "pos: ");
- bch2_bpos_to_text(out, op->pos);
- prt_newline(out);
- printbuf_indent_add(out, 2);
-
- prt_str(out, "started: ");
- bch2_pr_time_units(out, local_clock() - op->start_time);
- prt_newline(out);
-
- prt_str(out, "flags: ");
- prt_bitflags(out, bch2_write_flags, op->flags);
- prt_newline(out);
-
- prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl));
- prt_newline(out);
-
- printbuf_indent_sub(out, 2);
-}
-
-/* Cache promotion on read */
-
-struct promote_op {
- struct rcu_head rcu;
- u64 start_time;
-
- struct rhash_head hash;
- struct bpos pos;
-
- struct data_update write;
- struct bio_vec bi_inline_vecs[0]; /* must be last */
-};
-
-static const struct rhashtable_params bch_promote_params = {
- .head_offset = offsetof(struct promote_op, hash),
- .key_offset = offsetof(struct promote_op, pos),
- .key_len = sizeof(struct bpos),
-};
-
-static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k,
- struct bpos pos,
- struct bch_io_opts opts,
- unsigned flags)
-{
- if (!(flags & BCH_READ_MAY_PROMOTE))
- return false;
-
- if (!opts.promote_target)
- return false;
-
- if (bch2_bkey_has_target(c, k, opts.promote_target))
- return false;
-
- if (bkey_extent_is_unwritten(k))
- return false;
-
- if (bch2_target_congested(c, opts.promote_target)) {
- /* XXX trace this */
- return false;
- }
-
- if (rhashtable_lookup_fast(&c->promote_table, &pos,
- bch_promote_params))
- return false;
-
- return true;
-}
-
-static void promote_free(struct bch_fs *c, struct promote_op *op)
-{
- int ret;
-
- bch2_data_update_exit(&op->write);
-
- ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
- bch_promote_params);
- BUG_ON(ret);
- bch2_write_ref_put(c, BCH_WRITE_REF_promote);
- kfree_rcu(op, rcu);
-}
-
-static void promote_done(struct bch_write_op *wop)
-{
- struct promote_op *op =
- container_of(wop, struct promote_op, write.op);
- struct bch_fs *c = op->write.op.c;
-
- bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
- op->start_time);
- promote_free(c, op);
-}
-
-static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
-{
- struct bio *bio = &op->write.op.wbio.bio;
-
- trace_and_count(op->write.op.c, read_promote, &rbio->bio);
-
- /* we now own pages: */
- BUG_ON(!rbio->bounce);
- BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
-
- memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
- sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
- swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
-
- bch2_data_update_read_done(&op->write, rbio->pick.crc);
-}
-
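-/*
- * Allocate a promote operation plus the bounce read bio that feeds it: the
- * data is read into the bounce buffer and then rewritten to the promote
- * target through the data update path.
- */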
-static struct promote_op *__promote_alloc(struct btree_trans *trans,
- enum btree_id btree_id,
- struct bkey_s_c k,
- struct bpos pos,
- struct extent_ptr_decoded *pick,
- struct bch_io_opts opts,
- unsigned sectors,
- struct bch_read_bio **rbio)
-{
- struct bch_fs *c = trans->c;
- struct promote_op *op = NULL;
- struct bio *bio;
- unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
- int ret;
-
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
- return NULL;
-
- op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOFS);
- if (!op)
- goto err;
-
- op->start_time = local_clock();
- op->pos = pos;
-
- /*
- * We don't use the mempool here because extents that aren't
- * checksummed or compressed can be too big for the mempool:
- */
- *rbio = kzalloc(sizeof(struct bch_read_bio) +
- sizeof(struct bio_vec) * pages,
- GFP_NOFS);
- if (!*rbio)
- goto err;
-
- rbio_init(&(*rbio)->bio, opts);
- bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);
-
- if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9,
- GFP_NOFS))
- goto err;
-
- (*rbio)->bounce = true;
- (*rbio)->split = true;
- (*rbio)->kmalloc = true;
-
- if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
- bch_promote_params))
- goto err;
-
- bio = &op->write.op.wbio.bio;
- bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);
-
- ret = bch2_data_update_init(trans, NULL, &op->write,
- writepoint_hashed((unsigned long) current),
- opts,
- (struct data_update_opts) {
- .target = opts.promote_target,
- .extra_replicas = 1,
- .write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED,
- },
- btree_id, k);
- /*
- * possible errors: -BCH_ERR_nocow_lock_blocked,
- * -BCH_ERR_ENOSPC_disk_reservation:
- */
- if (ret) {
- ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
- bch_promote_params);
- BUG_ON(ret);
- goto err;
- }
-
- op->write.op.end_io = promote_done;
-
- return op;
-err:
- if (*rbio)
- bio_free_pages(&(*rbio)->bio);
- kfree(*rbio);
- *rbio = NULL;
- kfree(op);
- bch2_write_ref_put(c, BCH_WRITE_REF_promote);
- return NULL;
-}
-
-noinline
-static struct promote_op *promote_alloc(struct btree_trans *trans,
- struct bvec_iter iter,
- struct bkey_s_c k,
- struct extent_ptr_decoded *pick,
- struct bch_io_opts opts,
- unsigned flags,
- struct bch_read_bio **rbio,
- bool *bounce,
- bool *read_full)
-{
- struct bch_fs *c = trans->c;
- bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
- /* data might have to be decompressed in the write path: */
- unsigned sectors = promote_full
- ? max(pick->crc.compressed_size, pick->crc.live_size)
- : bvec_iter_sectors(iter);
- struct bpos pos = promote_full
- ? bkey_start_pos(k.k)
- : POS(k.k->p.inode, iter.bi_sector);
- struct promote_op *promote;
-
- if (!should_promote(c, k, pos, opts, flags))
- return NULL;
-
- promote = __promote_alloc(trans,
- k.k->type == KEY_TYPE_reflink_v
- ? BTREE_ID_reflink
- : BTREE_ID_extents,
- k, pos, pick, opts, sectors, rbio);
- if (!promote)
- return NULL;
-
- *bounce = true;
- *read_full = promote_full;
- return promote;
-}
-
-/* Read */
-
-#define READ_RETRY_AVOID 1
-#define READ_RETRY 2
-#define READ_ERR 3
-
-enum rbio_context {
- RBIO_CONTEXT_NULL,
- RBIO_CONTEXT_HIGHPRI,
- RBIO_CONTEXT_UNBOUND,
-};
-
-static inline struct bch_read_bio *
-bch2_rbio_parent(struct bch_read_bio *rbio)
-{
- return rbio->split ? rbio->parent : rbio;
-}
-
-__always_inline
-static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
- enum rbio_context context,
- struct workqueue_struct *wq)
-{
- if (context <= rbio->context) {
- fn(&rbio->work);
- } else {
- rbio->work.func = fn;
- rbio->context = context;
- queue_work(wq, &rbio->work);
- }
-}
-
-static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
-{
- BUG_ON(rbio->bounce && !rbio->split);
-
- if (rbio->promote)
- promote_free(rbio->c, rbio->promote);
- rbio->promote = NULL;
-
- if (rbio->bounce)
- bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
-
- if (rbio->split) {
- struct bch_read_bio *parent = rbio->parent;
-
- if (rbio->kmalloc)
- kfree(rbio);
- else
- bio_put(&rbio->bio);
-
- rbio = parent;
- }
-
- return rbio;
-}
-
-/*
- * Only called on a top level bch_read_bio to complete an entire read request,
- * not a split:
- */
-static void bch2_rbio_done(struct bch_read_bio *rbio)
-{
- if (rbio->start_time)
- bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
- rbio->start_time);
- bio_endio(&rbio->bio);
-}
-
-static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
- struct bvec_iter bvec_iter,
- struct bch_io_failures *failed,
- unsigned flags)
-{
- struct btree_trans trans;
- struct btree_iter iter;
- struct bkey_buf sk;
- struct bkey_s_c k;
- int ret;
-
- flags &= ~BCH_READ_LAST_FRAGMENT;
- flags |= BCH_READ_MUST_CLONE;
-
- bch2_bkey_buf_init(&sk);
- bch2_trans_init(&trans, c, 0, 0);
-
- bch2_trans_iter_init(&trans, &iter, rbio->data_btree,
- rbio->read_pos, BTREE_ITER_SLOTS);
-retry:
- rbio->bio.bi_status = 0;
-
- k = bch2_btree_iter_peek_slot(&iter);
- if (bkey_err(k))
- goto err;
-
- bch2_bkey_buf_reassemble(&sk, c, k);
- k = bkey_i_to_s_c(sk.k);
- bch2_trans_unlock(&trans);
-
- if (!bch2_bkey_matches_ptr(c, k,
- rbio->pick.ptr,
- rbio->data_pos.offset -
- rbio->pick.crc.offset)) {
- /* extent we wanted to read no longer exists: */
- rbio->hole = true;
- goto out;
- }
-
- ret = __bch2_read_extent(&trans, rbio, bvec_iter,
- rbio->read_pos,
- rbio->data_btree,
- k, 0, failed, flags);
- if (ret == READ_RETRY)
- goto retry;
- if (ret)
- goto err;
-out:
- bch2_rbio_done(rbio);
- bch2_trans_iter_exit(&trans, &iter);
- bch2_trans_exit(&trans);
- bch2_bkey_buf_exit(&sk, c);
- return;
-err:
- rbio->bio.bi_status = BLK_STS_IOERR;
- goto out;
-}
-
-static void bch2_rbio_retry(struct work_struct *work)
-{
- struct bch_read_bio *rbio =
- container_of(work, struct bch_read_bio, work);
- struct bch_fs *c = rbio->c;
- struct bvec_iter iter = rbio->bvec_iter;
- unsigned flags = rbio->flags;
- subvol_inum inum = {
- .subvol = rbio->subvol,
- .inum = rbio->read_pos.inode,
- };
- struct bch_io_failures failed = { .nr = 0 };
-
- trace_and_count(c, read_retry, &rbio->bio);
-
- if (rbio->retry == READ_RETRY_AVOID)
- bch2_mark_io_failure(&failed, &rbio->pick);
-
- rbio->bio.bi_status = 0;
-
- rbio = bch2_rbio_free(rbio);
-
- flags |= BCH_READ_IN_RETRY;
- flags &= ~BCH_READ_MAY_PROMOTE;
-
- if (flags & BCH_READ_NODECODE) {
- bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
- } else {
- flags &= ~BCH_READ_LAST_FRAGMENT;
- flags |= BCH_READ_MUST_CLONE;
-
- __bch2_read(c, rbio, iter, inum, &failed, flags);
- }
-}
-
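-/*
- * Handle a failed read: either complete the request with an error (READ_ERR)
- * or punt to the retry path, marking the failed device to be avoided if
- * retry == READ_RETRY_AVOID.
- */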
-static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
- blk_status_t error)
-{
- rbio->retry = retry;
-
- if (rbio->flags & BCH_READ_IN_RETRY)
- return;
-
- if (retry == READ_ERR) {
- rbio = bch2_rbio_free(rbio);
-
- rbio->bio.bi_status = error;
- bch2_rbio_done(rbio);
- } else {
- bch2_rbio_punt(rbio, bch2_rbio_retry,
- RBIO_CONTEXT_UNBOUND, system_unbound_wq);
- }
-}
-
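-/*
- * CRC narrowing: if we read the entire extent in order to verify a checksum
- * that covers more than the live portion of the key, take the opportunity to
- * rewrite the key with a checksum covering only the live portion, so future
- * reads needn't read and checksum the whole extent.
- */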
-static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
- struct bch_read_bio *rbio)
-{
- struct bch_fs *c = rbio->c;
- u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
- struct bch_extent_crc_unpacked new_crc;
- struct btree_iter iter;
- struct bkey_i *new;
- struct bkey_s_c k;
- int ret = 0;
-
- if (crc_is_compressed(rbio->pick.crc))
- return 0;
-
- k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- if ((ret = bkey_err(k)))
- goto out;
-
- if (bversion_cmp(k.k->version, rbio->version) ||
- !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
- goto out;
-
- /* Extent was merged? */
- if (bkey_start_offset(k.k) < data_offset ||
- k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
- goto out;
-
- if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
- rbio->pick.crc, NULL, &new_crc,
- bkey_start_offset(k.k) - data_offset, k.k->size,
- rbio->pick.crc.csum_type)) {
- bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
- ret = 0;
- goto out;
- }
-
- /*
- * going to be temporarily appending another checksum entry:
- */
- new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
- sizeof(struct bch_extent_crc128));
- if ((ret = PTR_ERR_OR_ZERO(new)))
- goto out;
-
- bkey_reassemble(new, k);
-
- if (!bch2_bkey_narrow_crcs(new, new_crc))
- goto out;
-
- ret = bch2_trans_update(trans, &iter, new,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
-out:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
-{
- bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL,
- __bch2_rbio_narrow_crcs(&trans, rbio));
-}
-
-/* Inner part that may run in process context */
-static void __bch2_read_endio(struct work_struct *work)
-{
- struct bch_read_bio *rbio =
- container_of(work, struct bch_read_bio, work);
- struct bch_fs *c = rbio->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
- struct bio *src = &rbio->bio;
- struct bio *dst = &bch2_rbio_parent(rbio)->bio;
- struct bvec_iter dst_iter = rbio->bvec_iter;
- struct bch_extent_crc_unpacked crc = rbio->pick.crc;
- struct nonce nonce = extent_nonce(rbio->version, crc);
- unsigned nofs_flags;
- struct bch_csum csum;
- int ret;
-
- nofs_flags = memalloc_nofs_save();
-
- /* Reset iterator for checksumming and copying bounced data: */
- if (rbio->bounce) {
- src->bi_iter.bi_size = crc.compressed_size << 9;
- src->bi_iter.bi_idx = 0;
- src->bi_iter.bi_bvec_done = 0;
- } else {
- src->bi_iter = rbio->bvec_iter;
- }
-
- csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
- if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io)
- goto csum_err;
-
- /*
- * XXX
- * We need to rework the narrow_crcs path to deliver the read completion
- * first, and then punt to a different workqueue, otherwise we're
- * holding up reads while doing btree updates which is bad for memory
- * reclaim.
- */
- if (unlikely(rbio->narrow_crcs))
- bch2_rbio_narrow_crcs(rbio);
-
- if (rbio->flags & BCH_READ_NODECODE)
- goto nodecode;
-
- /* Adjust crc to point to subset of data we want: */
- crc.offset += rbio->offset_into_extent;
- crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
-
- if (crc_is_compressed(crc)) {
- ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
- if (ret)
- goto decrypt_err;
-
- if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
- !c->opts.no_data_io)
- goto decompression_err;
- } else {
- /* don't need to decrypt the entire bio: */
- nonce = nonce_add(nonce, crc.offset << 9);
- bio_advance(src, crc.offset << 9);
-
- BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
- src->bi_iter.bi_size = dst_iter.bi_size;
-
- ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
- if (ret)
- goto decrypt_err;
-
- if (rbio->bounce) {
- struct bvec_iter src_iter = src->bi_iter;
- bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
- }
- }
-
- if (rbio->promote) {
- /*
-		 * Re-encrypt data we decrypted, so it's consistent with
- * rbio->crc:
- */
- ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
- if (ret)
- goto decrypt_err;
-
- promote_start(rbio->promote, rbio);
- rbio->promote = NULL;
- }
-nodecode:
- if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
- rbio = bch2_rbio_free(rbio);
- bch2_rbio_done(rbio);
- }
-out:
- memalloc_nofs_restore(nofs_flags);
- return;
-csum_err:
- /*
- * Checksum error: if the bio wasn't bounced, we may have been
- * reading into buffers owned by userspace (that userspace can
- * scribble over) - retry the read, bouncing it this time:
- */
- if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
- rbio->flags |= BCH_READ_MUST_BOUNCE;
- bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
- goto out;
- }
-
- bch_err_inum_offset_ratelimited(ca,
- rbio->read_pos.inode,
- rbio->read_pos.offset << 9,
- "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)",
- rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
- csum.hi, csum.lo, bch2_csum_types[crc.csum_type]);
- bch2_io_error(ca);
- bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
- goto out;
-decompression_err:
- bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
- rbio->read_pos.offset << 9,
- "decompression error");
- bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
- goto out;
-decrypt_err:
- bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
- rbio->read_pos.offset << 9,
- "decrypt error");
- bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
- goto out;
-}
-
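-/*
- * Completion for data read bios: check for IO errors and stale pointers,
- * then pick the context the remaining work (checksum verification,
- * decompression, decryption) must run in and punt to it.
- */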
-static void bch2_read_endio(struct bio *bio)
-{
- struct bch_read_bio *rbio =
- container_of(bio, struct bch_read_bio, bio);
- struct bch_fs *c = rbio->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
- struct workqueue_struct *wq = NULL;
- enum rbio_context context = RBIO_CONTEXT_NULL;
-
- if (rbio->have_ioref) {
- bch2_latency_acct(ca, rbio->submit_time, READ);
- percpu_ref_put(&ca->io_ref);
- }
-
- if (!rbio->split)
- rbio->bio.bi_end_io = rbio->end_io;
-
- if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
- rbio->read_pos.inode,
- rbio->read_pos.offset,
- "data read error: %s",
- bch2_blk_status_to_str(bio->bi_status))) {
- bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
- return;
- }
-
- if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
- ptr_stale(ca, &rbio->pick.ptr)) {
- trace_and_count(c, read_reuse_race, &rbio->bio);
-
- if (rbio->flags & BCH_READ_RETRY_IF_STALE)
- bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
- else
- bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
- return;
- }
-
- if (rbio->narrow_crcs ||
- rbio->promote ||
- crc_is_compressed(rbio->pick.crc) ||
- bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
- context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
- else if (rbio->pick.crc.csum_type)
- context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;
-
- bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
-}
-
-int __bch2_read_indirect_extent(struct btree_trans *trans,
- unsigned *offset_into_extent,
- struct bkey_buf *orig_k)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- u64 reflink_offset;
- int ret;
-
- reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
- *offset_into_extent;
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink,
- POS(0, reflink_offset), 0);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (k.k->type != KEY_TYPE_reflink_v &&
- k.k->type != KEY_TYPE_indirect_inline_data) {
- bch_err_inum_offset_ratelimited(trans->c,
- orig_k->k->k.p.inode,
- orig_k->k->k.p.offset << 9,
- "%llu len %u points to nonexistent indirect extent %llu",
- orig_k->k->k.p.offset,
- orig_k->k->k.size,
- reflink_offset);
- bch2_inconsistent_error(trans->c);
- ret = -EIO;
- goto err;
- }
-
- *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
- bch2_bkey_buf_reassemble(orig_k, trans->c, k);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
- struct bkey_s_c k,
- struct bch_extent_ptr ptr)
-{
- struct bch_fs *c = trans->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev);
- struct btree_iter iter;
- struct printbuf buf = PRINTBUF;
- int ret;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
- PTR_BUCKET_POS(c, &ptr),
- BTREE_ITER_CACHED);
-
- prt_printf(&buf, "Attempting to read from stale dirty pointer:");
- printbuf_indent_add(&buf, 2);
- prt_newline(&buf);
-
- bch2_bkey_val_to_text(&buf, c, k);
- prt_newline(&buf);
-
- prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset));
-
- ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
- if (!ret) {
- prt_newline(&buf);
- bch2_bkey_val_to_text(&buf, c, k);
- }
-
- bch2_fs_inconsistent(c, "%s", buf.buf);
-
- bch2_trans_iter_exit(trans, &iter);
- printbuf_exit(&buf);
-}
-
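-/*
- * Read a single (portion of an) extent: pick a replica to read from, decide
- * whether the bio must be bounced or cloned, then set up and submit the read
- * bio. In the BCH_READ_IN_RETRY case we run synchronously and return a
- * READ_RETRY/READ_ERR status instead.
- */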
-int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
- struct bvec_iter iter, struct bpos read_pos,
- enum btree_id data_btree, struct bkey_s_c k,
- unsigned offset_into_extent,
- struct bch_io_failures *failed, unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct extent_ptr_decoded pick;
- struct bch_read_bio *rbio = NULL;
- struct bch_dev *ca = NULL;
- struct promote_op *promote = NULL;
- bool bounce = false, read_full = false, narrow_crcs = false;
- struct bpos data_pos = bkey_start_pos(k.k);
- int pick_ret;
-
- if (bkey_extent_is_inline_data(k.k)) {
- unsigned bytes = min_t(unsigned, iter.bi_size,
- bkey_inline_data_bytes(k.k));
-
- swap(iter.bi_size, bytes);
- memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
- swap(iter.bi_size, bytes);
- bio_advance_iter(&orig->bio, &iter, bytes);
- zero_fill_bio_iter(&orig->bio, iter);
- goto out_read_done;
- }
-retry_pick:
- pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
-
- /* hole or reservation - just zero fill: */
- if (!pick_ret)
- goto hole;
-
- if (pick_ret < 0) {
- bch_err_inum_offset_ratelimited(c,
- read_pos.inode, read_pos.offset << 9,
- "no device to read from");
- goto err;
- }
-
- ca = bch_dev_bkey_exists(c, pick.ptr.dev);
-
- /*
- * Stale dirty pointers are treated as IO errors, but @failed isn't
- * allocated unless we're in the retry path - so if we're not in the
- * retry path, don't check here, it'll be caught in bch2_read_endio()
- * and we'll end up in the retry path:
- */
- if ((flags & BCH_READ_IN_RETRY) &&
- !pick.ptr.cached &&
- unlikely(ptr_stale(ca, &pick.ptr))) {
- read_from_stale_dirty_pointer(trans, k, pick.ptr);
- bch2_mark_io_failure(failed, &pick);
- goto retry_pick;
- }
-
- /*
- * Unlock the iterator while the btree node's lock is still in
- * cache, before doing the IO:
- */
- bch2_trans_unlock(trans);
-
- if (flags & BCH_READ_NODECODE) {
- /*
- * can happen if we retry, and the extent we were going to read
- * has been merged in the meantime:
- */
- if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
- goto hole;
-
- iter.bi_size = pick.crc.compressed_size << 9;
- goto get_bio;
- }
-
- if (!(flags & BCH_READ_LAST_FRAGMENT) ||
- bio_flagged(&orig->bio, BIO_CHAIN))
- flags |= BCH_READ_MUST_CLONE;
-
- narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
- bch2_can_narrow_extent_crcs(k, pick.crc);
-
- if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
- flags |= BCH_READ_MUST_BOUNCE;
-
- EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
-
- if (crc_is_compressed(pick.crc) ||
- (pick.crc.csum_type != BCH_CSUM_none &&
- (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
- (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
- (flags & BCH_READ_USER_MAPPED)) ||
- (flags & BCH_READ_MUST_BOUNCE)))) {
- read_full = true;
- bounce = true;
- }
-
- if (orig->opts.promote_target)
- promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags,
- &rbio, &bounce, &read_full);
-
- if (!read_full) {
- EBUG_ON(crc_is_compressed(pick.crc));
- EBUG_ON(pick.crc.csum_type &&
- (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
- bvec_iter_sectors(iter) != pick.crc.live_size ||
- pick.crc.offset ||
- offset_into_extent));
-
- data_pos.offset += offset_into_extent;
- pick.ptr.offset += pick.crc.offset +
- offset_into_extent;
- offset_into_extent = 0;
- pick.crc.compressed_size = bvec_iter_sectors(iter);
- pick.crc.uncompressed_size = bvec_iter_sectors(iter);
- pick.crc.offset = 0;
- pick.crc.live_size = bvec_iter_sectors(iter);
- }
-get_bio:
- if (rbio) {
- /*
- * promote already allocated bounce rbio:
- * promote needs to allocate a bio big enough for uncompressing
- * data in the write path, but we're not going to use it all
- * here:
- */
- EBUG_ON(rbio->bio.bi_iter.bi_size <
- pick.crc.compressed_size << 9);
- rbio->bio.bi_iter.bi_size =
- pick.crc.compressed_size << 9;
- } else if (bounce) {
- unsigned sectors = pick.crc.compressed_size;
-
- rbio = rbio_init(bio_alloc_bioset(NULL,
- DIV_ROUND_UP(sectors, PAGE_SECTORS),
- 0,
- GFP_NOFS,
- &c->bio_read_split),
- orig->opts);
-
- bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
- rbio->bounce = true;
- rbio->split = true;
- } else if (flags & BCH_READ_MUST_CLONE) {
- /*
-		 * Have to clone if there were any splits, due to error
-		 * reporting issues: if a split errored and retrying didn't
-		 * work, then when the split reports the error to its parent
-		 * (us) we don't know whether the error came from our bio (and
-		 * we should retry) or from the whole bio (in which case we
-		 * don't want to retry and lose the error).
- */
- rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
- &c->bio_read_split),
- orig->opts);
- rbio->bio.bi_iter = iter;
- rbio->split = true;
- } else {
- rbio = orig;
- rbio->bio.bi_iter = iter;
- EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
- }
-
- EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
-
- rbio->c = c;
- rbio->submit_time = local_clock();
- if (rbio->split)
- rbio->parent = orig;
- else
- rbio->end_io = orig->bio.bi_end_io;
- rbio->bvec_iter = iter;
-	rbio->offset_into_extent = offset_into_extent;
- rbio->flags = flags;
- rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
- rbio->narrow_crcs = narrow_crcs;
- rbio->hole = 0;
- rbio->retry = 0;
- rbio->context = 0;
- /* XXX: only initialize this if needed */
- rbio->devs_have = bch2_bkey_devs(k);
- rbio->pick = pick;
- rbio->subvol = orig->subvol;
- rbio->read_pos = read_pos;
- rbio->data_btree = data_btree;
- rbio->data_pos = data_pos;
- rbio->version = k.k->version;
- rbio->promote = promote;
- INIT_WORK(&rbio->work, NULL);
-
- rbio->bio.bi_opf = orig->bio.bi_opf;
- rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
- rbio->bio.bi_end_io = bch2_read_endio;
-
- if (rbio->bounce)
- trace_and_count(c, read_bounce, &rbio->bio);
-
- this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
- bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
-
- /*
- * If it's being moved internally, we don't want to flag it as a cache
- * hit:
- */
- if (pick.ptr.cached && !(flags & BCH_READ_NODECODE))
- bch2_bucket_io_time_reset(trans, pick.ptr.dev,
- PTR_BUCKET_NR(ca, &pick.ptr), READ);
-
- if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
- bio_inc_remaining(&orig->bio);
- trace_and_count(c, read_split, &orig->bio);
- }
-
- if (!rbio->pick.idx) {
- if (!rbio->have_ioref) {
- bch_err_inum_offset_ratelimited(c,
- read_pos.inode,
- read_pos.offset << 9,
- "no device to read from");
- bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
- goto out;
- }
-
- this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
- bio_sectors(&rbio->bio));
- bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
-
- if (unlikely(c->opts.no_data_io)) {
- if (likely(!(flags & BCH_READ_IN_RETRY)))
- bio_endio(&rbio->bio);
- } else {
- if (likely(!(flags & BCH_READ_IN_RETRY)))
- submit_bio(&rbio->bio);
- else
- submit_bio_wait(&rbio->bio);
- }
-
- /*
- * We just submitted IO which may block, we expect relock fail
- * events and shouldn't count them:
- */
- trans->notrace_relock_fail = true;
- } else {
- /* Attempting reconstruct read: */
- if (bch2_ec_read_extent(c, rbio)) {
- bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
- goto out;
- }
-
- if (likely(!(flags & BCH_READ_IN_RETRY)))
- bio_endio(&rbio->bio);
- }
-out:
- if (likely(!(flags & BCH_READ_IN_RETRY))) {
- return 0;
- } else {
- int ret;
-
- rbio->context = RBIO_CONTEXT_UNBOUND;
- bch2_read_endio(&rbio->bio);
-
- ret = rbio->retry;
- rbio = bch2_rbio_free(rbio);
-
- if (ret == READ_RETRY_AVOID) {
- bch2_mark_io_failure(failed, &pick);
- ret = READ_RETRY;
- }
-
- if (!ret)
- goto out_read_done;
-
- return ret;
- }
-
-err:
- if (flags & BCH_READ_IN_RETRY)
- return READ_ERR;
-
- orig->bio.bi_status = BLK_STS_IOERR;
- goto out_read_done;
-
-hole:
- /*
- * won't normally happen in the BCH_READ_NODECODE
- * (bch2_move_extent()) path, but if we retry and the extent we wanted
- * to read no longer exists we have to signal that:
- */
- if (flags & BCH_READ_NODECODE)
- orig->hole = true;
-
- zero_fill_bio_iter(&orig->bio, iter);
-out_read_done:
- if (flags & BCH_READ_LAST_FRAGMENT)
- bch2_rbio_done(orig);
- return 0;
-}
-
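-/*
- * Main entry point for reads: iterate over the extents btree, issuing a read
- * for each extent covering the request and restarting on transaction
- * restarts and retryable errors.
- */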
-void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
- struct bvec_iter bvec_iter, subvol_inum inum,
- struct bch_io_failures *failed, unsigned flags)
-{
- struct btree_trans trans;
- struct btree_iter iter;
- struct bkey_buf sk;
- struct bkey_s_c k;
- u32 snapshot;
- int ret;
-
- BUG_ON(flags & BCH_READ_NODECODE);
-
- bch2_bkey_buf_init(&sk);
- bch2_trans_init(&trans, c, 0, 0);
-retry:
- bch2_trans_begin(&trans);
- iter = (struct btree_iter) { NULL };
-
- ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
- SPOS(inum.inum, bvec_iter.bi_sector, snapshot),
- BTREE_ITER_SLOTS);
- while (1) {
- unsigned bytes, sectors, offset_into_extent;
- enum btree_id data_btree = BTREE_ID_extents;
-
- /*
- * read_extent -> io_time_reset may cause a transaction restart
- * without returning an error, we need to check for that here:
- */
- ret = bch2_trans_relock(&trans);
- if (ret)
- break;
-
- bch2_btree_iter_set_pos(&iter,
- POS(inum.inum, bvec_iter.bi_sector));
-
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
- if (ret)
- break;
-
- offset_into_extent = iter.pos.offset -
- bkey_start_offset(k.k);
- sectors = k.k->size - offset_into_extent;
-
- bch2_bkey_buf_reassemble(&sk, c, k);
-
- ret = bch2_read_indirect_extent(&trans, &data_btree,
- &offset_into_extent, &sk);
- if (ret)
- break;
-
- k = bkey_i_to_s_c(sk.k);
-
- /*
- * With indirect extents, the amount of data to read is the min
- * of the original extent and the indirect extent:
- */
- sectors = min(sectors, k.k->size - offset_into_extent);
-
- bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
- swap(bvec_iter.bi_size, bytes);
-
- if (bvec_iter.bi_size == bytes)
- flags |= BCH_READ_LAST_FRAGMENT;
-
- ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter.pos,
- data_btree, k,
- offset_into_extent, failed, flags);
- if (ret)
- break;
-
- if (flags & BCH_READ_LAST_FRAGMENT)
- break;
-
- swap(bvec_iter.bi_size, bytes);
- bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
-
- ret = btree_trans_too_many_iters(&trans);
- if (ret)
- break;
- }
-err:
- bch2_trans_iter_exit(&trans, &iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
- ret == READ_RETRY ||
- ret == READ_RETRY_AVOID)
- goto retry;
-
- bch2_trans_exit(&trans);
- bch2_bkey_buf_exit(&sk, c);
-
- if (ret) {
- bch_err_inum_offset_ratelimited(c, inum.inum,
- bvec_iter.bi_sector << 9,
- "read error %i from btree lookup", ret);
- rbio->bio.bi_status = BLK_STS_IOERR;
- bch2_rbio_done(rbio);
- }
-}
-
-void bch2_fs_io_exit(struct bch_fs *c)
-{
- if (c->promote_table.tbl)
- rhashtable_destroy(&c->promote_table);
- mempool_exit(&c->bio_bounce_pages);
- bioset_exit(&c->bio_write);
- bioset_exit(&c->bio_read_split);
- bioset_exit(&c->bio_read);
-}
-
-int bch2_fs_io_init(struct bch_fs *c)
-{
- if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
- BIOSET_NEED_BVECS))
- return -BCH_ERR_ENOMEM_bio_read_init;
-
- if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
- BIOSET_NEED_BVECS))
- return -BCH_ERR_ENOMEM_bio_read_split_init;
-
- if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
- BIOSET_NEED_BVECS))
- return -BCH_ERR_ENOMEM_bio_write_init;
-
- if (mempool_init_page_pool(&c->bio_bounce_pages,
- max_t(unsigned,
- c->opts.btree_node_size,
- c->opts.encoded_extent_max) /
- PAGE_SIZE, 0))
- return -BCH_ERR_ENOMEM_bio_bounce_pages_init;
-
- if (rhashtable_init(&c->promote_table, &bch_promote_params))
- return -BCH_ERR_ENOMEM_promote_table_init;
-
- return 0;
-}
+++ /dev/null
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_IO_H
-#define _BCACHEFS_IO_H
-
-#include "checksum.h"
-#include "bkey_buf.h"
-#include "io_types.h"
-
-#define to_wbio(_bio) \
- container_of((_bio), struct bch_write_bio, bio)
-
-#define to_rbio(_bio) \
- container_of((_bio), struct bch_read_bio, bio)
-
-void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
-void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
-
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-void bch2_latency_acct(struct bch_dev *, u64, int);
-#else
-static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
-#endif
-
-void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
- enum bch_data_type, const struct bkey_i *, bool);
-
-#define BLK_STS_REMOVED ((__force blk_status_t)128)
-
-const char *bch2_blk_status_to_str(blk_status_t);
-
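-/*
- * Write flags are defined with an x-macro so they can also be printed by
- * name (see bch2_write_flags[] and bch2_write_op_to_text() in io.c):
- */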
-#define BCH_WRITE_FLAGS() \
- x(ALLOC_NOWAIT) \
- x(CACHED) \
- x(DATA_ENCODED) \
- x(PAGES_STABLE) \
- x(PAGES_OWNED) \
- x(ONLY_SPECIFIED_DEVS) \
- x(WROTE_DATA_INLINE) \
- x(FROM_INTERNAL) \
- x(CHECK_ENOSPC) \
- x(SYNC) \
- x(MOVE) \
- x(IN_WORKER) \
- x(DONE) \
- x(IO_ERROR) \
- x(CONVERT_UNWRITTEN)
-
-enum __bch_write_flags {
-#define x(f) __BCH_WRITE_##f,
- BCH_WRITE_FLAGS()
-#undef x
-};
-
-enum bch_write_flags {
-#define x(f) BCH_WRITE_##f = 1U << __BCH_WRITE_##f,
- BCH_WRITE_FLAGS()
-#undef x
-};
-
-static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
-{
- return op->watermark == BCH_WATERMARK_copygc
- ? op->c->copygc_wq
- : op->c->btree_update_wq;
-}
-
-int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *,
- struct bkey_i *, bool *, s64 *, s64 *);
-int bch2_extent_update(struct btree_trans *, subvol_inum,
- struct btree_iter *, struct bkey_i *,
- struct disk_reservation *, u64, s64 *, bool);
-int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *,
- unsigned, struct bch_io_opts, s64 *,
- struct write_point_specifier);
-
-int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
- subvol_inum, u64, s64 *);
-int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *);
-
-static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
- struct bch_io_opts opts)
-{
- op->c = c;
- op->end_io = NULL;
- op->flags = 0;
- op->written = 0;
- op->error = 0;
- op->csum_type = bch2_data_checksum_type(c, opts);
- op->compression_opt = opts.compression;
- op->nr_replicas = 0;
- op->nr_replicas_required = c->opts.data_replicas_required;
- op->watermark = BCH_WATERMARK_normal;
- op->incompressible = 0;
- op->open_buckets.nr = 0;
- op->devs_have.nr = 0;
- op->target = 0;
- op->opts = opts;
- op->subvol = 0;
- op->pos = POS_MAX;
- op->version = ZERO_VERSION;
- op->write_point = (struct write_point_specifier) { 0 };
- op->res = (struct disk_reservation) { 0 };
- op->new_i_size = U64_MAX;
- op->i_sectors_delta = 0;
- op->devs_need_flush = NULL;
-}
-
-void bch2_write(struct closure *);
-
-void bch2_write_point_do_index_updates(struct work_struct *);
-
-static inline struct bch_write_bio *wbio_init(struct bio *bio)
-{
- struct bch_write_bio *wbio = to_wbio(bio);
-
- memset(&wbio->wbio, 0, sizeof(wbio->wbio));
- return wbio;
-}
-
-void bch2_write_op_to_text(struct printbuf *, struct bch_write_op *);
-
-struct bch_devs_mask;
-struct cache_promote_op;
-struct extent_ptr_decoded;
-
-int __bch2_read_indirect_extent(struct btree_trans *, unsigned *,
- struct bkey_buf *);
-
-static inline int bch2_read_indirect_extent(struct btree_trans *trans,
- enum btree_id *data_btree,
- unsigned *offset_into_extent,
- struct bkey_buf *k)
-{
- if (k->k->k.type != KEY_TYPE_reflink_p)
- return 0;
-
- *data_btree = BTREE_ID_reflink;
- return __bch2_read_indirect_extent(trans, offset_into_extent, k);
-}
-
-enum bch_read_flags {
- BCH_READ_RETRY_IF_STALE = 1 << 0,
- BCH_READ_MAY_PROMOTE = 1 << 1,
- BCH_READ_USER_MAPPED = 1 << 2,
- BCH_READ_NODECODE = 1 << 3,
- BCH_READ_LAST_FRAGMENT = 1 << 4,
-
- /* internal: */
- BCH_READ_MUST_BOUNCE = 1 << 5,
- BCH_READ_MUST_CLONE = 1 << 6,
- BCH_READ_IN_RETRY = 1 << 7,
-};
-
-int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *,
- struct bvec_iter, struct bpos, enum btree_id,
- struct bkey_s_c, unsigned,
- struct bch_io_failures *, unsigned);
-
-static inline void bch2_read_extent(struct btree_trans *trans,
- struct bch_read_bio *rbio, struct bpos read_pos,
- enum btree_id data_btree, struct bkey_s_c k,
- unsigned offset_into_extent, unsigned flags)
-{
- __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos,
- data_btree, k, offset_into_extent, NULL, flags);
-}
-
-void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
- subvol_inum, struct bch_io_failures *, unsigned flags);
-
-static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
- subvol_inum inum)
-{
- struct bch_io_failures failed = { .nr = 0 };
-
- BUG_ON(rbio->_state);
-
- rbio->c = c;
- rbio->start_time = local_clock();
- rbio->subvol = inum.subvol;
-
- __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed,
- BCH_READ_RETRY_IF_STALE|
- BCH_READ_MAY_PROMOTE|
- BCH_READ_USER_MAPPED);
-}
-
-static inline struct bch_read_bio *rbio_init(struct bio *bio,
- struct bch_io_opts opts)
-{
- struct bch_read_bio *rbio = to_rbio(bio);
-
- rbio->_state = 0;
- rbio->promote = NULL;
- rbio->opts = opts;
- return rbio;
-}
-
-void bch2_fs_io_exit(struct bch_fs *);
-int bch2_fs_io_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_IO_H */
+++ /dev/null
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_IO_TYPES_H
-#define _BCACHEFS_IO_TYPES_H
-
-#include "alloc_types.h"
-#include "btree_types.h"
-#include "buckets_types.h"
-#include "extents_types.h"
-#include "keylist_types.h"
-#include "opts.h"
-#include "super_types.h"
-
-#include <linux/llist.h>
-#include <linux/workqueue.h>
-
-struct bch_read_bio {
- struct bch_fs *c;
- u64 start_time;
- u64 submit_time;
-
- /*
- * Reads will often have to be split, and if the extent being read from
- * was checksummed or compressed we'll also have to allocate bounce
- * buffers and copy the data back into the original bio.
- *
- * If we didn't have to split, we have to save and restore the original
- * bi_end_io - @split below indicates which:
- */
- union {
- struct bch_read_bio *parent;
- bio_end_io_t *end_io;
- };
-
- /*
- * Saved copy of bio->bi_iter, from submission time - allows us to
- * resubmit on IO error, and also to copy data back to the original bio
- * when we're bouncing:
- */
- struct bvec_iter bvec_iter;
-
- unsigned offset_into_extent;
-
- u16 flags;
- union {
- struct {
- u16 bounce:1,
- split:1,
- kmalloc:1,
- have_ioref:1,
- narrow_crcs:1,
- hole:1,
- retry:2,
- context:2;
- };
- u16 _state;
- };
-
- struct bch_devs_list devs_have;
-
- struct extent_ptr_decoded pick;
-
- /*
- * pos we read from - different from data_pos for indirect extents:
- */
- u32 subvol;
- struct bpos read_pos;
-
- /*
- * start pos of data we read (may not be pos of data we want) - for
- * promote, narrow extents paths:
- */
- enum btree_id data_btree;
- struct bpos data_pos;
- struct bversion version;
-
- struct promote_op *promote;
-
- struct bch_io_opts opts;
-
- struct work_struct work;
-
- struct bio bio;
-};
-
-struct bch_write_bio {
- struct_group(wbio,
- struct bch_fs *c;
- struct bch_write_bio *parent;
-
- u64 submit_time;
- u64 inode_offset;
-
- struct bch_devs_list failed;
- u8 dev;
-
- unsigned split:1,
- bounce:1,
- put_bio:1,
- have_ioref:1,
- nocow:1,
- used_mempool:1,
- first_btree_write:1;
- );
-
- struct bio bio;
-};
-
-struct bch_write_op {
- struct closure cl;
- struct bch_fs *c;
- void (*end_io)(struct bch_write_op *);
- u64 start_time;
-
- unsigned written; /* sectors */
- u16 flags;
- s16 error; /* dio write path expects it to hold -ERESTARTSYS... */
-
- unsigned compression_opt:8;
- unsigned csum_type:4;
- unsigned nr_replicas:4;
- unsigned nr_replicas_required:4;
- unsigned watermark:3;
- unsigned incompressible:1;
- unsigned stripe_waited:1;
-
- struct bch_devs_list devs_have;
- u16 target;
- u16 nonce;
- struct bch_io_opts opts;
-
- u32 subvol;
- struct bpos pos;
- struct bversion version;
-
- /* For BCH_WRITE_DATA_ENCODED: */
- struct bch_extent_crc_unpacked crc;
-
- struct write_point_specifier write_point;
-
- struct write_point *wp;
- struct list_head wp_list;
-
- struct disk_reservation res;
-
- struct open_buckets open_buckets;
-
- u64 new_i_size;
- s64 i_sectors_delta;
-
- struct bch_devs_mask failed;
-
- struct keylist insert_keys;
- u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2];
-
- /*
- * Bitmask of devices that have had nocow writes issued to them since
- * last flush:
- */
- struct bch_devs_mask *devs_need_flush;
-
- /* Must be last: */
- struct bch_write_bio wbio;
-};
-
-#endif /* _BCACHEFS_IO_TYPES_H */
static void journal_pin_list_init(struct journal_entry_pin_list *p, int count)
{
unsigned i;
+
for (i = 0; i < ARRAY_SIZE(p->list); i++)
INIT_LIST_HEAD(&p->list[i]);
INIT_LIST_HEAD(&p->flushed);
return stuck;
}
-/* journal entry close/open: */
-
-void __bch2_journal_buf_put(struct journal *j)
+/*
+ * Final processing when the last reference of a journal buffer has been
+ * dropped. Drop the pin list reference acquired at journal entry open and write
+ * the buffer, if requested.
+ */
+void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
+ lockdep_assert_held(&j->lock);
+
+ if (__bch2_journal_pin_put(j, seq))
+ bch2_journal_reclaim_fast(j);
+ if (write)
+ closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
}
/*
buf->data->last_seq = cpu_to_le64(buf->last_seq);
BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq));
- __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq));
-
cancel_delayed_work(&j->write_work);
bch2_journal_space_available(j);
- bch2_journal_buf_put(j, old.idx);
+ __bch2_journal_buf_put(j, old.idx, le64_to_cpu(buf->data->seq));
}
void bch2_journal_halt(struct journal *j)
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
- if (j->res_get_blocked_start)
- bch2_time_stats_update(j->blocked_time,
- j->res_get_blocked_start);
- j->res_get_blocked_start = 0;
-
mod_delayed_work(c->io_complete_wq,
&j->write_work,
msecs_to_jiffies(c->opts.journal_flush_delay));
__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
ret = journal_entry_open(j);
- if (ret == JOURNAL_ERR_max_in_flight)
+ if (ret == JOURNAL_ERR_max_in_flight) {
+ track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
+ &j->max_in_flight_start, true);
trace_and_count(c, journal_entry_full, c);
-unlock:
- if ((ret && ret != JOURNAL_ERR_insufficient_devices) &&
- !j->res_get_blocked_start) {
- j->res_get_blocked_start = local_clock() ?: 1;
- trace_and_count(c, journal_full, c);
}
-
+unlock:
can_discard = j->can_discard;
spin_unlock(&j->lock);
int ret;
closure_wait_event(&j->async_wait,
- (ret = __journal_res_get(j, res, flags)) !=
- -BCH_ERR_journal_res_get_blocked||
+ (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
(flags & JOURNAL_RES_GET_NONBLOCK));
return ret;
}
-/* journal_preres: */
-
-static bool journal_preres_available(struct journal *j,
- struct journal_preres *res,
- unsigned new_u64s,
- unsigned flags)
-{
- bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags, true);
-
- if (!ret && mutex_trylock(&j->reclaim_lock)) {
- bch2_journal_reclaim(j);
- mutex_unlock(&j->reclaim_lock);
- }
-
- return ret;
-}
-
-int __bch2_journal_preres_get(struct journal *j,
- struct journal_preres *res,
- unsigned new_u64s,
- unsigned flags)
-{
- int ret;
-
- closure_wait_event(&j->preres_wait,
- (ret = bch2_journal_error(j)) ||
- journal_preres_available(j, res, new_u64s, flags));
- return ret;
-}
-
/* journal_entry_res: */
void bch2_journal_entry_res_resize(struct journal *j,
/**
* bch2_journal_flush_seq_async - wait for a journal entry to be written
+ * @j: journal object
+ * @seq: seq to flush
+ * @parent: closure object to wait with
+ * Returns: 1 if @seq has already been flushed, 0 if @seq is being flushed,
+ * -EIO if @seq will never be flushed
*
- * like bch2_journal_wait_on_seq, except that it triggers a write immediately if
+ * Like bch2_journal_wait_on_seq, except that it triggers a write immediately if
* necessary
*/
int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
break;
ret = bch2_trans_run(c,
- bch2_trans_mark_metadata_bucket(&trans, ca,
+ bch2_trans_mark_metadata_bucket(trans, ca,
ob[nr_got]->bucket, BCH_DATA_journal,
ca->mi.bucket_size));
if (ret) {
bch2_open_bucket_put(c, ob[nr_got]);
- bch_err(c, "error marking new journal buckets: %s", bch2_err_str(ret));
+ bch_err_msg(c, ret, "marking new journal buckets");
break;
}
if (ret && !new_fs)
for (i = 0; i < nr_got; i++)
bch2_trans_run(c,
- bch2_trans_mark_metadata_bucket(&trans, ca,
+ bch2_trans_mark_metadata_bucket(trans, ca,
bu[i], BCH_DATA_free, 0));
err_free:
if (!new_fs)
goto unlock;
while (ja->nr < nr) {
- struct disk_reservation disk_res = { 0, 0 };
+ struct disk_reservation disk_res = { 0, 0, 0 };
/*
* note: journal buckets aren't really counted as _sectors_ used yet, so
return ret;
}
+int bch2_fs_journal_alloc(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ for_each_online_member(ca, c, i) {
+ if (ca->journal.nr)
+ continue;
+
+ int ret = bch2_dev_journal_alloc(ca);
+ if (ret) {
+ percpu_ref_put(&ca->io_ref);
+ return ret;
+ }
+ }
+
+ return 0;
+}
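The new helper walks the online members and allocates journal buckets on any device that still lacks them; the explicit percpu_ref_put() drops the per-iteration device reference that for_each_online_member() holds, which returning from inside the loop would otherwise leak. A minimal sketch of a caller, assuming a hypothetical bring-up path (the real call site is not part of this hunk):

/* Illustrative only: example_fs_bringup is not a function in this patch. */
static int example_fs_bringup(struct bch_fs *c)
{
	int ret = bch2_fs_journal_alloc(c);

	if (ret)
		bch_err_msg(c, ret, "allocating journal buckets");
	return ret;
}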
+
/* startup/shutdown: */
static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
{
struct journal_device *ja = &ca->journal;
struct bch_sb_field_journal *journal_buckets =
- bch2_sb_get_journal(sb);
+ bch2_sb_field_get(sb, journal);
struct bch_sb_field_journal_v2 *journal_buckets_v2 =
- bch2_sb_get_journal_v2(sb);
+ bch2_sb_field_get(sb, journal_v2);
unsigned i, nr_bvecs;
ja->nr = 0;
union journal_res_state s;
struct bch_dev *ca;
unsigned long now = jiffies;
+ u64 nr_writes = j->nr_flush_writes + j->nr_noflush_writes;
u64 seq;
unsigned i;
prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size);
prt_printf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j));
prt_printf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk);
- prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j));
+ prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j));
prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk);
- prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk);
- prt_printf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining);
- prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]);
- prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved);
+ prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk);
+ prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]);
+ prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved);
prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes);
- prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes);
- prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim);
+ prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes);
+ prt_printf(out, "average write size:\t");
+ prt_human_readable_u64(out, nr_writes ? div64_u64(j->entry_bytes_written, nr_writes) : 0);
+ prt_newline(out);
+ prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim);
prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim);
prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked);
- prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now)
+ prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now)
? jiffies_to_msecs(j->next_reclaim - jiffies) : 0);
- prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
- prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]);
+ prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
+ prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]);
prt_printf(out, "current entry:\t\t");
switch (s.cur_entry_offset) {
return true;
}
-void __bch2_journal_buf_put(struct journal *);
-
-static inline void bch2_journal_buf_put(struct journal *j, unsigned idx)
+/*
+ * Drop a reference on a buffer index and return the resulting reservation
+ * state; callers check whether the count for that index has hit zero.
+ */
+static inline union journal_res_state journal_state_buf_put(struct journal *j, unsigned idx)
{
union journal_res_state s;
.buf2_count = idx == 2,
.buf3_count = idx == 3,
}).v, &j->reservations.counter);
+ return s;
+}
+
+void bch2_journal_buf_put_final(struct journal *, u64, bool);
+
+static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
+{
+ union journal_res_state s;
+
+ s = journal_state_buf_put(j, idx);
+ if (!journal_state_count(s, idx))
+ bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx);
+}
+
+static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
+{
+ union journal_res_state s;
- if (!journal_state_count(s, idx) && idx == s.unwritten_idx)
- __bch2_journal_buf_put(j);
+ s = journal_state_buf_put(j, idx);
+ if (!journal_state_count(s, idx)) {
+ spin_lock(&j->lock);
+ bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx);
+ spin_unlock(&j->lock);
+ }
}
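Both put variants decrement the per-buffer count via journal_state_buf_put(); they differ only in whether j->lock is already held when the count reaches zero and bch2_journal_buf_put_final() must run. A sketch of the typical release, mirroring the reservation call site later in this patch:

/*
 * Sketch: dropping a journal reservation's buffer reference. The seq
 * argument lets the final put drop the pin taken at journal entry open
 * without re-deriving it under j->lock.
 */
static void example_release_res(struct journal *j, struct journal_res *res)
{
	bch2_journal_buf_put(j, res->idx, res->seq);
	res->ref = 0;
}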
/*
BCH_JSET_ENTRY_btree_keys,
0, 0, 0);
- bch2_journal_buf_put(j, res->idx);
+ bch2_journal_buf_put(j, res->idx, res->seq);
res->ref = 0;
}
return 0;
}
-/* journal_preres: */
-
-static inline void journal_set_watermark(struct journal *j)
-{
- union journal_preres_state s = READ_ONCE(j->prereserved);
- unsigned watermark = BCH_WATERMARK_stripe;
-
- if (fifo_free(&j->pin) < j->pin.size / 4)
- watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc);
- if (fifo_free(&j->pin) < j->pin.size / 8)
- watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
-
- if (s.reserved > s.remaining)
- watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc);
- if (!s.remaining)
- watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
-
- if (watermark == j->watermark)
- return;
-
- swap(watermark, j->watermark);
- if (watermark > j->watermark)
- journal_wake(j);
-}
-
-static inline void bch2_journal_preres_put(struct journal *j,
- struct journal_preres *res)
-{
- union journal_preres_state s = { .reserved = res->u64s };
-
- if (!res->u64s)
- return;
-
- s.v = atomic64_sub_return(s.v, &j->prereserved.counter);
- res->u64s = 0;
-
- if (unlikely(s.waiting)) {
- clear_bit(ilog2((((union journal_preres_state) { .waiting = 1 }).v)),
- (unsigned long *) &j->prereserved.v);
- closure_wake_up(&j->preres_wait);
- }
-
- if (s.reserved <= s.remaining && j->watermark)
- journal_set_watermark(j);
-}
-
-int __bch2_journal_preres_get(struct journal *,
- struct journal_preres *, unsigned, unsigned);
-
-static inline int bch2_journal_preres_get_fast(struct journal *j,
- struct journal_preres *res,
- unsigned new_u64s,
- unsigned flags,
- bool set_waiting)
-{
- int d = new_u64s - res->u64s;
- union journal_preres_state old, new;
- u64 v = atomic64_read(&j->prereserved.counter);
- enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
- int ret;
-
- do {
- old.v = new.v = v;
- ret = 0;
-
- if (watermark == BCH_WATERMARK_reclaim ||
- new.reserved + d < new.remaining) {
- new.reserved += d;
- ret = 1;
- } else if (set_waiting && !new.waiting)
- new.waiting = true;
- else
- return 0;
- } while ((v = atomic64_cmpxchg(&j->prereserved.counter,
- old.v, new.v)) != old.v);
-
- if (ret)
- res->u64s += d;
- return ret;
-}
-
-static inline int bch2_journal_preres_get(struct journal *j,
- struct journal_preres *res,
- unsigned new_u64s,
- unsigned flags)
-{
- if (new_u64s <= res->u64s)
- return 0;
-
- if (bch2_journal_preres_get_fast(j, res, new_u64s, flags, false))
- return 0;
-
- if (flags & JOURNAL_RES_GET_NONBLOCK)
- return -BCH_ERR_journal_preres_get_blocked;
-
- return __bch2_journal_preres_get(j, res, new_u64s, flags);
-}
-
/* journal_entry_res: */
void bch2_journal_entry_res_resize(struct journal *,
int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
unsigned nr);
int bch2_dev_journal_alloc(struct bch_dev *);
+int bch2_fs_journal_alloc(struct bch_fs *);
void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
#include "checksum.h"
#include "disk_groups.h"
#include "error.h"
-#include "io.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "replicas.h"
+#include "sb-clean.h"
#include "trace.h"
static struct nonce journal_nonce(const struct jset *jset)
if (!dup->csum_good)
goto replace;
- fsck_err(c, "found duplicate but non identical journal entries (seq %llu)",
+ fsck_err(c, journal_entry_replicas_data_mismatch,
+ "found duplicate but non identical journal entries (seq %llu)",
le64_to_cpu(j->seq));
i = dup;
goto found;
#define JOURNAL_ENTRY_BAD 7
static void journal_entry_err_msg(struct printbuf *out,
+ u32 version,
struct jset *jset,
struct jset_entry *entry)
{
- prt_str(out, "invalid journal entry ");
- if (entry)
- prt_printf(out, "%s ", bch2_jset_entry_types[entry->type]);
-
- if (!jset)
- prt_printf(out, "in superblock");
- else if (!entry)
- prt_printf(out, "at seq %llu", le64_to_cpu(jset->seq));
- else
- prt_printf(out, "at offset %zi/%u seq %llu",
- (u64 *) entry - jset->_data,
- le32_to_cpu(jset->u64s),
- le64_to_cpu(jset->seq));
+ prt_str(out, "invalid journal entry, version=");
+ bch2_version_to_text(out, version);
+
+ if (entry) {
+ prt_str(out, " type=");
+ prt_str(out, bch2_jset_entry_types[entry->type]);
+ }
+
+ if (!jset) {
+ prt_printf(out, " in superblock");
+ } else {
+ prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq));
+
+ if (entry)
+ prt_printf(out, " offset=%zi/%u",
+ (u64 *) entry - jset->_data,
+ le32_to_cpu(jset->u64s));
+ }
+
prt_str(out, ": ");
}
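For concreteness, the prefix this now produces looks roughly like the line below (all field values invented for illustration):

	invalid journal entry, version=1.3 type=btree_keys seq=8192 offset=16/512: <reason>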
-#define journal_entry_err(c, jset, entry, msg, ...) \
+#define journal_entry_err(c, version, jset, entry, _err, msg, ...) \
({ \
- struct printbuf buf = PRINTBUF; \
+ struct printbuf _buf = PRINTBUF; \
\
- journal_entry_err_msg(&buf, jset, entry); \
- prt_printf(&buf, msg, ##__VA_ARGS__); \
+ journal_entry_err_msg(&_buf, version, jset, entry); \
+ prt_printf(&_buf, msg, ##__VA_ARGS__); \
\
- switch (write) { \
+ switch (flags & BKEY_INVALID_WRITE) { \
case READ: \
- mustfix_fsck_err(c, "%s", buf.buf); \
+ mustfix_fsck_err(c, _err, "%s", _buf.buf); \
break; \
case WRITE: \
- bch_err(c, "corrupt metadata before write: %s\n", buf.buf);\
+ bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \
+ bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\
if (bch2_fs_inconsistent(c)) { \
ret = -BCH_ERR_fsck_errors_not_fixed; \
goto fsck_err; \
break; \
} \
\
- printbuf_exit(&buf); \
+ printbuf_exit(&_buf); \
true; \
})
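A representative invocation, mirroring the blacklist validator further down: on READ the macro routes through mustfix_fsck_err() so fsck can repair the entry, on WRITE it records the error in the superblock counters and treats it as corruption before write.

/* Illustrative call (same shape as the validators below): */
if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1,
			 c, version, jset, entry,
			 journal_entry_blacklist_bad_size,
			 "invalid journal seq blacklist entry: bad size"))
	journal_entry_null_range(entry, vstruct_next(entry));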
-#define journal_entry_err_on(cond, c, jset, entry, msg, ...) \
- ((cond) ? journal_entry_err(c, jset, entry, msg, ##__VA_ARGS__) : false)
+#define journal_entry_err_on(cond, ...) \
+ ((cond) ? journal_entry_err(__VA_ARGS__) : false)
#define FSCK_DELETED_KEY 5
struct jset_entry *entry,
unsigned level, enum btree_id btree_id,
struct bkey_i *k,
- unsigned version, int big_endian, int write)
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
{
+ int write = flags & BKEY_INVALID_WRITE;
void *next = vstruct_next(entry);
struct printbuf buf = PRINTBUF;
int ret = 0;
- if (journal_entry_err_on(!k->k.u64s, c, jset, entry, "k->u64s 0")) {
+ if (journal_entry_err_on(!k->k.u64s,
+ c, version, jset, entry,
+ journal_entry_bkey_u64s_0,
+ "k->u64s 0")) {
entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
journal_entry_null_range(vstruct_next(entry), next);
return FSCK_DELETED_KEY;
if (journal_entry_err_on((void *) bkey_next(k) >
(void *) vstruct_next(entry),
- c, jset, entry,
+ c, version, jset, entry,
+ journal_entry_bkey_past_end,
"extends past end of journal entry")) {
entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
journal_entry_null_range(vstruct_next(entry), next);
}
if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT,
- c, jset, entry,
+ c, version, jset, entry,
+ journal_entry_bkey_bad_format,
"bad format %u", k->k.format)) {
le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
memmove(k, bkey_next(k), next - (void *) bkey_next(k));
if (bch2_bkey_invalid(c, bkey_i_to_s_c(k),
__btree_node_type(level, btree_id), write, &buf)) {
printbuf_reset(&buf);
- prt_printf(&buf, "invalid journal entry %s at offset %zi/%u seq %llu:",
- bch2_jset_entry_types[entry->type],
- (u64 *) entry - jset->_data,
- le32_to_cpu(jset->u64s),
- le64_to_cpu(jset->seq));
+ journal_entry_err_msg(&buf, version, jset, entry);
prt_newline(&buf);
printbuf_indent_add(&buf, 2);
bch2_bkey_invalid(c, bkey_i_to_s_c(k),
__btree_node_type(level, btree_id), write, &buf);
- mustfix_fsck_err(c, "%s", buf.buf);
+ mustfix_fsck_err(c, journal_entry_bkey_invalid,
+ "%s", buf.buf);
le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
memmove(k, bkey_next(k), next - (void *) bkey_next(k));
}
static int journal_entry_btree_keys_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian, int write)
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
{
struct bkey_i *k = entry->start;
entry->level,
entry->btree_id,
k, version, big_endian,
- write|BKEY_INVALID_JOURNAL);
+ flags|BKEY_INVALID_JOURNAL);
if (ret == FSCK_DELETED_KEY)
continue;
prt_newline(out);
prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
}
- prt_printf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level);
+ prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level);
bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
first = false;
}
}
static int journal_entry_btree_root_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian, int write)
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
{
struct bkey_i *k = entry->start;
int ret = 0;
if (journal_entry_err_on(!entry->u64s ||
le16_to_cpu(entry->u64s) != k->k.u64s,
- c, jset, entry,
+ c, version, jset, entry,
+ journal_entry_btree_root_bad_size,
"invalid btree root journal entry: wrong number of keys")) {
void *next = vstruct_next(entry);
/*
}
return journal_validate_key(c, jset, entry, 1, entry->btree_id, k,
- version, big_endian, write);
+ version, big_endian, flags);
fsck_err:
return ret;
}
}
static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian, int write)
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
{
/* obsolete, don't care: */
return 0;
}
static int journal_entry_blacklist_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian, int write)
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
{
int ret = 0;
if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1,
- c, jset, entry,
+ c, version, jset, entry,
+ journal_entry_blacklist_bad_size,
"invalid journal seq blacklist entry: bad size")) {
journal_entry_null_range(entry, vstruct_next(entry));
}
}
static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian, int write)
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
{
struct jset_entry_blacklist_v2 *bl_entry;
int ret = 0;
if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2,
- c, jset, entry,
+ c, version, jset, entry,
+ journal_entry_blacklist_v2_bad_size,
"invalid journal seq blacklist entry: bad size")) {
journal_entry_null_range(entry, vstruct_next(entry));
goto out;
if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
le64_to_cpu(bl_entry->end),
- c, jset, entry,
+ c, version, jset, entry,
+ journal_entry_blacklist_v2_start_past_end,
"invalid journal seq blacklist entry: start > end")) {
journal_entry_null_range(entry, vstruct_next(entry));
}
}
static int journal_entry_usage_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian, int write)
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
{
struct jset_entry_usage *u =
container_of(entry, struct jset_entry_usage, entry);
int ret = 0;
if (journal_entry_err_on(bytes < sizeof(*u),
- c, jset, entry,
+ c, version, jset, entry,
+ journal_entry_usage_bad_size,
"invalid journal entry usage: bad size")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
static int journal_entry_data_usage_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian, int write)
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
{
struct jset_entry_data_usage *u =
container_of(entry, struct jset_entry_data_usage, entry);
if (journal_entry_err_on(bytes < sizeof(*u) ||
bytes < sizeof(*u) + u->r.nr_devs,
- c, jset, entry,
+ c, version, jset, entry,
+ journal_entry_data_usage_bad_size,
"invalid journal entry usage: bad size")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
static int journal_entry_clock_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian, int write)
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
{
struct jset_entry_clock *clock =
container_of(entry, struct jset_entry_clock, entry);
int ret = 0;
if (journal_entry_err_on(bytes != sizeof(*clock),
- c, jset, entry, "bad size")) {
+ c, version, jset, entry,
+ journal_entry_clock_bad_size,
+ "bad size")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
if (journal_entry_err_on(clock->rw > 1,
- c, jset, entry, "bad rw")) {
+ c, version, jset, entry,
+ journal_entry_clock_bad_rw,
+ "bad rw")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
}
static int journal_entry_dev_usage_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian, int write)
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
{
struct jset_entry_dev_usage *u =
container_of(entry, struct jset_entry_dev_usage, entry);
int ret = 0;
if (journal_entry_err_on(bytes < expected,
- c, jset, entry, "bad size (%u < %u)",
+ c, version, jset, entry,
+ journal_entry_dev_usage_bad_size,
+ "bad size (%u < %u)",
bytes, expected)) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
dev = le32_to_cpu(u->dev);
if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
- c, jset, entry, "bad dev")) {
+ c, version, jset, entry,
+ journal_entry_dev_usage_bad_dev,
+ "bad dev")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
if (journal_entry_err_on(u->pad,
- c, jset, entry, "bad pad")) {
+ c, version, jset, entry,
+ journal_entry_dev_usage_bad_pad,
+ "bad pad")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
}
static int journal_entry_log_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian, int write)
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
{
return 0;
}
}
static int journal_entry_overwrite_validate(struct bch_fs *c,
- struct jset *jset,
- struct jset_entry *entry,
- unsigned version, int big_endian, int write)
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
{
return journal_entry_btree_keys_validate(c, jset, entry,
version, big_endian, READ);
struct jset_entry_ops {
int (*validate)(struct bch_fs *, struct jset *,
- struct jset_entry *, unsigned, int, int);
+ struct jset_entry *, unsigned, int,
+ enum bkey_invalid_flags);
void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *);
};
int bch2_journal_entry_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
- unsigned version, int big_endian, int write)
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
{
return entry->type < BCH_JSET_ENTRY_NR
? bch2_jset_entry_ops[entry->type].validate(c, jset, entry,
- version, big_endian, write)
+ version, big_endian, flags)
: 0;
}
}
static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
- int write)
+ enum bkey_invalid_flags flags)
{
struct jset_entry *entry;
+ unsigned version = le32_to_cpu(jset->version);
int ret = 0;
vstruct_for_each(jset, entry) {
- if (journal_entry_err_on(vstruct_next(entry) >
- vstruct_last(jset), c, jset, entry,
+ if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset),
+ c, version, jset, entry,
+ journal_entry_past_jset_end,
"journal entry extends past end of jset")) {
jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
break;
}
ret = bch2_journal_entry_validate(c, jset, entry,
- le32_to_cpu(jset->version),
- JSET_BIG_ENDIAN(jset), write);
+ version, JSET_BIG_ENDIAN(jset), flags);
if (ret)
break;
}
static int jset_validate(struct bch_fs *c,
struct bch_dev *ca,
struct jset *jset, u64 sector,
- int write)
+ enum bkey_invalid_flags flags)
{
unsigned version;
int ret = 0;
return JOURNAL_ENTRY_NONE;
version = le32_to_cpu(jset->version);
- if (journal_entry_err_on(!bch2_version_compatible(version), c, jset, NULL,
+ if (journal_entry_err_on(!bch2_version_compatible(version),
+ c, version, jset, NULL,
+ jset_unsupported_version,
"%s sector %llu seq %llu: incompatible journal entry version %u.%u",
ca ? ca->name : c->name,
sector, le64_to_cpu(jset->seq),
}
if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)),
- c, jset, NULL,
+ c, version, jset, NULL,
+ jset_unknown_csum,
"%s sector %llu seq %llu: journal entry with unknown csum type %llu",
ca ? ca->name : c->name,
sector, le64_to_cpu(jset->seq),
/* last_seq is ignored when JSET_NO_FLUSH is true */
if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq),
- c, jset, NULL,
+ c, version, jset, NULL,
+ jset_last_seq_newer_than_seq,
"invalid journal entry: last_seq > seq (%llu > %llu)",
le64_to_cpu(jset->last_seq),
le64_to_cpu(jset->seq))) {
return JOURNAL_ENTRY_BAD;
}
- ret = jset_validate_entries(c, jset, write);
+ ret = jset_validate_entries(c, jset, flags);
fsck_err:
return ret;
}
{
size_t bytes = vstruct_bytes(jset);
unsigned version;
- int write = READ;
+ enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL;
int ret = 0;
if (le64_to_cpu(jset->magic) != jset_magic(c))
return JOURNAL_ENTRY_NONE;
version = le32_to_cpu(jset->version);
- if (journal_entry_err_on(!bch2_version_compatible(version), c, jset, NULL,
+ if (journal_entry_err_on(!bch2_version_compatible(version),
+ c, version, jset, NULL,
+ jset_unsupported_version,
"%s sector %llu seq %llu: unknown journal entry version %u.%u",
ca ? ca->name : c->name,
sector, le64_to_cpu(jset->seq),
return JOURNAL_ENTRY_REREAD;
if (journal_entry_err_on(bytes > bucket_sectors_left << 9,
- c, jset, NULL,
+ c, version, jset, NULL,
+ jset_past_bucket_end,
"%s sector %llu seq %llu: journal entry too big (%zu bytes)",
ca ? ca->name : c->name,
sector, le64_to_cpu(jset->seq), bytes))
ret = submit_bio_wait(bio);
kfree(bio);
- if (bch2_dev_io_err_on(ret, ca,
+ if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read,
"journal read error: sector %llu",
offset) ||
bch2_meta_read_fault("journal")) {
ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
csum_good = jset_csum_good(c, j);
- if (!csum_good)
+ if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum,
+ "journal checksum error"))
saw_bad = true;
ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
if (ja->bucket_seq[ja->cur_idx] &&
ja->sectors_free == ca->mi.bucket_size) {
+#if 0
+ /*
+ * Debug code for ZNS support, where we (probably) want to
+ * correlate where we stopped in the journal with the zone write
+ * points:
+ */
bch_err(c, "ja->sectors_free == ca->mi.bucket_size");
bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr);
for (i = 0; i < 3; i++) {
unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr;
+
bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]);
}
+#endif
ja->sectors_free = 0;
}
* those entries will be blacklisted:
*/
genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
- int write = READ;
+ enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL;
i = *_i;
}
if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq),
- c, &i->j, NULL,
+ c, le32_to_cpu(i->j.version), &i->j, NULL,
+ jset_last_seq_newer_than_seq,
"invalid journal entry: last_seq > seq (%llu > %llu)",
le64_to_cpu(i->j.last_seq),
le64_to_cpu(i->j.seq)))
}
if (!*last_seq) {
- fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
+ fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes,
+ "journal read done, but no entries found after dropping non-flushes");
return 0;
}
if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
+ jset_seq_blacklisted,
"found blacklisted journal entry %llu", seq);
i->ignore = true;
}
bch2_journal_ptrs_to_text(&buf2, c, i);
missing_end = seq - 1;
- fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
+ fsck_err(c, journal_entries_missing,
+ "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
" prev at %s\n"
" next at %s",
missing_start, missing_end,
continue;
for (ptr = 0; ptr < i->nr_ptrs; ptr++) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev);
+ ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev);
if (!i->ptrs[ptr].csum_good)
bch_err_dev_offset(ca, i->ptrs[ptr].sector,
bch2_replicas_entry_sort(&replicas.e);
- /*
- * If we're mounting in degraded mode - if we didn't read all
- * the devices - this is wrong:
- */
-
printbuf_reset(&buf);
bch2_replicas_entry_to_text(&buf, &replicas.e);
if (!degraded &&
- fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c,
- "superblock not marked as containing replicas %s",
- buf.buf)) {
+ !bch2_replicas_marked(c, &replicas.e) &&
+ (le64_to_cpu(i->j.seq) == *last_seq ||
+ fsck_err(c, journal_entry_replicas_not_marked,
+ "superblock not marked as containing replicas for journal entry %llu\n %s",
+ le64_to_cpu(i->j.seq), buf.buf))) {
ret = bch2_mark_replicas(c, &replicas.e);
if (ret)
goto err;
}
/**
- * journal_next_bucket - move on to the next journal bucket if possible
+ * journal_write_alloc - decide where to write next journal entry
+ *
+ * @j: journal object
+ * @w: journal buf (entry to be written)
+ *
+ * Returns: 0 on success, or -EROFS on failure
*/
-static int journal_write_alloc(struct journal *j, struct journal_buf *w,
- unsigned sectors)
+static int journal_write_alloc(struct journal *j, struct journal_buf *w)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_devs_mask devs;
struct journal_device *ja;
struct bch_dev *ca;
struct dev_alloc_list devs_sorted;
+ unsigned sectors = vstruct_sectors(w->data, c->block_bits);
unsigned target = c->opts.metadata_target ?:
c->opts.foreground_target;
unsigned i, replicas = 0, replicas_want =
struct journal *j = container_of(cl, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *w = journal_last_unwritten_buf(j);
+ struct bch_replicas_padded replicas;
union journal_res_state old, new;
u64 v, seq;
int err = 0;
if (!w->devs_written.nr) {
bch_err(c, "unable to write journal to sufficient devices");
err = -EIO;
+ } else {
+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
+ w->devs_written);
+ if (bch2_mark_replicas(c, &replicas.e))
+ err = -EIO;
}
+
if (err)
bch2_fatal_error(c);
bch2_journal_space_available(j);
+ track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
+ &j->max_in_flight_start, false);
+
closure_wake_up(&w->wait);
journal_wake(j);
if (!journal_state_count(new, new.unwritten_idx) &&
journal_last_unwritten_seq(j) <= journal_cur_seq(j)) {
+ spin_unlock(&j->lock);
closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
} else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
* might want to be written now:
*/
+ spin_unlock(&j->lock);
mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta));
+ } else {
+ spin_unlock(&j->lock);
}
-
- spin_unlock(&j->lock);
}
static void journal_write_endio(struct bio *bio)
struct journal_buf *w = journal_last_unwritten_buf(j);
unsigned long flags;
- if (bch2_dev_io_err_on(bio->bi_status, ca, "error writing journal entry %llu: %s",
+ if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
+ "error writing journal entry %llu: %s",
le64_to_cpu(w->data->seq),
bch2_blk_status_to_str(bio->bi_status)) ||
bch2_meta_write_fault("journal")) {
}
continue_at(cl, journal_write_done, c->io_complete_wq);
- return;
}
-static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset)
+static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
{
- struct jset_entry *i, *next, *prev = NULL;
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct jset_entry *start, *end, *i, *next, *prev = NULL;
+ struct jset *jset = w->data;
+ unsigned sectors, bytes, u64s;
+ bool validate_before_checksum = false;
+ unsigned long btree_roots_have = 0;
+ int ret;
/*
* Simple compaction, dropping empty jset_entries (from journal
if (!u64s)
continue;
- if (i->type == BCH_JSET_ENTRY_btree_root)
+ /*
+ * New btree roots are set by journalling them; when the journal
+ * entry gets written we have to propagate them to
+ * c->btree_roots
+ *
+ * But, every journal entry we write has to contain all the
+ * btree roots (at least for now); so after we copy btree roots
+ * to c->btree_roots we have to get any missing btree roots and
+ * add them to this journal entry:
+ */
+ if (i->type == BCH_JSET_ENTRY_btree_root) {
bch2_journal_entry_to_btree_root(c, i);
+ __set_bit(i->btree_id, &btree_roots_have);
+ }
/* Can we merge with previous entry? */
if (prev &&
prev = prev ? vstruct_next(prev) : jset->start;
jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
-}
-
-void bch2_journal_write(struct closure *cl)
-{
- struct journal *j = container_of(cl, struct journal, io);
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_dev *ca;
- struct journal_buf *w = journal_last_unwritten_buf(j);
- struct bch_replicas_padded replicas;
- struct jset_entry *start, *end;
- struct jset *jset;
- struct bio *bio;
- struct printbuf journal_debug_buf = PRINTBUF;
- bool validate_before_checksum = false;
- unsigned i, sectors, bytes, u64s, nr_rw_members = 0;
- int ret;
-
- BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
-
- journal_buf_realloc(j, w);
- jset = w->data;
-
- j->write_start_time = local_clock();
-
- spin_lock(&j->lock);
-
- /*
- * If the journal is in an error state - we did an emergency shutdown -
- * we prefer to continue doing journal writes. We just mark them as
- * noflush so they'll never be used, but they'll still be visible by the
- * list_journal tool - this helps in debugging.
- *
- * There's a caveat: the first journal write after marking the
- * superblock dirty must always be a flush write, because on startup
- * from a clean shutdown we didn't necessarily read the journal and the
- * new journal write might overwrite whatever was in the journal
- * previously - we can't leave the journal without any flush writes in
- * it.
- *
- * So if we're in an error state, and we're still starting up, we don't
- * write anything at all.
- */
- if (!test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags) &&
- (bch2_journal_error(j) ||
- w->noflush ||
- (!w->must_flush &&
- (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
- test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) {
- w->noflush = true;
- SET_JSET_NO_FLUSH(jset, true);
- jset->last_seq = 0;
- w->last_seq = 0;
-
- j->nr_noflush_writes++;
- } else if (!bch2_journal_error(j)) {
- j->last_flush_write = jiffies;
- j->nr_flush_writes++;
- clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
- } else {
- spin_unlock(&j->lock);
- goto err;
- }
- spin_unlock(&j->lock);
-
- /*
- * New btree roots are set by journalling them; when the journal entry
- * gets written we have to propagate them to c->btree_roots
- *
- * But, every journal entry we write has to contain all the btree roots
- * (at least for now); so after we copy btree roots to c->btree_roots we
- * have to get any missing btree roots and add them to this journal
- * entry:
- */
-
- bch2_journal_entries_postprocess(c, jset);
start = end = vstruct_last(jset);
- end = bch2_btree_roots_to_journal_entries(c, jset->start, end);
+ end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
bch2_journal_super_entries_add_common(c, &end,
le64_to_cpu(jset->seq));
bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
vstruct_bytes(jset), w->sectors << 9,
u64s, w->u64s_reserved, j->entry_u64s_reserved);
- goto err;
+ return -EINVAL;
}
jset->magic = cpu_to_le64(jset_magic(c));
validate_before_checksum = true;
if (validate_before_checksum &&
- jset_validate(c, NULL, jset, 0, WRITE))
- goto err;
+ (ret = jset_validate(c, NULL, jset, 0, WRITE)))
+ return ret;
ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
jset->encrypted_start,
vstruct_end(jset) - (void *) jset->encrypted_start);
if (bch2_fs_fatal_err_on(ret, c,
"error decrypting journal entry: %i", ret))
- goto err;
+ return ret;
jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
journal_nonce(jset), jset);
if (!validate_before_checksum &&
- jset_validate(c, NULL, jset, 0, WRITE))
- goto err;
+ (ret = jset_validate(c, NULL, jset, 0, WRITE)))
+ return ret;
memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
+ return 0;
+}
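One note on the validate-before/after-checksum split above, as the surrounding code suggests (an inference, not stated in the patch): once the payload has been encrypted the keys are ciphertext and can no longer be validated, so encrypting configurations must validate first; validating afterwards has the advantage that the check covers exactly the bytes that will hit disk.

/*
 * Inferred ordering constraint (not stated explicitly in the patch):
 *
 *   validate before checksum          validate after checksum/encrypt
 *   ------------------------          -------------------------------
 *   payload about to be encrypted     payload stays plaintext
 *   (keys unreadable afterwards)      (checked bytes == on-disk bytes)
 */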
+
+static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ int error = bch2_journal_error(j);
+
+ /*
+ * If the journal is in an error state - we did an emergency shutdown -
+ * we prefer to continue doing journal writes. We just mark them as
+ * noflush so they'll never be used, but they'll still be visible to the
+ * list_journal tool - this helps in debugging.
+ *
+ * There's a caveat: the first journal write after marking the
+ * superblock dirty must always be a flush write, because on startup
+ * from a clean shutdown we didn't necessarily read the journal and the
+ * new journal write might overwrite whatever was in the journal
+ * previously - we can't leave the journal without any flush writes in
+ * it.
+ *
+ * So if we're in an error state, and we're still starting up, we don't
+ * write anything at all.
+ */
+ if (error && test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags))
+ return -EIO;
+
+ if (error ||
+ w->noflush ||
+ (!w->must_flush &&
+ (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
+ test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) {
+ w->noflush = true;
+ SET_JSET_NO_FLUSH(w->data, true);
+ w->data->last_seq = 0;
+ w->last_seq = 0;
+
+ j->nr_noflush_writes++;
+ } else {
+ j->last_flush_write = jiffies;
+ j->nr_flush_writes++;
+ clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
+ }
+
+ return 0;
+}
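The three-way outcome of the helper is easier to scan as a table; a summary of the branches above:

/*
 * journal error && NEED_FLUSH_WRITE still set      -> -EIO, write nothing
 * journal error, or w->noflush, or
 *   (!w->must_flush && flush delay not yet elapsed
 *    && MAY_SKIP_FLUSH set)                        -> noflush write
 * otherwise                                        -> flush write
 */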
+
+void bch2_journal_write(struct closure *cl)
+{
+ struct journal *j = container_of(cl, struct journal, io);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_dev *ca;
+ struct journal_buf *w = journal_last_unwritten_buf(j);
+ struct bch_replicas_padded replicas;
+ struct bio *bio;
+ struct printbuf journal_debug_buf = PRINTBUF;
+ unsigned i, nr_rw_members = 0;
+ int ret;
+
+ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
+
+ j->write_start_time = local_clock();
-retry_alloc:
spin_lock(&j->lock);
- ret = journal_write_alloc(j, w, sectors);
+ ret = bch2_journal_write_pick_flush(j, w);
+ spin_unlock(&j->lock);
+ if (ret)
+ goto err;
+
+ journal_buf_realloc(j, w);
+
+ ret = bch2_journal_write_prep(j, w);
+ if (ret)
+ goto err;
+
+ j->entry_bytes_written += vstruct_bytes(w->data);
+
+ while (1) {
+ spin_lock(&j->lock);
+ ret = journal_write_alloc(j, w);
+ if (!ret || !j->can_discard)
+ break;
- if (ret && j->can_discard) {
spin_unlock(&j->lock);
bch2_journal_do_discards(j);
- goto retry_alloc;
}
- if (ret)
+ if (ret) {
__bch2_journal_debug_to_text(&journal_debug_buf, j);
+ spin_unlock(&j->lock);
+ bch_err(c, "Unable to allocate journal write:\n%s",
+ journal_debug_buf.buf);
+ printbuf_exit(&journal_debug_buf);
+ goto err;
+ }
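The loop drops j->lock before retrying because, as I read it, bch2_journal_do_discards() issues block-layer discards and may sleep, which is not permitted under a spinlock; allocation is then re-attempted with the lock retaken.

/*
 * Shape of the retry above (illustrative):
 *
 *   lock -> try alloc -> success, or nothing left to discard? keep lock, done
 *                     -> else unlock, discard (may sleep), loop
 */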
/*
* write is allocated, no longer need to account for it in
bch2_journal_space_available(j);
spin_unlock(&j->lock);
- if (ret) {
- bch_err(c, "Unable to allocate journal write:\n%s",
- journal_debug_buf.buf);
- printbuf_exit(&journal_debug_buf);
- goto err;
- }
-
w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
if (c->opts.nochanges)
if (ret)
goto err;
- if (!JSET_NO_FLUSH(jset) && w->separate_flush) {
+ if (!JSET_NO_FLUSH(w->data) && w->separate_flush) {
for_each_rw_member(ca, c, i) {
percpu_ref_get(&ca->io_ref);
jset_entry_for_each_key(entry, k)
int bch2_journal_entry_validate(struct bch_fs *, struct jset *,
- struct jset_entry *, unsigned, int, int);
+ struct jset_entry *, unsigned, int,
+ enum bkey_invalid_flags);
void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *,
struct jset_entry *);
#include "bcachefs.h"
#include "btree_key_cache.h"
#include "btree_update.h"
+#include "buckets.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "replicas.h"
-#include "super.h"
+#include "sb-members.h"
#include "trace.h"
#include <linux/kthread.h>
return available;
}
-static void journal_set_remaining(struct journal *j, unsigned u64s_remaining)
+static inline void journal_set_watermark(struct journal *j)
{
- union journal_preres_state old, new;
- u64 v = atomic64_read(&j->prereserved.counter);
-
- do {
- old.v = new.v = v;
- new.remaining = u64s_remaining;
- } while ((v = atomic64_cmpxchg(&j->prereserved.counter,
- old.v, new.v)) != old.v);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ bool low_on_space = j->space[journal_space_clean].total * 4 <=
+ j->space[journal_space_total].total;
+ bool low_on_pin = fifo_free(&j->pin) < j->pin.size / 4;
+ unsigned watermark = low_on_space || low_on_pin
+ ? BCH_WATERMARK_reclaim
+ : BCH_WATERMARK_stripe;
+
+ if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space],
+ &j->low_on_space_start, low_on_space) ||
+ track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin],
+ &j->low_on_pin_start, low_on_pin))
+ trace_and_count(c, journal_full, c);
+
+ swap(watermark, j->watermark);
+ if (watermark > j->watermark)
+ journal_wake(j);
}
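A worked example with invented numbers: the journal switches to the reclaim watermark once clean space falls to a quarter of the total (or the pin FIFO is three-quarters full), and stays there until the condition clears.

/*
 * Worked example (invented numbers):
 *   total = 1024, clean = 200  ->  200 * 4 =  800 <= 1024  -> low_on_space
 *   total = 1024, clean = 300  ->  300 * 4 = 1200 >  1024  -> ok, stripe
 */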
static struct journal_space
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
unsigned clean, clean_ondisk, total;
- s64 u64s_remaining = 0;
unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
j->buf[1].buf_size >> 9);
unsigned i, nr_online = 0, nr_devs_want;
else
clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
- u64s_remaining = (u64) clean << 6;
- u64s_remaining -= (u64) total << 3;
- u64s_remaining = max(0LL, u64s_remaining);
- u64s_remaining /= 4;
- u64s_remaining = min_t(u64, u64s_remaining, U32_MAX);
+ journal_set_watermark(j);
out:
j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0;
j->cur_entry_error = ret;
- journal_set_remaining(j, u64s_remaining);
- journal_set_watermark(j);
if (!ret)
journal_wake(j);
* entry, holding it open to ensure it gets replayed during recovery:
*/
-static void bch2_journal_reclaim_fast(struct journal *j)
+void bch2_journal_reclaim_fast(struct journal *j)
{
- struct journal_entry_pin_list temp;
bool popped = false;
lockdep_assert_held(&j->lock);
*/
while (!fifo_empty(&j->pin) &&
!atomic_read(&fifo_peek_front(&j->pin).count)) {
- fifo_pop(&j->pin, temp);
+ j->pin.front++;
popped = true;
}
bch2_journal_space_available(j);
}
-void __bch2_journal_pin_put(struct journal *j, u64 seq)
+bool __bch2_journal_pin_put(struct journal *j, u64 seq)
{
struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
- if (atomic_dec_and_test(&pin_list->count))
- bch2_journal_reclaim_fast(j);
+ return atomic_dec_and_test(&pin_list->count);
}
void bch2_journal_pin_put(struct journal *j, u64 seq)
{
- struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
-
- if (atomic_dec_and_test(&pin_list->count)) {
+ if (__bch2_journal_pin_put(j, seq)) {
spin_lock(&j->lock);
bch2_journal_reclaim_fast(j);
spin_unlock(&j->lock);
list_del_init(&pin->list);
/*
- * Unpinning a journal entry make make journal_next_bucket() succeed, if
+ * Unpinning a journal entry may make journal_next_bucket() succeed, if
* writing a new last_seq will now make another bucket available:
*/
return atomic_dec_and_test(&pin_list->count) &&
return JOURNAL_PIN_other;
}
-void bch2_journal_pin_set(struct journal *j, u64 seq,
+static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq,
struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
+ journal_pin_flush_fn flush_fn,
+ enum journal_pin_type type)
+{
+ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
+
+ /*
+ * flush_fn is how we identify journal pins in debugfs, so must always
+ * exist, even if it doesn't do anything:
+ */
+ BUG_ON(!flush_fn);
+
+ atomic_inc(&pin_list->count);
+ pin->seq = seq;
+ pin->flush = flush_fn;
+ list_add(&pin->list, &pin_list->list[type]);
+}
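Because of the BUG_ON above, callers that have nothing to write back still need a flush_fn; a hedged sketch of a no-op callback (signature assumed from how journal_pin_flush_fn is used elsewhere in the tree):

/* Illustrative no-op flush_fn: keeps the pin identifiable in debugfs. */
static int example_noop_flush(struct journal *j,
			      struct journal_entry_pin *pin, u64 seq)
{
	return 0;
}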
+
+void bch2_journal_pin_copy(struct journal *j,
+ struct journal_entry_pin *dst,
+ struct journal_entry_pin *src,
+ journal_pin_flush_fn flush_fn)
{
- struct journal_entry_pin_list *pin_list;
bool reclaim;
spin_lock(&j->lock);
+ u64 seq = READ_ONCE(src->seq);
+
if (seq < journal_last_seq(j)) {
/*
* bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on
return;
}
- pin_list = journal_seq_pin(j, seq);
+ reclaim = __journal_pin_drop(j, dst);
- reclaim = __journal_pin_drop(j, pin);
+ bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn));
- atomic_inc(&pin_list->count);
- pin->seq = seq;
- pin->flush = flush_fn;
+ if (reclaim)
+ bch2_journal_reclaim_fast(j);
+ spin_unlock(&j->lock);
- if (flush_fn)
- list_add(&pin->list, &pin_list->list[journal_pin_type(flush_fn)]);
- else
- list_add(&pin->list, &pin_list->flushed);
+ /*
+ * If the journal is currently full, we might want to call flush_fn
+ * immediately:
+ */
+ journal_wake(j);
+}
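The bail-out near the top of bch2_journal_pin_copy() guards the race the old inline helper handled with READ_ONCE(); an illustrative interleaving:

/*
 * CPU0: bch2_journal_pin_copy(j, dst, src, fn)    CPU1: pin drop on src
 *       spin_lock(&j->lock)
 *       seq = READ_ONCE(src->seq)   <- 0, or a since-reclaimed seq, if CPU1 won
 *       seq < journal_last_seq(j)   -> return with dst left unpinned
 */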
+
+void bch2_journal_pin_set(struct journal *j, u64 seq,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
+{
+ bool reclaim;
+
+ spin_lock(&j->lock);
+
+ BUG_ON(seq < journal_last_seq(j));
+
+ reclaim = __journal_pin_drop(j, pin);
+
+ bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn));
if (reclaim)
bch2_journal_reclaim_fast(j);
/**
* bch2_journal_pin_flush: ensure journal pin callback is no longer running
+ * @j: journal object
+ * @pin: pin to flush
*/
void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
{
/* Try to keep the journal at most half full: */
nr_buckets = ja->nr / 2;
- /* And include pre-reservations: */
- nr_buckets += DIV_ROUND_UP(j->prereserved.reserved,
- (ca->mi.bucket_size << 6) -
- journal_entry_overhead(j));
-
nr_buckets = min(nr_buckets, ja->nr);
bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
}
/**
- * bch2_journal_reclaim - free up journal buckets
+ * __bch2_journal_reclaim - free up journal buckets
+ * @j: journal object
+ * @direct: direct or background reclaim?
+ * @kicked: requested to run since we last ran?
+ * Returns: 0 on success, or -EIO if the journal has been shutdown
*
* Background journal reclaim writes out btree nodes. It should be run
* early enough so that we never completely run out of journal buckets.
msecs_to_jiffies(c->opts.journal_reclaim_delay)))
min_nr = 1;
- if (j->prereserved.reserved * 4 > j->prereserved.remaining)
- min_nr = 1;
-
- if (fifo_free(&j->pin) <= 32)
+ if (j->watermark != BCH_WATERMARK_stripe)
min_nr = 1;
if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used)
trace_and_count(c, journal_reclaim_start, c,
direct, kicked,
min_nr, min_key_cache,
- j->prereserved.reserved,
- j->prereserved.remaining,
atomic_read(&c->btree_cache.dirty),
c->btree_cache.used,
atomic_long_read(&c->btree_key_cache.nr_dirty),
"bch-reclaim/%s", c->name);
ret = PTR_ERR_OR_ZERO(p);
if (ret) {
- bch_err(c, "error creating journal reclaim thread: %s", bch2_err_str(ret));
+ bch_err_msg(c, ret, "creating journal reclaim thread");
return ret;
}
bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
{
+	/* TODO: instrument this with time_stats */
bool did_work = false;
if (!test_bit(JOURNAL_STARTED, &j->flags))
return &j->pin.data[seq & j->pin.mask];
}
-void __bch2_journal_pin_put(struct journal *, u64);
+void bch2_journal_reclaim_fast(struct journal *);
+bool __bch2_journal_pin_put(struct journal *, u64);
void bch2_journal_pin_put(struct journal *, u64);
void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
bch2_journal_pin_set(j, seq, pin, flush_fn);
}
-static inline void bch2_journal_pin_copy(struct journal *j,
- struct journal_entry_pin *dst,
- struct journal_entry_pin *src,
- journal_pin_flush_fn flush_fn)
-{
- /* Guard against racing with journal_pin_drop(src): */
- u64 seq = READ_ONCE(src->seq);
-
- if (seq)
- bch2_journal_pin_add(j, seq, dst, flush_fn);
-}
+void bch2_journal_pin_copy(struct journal *,
+ struct journal_entry_pin *,
+ struct journal_entry_pin *,
+ journal_pin_flush_fn);
static inline void bch2_journal_pin_update(struct journal *j, u64 seq,
struct journal_entry_pin *pin,
struct printbuf *err)
{
struct bch_sb_field_journal *journal = field_to_type(f, journal);
- struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx;
+ struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
int ret = -BCH_ERR_invalid_sb_journal;
unsigned nr;
unsigned i;
goto err;
}
- if (b[0] < le16_to_cpu(m->first_bucket)) {
+ if (b[0] < le16_to_cpu(m.first_bucket)) {
prt_printf(err, "journal bucket %llu before first bucket %u",
- b[0], le16_to_cpu(m->first_bucket));
+ b[0], le16_to_cpu(m.first_bucket));
goto err;
}
- if (b[nr - 1] >= le64_to_cpu(m->nbuckets)) {
+ if (b[nr - 1] >= le64_to_cpu(m.nbuckets)) {
prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)",
- b[nr - 1], le64_to_cpu(m->nbuckets));
+ b[nr - 1], le64_to_cpu(m.nbuckets));
goto err;
}
struct printbuf *err)
{
struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
- struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx;
+ struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
int ret = -BCH_ERR_invalid_sb_journal;
unsigned nr;
unsigned i;
goto err;
}
- if (b[0].start < le16_to_cpu(m->first_bucket)) {
+ if (b[0].start < le16_to_cpu(m.first_bucket)) {
prt_printf(err, "journal bucket %llu before first bucket %u",
- b[0].start, le16_to_cpu(m->first_bucket));
+ b[0].start, le16_to_cpu(m.first_bucket));
goto err;
}
- if (b[nr - 1].end > le64_to_cpu(m->nbuckets)) {
+ if (b[nr - 1].end > le64_to_cpu(m.nbuckets)) {
prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)",
- b[nr - 1].end - 1, le64_to_cpu(m->nbuckets));
+ b[nr - 1].end - 1, le64_to_cpu(m.nbuckets));
goto err;
}
if (buckets[i] + 1 != buckets[i + 1])
nr_compacted++;
- j = bch2_sb_resize_journal_v2(&ca->disk_sb,
+ j = bch2_sb_field_resize(&ca->disk_sb, journal_v2,
(sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64));
if (!j)
return -BCH_ERR_ENOSPC_sb_journal;
&bl->start[i + 1],
sizeof(bl->start[0]) * (nr - i));
- bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb,
- sb_blacklist_u64s(nr));
+ bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
+ sb_blacklist_u64s(nr));
BUG_ON(!bl);
}
int ret = 0;
mutex_lock(&c->sb_lock);
- bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb);
+ bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
nr = blacklist_nr_entries(bl);
for (i = 0; i < nr; i++) {
}
}
- bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb,
- sb_blacklist_u64s(nr + 1));
+ bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
+ sb_blacklist_u64s(nr + 1));
if (!bl) {
ret = -BCH_ERR_ENOSPC_sb_journal_seq_blacklist;
goto out;
int bch2_blacklist_table_initialize(struct bch_fs *c)
{
struct bch_sb_field_journal_seq_blacklist *bl =
- bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb);
+ bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
struct journal_seq_blacklist_table *t;
unsigned i, nr = blacklist_nr_entries(bl);
struct journal_seq_blacklist_table *t;
struct bch_sb_field_journal_seq_blacklist *bl;
struct journal_seq_blacklist_entry *src, *dst;
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
unsigned i, nr, new_nr;
int ret;
- bch2_trans_init(&trans, c, 0, 0);
-
for (i = 0; i < BTREE_ID_NR; i++) {
struct btree_iter iter;
struct btree *b;
- bch2_trans_node_iter_init(&trans, &iter, i, POS_MIN,
+ bch2_trans_node_iter_init(trans, &iter, i, POS_MIN,
0, 0, BTREE_ITER_PREFETCH);
retry:
- bch2_trans_begin(&trans);
+ bch2_trans_begin(trans);
b = bch2_btree_iter_peek_node(&iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
- bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_iter_exit(trans, &iter);
}
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
if (ret)
return;
mutex_lock(&c->sb_lock);
- bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb);
+ bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
if (!bl)
goto out;
bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr);
if (new_nr != nr) {
- bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb,
+ bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
new_nr ? sb_blacklist_u64s(new_nr) : 0);
BUG_ON(new_nr && !bl);
u64 seq;
};
-/*
- * For reserving space in the journal prior to getting a reservation on a
- * particular journal entry:
- */
-struct journal_preres {
- unsigned u64s;
-};
-
union journal_res_state {
struct {
atomic64_t counter;
};
};
-union journal_preres_state {
- struct {
- atomic64_t counter;
- };
-
- struct {
- u64 v;
- };
-
- struct {
- u64 waiting:1,
- reserved:31,
- remaining:32;
- };
-};
-
/* bytes: */
#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */
#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */
union journal_res_state reservations;
enum bch_watermark watermark;
- union journal_preres_state prereserved;
-
} __aligned(SMP_CACHE_BYTES);
unsigned long flags;
unsigned long last_flush_write;
- u64 res_get_blocked_start;
u64 write_start_time;
u64 nr_flush_writes;
u64 nr_noflush_writes;
+ u64 entry_bytes_written;
+
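+	/* start times of conditions that can stall journal writes, for blocked-time accounting: */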
+ u64 low_on_space_start;
+ u64 low_on_pin_start;
+ u64 max_in_flight_start;
struct bch2_time_stats *flush_write_time;
struct bch2_time_stats *noflush_write_time;
- struct bch2_time_stats *blocked_time;
struct bch2_time_stats *flush_seq_time;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
#include "recovery.h"
/* KEY_TYPE_lru is obsolete: */
-int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_lru_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
- if (!lru_pos_time(k.k->p)) {
- prt_printf(err, "lru entry at time=0");
- return -BCH_ERR_invalid_bkey;
-
- }
+ int ret = 0;
- return 0;
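+	/* bkey_fsck_err_on() records the error in @err and jumps to fsck_err: */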
+ bkey_fsck_err_on(!lru_pos_time(k.k->p), c, err,
+ lru_entry_at_time_0,
+ "lru entry at time=0");
+fsck_err:
+ return ret;
}
void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c,
int ret;
if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos), c,
+ lru_entry_to_invalid_bucket,
"lru key points to nonexistent device:bucket %llu:%llu",
alloc_pos.inode, alloc_pos.offset))
return bch2_btree_delete_at(trans, lru_iter, 0);
}
if (c->opts.reconstruct_alloc ||
- fsck_err(c, "incorrect lru entry: lru %s time %llu\n"
+ fsck_err(c, lru_entry_bad,
+ "incorrect lru entry: lru %s time %llu\n"
" %s\n"
" for %s",
bch2_lru_types[type],
int ret = 0;
ret = bch2_trans_run(c,
- for_each_btree_key_commit(&trans, iter,
+ for_each_btree_key_commit(trans, iter,
BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
- bch2_check_lru_key(&trans, &iter, k, &last_flushed_pos)));
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
+ bch2_check_lru_key(trans, &iter, k, &last_flushed_pos)));
if (ret)
bch_err_fn(c, ret);
return ret;
return BCH_LRU_read;
}
-int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_lru_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#include "buckets.h"
#include "errcode.h"
#include "extents.h"
-#include "io.h"
+#include "io_write.h"
#include "journal.h"
#include "keylist.h"
#include "migrate.h"
static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
enum btree_id id;
int ret = 0;
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-
for (id = 0; id < BTREE_ID_NR; id++) {
if (!btree_type_has_ptrs(id))
continue;
- ret = for_each_btree_key_commit(&trans, iter, id, POS_MIN,
+ ret = for_each_btree_key_commit(trans, iter, id, POS_MIN,
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
- NULL, NULL, BTREE_INSERT_NOFAIL,
- bch2_dev_usrdata_drop_key(&trans, &iter, k, dev_idx, flags));
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags));
if (ret)
break;
}
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
return ret;
}
static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
- struct btree_trans trans;
+ struct btree_trans *trans;
struct btree_iter iter;
struct closure cl;
struct btree *b;
if (flags & BCH_FORCE_IF_METADATA_LOST)
return -EINVAL;
+ trans = bch2_trans_get(c);
bch2_bkey_buf_init(&k);
- bch2_trans_init(&trans, c, 0, 0);
closure_init_stack(&cl);
for (id = 0; id < BTREE_ID_NR; id++) {
- bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0,
+ bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
BTREE_ITER_PREFETCH);
retry:
ret = 0;
- while (bch2_trans_begin(&trans),
+ while (bch2_trans_begin(trans),
(b = bch2_btree_iter_peek_node(&iter)) &&
!(ret = PTR_ERR_OR_ZERO(b))) {
if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx))
break;
}
- ret = bch2_btree_node_update_key(&trans, &iter, b, k.k, 0, false);
+ ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
ret = 0;
continue;
}
if (ret) {
- bch_err(c, "Error updating btree node key: %s",
- bch2_err_str(ret));
+ bch_err_msg(c, ret, "updating btree node key");
break;
}
next:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
- bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_iter_exit(trans, &iter);
if (ret)
goto err;
bch2_btree_interior_updates_flush(c);
ret = 0;
err:
- bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&k, c);
+ bch2_trans_put(trans);
BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
#include "errcode.h"
#include "error.h"
#include "inode.h"
-#include "io.h"
+#include "io_read.h"
+#include "io_write.h"
#include "journal_reclaim.h"
#include "keylist.h"
#include "move.h"
#include "replicas.h"
+#include "snapshot.h"
#include "super-io.h"
#include "trace.h"
}
}
-static void progress_list_add(struct bch_fs *c, struct bch_move_stats *stats)
-{
- mutex_lock(&c->data_progress_lock);
- list_add(&stats->list, &c->data_progress_list);
- mutex_unlock(&c->data_progress_lock);
-}
-
-static void progress_list_del(struct bch_fs *c, struct bch_move_stats *stats)
-{
- mutex_lock(&c->data_progress_lock);
- list_del(&stats->list);
- mutex_unlock(&c->data_progress_lock);
-}
-
struct moving_io {
struct list_head read_list;
struct list_head io_list;
closure_put(&ctxt->cl);
}
-void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt,
- struct btree_trans *trans)
+void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt)
{
struct moving_io *io;
- if (trans)
- bch2_trans_unlock(trans);
-
while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
+ bch2_trans_unlock_long(ctxt->trans);
list_del(&io->read_list);
move_write(io);
}
}
-static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt,
- struct btree_trans *trans)
+void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
{
unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
- move_ctxt_wait_event(ctxt, trans,
+ move_ctxt_wait_event(ctxt,
!atomic_read(&ctxt->write_sectors) ||
atomic_read(&ctxt->write_sectors) != sectors_pending);
}
void bch2_moving_ctxt_exit(struct moving_context *ctxt)
{
- struct bch_fs *c = ctxt->c;
+ struct bch_fs *c = ctxt->trans->c;
- move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads));
+ move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
closure_sync(&ctxt->cl);
EBUG_ON(atomic_read(&ctxt->write_sectors));
EBUG_ON(atomic_read(&ctxt->read_sectors));
EBUG_ON(atomic_read(&ctxt->read_ios));
- if (ctxt->stats) {
- progress_list_del(c, ctxt->stats);
- trace_move_data(c,
- atomic64_read(&ctxt->stats->sectors_moved),
- atomic64_read(&ctxt->stats->keys_moved));
- }
-
mutex_lock(&c->moving_context_lock);
list_del(&ctxt->list);
mutex_unlock(&c->moving_context_lock);
+
+ bch2_trans_put(ctxt->trans);
+ memset(ctxt, 0, sizeof(*ctxt));
}
void bch2_moving_ctxt_init(struct moving_context *ctxt,
{
memset(ctxt, 0, sizeof(*ctxt));
- ctxt->c = c;
+ ctxt->trans = bch2_trans_get(c);
ctxt->fn = (void *) _RET_IP_;
ctxt->rate = rate;
ctxt->stats = stats;
mutex_lock(&c->moving_context_lock);
list_add(&ctxt->list, &c->moving_context_list);
mutex_unlock(&c->moving_context_lock);
+}
- if (stats) {
- progress_list_add(c, stats);
- stats->data_type = BCH_DATA_user;
- }
+void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c)
+{
+ trace_move_data(c, stats);
}
void bch2_move_stats_init(struct bch_move_stats *stats, char *name)
{
memset(stats, 0, sizeof(*stats));
+ stats->data_type = BCH_DATA_user;
scnprintf(stats->name, sizeof(stats->name), "%s", name);
}
return bch2_trans_relock(trans) ?:
bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
- bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}
-static int bch2_move_extent(struct btree_trans *trans,
- struct btree_iter *iter,
- struct moving_context *ctxt,
- struct move_bucket_in_flight *bucket_in_flight,
- struct bch_io_opts io_opts,
- enum btree_id btree_id,
- struct bkey_s_c k,
- struct data_update_opts data_opts)
+int bch2_move_extent(struct moving_context *ctxt,
+ struct move_bucket_in_flight *bucket_in_flight,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct bch_io_opts io_opts,
+ struct data_update_opts data_opts)
{
+ struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
struct moving_io *io;
unsigned sectors = k.k->size, pages;
int ret = -ENOMEM;
+ if (ctxt->stats)
+ ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
trace_move_extent2(c, k);
bch2_data_update_opts_normalize(k, &data_opts);
io->rbio.bio.bi_end_io = move_read_endio;
ret = bch2_data_update_init(trans, ctxt, &io->write, ctxt->wp,
- io_opts, data_opts, btree_id, k);
+ io_opts, data_opts, iter->btree_id, k);
if (ret && ret != -BCH_ERR_unwritten_extent_update)
goto err_free_pages;
BUG_ON(ret);
- io->write.ctxt = ctxt;
io->write.op.end_io = move_write_done;
+ if (ctxt->rate)
+ bch2_ratelimit_increment(ctxt->rate, k.k->size);
+
if (ctxt->stats) {
atomic64_inc(&ctxt->stats->keys_moved);
atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
closure_get(&ctxt->cl);
bch2_read_extent(trans, &io->rbio,
bkey_start_pos(k.k),
- btree_id, k, 0,
+ iter->btree_id, k, 0,
BCH_READ_NODECODE|
BCH_READ_LAST_FRAGMENT);
return 0;
return ret;
}
-static int lookup_inode(struct btree_trans *trans, struct bpos pos,
- struct bch_inode_unpacked *inode)
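+/*
+ * Extents in different snapshots may belong to different versions of an
+ * inode, each with its own IO options: cache the options for every version of
+ * this inode number, then match the extent against them by snapshot ancestry:
+ */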
+struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
+ struct per_snapshot_io_opts *io_opts,
+ struct bkey_s_c extent_k)
+{
+ struct bch_fs *c = trans->c;
+ u32 restart_count = trans->restart_count;
+ int ret = 0;
+
+ if (io_opts->cur_inum != extent_k.k->p.inode) {
+ struct btree_iter iter;
+ struct bkey_s_c k;
+
+ io_opts->d.nr = 0;
+
+ for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode),
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ if (k.k->p.offset != extent_k.k->p.inode)
+ break;
+
+ if (!bkey_is_inode(k.k))
+ continue;
+
+ struct bch_inode_unpacked inode;
+ BUG_ON(bch2_inode_unpack(k, &inode));
+
+ struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
+ bch2_inode_opts_get(&e.io_opts, trans->c, &inode);
+
+ ret = darray_push(&io_opts->d, e);
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+ io_opts->cur_inum = extent_k.k->p.inode;
+ }
+
+ ret = ret ?: trans_was_restarted(trans, restart_count);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (extent_k.k->p.snapshot) {
+ struct snapshot_io_opts_entry *i;
+ darray_for_each(io_opts->d, i)
+ if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot))
+ return &i->io_opts;
+ }
+
+ return &io_opts->fs_io_opts;
+}
+
+int bch2_move_get_io_opts_one(struct btree_trans *trans,
+ struct bch_io_opts *io_opts,
+ struct bkey_s_c extent_k)
{
struct btree_iter iter;
struct bkey_s_c k;
int ret;
- bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos,
- BTREE_ITER_ALL_SNAPSHOTS);
- k = bch2_btree_iter_peek(&iter);
+	/* reflink btree? keys there have inode 0, so use the fs-wide opts: */
+ if (!extent_k.k->p.inode) {
+ *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
+ return 0;
+ }
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot),
+ BTREE_ITER_CACHED);
ret = bkey_err(k);
- if (ret)
- goto err;
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ return ret;
- if (!k.k || !bkey_eq(k.k->p, pos)) {
- ret = -BCH_ERR_ENOENT_inode;
- goto err;
+ if (!ret && bkey_is_inode(k.k)) {
+ struct bch_inode_unpacked inode;
+ bch2_inode_unpack(k, &inode);
+ bch2_inode_opts_get(io_opts, trans->c, &inode);
+ } else {
+ *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
}
- ret = bkey_is_inode(k.k) ? 0 : -EIO;
- if (ret)
- goto err;
-
- ret = bch2_inode_unpack(k, inode);
- if (ret)
- goto err;
-err:
bch2_trans_iter_exit(trans, &iter);
- return ret;
+ return 0;
}
-static int move_ratelimit(struct btree_trans *trans,
- struct moving_context *ctxt)
+int bch2_move_ratelimit(struct moving_context *ctxt)
{
- struct bch_fs *c = trans->c;
+ struct bch_fs *c = ctxt->trans->c;
u64 delay;
- if (ctxt->wait_on_copygc) {
- bch2_trans_unlock(trans);
+ if (ctxt->wait_on_copygc && !c->copygc_running) {
+ bch2_trans_unlock_long(ctxt->trans);
wait_event_killable(c->copygc_running_wq,
!c->copygc_running ||
kthread_should_stop());
do {
delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;
+
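+		/*
+		 * Drop btree locks before sleeping; longer sleeps get
+		 * bch2_trans_unlock_long(), which (an assumption, based on its
+		 * use for indefinite waits elsewhere) also releases state such
+		 * as the SRCU read lock that shouldn't be held for long:
+		 */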
if (delay) {
- bch2_trans_unlock(trans);
+ if (delay > HZ / 10)
+ bch2_trans_unlock_long(ctxt->trans);
+ else
+ bch2_trans_unlock(ctxt->trans);
set_current_state(TASK_INTERRUPTIBLE);
}
schedule_timeout(delay);
if (unlikely(freezing(current))) {
- move_ctxt_wait_event(ctxt, trans, list_empty(&ctxt->reads));
+ move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
try_to_freeze();
}
} while (delay);
* XXX: these limits really ought to be per device, SSDs and hard drives
* will want different limits
*/
- move_ctxt_wait_event(ctxt, trans,
+ move_ctxt_wait_event(ctxt,
atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
return 0;
}
-static int move_get_io_opts(struct btree_trans *trans,
- struct bch_io_opts *io_opts,
- struct bkey_s_c k, u64 *cur_inum)
-{
- struct bch_inode_unpacked inode;
- int ret;
-
- if (*cur_inum == k.k->p.inode)
- return 0;
-
- ret = lookup_inode(trans,
- SPOS(0, k.k->p.inode, k.k->p.snapshot),
- &inode);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- return ret;
-
- if (!ret)
- bch2_inode_opts_get(io_opts, trans->c, &inode);
- else
- *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
- *cur_inum = k.k->p.inode;
- return 0;
-}
-
-static int __bch2_move_data(struct moving_context *ctxt,
- struct bpos start,
- struct bpos end,
- move_pred_fn pred, void *arg,
- enum btree_id btree_id)
+static int bch2_move_data_btree(struct moving_context *ctxt,
+ struct bpos start,
+ struct bpos end,
+ move_pred_fn pred, void *arg,
+ enum btree_id btree_id)
{
- struct bch_fs *c = ctxt->c;
- struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
+ struct per_snapshot_io_opts snapshot_io_opts;
+ struct bch_io_opts *io_opts;
struct bkey_buf sk;
- struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
struct data_update_opts data_opts;
- u64 cur_inum = U64_MAX;
int ret = 0, ret2;
+ per_snapshot_io_opts_init(&snapshot_io_opts, c);
bch2_bkey_buf_init(&sk);
- bch2_trans_init(&trans, c, 0, 0);
if (ctxt->stats) {
ctxt->stats->data_type = BCH_DATA_user;
- ctxt->stats->btree_id = btree_id;
- ctxt->stats->pos = start;
+ ctxt->stats->pos = BBPOS(btree_id, start);
}
- bch2_trans_iter_init(&trans, &iter, btree_id, start,
+ bch2_trans_iter_init(trans, &iter, btree_id, start,
BTREE_ITER_PREFETCH|
BTREE_ITER_ALL_SNAPSHOTS);
if (ctxt->rate)
bch2_ratelimit_reset(ctxt->rate);
- while (!move_ratelimit(&trans, ctxt)) {
- bch2_trans_begin(&trans);
+ while (!bch2_move_ratelimit(ctxt)) {
+ bch2_trans_begin(trans);
k = bch2_btree_iter_peek(&iter);
if (!k.k)
break;
if (ctxt->stats)
- ctxt->stats->pos = iter.pos;
+ ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
if (!bkey_extent_is_direct_data(k.k))
goto next_nondata;
- ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum);
+ io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k);
+ ret = PTR_ERR_OR_ZERO(io_opts);
if (ret)
continue;
memset(&data_opts, 0, sizeof(data_opts));
- if (!pred(c, arg, k, &io_opts, &data_opts))
+ if (!pred(c, arg, k, io_opts, &data_opts))
goto next;
/*
bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
- ret2 = bch2_move_extent(&trans, &iter, ctxt, NULL,
- io_opts, btree_id, k, data_opts);
+ ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts);
if (ret2) {
if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
continue;
if (ret2 == -ENOMEM) {
/* memory allocation failure, wait for some IO to finish */
- bch2_move_ctxt_wait_for_io(ctxt, &trans);
+ bch2_move_ctxt_wait_for_io(ctxt);
continue;
}
/* XXX signal failure */
goto next;
}
-
- if (ctxt->rate)
- bch2_ratelimit_increment(ctxt->rate, k.k->size);
next:
if (ctxt->stats)
atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
bch2_btree_iter_advance(&iter);
}
- bch2_trans_iter_exit(&trans, &iter);
- bch2_trans_exit(&trans);
+ bch2_trans_iter_exit(trans, &iter);
bch2_bkey_buf_exit(&sk, c);
+ per_snapshot_io_opts_exit(&snapshot_io_opts);
return ret;
}
-int bch2_move_data(struct bch_fs *c,
- enum btree_id start_btree_id, struct bpos start_pos,
- enum btree_id end_btree_id, struct bpos end_pos,
- struct bch_ratelimit *rate,
- struct bch_move_stats *stats,
- struct write_point_specifier wp,
- bool wait_on_copygc,
- move_pred_fn pred, void *arg)
+int __bch2_move_data(struct moving_context *ctxt,
+ struct bbpos start,
+ struct bbpos end,
+ move_pred_fn pred, void *arg)
{
- struct moving_context ctxt;
+ struct bch_fs *c = ctxt->trans->c;
enum btree_id id;
- int ret;
-
- bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
+ int ret = 0;
- for (id = start_btree_id;
- id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
+ for (id = start.btree;
+ id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
id++) {
- stats->btree_id = id;
-
- if (id != BTREE_ID_extents &&
- id != BTREE_ID_reflink)
- continue;
+ ctxt->stats->pos = BBPOS(id, POS_MIN);
- if (!bch2_btree_id_root(c, id)->b)
+ if (!btree_type_has_ptrs(id) ||
+ !bch2_btree_id_root(c, id)->b)
continue;
- ret = __bch2_move_data(&ctxt,
- id == start_btree_id ? start_pos : POS_MIN,
- id == end_btree_id ? end_pos : POS_MAX,
+ ret = bch2_move_data_btree(ctxt,
+ id == start.btree ? start.pos : POS_MIN,
+ id == end.btree ? end.pos : POS_MAX,
pred, arg, id);
if (ret)
break;
}
+ return ret;
+}
+
+int bch2_move_data(struct bch_fs *c,
+ struct bbpos start,
+ struct bbpos end,
+ struct bch_ratelimit *rate,
+ struct bch_move_stats *stats,
+ struct write_point_specifier wp,
+ bool wait_on_copygc,
+ move_pred_fn pred, void *arg)
+{
+ struct moving_context ctxt;
+ int ret;
+
+ bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
+ ret = __bch2_move_data(&ctxt, start, end, pred, arg);
bch2_moving_ctxt_exit(&ctxt);
return ret;
}
-int __bch2_evacuate_bucket(struct btree_trans *trans,
- struct moving_context *ctxt,
+int __bch2_evacuate_bucket(struct moving_context *ctxt,
struct move_bucket_in_flight *bucket_in_flight,
struct bpos bucket, int gen,
struct data_update_opts _data_opts)
{
- struct bch_fs *c = ctxt->c;
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
struct btree_iter iter;
struct bkey_buf sk;
struct data_update_opts data_opts;
unsigned dirty_sectors, bucket_size;
u64 fragmentation;
- u64 cur_inum = U64_MAX;
struct bpos bp_pos = POS_MIN;
int ret = 0;
goto err;
}
- while (!(ret = move_ratelimit(trans, ctxt))) {
+ while (!(ret = bch2_move_ratelimit(ctxt))) {
bch2_trans_begin(trans);
ret = bch2_get_next_backpointer(trans, bucket, gen,
if (!bp.level) {
const struct bch_extent_ptr *ptr;
- struct bkey_s_c k;
unsigned i = 0;
k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0);
bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
- ret = move_get_io_opts(trans, &io_opts, k, &cur_inum);
+ ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
if (ret) {
bch2_trans_iter_exit(trans, &iter);
continue;
i++;
}
- ret = bch2_move_extent(trans, &iter, ctxt,
- bucket_in_flight,
- io_opts, bp.btree_id, k, data_opts);
+ ret = bch2_move_extent(ctxt, bucket_in_flight,
+ &iter, k, io_opts, data_opts);
bch2_trans_iter_exit(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
if (ret == -ENOMEM) {
/* memory allocation failure, wait for some IO to finish */
- bch2_move_ctxt_wait_for_io(ctxt, trans);
+ bch2_move_ctxt_wait_for_io(ctxt);
continue;
}
if (ret)
goto err;
- if (ctxt->rate)
- bch2_ratelimit_increment(ctxt->rate, k.k->size);
if (ctxt->stats)
atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
} else {
struct write_point_specifier wp,
bool wait_on_copygc)
{
- struct btree_trans trans;
struct moving_context ctxt;
int ret;
- bch2_trans_init(&trans, c, 0, 0);
bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
- ret = __bch2_evacuate_bucket(&trans, &ctxt, NULL, bucket, gen, data_opts);
+ ret = __bch2_evacuate_bucket(&ctxt, NULL, bucket, gen, data_opts);
bch2_moving_ctxt_exit(&ctxt);
- bch2_trans_exit(&trans);
return ret;
}
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
- struct btree_trans trans;
+ struct moving_context ctxt;
+ struct btree_trans *trans;
struct btree_iter iter;
struct btree *b;
enum btree_id id;
struct data_update_opts data_opts;
int ret = 0;
- bch2_trans_init(&trans, c, 0, 0);
- progress_list_add(c, stats);
+ bch2_moving_ctxt_init(&ctxt, c, NULL, stats,
+ writepoint_ptr(&c->btree_write_point),
+ true);
+ trans = ctxt.trans;
stats->data_type = BCH_DATA_btree;
for (id = start_btree_id;
id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
id++) {
- stats->btree_id = id;
+ stats->pos = BBPOS(id, POS_MIN);
if (!bch2_btree_id_root(c, id)->b)
continue;
- bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0,
+ bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
BTREE_ITER_PREFETCH);
retry:
ret = 0;
- while (bch2_trans_begin(&trans),
+ while (bch2_trans_begin(trans),
(b = bch2_btree_iter_peek_node(&iter)) &&
!(ret = PTR_ERR_OR_ZERO(b))) {
if (kthread && kthread_should_stop())
bpos_cmp(b->key.k.p, end_pos)) > 0)
break;
- stats->pos = iter.pos;
+ stats->pos = BBPOS(iter.btree_id, iter.pos);
if (!pred(c, arg, b, &io_opts, &data_opts))
goto next;
- ret = bch2_btree_node_rewrite(&trans, &iter, b, 0) ?: ret;
+ ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret;
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
if (ret)
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
- bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_iter_exit(trans, &iter);
if (kthread && kthread_should_stop())
break;
}
- bch2_trans_exit(&trans);
-
- if (ret)
- bch_err_fn(c, ret);
-
+ bch_err_fn(c, ret);
+ bch2_moving_ctxt_exit(&ctxt);
bch2_btree_interior_updates_flush(c);
- progress_list_del(c, stats);
return ret;
}
mutex_unlock(&c->sb_lock);
}
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
ret = bch2_replicas_gc2(c) ?: ret;
ret = bch2_move_data(c,
- op.start_btree, op.start_pos,
- op.end_btree, op.end_pos,
+ (struct bbpos) { op.start_btree, op.start_pos },
+ (struct bbpos) { op.end_btree, op.end_pos },
NULL,
stats,
writepoint_hashed((unsigned long) current),
true,
rereplicate_pred, c) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
+
+ bch2_move_stats_exit(stats, c);
break;
case BCH_DATA_OP_MIGRATE:
if (op.migrate.dev >= c->sb.nr_devices)
ret = bch2_replicas_gc2(c) ?: ret;
ret = bch2_move_data(c,
- op.start_btree, op.start_pos,
- op.end_btree, op.end_pos,
+ (struct bbpos) { op.start_btree, op.start_pos },
+ (struct bbpos) { op.end_btree, op.end_pos },
NULL,
stats,
writepoint_hashed((unsigned long) current),
true,
migrate_pred, &op) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
+
+ bch2_move_stats_exit(stats, c);
break;
case BCH_DATA_OP_REWRITE_OLD_NODES:
bch2_move_stats_init(stats, "rewrite_old_nodes");
ret = bch2_scan_old_btree_nodes(c, stats);
+ bch2_move_stats_exit(stats, c);
break;
default:
ret = -EINVAL;
return ret;
}
-void bch2_data_jobs_to_text(struct printbuf *out, struct bch_fs *c)
+void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
{
- struct bch_move_stats *stats;
-
- mutex_lock(&c->data_progress_lock);
- list_for_each_entry(stats, &c->data_progress_list, list) {
- prt_printf(out, "%s: data type %s btree_id %s position: ",
- stats->name,
- bch2_data_types[stats->data_type],
- bch2_btree_ids[stats->btree_id]);
- bch2_bpos_to_text(out, stats->pos);
- prt_printf(out, "%s", "\n");
- }
- mutex_unlock(&c->data_progress_lock);
+ prt_printf(out, "%s: data type=%s pos=",
+ stats->name,
+ bch2_data_types[stats->data_type]);
+ bch2_bbpos_to_text(out, stats->pos);
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ prt_str(out, "keys moved: ");
+ prt_u64(out, atomic64_read(&stats->keys_moved));
+ prt_newline(out);
+
+ prt_str(out, "keys raced: ");
+ prt_u64(out, atomic64_read(&stats->keys_raced));
+ prt_newline(out);
+
+ prt_str(out, "bytes seen: ");
+ prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
+ prt_newline(out);
+
+ prt_str(out, "bytes moved: ");
+ prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
+ prt_newline(out);
+
+ prt_str(out, "bytes raced: ");
+ prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
+ prt_newline(out);
+
+ printbuf_indent_sub(out, 2);
}
-static void bch2_moving_ctxt_to_text(struct printbuf *out, struct moving_context *ctxt)
+static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
{
struct moving_io *io;
- prt_printf(out, "%ps:", ctxt->fn);
- prt_newline(out);
+ bch2_move_stats_to_text(out, ctxt->stats);
printbuf_indent_add(out, 2);
- prt_printf(out, "reads: %u sectors %u",
+ prt_printf(out, "reads: ios %u/%u sectors %u/%u",
atomic_read(&ctxt->read_ios),
- atomic_read(&ctxt->read_sectors));
+ c->opts.move_ios_in_flight,
+ atomic_read(&ctxt->read_sectors),
+ c->opts.move_bytes_in_flight >> 9);
prt_newline(out);
- prt_printf(out, "writes: %u sectors %u",
+ prt_printf(out, "writes: ios %u/%u sectors %u/%u",
atomic_read(&ctxt->write_ios),
- atomic_read(&ctxt->write_sectors));
+ c->opts.move_ios_in_flight,
+ atomic_read(&ctxt->write_sectors),
+ c->opts.move_bytes_in_flight >> 9);
prt_newline(out);
printbuf_indent_add(out, 2);
mutex_lock(&ctxt->lock);
- list_for_each_entry(io, &ctxt->ios, io_list) {
+ list_for_each_entry(io, &ctxt->ios, io_list)
bch2_write_op_to_text(out, &io->write.op);
- }
mutex_unlock(&ctxt->lock);
printbuf_indent_sub(out, 4);
mutex_lock(&c->moving_context_lock);
list_for_each_entry(ctxt, &c->moving_context_list, list)
- bch2_moving_ctxt_to_text(out, ctxt);
+ bch2_moving_ctxt_to_text(out, c, ctxt);
mutex_unlock(&c->moving_context_lock);
}
{
INIT_LIST_HEAD(&c->moving_context_list);
mutex_init(&c->moving_context_lock);
-
- INIT_LIST_HEAD(&c->data_progress_list);
- mutex_init(&c->data_progress_lock);
}
#ifndef _BCACHEFS_MOVE_H
#define _BCACHEFS_MOVE_H
+#include "bbpos.h"
+#include "bcachefs_ioctl.h"
#include "btree_iter.h"
#include "buckets.h"
#include "data_update.h"
struct bch_read_bio;
struct moving_context {
- struct bch_fs *c;
+ struct btree_trans *trans;
struct list_head list;
void *fn;
wait_queue_head_t wait;
};
-#define move_ctxt_wait_event(_ctxt, _trans, _cond) \
+#define move_ctxt_wait_event(_ctxt, _cond) \
do { \
bool cond_finished = false; \
- bch2_moving_ctxt_do_pending_writes(_ctxt, _trans); \
+ bch2_moving_ctxt_do_pending_writes(_ctxt); \
\
if (_cond) \
break; \
+ bch2_trans_unlock_long((_ctxt)->trans); \
__wait_event((_ctxt)->wait, \
bch2_moving_ctxt_next_pending_write(_ctxt) || \
(cond_finished = (_cond))); \
struct bch_ratelimit *, struct bch_move_stats *,
struct write_point_specifier, bool);
struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *);
-void bch2_moving_ctxt_do_pending_writes(struct moving_context *,
- struct btree_trans *);
+void bch2_moving_ctxt_do_pending_writes(struct moving_context *);
+void bch2_move_ctxt_wait_for_io(struct moving_context *);
+int bch2_move_ratelimit(struct moving_context *);
+
+/* Inodes in different snapshots may have different IO options: */
+struct snapshot_io_opts_entry {
+ u32 snapshot;
+ struct bch_io_opts io_opts;
+};
+
+struct per_snapshot_io_opts {
+ u64 cur_inum;
+ struct bch_io_opts fs_io_opts;
+ DARRAY(struct snapshot_io_opts_entry) d;
+};
+
+static inline void per_snapshot_io_opts_init(struct per_snapshot_io_opts *io_opts, struct bch_fs *c)
+{
+ memset(io_opts, 0, sizeof(*io_opts));
+ io_opts->fs_io_opts = bch2_opts_to_inode_opts(c->opts);
+}
+
+static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opts)
+{
+ darray_exit(&io_opts->d);
+}
+
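+/*
+ * Typical usage, as in bch2_move_data_btree(): per_snapshot_io_opts_init(),
+ * then bch2_move_get_io_opts() on each extent visited, then
+ * per_snapshot_io_opts_exit() once the scan finishes.
+ */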
+struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *,
+ struct per_snapshot_io_opts *, struct bkey_s_c);
+int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, struct bkey_s_c);
int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
+int bch2_move_extent(struct moving_context *,
+ struct move_bucket_in_flight *,
+ struct btree_iter *,
+ struct bkey_s_c,
+ struct bch_io_opts,
+ struct data_update_opts);
+
+int __bch2_move_data(struct moving_context *,
+ struct bbpos,
+ struct bbpos,
+ move_pred_fn, void *);
int bch2_move_data(struct bch_fs *,
- enum btree_id, struct bpos,
- enum btree_id, struct bpos,
+ struct bbpos start,
+ struct bbpos end,
struct bch_ratelimit *,
struct bch_move_stats *,
struct write_point_specifier,
bool,
move_pred_fn, void *);
-int __bch2_evacuate_bucket(struct btree_trans *,
- struct moving_context *,
+int __bch2_evacuate_bucket(struct moving_context *,
struct move_bucket_in_flight *,
struct bpos, int,
struct data_update_opts);
struct bch_move_stats *,
struct bch_ioctl_data);
-void bch2_move_stats_init(struct bch_move_stats *stats, char *name);
-void bch2_data_jobs_to_text(struct printbuf *, struct bch_fs *);
+void bch2_move_stats_to_text(struct printbuf *, struct bch_move_stats *);
+void bch2_move_stats_exit(struct bch_move_stats *, struct bch_fs *);
+void bch2_move_stats_init(struct bch_move_stats *, char *);
+
void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *);
void bch2_fs_move_init(struct bch_fs *);
#ifndef _BCACHEFS_MOVE_TYPES_H
#define _BCACHEFS_MOVE_TYPES_H
+#include "bbpos_types.h"
+
struct bch_move_stats {
enum bch_data_type data_type;
- enum btree_id btree_id;
- struct bpos pos;
- struct list_head list;
+ struct bbpos pos;
char name[32];
atomic64_t keys_moved;
atomic64_t keys_raced;
- atomic64_t sectors_moved;
atomic64_t sectors_seen;
+ atomic64_t sectors_moved;
atomic64_t sectors_raced;
};
#include "btree_write_buffer.h"
#include "buckets.h"
#include "clock.h"
-#include "disk_groups.h"
#include "errcode.h"
#include "error.h"
-#include "extents.h"
-#include "eytzinger.h"
-#include "io.h"
-#include "keylist.h"
#include "lru.h"
#include "move.h"
#include "movinggc.h"
-#include "super-io.h"
#include "trace.h"
-#include <linux/bsearch.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/math64.h>
#include <linux/sched/task.h>
-#include <linux/sort.h>
#include <linux/wait.h>
struct buckets_in_flight {
return ret;
}
-static void move_buckets_wait(struct btree_trans *trans,
- struct moving_context *ctxt,
+static void move_buckets_wait(struct moving_context *ctxt,
struct buckets_in_flight *list,
bool flush)
{
while ((i = list->first)) {
if (flush)
- move_ctxt_wait_event(ctxt, trans, !atomic_read(&i->count));
+ move_ctxt_wait_event(ctxt, !atomic_read(&i->count));
if (atomic_read(&i->count))
break;
kfree(i);
}
- bch2_trans_unlock(trans);
+ bch2_trans_unlock_long(ctxt->trans);
}
static bool bucket_in_flight(struct buckets_in_flight *list,
typedef DARRAY(struct move_bucket) move_buckets;
-static int bch2_copygc_get_buckets(struct btree_trans *trans,
- struct moving_context *ctxt,
+static int bch2_copygc_get_buckets(struct moving_context *ctxt,
struct buckets_in_flight *buckets_in_flight,
move_buckets *buckets)
{
+ struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
- size_t nr_to_get = max(16UL, buckets_in_flight->nr / 4);
+ size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4);
size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0;
int ret;
- move_buckets_wait(trans, ctxt, buckets_in_flight, false);
+ move_buckets_wait(ctxt, buckets_in_flight, false);
ret = bch2_btree_write_buffer_flush(trans);
if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()",
lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX),
0, k, ({
struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) };
- int ret = 0;
+ int ret2 = 0;
saw++;
else if (bucket_in_flight(buckets_in_flight, b.k))
in_flight++;
else {
- ret = darray_push(buckets, b) ?: buckets->nr >= nr_to_get;
- if (ret >= 0)
+ ret2 = darray_push(buckets, b) ?: buckets->nr >= nr_to_get;
+ if (ret2 >= 0)
sectors += b.sectors;
}
- ret;
+ ret2;
}));
pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i",
}
noinline
-static int bch2_copygc(struct btree_trans *trans,
- struct moving_context *ctxt,
- struct buckets_in_flight *buckets_in_flight)
+static int bch2_copygc(struct moving_context *ctxt,
+ struct buckets_in_flight *buckets_in_flight,
+ bool *did_work)
{
+ struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
struct data_update_opts data_opts = {
.btree_insert_flags = BCH_WATERMARK_copygc,
u64 moved = atomic64_read(&ctxt->stats->sectors_moved);
int ret = 0;
- ret = bch2_copygc_get_buckets(trans, ctxt, buckets_in_flight, &buckets);
+ ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets);
if (ret)
goto err;
f = move_bucket_in_flight_add(buckets_in_flight, *i);
ret = PTR_ERR_OR_ZERO(f);
- if (ret == -EEXIST) /* rare race: copygc_get_buckets returned same bucket more than once */
+ if (ret == -EEXIST) { /* rare race: copygc_get_buckets returned same bucket more than once */
+ ret = 0;
continue;
+ }
if (ret == -ENOMEM) { /* flush IO, continue later */
ret = 0;
break;
}
- ret = __bch2_evacuate_bucket(trans, ctxt, f, f->bucket.k.bucket,
+ ret = __bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket,
f->bucket.k.gen, data_opts);
if (ret)
goto err;
+
+ *did_work = true;
}
err:
darray_exit(&buckets);
ret = 0;
if (ret < 0 && !bch2_err_matches(ret, EROFS))
- bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret));
+ bch_err_msg(c, ret, "from bch2_move_data()");
moved = atomic64_read(&ctxt->stats->sectors_moved) - moved;
trace_and_count(c, copygc, c, moved, 0, 0, 0);
static int bch2_copygc_thread(void *arg)
{
struct bch_fs *c = arg;
- struct btree_trans trans;
struct moving_context ctxt;
struct bch_move_stats move_stats;
struct io_clock *clock = &c->io_clock[WRITE];
- struct buckets_in_flight move_buckets;
+ struct buckets_in_flight *buckets;
u64 last, wait;
int ret = 0;
- memset(&move_buckets, 0, sizeof(move_buckets));
-
- ret = rhashtable_init(&move_buckets.table, &bch_move_bucket_params);
+ buckets = kzalloc(sizeof(struct buckets_in_flight), GFP_KERNEL);
+ if (!buckets)
+ return -ENOMEM;
+ ret = rhashtable_init(&buckets->table, &bch_move_bucket_params);
if (ret) {
- bch_err(c, "error allocating copygc buckets in flight: %s",
- bch2_err_str(ret));
+ kfree(buckets);
+ bch_err_msg(c, ret, "allocating copygc buckets in flight");
return ret;
}
set_freezable();
- bch2_trans_init(&trans, c, 0, 0);
bch2_move_stats_init(&move_stats, "copygc");
bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats,
false);
while (!ret && !kthread_should_stop()) {
- bch2_trans_unlock(&trans);
+ bool did_work = false;
+
+ bch2_trans_unlock_long(ctxt.trans);
cond_resched();
if (!c->copy_gc_enabled) {
- move_buckets_wait(&trans, &ctxt, &move_buckets, true);
+ move_buckets_wait(&ctxt, buckets, true);
kthread_wait_freezable(c->copy_gc_enabled);
}
if (unlikely(freezing(current))) {
- move_buckets_wait(&trans, &ctxt, &move_buckets, true);
+ move_buckets_wait(&ctxt, buckets, true);
__refrigerator(false);
continue;
}
if (wait > clock->max_slop) {
c->copygc_wait_at = last;
c->copygc_wait = last + wait;
- move_buckets_wait(&trans, &ctxt, &move_buckets, true);
+ move_buckets_wait(&ctxt, buckets, true);
trace_and_count(c, copygc_wait, c, wait, last + wait);
bch2_kthread_io_clock_wait(clock, last + wait,
MAX_SCHEDULE_TIMEOUT);
c->copygc_wait = 0;
c->copygc_running = true;
- ret = bch2_copygc(&trans, &ctxt, &move_buckets);
+ ret = bch2_copygc(&ctxt, buckets, &did_work);
c->copygc_running = false;
wake_up(&c->copygc_running_wq);
+
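+		/*
+		 * No work found and no wait scheduled: sleep until roughly
+		 * 1/64th of the smallest rw member's capacity has been written
+		 * (128MB if there are no rw members), instead of busy looping:
+		 */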
+ if (!wait && !did_work) {
+ u64 min_member_capacity = bch2_min_rw_member_capacity(c);
+
+ if (min_member_capacity == U64_MAX)
+ min_member_capacity = 128 * 2048;
+
+ bch2_trans_unlock_long(ctxt.trans);
+ bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6),
+ MAX_SCHEDULE_TIMEOUT);
+ }
}
- move_buckets_wait(&trans, &ctxt, &move_buckets, true);
- rhashtable_destroy(&move_buckets.table);
- bch2_trans_exit(&trans);
+ move_buckets_wait(&ctxt, buckets, true);
+
+ rhashtable_destroy(&buckets->table);
+ kfree(buckets);
bch2_moving_ctxt_exit(&ctxt);
+ bch2_move_stats_exit(&move_stats, c);
return 0;
}
t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name);
ret = PTR_ERR_OR_ZERO(t);
if (ret) {
- bch_err(c, "error creating copygc thread: %s", bch2_err_str(ret));
+ bch_err_msg(c, ret, "creating copygc thread");
return ret;
}
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "bkey_methods.h"
#include "nocow_locking.h"
#include "util.h"
for (i = 0; i < ARRAY_SIZE(l->b); i++)
if (l->b[i] == dev_bucket) {
- BUG_ON(sign(atomic_read(&l->l[i])) != lock_val);
+ int v = atomic_sub_return(lock_val, &l->l[i]);
- if (!atomic_sub_return(lock_val, &l->l[i]))
+ BUG_ON(v && sign(v) != lock_val);
+ if (!v)
closure_wake_up(&l->wait);
return;
}
if (lock_val > 0 ? v < 0 : v > 0)
goto fail;
take_lock:
+ v = atomic_read(&l->l[i]);
+ /* Overflow? */
+ if (v && sign(v + lock_val) != sign(v))
+ goto fail;
+
atomic_add(lock_val, &l->l[i]);
spin_unlock(&l->lock);
return true;
}
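+/*
+ * Lock counts are signed: negative values are "copy" locks, positive are
+ * "update" locks (see bch2_nocow_locks_to_text() below), and a bucket can't
+ * be locked in both directions at once, as enforced above.
+ */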
void bch2_nocow_locks_to_text(struct printbuf *out, struct bucket_nocow_lock_table *t)
{
unsigned i, nr_zero = 0;
struct nocow_lock_bucket *l;
prt_printf(out, "(%u empty entries)\n", nr_zero);
nr_zero = 0;
- for (i = 0; i < ARRAY_SIZE(l->l); i++)
- if (atomic_read(&l->l[i]))
- prt_printf(out, "%llu: %i ", l->b[i], atomic_read(&l->l[i]));
+ for (i = 0; i < ARRAY_SIZE(l->l); i++) {
+ int v = atomic_read(&l->l[i]);
+ if (v) {
+ bch2_bpos_to_text(out, u64_to_bucket(l->b[i]));
+ prt_printf(out, ": %s %u ", v < 0 ? "copy" : "update", abs(v));
+ }
+ }
prt_newline(out);
}
prt_printf(out, "(%u empty entries)\n", nr_zero);
}
+void bch2_fs_nocow_locking_exit(struct bch_fs *c)
+{
+ struct bucket_nocow_lock_table *t = &c->nocow_locks;
+
+ for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++)
+ for (unsigned j = 0; j < ARRAY_SIZE(l->l); j++)
+ BUG_ON(atomic_read(&l->l[j]));
+}
+
int bch2_fs_nocow_locking_init(struct bch_fs *c)
{
- unsigned i;
+ struct bucket_nocow_lock_table *t = &c->nocow_locks;
- for (i = 0; i < ARRAY_SIZE(c->nocow_locks.l); i++)
- spin_lock_init(&c->nocow_locks.l[i].lock);
+ for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++)
+ spin_lock_init(&l->lock);
return 0;
}
void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *);
+void bch2_fs_nocow_locking_exit(struct bch_fs *);
int bch2_fs_nocow_locking_init(struct bch_fs *);
#endif /* _BCACHEFS_NOCOW_LOCKING_H */
NULL
};
-const char * const bch2_btree_ids[] = {
+const char * const __bch2_btree_ids[] = {
BCH_BTREE_IDS()
- "interior btree node",
NULL
};
if (err)
prt_printf(err, "%s: too small (min %llu)",
opt->attr.name, opt->min);
- return -ERANGE;
+ return -BCH_ERR_ERANGE_option_too_small;
}
if (opt->max && v >= opt->max) {
if (err)
prt_printf(err, "%s: too big (max %llu)",
opt->attr.name, opt->max);
- return -ERANGE;
+ return -BCH_ERR_ERANGE_option_too_big;
}
if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) {
return -EINVAL;
}
+ if (opt->fn.validate)
+ return opt->fn.validate(v, err);
+
return 0;
}
val = "0";
}
+ /* Unknown options are ignored: */
if (id < 0)
- goto bad_opt;
+ continue;
if (!(bch2_opt_table[id].flags & OPT_MOUNT))
goto bad_opt;
extern const char * const bch2_version_upgrade_opts[];
extern const char * const bch2_sb_features[];
extern const char * const bch2_sb_compat[];
-extern const char * const bch2_btree_ids[];
+extern const char * const __bch2_btree_ids[];
extern const char * const bch2_csum_types[];
extern const char * const bch2_csum_opts[];
extern const char * const bch2_compression_types[];
struct bch_opt_fn {
int (*parse)(struct bch_fs *, const char *, u64 *, struct printbuf *);
void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
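+	/* optional: extra per-option validation, called after the generic min/max checks: */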
+ int (*validate)(u64, struct printbuf *);
};
/**
#undef x
};
-static const struct bch_opts bch2_opts_default = {
+static const __maybe_unused struct bch_opts bch2_opts_default = {
#define x(_name, _bits, _mode, _type, _sb_opt, _default, ...) \
._name##_defined = true, \
._name = _default, \
}
/**
- * printbuf_str - returns printbuf's buf as a C string, guaranteed to be null
- * terminated
+ * bch2_printbuf_str() - returns printbuf's buf as a C string, guaranteed to be
+ * null terminated
+ * @buf: printbuf to terminate
+ * Returns: Printbuf contents, as a nul terminated C string
*/
const char *bch2_printbuf_str(const struct printbuf *buf)
{
}
/**
- * printbuf_exit - exit a printbuf, freeing memory it owns and poisoning it
+ * bch2_printbuf_exit() - exit a printbuf, freeing memory it owns and poisoning it
* against accidental use.
+ * @buf: printbuf to exit
*/
void bch2_printbuf_exit(struct printbuf *buf)
{
}
/*
- * printbuf_tabstop_set - add a tabstop, n spaces from the previous tabstop
+ * bch2_printbuf_tabstop_set() - add a tabstop, n spaces from the previous tabstop
*
* @buf: printbuf to control
 * @spaces: number of spaces from previous tabstop
}
/**
- * printbuf_indent_add - add to the current indent level
+ * bch2_printbuf_indent_add() - add to the current indent level
*
* @buf: printbuf to control
* @spaces: number of spaces to add to the current indent level
}
/**
- * printbuf_indent_sub - subtract from the current indent level
+ * bch2_printbuf_indent_sub() - subtract from the current indent level
*
* @buf: printbuf to control
* @spaces: number of spaces to subtract from the current indent level
}
/**
- * prt_tab - Advance printbuf to the next tabstop
- *
- * @buf: printbuf to control
+ * bch2_prt_tab() - Advance printbuf to the next tabstop
+ * @out: printbuf to control
*
* Advance output to the next tabstop by printing spaces.
*/
}
/**
- * prt_tab_rjust - Advance printbuf to the next tabstop, right justifying
+ * bch2_prt_tab_rjust() - Advance printbuf to the next tabstop, right justifying
* previous output
*
* @buf: printbuf to control
}
/**
- * prt_bytes_indented - Print an array of chars, handling embedded control characters
+ * bch2_prt_bytes_indented() - Print an array of chars, handling embedded control characters
*
- * @out: printbuf to output to
- * @str: string to print
- * @count: number of bytes to print
+ * @out: output printbuf
+ * @str: string to print
+ * @count: number of bytes to print
*
 * The following control characters are handled as follows:
* \n: prt_newline newline that obeys current indent level
}
/**
- * prt_human_readable_u64 - Print out a u64 in human readable units
+ * bch2_prt_human_readable_u64() - Print out a u64 in human readable units
+ * @out: output printbuf
+ * @v: integer to print
*
- * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units
+ * Units of 2^10 (default) or 10^3 are controlled via @out->si_units
*/
-void bch2_prt_human_readable_u64(struct printbuf *buf, u64 v)
+void bch2_prt_human_readable_u64(struct printbuf *out, u64 v)
{
- bch2_printbuf_make_room(buf, 10);
- buf->pos += string_get_size(v, 1, !buf->si_units,
- buf->buf + buf->pos,
- printbuf_remaining_size(buf));
+ bch2_printbuf_make_room(out, 10);
+ out->pos += string_get_size(v, 1, !out->si_units,
+ out->buf + out->pos,
+ printbuf_remaining_size(out));
}
/**
- * prt_human_readable_s64 - Print out a s64 in human readable units
+ * bch2_prt_human_readable_s64() - Print out a s64 in human readable units
+ * @out: output printbuf
+ * @v: integer to print
*
- * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units
+ * Units of 2^10 (default) or 10^3 are controlled via @out->si_units
*/
-void bch2_prt_human_readable_s64(struct printbuf *buf, s64 v)
+void bch2_prt_human_readable_s64(struct printbuf *out, s64 v)
{
if (v < 0)
- prt_char(buf, '-');
- bch2_prt_human_readable_u64(buf, abs(v));
+ prt_char(out, '-');
+ bch2_prt_human_readable_u64(out, abs(v));
}
/**
- * prt_units_u64 - Print out a u64 according to printbuf unit options
+ * bch2_prt_units_u64() - Print out a u64 according to printbuf unit options
+ * @out: output printbuf
+ * @v: integer to print
*
 * Units are either raw (default), or human readable units (controlled via
* @buf->human_readable_units)
}
/**
- * prt_units_s64 - Print out a s64 according to printbuf unit options
+ * bch2_prt_units_s64() - Print out a s64 according to printbuf unit options
+ * @out: output printbuf
+ * @v: integer to print
*
 * Units are either raw (default), or human readable units (controlled via
* @buf->human_readable_units)
while (list[nr])
nr++;
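+	/* flags is a u64: use __ffs64()/BIT_ULL() so bits >= 32 work on all architectures */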
- while (flags && (bit = __ffs(flags)) < nr) {
+ while (flags && (bit = __ffs64(flags)) < nr) {
if (!first)
bch2_prt_printf(out, ",");
first = false;
bch2_prt_printf(out, "%s", list[bit]);
- flags ^= 1 << bit;
+ flags ^= BIT_ULL(bit);
}
}
#include "error.h"
#include "inode.h"
#include "quota.h"
-#include "subvolume.h"
+#include "snapshot.h"
#include "super-io.h"
static const char * const bch2_quota_types[] = {
.to_text = bch2_sb_quota_to_text,
};
-int bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_quota_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
- if (k.k->p.inode >= QTYP_NR) {
- prt_printf(err, "invalid quota type (%llu >= %u)",
- k.k->p.inode, QTYP_NR);
- return -BCH_ERR_invalid_bkey;
- }
+ int ret = 0;
- return 0;
+ bkey_fsck_err_on(k.k->p.inode >= QTYP_NR, c, err,
+ quota_type_invalid,
+ "invalid quota type (%llu >= %u)",
+ k.k->p.inode, QTYP_NR);
+fsck_err:
+ return ret;
}
void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c,
static struct bch_sb_field_quota *bch2_sb_get_or_create_quota(struct bch_sb_handle *sb)
{
- struct bch_sb_field_quota *sb_quota = bch2_sb_get_quota(sb->sb);
+ struct bch_sb_field_quota *sb_quota = bch2_sb_field_get(sb->sb, quota);
if (sb_quota)
return sb_quota;
- sb_quota = bch2_sb_resize_quota(sb, sizeof(*sb_quota) / sizeof(u64));
+ sb_quota = bch2_sb_field_resize(sb, quota, sizeof(*sb_quota) / sizeof(u64));
if (sb_quota) {
unsigned qtype, qc;
struct bch_sb_field_quota *sb_quota;
unsigned i, j;
- sb_quota = bch2_sb_get_quota(c->disk_sb.sb);
+ sb_quota = bch2_sb_field_get(c->disk_sb.sb, quota);
if (!sb_quota)
return;
if (!s_t.master_subvol)
goto advance;
- ret = bch2_inode_find_by_inum_trans(trans,
+ ret = bch2_inode_find_by_inum_nowarn_trans(trans,
(subvol_inum) {
le32_to_cpu(s_t.master_subvol),
k.k->p.offset,
int bch2_fs_quota_read(struct bch_fs *c)
{
struct bch_sb_field_quota *sb_quota;
- struct btree_trans trans;
+ struct btree_trans *trans;
struct btree_iter iter;
struct bkey_s_c k;
int ret;
bch2_sb_quota_read(c);
mutex_unlock(&c->sb_lock);
- bch2_trans_init(&trans, c, 0, 0);
+ trans = bch2_trans_get(c);
- ret = for_each_btree_key2(&trans, iter, BTREE_ID_quotas,
+ ret = for_each_btree_key2(trans, iter, BTREE_ID_quotas,
POS_MIN, BTREE_ITER_PREFETCH, k,
__bch2_quota_set(c, k, NULL)) ?:
- for_each_btree_key2(&trans, iter, BTREE_ID_inodes,
+ for_each_btree_key2(trans, iter, BTREE_ID_inodes,
POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
- bch2_fs_quota_read_inode(&trans, &iter, k));
+ bch2_fs_quota_read_inode(trans, &iter, k));
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
if (ret)
bch_err_fn(c, ret);
{
struct bch_fs *c = sb->s_fs_info;
struct bch_sb_field_quota *sb_quota;
- struct bch_memquota_type *q;
int ret = 0;
if (0) {
~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS))
return -EINVAL;
- q = &c->quotas[type];
-
mutex_lock(&c->sb_lock);
sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
if (!sb_quota) {
new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid));
ret = bch2_trans_do(c, NULL, NULL, 0,
- bch2_set_quota_trans(&trans, &new_quota, qdq)) ?:
+ bch2_set_quota_trans(trans, &new_quota, qdq)) ?:
__bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq);
return bch2_err_class(ret);
enum bkey_invalid_flags;
extern const struct bch_sb_field_ops bch_sb_field_ops_quota;
-int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_quota_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_iter.h"
+#include "btree_update.h"
+#include "btree_write_buffer.h"
#include "buckets.h"
#include "clock.h"
#include "compress.h"
#include "disk_groups.h"
#include "errcode.h"
-#include "extents.h"
-#include "io.h"
+#include "error.h"
+#include "inode.h"
#include "move.h"
#include "rebalance.h"
+#include "subvolume.h"
#include "super-io.h"
#include "trace.h"
#include <linux/kthread.h>
#include <linux/sched/cputime.h>
-/*
- * Check if an extent should be moved:
- * returns -1 if it should not be moved, or
- * device of pointer that should be moved, if known, or INT_MAX if unknown
- */
-static bool rebalance_pred(struct bch_fs *c, void *arg,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
+#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1)
+
+static const char * const bch2_rebalance_state_strs[] = {
+#define x(t) #t,
+ BCH_REBALANCE_STATES()
+ NULL
+#undef x
+};
+
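+/*
+ * A scan request is a cookie key at a sentinel offset in the rebalance_work
+ * btree: bumping the cookie requests a scan of @inum (0 for the whole
+ * filesystem), and the entry is deleted only if the cookie is unchanged when
+ * the scan completes, i.e. no new request arrived in the meantime.
+ */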
+static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum)
{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- unsigned i;
-
- data_opts->rewrite_ptrs = 0;
- data_opts->target = io_opts->background_target;
- data_opts->extra_replicas = 0;
- data_opts->btree_insert_flags = 0;
-
- if (io_opts->background_compression &&
- !bch2_bkey_is_incompressible(k)) {
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
-
- i = 0;
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- if (!p.ptr.cached &&
- p.crc.compression_type !=
- bch2_compression_opt_to_type(io_opts->background_compression))
- data_opts->rewrite_ptrs |= 1U << i;
- i++;
- }
- }
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i_cookie *cookie;
+ u64 v;
+ int ret;
- if (io_opts->background_target) {
- const struct bch_extent_ptr *ptr;
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
+ SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ v = k.k->type == KEY_TYPE_cookie
+ ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
+ : 0;
+
+ cookie = bch2_trans_kmalloc(trans, sizeof(*cookie));
+ ret = PTR_ERR_OR_ZERO(cookie);
+ if (ret)
+ goto err;
+
+ bkey_cookie_init(&cookie->k_i);
+ cookie->k.p = iter.pos;
+ cookie->v.cookie = cpu_to_le64(v + 1);
+
+ ret = bch2_trans_update(trans, &iter, &cookie->k_i, 0);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
- i = 0;
- bkey_for_each_ptr(ptrs, ptr) {
- if (!ptr->cached &&
- !bch2_dev_in_target(c, ptr->dev, io_opts->background_target) &&
- bch2_target_accepts_data(c, BCH_DATA_user, io_opts->background_target))
- data_opts->rewrite_ptrs |= 1U << i;
- i++;
- }
- }
+int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
+{
+ int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
+ __bch2_set_rebalance_needs_scan(trans, inum));
+ rebalance_wakeup(c);
+ return ret;
+}
- return data_opts->rewrite_ptrs != 0;
+int bch2_set_fs_needs_rebalance(struct bch_fs *c)
+{
+ return bch2_set_rebalance_needs_scan(c, 0);
}
-void bch2_rebalance_add_key(struct bch_fs *c,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts)
+static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie)
{
- struct data_update_opts update_opts = { 0 };
- struct bkey_ptrs_c ptrs;
- const struct bch_extent_ptr *ptr;
- unsigned i;
-
- if (!rebalance_pred(c, NULL, k, io_opts, &update_opts))
- return;
-
- i = 0;
- ptrs = bch2_bkey_ptrs_c(k);
- bkey_for_each_ptr(ptrs, ptr) {
- if ((1U << i) && update_opts.rewrite_ptrs)
- if (atomic64_add_return(k.k->size,
- &bch_dev_bkey_exists(c, ptr->dev)->rebalance_work) ==
- k.k->size)
- rebalance_wakeup(c);
- i++;
- }
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 v;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
+ SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ v = k.k->type == KEY_TYPE_cookie
+ ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
+ : 0;
+
+ if (v == cookie)
+ ret = bch2_btree_delete_at(trans, &iter, 0);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
}
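/*
 * Illustrative sketch (not bcachefs code) of why the scan trigger above
 * is a counter compared on clear, rather than a plain flag: if a new
 * scan request arrives while a scan is running, the bump to v+1 means
 * the clear issued by the finished scan (which saw v) is a no-op, so
 * the newer request is not lost. All names here are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t scan_cookie;	/* stands in for the KEY_TYPE_cookie value */

static uint64_t request_scan(void)
{
	return ++scan_cookie;	/* value the scan will run against */
}

static void finish_scan(uint64_t cookie_seen)
{
	if (scan_cookie == cookie_seen)
		scan_cookie = 0;	/* stands in for bch2_btree_delete_at() */
}

int main(void)
{
	uint64_t v = request_scan();	/* scan starts against v == 1 */

	request_scan();			/* concurrent request bumps to 2 */
	finish_scan(v);			/* no-op: 2 != 1, rescan still pending */
	printf("pending cookie: %llu\n", (unsigned long long) scan_cookie);
	return 0;
}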
-void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
+static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans,
+ struct btree_iter *work_iter)
{
- if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) ==
- sectors)
- rebalance_wakeup(c);
+ return !kthread_should_stop()
+ ? bch2_btree_iter_peek(work_iter)
+ : bkey_s_c_null;
}
-struct rebalance_work {
- int dev_most_full_idx;
- unsigned dev_most_full_percent;
- u64 dev_most_full_work;
- u64 dev_most_full_capacity;
- u64 total_work;
-};
+static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0);
+ int ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ extent_entry_drop(bkey_i_to_s(n),
+ (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n)));
+ return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+}
-static void rebalance_work_accumulate(struct rebalance_work *w,
- u64 dev_work, u64 unknown_dev, u64 capacity, int idx)
+static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
+ struct bpos work_pos,
+ struct btree_iter *extent_iter,
+ struct data_update_opts *data_opts)
{
- unsigned percent_full;
- u64 work = dev_work + unknown_dev;
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k;
+
+ bch2_trans_iter_exit(trans, extent_iter);
+ bch2_trans_iter_init(trans, extent_iter,
+ work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink,
+ work_pos,
+ BTREE_ITER_ALL_SNAPSHOTS);
+ k = bch2_btree_iter_peek_slot(extent_iter);
+ if (bkey_err(k))
+ return k;
+
+ const struct bch_extent_rebalance *r = k.k ? bch2_bkey_rebalance_opts(k) : NULL;
+ if (!r) {
+ /* raced due to btree write buffer, nothing to do */
+ return bkey_s_c_null;
+ }
- if (work < dev_work || work < unknown_dev)
- work = U64_MAX;
- work = min(work, capacity);
+ memset(data_opts, 0, sizeof(*data_opts));
- percent_full = div64_u64(work * 100, capacity);
+ data_opts->rewrite_ptrs =
+ bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression);
+ data_opts->target = r->target;
- if (percent_full >= w->dev_most_full_percent) {
- w->dev_most_full_idx = idx;
- w->dev_most_full_percent = percent_full;
- w->dev_most_full_work = work;
- w->dev_most_full_capacity = capacity;
+ if (!data_opts->rewrite_ptrs) {
+ /*
+		 * Is the device we would want to write to offline? Did the
+		 * devices in the target change?
+ *
+ * We'll now need a full scan before this extent is picked up
+ * again:
+ */
+ int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k);
+ if (ret)
+ return bkey_s_c_err(ret);
+ return bkey_s_c_null;
}
- if (w->total_work + dev_work >= w->total_work &&
- w->total_work + dev_work >= dev_work)
- w->total_work += dev_work;
+ return k;
}
-static struct rebalance_work rebalance_work(struct bch_fs *c)
+noinline_for_stack
+static int do_rebalance_extent(struct moving_context *ctxt,
+ struct bpos work_pos,
+ struct btree_iter *extent_iter)
{
- struct bch_dev *ca;
- struct rebalance_work ret = { .dev_most_full_idx = -1 };
- u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev);
- unsigned i;
-
- for_each_online_member(ca, c, i)
- rebalance_work_accumulate(&ret,
- atomic64_read(&ca->rebalance_work),
- unknown_dev,
- bucket_to_sector(ca, ca->mi.nbuckets -
- ca->mi.first_bucket),
- i);
-
- rebalance_work_accumulate(&ret,
- unknown_dev, 0, c->capacity, -1);
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
+ struct bch_fs_rebalance *r = &trans->c->rebalance;
+ struct data_update_opts data_opts;
+ struct bch_io_opts io_opts;
+ struct bkey_s_c k;
+ struct bkey_buf sk;
+ int ret;
+
+ ctxt->stats = &r->work_stats;
+ r->state = BCH_REBALANCE_working;
+
+ bch2_bkey_buf_init(&sk);
+
+ ret = bkey_err(k = next_rebalance_extent(trans, work_pos,
+ extent_iter, &data_opts));
+ if (ret || !k.k)
+ goto out;
+
+ ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
+ if (ret)
+ goto out;
+ atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
+
+ /*
+ * The iterator gets unlocked by __bch2_read_extent - need to
+ * save a copy of @k elsewhere:
+ */
+ bch2_bkey_buf_reassemble(&sk, c, k);
+ k = bkey_i_to_s_c(sk.k);
+
+ ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts);
+ if (ret) {
+ if (bch2_err_matches(ret, ENOMEM)) {
+ /* memory allocation failure, wait for some IO to finish */
+ bch2_move_ctxt_wait_for_io(ctxt);
+ ret = -BCH_ERR_transaction_restart_nested;
+ }
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto out;
+
+ /* skip it and continue, XXX signal failure */
+ ret = 0;
+ }
+out:
+ bch2_bkey_buf_exit(&sk, c);
return ret;
}
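/*
 * Standalone sketch of the back-pressure pattern in do_rebalance_extent()
 * above: on allocation failure, drain in-flight work (which is what is
 * holding the memory), then retry from the top instead of failing the
 * move. try_move() failing twice is an arbitrary toy stand-in.
 */
#include <errno.h>
#include <stdio.h>

static int attempts;

static int try_move(void)		/* toy: -ENOMEM twice, then success */
{
	return ++attempts <= 2 ? -ENOMEM : 0;
}

static void drain_io(void)		/* stands in for bch2_move_ctxt_wait_for_io() */
{
	printf("waiting for in-flight IO\n");
}

static int move_with_backpressure(void)
{
	for (;;) {
		int ret = try_move();

		if (ret != -ENOMEM)
			return ret;	/* success, or a real error */
		drain_io();		/* memory is held by in-flight moves */
	}
}

int main(void)
{
	return move_with_backpressure();
}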
-static void rebalance_work_reset(struct bch_fs *c)
+static bool rebalance_pred(struct bch_fs *c, void *arg,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
- struct bch_dev *ca;
- unsigned i;
+ unsigned target, compression;
- for_each_online_member(ca, c, i)
- atomic64_set(&ca->rebalance_work, 0);
+ if (k.k->p.inode) {
+ target = io_opts->background_target;
+ compression = io_opts->background_compression ?: io_opts->compression;
+ } else {
+ const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
- atomic64_set(&c->rebalance.work_unknown_dev, 0);
+ target = r ? r->target : io_opts->background_target;
+ compression = r ? r->compression :
+ (io_opts->background_compression ?: io_opts->compression);
+ }
+
+ data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression);
+ data_opts->target = target;
+ return data_opts->rewrite_ptrs != 0;
}
-static unsigned long curr_cputime(void)
+static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
{
- u64 utime, stime;
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs_rebalance *r = &trans->c->rebalance;
+ int ret;
+
+ bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
+ ctxt->stats = &r->scan_stats;
+
+ if (!inum) {
+ r->scan_start = BBPOS_MIN;
+ r->scan_end = BBPOS_MAX;
+ } else {
+ r->scan_start = BBPOS(BTREE_ID_extents, POS(inum, 0));
+ r->scan_end = BBPOS(BTREE_ID_extents, POS(inum, U64_MAX));
+ }
+
+ r->state = BCH_REBALANCE_scanning;
+
+ ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?:
+ commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_clear_rebalance_needs_scan(trans, inum, cookie));
- task_cputime_adjusted(current, &utime, &stime);
- return nsecs_to_jiffies(utime + stime);
+ bch2_move_stats_exit(&r->scan_stats, trans->c);
+ return ret;
}
-static int bch2_rebalance_thread(void *arg)
+static void rebalance_wait(struct bch_fs *c)
{
- struct bch_fs *c = arg;
struct bch_fs_rebalance *r = &c->rebalance;
struct io_clock *clock = &c->io_clock[WRITE];
- struct rebalance_work w, p;
- struct bch_move_stats move_stats;
- unsigned long start, prev_start;
- unsigned long prev_run_time, prev_run_cputime;
- unsigned long cputime, prev_cputime;
- u64 io_start;
- long throttle;
+ u64 now = atomic64_read(&clock->now);
+ u64 min_member_capacity = bch2_min_rw_member_capacity(c);
- set_freezable();
+ if (min_member_capacity == U64_MAX)
+ min_member_capacity = 128 * 2048;
+
+ r->wait_iotime_end = now + (min_member_capacity >> 6);
- io_start = atomic64_read(&clock->now);
- p = rebalance_work(c);
- prev_start = jiffies;
- prev_cputime = curr_cputime();
+ if (r->state != BCH_REBALANCE_waiting) {
+ r->wait_iotime_start = now;
+ r->wait_wallclock_start = ktime_get_real_ns();
+ r->state = BCH_REBALANCE_waiting;
+ }
- bch2_move_stats_init(&move_stats, "rebalance");
- while (!kthread_wait_freezable(r->enabled)) {
- cond_resched();
+ bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT);
+}
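/*
 * Worked example of the wait computation above (assuming 512-byte
 * sectors, as elsewhere in bcachefs): with a smallest rw member of
 * 1 TiB, min_member_capacity = 2^31 sectors, so rebalance sleeps until
 * (2^31 >> 6) = 2^25 sectors = 16 GiB have been written on the IO
 * clock. The fallback of 128 * 2048 sectors = 128 MiB gives a
 * (128 MiB >> 6) = 2 MiB wait when no capacity is known.
 */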
- start = jiffies;
- cputime = curr_cputime();
+static int do_rebalance(struct moving_context *ctxt)
+{
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
+ struct bch_fs_rebalance *r = &c->rebalance;
+ struct btree_iter rebalance_work_iter, extent_iter = { NULL };
+ struct bkey_s_c k;
+ int ret = 0;
- prev_run_time = start - prev_start;
- prev_run_cputime = cputime - prev_cputime;
+ bch2_move_stats_init(&r->work_stats, "rebalance_work");
+ bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
- w = rebalance_work(c);
- BUG_ON(!w.dev_most_full_capacity);
+ bch2_trans_iter_init(trans, &rebalance_work_iter,
+ BTREE_ID_rebalance_work, POS_MIN,
+ BTREE_ITER_ALL_SNAPSHOTS);
- if (!w.total_work) {
- r->state = REBALANCE_WAITING;
- kthread_wait_freezable(rebalance_work(c).total_work);
+ while (!bch2_move_ratelimit(ctxt) &&
+ !kthread_wait_freezable(r->enabled)) {
+ bch2_trans_begin(trans);
+
+ ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter));
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
- }
+ if (ret || !k.k)
+ break;
- /*
- * If there isn't much work to do, throttle cpu usage:
- */
- throttle = prev_run_cputime * 100 /
- max(1U, w.dev_most_full_percent) -
- prev_run_time;
-
- if (w.dev_most_full_percent < 20 && throttle > 0) {
- r->throttled_until_iotime = io_start +
- div_u64(w.dev_most_full_capacity *
- (20 - w.dev_most_full_percent),
- 50);
-
- if (atomic64_read(&clock->now) + clock->max_slop <
- r->throttled_until_iotime) {
- r->throttled_until_cputime = start + throttle;
- r->state = REBALANCE_THROTTLED;
-
- bch2_kthread_io_clock_wait(clock,
- r->throttled_until_iotime,
- throttle);
- continue;
- }
- }
+ ret = k.k->type == KEY_TYPE_cookie
+ ? do_rebalance_scan(ctxt, k.k->p.inode,
+ le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie))
+ : do_rebalance_extent(ctxt, k.k->p, &extent_iter);
- /* minimum 1 mb/sec: */
- r->pd.rate.rate =
- max_t(u64, 1 << 11,
- r->pd.rate.rate *
- max(p.dev_most_full_percent, 1U) /
- max(w.dev_most_full_percent, 1U));
-
- io_start = atomic64_read(&clock->now);
- p = w;
- prev_start = start;
- prev_cputime = cputime;
-
- r->state = REBALANCE_RUNNING;
- memset(&move_stats, 0, sizeof(move_stats));
- rebalance_work_reset(c);
-
- bch2_move_data(c,
- 0, POS_MIN,
- BTREE_ID_NR, POS_MAX,
- /* ratelimiting disabled for now */
- NULL, /* &r->pd.rate, */
- &move_stats,
- writepoint_ptr(&c->rebalance_write_point),
- true,
- rebalance_pred, NULL);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ break;
+
+ bch2_btree_iter_advance(&rebalance_work_iter);
}
- return 0;
+ bch2_trans_iter_exit(trans, &extent_iter);
+ bch2_trans_iter_exit(trans, &rebalance_work_iter);
+ bch2_move_stats_exit(&r->scan_stats, c);
+
+ if (!ret &&
+ !kthread_should_stop() &&
+ !atomic64_read(&r->work_stats.sectors_seen) &&
+ !atomic64_read(&r->scan_stats.sectors_seen)) {
+ bch2_trans_unlock_long(trans);
+ rebalance_wait(c);
+ }
+
+ if (!bch2_err_matches(ret, EROFS))
+ bch_err_fn(c, ret);
+ return ret;
}
-void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c)
+static int bch2_rebalance_thread(void *arg)
{
+ struct bch_fs *c = arg;
struct bch_fs_rebalance *r = &c->rebalance;
- struct rebalance_work w = rebalance_work(c);
+ struct moving_context ctxt;
+ int ret;
- if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 20);
+ set_freezable();
- prt_printf(out, "fullest_dev (%i):", w.dev_most_full_idx);
- prt_tab(out);
+ bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats,
+ writepoint_ptr(&c->rebalance_write_point),
+ true);
- prt_human_readable_u64(out, w.dev_most_full_work << 9);
- prt_printf(out, "/");
- prt_human_readable_u64(out, w.dev_most_full_capacity << 9);
- prt_newline(out);
+ while (!kthread_should_stop() &&
+ !(ret = do_rebalance(&ctxt)))
+ ;
- prt_printf(out, "total work:");
- prt_tab(out);
+ bch2_moving_ctxt_exit(&ctxt);
- prt_human_readable_u64(out, w.total_work << 9);
- prt_printf(out, "/");
- prt_human_readable_u64(out, c->capacity << 9);
- prt_newline(out);
+ return 0;
+}
+
+void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct bch_fs_rebalance *r = &c->rebalance;
- prt_printf(out, "rate:");
- prt_tab(out);
- prt_printf(out, "%u", r->pd.rate.rate);
+ prt_str(out, bch2_rebalance_state_strs[r->state]);
prt_newline(out);
+ printbuf_indent_add(out, 2);
switch (r->state) {
- case REBALANCE_WAITING:
- prt_printf(out, "waiting");
+ case BCH_REBALANCE_waiting: {
+ u64 now = atomic64_read(&c->io_clock[WRITE].now);
+
+ prt_str(out, "io wait duration: ");
+ bch2_prt_human_readable_s64(out, r->wait_iotime_end - r->wait_iotime_start);
+ prt_newline(out);
+
+ prt_str(out, "io wait remaining: ");
+ bch2_prt_human_readable_s64(out, r->wait_iotime_end - now);
+ prt_newline(out);
+
+ prt_str(out, "duration waited: ");
+ bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start);
+ prt_newline(out);
break;
- case REBALANCE_THROTTLED:
- prt_printf(out, "throttled for %lu sec or ",
- (r->throttled_until_cputime - jiffies) / HZ);
- prt_human_readable_u64(out,
- (r->throttled_until_iotime -
- atomic64_read(&c->io_clock[WRITE].now)) << 9);
- prt_printf(out, " io");
+ }
+ case BCH_REBALANCE_working:
+ bch2_move_stats_to_text(out, &r->work_stats);
break;
- case REBALANCE_RUNNING:
- prt_printf(out, "running");
+ case BCH_REBALANCE_scanning:
+ bch2_move_stats_to_text(out, &r->scan_stats);
break;
}
prt_newline(out);
+ printbuf_indent_sub(out, 2);
}
void bch2_rebalance_stop(struct bch_fs *c)
p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name);
ret = PTR_ERR_OR_ZERO(p);
if (ret) {
- bch_err(c, "error creating rebalance thread: %s", bch2_err_str(ret));
+ bch_err_msg(c, ret, "creating rebalance thread");
return ret;
}
void bch2_fs_rebalance_init(struct bch_fs *c)
{
bch2_pd_controller_init(&c->rebalance.pd);
-
- atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX);
}
#include "rebalance_types.h"
+int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum);
+int bch2_set_fs_needs_rebalance(struct bch_fs *);
+
static inline void rebalance_wakeup(struct bch_fs *c)
{
struct task_struct *p;
rcu_read_unlock();
}
-void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c,
- struct bch_io_opts *);
-void bch2_rebalance_add_work(struct bch_fs *, u64);
-
-void bch2_rebalance_work_to_text(struct printbuf *, struct bch_fs *);
+void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *);
void bch2_rebalance_stop(struct bch_fs *);
int bch2_rebalance_start(struct bch_fs *);
#ifndef _BCACHEFS_REBALANCE_TYPES_H
#define _BCACHEFS_REBALANCE_TYPES_H
+#include "bbpos_types.h"
#include "move_types.h"
-enum rebalance_state {
- REBALANCE_WAITING,
- REBALANCE_THROTTLED,
- REBALANCE_RUNNING,
+#define BCH_REBALANCE_STATES() \
+ x(waiting) \
+ x(working) \
+ x(scanning)
+
+enum bch_rebalance_states {
+#define x(t) BCH_REBALANCE_##t,
+ BCH_REBALANCE_STATES()
+#undef x
};
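/*
 * Standalone illustration of the x-macro pattern used by
 * BCH_REBALANCE_STATES() above: one list expands into both the enum and
 * the matching string table, so the two can never drift apart. Names
 * here are toy stand-ins.
 */
#include <stdio.h>

#define TOY_STATES()	\
	x(waiting)	\
	x(working)	\
	x(scanning)

enum toy_state {
#define x(t) TOY_##t,
	TOY_STATES()
#undef x
};

static const char * const toy_state_strs[] = {
#define x(t) #t,
	TOY_STATES()
#undef x
	NULL
};

int main(void)
{
	printf("%s\n", toy_state_strs[TOY_working]);	/* prints "working" */
	return 0;
}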
struct bch_fs_rebalance {
- struct task_struct __rcu *thread;
+ struct task_struct __rcu *thread;
struct bch_pd_controller pd;
- atomic64_t work_unknown_dev;
+ enum bch_rebalance_states state;
+ u64 wait_iotime_start;
+ u64 wait_iotime_end;
+ u64 wait_wallclock_start;
+
+ struct bch_move_stats work_stats;
- enum rebalance_state state;
- u64 throttled_until_iotime;
- unsigned long throttled_until_cputime;
+ struct bbpos scan_start;
+ struct bbpos scan_end;
+ struct bch_move_stats scan_stats;
- unsigned enabled:1;
+ unsigned enabled:1;
};
#endif /* _BCACHEFS_REBALANCE_TYPES_H */
#include "bkey_buf.h"
#include "alloc_background.h"
#include "btree_gc.h"
+#include "btree_journal_iter.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "lru.h"
+#include "logged_ops.h"
#include "move.h"
#include "quota.h"
+#include "rebalance.h"
#include "recovery.h"
#include "replicas.h"
+#include "sb-clean.h"
+#include "snapshot.h"
#include "subvolume.h"
#include "super-io.h"
#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
+static bool btree_id_is_alloc(enum btree_id id)
+{
+ switch (id) {
+ case BTREE_ID_alloc:
+ case BTREE_ID_backpointers:
+ case BTREE_ID_need_discard:
+ case BTREE_ID_freespace:
+ case BTREE_ID_bucket_gens:
+ return true;
+ default:
+ return false;
+ }
+}
+
/* for -o reconstruct_alloc: */
static void drop_alloc_keys(struct journal_keys *keys)
{
size_t src, dst;
for (src = 0, dst = 0; src < keys->nr; src++)
- if (keys->d[src].btree_id != BTREE_ID_alloc)
+ if (!btree_id_is_alloc(keys->d[src].btree_id))
keys->d[dst++] = keys->d[src];
keys->nr = dst;
bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0;
}
-/* iterate over keys read from the journal: */
-
-static int __journal_key_cmp(enum btree_id l_btree_id,
- unsigned l_level,
- struct bpos l_pos,
- const struct journal_key *r)
-{
- return (cmp_int(l_btree_id, r->btree_id) ?:
- cmp_int(l_level, r->level) ?:
- bpos_cmp(l_pos, r->k->k.p));
-}
-
-static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
-{
- return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
-}
-
-static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
-{
- size_t gap_size = keys->size - keys->nr;
-
- if (idx >= keys->gap)
- idx += gap_size;
- return idx;
-}
-
-static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
-{
- return keys->d + idx_to_pos(keys, idx);
-}
-
-static size_t __bch2_journal_key_search(struct journal_keys *keys,
- enum btree_id id, unsigned level,
- struct bpos pos)
-{
- size_t l = 0, r = keys->nr, m;
-
- while (l < r) {
- m = l + ((r - l) >> 1);
- if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0)
- l = m + 1;
- else
- r = m;
- }
-
- BUG_ON(l < keys->nr &&
- __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0);
-
- BUG_ON(l &&
- __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0);
-
- return l;
-}
-
-static size_t bch2_journal_key_search(struct journal_keys *keys,
- enum btree_id id, unsigned level,
- struct bpos pos)
-{
- return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
-}
-
-struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
- unsigned level, struct bpos pos,
- struct bpos end_pos, size_t *idx)
-{
- struct journal_keys *keys = &c->journal_keys;
- unsigned iters = 0;
- struct journal_key *k;
-search:
- if (!*idx)
- *idx = __bch2_journal_key_search(keys, btree_id, level, pos);
-
- while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
- if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
- return NULL;
-
- if (__journal_key_cmp(btree_id, level, pos, k) <= 0 &&
- !k->overwritten)
- return k->k;
-
- (*idx)++;
- iters++;
- if (iters == 10) {
- *idx = 0;
- goto search;
- }
- }
-
- return NULL;
-}
-
-struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
- unsigned level, struct bpos pos)
-{
- size_t idx = 0;
-
- return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
-}
-
-static void journal_iters_fix(struct bch_fs *c)
-{
- struct journal_keys *keys = &c->journal_keys;
- /* The key we just inserted is immediately before the gap: */
- size_t gap_end = keys->gap + (keys->size - keys->nr);
- struct btree_and_journal_iter *iter;
-
- /*
- * If an iterator points one after the key we just inserted, decrement
- * the iterator so it points at the key we just inserted - if the
- * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
- * handle that:
- */
- list_for_each_entry(iter, &c->journal_iters, journal.list)
- if (iter->journal.idx == gap_end)
- iter->journal.idx = keys->gap - 1;
-}
-
-static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
-{
- struct journal_keys *keys = &c->journal_keys;
- struct journal_iter *iter;
- size_t gap_size = keys->size - keys->nr;
-
- list_for_each_entry(iter, &c->journal_iters, list) {
- if (iter->idx > old_gap)
- iter->idx -= gap_size;
- if (iter->idx >= new_gap)
- iter->idx += gap_size;
- }
-}
-
-int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
- unsigned level, struct bkey_i *k)
-{
- struct journal_key n = {
- .btree_id = id,
- .level = level,
- .k = k,
- .allocated = true,
- /*
- * Ensure these keys are done last by journal replay, to unblock
- * journal reclaim:
- */
- .journal_seq = U32_MAX,
- };
- struct journal_keys *keys = &c->journal_keys;
- size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
-
- BUG_ON(test_bit(BCH_FS_RW, &c->flags));
-
- if (idx < keys->size &&
- journal_key_cmp(&n, &keys->d[idx]) == 0) {
- if (keys->d[idx].allocated)
- kfree(keys->d[idx].k);
- keys->d[idx] = n;
- return 0;
- }
-
- if (idx > keys->gap)
- idx -= keys->size - keys->nr;
-
- if (keys->nr == keys->size) {
- struct journal_keys new_keys = {
- .nr = keys->nr,
- .size = max_t(size_t, keys->size, 8) * 2,
- };
-
- new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL);
- if (!new_keys.d) {
- bch_err(c, "%s: error allocating new key array (size %zu)",
- __func__, new_keys.size);
- return -BCH_ERR_ENOMEM_journal_key_insert;
- }
-
- /* Since @keys was full, there was no gap: */
- memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
- kvfree(keys->d);
- *keys = new_keys;
-
- /* And now the gap is at the end: */
- keys->gap = keys->nr;
- }
-
- journal_iters_move_gap(c, keys->gap, idx);
-
- move_gap(keys->d, keys->nr, keys->size, keys->gap, idx);
- keys->gap = idx;
-
- keys->nr++;
- keys->d[keys->gap++] = n;
-
- journal_iters_fix(c);
-
- return 0;
-}
-
-/*
- * Can only be used from the recovery thread while we're still RO - can't be
- * used once we've got RW, as journal_keys is at that point used by multiple
- * threads:
- */
-int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
- unsigned level, struct bkey_i *k)
-{
- struct bkey_i *n;
- int ret;
-
- n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
- if (!n)
- return -BCH_ERR_ENOMEM_journal_key_insert;
-
- bkey_copy(n, k);
- ret = bch2_journal_key_insert_take(c, id, level, n);
- if (ret)
- kfree(n);
- return ret;
-}
-
-int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
- unsigned level, struct bpos pos)
-{
- struct bkey_i whiteout;
-
- bkey_init(&whiteout.k);
- whiteout.k.p = pos;
-
- return bch2_journal_key_insert(c, id, level, &whiteout);
-}
-
-void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
- unsigned level, struct bpos pos)
-{
- struct journal_keys *keys = &c->journal_keys;
- size_t idx = bch2_journal_key_search(keys, btree, level, pos);
-
- if (idx < keys->size &&
- keys->d[idx].btree_id == btree &&
- keys->d[idx].level == level &&
- bpos_eq(keys->d[idx].k->k.p, pos))
- keys->d[idx].overwritten = true;
-}
-
-static void bch2_journal_iter_advance(struct journal_iter *iter)
-{
- if (iter->idx < iter->keys->size) {
- iter->idx++;
- if (iter->idx == iter->keys->gap)
- iter->idx += iter->keys->size - iter->keys->nr;
- }
-}
-
-static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
-{
- struct journal_key *k = iter->keys->d + iter->idx;
-
- while (k < iter->keys->d + iter->keys->size &&
- k->btree_id == iter->btree_id &&
- k->level == iter->level) {
- if (!k->overwritten)
- return bkey_i_to_s_c(k->k);
-
- bch2_journal_iter_advance(iter);
- k = iter->keys->d + iter->idx;
- }
-
- return bkey_s_c_null;
-}
-
-static void bch2_journal_iter_exit(struct journal_iter *iter)
-{
- list_del(&iter->list);
-}
-
-static void bch2_journal_iter_init(struct bch_fs *c,
- struct journal_iter *iter,
- enum btree_id id, unsigned level,
- struct bpos pos)
-{
- iter->btree_id = id;
- iter->level = level;
- iter->keys = &c->journal_keys;
- iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos);
-}
-
-static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
-{
- return bch2_btree_node_iter_peek_unpack(&iter->node_iter,
- iter->b, &iter->unpacked);
-}
-
-static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
-{
- bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
-}
-
-void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
-{
- if (bpos_eq(iter->pos, SPOS_MAX))
- iter->at_end = true;
- else
- iter->pos = bpos_successor(iter->pos);
-}
-
-struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
-{
- struct bkey_s_c btree_k, journal_k, ret;
-again:
- if (iter->at_end)
- return bkey_s_c_null;
-
- while ((btree_k = bch2_journal_iter_peek_btree(iter)).k &&
- bpos_lt(btree_k.k->p, iter->pos))
- bch2_journal_iter_advance_btree(iter);
-
- while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
- bpos_lt(journal_k.k->p, iter->pos))
- bch2_journal_iter_advance(&iter->journal);
-
- ret = journal_k.k &&
- (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))
- ? journal_k
- : btree_k;
-
- if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key))
- ret = bkey_s_c_null;
-
- if (ret.k) {
- iter->pos = ret.k->p;
- if (bkey_deleted(ret.k)) {
- bch2_btree_and_journal_iter_advance(iter);
- goto again;
- }
- } else {
- iter->pos = SPOS_MAX;
- iter->at_end = true;
- }
-
- return ret;
-}
-
-void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
-{
- bch2_journal_iter_exit(&iter->journal);
-}
-
-void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
- struct bch_fs *c,
- struct btree *b,
- struct btree_node_iter node_iter,
- struct bpos pos)
-{
- memset(iter, 0, sizeof(*iter));
-
- iter->b = b;
- iter->node_iter = node_iter;
- bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos);
- INIT_LIST_HEAD(&iter->journal.list);
- iter->pos = b->data->min_key;
- iter->at_end = false;
-}
-
-/*
- * this version is used by btree_gc before filesystem has gone RW and
- * multithreaded, so uses the journal_iters list:
- */
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
- struct bch_fs *c,
- struct btree *b)
-{
- struct btree_node_iter node_iter;
-
- bch2_btree_node_iter_init_from_start(&node_iter, b);
- __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key);
- list_add(&iter->journal.list, &c->journal_iters);
-}
-
-/* sort and dedup all keys in the journal: */
-
-void bch2_journal_entries_free(struct bch_fs *c)
-{
- struct journal_replay **i;
- struct genradix_iter iter;
-
- genradix_for_each(&c->journal_entries, iter, i)
- if (*i)
- kvpfree(*i, offsetof(struct journal_replay, j) +
- vstruct_bytes(&(*i)->j));
- genradix_free(&c->journal_entries);
-}
-
-/*
- * When keys compare equal, oldest compares first:
- */
-static int journal_sort_key_cmp(const void *_l, const void *_r)
-{
- const struct journal_key *l = _l;
- const struct journal_key *r = _r;
-
- return journal_key_cmp(l, r) ?:
- cmp_int(l->journal_seq, r->journal_seq) ?:
- cmp_int(l->journal_offset, r->journal_offset);
-}
-
-void bch2_journal_keys_free(struct journal_keys *keys)
-{
- struct journal_key *i;
-
- move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
- keys->gap = keys->nr;
-
- for (i = keys->d; i < keys->d + keys->nr; i++)
- if (i->allocated)
- kfree(i->k);
-
- kvfree(keys->d);
- keys->d = NULL;
- keys->nr = keys->gap = keys->size = 0;
-}
-
-static void __journal_keys_sort(struct journal_keys *keys)
-{
- struct journal_key *src, *dst;
-
- sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL);
-
- src = dst = keys->d;
- while (src < keys->d + keys->nr) {
- while (src + 1 < keys->d + keys->nr &&
- src[0].btree_id == src[1].btree_id &&
- src[0].level == src[1].level &&
- bpos_eq(src[0].k->k.p, src[1].k->k.p))
- src++;
-
- *dst++ = *src++;
- }
-
- keys->nr = dst - keys->d;
-}
-
-static int journal_keys_sort(struct bch_fs *c)
-{
- struct genradix_iter iter;
- struct journal_replay *i, **_i;
- struct jset_entry *entry;
- struct bkey_i *k;
- struct journal_keys *keys = &c->journal_keys;
- size_t nr_keys = 0, nr_read = 0;
-
- genradix_for_each(&c->journal_entries, iter, _i) {
- i = *_i;
-
- if (!i || i->ignore)
- continue;
-
- for_each_jset_key(k, entry, &i->j)
- nr_keys++;
- }
-
- if (!nr_keys)
- return 0;
-
- keys->size = roundup_pow_of_two(nr_keys);
-
- keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
- if (!keys->d) {
- bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath",
- nr_keys);
-
- do {
- keys->size >>= 1;
- keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
- } while (!keys->d && keys->size > nr_keys / 8);
-
- if (!keys->d) {
- bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting",
- keys->size);
- return -BCH_ERR_ENOMEM_journal_keys_sort;
- }
- }
-
- genradix_for_each(&c->journal_entries, iter, _i) {
- i = *_i;
-
- if (!i || i->ignore)
- continue;
-
- cond_resched();
-
- for_each_jset_key(k, entry, &i->j) {
- if (keys->nr == keys->size) {
- __journal_keys_sort(keys);
-
- if (keys->nr > keys->size * 7 / 8) {
- bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu",
- keys->nr, keys->size, nr_read, nr_keys);
- return -BCH_ERR_ENOMEM_journal_keys_sort;
- }
- }
-
- keys->d[keys->nr++] = (struct journal_key) {
- .btree_id = entry->btree_id,
- .level = entry->level,
- .k = k,
- .journal_seq = le64_to_cpu(i->j.seq),
- .journal_offset = k->_data - i->j._data,
- };
-
- nr_read++;
- }
- }
-
- __journal_keys_sort(keys);
- keys->gap = keys->nr;
-
- bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr);
- return 0;
-}
-
/* journal replay: */
static void replay_now_at(struct journal *j, u64 seq)
unsigned update_flags = BTREE_TRIGGER_NORUN;
int ret;
+ if (k->overwritten)
+ return 0;
+
+ trans->journal_res.seq = k->journal_seq;
+
/*
* BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to
* keep the key cache coherent with the underlying btree. Nothing
static int bch2_journal_replay(struct bch_fs *c)
{
struct journal_keys *keys = &c->journal_keys;
- struct journal_key **keys_sorted, *k;
+ DARRAY(struct journal_key *) keys_sorted = { 0 };
+ struct journal_key **kp;
struct journal *j = &c->journal;
u64 start_seq = c->journal_replay_seq_start;
	u64 end_seq = c->journal_replay_seq_end;
- size_t i;
+ struct btree_trans *trans = bch2_trans_get(c);
int ret;
- move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
- keys->gap = keys->nr;
-
- keys_sorted = kvmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL);
- if (!keys_sorted)
- return -BCH_ERR_ENOMEM_journal_replay;
-
- for (i = 0; i < keys->nr; i++)
- keys_sorted[i] = &keys->d[i];
-
- sort(keys_sorted, keys->nr,
- sizeof(keys_sorted[0]),
- journal_sort_seq_cmp, NULL);
-
if (keys->nr) {
ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)",
keys->nr, start_seq, end_seq);
goto err;
}
- for (i = 0; i < keys->nr; i++) {
- k = keys_sorted[i];
+ /*
+ * First, attempt to replay keys in sorted order. This is more
+ * efficient - better locality of btree access - but some might fail if
+ * that would cause a journal deadlock.
+ */
+ for (size_t i = 0; i < keys->nr; i++) {
+ cond_resched();
+
+ struct journal_key *k = keys->d + i;
+ ret = commit_do(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_journal_reclaim|
+ (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0),
+ bch2_journal_replay_key(trans, k));
+ BUG_ON(!ret && !k->overwritten);
+ if (ret) {
+ ret = darray_push(&keys_sorted, k);
+ if (ret)
+ goto err;
+ }
+ }
+
+ /*
+ * Now, replay any remaining keys in the order in which they appear in
+ * the journal, unpinning those journal entries as we go:
+ */
+ sort(keys_sorted.data, keys_sorted.nr,
+ sizeof(keys_sorted.data[0]),
+ journal_sort_seq_cmp, NULL);
+
+ darray_for_each(keys_sorted, kp) {
cond_resched();
+ struct journal_key *k = *kp;
+
replay_now_at(j, k->journal_seq);
- ret = bch2_trans_do(c, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL|
- (!k->allocated
- ? BTREE_INSERT_JOURNAL_REPLAY|BCH_WATERMARK_reclaim
- : 0),
- bch2_journal_replay_key(&trans, k));
- if (ret) {
- bch_err(c, "journal replay: error while replaying key at btree %s level %u: %s",
- bch2_btree_ids[k->btree_id], k->level, bch2_err_str(ret));
+ ret = commit_do(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc|
+ (!k->allocated
+ ? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim
+ : 0),
+ bch2_journal_replay_key(trans, k));
+ bch_err_msg(c, ret, "while replaying key at btree %s level %u:",
+ bch2_btree_id_str(k->btree_id), k->level);
+ if (ret)
goto err;
- }
+
+ BUG_ON(!k->overwritten);
}
+ /*
+ * We need to put our btree_trans before calling flush_all_pins(), since
+ * that will use a btree_trans internally
+ */
+ bch2_trans_put(trans);
+ trans = NULL;
+
replay_now_at(j, j->replay_journal_seq_end);
j->replay_journal_seq = 0;
if (keys->nr && !ret)
bch2_journal_log_msg(c, "journal replay finished");
err:
- kvfree(keys_sorted);
-
- if (ret)
- bch_err_fn(c, ret);
+ if (trans)
+ bch2_trans_put(trans);
+ darray_exit(&keys_sorted);
+ bch_err_fn(c, ret);
return ret;
}
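/*
 * Toy model (not bcachefs code) of the two-phase replay above: pass one
 * visits keys in sorted order for btree locality and collects the ones
 * that fail; pass two retries only those, in journal-sequence order.
 * apply_key() refusing even seqs in pass one is an arbitrary stand-in
 * for "would deadlock journal reclaim".
 */
#include <stdio.h>
#include <stdlib.h>

struct key { unsigned long seq; int applied; };

static int apply_key(struct key *k, int allow_hard)
{
	if (!allow_hard && !(k->seq & 1))
		return -1;	/* pretend this would deadlock reclaim */
	k->applied = 1;
	return 0;
}

static int cmp_seq(const void *l, const void *r)
{
	const struct key *a = *(struct key * const *) l;
	const struct key *b = *(struct key * const *) r;

	return a->seq < b->seq ? -1 : a->seq > b->seq;
}

int main(void)
{
	struct key keys[] = { { 5 }, { 2 }, { 9 }, { 4 } };	/* "sorted" order */
	struct key *retry[4];
	size_t nr_retry = 0, i;

	for (i = 0; i < 4; i++)			/* pass 1: sorted order */
		if (apply_key(&keys[i], 0))
			retry[nr_retry++] = &keys[i];

	qsort(retry, nr_retry, sizeof(retry[0]), cmp_seq);

	for (i = 0; i < nr_retry; i++)		/* pass 2: journal order */
		apply_key(retry[i], 1);

	for (i = 0; i < 4; i++)
		printf("seq %lu: applied %d\n", keys[i].seq, keys[i].applied);
	return 0;
}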
if (entry->u64s) {
r->level = entry->level;
- bkey_copy(&r->key, &entry->start[0]);
+ bkey_copy(&r->key, (struct bkey_i *) entry->start);
r->error = 0;
} else {
r->error = -EIO;
/* sb clean section: */
-static struct bkey_i *btree_root_find(struct bch_fs *c,
- struct bch_sb_field_clean *clean,
- struct jset *j,
- enum btree_id id, unsigned *level)
-{
- struct bkey_i *k;
- struct jset_entry *entry, *start, *end;
-
- if (clean) {
- start = clean->start;
- end = vstruct_end(&clean->field);
- } else {
- start = j->start;
- end = vstruct_last(j);
- }
-
- for (entry = start; entry < end; entry = vstruct_next(entry))
- if (entry->type == BCH_JSET_ENTRY_btree_root &&
- entry->btree_id == id)
- goto found;
-
- return NULL;
-found:
- if (!entry->u64s)
- return ERR_PTR(-EINVAL);
-
- k = entry->start;
- *level = entry->level;
- return k;
-}
-
-static int verify_superblock_clean(struct bch_fs *c,
- struct bch_sb_field_clean **cleanp,
- struct jset *j)
-{
- unsigned i;
- struct bch_sb_field_clean *clean = *cleanp;
- struct printbuf buf1 = PRINTBUF;
- struct printbuf buf2 = PRINTBUF;
- int ret = 0;
-
- if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
- "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
- le64_to_cpu(clean->journal_seq),
- le64_to_cpu(j->seq))) {
- kfree(clean);
- *cleanp = NULL;
- return 0;
- }
-
- for (i = 0; i < BTREE_ID_NR; i++) {
- struct bkey_i *k1, *k2;
- unsigned l1 = 0, l2 = 0;
-
- k1 = btree_root_find(c, clean, NULL, i, &l1);
- k2 = btree_root_find(c, NULL, j, i, &l2);
-
- if (!k1 && !k2)
- continue;
-
- printbuf_reset(&buf1);
- printbuf_reset(&buf2);
-
- if (k1)
- bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1));
- else
- prt_printf(&buf1, "(none)");
-
- if (k2)
- bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2));
- else
- prt_printf(&buf2, "(none)");
-
- mustfix_fsck_err_on(!k1 || !k2 ||
- IS_ERR(k1) ||
- IS_ERR(k2) ||
- k1->k.u64s != k2->k.u64s ||
- memcmp(k1, k2, bkey_bytes(&k1->k)) ||
- l1 != l2, c,
- "superblock btree root %u doesn't match journal after clean shutdown\n"
- "sb: l=%u %s\n"
- "journal: l=%u %s\n", i,
- l1, buf1.buf,
- l2, buf2.buf);
- }
-fsck_err:
- printbuf_exit(&buf2);
- printbuf_exit(&buf1);
- return ret;
-}
-
-static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c)
-{
- struct bch_sb_field_clean *clean, *sb_clean;
- int ret;
-
- mutex_lock(&c->sb_lock);
- sb_clean = bch2_sb_get_clean(c->disk_sb.sb);
-
- if (fsck_err_on(!sb_clean, c,
- "superblock marked clean but clean section not present")) {
- SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
- c->sb.clean = false;
- mutex_unlock(&c->sb_lock);
- return NULL;
- }
-
- clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
- GFP_KERNEL);
- if (!clean) {
- mutex_unlock(&c->sb_lock);
- return ERR_PTR(-BCH_ERR_ENOMEM_read_superblock_clean);
- }
-
- ret = bch2_sb_clean_validate_late(c, clean, READ);
- if (ret) {
- mutex_unlock(&c->sb_lock);
- return ERR_PTR(ret);
- }
-
- mutex_unlock(&c->sb_lock);
-
- return clean;
-fsck_err:
- mutex_unlock(&c->sb_lock);
- return ERR_PTR(ret);
-}
-
-static bool btree_id_is_alloc(enum btree_id id)
-{
- switch (id) {
- case BTREE_ID_alloc:
- case BTREE_ID_backpointers:
- case BTREE_ID_need_discard:
- case BTREE_ID_freespace:
- case BTREE_ID_bucket_gens:
- return true;
- default:
- return false;
- }
-}
-
static int read_btree_roots(struct bch_fs *c)
{
unsigned i;
}
if (r->error) {
- __fsck_err(c, btree_id_is_alloc(i)
+ __fsck_err(c,
+ btree_id_is_alloc(i)
? FSCK_CAN_IGNORE : 0,
+ btree_root_bkey_invalid,
"invalid btree root %s",
- bch2_btree_ids[i]);
+ bch2_btree_id_str(i));
if (i == BTREE_ID_alloc)
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
}
ret = bch2_btree_root_read(c, i, &r->key, r->level);
if (ret) {
- __fsck_err(c,
- btree_id_is_alloc(i)
- ? FSCK_CAN_IGNORE : 0,
- "error reading btree root %s",
- bch2_btree_ids[i]);
+ fsck_err(c,
+ btree_root_read_error,
+ "error reading btree root %s",
+ bch2_btree_id_str(i));
if (btree_id_is_alloc(i))
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
+ ret = 0;
}
}
root_volume.v.snapshot = cpu_to_le32(U32_MAX);
root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO);
- ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees,
- &root_tree.k_i,
- NULL, NULL, 0) ?:
- bch2_btree_insert(c, BTREE_ID_snapshots,
- &root_snapshot.k_i,
- NULL, NULL, 0) ?:
- bch2_btree_insert(c, BTREE_ID_subvolumes,
- &root_volume.k_i,
- NULL, NULL, 0);
+ ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?:
+ bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?:
+ bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0);
if (ret)
bch_err_fn(c, ret);
return ret;
noinline_for_stack
static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
{
- int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW,
- __bch2_fs_upgrade_for_subvolumes(&trans));
+ int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
+ __bch2_fs_upgrade_for_subvolumes(trans));
if (ret)
bch_err_fn(c, ret);
return ret;
}
+const char * const bch2_recovery_passes[] = {
+#define x(_fn, _when) #_fn,
+ BCH_RECOVERY_PASSES()
+#undef x
+ NULL
+};
+
+static int bch2_check_allocations(struct bch_fs *c)
+{
+ return bch2_gc(c, true, c->opts.norecovery);
+}
+
+static int bch2_set_may_go_rw(struct bch_fs *c)
+{
+ struct journal_keys *keys = &c->journal_keys;
+
+ /*
+	 * After we go RW, the journal keys buffer can't be modified (except
+	 * for setting journal_key->overwritten), since it will be accessed
+	 * by multiple threads:
+ */
+ move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
+ keys->gap = keys->nr;
+
+ set_bit(BCH_FS_MAY_GO_RW, &c->flags);
+ if (keys->nr)
+ return bch2_fs_read_write_early(c);
+ return 0;
+}
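/*
 * Minimal gap-buffer sketch of what move_gap() above does: the keys
 * array is kept sorted but with a hole ("gap") at the last insertion
 * point, so inserts near it are O(1); sliding the gap to the end makes
 * the array plain and contiguous before it goes read-only and
 * multithreaded. Toy ints stand in for journal keys.
 */
#include <stdio.h>
#include <string.h>

static void move_gap(int *d, size_t nr, size_t size, size_t old_gap, size_t new_gap)
{
	size_t gap_size = size - nr;

	if (old_gap < new_gap)		/* slide elements left, over the old gap */
		memmove(d + old_gap, d + old_gap + gap_size,
			(new_gap - old_gap) * sizeof(*d));
	else				/* slide elements right, opening the gap earlier */
		memmove(d + new_gap + gap_size, d + new_gap,
			(old_gap - new_gap) * sizeof(*d));
}

int main(void)
{
	/* 6 slots, 4 keys, gap at slots 2-3: 1 2 _ _ 3 4 */
	int d[6] = { 1, 2, 0, 0, 3, 4 };

	move_gap(d, 4, 6, 2, 4);	/* gap to the end: 1 2 3 4 _ _ */
	for (int i = 0; i < 4; i++)
		printf("%d ", d[i]);
	printf("\n");
	return 0;
}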
+
+struct recovery_pass_fn {
+ int (*fn)(struct bch_fs *);
+ unsigned when;
+};
+
+static struct recovery_pass_fn recovery_pass_fns[] = {
+#define x(_fn, _when) { .fn = bch2_##_fn, .when = _when },
+ BCH_RECOVERY_PASSES()
+#undef x
+};
+
static void check_version_upgrade(struct bch_fs *c)
{
- unsigned latest_compatible = bch2_version_compatible(c->sb.version);
+ unsigned latest_compatible = bch2_latest_compatible_version(c->sb.version);
unsigned latest_version = bcachefs_metadata_version_current;
unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
unsigned new_version = 0;
recovery_passes = bch2_upgrade_recovery_passes(c, old_version, new_version);
if (recovery_passes) {
- prt_str(&buf, "fsck required");
+ if ((recovery_passes & RECOVERY_PASS_ALL_FSCK) == RECOVERY_PASS_ALL_FSCK)
+ prt_str(&buf, "fsck required");
+ else {
+ prt_str(&buf, "running recovery passes: ");
+ prt_bitflags(&buf, bch2_recovery_passes, recovery_passes);
+ }
c->recovery_passes_explicit |= recovery_passes;
c->opts.fix_errors = FSCK_FIX_yes;
}
}
-static int bch2_check_allocations(struct bch_fs *c)
-{
- return bch2_gc(c, true, c->opts.norecovery);
-}
-
-static int bch2_set_may_go_rw(struct bch_fs *c)
-{
- set_bit(BCH_FS_MAY_GO_RW, &c->flags);
- return 0;
-}
-
-struct recovery_pass_fn {
- int (*fn)(struct bch_fs *);
- const char *name;
- unsigned when;
-};
-
-static struct recovery_pass_fn recovery_passes[] = {
-#define x(_fn, _when) { .fn = bch2_##_fn, .name = #_fn, .when = _when },
- BCH_RECOVERY_PASSES()
-#undef x
-};
-
u64 bch2_fsck_recovery_passes(void)
{
u64 ret = 0;
- for (unsigned i = 0; i < ARRAY_SIZE(recovery_passes); i++)
- if (recovery_passes[i].when & PASS_FSCK)
+ for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++)
+ if (recovery_pass_fns[i].when & PASS_FSCK)
ret |= BIT_ULL(i);
return ret;
}
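/*
 * Small sketch of the pass bitmask built above: each recovery pass is
 * one bit in a u64, so "which passes to run" composes with |, and
 * membership tests are a single AND. The pass numbers here are made up.
 */
#include <stdint.h>
#include <stdio.h>

#define BIT_ULL(nr)	(1ULL << (nr))

enum { PASS_check_extents = 3, PASS_check_dirents = 4 };

int main(void)
{
	uint64_t want = BIT_ULL(PASS_check_extents) | BIT_ULL(PASS_check_dirents);

	for (unsigned i = 0; i < 64; i++)
		if (want & BIT_ULL(i))
			printf("running pass %u\n", i);
	return 0;
}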
static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
{
- struct recovery_pass_fn *p = recovery_passes + c->curr_recovery_pass;
+ struct recovery_pass_fn *p = recovery_pass_fns + c->curr_recovery_pass;
if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read)
return false;
c->curr_recovery_pass = pass;
if (should_run_recovery_pass(c, pass)) {
- struct recovery_pass_fn *p = recovery_passes + pass;
+ struct recovery_pass_fn *p = recovery_pass_fns + pass;
if (!(p->when & PASS_SILENT))
- printk(KERN_INFO bch2_log_msg(c, "%s..."), p->name);
+ printk(KERN_INFO bch2_log_msg(c, "%s..."),
+ bch2_recovery_passes[pass]);
ret = p->fn(c);
if (ret)
return ret;
if (!(p->when & PASS_SILENT))
printk(KERN_CONT " done\n");
+
+ c->recovery_passes_complete |= BIT_ULL(pass);
}
return 0;
{
int ret = 0;
- while (c->curr_recovery_pass < ARRAY_SIZE(recovery_passes)) {
+ while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) {
ret = bch2_run_recovery_pass(c, c->curr_recovery_pass);
if (bch2_err_matches(ret, BCH_ERR_restart_recovery))
continue;
{
struct bch_sb_field_clean *clean = NULL;
struct jset *last_journal_entry = NULL;
- u64 last_seq, blacklist_seq, journal_seq;
+ u64 last_seq = 0, blacklist_seq, journal_seq;
bool write_sb = false;
int ret = 0;
- if (c->sb.clean)
- clean = read_superblock_clean(c);
- ret = PTR_ERR_OR_ZERO(clean);
- if (ret)
- goto err;
+ if (c->sb.clean) {
+ clean = bch2_read_superblock_clean(c);
+ ret = PTR_ERR_OR_ZERO(clean);
+ if (ret)
+ goto err;
- if (c->sb.clean)
bch_info(c, "recovering from clean shutdown, journal seq %llu",
le64_to_cpu(clean->journal_seq));
- else
+ } else {
bch_info(c, "recovering from unclean shutdown");
+ }
if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) {
bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported");
goto err;
}
- if (!(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) {
- bch_err(c, "filesystem may have incompatible bkey formats; run fsck from the compat branch to fix");
- ret = -EINVAL;
- goto err;
- }
-
if (c->opts.fsck || !(c->opts.nochanges && c->opts.norecovery))
check_version_upgrade(c);
if (mustfix_fsck_err_on(c->sb.clean &&
last_journal_entry &&
!journal_entry_empty(last_journal_entry), c,
+ clean_but_journal_not_empty,
"filesystem marked clean but journal not empty")) {
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
}
if (!last_journal_entry) {
- fsck_err_on(!c->sb.clean, c, "no journal entries found");
+ fsck_err_on(!c->sb.clean, c,
+ dirty_but_no_journal_entries,
+ "no journal entries found");
if (clean)
goto use_clean;
if (*i) {
last_journal_entry = &(*i)->j;
(*i)->ignore = false;
+ /*
+ * This was probably a NO_FLUSH entry,
+ * so last_seq was garbage - but we know
+ * we're only using a single journal
+ * entry, set it here:
+ */
+ (*i)->j.last_seq = (*i)->j.seq;
break;
}
}
- ret = journal_keys_sort(c);
+ ret = bch2_journal_keys_sort(c);
if (ret)
goto err;
if (c->sb.clean && last_journal_entry) {
- ret = verify_superblock_clean(c, &clean,
+ ret = bch2_verify_superblock_clean(c, &clean,
last_journal_entry);
if (ret)
goto err;
}
c->journal_replay_seq_start = last_seq;
- c->journal_replay_seq_end = blacklist_seq - 1;;
+ c->journal_replay_seq_end = blacklist_seq - 1;
if (c->opts.reconstruct_alloc) {
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
test_bit(BCH_FS_ERRORS_FIXED, &c->flags) &&
!test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags) &&
!test_bit(BCH_FS_ERROR, &c->flags)) {
+ bch2_flush_fsck_errs(c);
+
bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean");
clear_bit(BCH_FS_ERRORS_FIXED, &c->flags);
mutex_unlock(&c->sb_lock);
if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) ||
- !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done)) ||
c->sb.version_min < bcachefs_metadata_version_btree_ptr_sectors_written) {
struct bch_move_stats stats;
}
kfree(clean);
- if (!ret && test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) {
+ if (!ret && test_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags)) {
bch2_fs_read_write_early(c);
bch2_delete_dead_snapshots_async(c);
}
}
mutex_unlock(&c->sb_lock);
- c->curr_recovery_pass = ARRAY_SIZE(recovery_passes);
+ c->curr_recovery_pass = ARRAY_SIZE(recovery_pass_fns);
set_bit(BCH_FS_MAY_GO_RW, &c->flags);
set_bit(BCH_FS_FSCK_DONE, &c->flags);
for (i = 0; i < BTREE_ID_NR; i++)
bch2_btree_root_alloc(c, i);
- for_each_online_member(ca, c, i)
+ for_each_member_device(ca, c, i)
bch2_dev_usage_init(ca);
- for_each_online_member(ca, c, i) {
- ret = bch2_dev_journal_alloc(ca);
- if (ret) {
- percpu_ref_put(&ca->io_ref);
- goto err;
- }
- }
+ ret = bch2_fs_journal_alloc(c);
+ if (ret)
+ goto err;
/*
* journal_res_get() will crash if called before this has
* btree updates
*/
bch_verbose(c, "marking superblocks");
- for_each_member_device(ca, c, i) {
- ret = bch2_trans_mark_dev_sb(c, ca);
- if (ret) {
- percpu_ref_put(&ca->ref);
- goto err;
- }
+ ret = bch2_trans_mark_dev_sbs(c);
+ bch_err_msg(c, ret, "marking superblocks");
+ if (ret)
+ goto err;
+ for_each_online_member(ca, c, i)
ca->new_fs_bucket_idx = 0;
- }
ret = bch2_fs_freespace_init(c);
if (ret)
bch2_inode_pack(&packed_inode, &root_inode);
packed_inode.inode.k.p.snapshot = U32_MAX;
- ret = bch2_btree_insert(c, BTREE_ID_inodes,
- &packed_inode.inode.k_i,
- NULL, NULL, 0);
+ ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0);
if (ret) {
bch_err_msg(c, ret, "creating root directory");
goto err;
bch2_inode_init_early(c, &lostfound_inode);
ret = bch2_trans_do(c, NULL, NULL, 0,
- bch2_create_trans(&trans,
+ bch2_create_trans(trans,
BCACHEFS_ROOT_SUBVOL_INUM,
&root_inode, &lostfound_inode,
&lostfound,
#ifndef _BCACHEFS_RECOVERY_H
#define _BCACHEFS_RECOVERY_H
-struct journal_iter {
- struct list_head list;
- enum btree_id btree_id;
- unsigned level;
- size_t idx;
- struct journal_keys *keys;
-};
+extern const char * const bch2_recovery_passes[];
/*
- * Iterate over keys in the btree, with keys from the journal overlaid on top:
+ * For when we need to rewind recovery passes and run a pass we skipped:
*/
-
-struct btree_and_journal_iter {
- struct btree *b;
- struct btree_node_iter node_iter;
- struct bkey unpacked;
-
- struct journal_iter journal;
- struct bpos pos;
- bool at_end;
-};
-
-struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
- unsigned, struct bpos, struct bpos, size_t *);
-struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
- unsigned, struct bpos);
-
-int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
- unsigned, struct bkey_i *);
-int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
- unsigned, struct bkey_i *);
-int bch2_journal_key_delete(struct bch_fs *, enum btree_id,
- unsigned, struct bpos);
-void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id,
- unsigned, struct bpos);
-
-void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
-struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
-
-void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
-void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
- struct bch_fs *, struct btree *,
- struct btree_node_iter, struct bpos);
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
- struct bch_fs *,
- struct btree *);
-
-void bch2_journal_keys_free(struct journal_keys *);
-void bch2_journal_entries_free(struct bch_fs *);
+static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c,
+ enum bch_recovery_pass pass)
+{
+ bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)",
+ bch2_recovery_passes[pass], pass,
+ bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass);
+
+ c->recovery_passes_explicit |= BIT_ULL(pass);
+
+ if (c->curr_recovery_pass >= pass) {
+ c->curr_recovery_pass = pass;
+ c->recovery_passes_complete &= (1ULL << pass) >> 1;
+ return -BCH_ERR_restart_recovery;
+ } else {
+ return 0;
+ }
+}
u64 bch2_fsck_recovery_passes(void);
x(snapshots_read, PASS_ALWAYS) \
x(check_topology, 0) \
x(check_allocations, PASS_FSCK) \
+ x(trans_mark_dev_sbs, PASS_ALWAYS|PASS_SILENT) \
+ x(fs_journal_alloc, PASS_ALWAYS|PASS_SILENT) \
x(set_may_go_rw, PASS_ALWAYS|PASS_SILENT) \
x(journal_replay, PASS_ALWAYS) \
x(check_alloc_info, PASS_FSCK) \
x(check_snapshot_trees, PASS_FSCK) \
x(check_snapshots, PASS_FSCK) \
x(check_subvols, PASS_FSCK) \
- x(delete_dead_snapshots, PASS_FSCK|PASS_UNCLEAN) \
+ x(delete_dead_snapshots, PASS_FSCK) \
x(fs_upgrade_for_subvolumes, 0) \
+ x(resume_logged_ops, PASS_ALWAYS) \
x(check_inodes, PASS_FSCK) \
x(check_extents, PASS_FSCK) \
+ x(check_indirect_extents, PASS_FSCK) \
x(check_dirents, PASS_FSCK) \
x(check_xattrs, PASS_FSCK) \
x(check_root, PASS_FSCK) \
x(check_nlinks, PASS_FSCK) \
x(delete_dead_inodes, PASS_FSCK|PASS_UNCLEAN) \
x(fix_reflink_p, 0) \
+ x(set_fs_needs_rebalance, 0) \
enum bch_recovery_pass {
#define x(n, when) BCH_RECOVERY_PASS_##n,
#include "buckets.h"
#include "extents.h"
#include "inode.h"
-#include "io.h"
+#include "io_misc.h"
+#include "io_write.h"
+#include "rebalance.h"
#include "reflink.h"
#include "subvolume.h"
+#include "super-io.h"
#include <linux/sched/signal.h>
/* reflink pointers */
-int bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_reflink_p_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
/* indirect extents */
-int bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_reflink_v_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
bch2_bkey_ptrs_to_text(out, c, k);
}
+#if 0
+Currently disabled, needs to be debugged:
+
bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
{
struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l);
return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r);
}
+#endif
+
+static inline void check_indirect_extent_deleting(struct bkey_i *new, unsigned *flags)
+{
+ if ((*flags & BTREE_TRIGGER_INSERT) && !*bkey_refcount(new)) {
+ new->k.type = KEY_TYPE_deleted;
+ new->k.size = 0;
+		set_bkey_val_u64s(&new->k, 0);
+ *flags &= ~BTREE_TRIGGER_INSERT;
+ }
+}
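/*
 * Sketch of the shared helper's behaviour with toy types (not bkeys):
 * when an update would store a refcount of zero, convert the update
 * into a deletion instead of writing out a dead indirect extent.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_key { bool deleted; unsigned refcount; };

static void check_deleting(struct toy_key *k)
{
	if (!k->refcount)
		k->deleted = true;	/* like KEY_TYPE_deleted + zero size/val */
}

int main(void)
{
	struct toy_key k = { .deleted = false, .refcount = 0 };

	check_deleting(&k);
	printf("deleted: %d\n", k.deleted);	/* 1 */
	return 0;
}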
int bch2_trans_mark_reflink_v(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_i *new,
unsigned flags)
{
- if (!(flags & BTREE_TRIGGER_OVERWRITE)) {
- struct bkey_i_reflink_v *r = bkey_i_to_reflink_v(new);
-
- if (!r->v.refcount) {
- r->k.type = KEY_TYPE_deleted;
- r->k.size = 0;
- set_bkey_val_u64s(&r->k, 0);
- return 0;
- }
- }
+ check_indirect_extent_deleting(new, &flags);
return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags);
}
/* indirect inline data */
-int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_indirect_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
}
void bch2_indirect_inline_data_to_text(struct printbuf *out,
- struct bch_fs *c, struct bkey_s_c k)
+ struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k);
unsigned datalen = bkey_inline_data_bytes(k.k);
struct bkey_s_c old, struct bkey_i *new,
unsigned flags)
{
- if (!(flags & BTREE_TRIGGER_OVERWRITE)) {
- struct bkey_i_indirect_inline_data *r =
- bkey_i_to_indirect_inline_data(new);
-
- if (!r->v.refcount) {
- r->k.type = KEY_TYPE_deleted;
- r->k.size = 0;
- set_bkey_val_u64s(&r->k, 0);
- }
- }
+ check_indirect_extent_deleting(new, &flags);
return 0;
}
u64 remap_sectors,
u64 new_i_size, s64 *i_sectors_delta)
{
- struct btree_trans trans;
+ struct btree_trans *trans;
struct btree_iter dst_iter, src_iter;
struct bkey_s_c src_k;
struct bkey_buf new_dst, new_src;
struct bpos dst_start = POS(dst_inum.inum, dst_offset);
struct bpos src_start = POS(src_inum.inum, src_offset);
struct bpos dst_end = dst_start, src_end = src_start;
+ struct bch_io_opts opts;
struct bpos src_want;
- u64 dst_done;
+ u64 dst_done = 0;
u32 dst_snapshot, src_snapshot;
int ret = 0, ret2 = 0;
bch2_bkey_buf_init(&new_dst);
bch2_bkey_buf_init(&new_src);
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096);
+ trans = bch2_trans_get(c);
- bch2_trans_iter_init(&trans, &src_iter, BTREE_ID_extents, src_start,
+ ret = bch2_inum_opts_get(trans, src_inum, &opts);
+ if (ret)
+ goto err;
+
+ bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start,
BTREE_ITER_INTENT);
- bch2_trans_iter_init(&trans, &dst_iter, BTREE_ID_extents, dst_start,
+ bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start,
BTREE_ITER_INTENT);
	while ((ret == 0 ||
		bch2_err_matches(ret, BCH_ERR_transaction_restart)) &&
	       bkey_lt(dst_iter.pos, dst_end)) {
struct disk_reservation disk_res = { 0 };
- bch2_trans_begin(&trans);
+ bch2_trans_begin(trans);
if (fatal_signal_pending(current)) {
ret = -EINTR;
break;
}
- ret = bch2_subvolume_get_snapshot(&trans, src_inum.subvol,
+ ret = bch2_subvolume_get_snapshot(trans, src_inum.subvol,
&src_snapshot);
if (ret)
continue;
bch2_btree_iter_set_snapshot(&src_iter, src_snapshot);
- ret = bch2_subvolume_get_snapshot(&trans, dst_inum.subvol,
+ ret = bch2_subvolume_get_snapshot(trans, dst_inum.subvol,
&dst_snapshot);
if (ret)
continue;
continue;
if (bkey_lt(src_want, src_iter.pos)) {
- ret = bch2_fpunch_at(&trans, &dst_iter, dst_inum,
+ ret = bch2_fpunch_at(trans, &dst_iter, dst_inum,
min(dst_end.offset,
dst_iter.pos.offset +
src_iter.pos.offset - src_want.offset),
bch2_bkey_buf_reassemble(&new_src, c, src_k);
src_k = bkey_i_to_s_c(new_src.k);
- ret = bch2_make_extent_indirect(&trans, &src_iter,
+ ret = bch2_make_extent_indirect(trans, &src_iter,
new_src.k);
if (ret)
continue;
min(src_k.k->p.offset - src_want.offset,
dst_end.offset - dst_iter.pos.offset));
- ret = bch2_extent_update(&trans, dst_inum, &dst_iter,
- new_dst.k, &disk_res,
- new_i_size, i_sectors_delta,
- true);
+ ret = bch2_bkey_set_needs_rebalance(c, new_dst.k,
+ opts.background_target,
+ opts.background_compression) ?:
+ bch2_extent_update(trans, dst_inum, &dst_iter,
+ new_dst.k, &disk_res,
+ new_i_size, i_sectors_delta,
+ true);
bch2_disk_reservation_put(c, &disk_res);
}
- bch2_trans_iter_exit(&trans, &dst_iter);
- bch2_trans_iter_exit(&trans, &src_iter);
+ bch2_trans_iter_exit(trans, &dst_iter);
+ bch2_trans_iter_exit(trans, &src_iter);
BUG_ON(!ret && !bkey_eq(dst_iter.pos, dst_end));
BUG_ON(bkey_gt(dst_iter.pos, dst_end));
struct bch_inode_unpacked inode_u;
struct btree_iter inode_iter = { NULL };
- bch2_trans_begin(&trans);
+ bch2_trans_begin(trans);
- ret2 = bch2_inode_peek(&trans, &inode_iter, &inode_u,
+ ret2 = bch2_inode_peek(trans, &inode_iter, &inode_u,
dst_inum, BTREE_ITER_INTENT);
if (!ret2 &&
inode_u.bi_size < new_i_size) {
inode_u.bi_size = new_i_size;
- ret2 = bch2_inode_write(&trans, &inode_iter, &inode_u) ?:
- bch2_trans_commit(&trans, NULL, NULL,
- BTREE_INSERT_NOFAIL);
+ ret2 = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc);
}
- bch2_trans_iter_exit(&trans, &inode_iter);
+ bch2_trans_iter_exit(trans, &inode_iter);
} while (bch2_err_matches(ret2, BCH_ERR_transaction_restart));
-
- bch2_trans_exit(&trans);
+err:
+ bch2_trans_put(trans);
bch2_bkey_buf_exit(&new_src, c);
bch2_bkey_buf_exit(&new_dst, c);
enum bkey_invalid_flags;
-int bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_reflink_p_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
.min_val_size = 16, \
})
-int bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_reflink_v_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
.min_val_size = 8, \
})
-int bch2_indirect_inline_data_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_indirect_inline_data_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_indirect_inline_data_to_text(struct printbuf *,
struct bch_fs *, struct bkey_s_c);
return ret;
err:
- bch_err(c, "error adding replicas entry: %s", bch2_err_str(ret));
+ bch_err_msg(c, ret, "adding replicas entry");
goto out;
}
{
lockdep_assert_held(&c->replicas_gc_lock);
- if (ret)
- goto err;
-
mutex_lock(&c->sb_lock);
percpu_down_write(&c->mark_lock);
- ret = bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc);
- if (ret)
- goto err;
+ ret = ret ?:
+ bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc) ?:
+ replicas_table_update(c, &c->replicas_gc);
- ret = replicas_table_update(c, &c->replicas_gc);
-err:
kfree(c->replicas_gc.entries);
c->replicas_gc.entries = NULL;
bch2_cpu_replicas_sort(&new);
- ret = bch2_cpu_replicas_to_sb_replicas(c, &new);
- if (ret)
- goto err;
+ ret = bch2_cpu_replicas_to_sb_replicas(c, &new) ?:
+ replicas_table_update(c, &new);
- ret = replicas_table_update(c, &new);
-err:
kfree(new.entries);
percpu_up_write(&c->mark_lock);
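
Both replicas hunks above replace goto-based error ladders with the GNU ?: ("elvis") chaining idiom bcachefs uses throughout: a ?: b yields a when a is nonzero, so a chain of int-returning calls short-circuits at the first error. A freestanding sketch of the equivalence (step names hypothetical):

#include <errno.h>

static int step_one(void) { return 0; }
static int step_two(void) { return -ENOSPC; }

static int chained(void)
{
	/* step_two() runs only when step_one() returned 0; the first
	 * nonzero error value propagates, exactly like the goto ladder */
	return step_one() ?: step_two();
}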
struct bch_replicas_cpu new_r = { 0, 0, NULL };
int ret = 0;
- if ((sb_v1 = bch2_sb_get_replicas(c->disk_sb.sb)))
+ if ((sb_v1 = bch2_sb_field_get(c->disk_sb.sb, replicas)))
ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r);
- else if ((sb_v0 = bch2_sb_get_replicas_v0(c->disk_sb.sb)))
+ else if ((sb_v0 = bch2_sb_field_get(c->disk_sb.sb, replicas_v0)))
ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r);
if (ret)
return ret;
for_each_cpu_replicas_entry(r, src)
bytes += replicas_entry_bytes(src) - 1;
- sb_r = bch2_sb_resize_replicas_v0(&c->disk_sb,
+ sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0,
DIV_ROUND_UP(bytes, sizeof(u64)));
if (!sb_r)
return -BCH_ERR_ENOSPC_sb_replicas;
bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas);
- sb_r = bch2_sb_get_replicas_v0(c->disk_sb.sb);
+ sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0);
memset(&sb_r->entries, 0,
vstruct_end(&sb_r->field) -
if (!need_v1)
return bch2_cpu_replicas_to_sb_replicas_v0(c, r);
- sb_r = bch2_sb_resize_replicas(&c->disk_sb,
+ sb_r = bch2_sb_field_resize(&c->disk_sb, replicas,
DIV_ROUND_UP(bytes, sizeof(u64)));
if (!sb_r)
return -BCH_ERR_ENOSPC_sb_replicas;
bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0);
- sb_r = bch2_sb_get_replicas(c->disk_sb.sb);
+ sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas);
memset(&sb_r->entries, 0,
vstruct_end(&sb_r->field) -
struct bch_sb *sb,
struct printbuf *err)
{
- struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
unsigned i, j;
sort_cmp_size(cpu_r->entries,
}
for (j = 0; j < e->nr_devs; j++)
- if (!bch2_dev_exists(sb, mi, e->devs[j])) {
+ if (!bch2_dev_exists(sb, e->devs[j])) {
prt_printf(err, "invalid device %u in entry ", e->devs[j]);
bch2_replicas_entry_to_text(err, e);
return -BCH_ERR_invalid_sb_replicas;
struct bch_sb_field_replicas_v0 *replicas_v0;
unsigned i, data_has = 0;
- replicas = bch2_sb_get_replicas(sb);
- replicas_v0 = bch2_sb_get_replicas_v0(sb);
+ replicas = bch2_sb_field_get(sb, replicas);
+ replicas_v0 = bch2_sb_field_get(sb, replicas_v0);
if (replicas) {
struct bch_replicas_entry *r;
#include <crypto/hash.h>
#include <crypto/sha2.h>
+typedef unsigned __bitwise bch_str_hash_flags_t;
+
+enum bch_str_hash_flags {
+ __BCH_HASH_SET_MUST_CREATE,
+ __BCH_HASH_SET_MUST_REPLACE,
+};
+
+#define BCH_HASH_SET_MUST_CREATE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_CREATE)
+#define BCH_HASH_SET_MUST_REPLACE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_REPLACE)
+
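The new bch_str_hash_flags_t follows the kernel's __bitwise convention (cf. gfp_t): under sparse the typedef is incompatible with plain unsigned, so mixing up the two flags arguments of bch2_hash_set() becomes a static warning, while for ordinary compilers __bitwise and __force expand to nothing and the generated code is unchanged. A freestanding sketch of the pattern, with stub macros standing in for the sparse annotations:

#ifdef __CHECKER__			/* defined when sparse runs */
#define __bitwise	__attribute__((bitwise))
#define __force		__attribute__((force))
#else
#define __bitwise
#define __force
#endif
#define BIT(n)		(1UL << (n))

typedef unsigned __bitwise demo_flags_t;

#define DEMO_MUST_CREATE	((__force demo_flags_t) BIT(0))
#define DEMO_MUST_REPLACE	((__force demo_flags_t) BIT(1))

static int demo_set(demo_flags_t flags)
{
	/* passing a bare int or a different __bitwise type here is
	 * flagged by sparse, but compiles identically without it */
	return !!(flags & DEMO_MUST_CREATE);
}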
static inline enum bch_str_hash_type
bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
{
const struct bch_hash_info *info,
subvol_inum inum, u32 snapshot,
struct bkey_i *insert,
- int flags,
+ bch_str_hash_flags_t str_hash_flags,
int update_flags)
{
struct btree_iter iter, slot = { NULL };
}
if (!slot.path &&
- !(flags & BCH_HASH_SET_MUST_REPLACE))
+ !(str_hash_flags & BCH_HASH_SET_MUST_REPLACE))
bch2_trans_copy_iter(&slot, &iter);
if (k.k->type != KEY_TYPE_hash_whiteout)
found = true;
not_found:
- if (!found && (flags & BCH_HASH_SET_MUST_REPLACE)) {
+ if (!found && (str_hash_flags & BCH_HASH_SET_MUST_REPLACE)) {
ret = -BCH_ERR_ENOENT_str_hash_set_must_replace;
- } else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) {
+ } else if (found && (str_hash_flags & BCH_HASH_SET_MUST_CREATE)) {
ret = -EEXIST;
} else {
if (!found && slot.path)
swap(iter, slot);
insert->k.p = iter.pos;
- ret = bch2_trans_update(trans, &iter, insert, 0);
+ ret = bch2_trans_update(trans, &iter, insert, update_flags);
}
goto out;
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
subvol_inum inum,
- struct bkey_i *insert, int flags)
+ struct bkey_i *insert,
+ bch_str_hash_flags_t str_hash_flags)
{
u32 snapshot;
int ret;
insert->k.p.inode = inum.inum;
return bch2_hash_set_snapshot(trans, desc, info, inum,
- snapshot, insert, flags, 0);
+ snapshot, insert, str_hash_flags, 0);
}
static __always_inline
#include "errcode.h"
#include "error.h"
#include "fs.h"
+#include "snapshot.h"
#include "subvolume.h"
#include <linux/random.h>
static int bch2_subvolume_delete(struct btree_trans *, u32);
-static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
-{
- const struct snapshot_t *s = __snapshot_t(t, id);
-
- if (s->skip[2] <= ancestor)
- return s->skip[2];
- if (s->skip[1] <= ancestor)
- return s->skip[1];
- if (s->skip[0] <= ancestor)
- return s->skip[0];
- return s->parent;
-}
-
-bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
-{
- struct snapshot_table *t;
- bool ret;
-
- EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots);
-
- rcu_read_lock();
- t = rcu_dereference(c->snapshots);
-
- while (id && id < ancestor - IS_ANCESTOR_BITMAP)
- id = get_ancestor_below(t, id, ancestor);
-
- ret = id && id < ancestor
- ? test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor)
- : id == ancestor;
- rcu_read_unlock();
-
- return ret;
-}
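
This deleted run is not dropped functionality: per the added #include "snapshot.h", the snapshot machinery moves into new snapshot.c/snapshot.h files. The is-ancestor walk is worth pausing on. Snapshot IDs are allocated downward from U32_MAX, so an ancestor always has the numerically larger ID; each node caches three skip pointers to ancestors at randomized depths plus a 128-bit is_ancestor bitmap covering its nearest ancestors, letting the walk take large strides until it lands within IS_ANCESTOR_BITMAP of the target and then answer with a single bit test. A self-contained sketch of the same walk over a toy table (layout simplified; the real table indexes U32_MAX - id and lives under RCU):

#include <stdbool.h>
#include <stdint.h>

#define IS_ANCESTOR_BITMAP 128

struct snap {
	uint32_t parent;
	uint32_t skip[3];	/* ancestors at randomized depths, sorted ascending */
	uint64_t is_ancestor[IS_ANCESTOR_BITMAP / 64];
};

static struct snap table[256];	/* toy table, indexed directly by id */

/* largest skip-list stride that does not overshoot @ancestor */
static uint32_t ancestor_below(uint32_t id, uint32_t ancestor)
{
	const struct snap *s = &table[id];

	if (s->skip[2] <= ancestor) return s->skip[2];
	if (s->skip[1] <= ancestor) return s->skip[1];
	if (s->skip[0] <= ancestor) return s->skip[0];
	return s->parent;
}

static bool is_ancestor(uint32_t id, uint32_t ancestor)
{
	while (id && id < ancestor - IS_ANCESTOR_BITMAP)
		id = ancestor_below(id, ancestor);

	if (!(id && id < ancestor))
		return id == ancestor;

	/* within bitmap range: finish with one bit test */
	uint32_t d = ancestor - id - 1;
	return (table[id].is_ancestor[d / 64] >> (d % 64)) & 1;
}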
-
-static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
-{
- struct snapshot_table *t;
-
- rcu_read_lock();
- t = rcu_dereference(c->snapshots);
-
- while (id && id < ancestor)
- id = __snapshot_t(t, id)->parent;
- rcu_read_unlock();
-
- return id == ancestor;
-}
-
-static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent)
-{
- u32 depth;
-
- rcu_read_lock();
- depth = parent ? snapshot_t(c, parent)->depth + 1 : 0;
- rcu_read_unlock();
-
- return depth;
-}
-
-static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
-{
- size_t idx = U32_MAX - id;
- size_t new_size;
- struct snapshot_table *new, *old;
-
- new_size = max(16UL, roundup_pow_of_two(idx + 1));
-
- new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL);
- if (!new)
- return NULL;
-
- old = rcu_dereference_protected(c->snapshots, true);
- if (old)
- memcpy(new->s,
- rcu_dereference_protected(c->snapshots, true)->s,
- sizeof(new->s[0]) * c->snapshot_table_size);
-
- rcu_assign_pointer(c->snapshots, new);
- c->snapshot_table_size = new_size;
- if (old)
- kvfree_rcu(old);
-
- return &rcu_dereference_protected(c->snapshots, true)->s[idx];
-}
-
-static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
-{
- size_t idx = U32_MAX - id;
-
- lockdep_assert_held(&c->snapshot_table_lock);
-
- if (likely(idx < c->snapshot_table_size))
- return &rcu_dereference_protected(c->snapshots, true)->s[idx];
-
- return __snapshot_t_mut(c, id);
-}
-
-/* Snapshot tree: */
-
-void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k);
-
- prt_printf(out, "subvol %u root snapshot %u",
- le32_to_cpu(t.v->master_subvol),
- le32_to_cpu(t.v->root_snapshot));
-}
-
-int bch2_snapshot_tree_invalid(const struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
- struct printbuf *err)
-{
- if (bkey_gt(k.k->p, POS(0, U32_MAX)) ||
- bkey_lt(k.k->p, POS(0, 1))) {
- prt_printf(err, "bad pos");
- return -BCH_ERR_invalid_bkey;
- }
-
- return 0;
-}
-
-int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
- struct bch_snapshot_tree *s)
-{
- int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
- BTREE_ITER_WITH_UPDATES, snapshot_tree, s);
-
- if (bch2_err_matches(ret, ENOENT))
- ret = -BCH_ERR_ENOENT_snapshot_tree;
- return ret;
-}
-
-static struct bkey_i_snapshot_tree *
-__snapshot_tree_create(struct btree_trans *trans)
-{
- struct btree_iter iter;
- int ret = bch2_bkey_get_empty_slot(trans, &iter,
- BTREE_ID_snapshot_trees, POS(0, U32_MAX));
- struct bkey_i_snapshot_tree *s_t;
-
- if (ret == -BCH_ERR_ENOSPC_btree_slot)
- ret = -BCH_ERR_ENOSPC_snapshot_tree;
- if (ret)
- return ERR_PTR(ret);
-
- s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree);
- ret = PTR_ERR_OR_ZERO(s_t);
- bch2_trans_iter_exit(trans, &iter);
- return ret ? ERR_PTR(ret) : s_t;
-}
-
-static int snapshot_tree_create(struct btree_trans *trans,
- u32 root_id, u32 subvol_id, u32 *tree_id)
-{
- struct bkey_i_snapshot_tree *n_tree =
- __snapshot_tree_create(trans);
-
- if (IS_ERR(n_tree))
- return PTR_ERR(n_tree);
-
- n_tree->v.master_subvol = cpu_to_le32(subvol_id);
- n_tree->v.root_snapshot = cpu_to_le32(root_id);
- *tree_id = n_tree->k.p.offset;
- return 0;
-}
-
-/* Snapshot nodes: */
-
-void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
-
- prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u",
- BCH_SNAPSHOT_SUBVOL(s.v),
- BCH_SNAPSHOT_DELETED(s.v),
- le32_to_cpu(s.v->parent),
- le32_to_cpu(s.v->children[0]),
- le32_to_cpu(s.v->children[1]),
- le32_to_cpu(s.v->subvol),
- le32_to_cpu(s.v->tree));
-
- if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth))
- prt_printf(out, " depth %u skiplist %u %u %u",
- le32_to_cpu(s.v->depth),
- le32_to_cpu(s.v->skip[0]),
- le32_to_cpu(s.v->skip[1]),
- le32_to_cpu(s.v->skip[2]));
-}
-
-int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k,
- enum bkey_invalid_flags flags,
- struct printbuf *err)
-{
- struct bkey_s_c_snapshot s;
- u32 i, id;
-
- if (bkey_gt(k.k->p, POS(0, U32_MAX)) ||
- bkey_lt(k.k->p, POS(0, 1))) {
- prt_printf(err, "bad pos");
- return -BCH_ERR_invalid_bkey;
- }
-
- s = bkey_s_c_to_snapshot(k);
-
- id = le32_to_cpu(s.v->parent);
- if (id && id <= k.k->p.offset) {
- prt_printf(err, "bad parent node (%u <= %llu)",
- id, k.k->p.offset);
- return -BCH_ERR_invalid_bkey;
- }
-
- if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) {
- prt_printf(err, "children not normalized");
- return -BCH_ERR_invalid_bkey;
- }
-
- if (s.v->children[0] &&
- s.v->children[0] == s.v->children[1]) {
- prt_printf(err, "duplicate child nodes");
- return -BCH_ERR_invalid_bkey;
- }
-
- for (i = 0; i < 2; i++) {
- id = le32_to_cpu(s.v->children[i]);
-
- if (id >= k.k->p.offset) {
- prt_printf(err, "bad child node (%u >= %llu)",
- id, k.k->p.offset);
- return -BCH_ERR_invalid_bkey;
- }
- }
-
- if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
- if (le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
- le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2])) {
- prt_printf(err, "skiplist not normalized");
- return -BCH_ERR_invalid_bkey;
- }
-
- for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
- id = le32_to_cpu(s.v->skip[i]);
-
- if (!id != !s.v->parent ||
- (s.v->parent &&
- id <= k.k->p.offset)) {
- prt_printf(err, "bad skiplist node %u)", id);
- return -BCH_ERR_invalid_bkey;
- }
- }
- }
-
- return 0;
-}
-
-int bch2_mark_snapshot(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct snapshot_t *t;
- u32 id = new.k->p.offset;
- int ret = 0;
-
- mutex_lock(&c->snapshot_table_lock);
-
- t = snapshot_t_mut(c, id);
- if (!t) {
- ret = -BCH_ERR_ENOMEM_mark_snapshot;
- goto err;
- }
-
- if (new.k->type == KEY_TYPE_snapshot) {
- struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
- u32 parent = id;
-
- t->parent = le32_to_cpu(s.v->parent);
- t->children[0] = le32_to_cpu(s.v->children[0]);
- t->children[1] = le32_to_cpu(s.v->children[1]);
- t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
- t->tree = le32_to_cpu(s.v->tree);
-
- if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) {
- t->depth = le32_to_cpu(s.v->depth);
- t->skip[0] = le32_to_cpu(s.v->skip[0]);
- t->skip[1] = le32_to_cpu(s.v->skip[1]);
- t->skip[2] = le32_to_cpu(s.v->skip[2]);
- } else {
- t->depth = 0;
- t->skip[0] = 0;
- t->skip[1] = 0;
- t->skip[2] = 0;
- }
-
- while ((parent = bch2_snapshot_parent_early(c, parent)) &&
- parent - id - 1 < IS_ANCESTOR_BITMAP)
- __set_bit(parent - id - 1, t->is_ancestor);
-
- if (BCH_SNAPSHOT_DELETED(s.v)) {
- set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
- c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_delete_dead_snapshots);
- }
- } else {
- memset(t, 0, sizeof(*t));
- }
-err:
- mutex_unlock(&c->snapshot_table_lock);
- return ret;
-}
-
-static int snapshot_lookup(struct btree_trans *trans, u32 id,
- struct bch_snapshot *s)
-{
- return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
- BTREE_ITER_WITH_UPDATES, snapshot, s);
-}
-
-static int snapshot_live(struct btree_trans *trans, u32 id)
-{
- struct bch_snapshot v;
- int ret;
-
- if (!id)
- return 0;
-
- ret = snapshot_lookup(trans, id, &v);
- if (bch2_err_matches(ret, ENOENT))
- bch_err(trans->c, "snapshot node %u not found", id);
- if (ret)
- return ret;
-
- return !BCH_SNAPSHOT_DELETED(&v);
-}
-
-static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- unsigned i, nr_live = 0, live_idx = 0;
- struct bkey_s_c_snapshot snap;
- u32 id = k.k->p.offset, child[2];
-
- if (k.k->type != KEY_TYPE_snapshot)
- return 0;
-
- snap = bkey_s_c_to_snapshot(k);
-
- child[0] = le32_to_cpu(snap.v->children[0]);
- child[1] = le32_to_cpu(snap.v->children[1]);
-
- for (i = 0; i < 2; i++) {
- int ret = snapshot_live(trans, child[i]);
-
- if (ret < 0)
- return ret;
-
- if (ret)
- live_idx = i;
- nr_live += ret;
- }
-
- mutex_lock(&c->snapshot_table_lock);
-
- snapshot_t_mut(c, id)->equiv = nr_live == 1
- ? snapshot_t_mut(c, child[live_idx])->equiv
- : id;
-
- mutex_unlock(&c->snapshot_table_lock);
-
- return 0;
-}
-
-/* fsck: */
-
-static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
-{
- return snapshot_t(c, id)->children[child];
-}
-
-static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id)
-{
- return bch2_snapshot_child(c, id, 0);
-}
-
-static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id)
-{
- return bch2_snapshot_child(c, id, 1);
-}
-
-static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
-{
- u32 n, parent;
-
- n = bch2_snapshot_left_child(c, id);
- if (n)
- return n;
-
- while ((parent = bch2_snapshot_parent(c, id))) {
- n = bch2_snapshot_right_child(c, parent);
- if (n && n != id)
- return n;
- id = parent;
- }
-
- return 0;
-}
-
-static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
-{
- u32 id = snapshot_root;
- u32 subvol = 0, s;
-
- while (id) {
- s = snapshot_t(c, id)->subvol;
-
- if (s && (!subvol || s < subvol))
- subvol = s;
-
- id = bch2_snapshot_tree_next(c, id);
- }
-
- return subvol;
-}
-
-static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
- u32 snapshot_root, u32 *subvol_id)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_s_c_subvolume s;
- bool found = false;
- int ret;
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
- 0, k, ret) {
- if (k.k->type != KEY_TYPE_subvolume)
- continue;
-
- s = bkey_s_c_to_subvolume(k);
- if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
- continue;
- if (!BCH_SUBVOLUME_SNAP(s.v)) {
- *subvol_id = s.k->p.offset;
- found = true;
- break;
- }
- }
-
- bch2_trans_iter_exit(trans, &iter);
-
- if (!ret && !found) {
- struct bkey_i_subvolume *s;
-
- *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root);
-
- s = bch2_bkey_get_mut_typed(trans, &iter,
- BTREE_ID_subvolumes, POS(0, *subvol_id),
- 0, subvolume);
- ret = PTR_ERR_OR_ZERO(s);
- if (ret)
- return ret;
-
- SET_BCH_SUBVOLUME_SNAP(&s->v, false);
- }
-
- return ret;
-}
-
-static int check_snapshot_tree(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct bkey_s_c_snapshot_tree st;
- struct bch_snapshot s;
- struct bch_subvolume subvol;
- struct printbuf buf = PRINTBUF;
- u32 root_id;
- int ret;
-
- if (k.k->type != KEY_TYPE_snapshot_tree)
- return 0;
-
- st = bkey_s_c_to_snapshot_tree(k);
- root_id = le32_to_cpu(st.v->root_snapshot);
-
- ret = snapshot_lookup(trans, root_id, &s);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto err;
-
- if (fsck_err_on(ret ||
- root_id != bch2_snapshot_root(c, root_id) ||
- st.k->p.offset != le32_to_cpu(s.tree),
- c,
- "snapshot tree points to missing/incorrect snapshot:\n %s",
- (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
- ret = bch2_btree_delete_at(trans, iter, 0);
- goto err;
- }
-
- ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol),
- false, 0, &subvol);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto err;
-
- if (fsck_err_on(ret, c,
- "snapshot tree points to missing subvolume:\n %s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
- fsck_err_on(!bch2_snapshot_is_ancestor_early(c,
- le32_to_cpu(subvol.snapshot),
- root_id), c,
- "snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
- fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), c,
- "snapshot tree points to snapshot subvolume:\n %s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
- struct bkey_i_snapshot_tree *u;
- u32 subvol_id;
-
- ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
- if (ret)
- goto err;
-
- u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree);
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- goto err;
-
- u->v.master_subvol = cpu_to_le32(subvol_id);
- st = snapshot_tree_i_to_s_c(u);
- }
-err:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-/*
- * For each snapshot_tree, make sure it points to the root of a snapshot tree
- * and that snapshot entry points back to it, or delete it.
- *
- * And, make sure it points to a subvolume within that snapshot tree, or correct
- * it to point to the oldest subvolume within that snapshot tree.
- */
-int bch2_check_snapshot_trees(struct bch_fs *c)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- ret = bch2_trans_run(c,
- for_each_btree_key_commit(&trans, iter,
- BTREE_ID_snapshot_trees, POS_MIN,
- BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
- check_snapshot_tree(&trans, &iter, k)));
-
- if (ret)
- bch_err(c, "error %i checking snapshot trees", ret);
- return ret;
-}
-
-/*
- * Look up snapshot tree for @tree_id and find root,
- * make sure @snap_id is a descendent:
- */
-static int snapshot_tree_ptr_good(struct btree_trans *trans,
- u32 snap_id, u32 tree_id)
-{
- struct bch_snapshot_tree s_t;
- int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
-
- if (bch2_err_matches(ret, ENOENT))
- return 0;
- if (ret)
- return ret;
-
- return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot));
-}
-
-static u32 snapshot_skiplist_get(struct bch_fs *c, u32 id)
-{
- const struct snapshot_t *s;
-
- if (!id)
- return 0;
-
- rcu_read_lock();
- s = snapshot_t(c, id);
- if (s->parent)
- id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth));
- rcu_read_unlock();
-
- return id;
-}
-
-static int snapshot_skiplist_good(struct btree_trans *trans, struct bch_snapshot s)
-{
- struct bch_snapshot a;
- unsigned i;
- int ret;
-
- for (i = 0; i < 3; i++) {
- if (!s.parent != !s.skip[i])
- return false;
-
- if (!s.parent)
- continue;
-
- ret = snapshot_lookup(trans, le32_to_cpu(s.skip[i]), &a);
- if (bch2_err_matches(ret, ENOENT))
- return false;
- if (ret)
- return ret;
-
- if (a.tree != s.tree)
- return false;
- }
-
- return true;
-}
-
-/*
- * snapshot_tree pointer was incorrect: look up root snapshot node, make sure
- * its snapshot_tree pointer is correct (allocate new one if necessary), then
- * update this node's pointer to root node's pointer:
- */
-static int snapshot_tree_ptr_repair(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k,
- struct bch_snapshot *s)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter root_iter;
- struct bch_snapshot_tree s_t;
- struct bkey_s_c_snapshot root;
- struct bkey_i_snapshot *u;
- u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id;
- int ret;
-
- root = bch2_bkey_get_iter_typed(trans, &root_iter,
- BTREE_ID_snapshots, POS(0, root_id),
- BTREE_ITER_WITH_UPDATES, snapshot);
- ret = bkey_err(root);
- if (ret)
- goto err;
-
- tree_id = le32_to_cpu(root.v->tree);
-
- ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
- if (ret && !bch2_err_matches(ret, ENOENT))
- return ret;
-
- if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) {
- u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
- ret = PTR_ERR_OR_ZERO(u) ?:
- snapshot_tree_create(trans, root_id,
- bch2_snapshot_tree_oldest_subvol(c, root_id),
- &tree_id);
- if (ret)
- goto err;
-
- u->v.tree = cpu_to_le32(tree_id);
- if (k.k->p.offset == root_id)
- *s = u->v;
- }
-
- if (k.k->p.offset != root_id) {
- u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- goto err;
-
- u->v.tree = cpu_to_le32(tree_id);
- *s = u->v;
- }
-err:
- bch2_trans_iter_exit(trans, &root_iter);
- return ret;
-}
-
-static int cmp_le32(__le32 l, __le32 r)
-{
- return cmp_int(le32_to_cpu(l), le32_to_cpu(r));
-}
-
-static int check_snapshot(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct bch_snapshot s;
- struct bch_subvolume subvol;
- struct bch_snapshot v;
- struct bkey_i_snapshot *u;
- u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
- u32 real_depth;
- struct printbuf buf = PRINTBUF;
- bool should_have_subvol;
- u32 i, id;
- int ret = 0;
-
- if (k.k->type != KEY_TYPE_snapshot)
- return 0;
-
- memset(&s, 0, sizeof(s));
- memcpy(&s, k.v, bkey_val_bytes(k.k));
-
- id = le32_to_cpu(s.parent);
- if (id) {
- ret = snapshot_lookup(trans, id, &v);
- if (bch2_err_matches(ret, ENOENT))
- bch_err(c, "snapshot with nonexistent parent:\n %s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- if (ret)
- goto err;
-
- if (le32_to_cpu(v.children[0]) != k.k->p.offset &&
- le32_to_cpu(v.children[1]) != k.k->p.offset) {
- bch_err(c, "snapshot parent %u missing pointer to child %llu",
- id, k.k->p.offset);
- ret = -EINVAL;
- goto err;
- }
- }
-
- for (i = 0; i < 2 && s.children[i]; i++) {
- id = le32_to_cpu(s.children[i]);
-
- ret = snapshot_lookup(trans, id, &v);
- if (bch2_err_matches(ret, ENOENT))
- bch_err(c, "snapshot node %llu has nonexistent child %u",
- k.k->p.offset, id);
- if (ret)
- goto err;
-
- if (le32_to_cpu(v.parent) != k.k->p.offset) {
- bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
- id, le32_to_cpu(v.parent), k.k->p.offset);
- ret = -EINVAL;
- goto err;
- }
- }
-
- should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
- !BCH_SNAPSHOT_DELETED(&s);
-
- if (should_have_subvol) {
- id = le32_to_cpu(s.subvol);
- ret = bch2_subvolume_get(trans, id, 0, false, &subvol);
- if (bch2_err_matches(ret, ENOENT))
- bch_err(c, "snapshot points to nonexistent subvolume:\n %s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- if (ret)
- goto err;
-
- if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) {
- bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
- k.k->p.offset);
- ret = -EINVAL;
- goto err;
- }
- } else {
- if (fsck_err_on(s.subvol, c, "snapshot should not point to subvol:\n %s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- goto err;
-
- u->v.subvol = 0;
- s = u->v;
- }
- }
-
- ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree));
- if (ret < 0)
- goto err;
-
- if (fsck_err_on(!ret, c, "snapshot points to missing/incorrect tree:\n %s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
- if (ret)
- goto err;
- }
- ret = 0;
-
- real_depth = bch2_snapshot_depth(c, parent_id);
-
- if (le32_to_cpu(s.depth) != real_depth &&
- (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists ||
- fsck_err(c, "snapshot with incorrect depth field, should be %u:\n %s",
- real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) {
- u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- goto err;
-
- u->v.depth = cpu_to_le32(real_depth);
- s = u->v;
- }
-
- ret = snapshot_skiplist_good(trans, s);
- if (ret < 0)
- goto err;
-
- if (!ret &&
- (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists ||
- fsck_err(c, "snapshot with bad skiplist field:\n %s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) {
- u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- goto err;
-
- for (i = 0; i < ARRAY_SIZE(u->v.skip); i++)
- u->v.skip[i] = cpu_to_le32(snapshot_skiplist_get(c, parent_id));
-
- bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32);
- s = u->v;
- }
- ret = 0;
-err:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-int bch2_check_snapshots(struct bch_fs *c)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- /*
- * We iterate backwards as checking/fixing the depth field requires that
- * the parent's depth already be correct:
- */
- ret = bch2_trans_run(c,
- for_each_btree_key_reverse_commit(&trans, iter,
- BTREE_ID_snapshots, POS_MAX,
- BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
- check_snapshot(&trans, &iter, k)));
- if (ret)
- bch_err_fn(c, ret);
- return ret;
-}
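
The comment in the removed bch2_check_snapshots() captures an ordering constraint worth keeping in mind: depth is defined as the parent's depth plus one, and parents have numerically larger IDs, so iterating the snapshots btree from POS_MAX downward guarantees every parent's depth has been verified (and possibly repaired) before its children are checked. A toy illustration of that dependency, assuming a hypothetical flat table:

#include <stdint.h>

static uint32_t parent_of[16];	/* parent_of[id] > id, or 0 for a root */
static uint32_t depth_of[16];

static void fix_depths(void)
{
	/* highest ID first, mirroring for_each_btree_key_reverse_commit():
	 * depth_of[parent_of[id]] is already final when we read it */
	for (int id = 15; id >= 0; id--) {
		uint32_t p = parent_of[id];

		depth_of[id] = p ? depth_of[p] + 1 : 0;
	}
}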
-
static int check_subvol(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k)
subvol = bkey_s_c_to_subvolume(k);
snapid = le32_to_cpu(subvol.v->snapshot);
- ret = snapshot_lookup(trans, snapid, &snapshot);
+ ret = bch2_snapshot_lookup(trans, snapid, &snapshot);
if (bch2_err_matches(ret, ENOENT))
bch_err(c, "subvolume %llu points to nonexistent snapshot %u",
ret = bch2_subvolume_delete(trans, iter->pos.offset);
if (ret)
- bch_err(c, "error deleting subvolume %llu: %s",
- iter->pos.offset, bch2_err_str(ret));
+ bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
return ret ?: -BCH_ERR_transaction_restart_nested;
}
if (ret)
return ret;
- if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset, c,
+ if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset,
+ c, subvol_not_master_and_not_snapshot,
"subvolume %llu is not set as snapshot but is not master subvolume",
k.k->p.offset)) {
struct bkey_i_subvolume *s =
int ret;
ret = bch2_trans_run(c,
- for_each_btree_key_commit(&trans, iter,
+ for_each_btree_key_commit(trans, iter,
BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
- check_subvol(&trans, &iter, k)));
- if (ret)
- bch_err_fn(c, ret);
- return ret;
-}
-
-void bch2_fs_snapshots_exit(struct bch_fs *c)
-{
- kfree(rcu_dereference_protected(c->snapshots, true));
-}
-
-int bch2_snapshots_read(struct bch_fs *c)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- ret = bch2_trans_run(c,
- for_each_btree_key2(&trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k,
- bch2_mark_snapshot(&trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
- bch2_snapshot_set_equiv(&trans, k)));
+ NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
+ check_subvol(trans, &iter, k)));
if (ret)
bch_err_fn(c, ret);
return ret;
}
-/*
- * Mark a snapshot as deleted, for future cleanup:
- */
-static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
-{
- struct btree_iter iter;
- struct bkey_i_snapshot *s;
- int ret = 0;
-
- s = bch2_bkey_get_mut_typed(trans, &iter,
- BTREE_ID_snapshots, POS(0, id),
- 0, snapshot);
- ret = PTR_ERR_OR_ZERO(s);
- if (unlikely(ret)) {
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
- trans->c, "missing snapshot %u", id);
- return ret;
- }
-
- /* already deleted? */
- if (BCH_SNAPSHOT_DELETED(&s->v))
- goto err;
-
- SET_BCH_SNAPSHOT_DELETED(&s->v, true);
- SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
- s->v.subvol = 0;
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
- struct btree_iter tree_iter = (struct btree_iter) { NULL };
- struct bkey_s_c_snapshot s;
- u32 parent_id;
- unsigned i;
- int ret = 0;
-
- s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
- BTREE_ITER_INTENT, snapshot);
- ret = bkey_err(s);
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
- "missing snapshot %u", id);
-
- if (ret)
- goto err;
-
- BUG_ON(!BCH_SNAPSHOT_DELETED(s.v));
- parent_id = le32_to_cpu(s.v->parent);
-
- if (parent_id) {
- struct bkey_i_snapshot *parent;
-
- parent = bch2_bkey_get_mut_typed(trans, &p_iter,
- BTREE_ID_snapshots, POS(0, parent_id),
- 0, snapshot);
- ret = PTR_ERR_OR_ZERO(parent);
- if (unlikely(ret)) {
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
- "missing snapshot %u", parent_id);
- goto err;
- }
-
- for (i = 0; i < 2; i++)
- if (le32_to_cpu(parent->v.children[i]) == id)
- break;
-
- if (i == 2)
- bch_err(c, "snapshot %u missing child pointer to %u",
- parent_id, id);
- else
- parent->v.children[i] = 0;
-
- if (le32_to_cpu(parent->v.children[0]) <
- le32_to_cpu(parent->v.children[1]))
- swap(parent->v.children[0],
- parent->v.children[1]);
- } else {
- /*
- * We're deleting the root of a snapshot tree: update the
- * snapshot_tree entry to point to the new root, or delete it if
- * this is the last snapshot ID in this tree:
- */
- struct bkey_i_snapshot_tree *s_t;
-
- BUG_ON(s.v->children[1]);
-
- s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
- BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)),
- 0, snapshot_tree);
- ret = PTR_ERR_OR_ZERO(s_t);
- if (ret)
- goto err;
-
- if (s.v->children[0]) {
- s_t->v.root_snapshot = s.v->children[0];
- } else {
- s_t->k.type = KEY_TYPE_deleted;
- set_bkey_val_u64s(&s_t->k, 0);
- }
- }
-
- ret = bch2_btree_delete_at(trans, &iter, 0);
-err:
- bch2_trans_iter_exit(trans, &tree_iter);
- bch2_trans_iter_exit(trans, &p_iter);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
- u32 *new_snapids,
- u32 *snapshot_subvols,
- unsigned nr_snapids)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_i_snapshot *n;
- struct bkey_s_c k;
- unsigned i, j;
- u32 depth = bch2_snapshot_depth(c, parent);
- int ret;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
- POS_MIN, BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek(&iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- for (i = 0; i < nr_snapids; i++) {
- k = bch2_btree_iter_prev_slot(&iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- if (!k.k || !k.k->p.offset) {
- ret = -BCH_ERR_ENOSPC_snapshot_create;
- goto err;
- }
-
- n = bch2_bkey_alloc(trans, &iter, 0, snapshot);
- ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- goto err;
-
- n->v.flags = 0;
- n->v.parent = cpu_to_le32(parent);
- n->v.subvol = cpu_to_le32(snapshot_subvols[i]);
- n->v.tree = cpu_to_le32(tree);
- n->v.depth = cpu_to_le32(depth);
-
- for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
- n->v.skip[j] = cpu_to_le32(snapshot_skiplist_get(c, parent));
-
- bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32);
- SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
-
- ret = bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
- bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
- if (ret)
- goto err;
-
- new_snapids[i] = iter.pos.offset;
- }
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/*
- * Create new snapshot IDs as children of an existing snapshot ID:
- */
-static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent,
- u32 *new_snapids,
- u32 *snapshot_subvols,
- unsigned nr_snapids)
-{
- struct btree_iter iter;
- struct bkey_i_snapshot *n_parent;
- int ret = 0;
-
- n_parent = bch2_bkey_get_mut_typed(trans, &iter,
- BTREE_ID_snapshots, POS(0, parent),
- 0, snapshot);
- ret = PTR_ERR_OR_ZERO(n_parent);
- if (unlikely(ret)) {
- if (bch2_err_matches(ret, ENOENT))
- bch_err(trans->c, "snapshot %u not found", parent);
- return ret;
- }
-
- if (n_parent->v.children[0] || n_parent->v.children[1]) {
- bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
- ret = -EINVAL;
- goto err;
- }
-
- ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree),
- new_snapids, snapshot_subvols, nr_snapids);
- if (ret)
- goto err;
-
- n_parent->v.children[0] = cpu_to_le32(new_snapids[0]);
- n_parent->v.children[1] = cpu_to_le32(new_snapids[1]);
- n_parent->v.subvol = 0;
- SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-/*
- * Create a snapshot node that is the root of a new tree:
- */
-static int bch2_snapshot_node_create_tree(struct btree_trans *trans,
- u32 *new_snapids,
- u32 *snapshot_subvols,
- unsigned nr_snapids)
-{
- struct bkey_i_snapshot_tree *n_tree;
- int ret;
-
- n_tree = __snapshot_tree_create(trans);
- ret = PTR_ERR_OR_ZERO(n_tree) ?:
- create_snapids(trans, 0, n_tree->k.p.offset,
- new_snapids, snapshot_subvols, nr_snapids);
- if (ret)
- return ret;
-
- n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]);
- n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]);
- return 0;
-}
-
-int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
- u32 *new_snapids,
- u32 *snapshot_subvols,
- unsigned nr_snapids)
-{
- BUG_ON((parent == 0) != (nr_snapids == 1));
- BUG_ON((parent != 0) != (nr_snapids == 2));
-
- return parent
- ? bch2_snapshot_node_create_children(trans, parent,
- new_snapids, snapshot_subvols, nr_snapids)
- : bch2_snapshot_node_create_tree(trans,
- new_snapids, snapshot_subvols, nr_snapids);
-
-}
-
-static int snapshot_delete_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k,
- snapshot_id_list *deleted,
- snapshot_id_list *equiv_seen,
- struct bpos *last_pos)
-{
- struct bch_fs *c = trans->c;
- u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
-
- if (!bkey_eq(k.k->p, *last_pos))
- equiv_seen->nr = 0;
- *last_pos = k.k->p;
-
- if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
- snapshot_list_has_id(equiv_seen, equiv)) {
- return bch2_btree_delete_at(trans, iter,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
- } else {
- return snapshot_list_add(c, equiv_seen, equiv);
- }
-}
-
-static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c k)
-{
- struct bkey_s_c_snapshot snap;
- u32 children[2];
- int ret;
-
- if (k.k->type != KEY_TYPE_snapshot)
- return 0;
-
- snap = bkey_s_c_to_snapshot(k);
- if (BCH_SNAPSHOT_DELETED(snap.v) ||
- BCH_SNAPSHOT_SUBVOL(snap.v))
- return 0;
-
- children[0] = le32_to_cpu(snap.v->children[0]);
- children[1] = le32_to_cpu(snap.v->children[1]);
-
- ret = snapshot_live(trans, children[0]) ?:
- snapshot_live(trans, children[1]);
- if (ret < 0)
- return ret;
-
- if (!ret)
- return bch2_snapshot_node_set_deleted(trans, k.k->p.offset);
- return 0;
-}
+/* Subvolumes: */
-int bch2_delete_dead_snapshots(struct bch_fs *c)
+int bch2_subvolume_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bkey_invalid_flags flags, struct printbuf *err)
{
- struct btree_trans trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_s_c_snapshot snap;
- snapshot_id_list deleted = { 0 };
- u32 i, id;
int ret = 0;
- if (!test_bit(BCH_FS_STARTED, &c->flags)) {
- ret = bch2_fs_read_write_early(c);
- if (ret) {
- bch_err(c, "error deleleting dead snapshots: error going rw: %s", bch2_err_str(ret));
- return ret;
- }
- }
-
- bch2_trans_init(&trans, c, 0, 0);
-
- /*
- * For every snapshot node: If we have no live children and it's not
- * pointed to by a subvolume, delete it:
- */
- ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k,
- NULL, NULL, 0,
- bch2_delete_redundant_snapshot(&trans, &iter, k));
- if (ret) {
- bch_err(c, "error deleting redundant snapshots: %s", bch2_err_str(ret));
- goto err;
- }
-
- for_each_btree_key2(&trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k,
- bch2_snapshot_set_equiv(&trans, k));
- if (ret) {
- bch_err(c, "error in bch2_snapshots_set_equiv: %s", bch2_err_str(ret));
- goto err;
- }
-
- for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k, ret) {
- if (k.k->type != KEY_TYPE_snapshot)
- continue;
-
- snap = bkey_s_c_to_snapshot(k);
- if (BCH_SNAPSHOT_DELETED(snap.v)) {
- ret = snapshot_list_add(c, &deleted, k.k->p.offset);
- if (ret)
- break;
- }
- }
- bch2_trans_iter_exit(&trans, &iter);
-
- if (ret) {
- bch_err(c, "error walking snapshots: %s", bch2_err_str(ret));
- goto err;
- }
-
- for (id = 0; id < BTREE_ID_NR; id++) {
- struct bpos last_pos = POS_MIN;
- snapshot_id_list equiv_seen = { 0 };
-
- if (!btree_type_has_snapshots(id))
- continue;
-
- ret = for_each_btree_key_commit(&trans, iter,
- id, POS_MIN,
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
- NULL, NULL, BTREE_INSERT_NOFAIL,
- snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos));
-
- darray_exit(&equiv_seen);
-
- if (ret) {
- bch_err(c, "error deleting snapshot keys: %s", bch2_err_str(ret));
- goto err;
- }
- }
-
- for (i = 0; i < deleted.nr; i++) {
- ret = commit_do(&trans, NULL, NULL, 0,
- bch2_snapshot_node_delete(&trans, deleted.data[i]));
- if (ret) {
- bch_err(c, "error deleting snapshot %u: %s",
- deleted.data[i], bch2_err_str(ret));
- goto err;
- }
- }
-
- clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
-err:
- darray_exit(&deleted);
- bch2_trans_exit(&trans);
- if (ret)
- bch_err_fn(c, ret);
+ bkey_fsck_err_on(bkey_lt(k.k->p, SUBVOL_POS_MIN) ||
+ bkey_gt(k.k->p, SUBVOL_POS_MAX), c, err,
+ subvol_pos_bad,
+ "invalid pos");
+fsck_err:
return ret;
}
-static void bch2_delete_dead_snapshots_work(struct work_struct *work)
-{
- struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
-
- if (test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags))
- bch2_delete_dead_snapshots(c);
- bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
-}
-
-void bch2_delete_dead_snapshots_async(struct bch_fs *c)
-{
- if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) &&
- !queue_work(c->write_ref_wq, &c->snapshot_delete_work))
- bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
-}
-
-static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans,
- struct btree_trans_commit_hook *h)
-{
- struct bch_fs *c = trans->c;
-
- set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
-
- if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_delete_dead_snapshots)
- return 0;
-
- bch2_delete_dead_snapshots_async(c);
- return 0;
-}
-
-/* Subvolumes: */
-
-int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k,
- unsigned flags, struct printbuf *err)
-{
- if (bkey_lt(k.k->p, SUBVOL_POS_MIN) ||
- bkey_gt(k.k->p, SUBVOL_POS_MAX)) {
- prt_printf(err, "invalid pos");
- return -BCH_ERR_invalid_bkey;
- }
-
- return 0;
-}
-
void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
{
struct bch_snapshot snap;
- return snapshot_lookup(trans, snapshot, &snap) ?:
+ return bch2_snapshot_lookup(trans, snapshot, &snap) ?:
bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol);
}
-int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol,
+int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
u32 *snapid)
{
struct btree_iter iter;
- struct bkey_s_c k;
+ struct bkey_s_c_subvolume subvol;
int ret;
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_subvolumes, POS(0, subvol),
- BTREE_ITER_CACHED|
- BTREE_ITER_WITH_UPDATES);
- ret = bkey_err(k) ?: k.k->type == KEY_TYPE_subvolume ? 0 : -BCH_ERR_ENOENT_subvolume;
+ subvol = bch2_bkey_get_iter_typed(trans, &iter,
+ BTREE_ID_subvolumes, POS(0, subvolid),
+ BTREE_ITER_CACHED|BTREE_ITER_WITH_UPDATES,
+ subvolume);
+ ret = bkey_err(subvol);
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
+ "missing subvolume %u", subvolid);
if (likely(!ret))
- *snapid = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot);
- else if (bch2_err_matches(ret, ENOENT))
- bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvol);
+ *snapid = le32_to_cpu(subvol.v->snapshot);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
}
/*
- * Scan for subvolumes with parent @subvolid_to_delete, reparent:
+ * Separate from the snapshot tree in the snapshots btree, we record the tree
+ * structure of how snapshot subvolumes were created - the parent subvolume of
+ * each snapshot subvolume.
+ *
+ * When a subvolume is deleted, we scan for child subvolumes and reparent them,
+ * to avoid dangling references:
*/
static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_delete)
{
BTREE_ITER_CACHED, &s)) ?:
for_each_btree_key_commit(trans, iter,
BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_NOFAIL,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_subvolume_reparent(trans, &iter, k,
subvolid_to_delete, le32_to_cpu(s.parent)));
}
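
The rewritten comment above makes the invariant explicit: subvolume keys record creation parentage separately from the snapshot tree, so deleting a subvolume must first re-point every child at the deleted subvolume's own parent. A minimal sketch of that fix-up over a plain array (the real code performs it as the for_each_btree_key_commit() scan shown, committing with BCH_TRANS_COMMIT_no_enospc):

#include <stddef.h>
#include <stdint.h>

struct subvol { uint32_t id, creation_parent; };

/* before deleting @victim, adopt its children to its own parent
 * so no subvolume is left pointing at a dead ID */
static void reparent_children(struct subvol *v, size_t nr,
			      const struct subvol *victim)
{
	for (size_t i = 0; i < nr; i++)
		if (v[i].creation_parent == victim->id)
			v[i].creation_parent = victim->creation_parent;
}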
{
struct btree_iter iter;
struct bkey_s_c_subvolume subvol;
- struct btree_trans_commit_hook *h;
u32 snapid;
int ret = 0;
snapid = le32_to_cpu(subvol.v->snapshot);
- ret = bch2_btree_delete_at(trans, &iter, 0);
- if (ret)
- goto err;
-
- ret = bch2_snapshot_node_set_deleted(trans, snapid);
- if (ret)
- goto err;
-
- h = bch2_trans_kmalloc(trans, sizeof(*h));
- ret = PTR_ERR_OR_ZERO(h);
- if (ret)
- goto err;
-
- h->fn = bch2_delete_dead_snapshots_hook;
- bch2_trans_commit_hook(trans, h);
-err:
+ ret = bch2_btree_delete_at(trans, &iter, 0) ?:
+ bch2_snapshot_node_set_deleted(trans, snapid);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
{
return bch2_subvolumes_reparent(trans, subvolid) ?:
- commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
__bch2_subvolume_delete(trans, subvolid));
}
bch2_evict_subvolume_inodes(c, &s);
for (id = s.data; id < s.data + s.nr; id++) {
- ret = bch2_trans_run(c, bch2_subvolume_delete(&trans, *id));
+ ret = bch2_trans_run(c, bch2_subvolume_delete(trans, *id));
if (ret) {
- bch_err(c, "error deleting subvolume %u: %s", *id, bch2_err_str(ret));
+ bch_err_msg(c, ret, "deleting subvolume %u", *id);
break;
}
}
enum bkey_invalid_flags;
-void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-int bch2_snapshot_tree_invalid(const struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
-
-#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \
- .key_invalid = bch2_snapshot_tree_invalid, \
- .val_to_text = bch2_snapshot_tree_to_text, \
- .min_val_size = 8, \
-})
-
-int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *);
-
-void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c,
- enum bkey_invalid_flags, struct printbuf *);
-int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s_c, unsigned);
-
-#define bch2_bkey_ops_snapshot ((struct bkey_ops) { \
- .key_invalid = bch2_snapshot_invalid, \
- .val_to_text = bch2_snapshot_to_text, \
- .atomic_trigger = bch2_mark_snapshot, \
- .min_val_size = 24, \
-})
-
-static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id)
-{
- return &t->s[U32_MAX - id];
-}
-
-static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
-{
- return __snapshot_t(rcu_dereference(c->snapshots), id);
-}
-
-static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id)
-{
- rcu_read_lock();
- id = snapshot_t(c, id)->tree;
- rcu_read_unlock();
-
- return id;
-}
-
-static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
-{
- return snapshot_t(c, id)->parent;
-}
-
-static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
-{
- rcu_read_lock();
- id = __bch2_snapshot_parent_early(c, id);
- rcu_read_unlock();
-
- return id;
-}
-
-static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- u32 parent = snapshot_t(c, id)->parent;
-
- if (parent &&
- snapshot_t(c, id)->depth != snapshot_t(c, parent)->depth + 1)
- panic("id %u depth=%u parent %u depth=%u\n",
- id, snapshot_t(c, id)->depth,
- parent, snapshot_t(c, parent)->depth);
-
- return parent;
-#else
- return snapshot_t(c, id)->parent;
-#endif
-}
-
-static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id)
-{
- rcu_read_lock();
- id = __bch2_snapshot_parent(c, id);
- rcu_read_unlock();
-
- return id;
-}
-
-static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n)
-{
- rcu_read_lock();
- while (n--)
- id = __bch2_snapshot_parent(c, id);
- rcu_read_unlock();
-
- return id;
-}
-
-static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id)
-{
- u32 parent;
-
- rcu_read_lock();
- while ((parent = __bch2_snapshot_parent(c, id)))
- id = parent;
- rcu_read_unlock();
-
- return id;
-}
-
-static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id)
-{
- return snapshot_t(c, id)->equiv;
-}
-
-static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id)
-{
- rcu_read_lock();
- id = __bch2_snapshot_equiv(c, id);
- rcu_read_unlock();
-
- return id;
-}
-
-static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id)
-{
- return id == bch2_snapshot_equiv(c, id);
-}
-
-static inline bool bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id)
-{
- const struct snapshot_t *s;
- bool ret;
-
- rcu_read_lock();
- s = snapshot_t(c, id);
- ret = s->children[0];
- rcu_read_unlock();
-
- return ret;
-}
-
-static inline u32 bch2_snapshot_is_leaf(struct bch_fs *c, u32 id)
-{
- return !bch2_snapshot_is_internal_node(c, id);
-}
-
-static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id)
-{
- const struct snapshot_t *s;
- u32 parent = __bch2_snapshot_parent(c, id);
-
- if (!parent)
- return 0;
-
- s = snapshot_t(c, __bch2_snapshot_parent(c, id));
- if (id == s->children[0])
- return s->children[1];
- if (id == s->children[1])
- return s->children[0];
- return 0;
-}
-
-bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32);
-
-static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
-{
- return id == ancestor
- ? true
- : __bch2_snapshot_is_ancestor(c, id, ancestor);
-}
-
-static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id)
-{
- const struct snapshot_t *t;
- bool ret;
-
- rcu_read_lock();
- t = snapshot_t(c, id);
- ret = (t->children[0]|t->children[1]) != 0;
- rcu_read_unlock();
-
- return ret;
-}
-
-static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id)
-{
- u32 *i;
-
- darray_for_each(*s, i)
- if (*i == id)
- return true;
- return false;
-}
-
-static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id)
-{
- u32 *i;
-
- darray_for_each(*s, i)
- if (bch2_snapshot_is_ancestor(c, id, *i))
- return true;
- return false;
-}
-
-static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id)
-{
- int ret;
-
- BUG_ON(snapshot_list_has_id(s, id));
- ret = darray_push(s, id);
- if (ret)
- bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
- return ret;
-}
-
-int bch2_check_snapshot_trees(struct bch_fs *);
-int bch2_check_snapshots(struct bch_fs *);
int bch2_check_subvols(struct bch_fs *);
-void bch2_fs_snapshots_exit(struct bch_fs *);
-int bch2_snapshots_read(struct bch_fs *);
-
-int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c,
- unsigned, struct printbuf *);
+int bch2_subvolume_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bkey_invalid_flags, struct printbuf *);
void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_subvolume ((struct bkey_ops) { \
int bch2_subvolume_get(struct btree_trans *, unsigned,
bool, int, struct bch_subvolume *);
-int bch2_snapshot_get_subvol(struct btree_trans *, u32,
- struct bch_subvolume *);
int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
-/* only exported for tests: */
-int bch2_snapshot_node_create(struct btree_trans *, u32,
- u32 *, u32 *, unsigned);
-
int bch2_delete_dead_snapshots(struct bch_fs *);
void bch2_delete_dead_snapshots_async(struct bch_fs *);
};
struct snapshot_table {
- struct snapshot_t s[0];
+ DECLARE_FLEX_ARRAY(struct snapshot_t, s);
};
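
DECLARE_FLEX_ARRAY replaces the GNU zero-length s[0] idiom with a C99 flexible array member. The wrinkle is that a flexible member may not be a struct's sole member; the kernel macro works around that by wrapping the array in an anonymous struct alongside an empty placeholder. Roughly what the macro expands to here (simplified from include/linux/stddef.h; kernel context assumed):

struct snapshot_table {
	struct {
		struct { } __empty_s;	/* keeps the flex array from being the only member */
		struct snapshot_t s[];	/* C99 flexible array member */
	};
};

Allocation is unchanged: the snapshot table code still sizes it with struct_size(new, s, new_size) under kvzalloc(), which checks the multiply for overflow.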
typedef struct {
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
-#include "btree_update_interior.h"
-#include "buckets.h"
#include "checksum.h"
#include "counters.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
-#include "io.h"
#include "journal.h"
-#include "journal_io.h"
#include "journal_sb.h"
#include "journal_seq_blacklist.h"
#include "recovery.h"
#include "replicas.h"
#include "quota.h"
+#include "sb-clean.h"
+#include "sb-errors.h"
+#include "sb-members.h"
#include "super-io.h"
#include "super.h"
#include "trace.h"
#include <linux/backing-dev.h>
#include <linux/sort.h>
+static const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
+};
+
struct bch2_metadata_version {
u16 version;
const char *name;
static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *,
struct printbuf *);
-struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb,
+struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *sb,
enum bch_sb_field_type type)
{
struct bch_sb_field *f;
void bch2_sb_field_delete(struct bch_sb_handle *sb,
enum bch_sb_field_type type)
{
- struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type);
+ struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type);
if (f)
__bch2_sb_field_resize(sb, f, 0);
{
kfree(sb->bio);
if (!IS_ERR_OR_NULL(sb->bdev))
- blkdev_put(sb->bdev, sb->mode);
+ blkdev_put(sb->bdev, sb->holder);
+ kfree(sb->holder);
kfree(sb->sb);
memset(sb, 0, sizeof(*sb));
if (sb->sb && sb->buffer_size >= new_buffer_size)
return 0;
- if (sb->have_layout) {
+ if (sb->sb && sb->have_layout) {
u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
if (new_bytes > max_bytes) {
if (dynamic_fault("bcachefs:add:super_realloc"))
return -BCH_ERR_ENOMEM_sb_realloc_injected;
+ new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO);
+ if (!new_sb)
+ return -BCH_ERR_ENOMEM_sb_buf_realloc;
+
+ sb->sb = new_sb;
+
if (sb->have_bio) {
- unsigned nr_bvecs = DIV_ROUND_UP(new_buffer_size, PAGE_SIZE);
+ unsigned nr_bvecs = buf_pages(sb->sb, new_buffer_size);
bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
if (!bio)
sb->bio = bio;
}
- new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO);
- if (!new_sb)
- return -BCH_ERR_ENOMEM_sb_buf_realloc;
-
- sb->sb = new_sb;
sb->buffer_size = new_buffer_size;
return 0;
}
-struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb,
+struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb,
enum bch_sb_field_type type,
unsigned u64s)
{
- struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type);
+ struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type);
ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
ssize_t d = -old_u64s + u64s;
/* XXX: we're not checking that offline device have enough space */
for_each_online_member(ca, c, i) {
- struct bch_sb_handle *sb = &ca->disk_sb;
+ struct bch_sb_handle *dev_sb = &ca->disk_sb;
- if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) {
+ if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) {
percpu_ref_put(&ca->ref);
return NULL;
}
}
}
- f = bch2_sb_field_get(sb->sb, type);
+ f = bch2_sb_field_get_id(sb->sb, type);
f = __bch2_sb_field_resize(sb, f, u64s);
if (f)
f->type = cpu_to_le32(type);
{
struct bch_sb *sb = disk_sb->sb;
struct bch_sb_field *f;
- struct bch_sb_field_members *mi;
+ struct bch_sb_field_members_v1 *mi;
enum bch_opt_id opt_id;
u16 block_size;
int ret;
}
if (bch2_is_zero(sb->uuid.b, sizeof(sb->uuid))) {
- prt_printf(out, "Bad intenal UUID (got zeroes)");
+ prt_printf(out, "Bad internal UUID (got zeroes)");
return -BCH_ERR_invalid_sb_uuid;
}
}
/* members must be validated first: */
- mi = bch2_sb_get_members(sb);
+ mi = bch2_sb_field_get(sb, members_v1);
if (!mi) {
prt_printf(out, "Invalid superblock: member info area missing");
return -BCH_ERR_invalid_sb_members_missing;
return ret;
vstruct_for_each(sb, f) {
- if (le32_to_cpu(f->type) == BCH_SB_FIELD_members)
+ if (le32_to_cpu(f->type) == BCH_SB_FIELD_members_v1)
continue;
ret = bch2_sb_field_validate(sb, f, out);
static void bch2_sb_update(struct bch_fs *c)
{
struct bch_sb *src = c->disk_sb.sb;
- struct bch_sb_field_members *mi = bch2_sb_get_members(src);
struct bch_dev *ca;
unsigned i;
c->sb.features = le64_to_cpu(src->features[0]);
c->sb.compat = le64_to_cpu(src->compat[0]);
- for_each_member_device(ca, c, i)
- ca->mi = bch2_mi_to_cpu(mi->members + i);
+ for_each_member_device(ca, c, i) {
+ struct bch_member m = bch2_sb_member_get(src, i);
+ ca->mi = bch2_mi_to_cpu(&m);
+ }
}
static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src)
if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS)
continue;
- src_f = bch2_sb_field_get(src, i);
- dst_f = bch2_sb_field_get(dst, i);
+ src_f = bch2_sb_field_get_id(src, i);
+ dst_f = bch2_sb_field_get_id(dst, i);
d = (src_f ? le32_to_cpu(src_f->u64s) : 0) -
(dst_f ? le32_to_cpu(dst_f->u64s) : 0);
if (d > 0) {
- int ret = bch2_sb_realloc(dst_handle, le32_to_cpu(dst_handle->sb->u64s) + d);
+ int ret = bch2_sb_realloc(dst_handle,
+ le32_to_cpu(dst_handle->sb->u64s) + d);
+
if (ret)
return ret;
dst = dst_handle->sb;
- dst_f = bch2_sb_field_get(dst, i);
+ dst_f = bch2_sb_field_get_id(dst, i);
}
dst_f = __bch2_sb_field_resize(dst_handle, dst_f,
retry:
#endif
memset(sb, 0, sizeof(*sb));
- sb->mode = FMODE_READ;
+ sb->mode = BLK_OPEN_READ;
sb->have_bio = true;
+ sb->holder = kmalloc(1, GFP_KERNEL);
+ if (!sb->holder)
+ return -ENOMEM;
#ifndef __KERNEL__
if (opt_get(*opts, direct_io) == false)
- sb->mode |= FMODE_BUFFERED;
+ sb->mode |= BLK_OPEN_BUFFERED;
#endif
if (!opt_get(*opts, noexcl))
- sb->mode |= FMODE_EXCL;
+ sb->mode |= BLK_OPEN_EXCL;
if (!opt_get(*opts, nochanges))
- sb->mode |= FMODE_WRITE;
+ sb->mode |= BLK_OPEN_WRITE;
- sb->bdev = blkdev_get_by_path(path, sb->mode, sb);
+ sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
if (IS_ERR(sb->bdev) &&
PTR_ERR(sb->bdev) == -EACCES &&
opt_get(*opts, read_only)) {
- sb->mode &= ~FMODE_WRITE;
+ sb->mode &= ~BLK_OPEN_WRITE;
- sb->bdev = blkdev_get_by_path(path, sb->mode, sb);
+ sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
if (!IS_ERR(sb->bdev))
opt_set(*opts, nochanges, true);
}
if (opt_defined(*opts, sb))
goto err;
- printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s",
+ printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s\n",
path, err.buf);
printbuf_reset(&err);
ret = bch2_sb_validate(sb, &err, READ);
if (ret) {
- printk(KERN_ERR "bcachefs (%s): error validating superblock: %s",
+ printk(KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
path, err.buf);
goto err_no_print;
}
printbuf_exit(&err);
return ret;
err:
- printk(KERN_ERR "bcachefs (%s): error reading superblock: %s",
+ printk(KERN_ERR "bcachefs (%s): error reading superblock: %s\n",
path, err.buf);
err_no_print:
bch2_free_super(sb);
/* XXX: return errors directly */
- if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write error: %s",
+ if (bch2_dev_io_err_on(bio->bi_status, ca,
+ bio_data_dir(bio)
+ ? BCH_MEMBER_ERROR_write
+ : BCH_MEMBER_ERROR_read,
+ "superblock %s error: %s",
+ bio_data_dir(bio) ? "write" : "read",
bch2_blk_status_to_str(bio->bi_status)))
ca->sb_write_error = 1;
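/*
 * Superblock IO errors are now attributed per member and per direction
 * (BCH_MEMBER_ERROR_read/write), feeding the per-device errors[] counters
 * that __bch2_dev_alloc() loads from the member entry further down in this
 * patch.
 */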
SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN);
bch2_sb_counters_from_cpu(c);
+ bch2_sb_members_from_cpu(c);
+ bch2_sb_members_cpy_v2_v1(&c->disk_sb);
+ bch2_sb_errors_from_cpu(c);
for_each_online_member(ca, c, i)
bch2_sb_from_fs(c, ca);
mutex_unlock(&c->sb_lock);
}
-/* BCH_SB_FIELD_members: */
-
-static int bch2_sb_members_validate(struct bch_sb *sb,
- struct bch_sb_field *f,
- struct printbuf *err)
-{
- struct bch_sb_field_members *mi = field_to_type(f, members);
- unsigned i;
-
- if ((void *) (mi->members + sb->nr_devices) >
- vstruct_end(&mi->field)) {
- prt_printf(err, "too many devices for section size");
- return -BCH_ERR_invalid_sb_members;
- }
-
- for (i = 0; i < sb->nr_devices; i++) {
- struct bch_member *m = mi->members + i;
-
- if (!bch2_member_exists(m))
- continue;
-
- if (le64_to_cpu(m->nbuckets) > LONG_MAX) {
- prt_printf(err, "device %u: too many buckets (got %llu, max %lu)",
- i, le64_to_cpu(m->nbuckets), LONG_MAX);
- return -BCH_ERR_invalid_sb_members;
- }
-
- if (le64_to_cpu(m->nbuckets) -
- le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) {
- prt_printf(err, "device %u: not enough buckets (got %llu, max %u)",
- i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS);
- return -BCH_ERR_invalid_sb_members;
- }
-
- if (le16_to_cpu(m->bucket_size) <
- le16_to_cpu(sb->block_size)) {
- prt_printf(err, "device %u: bucket size %u smaller than block size %u",
- i, le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size));
- return -BCH_ERR_invalid_sb_members;
- }
-
- if (le16_to_cpu(m->bucket_size) <
- BCH_SB_BTREE_NODE_SIZE(sb)) {
- prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu",
- i, le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb));
- return -BCH_ERR_invalid_sb_members;
- }
- }
-
- return 0;
-}
-
-static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_members *mi = field_to_type(f, members);
- struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb);
- unsigned i;
-
- for (i = 0; i < sb->nr_devices; i++) {
- struct bch_member *m = mi->members + i;
- unsigned data_have = bch2_sb_dev_has_data(sb, i);
- u64 bucket_size = le16_to_cpu(m->bucket_size);
- u64 device_size = le64_to_cpu(m->nbuckets) * bucket_size;
-
- if (!bch2_member_exists(m))
- continue;
-
- prt_printf(out, "Device:");
- prt_tab(out);
- prt_printf(out, "%u", i);
- prt_newline(out);
-
- printbuf_indent_add(out, 2);
-
- prt_printf(out, "UUID:");
- prt_tab(out);
- pr_uuid(out, m->uuid.b);
- prt_newline(out);
-
- prt_printf(out, "Size:");
- prt_tab(out);
- prt_units_u64(out, device_size << 9);
- prt_newline(out);
-
- prt_printf(out, "Bucket size:");
- prt_tab(out);
- prt_units_u64(out, bucket_size << 9);
- prt_newline(out);
-
- prt_printf(out, "First bucket:");
- prt_tab(out);
- prt_printf(out, "%u", le16_to_cpu(m->first_bucket));
- prt_newline(out);
-
- prt_printf(out, "Buckets:");
- prt_tab(out);
- prt_printf(out, "%llu", le64_to_cpu(m->nbuckets));
- prt_newline(out);
-
- prt_printf(out, "Last mount:");
- prt_tab(out);
- if (m->last_mount)
- pr_time(out, le64_to_cpu(m->last_mount));
- else
- prt_printf(out, "(never)");
- prt_newline(out);
-
- prt_printf(out, "State:");
- prt_tab(out);
- prt_printf(out, "%s",
- BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR
- ? bch2_member_states[BCH_MEMBER_STATE(m)]
- : "unknown");
- prt_newline(out);
-
- prt_printf(out, "Label:");
- prt_tab(out);
- if (BCH_MEMBER_GROUP(m)) {
- unsigned idx = BCH_MEMBER_GROUP(m) - 1;
-
- if (idx < disk_groups_nr(gi))
- prt_printf(out, "%s (%u)",
- gi->entries[idx].label, idx);
- else
- prt_printf(out, "(bad disk labels section)");
- } else {
- prt_printf(out, "(none)");
- }
- prt_newline(out);
-
- prt_printf(out, "Data allowed:");
- prt_tab(out);
- if (BCH_MEMBER_DATA_ALLOWED(m))
- prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(m));
- else
- prt_printf(out, "(none)");
- prt_newline(out);
-
- prt_printf(out, "Has data:");
- prt_tab(out);
- if (data_have)
- prt_bitflags(out, bch2_data_types, data_have);
- else
- prt_printf(out, "(none)");
- prt_newline(out);
-
- prt_printf(out, "Discard:");
- prt_tab(out);
- prt_printf(out, "%llu", BCH_MEMBER_DISCARD(m));
- prt_newline(out);
-
- prt_printf(out, "Freespace initialized:");
- prt_tab(out);
- prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(m));
- prt_newline(out);
-
- printbuf_indent_sub(out, 2);
- }
-}
-
-static const struct bch_sb_field_ops bch_sb_field_ops_members = {
- .validate = bch2_sb_members_validate,
- .to_text = bch2_sb_members_to_text,
-};
-
-/* BCH_SB_FIELD_crypt: */
-
-static int bch2_sb_crypt_validate(struct bch_sb *sb,
- struct bch_sb_field *f,
- struct printbuf *err)
-{
- struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
-
- if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) {
- prt_printf(err, "wrong size (got %zu should be %zu)",
- vstruct_bytes(&crypt->field), sizeof(*crypt));
- return -BCH_ERR_invalid_sb_crypt;
- }
-
- if (BCH_CRYPT_KDF_TYPE(crypt)) {
- prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt));
- return -BCH_ERR_invalid_sb_crypt;
- }
-
- return 0;
-}
-
-static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
-
- prt_printf(out, "KFD: %llu", BCH_CRYPT_KDF_TYPE(crypt));
- prt_newline(out);
- prt_printf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt));
- prt_newline(out);
- prt_printf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt));
- prt_newline(out);
- prt_printf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt));
- prt_newline(out);
-}
-
-static const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
- .validate = bch2_sb_crypt_validate,
- .to_text = bch2_sb_crypt_to_text,
-};
-
-/* BCH_SB_FIELD_clean: */
-
-int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, int write)
-{
- struct jset_entry *entry;
- int ret;
-
- for (entry = clean->start;
- entry < (struct jset_entry *) vstruct_end(&clean->field);
- entry = vstruct_next(entry)) {
- ret = bch2_journal_entry_validate(c, NULL, entry,
- le16_to_cpu(c->disk_sb.sb->version),
- BCH_SB_BIG_ENDIAN(c->disk_sb.sb),
- write);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
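/*
 * The members, crypt and clean field ops deleted above, together with the
 * clean-shutdown helpers (bch2_fs_mark_dirty()/bch2_fs_mark_clean() and the
 * jset entry builders), are not dropped outright: judging by the
 * sb-members.h, sb-clean.h and sb-errors.h includes added later in this
 * patch, they move into dedicated sb-*.c/h files.
 */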
/* Downgrade if superblock is at a higher version than currently supported: */
void bch2_sb_maybe_downgrade(struct bch_fs *c)
{
c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
}
-int bch2_fs_mark_dirty(struct bch_fs *c)
-{
- int ret;
-
- /*
- * Unconditionally write superblock, to verify it hasn't changed before
- * we go rw:
- */
-
- mutex_lock(&c->sb_lock);
- SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
-
- bch2_sb_maybe_downgrade(c);
- c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS);
-
- ret = bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
- return ret;
-}
-
-static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
-{
- struct jset_entry *entry = *end;
- unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
-
- memset(entry, 0, u64s * sizeof(u64));
- /*
- * The u64s field counts from the start of data, ignoring the shared
- * fields.
- */
- entry->u64s = cpu_to_le16(u64s - 1);
-
- *end = vstruct_next(*end);
- return entry;
-}
-
-void bch2_journal_super_entries_add_common(struct bch_fs *c,
- struct jset_entry **end,
- u64 journal_seq)
-{
- struct bch_dev *ca;
- unsigned i, dev;
-
- percpu_down_read(&c->mark_lock);
-
- if (!journal_seq) {
- for (i = 0; i < ARRAY_SIZE(c->usage); i++)
- bch2_fs_usage_acc_to_base(c, i);
- } else {
- bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK);
- }
-
- {
- struct jset_entry_usage *u =
- container_of(jset_entry_init(end, sizeof(*u)),
- struct jset_entry_usage, entry);
-
- u->entry.type = BCH_JSET_ENTRY_usage;
- u->entry.btree_id = BCH_FS_USAGE_inodes;
- u->v = cpu_to_le64(c->usage_base->nr_inodes);
- }
-
- {
- struct jset_entry_usage *u =
- container_of(jset_entry_init(end, sizeof(*u)),
- struct jset_entry_usage, entry);
-
- u->entry.type = BCH_JSET_ENTRY_usage;
- u->entry.btree_id = BCH_FS_USAGE_key_version;
- u->v = cpu_to_le64(atomic64_read(&c->key_version));
- }
-
- for (i = 0; i < BCH_REPLICAS_MAX; i++) {
- struct jset_entry_usage *u =
- container_of(jset_entry_init(end, sizeof(*u)),
- struct jset_entry_usage, entry);
-
- u->entry.type = BCH_JSET_ENTRY_usage;
- u->entry.btree_id = BCH_FS_USAGE_reserved;
- u->entry.level = i;
- u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]);
- }
-
- for (i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry *e =
- cpu_replicas_entry(&c->replicas, i);
- struct jset_entry_data_usage *u =
- container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs),
- struct jset_entry_data_usage, entry);
-
- u->entry.type = BCH_JSET_ENTRY_data_usage;
- u->v = cpu_to_le64(c->usage_base->replicas[i]);
- unsafe_memcpy(&u->r, e, replicas_entry_bytes(e),
- "embedded variable length struct");
- }
-
- for_each_member_device(ca, c, dev) {
- unsigned b = sizeof(struct jset_entry_dev_usage) +
- sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR;
- struct jset_entry_dev_usage *u =
- container_of(jset_entry_init(end, b),
- struct jset_entry_dev_usage, entry);
-
- u->entry.type = BCH_JSET_ENTRY_dev_usage;
- u->dev = cpu_to_le32(dev);
- u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec);
-
- for (i = 0; i < BCH_DATA_NR; i++) {
- u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets);
- u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors);
- u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented);
- }
- }
-
- percpu_up_read(&c->mark_lock);
-
- for (i = 0; i < 2; i++) {
- struct jset_entry_clock *clock =
- container_of(jset_entry_init(end, sizeof(*clock)),
- struct jset_entry_clock, entry);
-
- clock->entry.type = BCH_JSET_ENTRY_clock;
- clock->rw = i;
- clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now));
- }
-}
-
-void bch2_fs_mark_clean(struct bch_fs *c)
-{
- struct bch_sb_field_clean *sb_clean;
- struct jset_entry *entry;
- unsigned u64s;
- int ret;
-
- mutex_lock(&c->sb_lock);
- if (BCH_SB_CLEAN(c->disk_sb.sb))
- goto out;
-
- SET_BCH_SB_CLEAN(c->disk_sb.sb, true);
-
- c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
- c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata);
- c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates));
- c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled));
-
- u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved;
-
- sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s);
- if (!sb_clean) {
- bch_err(c, "error resizing superblock while setting filesystem clean");
- goto out;
- }
-
- sb_clean->flags = 0;
- sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq));
-
- /* Trying to catch outstanding bug: */
- BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
-
- entry = sb_clean->start;
- bch2_journal_super_entries_add_common(c, &entry, 0);
- entry = bch2_btree_roots_to_journal_entries(c, entry, entry);
- BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
-
- memset(entry, 0,
- vstruct_end(&sb_clean->field) - (void *) entry);
-
- /*
- * this should be in the write path, and we should be validating every
- * superblock section:
- */
- ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE);
- if (ret) {
- bch_err(c, "error writing marking filesystem clean: validate error");
- goto out;
- }
-
- bch2_write_super(c);
-out:
- mutex_unlock(&c->sb_lock);
-}
-
-static int bch2_sb_clean_validate(struct bch_sb *sb,
- struct bch_sb_field *f,
- struct printbuf *err)
-{
- struct bch_sb_field_clean *clean = field_to_type(f, clean);
-
- if (vstruct_bytes(&clean->field) < sizeof(*clean)) {
- prt_printf(err, "wrong size (got %zu should be %zu)",
- vstruct_bytes(&clean->field), sizeof(*clean));
- return -BCH_ERR_invalid_sb_clean;
- }
-
- return 0;
-}
-
-static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
-{
- struct bch_sb_field_clean *clean = field_to_type(f, clean);
- struct jset_entry *entry;
-
- prt_printf(out, "flags: %x", le32_to_cpu(clean->flags));
- prt_newline(out);
- prt_printf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq));
- prt_newline(out);
-
- for (entry = clean->start;
- entry != vstruct_end(&clean->field);
- entry = vstruct_next(entry)) {
- if (entry->type == BCH_JSET_ENTRY_btree_keys &&
- !entry->u64s)
- continue;
-
- bch2_journal_entry_to_text(out, NULL, entry);
- prt_newline(out);
- }
-}
-
-static const struct bch_sb_field_ops bch_sb_field_ops_clean = {
- .validate = bch2_sb_clean_validate,
- .to_text = bch2_sb_clean_to_text,
-};
-
static const struct bch_sb_field_ops *bch2_sb_field_ops[] = {
#define x(f, nr) \
[BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f,
void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
bool print_layout, unsigned fields)
{
- struct bch_sb_field_members *mi;
struct bch_sb_field *f;
u64 fields_have = 0;
unsigned nr_devices = 0;
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 44);
- mi = bch2_sb_get_members(sb);
- if (mi) {
- struct bch_member *m;
-
- for (m = mi->members;
- m < mi->members + sb->nr_devices;
- m++)
- nr_devices += bch2_member_exists(m);
- }
+ for (int i = 0; i < sb->nr_devices; i++)
+ nr_devices += bch2_dev_exists(sb, i);
prt_printf(out, "External UUID:");
prt_tab(out);
prt_printf(out, "Created:");
prt_tab(out);
if (sb->time_base_lo)
- pr_time(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC));
+ bch2_prt_datetime(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC));
else
prt_printf(out, "(not set)");
prt_newline(out);
#include "eytzinger.h"
#include "super_types.h"
#include "super.h"
+#include "sb-members.h"
#include <asm/byteorder.h>
unsigned,
unsigned);
-struct bch_sb_field *bch2_sb_field_get(struct bch_sb *, enum bch_sb_field_type);
-struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *,
- enum bch_sb_field_type, unsigned);
-void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type);
+static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f)
+{
+ return le32_to_cpu(f->u64s) * sizeof(u64);
+}
#define field_to_type(_f, _name) \
container_of_or_null(_f, struct bch_sb_field_##_name, field)
-#define x(_name, _nr) \
-static inline struct bch_sb_field_##_name * \
-bch2_sb_get_##_name(struct bch_sb *sb) \
-{ \
- return field_to_type(bch2_sb_field_get(sb, \
- BCH_SB_FIELD_##_name), _name); \
-} \
- \
-static inline struct bch_sb_field_##_name * \
-bch2_sb_resize_##_name(struct bch_sb_handle *sb, unsigned u64s) \
-{ \
- return field_to_type(bch2_sb_field_resize(sb, \
- BCH_SB_FIELD_##_name, u64s), _name); \
-}
+struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *, enum bch_sb_field_type);
+#define bch2_sb_field_get(_sb, _name) \
+ field_to_type(bch2_sb_field_get_id(_sb, BCH_SB_FIELD_##_name), _name)
+
+struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *,
+ enum bch_sb_field_type, unsigned);
+#define bch2_sb_field_resize(_sb, _name, _u64s) \
+ field_to_type(bch2_sb_field_resize_id(_sb, BCH_SB_FIELD_##_name, _u64s), _name)
-BCH_SB_FIELDS()
-#undef x
+void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type);
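/*
 * With the generic accessors, the x-macro that generated one
 * bch2_sb_get_<field>()/bch2_sb_resize_<field>() pair per field type is
 * gone; callers name the field directly. A sketch of the expansion,
 * assuming the macros above:
 *
 *	struct bch_sb_field_members_v2 *mi =
 *		bch2_sb_field_get(sb, members_v2);
 *	// expands to:
 *	// field_to_type(bch2_sb_field_get_id(sb, BCH_SB_FIELD_members_v2),
 *	//		 members_v2)
 */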
extern const char * const bch2_sb_fields[];
static inline __le64 bch2_sb_magic(struct bch_fs *c)
{
__le64 ret;
+
memcpy(&ret, &c->sb.uuid, sizeof(ret));
return ret;
}
__bch2_check_set_feature(c, feat);
}
-/* BCH_SB_FIELD_members: */
-
-static inline bool bch2_member_exists(struct bch_member *m)
-{
- return !bch2_is_zero(&m->uuid, sizeof(m->uuid));
-}
-
-static inline bool bch2_dev_exists(struct bch_sb *sb,
- struct bch_sb_field_members *mi,
- unsigned dev)
-{
- return dev < sb->nr_devices &&
- bch2_member_exists(&mi->members[dev]);
-}
-
-static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
-{
- return (struct bch_member_cpu) {
- .nbuckets = le64_to_cpu(mi->nbuckets),
- .first_bucket = le16_to_cpu(mi->first_bucket),
- .bucket_size = le16_to_cpu(mi->bucket_size),
- .group = BCH_MEMBER_GROUP(mi),
- .state = BCH_MEMBER_STATE(mi),
- .discard = BCH_MEMBER_DISCARD(mi),
- .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi),
- .durability = BCH_MEMBER_DURABILITY(mi)
- ? BCH_MEMBER_DURABILITY(mi) - 1
- : 1,
- .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi),
- .valid = bch2_member_exists(mi),
- };
-}
-
-/* BCH_SB_FIELD_clean: */
-
-void bch2_journal_super_entries_add_common(struct bch_fs *,
- struct jset_entry **, u64);
-
-int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int);
-
void bch2_sb_maybe_downgrade(struct bch_fs *);
void bch2_sb_upgrade(struct bch_fs *, unsigned);
-int bch2_fs_mark_dirty(struct bch_fs *);
-void bch2_fs_mark_clean(struct bch_fs *);
-
void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
struct bch_sb_field *);
void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *);
#include "bkey_sort.h"
#include "btree_cache.h"
#include "btree_gc.h"
+#include "btree_journal_iter.h"
#include "btree_key_cache.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "error.h"
#include "fs.h"
#include "fs-io.h"
+#include "fs-io-buffered.h"
+#include "fs-io-direct.h"
#include "fsck.h"
#include "inode.h"
-#include "io.h"
+#include "io_read.h"
+#include "io_write.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "rebalance.h"
#include "recovery.h"
#include "replicas.h"
+#include "sb-clean.h"
+#include "sb-errors.h"
+#include "sb-members.h"
+#include "snapshot.h"
#include "subvolume.h"
#include "super.h"
#include "super-io.h"
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
+MODULE_DESCRIPTION("bcachefs filesystem");
#define KTYPE(type) \
static const struct attribute_group type ## _group = { \
bch_info(c, "going read-write");
+ ret = bch2_sb_members_v2_init(c);
+ if (ret)
+ goto err;
+
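/*
 * bch2_sb_members_v2_init() runs before the first superblock write when
 * going read-write; presumably it creates or refreshes the members_v2
 * field from members_v1 on filesystems that predate it.
 */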
ret = bch2_fs_mark_dirty(c);
if (ret)
goto err;
return ret;
}
+ ret = bch2_journal_reclaim_start(&c->journal);
+ if (ret)
+ goto err;
+
if (!early) {
ret = bch2_fs_read_write_late(c);
if (ret)
#ifndef BCH_WRITE_REF_DEBUG
percpu_ref_reinit(&c->writes);
#else
- for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) {
+ for (i = 0; i < BCH_WRITE_REF_NR; i++) {
BUG_ON(atomic_long_read(&c->writes[i]));
atomic_long_inc(&c->writes[i]);
}
static void __bch2_fs_free(struct bch_fs *c)
{
unsigned i;
- int cpu;
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_exit(&c->times[i]);
bch2_free_pending_node_rewrites(c);
+ bch2_fs_sb_errors_exit(c);
bch2_fs_counters_exit(c);
bch2_fs_snapshots_exit(c);
bch2_fs_quota_exit(c);
+ bch2_fs_fs_io_direct_exit(c);
+ bch2_fs_fs_io_buffered_exit(c);
bch2_fs_fsio_exit(c);
bch2_fs_ec_exit(c);
bch2_fs_encryption_exit(c);
- bch2_fs_io_exit(c);
+ bch2_fs_nocow_locking_exit(c);
+ bch2_fs_io_write_exit(c);
+ bch2_fs_io_read_exit(c);
bch2_fs_buckets_waiting_for_journal_exit(c);
bch2_fs_btree_interior_update_exit(c);
bch2_fs_btree_iter_exit(c);
percpu_free_rwsem(&c->mark_lock);
free_percpu(c->online_reserved);
- if (c->btree_paths_bufs)
- for_each_possible_cpu(cpu)
- kfree(per_cpu_ptr(c->btree_paths_bufs, cpu)->path);
-
darray_exit(&c->btree_roots_extra);
- free_percpu(c->btree_paths_bufs);
free_percpu(c->pcpu);
mempool_exit(&c->large_bkey_pool);
mempool_exit(&c->btree_bounce_pool);
cancel_work_sync(&ca->io_error_work);
cancel_work_sync(&c->read_only_work);
-
- for (i = 0; i < c->sb.nr_devices; i++) {
- struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true);
-
- if (ca)
- bch2_free_super(&ca->disk_sb);
- }
}
void bch2_fs_free(struct bch_fs *c)
closure_sync(&c->cl);
closure_debug_destroy(&c->cl);
- for (i = 0; i < c->sb.nr_devices; i++)
- if (c->devs[i])
- bch2_dev_free(rcu_dereference_protected(c->devs[i], 1));
+ for (i = 0; i < c->sb.nr_devices; i++) {
+ struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true);
+
+ if (ca) {
+ bch2_free_super(&ca->disk_sb);
+ bch2_dev_free(ca);
+ }
+ }
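/*
 * Freeing each member's superblock handle moves out of the earlier
 * offline-device loop (deleted above) and into bch2_fs_free(), so it
 * happens exactly once per device, immediately before bch2_dev_free().
 */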
bch_verbose(c, "shutdown complete");
ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?:
kobject_add(&c->internal, &c->kobj, "internal") ?:
kobject_add(&c->opts_dir, &c->kobj, "options") ?:
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
kobject_add(&c->time_stats, &c->kobj, "time_stats") ?:
+#endif
kobject_add(&c->counters_kobj, &c->kobj, "counters") ?:
bch2_opts_create_sysfs_files(&c->opts_dir);
if (ret) {
static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
{
- struct bch_sb_field_members *mi;
struct bch_fs *c;
struct printbuf name = PRINTBUF;
unsigned i, iter_size;
bch2_fs_quota_init(c);
bch2_fs_ec_init_early(c);
bch2_fs_move_init(c);
+ bch2_fs_sb_errors_init_early(c);
INIT_LIST_HEAD(&c->list);
mutex_init(&c->bio_bounce_pages_lock);
mutex_init(&c->snapshot_table_lock);
+ init_rwsem(&c->snapshot_create_lock);
spin_lock_init(&c->btree_write_error_lock);
INIT_LIST_HEAD(&c->journal_iters);
- INIT_LIST_HEAD(&c->fsck_errors);
- mutex_init(&c->fsck_error_lock);
+ INIT_LIST_HEAD(&c->fsck_error_msgs);
+ mutex_init(&c->fsck_error_msgs_lock);
seqcount_init(&c->gc_pos_lock);
c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write];
c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write];
- c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal];
c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq];
bch2_fs_btree_cache_init_early(&c->btree_cache);
c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc;
if (c->opts.inodes_use_key_cache)
c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes;
+ c->btree_key_cache_btrees |= 1U << BTREE_ID_logged_ops;
c->block_bits = ilog2(block_sectors(c));
c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c);
BIOSET_NEED_BVECS) ||
!(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
!(c->online_reserved = alloc_percpu(u64)) ||
- !(c->btree_paths_bufs = alloc_percpu(struct btree_path_buf)) ||
mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
btree_bytes(c)) ||
mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
}
ret = bch2_fs_counters_init(c) ?:
+ bch2_fs_sb_errors_init(c) ?:
bch2_io_clock_init(&c->io_clock[READ]) ?:
bch2_io_clock_init(&c->io_clock[WRITE]) ?:
bch2_fs_journal_init(&c->journal) ?:
bch2_fs_buckets_waiting_for_journal_init(c) ?:
bch2_fs_btree_write_buffer_init(c) ?:
bch2_fs_subvolumes_init(c) ?:
- bch2_fs_io_init(c) ?:
+ bch2_fs_io_read_init(c) ?:
+ bch2_fs_io_write_init(c) ?:
bch2_fs_nocow_locking_init(c) ?:
bch2_fs_encryption_init(c) ?:
bch2_fs_compress_init(c) ?:
bch2_fs_ec_init(c) ?:
- bch2_fs_fsio_init(c);
+ bch2_fs_fsio_init(c) ?:
+ bch2_fs_fs_io_buffered_init(c) ?:
+ bch2_fs_fs_io_direct_init(c);
if (ret)
goto err;
- mi = bch2_sb_get_members(c->disk_sb.sb);
for (i = 0; i < c->sb.nr_devices; i++)
- if (bch2_dev_exists(c->disk_sb.sb, mi, i) &&
+ if (bch2_dev_exists(c->disk_sb.sb, i) &&
bch2_dev_alloc(c, i)) {
ret = -EEXIST;
goto err;
int bch2_fs_start(struct bch_fs *c)
{
- struct bch_sb_field_members *mi;
struct bch_dev *ca;
time64_t now = ktime_get_real_seconds();
unsigned i;
mutex_lock(&c->sb_lock);
- for_each_online_member(ca, c, i)
- bch2_sb_from_fs(c, ca);
+ ret = bch2_sb_members_v2_init(c);
+ if (ret) {
+ mutex_unlock(&c->sb_lock);
+ goto err;
+ }
- mi = bch2_sb_get_members(c->disk_sb.sb);
for_each_online_member(ca, c, i)
- mi->members[ca->dev_idx].last_mount = cpu_to_le64(now);
+ bch2_members_v2_get_mut(c->disk_sb.sb, i)->last_mount = cpu_to_le64(now);
mutex_unlock(&c->sb_lock);
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
- for (i = 0; i < BCH_TRANSACTIONS_NR; i++) {
- mutex_lock(&c->btree_transaction_stats[i].lock);
- bch2_time_stats_init(&c->btree_transaction_stats[i].lock_hold_times);
- mutex_unlock(&c->btree_transaction_stats[i].lock);
- }
-
ret = BCH_SB_INITIALIZED(c->disk_sb.sb)
? bch2_fs_recovery(c)
: bch2_fs_initialize(c);
up_write(&c->state_lock);
return ret;
err:
- bch_err(c, "error starting filesystem: %s", bch2_err_str(ret));
+ bch_err_msg(c, ret, "starting filesystem");
goto out;
}
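/*
 * The bch_err_msg() conversions throughout this patch rely on the helper
 * appending the decoded error code itself, which is why the explicit
 * bch2_err_str(ret) arguments disappear from the format strings.
 */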
static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
{
- struct bch_sb_field_members *sb_mi;
-
- sb_mi = bch2_sb_get_members(sb);
- if (!sb_mi)
- return -BCH_ERR_member_info_missing;
+ struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
if (le16_to_cpu(sb->block_size) != block_sectors(c))
return -BCH_ERR_mismatched_block_size;
- if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) <
+ if (le16_to_cpu(m.bucket_size) <
BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb))
return -BCH_ERR_bucket_size_too_small;
{
struct bch_sb *newest =
le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? fs : sb;
- struct bch_sb_field_members *mi = bch2_sb_get_members(newest);
if (!uuid_equal(&fs->uuid, &sb->uuid))
return -BCH_ERR_device_not_a_member_of_filesystem;
- if (!bch2_dev_exists(newest, mi, sb->dev_idx))
+ if (!bch2_dev_exists(newest, sb->dev_idx))
return -BCH_ERR_device_has_been_removed;
if (fs->block_size != sb->block_size)
struct bch_member *member)
{
struct bch_dev *ca;
+ unsigned i;
ca = kzalloc(sizeof(*ca), GFP_KERNEL);
if (!ca)
bch2_time_stats_init(&ca->io_latency[WRITE]);
ca->mi = bch2_mi_to_cpu(member);
+
+ for (i = 0; i < ARRAY_SIZE(member->errors); i++)
+ atomic64_set(&ca->errors[i], le64_to_cpu(member->errors[i]));
+
ca->uuid = member->uuid;
ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
{
- struct bch_member *member =
- bch2_sb_get_members(c->disk_sb.sb)->members + dev_idx;
+ struct bch_member member = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
struct bch_dev *ca = NULL;
int ret = 0;
if (bch2_fs_init_fault("dev_alloc"))
goto err;
- ca = __bch2_dev_alloc(c, member);
+ ca = __bch2_dev_alloc(c, &member);
if (!ca)
goto err;
/* Commit: */
ca->disk_sb = *sb;
- if (sb->mode & FMODE_EXCL)
- ca->disk_sb.bdev->bd_holder = ca;
memset(sb, 0, sizeof(*sb));
ca->dev = ca->disk_sb.bdev->bd_dev;
static bool bch2_fs_may_start(struct bch_fs *c)
{
- struct bch_sb_field_members *mi;
struct bch_dev *ca;
unsigned i, flags = 0;
if (!c->opts.degraded &&
!c->opts.very_degraded) {
mutex_lock(&c->sb_lock);
- mi = bch2_sb_get_members(c->disk_sb.sb);
for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
- if (!bch2_dev_exists(c->disk_sb.sb, mi, i))
+ if (!bch2_dev_exists(c->disk_sb.sb, i))
continue;
ca = bch_dev_locked(c, i);
int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
enum bch_member_state new_state, int flags)
{
- struct bch_sb_field_members *mi;
+ struct bch_member *m;
int ret = 0;
if (ca->mi.state == new_state)
bch_notice(ca, "%s", bch2_member_states[new_state]);
mutex_lock(&c->sb_lock);
- mi = bch2_sb_get_members(c->disk_sb.sb);
- SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], new_state);
+ m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+ SET_BCH_MEMBER_STATE(m, new_state);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
BTREE_TRIGGER_NORUN, NULL);
if (ret)
- bch_err(c, "error removing dev alloc info: %s", bch2_err_str(ret));
+ bch_err_msg(c, ret, "removing dev alloc info");
return ret;
}
int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
{
- struct bch_sb_field_members *mi;
+ struct bch_member *m;
unsigned dev_idx = ca->dev_idx, data;
int ret;
ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
if (ret) {
- bch_err(ca, "Remove failed: error dropping data: %s", bch2_err_str(ret));
+ bch_err_msg(ca, ret, "dropping data");
goto err;
}
ret = bch2_dev_remove_alloc(c, ca);
if (ret) {
- bch_err(ca, "Remove failed, error deleting alloc info");
+ bch_err_msg(ca, ret, "deleting alloc info");
goto err;
}
ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
if (ret) {
- bch_err(ca, "Remove failed: error flushing journal: %s", bch2_err_str(ret));
+ bch_err_msg(ca, ret, "flushing journal");
goto err;
}
ret = bch2_journal_flush(&c->journal);
if (ret) {
- bch_err(ca, "Remove failed, journal error");
+ bch_err(ca, "journal error");
goto err;
}
ret = bch2_replicas_gc2(c);
if (ret) {
- bch_err(ca, "Remove failed: error from replicas gc: %s", bch2_err_str(ret));
+ bch_err_msg(ca, ret, "in replicas_gc2()");
goto err;
}
* this device must be gone:
*/
mutex_lock(&c->sb_lock);
- mi = bch2_sb_get_members(c->disk_sb.sb);
- memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid));
+ m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);
+ memset(&m->uuid, 0, sizeof(m->uuid));
bch2_write_super(c);
struct bch_opts opts = bch2_opts_empty();
struct bch_sb_handle sb;
struct bch_dev *ca = NULL;
- struct bch_sb_field_members *mi;
+ struct bch_sb_field_members_v2 *mi;
struct bch_member dev_mi;
unsigned dev_idx, nr_devices, u64s;
struct printbuf errbuf = PRINTBUF;
ret = bch2_read_super(path, &opts, &sb);
if (ret) {
- bch_err(c, "device add error: error reading super: %s", bch2_err_str(ret));
+ bch_err_msg(c, ret, "reading super");
goto err;
}
- dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx];
+ dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);
if (BCH_MEMBER_GROUP(&dev_mi)) {
- bch2_disk_path_to_text(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
+ bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
if (label.allocation_failure) {
ret = -ENOMEM;
goto err;
ret = bch2_dev_may_add(sb.sb, c);
if (ret) {
- bch_err(c, "device add error: %s", bch2_err_str(ret));
+ bch_err_fn(c, ret);
goto err;
}
ca = __bch2_dev_alloc(c, &dev_mi);
if (!ca) {
- bch2_free_super(&sb);
ret = -ENOMEM;
goto err;
}
bch2_dev_usage_init(ca);
ret = __bch2_dev_attach_bdev(ca, &sb);
- if (ret) {
- bch2_dev_free(ca);
+ if (ret)
goto err;
- }
ret = bch2_dev_journal_alloc(ca);
if (ret) {
- bch_err(c, "device add error: journal alloc failed");
+ bch_err_msg(c, ret, "allocating journal");
goto err;
}
ret = bch2_sb_from_fs(c, ca);
if (ret) {
- bch_err(c, "device add error: new device superblock too small");
- goto err_unlock;
- }
-
- mi = bch2_sb_get_members(ca->disk_sb.sb);
-
- if (!bch2_sb_resize_members(&ca->disk_sb,
- le32_to_cpu(mi->field.u64s) +
- sizeof(dev_mi) / sizeof(u64))) {
- bch_err(c, "device add error: new device superblock too small");
- ret = -BCH_ERR_ENOSPC_sb_members;
+ bch_err_msg(c, ret, "setting up new superblock");
goto err_unlock;
}
if (dynamic_fault("bcachefs:add:no_slot"))
goto no_slot;
- mi = bch2_sb_get_members(c->disk_sb.sb);
for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++)
- if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx))
+ if (!bch2_dev_exists(c->disk_sb.sb, dev_idx))
goto have_slot;
no_slot:
- bch_err(c, "device add error: already have maximum number of devices");
ret = -BCH_ERR_ENOSPC_sb_members;
+ bch_err_msg(c, ret, "setting up new superblock");
goto err_unlock;
have_slot:
nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
- u64s = (sizeof(struct bch_sb_field_members) +
- sizeof(struct bch_member) * nr_devices) / sizeof(u64);
- mi = bch2_sb_resize_members(&c->disk_sb, u64s);
+ mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+ u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) +
+ le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64));
+
+ mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
if (!mi) {
- bch_err(c, "device add error: no room in superblock for member info");
ret = -BCH_ERR_ENOSPC_sb_members;
+ bch_err_msg(c, ret, "setting up new superblock");
goto err_unlock;
}
+ struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);
/* success: */
- mi->members[dev_idx] = dev_mi;
- mi->members[dev_idx].last_mount = cpu_to_le64(ktime_get_real_seconds());
+ *m = dev_mi;
+ m->last_mount = cpu_to_le64(ktime_get_real_seconds());
c->disk_sb.sb->nr_devices = nr_devices;
ca->disk_sb.sb->dev_idx = dev_idx;
if (BCH_MEMBER_GROUP(&dev_mi)) {
ret = __bch2_dev_group_set(c, ca, label.buf);
if (ret) {
- bch_err(c, "device add error: error setting label");
+ bch_err_msg(c, ret, "creating new label");
goto err_unlock;
}
}
ret = bch2_trans_mark_dev_sb(c, ca);
if (ret) {
- bch_err(c, "device add error: error marking new superblock: %s", bch2_err_str(ret));
+ bch_err_msg(ca, ret, "marking new superblock");
goto err_late;
}
ret = bch2_fs_freespace_init(c);
if (ret) {
- bch_err(c, "device add error: error initializing free space: %s", bch2_err_str(ret));
+ bch_err_msg(ca, ret, "initializing free space");
goto err_late;
}
{
struct bch_opts opts = bch2_opts_empty();
struct bch_sb_handle sb = { NULL };
- struct bch_sb_field_members *mi;
struct bch_dev *ca;
unsigned dev_idx;
int ret;
ret = bch2_dev_in_fs(c->disk_sb.sb, sb.sb);
if (ret) {
- bch_err(c, "error bringing %s online: %s", path, bch2_err_str(ret));
+ bch_err_msg(c, ret, "bringing %s online", path);
goto err;
}
ret = bch2_trans_mark_dev_sb(c, ca);
if (ret) {
- bch_err(c, "error bringing %s online: error from bch2_trans_mark_dev_sb: %s",
- path, bch2_err_str(ret));
+ bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path);
goto err;
}
if (ca->mi.state == BCH_MEMBER_STATE_rw)
__bch2_dev_read_write(c, ca);
- mutex_lock(&c->sb_lock);
- mi = bch2_sb_get_members(c->disk_sb.sb);
+ if (!ca->mi.freespace_initialized) {
+ ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
+ bch_err_msg(ca, ret, "initializing free space");
+ if (ret)
+ goto err;
+ }
- mi->members[ca->dev_idx].last_mount =
- cpu_to_le64(ktime_get_real_seconds());
+ if (!ca->journal.nr) {
+ ret = bch2_dev_journal_alloc(ca);
+ bch_err_msg(ca, ret, "allocating journal");
+ if (ret)
+ goto err;
+ }
+ mutex_lock(&c->sb_lock);
+ bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount =
+ cpu_to_le64(ktime_get_real_seconds());
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- ret = bch2_fs_freespace_init(c);
- if (ret)
- bch_err(c, "device add error: error initializing free space: %s", bch2_err_str(ret));
-
up_write(&c->state_lock);
return 0;
err:
int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
{
- struct bch_member *mi;
+ struct bch_member *m;
+ u64 old_nbuckets;
int ret = 0;
down_write(&c->state_lock);
+ old_nbuckets = ca->mi.nbuckets;
if (nbuckets < ca->mi.nbuckets) {
bch_err(ca, "Cannot shrink yet");
ret = bch2_dev_buckets_resize(c, ca, nbuckets);
if (ret) {
- bch_err(ca, "Resize error: %s", bch2_err_str(ret));
+ bch_err_msg(ca, ret, "resizing buckets");
goto err;
}
goto err;
mutex_lock(&c->sb_lock);
- mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
- mi->nbuckets = cpu_to_le64(nbuckets);
+ m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+ m->nbuckets = cpu_to_le64(nbuckets);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
+ if (ca->mi.freespace_initialized) {
+ ret = bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets);
+ if (ret)
+ goto err;
+
+ /*
+ * XXX: this is all wrong transactionally - we'll be able to do
+ * this correctly after the disk space accounting rewrite
+ */
+ ca->usage_base->d[BCH_DATA_free].buckets += nbuckets - old_nbuckets;
+ }
+
bch2_recalc_capacity(c);
err:
up_write(&c->state_lock);
struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
struct bch_opts opts)
{
- struct bch_sb_handle *sb = NULL;
+ DARRAY(struct bch_sb_handle) sbs = { 0 };
struct bch_fs *c = NULL;
- struct bch_sb_field_members *mi;
- unsigned i, best_sb = 0;
+ struct bch_sb_handle *sb, *best = NULL;
struct printbuf errbuf = PRINTBUF;
int ret = 0;
goto err;
}
- sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL);
- if (!sb) {
- ret = -ENOMEM;
+ ret = darray_make_room(&sbs, nr_devices);
+ if (ret)
goto err;
- }
- for (i = 0; i < nr_devices; i++) {
- ret = bch2_read_super(devices[i], &opts, &sb[i]);
+ for (unsigned i = 0; i < nr_devices; i++) {
+ struct bch_sb_handle sb = { NULL };
+
+ ret = bch2_read_super(devices[i], &opts, &sb);
if (ret)
goto err;
+ BUG_ON(darray_push(&sbs, sb));
}
- for (i = 1; i < nr_devices; i++)
- if (le64_to_cpu(sb[i].sb->seq) >
- le64_to_cpu(sb[best_sb].sb->seq))
- best_sb = i;
-
- mi = bch2_sb_get_members(sb[best_sb].sb);
+ darray_for_each(sbs, sb)
+ if (!best || le64_to_cpu(sb->sb->seq) > le64_to_cpu(best->sb->seq))
+ best = sb;
- i = 0;
- while (i < nr_devices) {
- if (i != best_sb &&
- !bch2_dev_exists(sb[best_sb].sb, mi, sb[i].sb->dev_idx)) {
- pr_info("%pg has been removed, skipping", sb[i].bdev);
- bch2_free_super(&sb[i]);
- array_remove_item(sb, nr_devices, i);
+ darray_for_each_reverse(sbs, sb) {
+ if (sb != best && !bch2_dev_exists(best->sb, sb->sb->dev_idx)) {
+ pr_info("%pg has been removed, skipping", sb->bdev);
+ bch2_free_super(sb);
+ darray_remove_item(&sbs, sb);
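+ /* sbs just shrank by one; if best pointed past sb, step it back so it still tracks the same handle */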
+ best -= best > sb;
continue;
}
- ret = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb);
+ ret = bch2_dev_in_fs(best->sb, sb->sb);
if (ret)
goto err_print;
- i++;
}
- c = bch2_fs_alloc(sb[best_sb].sb, opts);
- if (IS_ERR(c)) {
- ret = PTR_ERR(c);
+ c = bch2_fs_alloc(best->sb, opts);
+ ret = PTR_ERR_OR_ZERO(c);
+ if (ret)
goto err;
- }
down_write(&c->state_lock);
- for (i = 0; i < nr_devices; i++) {
- ret = bch2_dev_attach_bdev(c, &sb[i]);
+ darray_for_each(sbs, sb) {
+ ret = bch2_dev_attach_bdev(c, sb);
if (ret) {
up_write(&c->state_lock);
goto err;
goto err;
}
out:
- kfree(sb);
+ darray_for_each(sbs, sb)
+ bch2_free_super(sb);
+ darray_exit(&sbs);
printbuf_exit(&errbuf);
module_put(THIS_MODULE);
return c;
err:
if (!IS_ERR_OR_NULL(c))
bch2_fs_stop(c);
- if (sb)
- for (i = 0; i < nr_devices; i++)
- bch2_free_super(&sb[i]);
c = ERR_PTR(ret);
goto out;
}
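/*
 * The darray conversion replaces the kcalloc'd handle array and manual
 * index bookkeeping with push/remove/iterate helpers; the out: path can
 * now free every handle unconditionally, since handles attached to a
 * device were already zeroed by __bch2_dev_attach_bdev() (see the memset
 * after "ca->disk_sb = *sb" earlier in this patch), making
 * bch2_free_super() on them presumably a no-op.
 */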
BCH_DEBUG_PARAMS()
#undef BCH_DEBUG_PARAM
+__maybe_unused
static unsigned bch2_metadata_version = bcachefs_metadata_version_current;
module_param_named(version, bch2_metadata_version, uint, 0400);
#include <linux/math64.h>
-static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s)
-{
- return div_u64(s, ca->mi.bucket_size);
-}
-
-static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b)
-{
- return ((sector_t) b) * ca->mi.bucket_size;
-}
-
-static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
-{
- u32 remainder;
-
- div_u64_rem(s, ca->mi.bucket_size, &remainder);
- return remainder;
-}
-
-static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s,
- u32 *offset)
-{
- return div_u64_rem(s, ca->mi.bucket_size, offset);
-}
-
-static inline bool bch2_dev_is_online(struct bch_dev *ca)
-{
- return !percpu_ref_is_zero(&ca->io_ref);
-}
-
-static inline bool bch2_dev_is_readable(struct bch_dev *ca)
-{
- return bch2_dev_is_online(ca) &&
- ca->mi.state != BCH_MEMBER_STATE_failed;
-}
-
-static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw)
-{
- if (!percpu_ref_tryget(&ca->io_ref))
- return false;
-
- if (ca->mi.state == BCH_MEMBER_STATE_rw ||
- (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ))
- return true;
-
- percpu_ref_put(&ca->io_ref);
- return false;
-}
-
-static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs)
-{
- return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX);
-}
-
-static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs,
- unsigned dev)
-{
- unsigned i;
-
- for (i = 0; i < devs.nr; i++)
- if (devs.devs[i] == dev)
- return true;
-
- return false;
-}
-
-static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
- unsigned dev)
-{
- unsigned i;
-
- for (i = 0; i < devs->nr; i++)
- if (devs->devs[i] == dev) {
- array_remove_item(devs->devs, devs->nr, i);
- return;
- }
-}
-
-static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
- unsigned dev)
-{
- if (!bch2_dev_list_has_dev(*devs, dev)) {
- BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs));
- devs->devs[devs->nr++] = dev;
- }
-}
-
-static inline struct bch_devs_list bch2_dev_list_single(unsigned dev)
-{
- return (struct bch_devs_list) { .nr = 1, .devs[0] = dev };
-}
-
-static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
- const struct bch_devs_mask *mask)
-{
- struct bch_dev *ca = NULL;
-
- while ((*iter = mask
- ? find_next_bit(mask->d, c->sb.nr_devices, *iter)
- : *iter) < c->sb.nr_devices &&
- !(ca = rcu_dereference_check(c->devs[*iter],
- lockdep_is_held(&c->state_lock))))
- (*iter)++;
-
- return ca;
-}
-
-#define for_each_member_device_rcu(ca, c, iter, mask) \
- for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++)
-
-static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter)
-{
- struct bch_dev *ca;
-
- rcu_read_lock();
- if ((ca = __bch2_next_dev(c, iter, NULL)))
- percpu_ref_get(&ca->ref);
- rcu_read_unlock();
-
- return ca;
-}
-
-/*
- * If you break early, you must drop your ref on the current device
- */
-#define for_each_member_device(ca, c, iter) \
- for ((iter) = 0; \
- (ca = bch2_get_next_dev(c, &(iter))); \
- percpu_ref_put(&ca->ref), (iter)++)
-
-static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
- unsigned *iter,
- int state_mask)
-{
- struct bch_dev *ca;
-
- rcu_read_lock();
- while ((ca = __bch2_next_dev(c, iter, NULL)) &&
- (!((1 << ca->mi.state) & state_mask) ||
- !percpu_ref_tryget(&ca->io_ref)))
- (*iter)++;
- rcu_read_unlock();
-
- return ca;
-}
-
-#define __for_each_online_member(ca, c, iter, state_mask) \
- for ((iter) = 0; \
- (ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \
- percpu_ref_put(&ca->io_ref), (iter)++)
-
-#define for_each_online_member(ca, c, iter) \
- __for_each_online_member(ca, c, iter, ~0)
-
-#define for_each_rw_member(ca, c, iter) \
- __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw)
-
-#define for_each_readable_member(ca, c, iter) \
- __for_each_online_member(ca, c, iter, \
- (1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro))
-
-/*
- * If a key exists that references a device, the device won't be going away and
- * we can omit rcu_read_lock():
- */
-static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx)
-{
- EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
-
- return rcu_dereference_check(c->devs[idx], 1);
-}
-
-static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx)
-{
- EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
-
- return rcu_dereference_protected(c->devs[idx],
- lockdep_is_held(&c->sb_lock) ||
- lockdep_is_held(&c->state_lock));
-}
-
-/* XXX kill, move to struct bch_fs */
-static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
-{
- struct bch_devs_mask devs;
- struct bch_dev *ca;
- unsigned i;
-
- memset(&devs, 0, sizeof(devs));
- for_each_online_member(ca, c, i)
- __set_bit(ca->dev_idx, devs.d);
- return devs;
-}
-
-static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b)
-{
- struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
- u64 b_offset = bucket_to_sector(ca, b);
- u64 b_end = bucket_to_sector(ca, b + 1);
- unsigned i;
-
- if (!b)
- return true;
-
- for (i = 0; i < layout->nr_superblocks; i++) {
- u64 offset = le64_to_cpu(layout->sb_offset[i]);
- u64 end = offset + (1 << layout->sb_max_size_bits);
-
- if (!(offset >= b_end || end <= b_offset))
- return true;
- }
-
- return false;
-}
-
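/*
 * The bucket arithmetic and device-iteration helpers removed here
 * (sector_to_bucket(), for_each_member_device(), bch_dev_bkey_exists(),
 * ...) appear to move to the new sb-members.h, which super-io.h and
 * super.c now include.
 */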
struct bch_fs *bch2_dev_to_fs(dev_t);
struct bch_fs *bch2_uuid_to_fs(__uuid_t);
struct bch_sb *sb;
struct block_device *bdev;
struct bio *bio;
+ void *holder;
size_t buffer_size;
- fmode_t mode;
+ blk_mode_t mode;
unsigned have_layout:1;
unsigned have_bio:1;
unsigned fs_sb:1;
u8 valid;
};
-struct bch_disk_group_cpu {
- bool deleted;
- u16 parent;
- struct bch_devs_mask devs;
-};
-
-struct bch_disk_groups_cpu {
- struct rcu_head rcu;
- unsigned nr;
- struct bch_disk_group_cpu entries[];
-};
-
#endif /* _BCACHEFS_SUPER_TYPES_H */
prt_human_readable_s64(out, val); \
} while (0)
-#define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var))
-#define var_print(_var) sysfs_print(_var, var(_var))
-#define var_hprint(_var) sysfs_hprint(_var, var(_var))
-
#define sysfs_strtoul(file, var) \
do { \
if (attr == &sysfs_ ## file) \
_v; \
})
-#define strtoul_restrict_or_return(cp, min, max) \
-({ \
- unsigned long __v = 0; \
- int _r = strtoul_safe_restrict(cp, __v, min, max); \
- if (_r) \
- return _r; \
- __v; \
-})
-
-#define strtoi_h_or_return(cp) \
-({ \
- u64 _v; \
- int _r = strtoi_h(cp, &_v); \
- if (_r) \
- return _r; \
- _v; \
-})
-
-#define sysfs_hatoi(file, var) \
-do { \
- if (attr == &sysfs_ ## file) \
- return strtoi_h(buf, &var) ?: (ssize_t) size; \
-} while (0)
-
write_attribute(trigger_gc);
write_attribute(trigger_discards);
write_attribute(trigger_invalidates);
read_attribute(first_bucket);
read_attribute(nbuckets);
rw_attribute(durability);
-read_attribute(iodone);
+read_attribute(io_done);
+read_attribute(io_errors);
+write_attribute(io_errors_reset);
read_attribute(io_latency_read);
read_attribute(io_latency_write);
rw_attribute(rebalance_enabled);
sysfs_pd_controller_attribute(rebalance);
-read_attribute(rebalance_work);
+read_attribute(rebalance_status);
rw_attribute(promote_whole_extents);
read_attribute(new_stripes);
read_attribute(io_timers_read);
read_attribute(io_timers_write);
-read_attribute(data_jobs);
read_attribute(moving_ctxts);
#ifdef CONFIG_BCACHEFS_TESTS
static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c)
{
- struct btree_trans trans;
+ struct btree_trans *trans;
struct btree_iter iter;
struct bkey_s_c k;
enum btree_id id;
incompressible_sectors = 0,
compressed_sectors_compressed = 0,
compressed_sectors_uncompressed = 0;
- int ret;
+ int ret = 0;
if (!test_bit(BCH_FS_STARTED, &c->flags))
return -EPERM;
- bch2_trans_init(&trans, c, 0, 0);
+ trans = bch2_trans_get(c);
for (id = 0; id < BTREE_ID_NR; id++) {
if (!btree_type_has_ptrs(id))
continue;
- for_each_btree_key(&trans, iter, id, POS_MIN,
+ for_each_btree_key(trans, iter, id, POS_MIN,
BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
else if (compressed)
nr_compressed_extents++;
}
- bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_iter_exit(trans, &iter);
}
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
if (ret)
return ret;
static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c)
{
- prt_printf(out, "%s: ", bch2_btree_ids[c->gc_gens_btree]);
+ prt_printf(out, "%s: ", bch2_btree_id_str(c->gc_gens_btree));
bch2_bpos_to_text(out, c->gc_gens_pos);
prt_printf(out, "\n");
}
if (attr == &sysfs_copy_gc_wait)
bch2_copygc_wait_to_text(out, c);
- if (attr == &sysfs_rebalance_work)
- bch2_rebalance_work_to_text(out, c);
+ if (attr == &sysfs_rebalance_status)
+ bch2_rebalance_status_to_text(out, c);
sysfs_print(promote_whole_extents, c->promote_whole_extents);
if (attr == &sysfs_io_timers_write)
bch2_io_timers_to_text(out, &c->io_clock[WRITE]);
- if (attr == &sysfs_data_jobs)
- bch2_data_jobs_to_text(out, c);
-
if (attr == &sysfs_moving_ctxts)
bch2_fs_moving_ctxts_to_text(out, c);
&sysfs_copy_gc_wait,
&sysfs_rebalance_enabled,
- &sysfs_rebalance_work,
+ &sysfs_rebalance_status,
sysfs_pd_controller_files(rebalance),
- &sysfs_data_jobs,
&sysfs_moving_ctxts,
&sysfs_internal_uuid,
bch2_opt_set_by_id(&c->opts, id, v);
if ((id == Opt_background_target ||
- id == Opt_background_compression) && v) {
- bch2_rebalance_add_work(c, S64_MAX);
- rebalance_wakeup(c);
- }
+ id == Opt_background_compression) && v)
+ bch2_set_rebalance_needs_scan(c, 0);
ret = size;
err:
NULL
};
-static void dev_iodone_to_text(struct printbuf *out, struct bch_dev *ca)
+static void dev_io_done_to_text(struct printbuf *out, struct bch_dev *ca)
{
int rw, i;
sysfs_print(discard, ca->mi.discard);
if (attr == &sysfs_label) {
- if (ca->mi.group) {
- mutex_lock(&c->sb_lock);
- bch2_disk_path_to_text(out, c->disk_sb.sb,
- ca->mi.group - 1);
- mutex_unlock(&c->sb_lock);
- }
-
+ if (ca->mi.group)
+ bch2_disk_path_to_text(out, c, ca->mi.group - 1);
prt_char(out, '\n');
}
prt_char(out, '\n');
}
- if (attr == &sysfs_iodone)
- dev_iodone_to_text(out, ca);
+ if (attr == &sysfs_io_done)
+ dev_io_done_to_text(out, ca);
+
+ if (attr == &sysfs_io_errors)
+ bch2_dev_io_errors_to_text(out, ca);
sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ]));
sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE]));
bool v = strtoul_or_return(buf);
mutex_lock(&c->sb_lock);
- mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
+ mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
if (v != BCH_MEMBER_DISCARD(mi)) {
SET_BCH_MEMBER_DISCARD(mi, v);
u64 v = strtoul_or_return(buf);
mutex_lock(&c->sb_lock);
- mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
+ mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
- if (v != BCH_MEMBER_DURABILITY(mi)) {
+ if (v + 1 != BCH_MEMBER_DURABILITY(mi)) {
SET_BCH_MEMBER_DURABILITY(mi, v + 1);
bch2_write_super(c);
}
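/*
 * Off-by-one fix: BCH_MEMBER_DURABILITY() stores durability + 1 so that 0
 * can mean "unset" (the old bch2_mi_to_cpu() decoded it as value - 1,
 * defaulting to 1). Comparing v against the raw field meant writing back
 * the current durability always looked like a change; comparing v + 1
 * restores the intended no-op.
 */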
return ret;
}
+ if (attr == &sysfs_io_errors_reset)
+ bch2_dev_errors_reset(ca);
+
return size;
}
SYSFS_OPS(bch2_dev);
&sysfs_label,
&sysfs_has_data,
- &sysfs_iodone,
+ &sysfs_io_done,
+ &sysfs_io_errors,
+ &sysfs_io_errors_reset,
&sysfs_io_latency_read,
&sysfs_io_latency_write,
#include "bcachefs.h"
#include "btree_update.h"
#include "journal_reclaim.h"
-#include "subvolume.h"
+#include "snapshot.h"
#include "tests.h"
#include "linux/kthread.h"
static int test_delete(struct bch_fs *c, u64 nr)
{
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_i_cookie k;
int ret;
bkey_cookie_init(&k.k_i);
k.k.p.snapshot = U32_MAX;
- bch2_trans_init(&trans, c, 0, 0);
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p,
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p,
BTREE_ITER_INTENT);
- ret = commit_do(&trans, NULL, NULL, 0,
+ ret = commit_do(trans, NULL, NULL, 0,
bch2_btree_iter_traverse(&iter) ?:
- bch2_trans_update(&trans, &iter, &k.k_i, 0));
- if (ret) {
- bch_err_msg(c, ret, "update error");
+ bch2_trans_update(trans, &iter, &k.k_i, 0));
+ bch_err_msg(c, ret, "update error");
+ if (ret)
goto err;
- }
pr_info("deleting once");
- ret = commit_do(&trans, NULL, NULL, 0,
+ ret = commit_do(trans, NULL, NULL, 0,
bch2_btree_iter_traverse(&iter) ?:
- bch2_btree_delete_at(&trans, &iter, 0));
- if (ret) {
- bch_err_msg(c, ret, "delete error (first)");
+ bch2_btree_delete_at(trans, &iter, 0));
+ bch_err_msg(c, ret, "delete error (first)");
+ if (ret)
goto err;
- }
pr_info("deleting twice");
- ret = commit_do(&trans, NULL, NULL, 0,
+ ret = commit_do(trans, NULL, NULL, 0,
bch2_btree_iter_traverse(&iter) ?:
- bch2_btree_delete_at(&trans, &iter, 0));
- if (ret) {
- bch_err_msg(c, ret, "delete error (second)");
+ bch2_btree_delete_at(trans, &iter, 0));
+ bch_err_msg(c, ret, "delete error (second)");
+ if (ret)
goto err;
- }
err:
- bch2_trans_iter_exit(&trans, &iter);
- bch2_trans_exit(&trans);
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
return ret;
}
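/*
 * The tests follow the tree-wide conversion from an on-stack struct
 * btree_trans (bch2_trans_init()/bch2_trans_exit()) to a heap-allocated
 * handle from bch2_trans_get()/bch2_trans_put(). The error paths also call
 * bch_err_msg() unconditionally before checking ret, which is safe because
 * the helper presumably prints nothing when ret is zero.
 */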
static int test_delete_written(struct bch_fs *c, u64 nr)
{
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_i_cookie k;
int ret;
bkey_cookie_init(&k.k_i);
k.k.p.snapshot = U32_MAX;
- bch2_trans_init(&trans, c, 0, 0);
-
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p,
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p,
BTREE_ITER_INTENT);
- ret = commit_do(&trans, NULL, NULL, 0,
+ ret = commit_do(trans, NULL, NULL, 0,
bch2_btree_iter_traverse(&iter) ?:
- bch2_trans_update(&trans, &iter, &k.k_i, 0));
- if (ret) {
- bch_err_msg(c, ret, "update error");
+ bch2_trans_update(trans, &iter, &k.k_i, 0));
+ bch_err_msg(c, ret, "update error");
+ if (ret)
goto err;
- }
- bch2_trans_unlock(&trans);
+ bch2_trans_unlock(trans);
bch2_journal_flush_all_pins(&c->journal);
- ret = commit_do(&trans, NULL, NULL, 0,
+ ret = commit_do(trans, NULL, NULL, 0,
bch2_btree_iter_traverse(&iter) ?:
- bch2_btree_delete_at(&trans, &iter, 0));
- if (ret) {
- bch_err_msg(c, ret, "delete error");
+ bch2_btree_delete_at(trans, &iter, 0));
+ bch_err_msg(c, ret, "delete error");
+ if (ret)
goto err;
- }
err:
- bch2_trans_iter_exit(&trans, &iter);
- bch2_trans_exit(&trans);
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
return ret;
}
static int test_iterate(struct bch_fs *c, u64 nr)
{
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter = { NULL };
struct bkey_s_c k;
u64 i;
int ret = 0;
- bch2_trans_init(&trans, c, 0, 0);
-
delete_test_keys(c);
pr_info("inserting test keys");
for (i = 0; i < nr; i++) {
- struct bkey_i_cookie k;
+ struct bkey_i_cookie ck;
- bkey_cookie_init(&k.k_i);
- k.k.p.offset = i;
- k.k.p.snapshot = U32_MAX;
+ bkey_cookie_init(&ck.k_i);
+ ck.k.p.offset = i;
+ ck.k.p.snapshot = U32_MAX;
- ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i,
- NULL, NULL, 0);
- if (ret) {
- bch_err_msg(c, ret, "insert error");
+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0);
+ bch_err_msg(c, ret, "insert error");
+ if (ret)
goto err;
- }
}
pr_info("iterating forwards");
i = 0;
- ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs,
+ ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
0, k, ({
BUG_ON(k.k->p.offset != i++);
0;
}));
- if (ret) {
- bch_err_msg(c, ret, "error iterating forwards");
+ bch_err_msg(c, ret, "error iterating forwards");
+ if (ret)
goto err;
- }
BUG_ON(i != nr);
pr_info("iterating backwards");
- ret = for_each_btree_key_reverse(&trans, iter, BTREE_ID_xattrs,
+ ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_xattrs,
SPOS(0, U64_MAX, U32_MAX), 0, k,
({
BUG_ON(k.k->p.offset != --i);
0;
}));
- if (ret) {
- bch_err_msg(c, ret, "error iterating backwards");
+ bch_err_msg(c, ret, "error iterating backwards");
+ if (ret)
goto err;
- }
BUG_ON(i);
err:
- bch2_trans_iter_exit(&trans, &iter);
- bch2_trans_exit(&trans);
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
return ret;
}
static int test_iterate_extents(struct bch_fs *c, u64 nr)
{
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter = { NULL };
struct bkey_s_c k;
u64 i;
int ret = 0;
- bch2_trans_init(&trans, c, 0, 0);
-
delete_test_keys(c);
pr_info("inserting test extents");
for (i = 0; i < nr; i += 8) {
- struct bkey_i_cookie k;
+ struct bkey_i_cookie ck;
- bkey_cookie_init(&k.k_i);
- k.k.p.offset = i + 8;
- k.k.p.snapshot = U32_MAX;
- k.k.size = 8;
+ bkey_cookie_init(&ck.k_i);
+ ck.k.p.offset = i + 8;
+ ck.k.p.snapshot = U32_MAX;
+ ck.k.size = 8;
- ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i,
- NULL, NULL, 0);
- if (ret) {
- bch_err_msg(c, ret, "insert error");
+ ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0);
+ bch_err_msg(c, ret, "insert error");
+ if (ret)
goto err;
- }
}
pr_info("iterating forwards");
i = 0;
- ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents,
+ ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents,
SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
0, k, ({
BUG_ON(bkey_start_offset(k.k) != i);
i = k.k->p.offset;
0;
}));
- if (ret) {
- bch_err_msg(c, ret, "error iterating forwards");
+ bch_err_msg(c, ret, "error iterating forwards");
+ if (ret)
goto err;
- }
BUG_ON(i != nr);
pr_info("iterating backwards");
- ret = for_each_btree_key_reverse(&trans, iter, BTREE_ID_extents,
+ ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_extents,
SPOS(0, U64_MAX, U32_MAX), 0, k,
({
BUG_ON(k.k->p.offset != i);
i = bkey_start_offset(k.k);
0;
}));
- if (ret) {
- bch_err_msg(c, ret, "error iterating backwards");
+ bch_err_msg(c, ret, "error iterating backwards");
+ if (ret)
goto err;
- }
BUG_ON(i);
err:
- bch2_trans_iter_exit(&trans, &iter);
- bch2_trans_exit(&trans);
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
return ret;
}
static int test_iterate_slots(struct bch_fs *c, u64 nr)
{
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter = { NULL };
struct bkey_s_c k;
u64 i;
int ret = 0;
- bch2_trans_init(&trans, c, 0, 0);
-
delete_test_keys(c);
pr_info("inserting test keys");
for (i = 0; i < nr; i++) {
- struct bkey_i_cookie k;
+ struct bkey_i_cookie ck;
- bkey_cookie_init(&k.k_i);
- k.k.p.offset = i * 2;
- k.k.p.snapshot = U32_MAX;
+ bkey_cookie_init(&ck.k_i);
+ ck.k.p.offset = i * 2;
+ ck.k.p.snapshot = U32_MAX;
- ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i,
- NULL, NULL, 0);
- if (ret) {
- bch_err_msg(c, ret, "insert error");
+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0);
+ bch_err_msg(c, ret, "insert error");
+ if (ret)
goto err;
- }
}
pr_info("iterating forwards");
i = 0;
- ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs,
+ ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
0, k, ({
BUG_ON(k.k->p.offset != i);
i += 2;
0;
}));
- if (ret) {
- bch_err_msg(c, ret, "error iterating forwards");
+ bch_err_msg(c, ret, "error iterating forwards");
+ if (ret)
goto err;
- }
BUG_ON(i != nr * 2);
i = 0;
- ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs,
+ ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
BTREE_ITER_SLOTS, k, ({
if (i >= nr * 2)
}
ret = 0;
err:
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
return ret;
}
static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
{
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter = { NULL };
struct bkey_s_c k;
u64 i;
int ret = 0;
- bch2_trans_init(&trans, c, 0, 0);
-
delete_test_keys(c);
pr_info("inserting test keys");
for (i = 0; i < nr; i += 16) {
- struct bkey_i_cookie k;
+ struct bkey_i_cookie ck;
- bkey_cookie_init(&k.k_i);
- k.k.p.offset = i + 16;
- k.k.p.snapshot = U32_MAX;
- k.k.size = 8;
+ bkey_cookie_init(&ck.k_i);
+ ck.k.p.offset = i + 16;
+ ck.k.p.snapshot = U32_MAX;
+ ck.k.size = 8;
- ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i,
- NULL, NULL, 0);
- if (ret) {
- bch_err_msg(c, ret, "insert error");
+ ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0);
+ bch_err_msg(c, ret, "insert error");
+ if (ret)
goto err;
- }
}
pr_info("iterating forwards");
i = 0;
- ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents,
+ ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents,
SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
0, k, ({
BUG_ON(bkey_start_offset(k.k) != i + 8);
i += 16;
0;
}));
- if (ret) {
- bch_err_msg(c, ret, "error iterating forwards");
+ bch_err_msg(c, ret, "error iterating forwards");
+ if (ret)
goto err;
- }
BUG_ON(i != nr);
i = 0;
- ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents,
+ ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents,
SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
BTREE_ITER_SLOTS, k, ({
if (i == nr)
i = k.k->p.offset;
0;
}));
- if (ret) {
- bch_err_msg(c, ret, "error iterating forwards by slots");
+ bch_err_msg(c, ret, "error iterating forwards by slots");
+ if (ret)
goto err;
- }
ret = 0;
err:
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
return 0;
}
*/
static int test_peek_end(struct bch_fs *c, u64 nr)
{
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
- bch2_trans_init(&trans, c, 0, 0);
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX), 0);
- lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
BUG_ON(k.k);
- lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
BUG_ON(k.k);
- bch2_trans_iter_exit(&trans, &iter);
- bch2_trans_exit(&trans);
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
return 0;
}
static int test_peek_end_extents(struct bch_fs *c, u64 nr)
{
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
- bch2_trans_init(&trans, c, 0, 0);
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
SPOS(0, 0, U32_MAX), 0);
- lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
BUG_ON(k.k);
- lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
BUG_ON(k.k);
- bch2_trans_iter_exit(&trans, &iter);
- bch2_trans_exit(&trans);
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
return 0;
}
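
lockrestart_do(trans, expr), used in the peek tests above, wraps a single btree operation in the standard restart loop: begin the transaction, evaluate the expression, and retry for as long as it fails with a transaction-restart error. A simplified sketch of the assumed shape (the real macro lives in the btree iterator headers and may differ in detail):

#define lockrestart_do(_trans, _do)					\
({									\
	int _ret;							\
									\
	do {								\
		bch2_trans_begin(_trans);				\
		_ret = (_do);						\
	} while (bch2_err_matches(_ret, BCH_ERR_transaction_restart));	\
									\
	_ret;								\
})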
k.k_i.k.size = end - start;
k.k_i.k.version.lo = test_version++;
- ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i,
- NULL, NULL, 0);
- if (ret)
- bch_err_fn(c, ret);
+ ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0);
+ bch_err_fn(c, ret);
return ret;
}
k.k_i.k.size = len;
ret = bch2_trans_do(c, NULL, NULL, 0,
- bch2_btree_insert_nonextent(&trans, BTREE_ID_extents, &k.k_i,
+ bch2_btree_insert_nonextent(trans, BTREE_ID_extents, &k.k_i,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
/* Test skipping over keys in unrelated snapshots: */
static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi)
{
- struct btree_trans trans;
+ struct btree_trans *trans;
struct btree_iter iter;
struct bkey_s_c k;
struct bkey_i_cookie cookie;
bkey_cookie_init(&cookie.k_i);
cookie.k.p.snapshot = snapid_hi;
- ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i,
- NULL, NULL, 0);
+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0);
if (ret)
return ret;
- bch2_trans_init(&trans, c, 0, 0);
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
+ trans = bch2_trans_get(c);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
SPOS(0, 0, snapid_lo), 0);
- lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
BUG_ON(k.k->p.snapshot != U32_MAX);
- bch2_trans_iter_exit(&trans, &iter);
- bch2_trans_exit(&trans);
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
return ret;
}
bkey_cookie_init(&cookie.k_i);
cookie.k.p.snapshot = U32_MAX;
- ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i,
- NULL, NULL, 0);
+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0);
if (ret)
return ret;
ret = bch2_trans_do(c, NULL, NULL, 0,
- bch2_snapshot_node_create(&trans, U32_MAX,
+ bch2_snapshot_node_create(trans, U32_MAX,
snapids,
snapid_subvols,
2));
swap(snapids[0], snapids[1]);
ret = test_snapshot_filter(c, snapids[0], snapids[1]);
- if (ret) {
- bch_err_msg(c, ret, "from test_snapshot_filter");
- return ret;
- }
-
- return 0;
+ bch_err_msg(c, ret, "from test_snapshot_filter");
+ return ret;
}
/* perf tests */
static int rand_insert(struct bch_fs *c, u64 nr)
{
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct bkey_i_cookie k;
int ret = 0;
u64 i;
- bch2_trans_init(&trans, c, 0, 0);
-
for (i = 0; i < nr; i++) {
bkey_cookie_init(&k.k_i);
k.k.p.offset = test_rand();
k.k.p.snapshot = U32_MAX;
- ret = commit_do(&trans, NULL, NULL, 0,
- __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i, 0));
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k.k_i, 0));
if (ret)
break;
}
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
return ret;
}
static int rand_insert_multi(struct bch_fs *c, u64 nr)
{
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct bkey_i_cookie k[8];
int ret = 0;
unsigned j;
u64 i;
- bch2_trans_init(&trans, c, 0, 0);
-
for (i = 0; i < nr; i += ARRAY_SIZE(k)) {
for (j = 0; j < ARRAY_SIZE(k); j++) {
bkey_cookie_init(&k[j].k_i);
k[j].k.p.snapshot = U32_MAX;
}
- ret = commit_do(&trans, NULL, NULL, 0,
- __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[0].k_i, 0) ?:
- __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[1].k_i, 0) ?:
- __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[2].k_i, 0) ?:
- __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[3].k_i, 0) ?:
- __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[4].k_i, 0) ?:
- __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[5].k_i, 0) ?:
- __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[6].k_i, 0) ?:
- __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[7].k_i, 0));
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[0].k_i, 0) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[1].k_i, 0) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[2].k_i, 0) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[3].k_i, 0) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[4].k_i, 0) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[5].k_i, 0) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[6].k_i, 0) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[7].k_i, 0));
if (ret)
break;
}
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
return ret;
}
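
The chained ?: operators above use the GNU C extension a ?: b, which evaluates to a when a is non-zero and to b otherwise, so the first insert that fails short-circuits the rest and its error code is what commit_do() sees. For example:

/* GNU C: x ?: y  ==  x ? x : y, with x evaluated only once */
int ret = 0 ?: -5 ?: -7;	/* ret == -5: the first non-zero value wins */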
static int rand_lookup(struct bch_fs *c, u64 nr)
{
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
int ret = 0;
u64 i;
- bch2_trans_init(&trans, c, 0, 0);
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX), 0);
for (i = 0; i < nr; i++) {
bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX));
- lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter)));
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(&iter)));
ret = bkey_err(k);
if (ret)
break;
}
- bch2_trans_iter_exit(&trans, &iter);
- bch2_trans_exit(&trans);
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
return ret;
}
k = bch2_btree_iter_peek(iter);
ret = bkey_err(k);
- if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
- bch_err_msg(trans->c, ret, "lookup error");
+ bch_err_msg(trans->c, ret, "lookup error");
if (ret)
return ret;
static int rand_mixed(struct bch_fs *c, u64 nr)
{
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_i_cookie cookie;
int ret = 0;
u64 i, rand;
- bch2_trans_init(&trans, c, 0, 0);
- bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX), 0);
for (i = 0; i < nr; i++) {
rand = test_rand();
- ret = commit_do(&trans, NULL, NULL, 0,
- rand_mixed_trans(&trans, &iter, &cookie, i, rand));
+ ret = commit_do(trans, NULL, NULL, 0,
+ rand_mixed_trans(trans, &iter, &cookie, i, rand));
if (ret)
break;
}
- bch2_trans_iter_exit(&trans, &iter);
- bch2_trans_exit(&trans);
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_put(trans);
return ret;
}
static int rand_delete(struct bch_fs *c, u64 nr)
{
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
int ret = 0;
u64 i;
- bch2_trans_init(&trans, c, 0, 0);
-
for (i = 0; i < nr; i++) {
struct bpos pos = SPOS(0, test_rand(), U32_MAX);
- ret = commit_do(&trans, NULL, NULL, 0,
- __do_delete(&trans, pos));
+ ret = commit_do(trans, NULL, NULL, 0,
+ __do_delete(trans, pos));
if (ret)
break;
}
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
return ret;
}
bkey_cookie_init(&insert.k_i);
return bch2_trans_run(c,
- for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k,
NULL, NULL, 0, ({
if (iter.pos.offset >= nr)
break;
insert.k.p = iter.pos;
- bch2_trans_update(&trans, &iter, &insert.k_i, 0);
+ bch2_trans_update(trans, &iter, &insert.k_i, 0);
})));
}
struct bkey_s_c k;
return bch2_trans_run(c,
- for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs,
+ for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
0, k,
0));
struct bkey_s_c k;
return bch2_trans_run(c,
- for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX),
BTREE_ITER_INTENT, k,
NULL, NULL, 0, ({
struct bkey_i_cookie u;
bkey_reassemble(&u.k_i, k);
- bch2_trans_update(&trans, &iter, &u.k_i, 0);
+ bch2_trans_update(trans, &iter, &u.k_i, 0);
})));
}
#include "btree_locking.h"
#include "btree_update_interior.h"
#include "keylist.h"
+#include "move_types.h"
#include "opts.h"
+#include "six.h"
#include <linux/blktrace_api.h>
-#include <linux/six.h>
#define CREATE_TRACE_POINTS
#include "trace.h"
TP_printk("%d,%d %u %s %llu:%llu:%u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->level,
- bch2_btree_ids[__entry->btree_id],
+ bch2_btree_id_str(__entry->btree_id),
__entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
);
TP_ARGS(bio)
);
+TRACE_EVENT(read_nopromote,
+ TP_PROTO(struct bch_fs *c, int ret),
+ TP_ARGS(c, ret),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __array(char, ret, 32 )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret));
+ ),
+
+ TP_printk("%d,%d ret %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ret)
+);
+
DEFINE_EVENT(bio, read_bounce,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
TRACE_EVENT(journal_reclaim_start,
TP_PROTO(struct bch_fs *c, bool direct, bool kicked,
u64 min_nr, u64 min_key_cache,
- u64 prereserved, u64 prereserved_total,
u64 btree_cache_dirty, u64 btree_cache_total,
u64 btree_key_cache_dirty, u64 btree_key_cache_total),
- TP_ARGS(c, direct, kicked, min_nr, min_key_cache, prereserved, prereserved_total,
+ TP_ARGS(c, direct, kicked, min_nr, min_key_cache,
btree_cache_dirty, btree_cache_total,
btree_key_cache_dirty, btree_key_cache_total),
__field(bool, kicked )
__field(u64, min_nr )
__field(u64, min_key_cache )
- __field(u64, prereserved )
- __field(u64, prereserved_total )
__field(u64, btree_cache_dirty )
__field(u64, btree_cache_total )
__field(u64, btree_key_cache_dirty )
__entry->kicked = kicked;
__entry->min_nr = min_nr;
__entry->min_key_cache = min_key_cache;
- __entry->prereserved = prereserved;
- __entry->prereserved_total = prereserved_total;
__entry->btree_cache_dirty = btree_cache_dirty;
__entry->btree_cache_total = btree_cache_total;
__entry->btree_key_cache_dirty = btree_key_cache_dirty;
__entry->btree_key_cache_total = btree_key_cache_total;
),
- TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu",
+ TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu btree cache %llu/%llu key cache %llu/%llu",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->direct,
__entry->kicked,
__entry->min_nr,
__entry->min_key_cache,
- __entry->prereserved,
- __entry->prereserved_total,
__entry->btree_cache_dirty,
__entry->btree_cache_total,
__entry->btree_key_cache_dirty,
__field(u8, level )
TRACE_BPOS_entries(pos)
__array(char, node, 24 )
+ __field(u8, self_read_count )
+ __field(u8, self_intent_count)
+ __field(u8, read_count )
+ __field(u8, intent_count )
__field(u32, iter_lock_seq )
__field(u32, node_lock_seq )
),
TP_fast_assign(
struct btree *b = btree_path_node(path, level);
+ struct six_lock_count c;
strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
__entry->caller_ip = caller_ip;
__entry->btree_id = path->btree_id;
__entry->level = path->level;
TRACE_BPOS_assign(pos, path->pos);
- if (IS_ERR(b))
+
+	c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level);
+ __entry->self_read_count = c.n[SIX_LOCK_read];
+ __entry->self_intent_count = c.n[SIX_LOCK_intent];
+
+ if (IS_ERR(b)) {
strscpy(__entry->node, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node));
- else
+ } else {
+ c = six_lock_counts(&path->l[level].b->c.lock);
+ __entry->read_count = c.n[SIX_LOCK_read];
+ __entry->intent_count = c.n[SIX_LOCK_intent];
scnprintf(__entry->node, sizeof(__entry->node), "%px", b);
+ }
__entry->iter_lock_seq = path->l[level].lock_seq;
__entry->node_lock_seq = is_btree_node(path, level)
? six_lock_seq(&path->l[level].b->c.lock)
: 0;
),
- TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s iter seq %u lock seq %u",
+ TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u",
__entry->trans_fn,
(void *) __entry->caller_ip,
- bch2_btree_ids[__entry->btree_id],
+ bch2_btree_id_str(__entry->btree_id),
__entry->pos_inode,
__entry->pos_offset,
__entry->pos_snapshot,
__entry->level,
__entry->node,
+ __entry->self_read_count,
+ __entry->self_intent_count,
+ __entry->read_count,
+ __entry->intent_count,
__entry->iter_lock_seq,
__entry->node_lock_seq)
);
__entry->self_intent_count = c.n[SIX_LOCK_intent];
c = six_lock_counts(&path->l[level].b->c.lock);
__entry->read_count = c.n[SIX_LOCK_read];
- __entry->intent_count = c.n[SIX_LOCK_read];
+ __entry->intent_count = c.n[SIX_LOCK_intent];
__entry->iter_lock_seq = path->l[level].lock_seq;
__entry->node_lock_seq = is_btree_node(path, level)
? six_lock_seq(&path->l[level].b->c.lock)
TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u",
__entry->trans_fn,
(void *) __entry->caller_ip,
- bch2_btree_ids[__entry->btree_id],
+ bch2_btree_id_str(__entry->btree_id),
__entry->pos_inode,
__entry->pos_offset,
__entry->pos_snapshot,
);
TRACE_EVENT(move_data,
- TP_PROTO(struct bch_fs *c, u64 sectors_moved,
- u64 keys_moved),
- TP_ARGS(c, sectors_moved, keys_moved),
+ TP_PROTO(struct bch_fs *c,
+ struct bch_move_stats *stats),
+ TP_ARGS(c, stats),
TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u64, sectors_moved )
+ __field(dev_t, dev )
__field(u64, keys_moved )
+ __field(u64, keys_raced )
+ __field(u64, sectors_seen )
+ __field(u64, sectors_moved )
+ __field(u64, sectors_raced )
),
TP_fast_assign(
- __entry->dev = c->dev;
- __entry->sectors_moved = sectors_moved;
- __entry->keys_moved = keys_moved;
+ __entry->dev = c->dev;
+ __entry->keys_moved = atomic64_read(&stats->keys_moved);
+ __entry->keys_raced = atomic64_read(&stats->keys_raced);
+ __entry->sectors_seen = atomic64_read(&stats->sectors_seen);
+ __entry->sectors_moved = atomic64_read(&stats->sectors_moved);
+ __entry->sectors_raced = atomic64_read(&stats->sectors_raced);
),
- TP_printk("%d,%d sectors_moved %llu keys_moved %llu",
+ TP_printk("%d,%d keys moved %llu raced %llu"
+ "sectors seen %llu moved %llu raced %llu",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->sectors_moved, __entry->keys_moved)
+ __entry->keys_moved,
+ __entry->keys_raced,
+ __entry->sectors_seen,
+ __entry->sectors_moved,
+ __entry->sectors_raced)
);
TRACE_EVENT(evacuate_bucket,
TP_printk("%s %pS btree %s pos %llu:%llu:%u",
__entry->trans_fn,
(void *) __entry->caller_ip,
- bch2_btree_ids[__entry->btree_id],
+ bch2_btree_id_str(__entry->btree_id),
__entry->pos_inode,
__entry->pos_offset,
__entry->pos_snapshot)
TP_ARGS(trans, caller_ip, path)
);
+struct get_locks_fail;
+
TRACE_EVENT(trans_restart_upgrade,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
struct btree_path *path,
unsigned old_locks_want,
- unsigned new_locks_want),
- TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want),
+ unsigned new_locks_want,
+ struct get_locks_fail *f),
+ TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want, f),
TP_STRUCT__entry(
__array(char, trans_fn, 32 )
__field(u8, btree_id )
__field(u8, old_locks_want )
__field(u8, new_locks_want )
+ __field(u8, level )
+ __field(u32, path_seq )
+ __field(u32, node_seq )
+ __field(u32, path_alloc_seq )
+ __field(u32, downgrade_seq)
TRACE_BPOS_entries(pos)
),
__entry->btree_id = path->btree_id;
__entry->old_locks_want = old_locks_want;
__entry->new_locks_want = new_locks_want;
+ __entry->level = f->l;
+ __entry->path_seq = path->l[f->l].lock_seq;
+ __entry->node_seq = IS_ERR_OR_NULL(f->b) ? 0 : f->b->c.lock.seq;
+ __entry->path_alloc_seq = path->alloc_seq;
+ __entry->downgrade_seq = path->downgrade_seq;
TRACE_BPOS_assign(pos, path->pos)
),
- TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u",
+ TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u alloc_seq %u downgrade_seq %u",
__entry->trans_fn,
(void *) __entry->caller_ip,
- bch2_btree_ids[__entry->btree_id],
+ bch2_btree_id_str(__entry->btree_id),
__entry->pos_inode,
__entry->pos_offset,
__entry->pos_snapshot,
__entry->old_locks_want,
- __entry->new_locks_want)
+ __entry->new_locks_want,
+ __entry->level,
+ __entry->path_seq,
+ __entry->node_seq,
+ __entry->path_alloc_seq,
+ __entry->downgrade_seq)
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_relock,
TP_printk("%s %pS btree %s pos %llu:%llu:%u old_u64s %u new_u64s %u",
__entry->trans_fn,
(void *) __entry->caller_ip,
- bch2_btree_ids[__entry->btree_id],
+ bch2_btree_id_str(__entry->btree_id),
__entry->pos_inode,
__entry->pos_offset,
__entry->pos_snapshot,
__entry->new_u64s)
);
+TRACE_EVENT(path_downgrade,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip,
+ struct btree_path *path,
+ unsigned old_locks_want),
+ TP_ARGS(trans, caller_ip, path, old_locks_want),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __field(unsigned, old_locks_want )
+ __field(unsigned, new_locks_want )
+ __field(unsigned, btree )
+ TRACE_BPOS_entries(pos)
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->old_locks_want = old_locks_want;
+ __entry->new_locks_want = path->locks_want;
+ __entry->btree = path->btree_id;
+ TRACE_BPOS_assign(pos, path->pos);
+ ),
+
+ TP_printk("%s %pS locks_want %u -> %u %s %llu:%llu:%u",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ __entry->old_locks_want,
+ __entry->new_locks_want,
+ bch2_btree_id_str(__entry->btree),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot)
+);
+
DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
#define parse_or_ret(cp, _f) \
do { \
- int ret = _f; \
- if (ret < 0) \
- return ret; \
- cp += ret; \
+ int _ret = _f; \
+ if (_ret < 0) \
+ return _ret; \
+ cp += _ret; \
} while (0)
static int __bch2_strtou64_h(const char *cp, u64 *res)
while ((p = strsep(&s, ","))) {
int flag = match_string(list, -1, p);
+
if (flag < 0) {
ret = -1;
break;
int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task)
{
+#ifdef CONFIG_STACKTRACE
unsigned nr_entries = 0;
int ret = 0;
up_read(&task->signal->exec_update_lock);
return ret;
+#else
+ return 0;
+#endif
}
void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack)
return ret;
}
+#ifndef __KERNEL__
+#include <time.h>
+void bch2_prt_datetime(struct printbuf *out, time64_t sec)
+{
+ time_t t = sec;
+ char buf[64];
+ ctime_r(&t, buf);
+ strim(buf);
+ prt_str(out, buf);
+}
+#else
+void bch2_prt_datetime(struct printbuf *out, time64_t sec)
+{
+ char buf[64];
+ snprintf(buf, sizeof(buf), "%ptT", &sec);
+	prt_str(out, buf);
+}
+#endif
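
Both branches render a time64_t into the printbuf: the userspace build via ctime_r(), the kernel build via the %ptT vsprintf extension. A hypothetical call site (PRINTBUF and ktime_get_real_seconds() are existing helpers, not part of this patch):

struct printbuf buf = PRINTBUF;

bch2_prt_datetime(&buf, ktime_get_real_seconds());
pr_info("%s\n", buf.buf);
printbuf_exit(&buf);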
+
+static const struct time_unit {
+ const char *name;
+ u64 nsecs;
+} time_units[] = {
+ { "ns", 1 },
+ { "us", NSEC_PER_USEC },
+ { "ms", NSEC_PER_MSEC },
+ { "s", NSEC_PER_SEC },
+ { "m", (u64) NSEC_PER_SEC * 60},
+ { "h", (u64) NSEC_PER_SEC * 3600},
+ { "eon", U64_MAX },
+};
+
+static const struct time_unit *pick_time_units(u64 ns)
+{
+ const struct time_unit *u;
+
+ for (u = time_units;
+ u + 1 < time_units + ARRAY_SIZE(time_units) &&
+ ns >= u[1].nsecs << 1;
+ u++)
+ ;
+
+ return u;
+}
+
+void bch2_pr_time_units(struct printbuf *out, u64 ns)
+{
+ const struct time_unit *u = pick_time_units(ns);
+
+ prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
+}
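
pick_time_units() advances through the table while the value is at least twice the next unit, so the printed integer always has at least one significant digit. Expected outputs, given the table above:

/*
 *	bch2_pr_time_units(&buf, 1500);			-> "1500 ns"
 *	bch2_pr_time_units(&buf, 2500);			-> "2 us"
 *	bch2_pr_time_units(&buf, 90 * NSEC_PER_SEC);	-> "90 s"  (90 < 2 * 60)
 */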
+
/* time stats: */
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration);
stats->max_duration = max(stats->max_duration, duration);
stats->min_duration = min(stats->min_duration, duration);
+ stats->total_duration += duration;
bch2_quantiles_update(&stats->quantiles, duration);
}
}
}
+static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
+ struct bch2_time_stat_buffer *b)
+{
+ for (struct bch2_time_stat_buffer_entry *i = b->entries;
+ i < b->entries + ARRAY_SIZE(b->entries);
+ i++)
+ bch2_time_stats_update_one(stats, i->start, i->end);
+ b->nr = 0;
+}
+
static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
struct bch2_time_stat_buffer *b)
{
- struct bch2_time_stat_buffer_entry *i;
unsigned long flags;
spin_lock_irqsave(&stats->lock, flags);
- for (i = b->entries;
- i < b->entries + ARRAY_SIZE(b->entries);
- i++)
- bch2_time_stats_update_one(stats, i->start, i->end);
+ __bch2_time_stats_clear_buffer(stats, b);
spin_unlock_irqrestore(&stats->lock, flags);
-
- b->nr = 0;
}
void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
preempt_enable();
}
}
-#endif
-
-static const struct time_unit {
- const char *name;
- u64 nsecs;
-} time_units[] = {
- { "ns", 1 },
- { "us", NSEC_PER_USEC },
- { "ms", NSEC_PER_MSEC },
- { "s", NSEC_PER_SEC },
- { "m", (u64) NSEC_PER_SEC * 60},
- { "h", (u64) NSEC_PER_SEC * 3600},
- { "eon", U64_MAX },
-};
-
-static const struct time_unit *pick_time_units(u64 ns)
-{
- const struct time_unit *u;
-
- for (u = time_units;
- u + 1 < time_units + ARRAY_SIZE(time_units) &&
- ns >= u[1].nsecs << 1;
- u++)
- ;
-
- return u;
-}
-
-void bch2_pr_time_units(struct printbuf *out, u64 ns)
-{
- const struct time_unit *u = pick_time_units(ns);
-
- prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
-}
static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
{
prt_printf(out, "%s", u->name);
}
-#define TABSTOP_SIZE 12
-
static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns)
{
prt_str(out, name);
prt_newline(out);
}
+#define TABSTOP_SIZE 12
+
void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats)
{
const struct time_unit *u;
s64 f_mean = 0, d_mean = 0;
u64 q, last_q = 0, f_stddev = 0, d_stddev = 0;
int i;
+
+ if (stats->buffer) {
+ int cpu;
+
+ spin_lock_irq(&stats->lock);
+ for_each_possible_cpu(cpu)
+ __bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
+ spin_unlock_irq(&stats->lock);
+ }
+
/*
* avoid divide by zero
*/
pr_name_and_units(out, "min:", stats->min_duration);
pr_name_and_units(out, "max:", stats->max_duration);
+ pr_name_and_units(out, "total:", stats->total_duration);
prt_printf(out, "mean:");
prt_tab(out);
last_q = q;
}
}
+#else
+void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) {}
+#endif
void bch2_time_stats_exit(struct bch2_time_stats *stats)
{
/**
* bch2_ratelimit_delay() - return how long to delay until the next time to do
- * some work
- *
- * @d - the struct bch_ratelimit to update
- *
- * Returns the amount of time to delay by, in jiffies
+ * some work
+ * @d: the struct bch_ratelimit to update
+ * Returns: the amount of time to delay by, in jiffies
*/
u64 bch2_ratelimit_delay(struct bch_ratelimit *d)
{
/**
* bch2_ratelimit_increment() - increment @d by the amount of work done
- *
- * @d - the struct bch_ratelimit to update
- * @done - the amount of work done, in arbitrary units
+ * @d: the struct bch_ratelimit to update
+ * @done: the amount of work done, in arbitrary units
*/
void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done)
{
}
}
-int bch2_bio_alloc_pages_noprof(struct bio *bio, size_t size, gfp_t gfp_mask)
+int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask)
{
while (size) {
- struct page *page = alloc_pages_noprof(gfp_mask, 0);
+ struct page *page = alloc_pages(gfp_mask, 0);
unsigned len = min_t(size_t, PAGE_SIZE, size);
if (!page)
struct bvec_iter iter;
__bio_for_each_segment(bv, dst, iter, dst_iter) {
- void *dstp = kmap_atomic(bv.bv_page);
+ void *dstp = kmap_local_page(bv.bv_page);
+
memcpy(dstp + bv.bv_offset, src, bv.bv_len);
- kunmap_atomic(dstp);
+ kunmap_local(dstp);
src += bv.bv_len;
}
struct bvec_iter iter;
__bio_for_each_segment(bv, src, iter, src_iter) {
- void *srcp = kmap_atomic(bv.bv_page);
+ void *srcp = kmap_local_page(bv.bv_page);
+
memcpy(dst, srcp + bv.bv_offset, bv.bv_len);
- kunmap_atomic(srcp);
+ kunmap_local(srcp);
dst += bv.bv_len;
}
free_pages((unsigned long) p, get_order(size));
}
-static inline void *vpmalloc_noprof(size_t size, gfp_t gfp_mask)
+static inline void *vpmalloc(size_t size, gfp_t gfp_mask)
{
- return (void *) get_free_pages_noprof(gfp_mask|__GFP_NOWARN,
- get_order(size)) ?:
- __vmalloc_noprof(size, gfp_mask);
+ return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
+ get_order(size)) ?:
+ __vmalloc(size, gfp_mask);
}
-#define vpmalloc(_size, _gfp) alloc_hooks(vpmalloc_noprof(_size, _gfp))
static inline void kvpfree(void *p, size_t size)
{
vpfree(p, size);
}
-static inline void *kvpmalloc_noprof(size_t size, gfp_t gfp_mask)
+static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
{
return size < PAGE_SIZE
- ? kmalloc_noprof(size, gfp_mask)
- : vpmalloc_noprof(size, gfp_mask);
+ ? kmalloc(size, gfp_mask)
+ : vpmalloc(size, gfp_mask);
}
-#define kvpmalloc(_size, _gfp) alloc_hooks(kvpmalloc_noprof(_size, _gfp))
int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t);
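
With the allocation-profiling _noprof/alloc_hooks() wrappers dropped, kvpmalloc() is a plain inline again: sub-page sizes go through kmalloc(), larger ones try whole pages and fall back to __vmalloc(). The same size must be passed back to kvpfree(). A minimal sketch (fill_buffer() and nbytes are hypothetical):

static int fill_buffer(size_t nbytes)
{
	void *buf = kvpmalloc(nbytes, GFP_KERNEL);

	if (!buf)
		return -ENOMEM;
	memset(buf, 0, nbytes);		/* ... use buf ... */
	kvpfree(buf, nbytes);		/* size must match the allocation */
	return 0;
}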
#define prt_bitflags(...) bch2_prt_bitflags(__VA_ARGS__)
void bch2_pr_time_units(struct printbuf *, u64);
-
-#ifdef __KERNEL__
-static inline void pr_time(struct printbuf *out, u64 time)
-{
- prt_printf(out, "%llu", time);
-}
-#else
-#include <time.h>
-static inline void pr_time(struct printbuf *out, u64 _time)
-{
- char time_str[64];
- time_t time = _time;
- struct tm *tm = localtime(&time);
- size_t err = strftime(time_str, sizeof(time_str), "%c", tm);
- if (!err)
- prt_printf(out, "(formatting error)");
- else
- prt_printf(out, "%s", time_str);
-}
-#endif
+void bch2_prt_datetime(struct printbuf *, time64_t);
#ifdef __KERNEL__
static inline void uuid_unparse_lower(u8 *uuid, char *out)
struct bch2_time_stats {
spinlock_t lock;
/* all fields are in nanoseconds */
- u64 max_duration;
u64 min_duration;
+ u64 max_duration;
+ u64 total_duration;
u64 max_freq;
u64 min_freq;
u64 last_event;
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
-#else
-static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {}
-#endif
static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
{
__bch2_time_stats_update(stats, start, local_clock());
}
+static inline bool track_event_change(struct bch2_time_stats *stats,
+ u64 *start, bool v)
+{
+ if (v != !!*start) {
+ if (!v) {
+ bch2_time_stats_update(stats, *start);
+ *start = 0;
+ } else {
+ *start = local_clock() ?: 1;
+ return true;
+ }
+ }
+
+ return false;
+}
+#else
+static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {}
+static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) {}
+static inline bool track_event_change(struct bch2_time_stats *stats,
+ u64 *start, bool v)
+{
+ bool ret = v && !*start;
+ *start = v;
+ return ret;
+}
+#endif
+
void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *);
void bch2_time_stats_exit(struct bch2_time_stats *);
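
track_event_change() turns a level-triggered condition into accumulated duration: on the 0->1 edge it latches a non-zero start time and returns true, on the 1->0 edge it feeds the elapsed time into the stats. A hypothetical call site (the stats/start pair would live in some long-lived structure):

static void update_congested(struct bch2_time_stats *stats,
			     u64 *start, bool congested)
{
	if (track_event_change(stats, start, congested))
		pr_debug("congestion started\n");	/* rising edge only */
}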
s64 last_change;
s64 last_target;
- /* If true, the rate will not increase if bch2_ratelimit_delay()
- * is not being called often enough. */
+ /*
+ * If true, the rate will not increase if bch2_ratelimit_delay()
+ * is not being called often enough.
+ */
bool backpressure;
};
}
void bch2_bio_map(struct bio *bio, void *base, size_t);
-int bch2_bio_alloc_pages_noprof(struct bio *, size_t, gfp_t);
-#define bch2_bio_alloc_pages(_bio, _size, _gfp) \
- alloc_hooks(bch2_bio_alloc_pages_noprof(_bio, _size, _gfp))
+int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t);
static inline sector_t bdev_sectors(struct block_device *bdev)
{
{
#ifdef CONFIG_X86_64
long d0, d1, d2;
+
asm volatile("rep ; movsq"
: "=&c" (d0), "=&D" (d1), "=&S" (d2)
: "0" (u64s), "1" (dst), "2" (src)
#ifdef CONFIG_X86_64
long d0, d1, d2;
+
asm volatile("std ;\n"
"rep ; movsq\n"
"cld ;\n"
#define bubble_sort(_base, _nr, _cmp) \
do { \
- ssize_t _i, _end; \
+ ssize_t _i, _last; \
bool _swapped = true; \
\
- for (_end = (ssize_t) (_nr) - 1; _end > 0 && _swapped; --_end) {\
+	for (_last = (ssize_t) (_nr) - 1; _last > 0 && _swapped; --_last) {\
_swapped = false; \
- for (_i = 0; _i < _end; _i++) \
+ for (_i = 0; _i < _last; _i++) \
if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) { \
swap((_base)[_i], (_base)[_i + 1]); \
_swapped = true; \
return cmp_int(l, r);
}
+static inline int cmp_le32(__le32 l, __le32 r)
+{
+ return cmp_int(le32_to_cpu(l), le32_to_cpu(r));
+}
+
#include <linux/uuid.h>
#endif /* _BCACHEFS_UTIL_H */
/**
* bch2_varint_encode - encode a variable length integer
- * @out - destination to encode to
- * @v - unsigned integer to encode
- *
- * Returns the size in bytes of the encoded integer - at most 9 bytes
+ * @out: destination to encode to
+ * @v: unsigned integer to encode
+ * Returns: size in bytes of the encoded integer - at most 9 bytes
*/
int bch2_varint_encode(u8 *out, u64 v)
{
/**
* bch2_varint_decode - encode a variable length integer
- * @in - varint to decode
- * @end - end of buffer to decode from
- * @out - on success, decoded integer
- *
- * Returns the size in bytes of the decoded integer - or -1 on failure (would
+ * @in: varint to decode
+ * @end: end of buffer to decode from
+ * @out: on success, decoded integer
+ * Returns: size in bytes of the decoded integer - or -1 on failure (would
* have read past the end of the buffer)
*/
int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out)
if (likely(bytes < 9)) {
__le64 v_le = 0;
+
memcpy(&v_le, in, bytes);
v = le64_to_cpu(v_le);
v >>= bytes;
/**
* bch2_varint_encode_fast - fast version of bch2_varint_encode
+ * @out: destination to encode to
+ * @v: unsigned integer to encode
+ * Returns: size in bytes of the encoded integer - at most 9 bytes
*
* This version assumes it's always safe to write 8 bytes to @out, even if the
* encoded integer would be smaller.
/**
* bch2_varint_decode_fast - fast version of bch2_varint_decode
+ * @in: varint to decode
+ * @end: end of buffer to decode from
+ * @out: on success, decoded integer
+ * Returns: size in bytes of the decoded integer - or -1 on failure (would
+ * have read past the end of the buffer)
*
* This version assumes that it is safe to read at most 8 bytes past the end of
* @end (we still return an error if the varint extends past @end).
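
Per the kernel-doc above, encode returns the number of bytes written (at most 9) and decode returns the number of bytes consumed, or -1 if the varint would read past @end. A round-trip check relying only on that documented contract:

static void varint_roundtrip_check(void)
{
	u8 buf[9];
	u64 v;
	int enc = bch2_varint_encode(buf, 1234);
	int dec = bch2_varint_decode(buf, buf + enc, &v);

	BUG_ON(enc < 1 || enc > 9);
	BUG_ON(dec != enc || v != 1234);
}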
(round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9)
#define vstruct_next(_s) \
- ((typeof(_s)) ((_s)->_data + __vstruct_u64s(_s)))
+ ((typeof(_s)) ((u64 *) (_s)->_data + __vstruct_u64s(_s)))
#define vstruct_last(_s) \
- ((typeof(&(_s)->start[0])) ((_s)->_data + __vstruct_u64s(_s)))
+ ((typeof(&(_s)->start[0])) ((u64 *) (_s)->_data + __vstruct_u64s(_s)))
#define vstruct_end(_s) \
- ((void *) ((_s)->_data + __vstruct_u64s(_s)))
+ ((void *) ((u64 *) (_s)->_data + __vstruct_u64s(_s)))
#define vstruct_for_each(_s, _i) \
for (_i = (_s)->start; \
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "acl.h"
#include "bkey_methods.h"
#include "btree_update.h"
#include "extents.h"
.cmp_bkey = xattr_cmp_bkey,
};
-int bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_xattr_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
- const struct xattr_handler *handler;
struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
+ unsigned val_u64s = xattr_val_u64s(xattr.v->x_name_len,
+ le16_to_cpu(xattr.v->x_val_len));
+ int ret = 0;
- if (bkey_val_u64s(k.k) <
- xattr_val_u64s(xattr.v->x_name_len,
- le16_to_cpu(xattr.v->x_val_len))) {
- prt_printf(err, "value too small (%zu < %u)",
- bkey_val_u64s(k.k),
- xattr_val_u64s(xattr.v->x_name_len,
- le16_to_cpu(xattr.v->x_val_len)));
- return -BCH_ERR_invalid_bkey;
- }
+ bkey_fsck_err_on(bkey_val_u64s(k.k) < val_u64s, c, err,
+ xattr_val_size_too_small,
+ "value too small (%zu < %u)",
+ bkey_val_u64s(k.k), val_u64s);
/* XXX why +4 ? */
- if (bkey_val_u64s(k.k) >
- xattr_val_u64s(xattr.v->x_name_len,
- le16_to_cpu(xattr.v->x_val_len) + 4)) {
- prt_printf(err, "value too big (%zu > %u)",
- bkey_val_u64s(k.k),
- xattr_val_u64s(xattr.v->x_name_len,
- le16_to_cpu(xattr.v->x_val_len) + 4));
- return -BCH_ERR_invalid_bkey;
- }
-
- handler = bch2_xattr_type_to_handler(xattr.v->x_type);
- if (!handler) {
- prt_printf(err, "invalid type (%u)", xattr.v->x_type);
- return -BCH_ERR_invalid_bkey;
- }
-
- if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len)) {
- prt_printf(err, "xattr name has invalid characters");
- return -BCH_ERR_invalid_bkey;
- }
-
- return 0;
+ val_u64s = xattr_val_u64s(xattr.v->x_name_len,
+ le16_to_cpu(xattr.v->x_val_len) + 4);
+
+ bkey_fsck_err_on(bkey_val_u64s(k.k) > val_u64s, c, err,
+ xattr_val_size_too_big,
+ "value too big (%zu > %u)",
+ bkey_val_u64s(k.k), val_u64s);
+
+ bkey_fsck_err_on(!bch2_xattr_type_to_handler(xattr.v->x_type), c, err,
+ xattr_invalid_type,
+ "invalid type (%u)", xattr.v->x_type);
+
+ bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len), c, err,
+ xattr_name_invalid_chars,
+ "xattr name has invalid characters");
+fsck_err:
+ return ret;
}
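
The rewrite replaces open-coded prt_printf() + return -BCH_ERR_invalid_bkey pairs with bkey_fsck_err_on(), which additionally tags each failure with a named error type for fsck accounting. Judging from the surrounding code (ret initialized to 0, the fsck_err label returning it), the macro behaves roughly like this simplified sketch; the real macro also drives fsck's repair and ratelimit machinery:

/* Simplified sketch -- NOT the real macro: */
#define bkey_fsck_err_on(cond, c, err, _err_type, ...)			\
do {									\
	if (cond) {							\
		prt_printf(err, __VA_ARGS__);				\
		ret = -BCH_ERR_invalid_bkey;				\
		goto fsck_err;						\
	}								\
} while (0)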
void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c,
xattr.v->x_name,
le16_to_cpu(xattr.v->x_val_len),
(char *) xattr_val(xattr.v));
+
+ if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS ||
+ xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT) {
+ prt_char(out, ' ');
+ bch2_acl_to_text(out, xattr_val(xattr.v),
+ le16_to_cpu(xattr.v->x_val_len));
+ }
}
static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode,
{
struct bch_fs *c = dentry->d_sb->s_fs_info;
struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
- struct btree_trans trans;
+ struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
struct xattr_buf buf = { .buf = buffer, .len = buffer_size };
u64 offset = 0, inum = inode->ei_inode.bi_inum;
u32 snapshot;
int ret;
-
- bch2_trans_init(&trans, c, 0, 0);
retry:
- bch2_trans_begin(&trans);
+ bch2_trans_begin(trans);
iter = (struct btree_iter) { NULL };
- ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot);
+ ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot);
if (ret)
goto err;
- for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_xattrs,
+ for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_xattrs,
SPOS(inum, offset, snapshot),
POS(inum, U64_MAX), 0, k, ret) {
if (k.k->type != KEY_TYPE_xattr)
}
offset = iter.pos.offset;
- bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_iter_exit(trans, &iter);
err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
- bch2_trans_exit(&trans);
+ bch2_trans_put(trans);
if (ret)
goto out;
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
int ret = bch2_trans_do(c, NULL, NULL, 0,
- bch2_xattr_get_trans(&trans, inode, name, buffer, size, handler->flags));
+ bch2_xattr_get_trans(trans, inode, name, buffer, size, handler->flags));
return bch2_err_class(ret);
}
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
struct bch_inode_unpacked inode_u;
- struct btree_trans trans;
int ret;
- bch2_trans_init(&trans, c, 0, 0);
-
- ret = commit_do(&trans, NULL, NULL, 0,
- bch2_xattr_set(&trans, inode_inum(inode), &inode_u,
+ ret = bch2_trans_run(c,
+ commit_do(trans, NULL, NULL, 0,
+ bch2_xattr_set(trans, inode_inum(inode), &inode_u,
&hash, name, value, size,
- handler->flags, flags));
- if (!ret)
- bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME);
- bch2_trans_exit(&trans);
+ handler->flags, flags)) ?:
+ (bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME), 0));
return bch2_err_class(ret);
}
bool defined;
};
-static int inode_opt_set_fn(struct bch_inode_info *inode,
+static int inode_opt_set_fn(struct btree_trans *trans,
+ struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
void *p)
{
s.v = v + 1;
s.defined = true;
} else {
+ /*
+ * Check if this option was set on the parent - if so, switched
+ * back to inheriting from the parent:
+ *
+ * rename() also has to deal with keeping inherited options up
+ * to date - see bch2_reinherit_attrs()
+ */
+ spin_lock(&dentry->d_lock);
if (!IS_ROOT(dentry)) {
struct bch_inode_info *dir =
to_bch_ei(d_inode(dentry->d_parent));
} else {
s.v = 0;
}
+ spin_unlock(&dentry->d_lock);
s.defined = false;
}
if (value &&
(opt_id == Opt_background_compression ||
opt_id == Opt_background_target))
- bch2_rebalance_add_work(c, inode->v.i_blocks);
+ bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum);
return bch2_err_class(ret);
}
extern const struct bch_hash_desc bch2_xattr_hash_desc;
-int bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_xattr_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
return bytes >> 9;
}
-void blkdev_put(struct block_device *bdev, fmode_t mode)
+void blkdev_put(struct block_device *bdev, void *holder)
{
fdatasync(bdev->bd_fd);
close(bdev->bd_sync_fd);
free(bdev);
}
-struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
- void *holder)
+struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode,
+ void *holder, const struct blk_holder_ops *hop)
{
struct block_device *bdev;
int fd, sync_fd, buffered_fd, flags = 0;
- if ((mode & (FMODE_READ|FMODE_WRITE)) == (FMODE_READ|FMODE_WRITE))
+ if ((mode & (BLK_OPEN_READ|BLK_OPEN_WRITE)) == (BLK_OPEN_READ|BLK_OPEN_WRITE))
flags = O_RDWR;
- else if (mode & FMODE_READ)
+ else if (mode & BLK_OPEN_READ)
flags = O_RDONLY;
- else if (mode & FMODE_WRITE)
+ else if (mode & BLK_OPEN_WRITE)
flags = O_WRONLY;
- if (!(mode & FMODE_BUFFERED))
+ if (!(mode & BLK_OPEN_BUFFERED))
flags |= O_DIRECT;
#if 0
/* using O_EXCL doesn't work with opening twice for an O_SYNC fd: */
- if (mode & FMODE_EXCL)
+ if (mode & BLK_OPEN_EXCL)
flags |= O_EXCL;
#endif
buffered_fd = open(path, flags & ~O_DIRECT);
sync_check(bio, ret);
}
+static DECLARE_WAIT_QUEUE_HEAD(aio_events_completed);
+
static int aio_completion_thread(void *arg)
{
struct io_event events[8], *ev;
continue;
if (ret < 0)
die("io_getevents() error: %s", strerror(-ret));
+ if (ret)
+ wake_up(&aio_events_completed);
for (ev = events; ev < events + ret; ev++) {
struct bio *bio = (struct bio *) ev->data;
}, *iocbp = &iocb;
atomic_inc(&running_requests);
- ret = io_submit(aio_ctx, 1, &iocbp);
+
+ wait_event(aio_events_completed,
+		   (ret = io_submit(aio_ctx, 1, &iocbp)) != -EAGAIN);
+
if (ret != 1)
die("io_submit err: %s", strerror(-ret));
}
BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR));
if (!r) {
+ smp_acquire__after_ctrl_dep();
+
+ cl->closure_get_happened = false;
+
if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
atomic_set(&cl->remaining,
CLOSURE_REMAINING_INITIALIZER);
/* For clearing flags with the same atomic op as a put */
void closure_sub(struct closure *cl, int v)
{
- closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining));
+ closure_put_after_sub(cl, atomic_sub_return_release(v, &cl->remaining));
}
EXPORT_SYMBOL(closure_sub);
*/
void closure_put(struct closure *cl)
{
- closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
+ closure_put_after_sub(cl, atomic_dec_return_release(&cl->remaining));
}
EXPORT_SYMBOL(closure_put);
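
The switch to the _release variants here, together with the smp_acquire__after_ctrl_dep() added in closure_put_after_sub() above, is the standard refcount ordering recipe: every put publishes its prior writes with release semantics, and only the thread that sees the count reach zero pays for an acquire barrier (piggybacking on the control dependency of the zero test) before touching the object. A generic sketch of the pattern, not the actual closure fields:

static void obj_put(atomic_t *ref, void (*destroy)(void *), void *obj)
{
	/* release: our prior writes to *obj happen-before the final put */
	if (!atomic_dec_return_release(ref)) {
		/* upgrade the control dependency on the zero test to
		 * acquire, so every other putter's writes are visible: */
		smp_acquire__after_ctrl_dep();
		destroy(obj);
	}
}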
if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
return false;
+ cl->closure_get_happened = true;
closure_set_waiting(cl, _RET_IP_);
atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
llist_add(&cl->list, &waitlist->list);
static int shrinker_thread(void *arg)
{
while (!kthread_should_stop()) {
- sleep(1);
+ struct timespec to;
+ int v;
+
+ clock_gettime(CLOCK_MONOTONIC, &to);
+ to.tv_sec += 1;
+ __set_current_state(TASK_INTERRUPTIBLE);
+ errno = 0;
+ while ((v = READ_ONCE(current->state)) != TASK_RUNNING &&
+ errno != ETIMEDOUT)
+ futex(¤t->state, FUTEX_WAIT_BITSET|FUTEX_PRIVATE_FLAG,
+ v, &to, NULL, (uint32_t)~0);
+ if (kthread_should_stop())
+ break;
+ if (v != TASK_RUNNING)
+ __set_current_state(TASK_RUNNING);
run_shrinkers(GFP_KERNEL, false);
}
+++ /dev/null
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/export.h>
-#include <linux/log2.h>
-#include <linux/percpu.h>
-#include <linux/preempt.h>
-#include <linux/rcupdate.h>
-#include <linux/sched.h>
-#include <linux/sched/clock.h>
-#include <linux/sched/rt.h>
-#include <linux/six.h>
-#include <linux/slab.h>
-
-#include <trace/events/lock.h>
-
-#ifdef DEBUG
-#define EBUG_ON(cond) BUG_ON(cond)
-#else
-#define EBUG_ON(cond) do {} while (0)
-#endif
-
-#define six_acquire(l, t, r, ip) lock_acquire(l, 0, t, r, 1, NULL, ip)
-#define six_release(l, ip) lock_release(l, ip)
-
-static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type);
-
-#define SIX_LOCK_HELD_read_OFFSET 0
-#define SIX_LOCK_HELD_read ~(~0U << 26)
-#define SIX_LOCK_HELD_intent (1U << 26)
-#define SIX_LOCK_HELD_write (1U << 27)
-#define SIX_LOCK_WAITING_read (1U << (28 + SIX_LOCK_read))
-#define SIX_LOCK_WAITING_intent (1U << (28 + SIX_LOCK_intent))
-#define SIX_LOCK_WAITING_write (1U << (28 + SIX_LOCK_write))
-#define SIX_LOCK_NOSPIN (1U << 31)
-
-struct six_lock_vals {
- /* Value we add to the lock in order to take the lock: */
- u32 lock_val;
-
- /* If the lock has this value (used as a mask), taking the lock fails: */
- u32 lock_fail;
-
- /* Mask that indicates lock is held for this type: */
- u32 held_mask;
-
- /* Waitlist we wakeup when releasing the lock: */
- enum six_lock_type unlock_wakeup;
-};
-
-static const struct six_lock_vals l[] = {
- [SIX_LOCK_read] = {
- .lock_val = 1U << SIX_LOCK_HELD_read_OFFSET,
- .lock_fail = SIX_LOCK_HELD_write,
- .held_mask = SIX_LOCK_HELD_read,
- .unlock_wakeup = SIX_LOCK_write,
- },
- [SIX_LOCK_intent] = {
- .lock_val = SIX_LOCK_HELD_intent,
- .lock_fail = SIX_LOCK_HELD_intent,
- .held_mask = SIX_LOCK_HELD_intent,
- .unlock_wakeup = SIX_LOCK_intent,
- },
- [SIX_LOCK_write] = {
- .lock_val = SIX_LOCK_HELD_write,
- .lock_fail = SIX_LOCK_HELD_read,
- .held_mask = SIX_LOCK_HELD_write,
- .unlock_wakeup = SIX_LOCK_read,
- },
-};
-
-static inline void six_set_bitmask(struct six_lock *lock, u32 mask)
-{
- if ((atomic_read(&lock->state) & mask) != mask)
- atomic_or(mask, &lock->state);
-}
-
-static inline void six_clear_bitmask(struct six_lock *lock, u32 mask)
-{
- if (atomic_read(&lock->state) & mask)
- atomic_and(~mask, &lock->state);
-}
-
-static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type,
- u32 old, struct task_struct *owner)
-{
- if (type != SIX_LOCK_intent)
- return;
-
- if (!(old & SIX_LOCK_HELD_intent)) {
- EBUG_ON(lock->owner);
- lock->owner = owner;
- } else {
- EBUG_ON(lock->owner != current);
- }
-}
-
-static inline unsigned pcpu_read_count(struct six_lock *lock)
-{
- unsigned read_count = 0;
- int cpu;
-
- for_each_possible_cpu(cpu)
- read_count += *per_cpu_ptr(lock->readers, cpu);
- return read_count;
-}
-
-/*
- * __do_six_trylock() - main trylock routine
- *
- * Returns 1 on success, 0 on failure
- *
- * In percpu reader mode, a failed trylock may cause a spurious trylock failure
- * for anoter thread taking the competing lock type, and we may havve to do a
- * wakeup: when a wakeup is required, we return -1 - wakeup_type.
- */
-static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type,
- struct task_struct *task, bool try)
-{
- int ret;
- u32 old;
-
- EBUG_ON(type == SIX_LOCK_write && lock->owner != task);
- EBUG_ON(type == SIX_LOCK_write &&
- (try != !(atomic_read(&lock->state) & SIX_LOCK_HELD_write)));
-
- /*
- * Percpu reader mode:
- *
- * The basic idea behind this algorithm is that you can implement a lock
- * between two threads without any atomics, just memory barriers:
- *
- * For two threads you'll need two variables, one variable for "thread a
- * has the lock" and another for "thread b has the lock".
- *
- * To take the lock, a thread sets its variable indicating that it holds
- * the lock, then issues a full memory barrier, then reads from the
- * other thread's variable to check if the other thread thinks it has
- * the lock. If we raced, we backoff and retry/sleep.
- *
- * Failure to take the lock may cause a spurious trylock failure in
- * another thread, because we temporarily set the lock to indicate that
- * we held it. This would be a problem for a thread in six_lock(), when
- * they are calling trylock after adding themself to the waitlist and
- * prior to sleeping.
- *
- * Therefore, if we fail to get the lock, and there were waiters of the
- * type we conflict with, we will have to issue a wakeup.
- *
- * Since we may be called under wait_lock (and by the wakeup code
- * itself), we return that the wakeup has to be done instead of doing it
- * here.
- */
- if (type == SIX_LOCK_read && lock->readers) {
- preempt_disable();
- this_cpu_inc(*lock->readers); /* signal that we own lock */
-
- smp_mb();
-
- old = atomic_read(&lock->state);
- ret = !(old & l[type].lock_fail);
-
- this_cpu_sub(*lock->readers, !ret);
- preempt_enable();
-
- if (!ret && (old & SIX_LOCK_WAITING_write))
- ret = -1 - SIX_LOCK_write;
- } else if (type == SIX_LOCK_write && lock->readers) {
- if (try) {
- atomic_add(SIX_LOCK_HELD_write, &lock->state);
- smp_mb__after_atomic();
- }
-
- ret = !pcpu_read_count(lock);
-
- if (try && !ret) {
- old = atomic_sub_return(SIX_LOCK_HELD_write, &lock->state);
- if (old & SIX_LOCK_WAITING_read)
- ret = -1 - SIX_LOCK_read;
- }
- } else {
- old = atomic_read(&lock->state);
- do {
- ret = !(old & l[type].lock_fail);
- if (!ret || (type == SIX_LOCK_write && !try)) {
- smp_mb();
- break;
- }
- } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, old + l[type].lock_val));
-
- EBUG_ON(ret && !(atomic_read(&lock->state) & l[type].held_mask));
- }
-
- if (ret > 0)
- six_set_owner(lock, type, old, task);
-
- EBUG_ON(type == SIX_LOCK_write && try && ret <= 0 &&
- (atomic_read(&lock->state) & SIX_LOCK_HELD_write));
-
- return ret;
-}
-
-static void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type)
-{
- struct six_lock_waiter *w, *next;
- struct task_struct *task;
- bool saw_one;
- int ret;
-again:
- ret = 0;
- saw_one = false;
- raw_spin_lock(&lock->wait_lock);
-
- list_for_each_entry_safe(w, next, &lock->wait_list, list) {
- if (w->lock_want != lock_type)
- continue;
-
- if (saw_one && lock_type != SIX_LOCK_read)
- goto unlock;
- saw_one = true;
-
- ret = __do_six_trylock(lock, lock_type, w->task, false);
- if (ret <= 0)
- goto unlock;
-
- __list_del(w->list.prev, w->list.next);
- task = w->task;
- /*
- * Do no writes to @w besides setting lock_acquired - otherwise
- * we would need a memory barrier:
- */
- barrier();
- w->lock_acquired = true;
- wake_up_process(task);
- }
-
- six_clear_bitmask(lock, SIX_LOCK_WAITING_read << lock_type);
-unlock:
- raw_spin_unlock(&lock->wait_lock);
-
- if (ret < 0) {
- lock_type = -ret - 1;
- goto again;
- }
-}
-
-__always_inline
-static void six_lock_wakeup(struct six_lock *lock, u32 state,
- enum six_lock_type lock_type)
-{
- if (lock_type == SIX_LOCK_write && (state & SIX_LOCK_HELD_read))
- return;
-
- if (!(state & (SIX_LOCK_WAITING_read << lock_type)))
- return;
-
- __six_lock_wakeup(lock, lock_type);
-}
-
-__always_inline
-static bool do_six_trylock(struct six_lock *lock, enum six_lock_type type, bool try)
-{
- int ret;
-
- ret = __do_six_trylock(lock, type, current, try);
- if (ret < 0)
- __six_lock_wakeup(lock, -ret - 1);
-
- return ret > 0;
-}
-
-/**
- * six_trylock_ip - attempt to take a six lock without blocking
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
- *
- * Return: true on success, false on failure.
- */
-bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip)
-{
- if (!do_six_trylock(lock, type, true))
- return false;
-
- if (type != SIX_LOCK_write)
- six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip);
- return true;
-}
-EXPORT_SYMBOL_GPL(six_trylock_ip);
-
-/**
- * six_relock_ip - attempt to re-take a lock that was held previously
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @seq: lock sequence number obtained from six_lock_seq() while lock was
- * held previously
- * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
- *
- * Return: true on success, false on failure.
- */
-bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
- unsigned seq, unsigned long ip)
-{
- if (six_lock_seq(lock) != seq || !six_trylock_ip(lock, type, ip))
- return false;
-
- if (six_lock_seq(lock) != seq) {
- six_unlock_ip(lock, type, ip);
- return false;
- }
-
- return true;
-}
-EXPORT_SYMBOL_GPL(six_relock_ip);
-
-#ifdef CONFIG_LOCK_SPIN_ON_OWNER
-
-static inline bool six_can_spin_on_owner(struct six_lock *lock)
-{
- struct task_struct *owner;
- bool ret;
-
- if (need_resched())
- return false;
-
- rcu_read_lock();
- owner = READ_ONCE(lock->owner);
- ret = !owner || owner_on_cpu(owner);
- rcu_read_unlock();
-
- return ret;
-}
-
-static inline bool six_spin_on_owner(struct six_lock *lock,
- struct task_struct *owner,
- u64 end_time)
-{
- bool ret = true;
- unsigned loop = 0;
-
- rcu_read_lock();
- while (lock->owner == owner) {
- /*
- * Ensure we emit the owner->on_cpu, dereference _after_
- * checking lock->owner still matches owner. If that fails,
- * owner might point to freed memory. If it still matches,
- * the rcu_read_lock() ensures the memory stays valid.
- */
- barrier();
-
- if (!owner_on_cpu(owner) || need_resched()) {
- ret = false;
- break;
- }
-
- if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) {
- six_set_bitmask(lock, SIX_LOCK_NOSPIN);
- ret = false;
- break;
- }
-
- cpu_relax();
- }
- rcu_read_unlock();
-
- return ret;
-}
-
-static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
-{
- struct task_struct *task = current;
- u64 end_time;
-
- if (type == SIX_LOCK_write)
- return false;
-
- preempt_disable();
- if (!six_can_spin_on_owner(lock))
- goto fail;
-
- if (!osq_lock(&lock->osq))
- goto fail;
-
- end_time = sched_clock() + 10 * NSEC_PER_USEC;
-
- while (1) {
- struct task_struct *owner;
-
- /*
- * If there's an owner, wait for it to either
- * release the lock or go to sleep.
- */
- owner = READ_ONCE(lock->owner);
- if (owner && !six_spin_on_owner(lock, owner, end_time))
- break;
-
- if (do_six_trylock(lock, type, false)) {
- osq_unlock(&lock->osq);
- preempt_enable();
- return true;
- }
-
- /*
- * When there's no owner, we might have preempted between the
- * owner acquiring the lock and setting the owner field. If
- * we're an RT task that will live-lock because we won't let
- * the owner complete.
- */
- if (!owner && (need_resched() || rt_task(task)))
- break;
-
- /*
- * The cpu_relax() call is a compiler barrier which forces
- * everything in this loop to be re-loaded. We don't need
- * memory barriers as we'll eventually observe the right
- * values at the cost of a few extra spins.
- */
- cpu_relax();
- }
-
- osq_unlock(&lock->osq);
-fail:
- preempt_enable();
-
- /*
- * If we fell out of the spin path because of need_resched(),
- * reschedule now, before we try-lock again. This avoids getting
- * scheduled out right after we obtained the lock.
- */
- if (need_resched())
- schedule();
-
- return false;
-}
-
-#else /* CONFIG_LOCK_SPIN_ON_OWNER */
-
-static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
-{
- return false;
-}
-
-#endif
-
-noinline
-static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type,
- struct six_lock_waiter *wait,
- six_lock_should_sleep_fn should_sleep_fn, void *p,
- unsigned long ip)
-{
- int ret = 0;
-
- if (type == SIX_LOCK_write) {
- EBUG_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_write);
- atomic_add(SIX_LOCK_HELD_write, &lock->state);
- smp_mb__after_atomic();
- }
-
- trace_contention_begin(lock, 0);
- lock_contended(&lock->dep_map, ip);
-
- if (six_optimistic_spin(lock, type))
- goto out;
-
- wait->task = current;
- wait->lock_want = type;
- wait->lock_acquired = false;
-
- raw_spin_lock(&lock->wait_lock);
- six_set_bitmask(lock, SIX_LOCK_WAITING_read << type);
- /*
- * Retry taking the lock after taking waitlist lock, in case we raced
- * with an unlock:
- */
- ret = __do_six_trylock(lock, type, current, false);
- if (ret <= 0) {
- wait->start_time = local_clock();
-
- if (!list_empty(&lock->wait_list)) {
- struct six_lock_waiter *last =
- list_last_entry(&lock->wait_list,
- struct six_lock_waiter, list);
-
- if (time_before_eq64(wait->start_time, last->start_time))
- wait->start_time = last->start_time + 1;
- }
-
- list_add_tail(&wait->list, &lock->wait_list);
- }
- raw_spin_unlock(&lock->wait_lock);
-
- if (unlikely(ret > 0)) {
- ret = 0;
- goto out;
- }
-
- if (unlikely(ret < 0)) {
- __six_lock_wakeup(lock, -ret - 1);
- ret = 0;
- }
-
- while (1) {
- set_current_state(TASK_UNINTERRUPTIBLE);
-
- if (wait->lock_acquired)
- break;
-
- ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
- if (unlikely(ret)) {
- raw_spin_lock(&lock->wait_lock);
- if (!wait->lock_acquired)
- list_del(&wait->list);
- raw_spin_unlock(&lock->wait_lock);
-
- if (unlikely(wait->lock_acquired))
- do_six_unlock_type(lock, type);
- break;
- }
-
- schedule();
- }
-
- __set_current_state(TASK_RUNNING);
-out:
- if (ret && type == SIX_LOCK_write) {
- six_clear_bitmask(lock, SIX_LOCK_HELD_write);
- six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read);
- }
- trace_contention_end(lock, 0);
-
- return ret;
-}
-
-/**
- * six_lock_ip_waiter - take a lock, with full waitlist interface
- * @lock: lock to take
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @wait: pointer to wait object, which will be added to lock's waitlist
- * @should_sleep_fn: callback run after adding to waitlist, immediately prior
- * to scheduling
- * @p: passed through to @should_sleep_fn
- * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
- *
- * This is the most general six_lock() variant, with parameters to support full
- * cycle detection for deadlock avoidance.
- *
- * The code calling this function must implement tracking of held locks, and the
- * @wait object should be embedded into the struct that tracks held locks -
- * which must also be accessible in a thread-safe way.
- *
- * @should_sleep_fn should invoke the cycle detector; it should walk each
- * lock's waiters, and for each waiter recursively walk their held locks.
- *
- * When this function must block, @wait will be added to @lock's waitlist before
- * calling trylock, and before calling @should_sleep_fn, and @wait will not be
- * removed from the lock waitlist until the lock has been successfully acquired,
- * or we abort.
- *
- * @wait.start_time will be monotonically increasing for any given waitlist, and
- * thus may be used as a loop cursor.
- *
- * Return: 0 on success, or the return code from @should_sleep_fn on failure.
- */
-int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type,
- struct six_lock_waiter *wait,
- six_lock_should_sleep_fn should_sleep_fn, void *p,
- unsigned long ip)
-{
- int ret;
-
- wait->start_time = 0;
-
- if (type != SIX_LOCK_write)
- six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, ip);
-
- ret = do_six_trylock(lock, type, true) ? 0
- : six_lock_slowpath(lock, type, wait, should_sleep_fn, p, ip);
-
- if (ret && type != SIX_LOCK_write)
- six_release(&lock->dep_map, ip);
- if (!ret)
- lock_acquired(&lock->dep_map, ip);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(six_lock_ip_waiter);
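As a usage illustration (not part of this patch): a minimal caller-side sketch
of six_lock_ip_waiter(), in which struct my_trans, my_cycle_check() and
my_should_sleep() are hypothetical stand-ins for the caller's held-lock
tracking and cycle detector; only six_lock_ip_waiter(), struct six_lock_waiter
and the should_sleep_fn signature come from the interface documented above.

	/* Hypothetical sketch: the waiter is embedded in the caller's
	 * held-locks structure, and the callback runs the caller's own
	 * cycle detector immediately before each sleep. */
	struct my_trans {
		struct six_lock_waiter	wait;
		/* ... caller's tracking of held locks ... */
	};

	/* hypothetical cycle detector, implemented by the caller: */
	bool my_cycle_check(struct my_trans *trans, struct six_lock *lock);

	static int my_should_sleep(struct six_lock *lock, void *p)
	{
		struct my_trans *trans = p;

		/* returning nonzero aborts the lock attempt instead of sleeping: */
		return my_cycle_check(trans, lock) ? -EDEADLK : 0;
	}

	static int my_lock_intent(struct my_trans *trans, struct six_lock *lock)
	{
		return six_lock_ip_waiter(lock, SIX_LOCK_intent, &trans->wait,
					  my_should_sleep, trans, _THIS_IP_);
	}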
-
-__always_inline
-static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type)
-{
- u32 state;
-
- if (type == SIX_LOCK_intent)
- lock->owner = NULL;
-
- if (type == SIX_LOCK_read &&
- lock->readers) {
- smp_mb(); /* unlock barrier */
- this_cpu_dec(*lock->readers);
- smp_mb(); /* between unlocking and checking for waiters */
- state = atomic_read(&lock->state);
- } else {
- u32 v = l[type].lock_val;
-
- if (type != SIX_LOCK_read)
- v += atomic_read(&lock->state) & SIX_LOCK_NOSPIN;
-
- EBUG_ON(!(atomic_read(&lock->state) & l[type].held_mask));
- state = atomic_sub_return_release(v, &lock->state);
- }
-
- six_lock_wakeup(lock, state, l[type].unlock_wakeup);
-}
-
-/**
- * six_unlock_ip - drop a six lock
- * @lock: lock to unlock
- * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
- *
- * When a lock is held multiple times (because six_lock_increment() was used),
- * this decrements the 'lock held' counter by one.
- *
- * For example:
- * six_lock_read(&foo->lock); read count 1
- * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2
- *   six_unlock_type(&foo->lock, SIX_LOCK_read);	read count 1
- *   six_unlock_type(&foo->lock, SIX_LOCK_read);	read count 0
- */
-void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip)
-{
- EBUG_ON(type == SIX_LOCK_write &&
- !(atomic_read(&lock->state) & SIX_LOCK_HELD_intent));
- EBUG_ON((type == SIX_LOCK_write ||
- type == SIX_LOCK_intent) &&
- lock->owner != current);
-
- if (type != SIX_LOCK_write)
- six_release(&lock->dep_map, ip);
- else
- lock->seq++;
-
- if (type == SIX_LOCK_intent &&
- lock->intent_lock_recurse) {
- --lock->intent_lock_recurse;
- return;
- }
-
- do_six_unlock_type(lock, type);
-}
-EXPORT_SYMBOL_GPL(six_unlock_ip);
-
-/**
- * six_lock_downgrade - convert an intent lock to a read lock
- * @lock: lock to downgrade
- *
- * @lock will have read count incremented and intent count decremented
- */
-void six_lock_downgrade(struct six_lock *lock)
-{
- six_lock_increment(lock, SIX_LOCK_read);
- six_unlock_intent(lock);
-}
-EXPORT_SYMBOL_GPL(six_lock_downgrade);
-
-/**
- * six_lock_tryupgrade - attempt to convert read lock to an intent lock
- * @lock: lock to upgrade
- *
- * On success, @lock will have intent count incremented and read count
- * decremented
- *
- * Return: true on success, false on failure
- */
-bool six_lock_tryupgrade(struct six_lock *lock)
-{
- u32 old = atomic_read(&lock->state), new;
-
- do {
- new = old;
-
- if (new & SIX_LOCK_HELD_intent)
- return false;
-
- if (!lock->readers) {
- EBUG_ON(!(new & SIX_LOCK_HELD_read));
- new -= l[SIX_LOCK_read].lock_val;
- }
-
- new |= SIX_LOCK_HELD_intent;
- } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, new));
-
- if (lock->readers)
- this_cpu_dec(*lock->readers);
-
- six_set_owner(lock, SIX_LOCK_intent, old, current);
-
- return true;
-}
-EXPORT_SYMBOL_GPL(six_lock_tryupgrade);
-
-/**
- * six_trylock_convert - attempt to convert a held lock from one type to another
- * @lock: lock to convert
- * @from: SIX_LOCK_read or SIX_LOCK_intent
- * @to: SIX_LOCK_read or SIX_LOCK_intent
- *
- * On success, @lock will be held as @to, and will no longer be held as @from
- *
- * Return: true on success, false on failure
- */
-bool six_trylock_convert(struct six_lock *lock,
- enum six_lock_type from,
- enum six_lock_type to)
-{
- EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write);
-
- if (to == from)
- return true;
-
- if (to == SIX_LOCK_read) {
- six_lock_downgrade(lock);
- return true;
- } else {
- return six_lock_tryupgrade(lock);
- }
-}
-EXPORT_SYMBOL_GPL(six_trylock_convert);
-
-/**
- * six_lock_increment - increase held lock count on a lock that is already held
- * @lock: lock to increment
- * @type: SIX_LOCK_read or SIX_LOCK_intent
- *
- * @lock must already be held, with a lock type that is greater than or equal to
- * @type
- *
- * A corresponding six_unlock_type() call will be required for @lock to be fully
- * unlocked.
- */
-void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
-{
- six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, _RET_IP_);
-
- /* XXX: assert already locked, and that we don't overflow: */
-
- switch (type) {
- case SIX_LOCK_read:
- if (lock->readers) {
- this_cpu_inc(*lock->readers);
- } else {
- EBUG_ON(!(atomic_read(&lock->state) &
- (SIX_LOCK_HELD_read|
- SIX_LOCK_HELD_intent)));
- atomic_add(l[type].lock_val, &lock->state);
- }
- break;
- case SIX_LOCK_intent:
- EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent));
- lock->intent_lock_recurse++;
- break;
- case SIX_LOCK_write:
- BUG();
- break;
- }
-}
-EXPORT_SYMBOL_GPL(six_lock_increment);
-
-/**
- * six_lock_wakeup_all - wake up all waiters on @lock
- * @lock: lock to wake up waiters for
- *
- * Waking up waiters will cause them to re-run should_sleep_fn, which may then
- * abort the lock operation.
- *
- * This function is never needed in a bug-free program; it's only useful in
- * debug code, e.g. to determine if a cycle detector is at fault.
- */
-void six_lock_wakeup_all(struct six_lock *lock)
-{
- u32 state = atomic_read(&lock->state);
- struct six_lock_waiter *w;
-
- six_lock_wakeup(lock, state, SIX_LOCK_read);
- six_lock_wakeup(lock, state, SIX_LOCK_intent);
- six_lock_wakeup(lock, state, SIX_LOCK_write);
-
- raw_spin_lock(&lock->wait_lock);
- list_for_each_entry(w, &lock->wait_list, list)
- wake_up_process(w->task);
- raw_spin_unlock(&lock->wait_lock);
-}
-EXPORT_SYMBOL_GPL(six_lock_wakeup_all);
-
-/**
- * six_lock_counts - return held lock counts, for each lock type
- * @lock: lock to return counters for
- *
- * Return: the number of times a lock is held for read, intent and write.
- */
-struct six_lock_count six_lock_counts(struct six_lock *lock)
-{
- struct six_lock_count ret;
-
- ret.n[SIX_LOCK_read] = !lock->readers
- ? atomic_read(&lock->state) & SIX_LOCK_HELD_read
- : pcpu_read_count(lock);
- ret.n[SIX_LOCK_intent] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent) +
- lock->intent_lock_recurse;
- ret.n[SIX_LOCK_write] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_write);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(six_lock_counts);
-
-/**
- * six_lock_readers_add - directly manipulate reader count of a lock
- * @lock: lock to add/subtract readers for
- * @nr: reader count to add/subtract
- *
- * When an upper layer is implementing lock reentrency, we may have both read
- * and intent locks on the same lock.
- *
- * When we need to take a write lock, the read locks will cause self-deadlock,
- * because six locks themselves do not track which read locks are held by the
- * current thread and which are held by a different thread - they do no
- * per-thread tracking of held locks.
- *
- * The upper layer that is tracking held locks may, however, if trylock() has
- * failed, count up its own read locks, subtract them, take the write lock, and
- * then re-add them.
- *
- * As in any other situation when taking a write lock, @lock must be held for
- * intent one (or more) times, so @lock will never be left unlocked.
- */
-void six_lock_readers_add(struct six_lock *lock, int nr)
-{
- if (lock->readers) {
- this_cpu_add(*lock->readers, nr);
- } else {
- EBUG_ON((int) (atomic_read(&lock->state) & SIX_LOCK_HELD_read) + nr < 0);
- /* reader count starts at bit 0 */
- atomic_add(nr, &lock->state);
- }
-}
-EXPORT_SYMBOL_GPL(six_lock_readers_add);
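For illustration, a sketch of the subtract/relock/re-add pattern just
described; nr_my_read_locks() is a hypothetical helper standing in for the
upper layer's count of its own read holds, and six_trylock_write() and
six_lock_write() are assumed to be the generated per-type wrappers (with a
NULL should_sleep_fn passed to the latter).

	/* Hypothetical sketch: the intent lock must already be held, as
	 * noted above, so the lock is never left entirely unlocked. */
	int nr_my_read_locks(struct my_trans *trans, struct six_lock *lock);

	static void my_write_lock(struct my_trans *trans, struct six_lock *lock)
	{
		int nr;

		if (six_trylock_write(lock))
			return;

		nr = nr_my_read_locks(trans, lock);	/* our own read holds */
		six_lock_readers_add(lock, -nr);	/* hide them from the count */
		six_lock_write(lock, NULL, NULL);	/* no self-deadlock now */
		six_lock_readers_add(lock, nr);		/* restore the count */
	}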
-
-/**
- * six_lock_exit - release resources held by a lock prior to freeing
- * @lock: lock to exit
- *
- * When a lock was initialized in percpu mode (SIX_LOCK_INIT_PCPU), this is
- * required to free the percpu read counts.
- */
-void six_lock_exit(struct six_lock *lock)
-{
- WARN_ON(lock->readers && pcpu_read_count(lock));
- WARN_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_read);
-
- free_percpu(lock->readers);
- lock->readers = NULL;
-}
-EXPORT_SYMBOL_GPL(six_lock_exit);
-
-void __six_lock_init(struct six_lock *lock, const char *name,
- struct lock_class_key *key, enum six_lock_init_flags flags)
-{
- atomic_set(&lock->state, 0);
- raw_spin_lock_init(&lock->wait_lock);
- INIT_LIST_HEAD(&lock->wait_list);
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- debug_check_no_locks_freed((void *) lock, sizeof(*lock));
- lockdep_init_map(&lock->dep_map, name, key, 0);
-#endif
-
- /*
- * Don't assume that we have real percpu variables available in
- * userspace:
- */
-#ifdef __KERNEL__
- if (flags & SIX_LOCK_INIT_PCPU) {
- /*
- * We don't return an error here on memory allocation failure
- * since percpu is an optimization, and locks will work with the
- * same semantics in non-percpu mode: callers can check for
- * failure if they wish by checking lock->readers, but generally
- * will not want to treat it as an error.
- */
- lock->readers = alloc_percpu(unsigned);
- }
-#endif
-}
-EXPORT_SYMBOL_GPL(__six_lock_init);
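A brief sketch of the setup/teardown pairing these comments imply, assuming a
six_lock_init() wrapper macro that supplies the lockdep class key and forwards
flags to __six_lock_init():

	/* Hypothetical sketch: in percpu mode the read counts live in
	 * lock->readers, so six_lock_exit() must run before the lock's
	 * memory is freed. */
	struct my_obj {
		struct six_lock	lock;
	};

	static void my_obj_init(struct my_obj *obj)
	{
		six_lock_init(&obj->lock, SIX_LOCK_INIT_PCPU);
	}

	static void my_obj_exit(struct my_obj *obj)
	{
		six_lock_exit(&obj->lock);	/* frees lock->readers, if allocated */
	}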
+++ /dev/null
-#!/bin/sh
-
-SDIR="$(readlink -f "$0")"
-exec "${SDIR%/*}/bcachefs" format "$@"
+++ /dev/null
-#!/bin/sh
-
-SDIR="$(readlink -f "$0")"
-exec "${SDIR%/*}/bcachefs" mount "$@"
rm -f %{buildroot}/usr/lib/libbcachefs.so
%files
-%{_sbindir}/mount.bcachefs
%{_sbindir}/bcachefs
+%{_sbindir}/mount.bcachefs
%{_sbindir}/fsck.bcachefs
%{_sbindir}/mkfs.bcachefs
+%{_sbindir}/mount.fuse.bcachefs
+%{_sbindir}/fsck.fuse.bcachefs
+%{_sbindir}/mkfs.fuse.bcachefs
%{_mandir}/man8/bcachefs.8.gz
%changelog
void qcow2_write_image(int infd, int outfd, ranges *data,
unsigned block_size)
{
- u64 image_size = get_size(NULL, infd);
+ u64 image_size = get_size(infd);
unsigned l2_size = block_size / sizeof(u64);
unsigned l1_size = DIV_ROUND_UP(image_size, (u64) block_size * l2_size);
struct qcow2_hdr hdr = { 0 };
[[package]]
name = "aho-corasick"
-version = "0.7.20"
+version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac"
+checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"
dependencies = [
"memchr",
]
[[package]]
-name = "android_system_properties"
-version = "0.1.5"
+name = "anstream"
+version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
+checksum = "0ca84f3628370c59db74ee214b3263d58f9aadd9b4fe7e711fd87dc452b7f163"
dependencies = [
- "libc",
+ "anstyle",
+ "anstyle-parse",
+ "anstyle-query",
+ "anstyle-wincon",
+ "colorchoice",
+ "is-terminal",
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "15c4c2c83f81532e5845a733998b6971faca23490340a418e9b72a3ec9de12ea"
+
+[[package]]
+name = "anstyle-parse"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "938874ff5980b03a87c5524b3ae5b59cf99b1d6bc836848df7bc5ada9643c333"
+dependencies = [
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle-query"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b"
+dependencies = [
+ "windows-sys",
+]
+
+[[package]]
+name = "anstyle-wincon"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c677ab05e09154296dd37acecd46420c17b9713e8366facafa8fc0885167cf4c"
+dependencies = [
+ "anstyle",
+ "windows-sys",
]
[[package]]
name = "anyhow"
-version = "1.0.68"
+version = "1.0.75"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2cb2f989d18dd141ab8ae82f64d1a8cdd37e0840f73a406896cf5e99502fab61"
+checksum = "a4668cab20f66d8d020e1fbc0ebe47217433c1b6c8f2040faf858554e394ace6"
[[package]]
name = "atty"
"byteorder",
"chrono",
"clap",
+ "clap_complete",
"colored",
"either",
- "errno",
+ "errno 0.2.8",
"gag",
"getset",
"itertools",
"anyhow",
"bindgen",
"bitfield",
- "bitflags",
+ "bitflags 1.3.2",
"byteorder",
"chrono",
"colored",
version = "0.64.0"
source = "git+https://evilpiepirate.org/git/rust-bindgen.git#f773267b090bf16b9e8375fcbdcd8ba5e88806a8"
dependencies = [
- "bitflags",
+ "bitflags 1.3.2",
"cexpr",
"clang-sys",
"lazy_static",
"regex",
"rustc-hash",
"shlex",
- "syn",
+ "syn 1.0.109",
]
[[package]]
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]]
-name = "bumpalo"
-version = "3.12.0"
+name = "bitflags"
+version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535"
+checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
[[package]]
name = "byteorder"
-version = "1.4.3"
+version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
+checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
[[package]]
name = "cc"
-version = "1.0.79"
+version = "1.0.83"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
+checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0"
+dependencies = [
+ "libc",
+]
[[package]]
name = "cexpr"
[[package]]
name = "chrono"
-version = "0.4.23"
+version = "0.4.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "16b0a3d9ed01224b22057780a37bb8c5dbfe1be8ba48678e7bf57ec4b385411f"
+checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38"
dependencies = [
- "iana-time-zone",
- "js-sys",
- "num-integer",
"num-traits",
- "time",
- "wasm-bindgen",
- "winapi",
]
[[package]]
name = "clang-sys"
-version = "1.6.0"
+version = "1.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "77ed9a53e5d4d9c573ae844bfac6872b159cb1d1585a83b29e7a64b7eef7332a"
+checksum = "c688fc74432808e3eb684cae8830a86be1d66a2bd58e1f248ed0960a590baf6f"
dependencies = [
"glob",
"libc",
[[package]]
name = "clap"
-version = "4.1.4"
+version = "4.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f13b9c79b5d1dd500d20ef541215a6423c75829ef43117e1b4d17fd8af0b5d76"
+checksum = "fb690e81c7840c0d7aade59f242ea3b41b9bc27bcd5997890e7702ae4b32e487"
dependencies = [
- "bitflags",
+ "clap_builder",
"clap_derive",
- "clap_lex",
- "is-terminal",
"once_cell",
- "strsim",
- "termcolor",
- "terminal_size",
]
[[package]]
-name = "clap_derive"
-version = "4.1.0"
+name = "clap_builder"
+version = "4.3.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "684a277d672e91966334af371f1a7b5833f9aa00b07c84e92fbce95e00208ce8"
+checksum = "5ed2e96bc16d8d740f6f48d663eddf4b8a0983e79210fd55479b7bcd0a69860e"
dependencies = [
- "heck",
- "proc-macro-error",
- "proc-macro2",
- "quote",
- "syn",
-]
-
-[[package]]
-name = "clap_lex"
-version = "0.3.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "783fe232adfca04f90f56201b26d79682d4cd2625e0bc7290b95123afe558ade"
-dependencies = [
- "os_str_bytes",
-]
-
-[[package]]
-name = "codespan-reporting"
-version = "0.11.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3538270d33cc669650c4b093848450d380def10c331d38c768e34cac80576e6e"
-dependencies = [
- "termcolor",
- "unicode-width",
+ "anstream",
+ "anstyle",
+ "clap_lex",
+ "strsim",
+ "terminal_size",
]
[[package]]
-name = "colored"
-version = "2.0.0"
+name = "clap_complete"
+version = "4.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b3616f750b84d8f0de8a58bda93e08e2a81ad3f523089b05f1dffecab48c6cbd"
+checksum = "bffe91f06a11b4b9420f62103854e90867812cd5d01557f853c5ee8e791b12ae"
dependencies = [
- "atty",
- "lazy_static",
- "winapi",
+ "clap",
]
[[package]]
-name = "core-foundation-sys"
-version = "0.8.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc"
-
-[[package]]
-name = "cxx"
-version = "1.0.89"
+name = "clap_derive"
+version = "4.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bc831ee6a32dd495436e317595e639a587aa9907bef96fe6e6abc290ab6204e9"
+checksum = "54a9bb5758fc5dfe728d1019941681eccaf0cf8a4189b692a0ee2f2ecf90a050"
dependencies = [
- "cc",
- "cxxbridge-flags",
- "cxxbridge-macro",
- "link-cplusplus",
+ "heck",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.38",
]
[[package]]
-name = "cxx-build"
-version = "1.0.89"
+name = "clap_lex"
+version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "94331d54f1b1a8895cd81049f7eaaaef9d05a7dcb4d1fd08bf3ff0806246789d"
-dependencies = [
- "cc",
- "codespan-reporting",
- "once_cell",
- "proc-macro2",
- "quote",
- "scratch",
- "syn",
-]
+checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b"
[[package]]
-name = "cxxbridge-flags"
-version = "1.0.89"
+name = "colorchoice"
+version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "48dcd35ba14ca9b40d6e4b4b39961f23d835dbb8eed74565ded361d93e1feb8a"
+checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7"
[[package]]
-name = "cxxbridge-macro"
-version = "1.0.89"
+name = "colored"
+version = "2.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "81bbeb29798b407ccd82a3324ade1a7286e0d29851475990b612670f6f5124d2"
+checksum = "2674ec482fbc38012cf31e6c42ba0177b431a0cb6f15fe40efa5aab1bda516f6"
dependencies = [
- "proc-macro2",
- "quote",
- "syn",
+ "is-terminal",
+ "lazy_static",
+ "windows-sys",
]
[[package]]
name = "either"
-version = "1.8.1"
+version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91"
+checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07"
[[package]]
name = "errno"
"winapi",
]
+[[package]]
+name = "errno"
+version = "0.3.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ac3e13f66a2f95e32a39eaa81f6b95d42878ca0e1db0c7543723dfe12557e860"
+dependencies = [
+ "libc",
+ "windows-sys",
+]
+
[[package]]
name = "errno-dragonfly"
version = "0.1.2"
[[package]]
name = "fastrand"
-version = "1.8.0"
+version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a7a407cfaa3385c4ae6b23e84623d48c2798d06e3e6a1878f7f59f17b3f86499"
-dependencies = [
- "instant",
-]
+checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5"
[[package]]
name = "filedescriptor"
"proc-macro-error",
"proc-macro2",
"quote",
- "syn",
+ "syn 1.0.109",
]
[[package]]
[[package]]
name = "hermit-abi"
-version = "0.2.6"
+version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7"
-dependencies = [
- "libc",
-]
-
-[[package]]
-name = "iana-time-zone"
-version = "0.1.53"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "64c122667b287044802d6ce17ee2ddf13207ed924c712de9a66a5814d5b64765"
-dependencies = [
- "android_system_properties",
- "core-foundation-sys",
- "iana-time-zone-haiku",
- "js-sys",
- "wasm-bindgen",
- "winapi",
-]
-
-[[package]]
-name = "iana-time-zone-haiku"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0703ae284fc167426161c2e3f1da3ea71d94b21bedbcc9494e92b28e334e3dca"
-dependencies = [
- "cxx",
- "cxx-build",
-]
-
-[[package]]
-name = "instant"
-version = "0.1.12"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c"
-dependencies = [
- "cfg-if",
-]
+checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7"
[[package]]
name = "io-lifetimes"
-version = "1.0.4"
+version = "1.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e7d6c6f8c91b4b9ed43484ad1a938e393caf35960fce7f82a040497207bd8e9e"
+checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2"
dependencies = [
+ "hermit-abi 0.3.3",
"libc",
"windows-sys",
]
[[package]]
name = "is-terminal"
-version = "0.4.2"
+version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "28dfb6c8100ccc63462345b67d1bbc3679177c75ee4bf59bf29c8b1d110b8189"
+checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b"
dependencies = [
- "hermit-abi 0.2.6",
- "io-lifetimes",
- "rustix",
+ "hermit-abi 0.3.3",
+ "rustix 0.38.21",
"windows-sys",
]
"either",
]
-[[package]]
-name = "js-sys"
-version = "0.3.61"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "445dde2150c55e483f3d8416706b97ec8e8237c307e5b7b4b8dd15e6af2a0730"
-dependencies = [
- "wasm-bindgen",
-]
-
[[package]]
name = "lazy_static"
version = "1.4.0"
[[package]]
name = "libc"
-version = "0.2.139"
+version = "0.2.149"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79"
+checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b"
[[package]]
name = "libudev-sys"
]
[[package]]
-name = "link-cplusplus"
-version = "1.0.8"
+name = "linux-raw-sys"
+version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ecd207c9c713c34f95a097a5b029ac2ce6010530c7b49d7fea24d977dede04f5"
-dependencies = [
- "cc",
-]
+checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"
[[package]]
name = "linux-raw-sys"
-version = "0.1.4"
+version = "0.4.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4"
+checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f"
[[package]]
name = "log"
-version = "0.4.17"
+version = "0.4.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e"
-dependencies = [
- "cfg-if",
-]
+checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
[[package]]
name = "memchr"
-version = "2.5.0"
+version = "2.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
+checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167"
[[package]]
name = "memoffset"
"minimal-lexical",
]
-[[package]]
-name = "num-integer"
-version = "0.1.45"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9"
-dependencies = [
- "autocfg",
- "num-traits",
-]
-
[[package]]
name = "num-traits"
-version = "0.2.15"
+version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd"
+checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c"
dependencies = [
"autocfg",
]
[[package]]
name = "once_cell"
-version = "1.17.0"
+version = "1.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6f61fba1741ea2b3d6a1e3178721804bb716a68a6aeba1149b5d52e3d464ea66"
-
-[[package]]
-name = "os_str_bytes"
-version = "6.4.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9b7820b9daea5457c9f21c69448905d723fbd21136ccf521748f23fd49e723ee"
+checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d"
[[package]]
name = "parse-display"
"proc-macro2",
"quote",
"regex",
- "regex-syntax",
- "syn",
+ "regex-syntax 0.6.29",
+ "syn 1.0.109",
]
[[package]]
name = "paste"
-version = "1.0.11"
+version = "1.0.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d01a5bd0424d00070b0098dd17ebca6f961a959dead1dbcbbbc1d1cd8d3deeba"
+checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c"
[[package]]
name = "peeking_take_while"
[[package]]
name = "pkg-config"
-version = "0.3.26"
+version = "0.3.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160"
+checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964"
[[package]]
name = "proc-macro-error"
"proc-macro-error-attr",
"proc-macro2",
"quote",
- "syn",
+ "syn 1.0.109",
"version_check",
]
[[package]]
name = "proc-macro2"
-version = "1.0.50"
+version = "1.0.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6ef7d57beacfaf2d8aee5937dab7b7f28de3cb8b1828479bb5de2a7106f2bae2"
+checksum = "134c189feb4956b20f6f547d2cf727d4c0fe06722b20a0eec87ed445a97f92da"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
-version = "1.0.23"
+version = "1.0.33"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b"
+checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae"
dependencies = [
"proc-macro2",
]
[[package]]
name = "redox_syscall"
-version = "0.2.16"
+version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a"
+checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa"
dependencies = [
- "bitflags",
+ "bitflags 1.3.2",
]
[[package]]
name = "regex"
-version = "1.7.1"
+version = "1.10.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-automata",
+ "regex-syntax 0.8.2",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "48aaa5748ba571fb95cd2c85c09f629215d3a6ece942baa100950af03a34f733"
+checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f"
dependencies = [
"aho-corasick",
"memchr",
- "regex-syntax",
+ "regex-syntax 0.8.2",
]
[[package]]
name = "regex-syntax"
-version = "0.6.28"
+version = "0.6.29"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848"
+checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
[[package]]
-name = "remove_dir_all"
-version = "0.5.3"
+name = "regex-syntax"
+version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7"
-dependencies = [
- "winapi",
-]
+checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
[[package]]
name = "rpassword"
[[package]]
name = "rustix"
-version = "0.36.7"
+version = "0.37.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d4fdebc4b395b7fbb9ab11e462e20ed9051e7b16e42d24042c776eca0ac81b03"
+checksum = "fea8ca367a3a01fe35e6943c400addf443c0f57670e6ec51196f71a4b8762dd2"
dependencies = [
- "bitflags",
- "errno",
+ "bitflags 1.3.2",
+ "errno 0.3.5",
"io-lifetimes",
"libc",
- "linux-raw-sys",
+ "linux-raw-sys 0.3.8",
"windows-sys",
]
[[package]]
-name = "scratch"
-version = "1.0.3"
+name = "rustix"
+version = "0.38.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ddccb15bcce173023b3fedd9436f882a0739b8dfb45e4f6b6002bee5929f61b2"
+checksum = "2b426b0506e5d50a7d8dafcf2e81471400deb602392c7dd110815afb4eaf02a3"
+dependencies = [
+ "bitflags 2.4.1",
+ "errno 0.3.5",
+ "libc",
+ "linux-raw-sys 0.4.10",
+ "windows-sys",
+]
[[package]]
name = "shlex"
-version = "1.1.0"
+version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3"
+checksum = "a7cee0529a6d40f580e7a5e6c495c8fbfe21b7b52795ed4bb5e62cdf92bc6380"
[[package]]
name = "strsim"
[[package]]
name = "syn"
-version = "1.0.107"
+version = "1.0.109"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1f4064b5b16e03ae50984a5a8ed5d4f8803e6bc1fd170a3cda91a1be4b18e3f5"
+checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
dependencies = [
"proc-macro2",
"quote",
]
[[package]]
-name = "tempfile"
-version = "3.3.0"
+name = "syn"
+version = "2.0.38"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5cdb1ef4eaeeaddc8fbd371e5017057064af0911902ef36b39801f67cc6d79e4"
+checksum = "e96b79aaa137db8f61e26363a0c9b47d8b4ec75da28b7d1d614c2303e232408b"
dependencies = [
- "cfg-if",
- "fastrand",
- "libc",
- "redox_syscall",
- "remove_dir_all",
- "winapi",
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
]
[[package]]
-name = "termcolor"
-version = "1.2.0"
+name = "tempfile"
+version = "3.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6"
+checksum = "7ef1adac450ad7f4b3c28589471ade84f25f731a7a0fe30d71dfa9f60fd808e5"
dependencies = [
- "winapi-util",
+ "cfg-if",
+ "fastrand",
+ "redox_syscall",
+ "rustix 0.38.21",
+ "windows-sys",
]
[[package]]
name = "terminal_size"
-version = "0.2.3"
+version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cb20089a8ba2b69debd491f8d2d023761cbf196e999218c591fa1e7e15a21907"
+checksum = "8e6bf6f19e9f8ed8d4048dc22981458ebcf406d67e94cd422e5ecd73d63b3237"
dependencies = [
- "rustix",
+ "rustix 0.37.27",
"windows-sys",
]
[[package]]
name = "thiserror"
-version = "1.0.38"
+version = "1.0.50"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6a9cd18aa97d5c45c6603caea1da6628790b37f7a34b6ca89522331c5180fed0"
+checksum = "f9a7210f5c9a7156bb50aa36aed4c95afb51df0df00713949448cf9e97d382d2"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
-version = "1.0.38"
+version = "1.0.50"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1fb327af4685e4d03fa8cbcf1716380da910eeb2bb8be417e7f9fd3fb164f36f"
+checksum = "266b2e40bc00e5a6c09c3584011e08b06f123c00362c92b975ba9843aaaa14b8"
dependencies = [
"proc-macro2",
"quote",
- "syn",
-]
-
-[[package]]
-name = "time"
-version = "0.1.45"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a"
-dependencies = [
- "libc",
- "wasi",
- "winapi",
+ "syn 2.0.38",
]
[[package]]
[[package]]
name = "unicode-ident"
-version = "1.0.6"
+version = "1.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc"
+checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
[[package]]
-name = "unicode-width"
-version = "0.1.10"
+name = "utf8parse"
+version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b"
+checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a"
[[package]]
name = "uuid"
-version = "1.3.0"
+version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1674845326ee10d37ca60470760d4288a6f80f304007d92e5c53bab78c9cfd79"
+checksum = "88ad59a7560b41a70d191093a945f0b87bc1deeda46fb237479708a1d6b6cdfc"
[[package]]
name = "version_check"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
-[[package]]
-name = "wasi"
-version = "0.10.0+wasi-snapshot-preview1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f"
-
-[[package]]
-name = "wasm-bindgen"
-version = "0.2.84"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "31f8dcbc21f30d9b8f2ea926ecb58f6b91192c17e9d33594b3df58b2007ca53b"
-dependencies = [
- "cfg-if",
- "wasm-bindgen-macro",
-]
-
-[[package]]
-name = "wasm-bindgen-backend"
-version = "0.2.84"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "95ce90fd5bcc06af55a641a86428ee4229e44e07033963a2290a8e241607ccb9"
-dependencies = [
- "bumpalo",
- "log",
- "once_cell",
- "proc-macro2",
- "quote",
- "syn",
- "wasm-bindgen-shared",
-]
-
-[[package]]
-name = "wasm-bindgen-macro"
-version = "0.2.84"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4c21f77c0bedc37fd5dc21f897894a5ca01e7bb159884559461862ae90c0b4c5"
-dependencies = [
- "quote",
- "wasm-bindgen-macro-support",
-]
-
-[[package]]
-name = "wasm-bindgen-macro-support"
-version = "0.2.84"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2aff81306fcac3c7515ad4e177f521b5c9a15f2b08f4e32d823066102f35a5f6"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
- "wasm-bindgen-backend",
- "wasm-bindgen-shared",
-]
-
-[[package]]
-name = "wasm-bindgen-shared"
-version = "0.2.84"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0046fef7e28c3804e5e38bfa31ea2a0f73905319b677e57ebe37e49358989b5d"
-
[[package]]
name = "winapi"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
-[[package]]
-name = "winapi-util"
-version = "0.1.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178"
-dependencies = [
- "winapi",
-]
-
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
[[package]]
name = "windows-sys"
-version = "0.42.0"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
+dependencies = [
+ "windows-targets",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7"
+checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
[[package]]
name = "windows_aarch64_gnullvm"
-version = "0.42.1"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608"
+checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
[[package]]
name = "windows_aarch64_msvc"
-version = "0.42.1"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7"
+checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
[[package]]
name = "windows_i686_gnu"
-version = "0.42.1"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640"
+checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
[[package]]
name = "windows_i686_msvc"
-version = "0.42.1"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605"
+checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
[[package]]
name = "windows_x86_64_gnu"
-version = "0.42.1"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45"
+checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
[[package]]
name = "windows_x86_64_gnullvm"
-version = "0.42.1"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463"
+checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
[[package]]
name = "windows_x86_64_msvc"
-version = "0.42.1"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd"
+checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
version = "0.3.1"
authors = ["Yuxuan Shui <yshuiv7@gmail.com>", "Kayla Firestack <dev@kaylafire.me>"]
edition = "2018"
+rust-version = "1.65"
[lib]
crate-type = ["staticlib"]
[dependencies]
atty = "0.2.14"
log = { version = "0.4", features = ["std"] }
-chrono = "0.4"
+chrono = { version = "0.4", default-features = false }
colored = "2"
clap = { version = "4.0.32", features = ["derive", "wrap_help"] }
+clap_complete = "4.4.4"
anyhow = "1.0"
libc = "0.2.69"
udev = "0.7.0"
version = 3
[[package]]
-name = "android_system_properties"
-version = "0.1.5"
+name = "aho-corasick"
+version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
+checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"
dependencies = [
- "libc",
+ "memchr",
]
[[package]]
name = "anyhow"
-version = "1.0.69"
+version = "1.0.75"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "224afbd727c3d6e4b90103ece64b8d1b67fbb1973b1046c2281eed3f3803f800"
-
-[[package]]
-name = "atty"
-version = "0.2.14"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
-dependencies = [
- "hermit-abi",
- "libc",
- "winapi",
-]
+checksum = "a4668cab20f66d8d020e1fbc0ebe47217433c1b6c8f2040faf858554e394ace6"
[[package]]
name = "autocfg"
"anyhow",
"bindgen",
"bitfield",
- "bitflags",
+ "bitflags 1.3.2",
"byteorder",
"chrono",
"colored",
version = "0.64.0"
source = "git+https://evilpiepirate.org/git/rust-bindgen.git#f773267b090bf16b9e8375fcbdcd8ba5e88806a8"
dependencies = [
- "bitflags",
+ "bitflags 1.3.2",
"cexpr",
"clang-sys",
"lazy_static",
"regex",
"rustc-hash",
"shlex",
- "syn",
+ "syn 1.0.109",
]
[[package]]
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]]
-name = "bumpalo"
-version = "3.12.0"
+name = "bitflags"
+version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535"
+checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
[[package]]
name = "byteorder"
-version = "1.4.3"
+version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
-
-[[package]]
-name = "cc"
-version = "1.0.79"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
+checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
[[package]]
name = "cexpr"
[[package]]
name = "chrono"
-version = "0.4.23"
+version = "0.4.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "16b0a3d9ed01224b22057780a37bb8c5dbfe1be8ba48678e7bf57ec4b385411f"
+checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38"
dependencies = [
- "iana-time-zone",
- "js-sys",
- "num-integer",
"num-traits",
- "time",
- "wasm-bindgen",
- "winapi",
]
[[package]]
name = "clang-sys"
-version = "1.6.0"
+version = "1.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "77ed9a53e5d4d9c573ae844bfac6872b159cb1d1585a83b29e7a64b7eef7332a"
+checksum = "c688fc74432808e3eb684cae8830a86be1d66a2bd58e1f248ed0960a590baf6f"
dependencies = [
"glob",
"libc",
]
-[[package]]
-name = "codespan-reporting"
-version = "0.11.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3538270d33cc669650c4b093848450d380def10c331d38c768e34cac80576e6e"
-dependencies = [
- "termcolor",
- "unicode-width",
-]
-
[[package]]
name = "colored"
-version = "2.0.0"
+version = "2.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b3616f750b84d8f0de8a58bda93e08e2a81ad3f523089b05f1dffecab48c6cbd"
+checksum = "2674ec482fbc38012cf31e6c42ba0177b431a0cb6f15fe40efa5aab1bda516f6"
dependencies = [
- "atty",
+ "is-terminal",
"lazy_static",
- "winapi",
-]
-
-[[package]]
-name = "core-foundation-sys"
-version = "0.8.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc"
-
-[[package]]
-name = "cxx"
-version = "1.0.91"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "86d3488e7665a7a483b57e25bdd90d0aeb2bc7608c8d0346acf2ad3f1caf1d62"
-dependencies = [
- "cc",
- "cxxbridge-flags",
- "cxxbridge-macro",
- "link-cplusplus",
-]
-
-[[package]]
-name = "cxx-build"
-version = "1.0.91"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "48fcaf066a053a41a81dfb14d57d99738b767febb8b735c3016e469fac5da690"
-dependencies = [
- "cc",
- "codespan-reporting",
- "once_cell",
- "proc-macro2",
- "quote",
- "scratch",
- "syn",
-]
-
-[[package]]
-name = "cxxbridge-flags"
-version = "1.0.91"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a2ef98b8b717a829ca5603af80e1f9e2e48013ab227b68ef37872ef84ee479bf"
-
-[[package]]
-name = "cxxbridge-macro"
-version = "1.0.91"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "086c685979a698443656e5cf7856c95c642295a38599f12fb1ff76fb28d19892"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
+ "windows-sys",
]
[[package]]
name = "errno"
-version = "0.2.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1"
-dependencies = [
- "errno-dragonfly",
- "libc",
- "winapi",
-]
-
-[[package]]
-name = "errno-dragonfly"
-version = "0.1.2"
+version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf"
+checksum = "ac3e13f66a2f95e32a39eaa81f6b95d42878ca0e1db0c7543723dfe12557e860"
dependencies = [
- "cc",
"libc",
+ "windows-sys",
]
[[package]]
name = "fastrand"
-version = "1.9.0"
+version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be"
-dependencies = [
- "instant",
-]
+checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5"
[[package]]
name = "filedescriptor"
[[package]]
name = "hermit-abi"
-version = "0.1.19"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33"
-dependencies = [
- "libc",
-]
-
-[[package]]
-name = "iana-time-zone"
-version = "0.1.53"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "64c122667b287044802d6ce17ee2ddf13207ed924c712de9a66a5814d5b64765"
-dependencies = [
- "android_system_properties",
- "core-foundation-sys",
- "iana-time-zone-haiku",
- "js-sys",
- "wasm-bindgen",
- "winapi",
-]
-
-[[package]]
-name = "iana-time-zone-haiku"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0703ae284fc167426161c2e3f1da3ea71d94b21bedbcc9494e92b28e334e3dca"
-dependencies = [
- "cxx",
- "cxx-build",
-]
-
-[[package]]
-name = "instant"
-version = "0.1.12"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c"
-dependencies = [
- "cfg-if",
-]
-
-[[package]]
-name = "io-lifetimes"
-version = "1.0.5"
+version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1abeb7a0dd0f8181267ff8adc397075586500b81b28a73e8a0208b00fc170fb3"
-dependencies = [
- "libc",
- "windows-sys 0.45.0",
-]
+checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7"
[[package]]
-name = "js-sys"
-version = "0.3.61"
+name = "is-terminal"
+version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "445dde2150c55e483f3d8416706b97ec8e8237c307e5b7b4b8dd15e6af2a0730"
+checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b"
dependencies = [
- "wasm-bindgen",
+ "hermit-abi",
+ "rustix",
+ "windows-sys",
]
[[package]]
[[package]]
name = "libc"
-version = "0.2.139"
+version = "0.2.149"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79"
+checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b"
[[package]]
name = "libudev-sys"
"pkg-config",
]
-[[package]]
-name = "link-cplusplus"
-version = "1.0.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ecd207c9c713c34f95a097a5b029ac2ce6010530c7b49d7fea24d977dede04f5"
-dependencies = [
- "cc",
-]
-
[[package]]
name = "linux-raw-sys"
-version = "0.1.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4"
-
-[[package]]
-name = "log"
-version = "0.4.17"
+version = "0.4.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e"
-dependencies = [
- "cfg-if",
-]
+checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f"
[[package]]
name = "memchr"
-version = "2.5.0"
+version = "2.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
+checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167"
[[package]]
name = "memoffset"
"minimal-lexical",
]
-[[package]]
-name = "num-integer"
-version = "0.1.45"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9"
-dependencies = [
- "autocfg",
- "num-traits",
-]
-
[[package]]
name = "num-traits"
-version = "0.2.15"
+version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd"
+checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c"
dependencies = [
"autocfg",
]
-[[package]]
-name = "once_cell"
-version = "1.17.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
-
[[package]]
name = "paste"
-version = "1.0.11"
+version = "1.0.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d01a5bd0424d00070b0098dd17ebca6f961a959dead1dbcbbbc1d1cd8d3deeba"
+checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c"
[[package]]
name = "peeking_take_while"
[[package]]
name = "pkg-config"
-version = "0.3.26"
+version = "0.3.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160"
+checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964"
[[package]]
name = "proc-macro2"
-version = "1.0.51"
+version = "1.0.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5d727cae5b39d21da60fa540906919ad737832fe0b1c165da3a34d6548c849d6"
+checksum = "134c189feb4956b20f6f547d2cf727d4c0fe06722b20a0eec87ed445a97f92da"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
-version = "1.0.23"
+version = "1.0.33"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b"
+checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae"
dependencies = [
"proc-macro2",
]
[[package]]
name = "redox_syscall"
-version = "0.2.16"
+version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a"
+checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa"
dependencies = [
- "bitflags",
+ "bitflags 1.3.2",
]
[[package]]
name = "regex"
-version = "1.7.1"
+version = "1.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "48aaa5748ba571fb95cd2c85c09f629215d3a6ece942baa100950af03a34f733"
+checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343"
dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-automata",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f"
+dependencies = [
+ "aho-corasick",
+ "memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
-version = "0.6.28"
+version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848"
+checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
[[package]]
name = "rustc-hash"
[[package]]
name = "rustix"
-version = "0.36.8"
+version = "0.38.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f43abb88211988493c1abb44a70efa56ff0ce98f233b7b276146f1f3f7ba9644"
+checksum = "2b426b0506e5d50a7d8dafcf2e81471400deb602392c7dd110815afb4eaf02a3"
dependencies = [
- "bitflags",
+ "bitflags 2.4.1",
"errno",
- "io-lifetimes",
"libc",
"linux-raw-sys",
- "windows-sys 0.45.0",
+ "windows-sys",
]
-[[package]]
-name = "scratch"
-version = "1.0.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ddccb15bcce173023b3fedd9436f882a0739b8dfb45e4f6b6002bee5929f61b2"
-
[[package]]
name = "shlex"
-version = "1.1.0"
+version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3"
+checksum = "a7cee0529a6d40f580e7a5e6c495c8fbfe21b7b52795ed4bb5e62cdf92bc6380"
[[package]]
name = "syn"
]
[[package]]
-name = "tempfile"
-version = "3.4.0"
+name = "syn"
+version = "2.0.38"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "af18f7ae1acd354b992402e9ec5864359d693cd8a79dcbef59f76891701c1e95"
+checksum = "e96b79aaa137db8f61e26363a0c9b47d8b4ec75da28b7d1d614c2303e232408b"
dependencies = [
- "cfg-if",
- "fastrand",
- "redox_syscall",
- "rustix",
- "windows-sys 0.42.0",
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
]
[[package]]
-name = "termcolor"
-version = "1.2.0"
+name = "tempfile"
+version = "3.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6"
+checksum = "7ef1adac450ad7f4b3c28589471ade84f25f731a7a0fe30d71dfa9f60fd808e5"
dependencies = [
- "winapi-util",
+ "cfg-if",
+ "fastrand",
+ "redox_syscall",
+ "rustix",
+ "windows-sys",
]
[[package]]
name = "thiserror"
-version = "1.0.38"
+version = "1.0.50"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6a9cd18aa97d5c45c6603caea1da6628790b37f7a34b6ca89522331c5180fed0"
+checksum = "f9a7210f5c9a7156bb50aa36aed4c95afb51df0df00713949448cf9e97d382d2"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
-version = "1.0.38"
+version = "1.0.50"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1fb327af4685e4d03fa8cbcf1716380da910eeb2bb8be417e7f9fd3fb164f36f"
+checksum = "266b2e40bc00e5a6c09c3584011e08b06f123c00362c92b975ba9843aaaa14b8"
dependencies = [
"proc-macro2",
"quote",
- "syn",
-]
-
-[[package]]
-name = "time"
-version = "0.1.45"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a"
-dependencies = [
- "libc",
- "wasi",
- "winapi",
+ "syn 2.0.38",
]
[[package]]
[[package]]
name = "unicode-ident"
-version = "1.0.6"
+version = "1.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc"
-
-[[package]]
-name = "unicode-width"
-version = "0.1.10"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b"
+checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
[[package]]
name = "uuid"
-version = "1.3.0"
+version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1674845326ee10d37ca60470760d4288a6f80f304007d92e5c53bab78c9cfd79"
-
-[[package]]
-name = "wasi"
-version = "0.10.0+wasi-snapshot-preview1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f"
-
-[[package]]
-name = "wasm-bindgen"
-version = "0.2.84"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "31f8dcbc21f30d9b8f2ea926ecb58f6b91192c17e9d33594b3df58b2007ca53b"
-dependencies = [
- "cfg-if",
- "wasm-bindgen-macro",
-]
-
-[[package]]
-name = "wasm-bindgen-backend"
-version = "0.2.84"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "95ce90fd5bcc06af55a641a86428ee4229e44e07033963a2290a8e241607ccb9"
-dependencies = [
- "bumpalo",
- "log",
- "once_cell",
- "proc-macro2",
- "quote",
- "syn",
- "wasm-bindgen-shared",
-]
-
-[[package]]
-name = "wasm-bindgen-macro"
-version = "0.2.84"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4c21f77c0bedc37fd5dc21f897894a5ca01e7bb159884559461862ae90c0b4c5"
-dependencies = [
- "quote",
- "wasm-bindgen-macro-support",
-]
-
-[[package]]
-name = "wasm-bindgen-macro-support"
-version = "0.2.84"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2aff81306fcac3c7515ad4e177f521b5c9a15f2b08f4e32d823066102f35a5f6"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
- "wasm-bindgen-backend",
- "wasm-bindgen-shared",
-]
-
-[[package]]
-name = "wasm-bindgen-shared"
-version = "0.2.84"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0046fef7e28c3804e5e38bfa31ea2a0f73905319b677e57ebe37e49358989b5d"
+checksum = "88ad59a7560b41a70d191093a945f0b87bc1deeda46fb237479708a1d6b6cdfc"
[[package]]
name = "winapi"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
-[[package]]
-name = "winapi-util"
-version = "0.1.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178"
-dependencies = [
- "winapi",
-]
-
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
[[package]]
name = "windows-sys"
-version = "0.42.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7"
-dependencies = [
- "windows_aarch64_gnullvm",
- "windows_aarch64_msvc",
- "windows_i686_gnu",
- "windows_i686_msvc",
- "windows_x86_64_gnu",
- "windows_x86_64_gnullvm",
- "windows_x86_64_msvc",
-]
-
-[[package]]
-name = "windows-sys"
-version = "0.45.0"
+version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0"
+checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
dependencies = [
"windows-targets",
]
[[package]]
name = "windows-targets"
-version = "0.42.1"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e2522491fbfcd58cc84d47aeb2958948c4b8982e9a2d8a2a35bbaed431390e7"
+checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
[[package]]
name = "windows_aarch64_gnullvm"
-version = "0.42.1"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608"
+checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
[[package]]
name = "windows_aarch64_msvc"
-version = "0.42.1"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7"
+checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
[[package]]
name = "windows_i686_gnu"
-version = "0.42.1"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640"
+checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
[[package]]
name = "windows_i686_msvc"
-version = "0.42.1"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605"
+checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
[[package]]
name = "windows_x86_64_gnu"
-version = "0.42.1"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45"
+checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
[[package]]
name = "windows_x86_64_gnullvm"
-version = "0.42.1"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463"
+checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
[[package]]
name = "windows_x86_64_msvc"
-version = "0.42.1"
+version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd"
+checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
-chrono = "0.4"
+chrono = { version = "0.4", default-features = false }
colored = "2"
anyhow = "1.0"
udev = "0.7.0"
memoffset = "0.8.0"
byteorder = "1.3"
libc = "0.2.69"
-gag = "1.0.0"
+gag = { version = "1.0.0", default-features = false }
bitflags = "1.3.2"
paste = "1.0.11"
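The two `default-features = false` entries above are what drive the Cargo.lock shrinkage earlier in this diff: chrono's default feature set pulls in the legacy `time` 0.1 crate plus the `wasi`/`wasm-bindgen` stack, all of which drop out of the lockfile once defaults are disabled. A minimal sketch of timestamp formatting that still works under this configuration (the helper name is illustrative, not from the source):

```rust
// Sketch, assuming chrono 0.4 with default features off: Utc::now()
// would need the "clock" feature, but converting an explicit
// (secs, nsecs) pair does not.
use chrono::NaiveDateTime;

fn format_timestamp(secs: i64, nsecs: u32) -> String {
    NaiveDateTime::from_timestamp_opt(secs, nsecs)
        .map(|t| t.to_string())
        .unwrap_or_else(|| "(invalid timestamp)".to_string())
}

fn main() {
    println!("{}", format_timestamp(0, 0)); // 1970-01-01 00:00:00
}
```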
.allowlist_var("KEY_SPEC_.*")
.allowlist_var("Fix753_FMODE_.*")
.allowlist_var("bch.*")
+ .allowlist_var("__bch2.*")
.allowlist_var("__BTREE_ITER.*")
.allowlist_var("BTREE_ITER.*")
.blocklist_item("bch2_bkey_ops")
impl bch_sb {
pub fn crypt(&self) -> Option<&bch_sb_field_crypt> {
unsafe {
- let ptr = bch2_sb_field_get(
+ let ptr = bch2_sb_field_get_id(
self as *const _ as *mut _,
bch_sb_field_type::BCH_SB_FIELD_crypt,
) as *const u8;
inode_v3(&'a c::bch_inode_v3),
bucket_gens(&'a c::bch_bucket_gens),
snapshot_tree(&'a c::bch_snapshot_tree),
+ logged_op_truncate(&'a c::bch_logged_op_truncate),
+ logged_op_finsert(&'a c::bch_logged_op_finsert),
}
impl<'a, 'b> BkeySC<'a> {
KEY_TYPE_inode_v3 => inode_v3(unsafe { transmute(self.v) }),
KEY_TYPE_bucket_gens => bucket_gens(unsafe { transmute(self.v) }),
KEY_TYPE_snapshot_tree => snapshot_tree(unsafe { transmute(self.v) }),
+ KEY_TYPE_logged_op_truncate => logged_op_truncate(unsafe { transmute(self.v) }),
+ KEY_TYPE_logged_op_finsert => logged_op_finsert(unsafe { transmute(self.v) }),
KEY_TYPE_MAX => unreachable!(),
}
}
use bitflags::bitflags;
pub struct BtreeTrans<'f> {
- raw: c::btree_trans,
+ raw: *mut c::btree_trans,
fs: PhantomData<&'f Fs>
}
impl<'f> BtreeTrans<'f> {
pub fn new(fs: &'f Fs) -> BtreeTrans {
unsafe {
- let mut trans: MaybeUninit<c::btree_trans> = MaybeUninit::uninit();
-
- c::__bch2_trans_init(&mut (*trans.as_mut_ptr()), fs.raw, 0);
- BtreeTrans { raw: trans.assume_init(), fs: PhantomData }
+ BtreeTrans { raw: &mut *c::__bch2_trans_get(fs.raw, 0), fs: PhantomData }
}
}
}
impl<'f> Drop for BtreeTrans<'f> {
fn drop(&mut self) {
- unsafe { c::bch2_trans_exit(&mut self.raw) }
+ unsafe { c::bch2_trans_put(&mut *self.raw) }
}
}
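The `BtreeTrans` rewrite above tracks a kernel-side API change: the transaction is no longer caller-allocated and initialized with `__bch2_trans_init`, but allocated by `__bch2_trans_get` and released with `bch2_trans_put`, so the Rust wrapper now owns a raw pointer instead of an embedded struct. A self-contained sketch of that owning-pointer RAII pattern, with the foreign allocator mocked out (all names here are hypothetical stand-ins, not the bcachefs FFI):

```rust
// Mock of an FFI pair like __bch2_trans_get / bch2_trans_put: the
// foreign side owns both allocation and free.
mod ffi {
    pub struct Obj(pub u32);
    pub unsafe fn obj_get(v: u32) -> *mut Obj {
        Box::into_raw(Box::new(Obj(v))) // stands in for the C allocator
    }
    pub unsafe fn obj_put(p: *mut Obj) {
        drop(Box::from_raw(p)); // stands in for the C destructor
    }
}

// The wrapper stores only the raw pointer; Drop hands it back exactly once.
struct Handle {
    raw: *mut ffi::Obj,
}

impl Handle {
    fn new(v: u32) -> Handle {
        Handle { raw: unsafe { ffi::obj_get(v) } }
    }
}

impl Drop for Handle {
    fn drop(&mut self) {
        unsafe { ffi::obj_put(self.raw) }
    }
}

fn main() {
    let h = Handle::new(7);
    println!("{}", unsafe { (*h.raw).0 });
} // obj_put runs here
```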
let mut iter: MaybeUninit<c::btree_iter> = MaybeUninit::uninit();
c::bch2_trans_iter_init_outlined(
- ptr::addr_of!(trans.raw).cast_mut(),
+ trans.raw,
iter.as_mut_ptr(),
- btree as u32,
+ btree,
pos,
flags.bits as u32);
unsafe {
let mut iter: MaybeUninit<c::btree_iter> = MaybeUninit::uninit();
c::bch2_trans_node_iter_init(
- ptr::addr_of!(trans.raw).cast_mut(),
+ trans.raw,
iter.as_mut_ptr(),
btree,
pos,
impl fmt::Display for c::btree_id {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- let s = unsafe { CStr::from_ptr(*c::bch2_btree_ids.get_unchecked(*self as usize)) };
+ let s = unsafe { CStr::from_ptr(c::bch2_btree_id_str(*self)) };
let s = s.to_str().unwrap();
write!(f, "{}", s)
}
let s = CString::new(s).unwrap();
let p = s.as_ptr();
- let v = unsafe {c::match_string(c::bch2_btree_ids[..].as_ptr(), (-(1 as isize)) as usize, p)};
+ let v = unsafe {c::match_string(c::__bch2_btree_ids[..].as_ptr(), (-(1 as isize)) as usize, p)};
if v >= 0 {
Ok(unsafe { std::mem::transmute(v) })
} else {
#include "../include/linux/blkdev.h"
-#define MARK_FIX_753(req_name) const fmode_t Fix753_##req_name = req_name;
+#define MARK_FIX_753(req_name) const blk_mode_t Fix753_##req_name = req_name;
-MARK_FIX_753(FMODE_READ);
-MARK_FIX_753(FMODE_WRITE);
-MARK_FIX_753(FMODE_EXCL);
\ No newline at end of file
+MARK_FIX_753(BLK_OPEN_READ);
+MARK_FIX_753(BLK_OPEN_WRITE);
+MARK_FIX_753(BLK_OPEN_EXCL);
use bch_bindgen::btree::BtreeIter;
use bch_bindgen::btree::BtreeNodeIter;
use bch_bindgen::btree::BtreeIterFlags;
-use clap::Parser;
-use std::ffi::{CStr, OsStr, c_int, c_char};
-use std::os::unix::ffi::OsStrExt;
+use clap::{Args, Parser};
+use std::ffi::{c_int, c_char};
+use crate::transform_c_args;
fn list_keys(fs: &Fs, opt: Cli) -> anyhow::Result<()> {
let trans = BtreeTrans::new(fs);
Ok(())
}
-#[derive(Clone, clap::ValueEnum)]
+#[derive(Clone, clap::ValueEnum, Debug)]
enum Mode {
Keys,
Formats,
NodesOndisk,
}
-#[derive(Parser)]
-struct Cli {
+/// List filesystem metadata in textual form
+#[derive(Parser, Debug)]
+pub struct Cli {
/// Btree to list from
#[arg(short, long, default_value_t=bcachefs::btree_id::BTREE_ID_extents)]
btree: bcachefs::btree_id,
/// Force color on/off. Default: autodetect tty
#[arg(short, long, action = clap::ArgAction::Set, default_value_t=atty::is(Stream::Stdout))]
colorize: bool,
-
+
/// Verbose mode
#[arg(short, long)]
verbose: bool,
}
#[no_mangle]
+#[allow(clippy::not_unsafe_ptr_arg_deref)]
pub extern "C" fn cmd_list(argc: c_int, argv: *const *const c_char) {
- let argv: Vec<_> = (0..argc)
- .map(|i| unsafe { CStr::from_ptr(*argv.add(i as usize)) })
- .map(|i| OsStr::from_bytes(i.to_bytes()))
- .collect();
-
+ transform_c_args!(argv, argc, argv);
let opt = Cli::parse_from(argv);
colored::control::set_override(opt.colorize);
if let Err(e) = cmd_list_inner(opt) {
use atty::Stream;
use bch_bindgen::{bcachefs, bcachefs::bch_sb_handle};
use log::{info, debug, error, LevelFilter};
-use clap::Parser;
+use clap::{Parser, Subcommand};
use uuid::Uuid;
use std::path::PathBuf;
-use crate::key;
-use crate::key::KeyLoc;
+use crate::{key, transform_c_args};
+use crate::key::KeyLocation;
use crate::logger::SimpleLogger;
use std::ffi::{CStr, CString, OsStr, c_int, c_char, c_void};
use std::os::unix::ffi::OsStrExt;
src: String,
target: impl AsRef<std::path::Path>,
fstype: &str,
- mountflags: u64,
+ mountflags: libc::c_ulong,
data: Option<String>,
) -> anyhow::Result<()> {
/// Parse comma-separated mount options and split them into mountflags and
/// filesystem-specific options.
-fn parse_mount_options(options: impl AsRef<str>) -> (Option<String>, u64) {
+fn parse_mount_options(options: impl AsRef<str>) -> (Option<String>, libc::c_ulong) {
use either::Either::*;
debug!("parsing mount options: {}", options.as_ref());
let (opts, flags) = options
/// Mount a bcachefs filesystem by its UUID.
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
-struct Cli {
+pub struct Cli {
/// Where the password would be loaded from.
///
/// Possible values are:
/// "wait" - wait for password to become available before mounting;
/// "ask" - prompt the user for password;
#[arg(short, long, default_value = "ask", verbatim_doc_comment)]
- key_location: KeyLoc,
+ key_location: KeyLocation,
/// Device, or UUID=<UUID>
dev: String,
/// Where the filesystem should be mounted. If not set, then the filesystem
    /// won't actually be mounted. But all steps preceding mounting the
/// filesystem (e.g. asking for passphrase) will still be performed.
- mountpoint: std::path::PathBuf,
+ mountpoint: Option<std::path::PathBuf>,
/// Mount options
#[arg(short, default_value = "")]
if sbs.len() == 0 {
Err(anyhow::anyhow!("No device found from specified parameters"))?;
} else if unsafe { bcachefs::bch2_sb_is_encrypted(sbs[0].sb) } {
- let key = opt
- .key_location
- .0
- .ok_or_else(|| anyhow::anyhow!("no keyoption specified for locked filesystem"))?;
-
- key::prepare_key(&sbs[0], key)?;
+ key::prepare_key(&sbs[0], opt.key_location)?;
}
- info!(
- "mounting with params: device: {}, target: {}, options: {}",
- devs,
- &opt.mountpoint.to_string_lossy(),
- &opt.options
- );
+ if let Some(mountpoint) = opt.mountpoint {
+ info!(
+ "mounting with params: device: {}, target: {}, options: {}",
+ devs,
+ mountpoint.to_string_lossy(),
+ &opt.options
+ );
+
+ mount(devs, mountpoint, &opt.options)?;
+ } else {
+ info!(
+ "would mount with params: device: {}, options: {}",
+ devs,
+ &opt.options
+ );
+ }
- mount(devs, &opt.mountpoint, &opt.options)?;
Ok(())
}
#[no_mangle]
-pub extern "C" fn cmd_mount(argc: c_int, argv: *const *const c_char) {
- let argv: Vec<_> = (0..argc)
- .map(|i| unsafe { CStr::from_ptr(*argv.add(i as usize)) })
- .map(|i| OsStr::from_bytes(i.to_bytes()))
- .collect();
-
+#[allow(clippy::not_unsafe_ptr_arg_deref)]
+pub extern "C" fn cmd_mount(argc: c_int, argv: *const *const c_char) -> c_int {
+ transform_c_args!(argv, argc, argv);
let opt = Cli::parse_from(argv);
-
+
log::set_boxed_logger(Box::new(SimpleLogger)).unwrap();
// @TODO : more granular log levels via mount option
colored::control::set_override(opt.colorize);
if let Err(e) = cmd_mount_inner(opt) {
error!("Fatal error: {}", e);
+ 1
} else {
info!("Successfully mounted");
+ 0
}
}
use log::{info};
use bch_bindgen::bcachefs::bch_sb_handle;
+use clap::builder::PossibleValue;
use crate::c_str;
use anyhow::anyhow;
#[derive(Clone, Debug)]
pub enum KeyLocation {
+ None,
Fail,
Wait,
Ask,
}
-#[derive(Clone, Debug)]
-pub struct KeyLoc(pub Option<KeyLocation>);
-impl std::ops::Deref for KeyLoc {
- type Target = Option<KeyLocation>;
- fn deref(&self) -> &Self::Target {
- &self.0
- }
-}
-
-impl std::str::FromStr for KeyLoc {
+impl std::str::FromStr for KeyLocation {
type Err = anyhow::Error;
fn from_str(s: &str) -> anyhow::Result<Self> {
match s {
- "" => Ok(KeyLoc(None)),
- "fail" => Ok(KeyLoc(Some(KeyLocation::Fail))),
- "wait" => Ok(KeyLoc(Some(KeyLocation::Wait))),
- "ask" => Ok(KeyLoc(Some(KeyLocation::Ask))),
- _ => Err(anyhow!("invalid password option")),
+ ""|"none" => Ok(KeyLocation::None),
+ "fail" => Ok(KeyLocation::Fail),
+ "wait" => Ok(KeyLocation::Wait),
+ "ask" => Ok(KeyLocation::Ask),
+ _ => Err(anyhow!("invalid password option")),
}
}
}
+impl clap::ValueEnum for KeyLocation {
+ fn value_variants<'a>() -> &'a [Self] {
+ &[
+ KeyLocation::None,
+ KeyLocation::Fail,
+ KeyLocation::Wait,
+ KeyLocation::Ask,
+ ]
+ }
+
+ fn to_possible_value(&self) -> Option<PossibleValue> {
+ Some(match self {
+ Self::None => PossibleValue::new("none").alias(""),
+ Self::Fail => PossibleValue::new("fail"),
+ Self::Wait => PossibleValue::new("wait"),
+ Self::Ask => PossibleValue::new("ask"),
+ })
+ }
+}
+
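With `KeyLocation` now implementing both `FromStr` (where the empty string is accepted as `None`) and clap's `ValueEnum`, a small test can pin the two parsers to the same behaviour; this is a sketch, not a test from the source:

```rust
#[cfg(test)]
mod key_location_tests {
    use super::KeyLocation;
    use std::str::FromStr;

    #[test]
    fn from_str_matches_possible_values() {
        assert!(matches!(KeyLocation::from_str(""), Ok(KeyLocation::None)));
        assert!(matches!(KeyLocation::from_str("none"), Ok(KeyLocation::None)));
        assert!(matches!(KeyLocation::from_str("ask"), Ok(KeyLocation::Ask)));
        assert!(KeyLocation::from_str("bogus").is_err());
    }
}
```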
fn check_for_key(key_name: &std::ffi::CStr) -> anyhow::Result<bool> {
use bch_bindgen::keyutils::{self, keyctl_search};
let key_name = key_name.to_bytes_with_nul().as_ptr() as *const _;
- let key_type = c_str!("logon");
+ let key_type = c_str!("user");
let key_id = unsafe { keyctl_search(keyutils::KEY_SPEC_USER_KEYRING, key_type, key_name, 0) };
if key_id > 0 {
info!("Key has became available");
Ok(true)
- } else if errno::errno().0 != libc::ENOKEY {
- Err(crate::ErrnoError(errno::errno()).into())
} else {
- Ok(false)
+ match errno::errno().0 {
+ libc::ENOKEY | libc::EKEYREVOKED => Ok(false),
+ _ => Err(crate::ErrnoError(errno::errno()).into()),
+ }
}
}
} else if key.magic != bch_key_magic {
Err(anyhow!("failed to verify the password"))
} else {
- let key_type = c_str!("logon");
+ let key_type = c_str!("user");
let ret = unsafe {
bch_bindgen::keyutils::add_key(
key_type,
KeyLocation::Fail => Err(anyhow!("no key available")),
KeyLocation::Wait => Ok(wait_for_key(&sb.sb().uuid())?),
KeyLocation::Ask => ask_for_key(sb),
+ _ => Err(anyhow!("no keyoption specified for locked filesystem")),
}
}
+use clap::Subcommand;
+
pub mod key;
pub mod logger;
pub mod cmd_mount;
pub mod cmd_list;
+pub mod cmd_completions;
+
+#[derive(clap::Parser, Debug)]
+#[command(name = "bcachefs")]
+pub struct Cli {
+ #[command(subcommand)]
+ subcommands: Subcommands,
+}
+
+#[derive(Subcommand, Debug)]
+enum Subcommands {
+ List(cmd_list::Cli),
+ Mount(cmd_mount::Cli),
+ Completions(cmd_completions::Cli),
+}
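For context, this is the usual dispatch shape for a clap derive `Subcommand` enum like the one above; a standalone sketch assuming `clap = { version = "4", features = ["derive"] }`, with simplified subcommands that are not the source's:

```rust
use clap::{Parser, Subcommand};

#[derive(Parser, Debug)]
#[command(name = "demo")]
struct Cli {
    #[command(subcommand)]
    subcommands: Subcommands,
}

#[derive(Subcommand, Debug)]
enum Subcommands {
    List { #[arg(short, long)] verbose: bool },
    Mount { dev: String },
}

fn main() {
    // Each variant parses its own flags; dispatch is a plain match.
    match Cli::parse().subcommands {
        Subcommands::List { verbose } => println!("list, verbose={verbose}"),
        Subcommands::Mount { dev } => println!("mount {dev}"),
    }
}
```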
#[macro_export]
macro_rules! c_str {
};
}
+#[macro_export]
+macro_rules! transform_c_args {
+ ($var:ident, $argc:expr, $argv:expr) => {
+ // TODO: `OsStr::from_bytes` only exists on *nix
+ use ::std::os::unix::ffi::OsStrExt;
+ let $var: Vec<_> = (0..$argc)
+ .map(|i| unsafe { ::std::ffi::CStr::from_ptr(*$argv.add(i as usize)) })
+ .map(|i| ::std::ffi::OsStr::from_bytes(i.to_bytes()))
+ .collect();
+ };
+}
+
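For reference, here is roughly what one expansion of `transform_c_args!` does, exercised against a hand-built argv so the sketch runs standalone (nothing beyond the macro body itself comes from the source):

```rust
use std::ffi::{CStr, CString, OsStr};
use std::os::raw::{c_char, c_int};
use std::os::unix::ffi::OsStrExt;

fn main() {
    // Simulate the argc/argv a C caller hands to cmd_list/cmd_mount.
    let owned: Vec<CString> = ["bcachefs", "list", "-v"]
        .iter()
        .map(|s| CString::new(*s).unwrap())
        .collect();
    let ptrs: Vec<*const c_char> = owned.iter().map(|s| s.as_ptr()).collect();
    let (argc, argv): (c_int, *const *const c_char) = (ptrs.len() as c_int, ptrs.as_ptr());

    // Equivalent of `transform_c_args!(args, argc, argv);`
    let args: Vec<&OsStr> = (0..argc)
        .map(|i| unsafe { CStr::from_ptr(*argv.add(i as usize)) })
        .map(|s| OsStr::from_bytes(s.to_bytes()))
        .collect();

    assert_eq!(args, ["bcachefs", "list", "-v"]);
}
```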
#[derive(Debug)]
struct ErrnoError(errno::Errno);
impl std::fmt::Display for ErrnoError {
#include <blkid.h>
#include <uuid/uuid.h>
+#include "libbcachefs.h"
#include "libbcachefs/bcachefs_ioctl.h"
#include "linux/sort.h"
#include "tools-util.h"
}
/* Returns size of file or block device: */
-u64 get_size(const char *path, int fd)
+u64 get_size(int fd)
{
struct stat statbuf = xfstat(fd);
}
/* Returns blocksize, in bytes: */
-unsigned get_blocksize(const char *path, int fd)
+unsigned get_blocksize(int fd)
{
struct stat statbuf = xfstat(fd);
}
/* Open a block device, do magic blkid stuff to probe for existing filesystems: */
-int open_for_format(const char *dev, bool force)
+int open_for_format(struct dev_opts *dev, bool force)
{
blkid_probe pr;
const char *fs_type = NULL, *fs_label = NULL;
size_t fs_type_len, fs_label_len;
- int fd = open(dev, O_RDWR|O_EXCL);
- if (fd < 0)
- die("Error opening device to format %s: %m", dev);
+ dev->bdev = blkdev_get_by_path(dev->path, BLK_OPEN_READ|BLK_OPEN_WRITE|BLK_OPEN_EXCL,
+ dev, NULL);
+ int ret = PTR_ERR_OR_ZERO(dev->bdev);
+ if (ret < 0)
+ die("Error opening device to format %s: %s", dev->path, strerror(-ret));
if (force)
- return fd;
+ return 0;
if (!(pr = blkid_new_probe()))
die("blkid error 1");
- if (blkid_probe_set_device(pr, fd, 0, 0))
+ if (blkid_probe_set_device(pr, dev->bdev->bd_buffered_fd, 0, 0))
die("blkid error 2");
if (blkid_probe_enable_partitions(pr, true))
die("blkid error 3");
if (fs_type) {
if (fs_label)
printf("%s contains a %s filesystem labelled '%s'\n",
- dev, fs_type, fs_label);
+ dev->path, fs_type, fs_label);
else
printf("%s contains a %s filesystem\n",
- dev, fs_type);
+ dev->path, fs_type);
fputs("Proceed anyway?", stdout);
if (!ask_yn())
exit(EXIT_FAILURE);
while (blkid_do_probe(pr) == 0)
- blkid_do_wipe(pr, 0);
+ blkid_do_wipe(pr, 0);
}
blkid_free_probe(pr);
- return fd;
+ return ret;
}
bool ask_yn(void)
if (!(field = strsep(&s, ":")))
die("invalid bbpos %s", buf);
- ret.btree = read_string_list_or_die(field, bch2_btree_ids, "btree id");
+ ret.btree = read_string_list_or_die(field, __bch2_btree_ids, "btree id");
+
+ if (!s)
+ die("invalid bbpos %s", buf);
+
ret.pos = bpos_parse(s);
return ret;
}
ssize_t read_string_list_or_die(const char *, const char * const[],
const char *);
-u64 get_size(const char *, int);
-unsigned get_blocksize(const char *, int);
-int open_for_format(const char *, bool);
+u64 get_size(int);
+unsigned get_blocksize(int);
+struct dev_opts;
+int open_for_format(struct dev_opts *, bool);
bool ask_yn(void);