mount/target
mount.bcachefs
-doc/bcachefs.5.rst
+bcachefs-principles-of-operation.*
apt:
packages:
- valgrind
+ - python3-docutils
- python3-pytest
- python3-pytest-xdist
- meson
- libblkid-dev
- libkeyutils-dev
- liblz4-dev
- - libscrypt-dev
- libsodium-dev
- liburcu-dev
- libzstd-dev
* libblkid
* libkeyutils
* liblz4
- * libscrypt
* libsodium
* liburcu
* libuuid
Debian (Bullseye or later) and Ubuntu (20.04 or later): you can install these with
apt install -y pkg-config libaio-dev libblkid-dev libkeyutils-dev \
- liblz4-dev libscrypt-dev libsodium-dev liburcu-dev libzstd-dev \
- uuid-dev zlib1g-dev valgrind libudev-dev
+ liblz4-dev libsodium-dev liburcu-dev libzstd-dev \
+ uuid-dev zlib1g-dev valgrind libudev-dev git build-essential \
+ python3 python3-docutils
Fedora: install the "Development tools" group along with:
dnf install -y libaio-devel libsodium-devel \
libblkid-devel libzstd-devel zlib-devel userspace-rcu-devel \
lz4-devel libuuid-devel valgrind-devel keyutils-libs-devel \
- libscrypt-devel findutils
+ findutils
Arch: install bcachefs-tools-git from the AUR.
-Or to build from source, install libscrypt from the AUR along with,
+Or, to build from source, install the build dependencies with:
pacman -S base-devel libaio keyutils libsodium liburcu zstd valgrind
Then, just make && make install
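For example, a from-scratch build and install might look like this
(illustrative; assumes the dependencies above are installed):

git clone https://evilpiepirate.org/git/bcachefs-tools.git
cd bcachefs-tools
make
sudo make install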
PREFIX?=/usr/local
PKG_CONFIG?=pkg-config
INSTALL=install
-PYTEST=pytest-3
-CFLAGS+=-std=gnu89 -O2 -g -MMD -Wall \
+
+CFLAGS+=-std=gnu89 -O2 -g -MMD -Wall -fPIC \
-Wno-pointer-sign \
-fno-strict-aliasing \
-fno-delete-null-pointer-checks \
$(EXTRA_CFLAGS)
LDFLAGS+=$(CFLAGS) $(EXTRA_LDFLAGS)
+## Configure Tools
+PYTEST_ARGS?=
+PYTEST_CMD?=$(shell \
+ command -v pytest-3 \
+ || which pytest-3 \
+)
+PYTEST:=$(PYTEST_CMD) $(PYTEST_ARGS)
+
+CARGO_ARGS=
+CARGO=cargo $(CARGO_ARGS)
+CARGO_PROFILE=release
+# CARGO_PROFILE=debug
+
+CARGO_BUILD_ARGS=--$(CARGO_PROFILE)
+CARGO_BUILD=$(CARGO) build $(CARGO_BUILD_ARGS)
VERSION?=$(shell git describe --dirty=+ 2>/dev/null || echo v0.1-nogit)
include Makefile.compiler
CFLAGS+=$(PKGCONFIG_CFLAGS)
LDLIBS+=$(PKGCONFIG_LDLIBS)
-
-LDLIBS+=-lm -lpthread -lrt -lscrypt -lkeyutils -laio -ldl
+LDLIBS+=-lm -lpthread -lrt -lkeyutils -laio -ldl
LDLIBS+=$(EXTRA_LDLIBS)
ifeq ($(PREFIX),/usr)
INITRAMFS_DIR=/etc/initramfs-tools
endif
-var := $(shell rst2man -V 2>/dev/null)
-ifeq ($(.SHELLSTATUS),0)
- RST2MAN=rst2man
-endif
-
-var := $(shell rst2man.py -V 2>/dev/null)
-ifeq ($(.SHELLSTATUS),0)
- RST2MAN=rst2man.py
-endif
-
-undefine var
-
-ifeq (,$(RST2MAN))
- @echo "WARNING: no RST2MAN found!"
-endif
-
.PHONY: all
-all: bcachefs bcachefs.5
+all: bcachefs lib
+
+.PHONY: lib
+lib: libbcachefs.so
.PHONY: tests
tests: tests/test_helper
.PHONY: check
check: tests bcachefs
- cd tests; $(PYTEST)
+ifneq (,$(PYTEST_CMD))
+ $(PYTEST)
+else
+ @echo "WARNING: pytest not found or specified, tests could not be run."
+endif
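+
+# The PYTEST_*/CARGO_* knobs above are ordinary make variables and can be
+# overridden per invocation, e.g. (illustrative):
+#   make check PYTEST_CMD=pytest PYTEST_ARGS="-n4" CARGO_PROFILE=debug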
.PHONY: TAGS tags
TAGS:
tags:
ctags -R .
-DOCSRC := opts_macro.h bcachefs.5.rst.tmpl
-DOCGENERATED := bcachefs.5 doc/bcachefs.5.rst
-DOCDEPS := $(addprefix ./doc/,$(DOCSRC))
-bcachefs.5: $(DOCDEPS) libbcachefs/opts.h
- $(CC) doc/opts_macro.h -I libbcachefs -I include -E 2>/dev/null \
- | doc/macro2rst.py
- $(RST2MAN) doc/bcachefs.5.rst bcachefs.5
-
-SRCS=$(shell find . -type f -iname '*.c')
+SRCS=$(shell find . -type f ! -path '*/.*/*' -iname '*.c')
DEPS=$(SRCS:.c=.d)
-include $(DEPS)
OBJS=$(SRCS:.c=.o)
bcachefs: $(filter-out ./tests/%.o, $(OBJS))
-MOUNT_SRCS=$(shell find mount/src -type f -iname '*.rs') \
- mount/Cargo.toml mount/Cargo.lock mount/build.rs
+RUST_SRCS=$(shell find rust-src/ -type f -iname '*.rs')
+MOUNT_SRCS=$(filter %mount, $(RUST_SRCS))
debug: CFLAGS+=-Werror -DCONFIG_BCACHEFS_DEBUG=y -DCONFIG_VALGRIND=y
debug: bcachefs
-libbcachefs_mount.a: $(MOUNT_SRCS)
- LIBBCACHEFS_INCLUDE=$(CURDIR) cargo build --manifest-path mount/Cargo.toml --release
- cp mount/target/release/libbcachefs_mount.a $@
-
MOUNT_OBJ=$(filter-out ./bcachefs.o ./tests/%.o ./cmd_%.o , $(OBJS))
-mount.bcachefs: libbcachefs_mount.a $(MOUNT_OBJ)
- $(CC) -Wl,--gc-sections libbcachefs_mount.a $(MOUNT_OBJ) -o $@ $(LDLIBS)
+libbcachefs.so: LDFLAGS+=-shared
+libbcachefs.so: $(MOUNT_OBJ)
+ $(CC) $(LDFLAGS) $+ -o $@ $(LDLIBS)
+
+MOUNT_TOML=rust-src/mount/Cargo.toml
+mount.bcachefs: lib $(MOUNT_SRCS)
+ LIBBCACHEFS_LIB=$(CURDIR) \
+ LIBBCACHEFS_INCLUDE=$(CURDIR) \
+ $(CARGO_BUILD) --manifest-path $(MOUNT_TOML)
+
+ ln -f rust-src/mount/target/$(CARGO_PROFILE)/bcachefs-mount $@
+
tests/test_helper: $(filter ./tests/%.o, $(OBJS))
.PHONY: install
install: INITRAMFS_HOOK=$(INITRAMFS_DIR)/hooks/bcachefs
install: INITRAMFS_SCRIPT=$(INITRAMFS_DIR)/scripts/local-premount/bcachefs
-install: bcachefs
+install: bcachefs lib
$(INSTALL) -m0755 -D bcachefs -t $(DESTDIR)$(ROOT_SBINDIR)
$(INSTALL) -m0755 fsck.bcachefs $(DESTDIR)$(ROOT_SBINDIR)
$(INSTALL) -m0755 mkfs.bcachefs $(DESTDIR)$(ROOT_SBINDIR)
$(INSTALL) -m0755 -D initramfs/script $(DESTDIR)$(INITRAMFS_SCRIPT)
$(INSTALL) -m0755 -D initramfs/hook $(DESTDIR)$(INITRAMFS_HOOK)
$(INSTALL) -m0755 -D mount.bcachefs.sh $(DESTDIR)$(ROOT_SBINDIR)
+ $(INSTALL) -m0755 -D libbcachefs.so -t $(DESTDIR)$(PREFIX)/lib/
+
sed -i '/^# Note: make install replaces/,$$d' $(DESTDIR)$(INITRAMFS_HOOK)
echo "copy_exec $(ROOT_SBINDIR)/bcachefs /sbin/bcachefs" >> $(DESTDIR)$(INITRAMFS_HOOK)
.PHONY: clean
clean:
$(RM) bcachefs mount.bcachefs libbcachefs_mount.a tests/test_helper .version $(OBJS) $(DEPS) $(DOCGENERATED)
- $(RM) -rf mount/target
+ $(RM) -rf rust-src/*/target
.PHONY: deb
deb: all
debuild -us -uc -nc -b -i -I
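+# pdflatex runs twice so LaTeX can resolve cross-references on the second pass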
+bcachefs-principles-of-operation.pdf: doc/bcachefs-principles-of-operation.tex
+ pdflatex doc/bcachefs-principles-of-operation.tex
+ pdflatex doc/bcachefs-principles-of-operation.tex
+
+doc: bcachefs-principles-of-operation.pdf
+
.PHONY: update-bcachefs-sources
update-bcachefs-sources:
git rm -rf --ignore-unmatch libbcachefs
git -C $(LINUX_DIR) rev-parse HEAD | tee .bcachefs_revision
git add .bcachefs_revision
+
.PHONY: update-commit-bcachefs-sources
update-commit-bcachefs-sources: update-bcachefs-sources
git commit -m "Update bcachefs sources to $(shell git -C $(LINUX_DIR) show --oneline --no-patch)"
You need to do this before you create a volume.
.Pp
Device specific options must come before corresponding devices, e.g.
-.Dl bcachefs format --group=ssd /dev/sda --group=hdd /dev/sdb
+.Dl bcachefs format --label=ssd /dev/sda --label=hdd /dev/sdb
.Bl -tag -width Ds
.It Fl b , Fl -block Ns = Ns Ar size
block size, in bytes (e.g. 4k)
.It Fl -data_checksum_type Ns = Ns ( Cm none | crc32c | crc64 )
Set data checksum type (default:
.Cm crc32c ) .
-.It Fl -compression_type Ns = Ns ( Cm none | lz4 | gzip )
+.It Fl -compression Ns = Ns ( Cm none | lz4 | gzip | zstd )
Set compression type (default:
.Cm none ) .
.It Fl -data_replicas Ns = Ns Ar number
"Repair:\n"
" fsck Check an existing filesystem for errors\n"
"\n"
- "Startup/shutdown, assembly of multi device filesystems:\n"
#if 0
+ "Startup/shutdown, assembly of multi device filesystems:\n"
" assemble Assemble an existing multi device filesystem\n"
" incremental Incrementally assemble an existing multi device filesystem\n"
" run Start a partially assembled filesystem\n"
" stop Stop a running filesystem\n"
-#endif
"\n"
+#endif
"Commands for managing a running filesystem:\n"
" fs usage Show disk usage\n"
"\n"
" device evacuate Migrate data off of a specific device\n"
" device set-state Mark a device as failed\n"
" device resize Resize filesystem on a device\n"
- " device journal-resize Resize journal on a device\n"
+ " device resize-journal Resize journal on a device\n"
+ "\n"
+ "Commands for managing subvolumes and snapshots:\n"
+ " subvolume create Create a new subvolume\n"
+ " subvolume delete Delete an existing subvolume\n"
+ " subvolume snapshot Create a snapshot\n"
"\n"
"Commands for managing filesystem data:\n"
" data rereplicate Rereplicate degraded data\n"
static char *pop_cmd(int *argc, char *argv[])
{
- if (*argc < 2) {
- printf("%s: missing command\n", argv[0]);
- usage();
- exit(EXIT_FAILURE);
- }
-
char *cmd = argv[1];
- memmove(&argv[1], &argv[2], *argc * sizeof(argv[0]));
+	if (*argc >= 2)
+		/* shift argv[2..argc], including the NULL terminator, down one */
+		memmove(&argv[1], &argv[2], (*argc - 1) * sizeof(argv[0]));
(*argc)--;
full_cmd = mprintf("%s %s", full_cmd, cmd);
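pop_cmd() shifts argv left by one so each subcommand parser sees only its own
arguments; a minimal self-contained sketch of the pattern (shift_args is a
hypothetical stand-in, not the tool's code):

#include <stdio.h>
#include <string.h>

static char *shift_args(int *argc, char *argv[])
{
	if (*argc < 2)
		return NULL;

	char *cmd = argv[1];
	/* move argv[2..argc], including the NULL terminator, down one slot */
	memmove(&argv[1], &argv[2], (*argc - 1) * sizeof(argv[0]));
	(*argc)--;
	return cmd;
}

int main(int argc, char *argv[])
{
	char *cmd = shift_args(&argc, argv);

	if (!cmd) {
		fprintf(stderr, "%s: missing command\n", argv[0]);
		return 1;
	}
	printf("dispatching '%s', %d argument(s) remain\n", cmd, argc - 1);
	return 0;
}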
{
char *cmd = pop_cmd(&argc, argv);
+ if (argc < 1)
+ return fs_usage();
if (!strcmp(cmd, "usage"))
return cmd_fs_usage(argc, argv);
- usage();
return 0;
}
{
char *cmd = pop_cmd(&argc, argv);
+ if (argc < 1)
+ return device_usage();
if (!strcmp(cmd, "add"))
return cmd_device_add(argc, argv);
if (!strcmp(cmd, "remove"))
if (!strcmp(cmd, "resize-journal"))
return cmd_device_resize_journal(argc, argv);
- usage();
return 0;
}
{
char *cmd = pop_cmd(&argc, argv);
+ if (argc < 1)
+ return data_usage();
if (!strcmp(cmd, "rereplicate"))
return cmd_data_rereplicate(argc, argv);
if (!strcmp(cmd, "job"))
return cmd_data_job(argc, argv);
- usage();
+ return 0;
+}
+
+static int subvolume_cmds(int argc, char *argv[])
+{
+ char *cmd = pop_cmd(&argc, argv);
+ if (argc < 1)
+ return subvolume_usage();
+ if (!strcmp(cmd, "create"))
+ return cmd_subvolume_create(argc, argv);
+ if (!strcmp(cmd, "delete"))
+ return cmd_subvolume_delete(argc, argv);
+ if (!strcmp(cmd, "snapshot"))
+ return cmd_subvolume_snapshot(argc, argv);
+
return 0;
}
setvbuf(stdout, NULL, _IOLBF, 0);
char *cmd = pop_cmd(&argc, argv);
+ if (argc < 1) {
+ puts("missing command\n");
+ goto usage;
+ }
- if (!strcmp(cmd, "version"))
- return cmd_version(argc, argv);
+ /* these subcommands display usage when argc < 2 */
+ if (!strcmp(cmd, "device"))
+ return device_cmds(argc, argv);
+ if (!strcmp(cmd, "fs"))
+ return fs_cmds(argc, argv);
+ if (!strcmp(cmd, "data"))
+ return data_cmds(argc, argv);
+ if (!strcmp(cmd, "subvolume"))
+ return subvolume_cmds(argc, argv);
if (!strcmp(cmd, "format"))
return cmd_format(argc, argv);
+ if (!strcmp(cmd, "fsck"))
+ return cmd_fsck(argc, argv);
+ if (!strcmp(cmd, "version"))
+ return cmd_version(argc, argv);
if (!strcmp(cmd, "show-super"))
return cmd_show_super(argc, argv);
- if (!strcmp(cmd, "fsck"))
- return cmd_fsck(argc, argv);
#if 0
if (!strcmp(cmd, "assemble"))
return cmd_stop(argc, argv);
#endif
- if (!strcmp(cmd, "fs"))
- return fs_cmds(argc, argv);
-
- if (!strcmp(cmd, "device"))
- return device_cmds(argc, argv);
-
- if (!strcmp(cmd, "data"))
- return data_cmds(argc, argv);
-
if (!strcmp(cmd, "unlock"))
return cmd_unlock(argc, argv);
if (!strcmp(cmd, "set-passphrase"))
}
printf("Unknown command %s\n", cmd);
+usage:
usage();
exit(EXIT_FAILURE);
}
#include "cmds.h"
#include "libbcachefs.h"
+int data_usage(void)
+{
+ puts("bcachefs data - manage filesystem data\n"
+ "Usage: bcachefs data <CMD> [OPTIONS]\n"
+ "\n"
+ "Commands:\n"
+ " rereplicate Rereplicate degraded data\n"
+ " job Kick off low level data jobs\n"
+ "\n"
+ "Report bugs to <linux-bcache@vger.kernel.org>");
+ return 0;
+}
+
static void data_rereplicate_usage(void)
{
puts("bcachefs data rereplicate\n"
struct bch_sb *sb = ca->disk_sb.sb;
ranges data;
unsigned i;
+ int ret;
darray_init(data);
const struct bch_extent_ptr *ptr;
struct bkey_ptrs_c ptrs;
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct btree *b;
bch2_trans_init(&trans, c, 0, 0);
- __for_each_btree_node(&trans, iter, i, POS_MIN, 0, 1, 0, b) {
+ __for_each_btree_node(&trans, iter, i, POS_MIN, 0, 1, 0, b, ret) {
struct btree_node_iter iter;
struct bkey u;
struct bkey_s_c k;
}
}
+ if (ret)
+ die("error %s walking btree nodes", strerror(-ret));
+
b = c->btree_roots[i].b;
if (!btree_node_fake(b)) {
ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key));
ptr->offset << 9,
btree_bytes(c));
}
+
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
}
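The new __for_each_btree_node() signature threads an int ret out-parameter
through the loop, so the body never runs after a failure and the caller checks
ret exactly once afterwards. A standalone sketch of the same shape (walk_* are
toy stand-ins, not bcachefs API):

#include <stdio.h>

/* toy cursor; fail_at simulates an I/O error partway through */
struct walk { int pos, end, fail_at; };

static int walk_init(struct walk *w)
{
	w->pos = 0;
	return 0;
}

static int walk_next(struct walk *w, int *node, int *ret)
{
	if (w->pos == w->fail_at) {
		*ret = -5;		/* simulated -EIO */
		return 0;
	}
	if (w->pos >= w->end)
		return 0;		/* done; *ret stays 0 */
	*node = w->pos++;
	return 1;
}

/* the loop stops on either end-of-iteration or error; the caller
 * inspects ret once afterwards */
#define walk_nodes(w, node, ret)				\
	for ((ret) = walk_init(w);				\
	     !(ret) && walk_next((w), &(node), &(ret));		\
	     )

int main(void)
{
	struct walk w = { .end = 4, .fail_at = 2 };
	int node, ret;

	walk_nodes(&w, node, ret)
		printf("visited node %d\n", node);

	if (ret)
		fprintf(stderr, "error %d walking nodes\n", ret);
	return 0;
}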
struct bpos start, struct bpos end)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
char buf[512];
int ret;
bch2_trans_init(&trans, c, 0, 0);
for_each_btree_key(&trans, iter, btree_id, start,
+ BTREE_ITER_ALL_SNAPSHOTS|
BTREE_ITER_PREFETCH, k, ret) {
if (bkey_cmp(k.k->p, end) > 0)
break;
bch2_bkey_val_to_text(&PBUF(buf), c, k);
puts(buf);
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
}
struct bpos start, struct bpos end)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct btree *b;
char buf[4096];
+ int ret;
bch2_trans_init(&trans, c, 0, 0);
- __for_each_btree_node(&trans, iter, btree_id, start, 0, level, 0, b) {
+ __for_each_btree_node(&trans, iter, btree_id, start, 0, level, 0, b, ret) {
if (bkey_cmp(b->key.k.p, end) > 0)
break;
bch2_btree_node_to_text(&PBUF(buf), c, b);
puts(buf);
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
+
+ if (ret)
+ die("error %s walking btree nodes", strerror(-ret));
bch2_trans_exit(&trans);
}
struct bpos start, struct bpos end)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct btree *b;
char buf[4096];
+ int ret;
bch2_trans_init(&trans, c, 0, 0);
- __for_each_btree_node(&trans, iter, btree_id, start, 0, level, 0, b) {
+ __for_each_btree_node(&trans, iter, btree_id, start, 0, level, 0, b, ret) {
if (bkey_cmp(b->key.k.p, end) > 0)
break;
fputs(buf, stdout);
putchar('\n');
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
+
+ if (ret)
+ die("error %s walking btree nodes", strerror(-ret));
bch2_trans_exit(&trans);
}
bio_put(bio);
percpu_ref_put(&ca->io_ref);
- while (offset < c->opts.btree_node_size) {
+ while (offset < btree_sectors(c)) {
struct bset *i;
struct nonce nonce;
struct bch_csum csum;
struct bpos start, struct bpos end)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct btree *b;
char buf[4096];
+ int ret;
bch2_trans_init(&trans, c, 0, 0);
- __for_each_btree_node(&trans, iter, btree_id, start, 0, level, 0, b) {
+ __for_each_btree_node(&trans, iter, btree_id, start, 0, level, 0, b, ret) {
if (bkey_cmp(b->key.k.p, end) > 0)
break;
print_node_ondisk(c, b);
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
+
+ if (ret)
+ die("error %s walking btree nodes", strerror(-ret));
bch2_trans_exit(&trans);
}
struct bpos start, struct bpos end)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct btree_node_iter node_iter;
struct bkey unpacked;
struct bkey_s_c k;
struct btree *b;
char buf[4096];
+ int ret;
bch2_trans_init(&trans, c, 0, 0);
- __for_each_btree_node(&trans, iter, btree_id, start, 0, level, 0, b) {
+ __for_each_btree_node(&trans, iter, btree_id, start, 0, level, 0, b, ret) {
if (bkey_cmp(b->key.k.p, end) > 0)
break;
puts(buf);
}
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
+
+ if (ret)
+ die("error %s walking btree nodes", strerror(-ret));
bch2_trans_exit(&trans);
}
enum btree_id btree_id_start = 0;
enum btree_id btree_id_end = BTREE_ID_NR;
enum btree_id btree_id;
- unsigned level;
+ unsigned level = 0;
struct bpos start = POS_MIN, end = POS_MAX;
- u64 inum;
+ u64 inum = 0;
int mode = 0, opt;
opt_set(opts, nochanges, true);
struct journal_replay *p;
struct jset_entry *entry;
- struct bkey_i *k, *_n;
-
- /* This could be greatly expanded: */
list_for_each_entry(p, &c->journal_entries, list) {
printf("journal entry %8llu\n"
le32_to_cpu(p->j.version),
le64_to_cpu(p->j.last_seq));
- for_each_jset_key(k, _n, entry, &p->j) {
- char buf[200];
+ vstruct_for_each(&p->j, entry) {
+ char _buf[4096];
+ struct printbuf buf = PBUF(_buf);
- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k));
- printf("btree %s l %u: %s\n",
- bch2_btree_ids[entry->btree_id],
- entry->level,
- buf);
+ printbuf_indent_push(&buf, 2);
+ bch2_journal_entry_to_text(&buf, c, entry);
+ printf("%s\n", _buf);
}
}
#include "libbcachefs/opts.h"
#include "tools-util.h"
+int device_usage(void)
+{
+ puts("bcachefs device - manage devices within a running filesystem\n"
+ "Usage: bcachefs device <CMD> [OPTION]\n"
+ "\n"
+ "Commands:\n"
+ " add add a new device to an existing filesystem\n"
+ " remove remove a device from an existing filesystem\n"
+ " online re-add an existing member to a filesystem\n"
+ " offline take a device offline, without removing it\n"
+ " evacuate migrate data off a specific device\n"
+ " set-state mark a device as failed\n"
+ " resize resize filesystem on a device\n"
+ " resize-journal resize journal on a device\n"
+ "\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
+ return 0;
+}
+
static void device_add_usage(void)
{
puts("bcachefs device add - add a device to an existing filesystem\n"
" -S, --fs_size=size Size of filesystem on device\n"
" -B, --bucket=size Bucket size\n"
" -D, --discard Enable discards\n"
- " -g, --group=group Disk group\n"
+ " -l, --label=label Disk label\n"
" -f, --force Use device even if it appears to already be formatted\n"
" -h, --help Display this help and exit\n"
"\n"
{ "fs_size", required_argument, NULL, 'S' },
{ "bucket", required_argument, NULL, 'B' },
{ "discard", no_argument, NULL, 'D' },
- { "group", required_argument, NULL, 'g' },
+ { "label", required_argument, NULL, 'l' },
{ "force", no_argument, NULL, 'f' },
{ "help", no_argument, NULL, 'h' },
{ NULL }
case 'S':
if (bch2_strtoull_h(optarg, &dev_opts.size))
die("invalid filesystem size");
-
- dev_opts.size >>= 9;
break;
case 'B':
- dev_opts.bucket_size =
- hatoi_validate(optarg, "bucket size");
+ if (bch2_strtoull_h(optarg, &dev_opts.bucket_size))
+ die("bad bucket_size %s", optarg);
break;
case 'D':
dev_opts.discard = true;
break;
- case 'g':
- dev_opts.group = strdup(optarg);
+ case 'l':
+ dev_opts.label = strdup(optarg);
break;
case 'f':
force = true;
if (!fs_path)
die("Please supply a filesystem");
- char *dev_path = arg_pop();
- if (!dev_path)
+ dev_opts.path = arg_pop();
+ if (!dev_opts.path)
die("Please supply a device");
if (argc)
struct bchfs_handle fs = bcache_fs_open(fs_path);
- dev_opts.path = dev_path;
dev_opts.fd = open_for_format(dev_opts.path, force);
struct bch_opt_strs fs_opt_strs;
struct bch_opts fs_opts = bch2_parse_opts(fs_opt_strs);
opt_set(fs_opts, block_size,
- read_file_u64(fs.sysfs_fd, "block_size") >> 9);
+ read_file_u64(fs.sysfs_fd, "options/block_size"));
opt_set(fs_opts, btree_node_size,
- read_file_u64(fs.sysfs_fd, "btree_node_size") >> 9);
+ read_file_u64(fs.sysfs_fd, "options/btree_node_size"));
struct bch_sb *sb = bch2_format(fs_opt_strs,
fs_opts,
u64 nbuckets = size / le16_to_cpu(m->bucket_size);
+ if (nbuckets < le64_to_cpu(m->nbuckets))
+ die("Shrinking not supported yet");
+
printf("resizing %s to %llu buckets\n", dev, nbuckets);
bchu_disk_resize(fs, idx, nbuckets);
} else {
u64 nbuckets = size / le16_to_cpu(resize->mi.bucket_size);
+ if (nbuckets < le64_to_cpu(resize->mi.nbuckets))
+ die("Shrinking not supported yet");
+
printf("resizing %s to %llu buckets\n", dev, nbuckets);
int ret = bch2_dev_resize(c, resize, nbuckets);
if (ret)
static void device_resize_journal_usage(void)
{
puts("bcachefs device resize-journal \n"
- "Usage: bcachefs device resize-journal device [ size ]\n"
+ "Usage: bcachefs device resize-journal device size\n"
"\n"
"Options:\n"
" -h, --help display this help and exit\n"
char *size_arg = arg_pop();
if (!size_arg)
- size = get_size(dev, dev_fd);
+ die("Please supply a journal size");
else if (bch2_strtoull_h(size_arg, &size))
die("invalid size");
x(0, replicas, required_argument) \
x(0, encrypted, no_argument) \
x(0, no_passphrase, no_argument) \
-x('L', label, required_argument) \
+x('L', fs_label, required_argument) \
x('U', uuid, required_argument) \
x(0, fs_size, required_argument) \
x(0, superblock_size, required_argument) \
x(0, bucket_size, required_argument) \
-x('g', group, required_argument) \
+x('l', label, required_argument) \
x(0, discard, no_argument) \
x(0, data_allowed, required_argument) \
x(0, durability, required_argument) \
" --replicas=# Sets both data and metadata replicas\n"
" --encrypted Enable whole filesystem encryption (chacha20/poly1305)\n"
" --no_passphrase Don't encrypt master encryption key\n"
- " -L, --label=label\n"
+ " -L, --fs_label=label\n"
" -U, --uuid=uuid\n"
" --superblock_size=size\n"
"\n"
bch2_opts_usage(OPT_DEVICE);
- puts(" -g, --group=label Disk group\n"
+ puts(" -l, --label=label Disk label\n"
"\n"
" -f, --force\n"
" -q, --quiet Only print errors\n"
" -h, --help Display this help and exit\n"
"\n"
"Device specific options must come before corresponding devices, e.g.\n"
- " bcachefs format --group cache /dev/sdb /dev/sdc\n"
+ " bcachefs format --label cache /dev/sdb /dev/sdc\n"
"\n"
"Report bugs to <linux-bcache@vger.kernel.org>");
}
case O_no_passphrase:
no_passphrase = true;
break;
- case O_label:
+ case O_fs_label:
case 'L':
opts.label = optarg;
break;
case O_fs_size:
if (bch2_strtoull_h(optarg, &dev_opts.size))
die("invalid filesystem size");
-
- dev_opts.size >>= 9;
break;
case O_superblock_size:
if (bch2_strtouint_h(optarg, &opts.superblock_size))
opts.superblock_size >>= 9;
break;
case O_bucket_size:
- dev_opts.bucket_size =
- hatoi_validate(optarg, "bucket size");
+ if (bch2_strtoull_h(optarg, &dev_opts.bucket_size))
+ die("bad bucket_size %s", optarg);
break;
- case O_group:
- case 'g':
- dev_opts.group = optarg;
+ case O_label:
+ case 'l':
+ dev_opts.label = optarg;
break;
case O_discard:
dev_opts.discard = true;
darray_size(device_paths),
bch2_opts_empty());
if (IS_ERR(c))
- die("error opening %s: %s", device_paths.item,
+ die("error opening %s: %s", device_paths.item[0],
strerror(-PTR_ERR(c)));
bch2_fs_stop(c);
bcache_fs_close(fs);
}
+int fs_usage(void)
+{
+ puts("bcachefs fs - manage a running filesystem\n"
+ "Usage: bcachefs fs <CMD> [OPTION]... path\n"
+ "\n"
+ "Commands:\n"
+ " usage show disk usage\n"
+ "\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
+ return 0;
+}
+
int cmd_fs_usage(int argc, char *argv[])
{
enum units units = BYTES;
struct bch_fs *c = fuse_req_userdata(req);
struct bch_inode_unpacked inode_u;
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
u64 now;
int ret;
bch2_trans_begin(&trans);
now = bch2_current_time(c);
- iter = bch2_inode_peek(&trans, &inode_u, inum, BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(iter);
+ ret = bch2_inode_peek(&trans, &iter, &inode_u, inum, BTREE_ITER_INTENT);
if (ret)
goto err;
inode_u.bi_mtime = now;
/* TODO: CTIME? */
- ret = bch2_inode_write(&trans, iter, &inode_u) ?:
+ ret = bch2_inode_write(&trans, &iter, &inode_u) ?:
bch2_trans_commit(&trans, NULL, NULL,
BTREE_INSERT_NOFAIL);
err:
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
if (ret == -EINTR)
goto retry;
static int inode_update_times(struct bch_fs *c, fuse_ino_t inum)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bch_inode_unpacked inode_u;
int ret = 0;
u64 now;
bch2_trans_begin(&trans);
now = bch2_current_time(c);
- iter = bch2_inode_peek(&trans, &inode_u, inum, BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(iter);
+ ret = bch2_inode_peek(&trans, &iter, &inode_u, inum, BTREE_ITER_INTENT);
if (ret)
goto err;
inode_u.bi_mtime = now;
inode_u.bi_ctime = now;
- ret = bch2_inode_write(&trans, iter, &inode_u);
+ ret = bch2_inode_write(&trans, &iter, &inode_u);
if (ret)
goto err;
BTREE_INSERT_NOFAIL);
err:
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
if (ret == -EINTR)
goto retry;
int ret;
bch2_inode_pack(c, &packed, inode);
+ packed.inode.k.p.snapshot = U32_MAX;
ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i,
NULL, NULL, 0);
if (ret)
struct bch_inode_unpacked inode;
int ret = bch2_trans_do(c, NULL, NULL, 0,
- bch2_link_trans(&trans, parent->bi_inum, inum,
- &parent_u, &inode, &qstr));
+ bch2_link_trans(&trans,
+ (subvol_inum) { 1, parent->bi_inum }, &parent_u,
+ (subvol_inum) { 1, inum }, &inode, &qstr));
if (ret)
die("error creating hardlink: %s", strerror(-ret));
}
struct qstr qstr = QSTR(name);
struct bch_inode_unpacked new_inode;
+ bch2_inode_init_early(c, &new_inode);
+
int ret = bch2_trans_do(c, NULL, NULL, 0,
bch2_create_trans(&trans,
- parent->bi_inum, parent,
+ (subvol_inum) { 1, parent->bi_inum }, parent,
&new_inode, &qstr,
- uid, gid, mode, rdev, NULL, NULL));
+ uid, gid, mode, rdev, NULL, NULL,
+ (subvol_inum) {}, 0));
if (ret)
- die("error creating file: %s", strerror(-ret));
+ die("error creating %s: %s", name, strerror(-ret));
return new_inode;
}
const struct xattr_handler *h = xattr_resolve_name(&attr);
int ret = bch2_trans_do(c, NULL, NULL, 0,
- bch2_xattr_set(&trans, dst->bi_inum, &hash_info, attr,
+ bch2_xattr_set(&trans,
+ (subvol_inum) { 1, dst->bi_inum },
+ &hash_info, attr,
val, val_size, h->flags, 0));
if (ret < 0)
die("error creating xattr: %s", strerror(-ret));
}
}
-static char buf[1 << 20] __aligned(PAGE_SIZE);
+#define WRITE_DATA_BUF (1 << 20)
+
+static char buf[WRITE_DATA_BUF] __aligned(PAGE_SIZE);
static void write_data(struct bch_fs *c,
struct bch_inode_unpacked *dst_inode,
u64 dst_offset, void *buf, size_t len)
{
- struct {
- struct bch_write_op op;
- struct bio_vec bv[sizeof(buf) / PAGE_SIZE];
- } o;
+ struct bch_write_op op;
+ struct bio_vec bv[WRITE_DATA_BUF / PAGE_SIZE];
struct closure cl;
BUG_ON(dst_offset & (block_bytes(c) - 1));
BUG_ON(len & (block_bytes(c) - 1));
+ BUG_ON(len > WRITE_DATA_BUF);
closure_init_stack(&cl);
- bio_init(&o.op.wbio.bio, o.bv, ARRAY_SIZE(o.bv));
- bch2_bio_map(&o.op.wbio.bio, buf, len);
+ bio_init(&op.wbio.bio, bv, ARRAY_SIZE(bv));
+ bch2_bio_map(&op.wbio.bio, buf, len);
- bch2_write_op_init(&o.op, c, bch2_opts_to_inode_opts(c->opts));
- o.op.write_point = writepoint_hashed(0);
- o.op.nr_replicas = 1;
- o.op.pos = POS(dst_inode->bi_inum, dst_offset >> 9);
+ bch2_write_op_init(&op, c, bch2_opts_to_inode_opts(c->opts));
+ op.write_point = writepoint_hashed(0);
+ op.nr_replicas = 1;
+ op.subvol = 1;
+ op.pos = SPOS(dst_inode->bi_inum, dst_offset >> 9, U32_MAX);
- int ret = bch2_disk_reservation_get(c, &o.op.res, len >> 9,
+ int ret = bch2_disk_reservation_get(c, &op.res, len >> 9,
c->opts.data_replicas, 0);
if (ret)
die("error reserving space in new filesystem: %s", strerror(-ret));
- closure_call(&o.op.cl, bch2_write, NULL, &cl);
+ closure_call(&op.cl, bch2_write, NULL, &cl);
closure_sync(&cl);
dst_inode->bi_sectors += len >> 9;
e = bkey_extent_init(&k.k);
e->k.p.inode = dst->bi_inum;
e->k.p.offset = logical + sectors;
+ e->k.p.snapshot = U32_MAX;
e->k.size = sectors;
bch2_bkey_append_ptr(&e->k_i, (struct bch_extent_ptr) {
.offset = physical,
.dev = 0,
- .gen = bucket(ca, b)->mark.gen,
+ .gen = *bucket_gen(ca, b),
});
ret = bch2_disk_reservation_get(c, &res, sectors, 1,
die("error reserving space in new filesystem: %s",
strerror(-ret));
- bch2_mark_bkey_replicas(c, extent_i_to_s_c(e).s_c);
-
ret = bch2_btree_insert(c, BTREE_ID_extents, &e->k_i,
&res, NULL, 0);
if (ret)
if (!strcmp(d->d_name, ".") ||
!strcmp(d->d_name, "..") ||
+ !strcmp(d->d_name, "lost+found") ||
stat.st_ino == s->bcachefs_inum)
continue;
syncfs(src_fd);
struct bch_inode_unpacked root_inode;
- int ret = bch2_inode_find_by_inum(c, BCACHEFS_ROOT_INO, &root_inode);
+ int ret = bch2_inode_find_by_inum(c, (subvol_inum) { 1, BCACHEFS_ROOT_INO },
+ &root_inode);
if (ret)
die("error looking up root directory: %s", strerror(-ret));
darray_free(s.extents);
genradix_free(&s.hardlinks);
-
- bch2_alloc_write(c, false);
}
static void find_superblock_space(ranges extents,
u64 bcachefs_inum;
ranges extents = reserve_new_fs_space(file_path,
- fs_opts.block_size << 9,
+ fs_opts.block_size >> 9,
get_size(dev.path, dev.fd) / 5,
&bcachefs_inum, stat.st_dev, force);
opt_set(opts, sb, sb_offset);
opt_set(opts, nostart, true);
opt_set(opts, noexcl, true);
+ opt_set(opts, buckets_nouse, true);
c = bch2_fs_open(path, 1, opts);
if (IS_ERR(c))
--- /dev/null
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <libgen.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "libbcachefs/bcachefs.h"
+#include "libbcachefs/bcachefs_ioctl.h"
+#include "cmds.h"
+#include "libbcachefs.h"
+#include "libbcachefs/opts.h"
+#include "tools-util.h"
+
+int subvolume_usage(void)
+{
+ puts("bcachefs subvolume - manage subvolumes and snapshots\n"
+ "Usage: bcachefs subvolume <CMD> [OPTION]\n"
+ "\n"
+ "Commands:\n"
+ " create create a subvolume\n"
+ " delete delete a subvolume\n"
+ " snapshot create a snapshot\n"
+ "\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
+ return 0;
+}
+
+static void subvolume_create_usage(void)
+{
+ puts("bcachefs subvolume create - create a new subvolume\n"
+ "Usage: bcachefs subvolume create [OPTION]... path\n"
+ "\n"
+ "Options:\n"
+ " -h, --help Display this help and exit\n"
+ "\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
+}
+
+int cmd_subvolume_create(int argc, char *argv[])
+{
+ static const struct option longopts[] = {
+ { "help", no_argument, NULL, 'h' },
+ { NULL }
+ };
+ char *path;
+ int opt;
+
+ while ((opt = getopt_long(argc, argv, "h", longopts, NULL)) != -1)
+ switch (opt) {
+ case 'h':
+ subvolume_create_usage();
+ exit(EXIT_SUCCESS);
+ }
+ args_shift(optind);
+
+ while ((path = arg_pop())) {
+ char *dir = dirname(strdup(path));
+
+ struct bchfs_handle fs = bcache_fs_open(dir);
+
+ struct bch_ioctl_subvolume i = {
+ .dirfd = AT_FDCWD,
+ .mode = 0777,
+ .dst_ptr = (unsigned long)path,
+ };
+
+ xioctl(fs.ioctl_fd, BCH_IOCTL_SUBVOLUME_CREATE, &i);
+ bcache_fs_close(fs);
+ }
+
+ return 0;
+}
+
+static void subvolume_delete_usage(void)
+{
+ puts("bcachefs subvolume delete - delete an existing subvolume\n"
+ "Usage: bcachefs subvolume delete [OPTION]... path\n"
+ "\n"
+ "Options:\n"
+ " -h, --help Display this help and exit\n"
+ "\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
+}
+
+int cmd_subvolume_delete(int argc, char *argv[])
+{
+ static const struct option longopts[] = {
+ { "help", no_argument, NULL, 'h' },
+ { NULL }
+ };
+ char *path;
+ int opt;
+
+ while ((opt = getopt_long(argc, argv, "h", longopts, NULL)) != -1)
+ switch (opt) {
+ case 'h':
+ subvolume_delete_usage();
+ exit(EXIT_SUCCESS);
+ }
+ args_shift(optind);
+
+ while ((path = arg_pop())) {
+ char *dir = dirname(strdup(path));
+
+ struct bchfs_handle fs = bcache_fs_open(dir);
+
+ struct bch_ioctl_subvolume i = {
+ .dirfd = AT_FDCWD,
+ .mode = 0777,
+ .dst_ptr = (unsigned long)path,
+ };
+
+ xioctl(fs.ioctl_fd, BCH_IOCTL_SUBVOLUME_DESTROY, &i);
+ bcache_fs_close(fs);
+ }
+
+ return 0;
+}
+
+static void snapshot_create_usage(void)
+{
+	puts("bcachefs subvolume snapshot - create a snapshot\n"
+	     "Usage: bcachefs subvolume snapshot [OPTION]... <source> <dest>\n"
+	     "\n"
+	     "Create a snapshot of <source> at <dest>. If specified, <source> must be a subvolume;\n"
+	     "if not specified, the snapshot will be of the subvolume containing <dest>.\n"
+ "Options:\n"
+ " -r Make snapshot read only\n"
+ " -h, --help Display this help and exit\n"
+ "\n"
+ "Report bugs to <linux-bcachefs@vger.kernel.org>");
+}
+
+int cmd_subvolume_snapshot(int argc, char *argv[])
+{
+ static const struct option longopts[] = {
+ { "help", no_argument, NULL, 'h' },
+ { NULL }
+ };
+ unsigned flags = BCH_SUBVOL_SNAPSHOT_CREATE;
+ int opt;
+
+ while ((opt = getopt_long(argc, argv, "rh", longopts, NULL)) != -1)
+ switch (opt) {
+ case 'r':
+ flags |= BCH_SUBVOL_SNAPSHOT_RO;
+ break;
+ case 'h':
+ snapshot_create_usage();
+ exit(EXIT_SUCCESS);
+ }
+ args_shift(optind);
+
+ char *src = arg_pop();
+ char *dst = arg_pop();
+
+ if (argc)
+ die("Too many arguments");
+
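+	/* with only one path given, treat it as <dest>: snapshot the
+	 * subvolume containing it */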
+ if (!dst)
+ swap(src, dst);
+ if (!dst)
+ die("Please specify a path to create");
+
+ char *dir = dirname(strdup(dst));
+
+ struct bchfs_handle fs = bcache_fs_open(dir);
+
+ struct bch_ioctl_subvolume i = {
+ .flags = flags,
+ .dirfd = AT_FDCWD,
+ .mode = 0777,
+ .src_ptr = (unsigned long)src,
+ .dst_ptr = (unsigned long)dst,
+ };
+
+ xioctl(fs.ioctl_fd, BCH_IOCTL_SUBVOLUME_CREATE, &i);
+ bcache_fs_close(fs);
+ return 0;
+}
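An illustrative session with the new subcommands (paths made up):

bcachefs subvolume create /mnt/scratch/work
bcachefs subvolume snapshot -r /mnt/scratch/work /mnt/scratch/work.snap
bcachefs subvolume delete /mnt/scratch/work.snap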
int cmd_stop(int argc, char *argv[]);
#endif
+int fs_usage(void);
int cmd_fs_usage(int argc, char *argv[]);
+int device_usage(void);
int cmd_device_add(int argc, char *argv[]);
int cmd_device_remove(int argc, char *argv[]);
int cmd_device_online(int argc, char *argv[]);
int cmd_device_resize(int argc, char *argv[]);
int cmd_device_resize_journal(int argc, char *argv[]);
+int data_usage(void);
int cmd_data_rereplicate(int argc, char *argv[]);
int cmd_data_job(int argc, char *argv[]);
int cmd_setattr(int argc, char *argv[]);
+int subvolume_usage(void);
+int cmd_subvolume_create(int argc, char *argv[]);
+int cmd_subvolume_delete(int argc, char *argv[]);
+int cmd_subvolume_snapshot(int argc, char *argv[]);
+
int cmd_fusemount(int argc, char *argv[]);
#endif /* _CMDS_H */
#include <keyutils.h>
#include <linux/random.h>
-#include <libscrypt.h>
+#include <sodium/crypto_pwhash_scryptsalsa208sha256.h>
#include <uuid/uuid.h>
#include "libbcachefs/checksum.h"
switch (BCH_CRYPT_KDF_TYPE(crypt)) {
case BCH_KDF_SCRYPT:
- ret = libscrypt_scrypt((void *) passphrase, strlen(passphrase),
- salt, sizeof(salt),
- 1ULL << BCH_KDF_SCRYPT_N(crypt),
- 1ULL << BCH_KDF_SCRYPT_R(crypt),
- 1ULL << BCH_KDF_SCRYPT_P(crypt),
- (void *) &key, sizeof(key));
+ ret = crypto_pwhash_scryptsalsa208sha256_ll(
+ (void *) passphrase, strlen(passphrase),
+ salt, sizeof(salt),
+ 1ULL << BCH_KDF_SCRYPT_N(crypt),
+ 1ULL << BCH_KDF_SCRYPT_R(crypt),
+ 1ULL << BCH_KDF_SCRYPT_P(crypt),
+ (void *) &key, sizeof(key));
if (ret)
die("scrypt error: %i", ret);
break;
if (passphrase) {
SET_BCH_CRYPT_KDF_TYPE(crypt, BCH_KDF_SCRYPT);
- SET_BCH_KDF_SCRYPT_N(crypt, ilog2(SCRYPT_N));
- SET_BCH_KDF_SCRYPT_R(crypt, ilog2(SCRYPT_r));
- SET_BCH_KDF_SCRYPT_P(crypt, ilog2(SCRYPT_p));
+ SET_BCH_KDF_SCRYPT_N(crypt, ilog2(16384));
+ SET_BCH_KDF_SCRYPT_R(crypt, ilog2(8));
+ SET_BCH_KDF_SCRYPT_P(crypt, ilog2(16));
struct bch_key passphrase_key = derive_passphrase(crypt, passphrase);
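The literal parameters substituted above (N=16384, r=8, p=16) appear chosen to
match libscrypt's defaults, so filesystems encrypted before the switch keep
unlocking. A small standalone sketch of what the log2-encoded fields work out
to:

#include <stdio.h>

int main(void)
{
	/* the superblock stores ilog2() of each scrypt parameter */
	unsigned long long N = 1ULL << 14;	/* ilog2(16384) == 14 */
	unsigned long long r = 1ULL << 3;	/* ilog2(8)     == 3  */
	unsigned long long p = 1ULL << 4;	/* ilog2(16)    == 4  */

	/* scrypt's scratch memory is roughly 128 * N * r bytes */
	printf("N=%llu r=%llu p=%llu, ~%llu MiB scratch\n",
	       N, r, p, (128 * N * r) >> 20);	/* prints 16 MiB */
	return 0;
}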
+bcachefs-tools (0.1+git20220216.a1e928a-1) unstable; urgency=medium
+
+ * New upstream snapshot
+ * Grab patch from Ubuntu to reduce memory on amd64 builders
+ (http://launchpadlibrarian.net/580140160/bcachefs-tools_0.1+git20210805.6c42566-2_0.1+git20210805.6c42566-2ubuntu1.diff.gz)
+ * Update copyright years
+
+ -- Jonathan Carter <jcc@debian.org> Wed, 16 Feb 2022 14:42:20 +0200
+
bcachefs-tools (0.1+git20210805.6c42566-2) unstable; urgency=medium
* Remove valgrind as build-dependency, seems unneeded unless
Source: https://evilpiepirate.org/git/bcachefs-tools.git
Files: *
-Copyright: 2013-2020 Kent Overstreet <kmo@daterainc.com>
+Copyright: 2013-2022 Kent Overstreet <kmo@daterainc.com>
2013 Gabriel de Perthuis <g2p.code@gmail.com>
2008 Intel Corporation <willy@linux.intel.com>
License: GPL-2
THE SOFTWARE.
Files: debian/*
-Copyright: 2019-2020 Jonathan Carter <jcc@debian.org>
+Copyright: 2019-2022 Jonathan Carter <jcc@debian.org>
2014 Tom Strickx <tstrickx@rootcu.be>,
2014 David Mohr <david@mcbf.net>
License: GPL-2+
-bcachefs-tools_0.1+git20210805.6c42566-2_source.buildinfo utils optional
+bcachefs-tools_0.1+git20220216.a1e928a-1_source.buildinfo utils optional
export DEB_BUILD_MAINT_OPTIONS=hardening=+all
PREFIX := /usr
+DEB_BUILD_ARCH ?= $(shell dpkg-architecture -qDEB_BUILD_ARCH)
+
+ifeq ($(DEB_BUILD_ARCH),amd64)
+ DEB_BUILD_MAINT_OPTIONS += optimize=-lto
+endif
+
%:
dh $@
-{ nixpkgs ? (import ./nix/nixpkgs.nix)
-}:
-
-with nixpkgs;
-
-stdenv.mkDerivation rec {
- name = "bcachefs-tools-${version}";
- version = "git";
-
- src = lib.cleanSource ./.; # NOTE: ignore .git, otherwise things get weird!
-
- nativeBuildInputs = [ git pkgconfig ];
- buildInputs =
- [ liburcu libuuid libaio zlib attr keyutils
- libsodium libscrypt
- ];
-
- enableParallelBuilding = true;
- makeFlags =
- [ "PREFIX=$(out)"
- ];
-
- meta = with stdenv.lib; {
- description = "Userspace tools for bcachefs";
- homepage = http://bcachefs.org;
- license = licenses.gpl2;
- platforms = platforms.linux;
- maintainers =
- [ "Kent Overstreet <kent.overstreet@gmail.com>"
- ];
- };
+{ lib
+, filter
+
+, stdenv
+, pkg-config
+, attr
+, libuuid
+, libsodium
+, keyutils
+
+, liburcu
+, zlib
+, libaio
+, udev
+, zstd
+, lz4
+
+, python39
+, python39Packages
+, docutils
+, nixosTests
+
+, lastModified
+, versionString ? lastModified
+
+, inShell ? false
+, debugMode ? inShell
+
+, testWithValgrind ? true
+, valgrind
+
+, fuseSupport ? false
+, fuse3 ? null }:
+
+assert fuseSupport -> fuse3 != null;
+assert testWithValgrind -> valgrind != null;
+stdenv.mkDerivation {
+ pname = "bcachefs-tools";
+
+ version = "v0.1-flake-${versionString}";
+ VERSION = "v0.1-flake-${versionString}";
+
+ src = filter.filter {
+ name = "bcachefs-tools";
+ root = ./.;
+ exclude = [
+ ./rust-src
+
+ ./.git
+ ./nix
+
+ ./flake.nix
+ ./flake.lock
+ ];
+ };
+
+ postPatch = "patchShebangs --build doc/macro2rst.py";
+
+ nativeBuildInputs = [
+ # used to find dependencies
+ ## see ./INSTALL
+ pkg-config
+ ];
+ buildInputs = [
+ # bcachefs explicit dependencies
+ ## see ./INSTALL
+ libaio
+
+ # libblkid
+ keyutils # libkeyutils
+ lz4 # liblz4
+
+ libsodium
+ liburcu
+ libuuid
+ zstd # libzstd
+ zlib # zlib1g
+ valgrind
+
+ # unspecified dependencies
+ attr
+ udev
+
+    # documentation dependencies
+ docutils
+ python39Packages.pygments
+ ] ++ (lib.optional fuseSupport fuse3)
+ ++ (lib.optional testWithValgrind valgrind) ;
+
+ makeFlags = [
+ "PREFIX=${placeholder "out"}"
+ ] ++ lib.optional debugMode "EXTRA_CFLAGS=-ggdb";
+
+ installFlags = [
+ "INITRAMFS_DIR=${placeholder "out"}/etc/initramfs-tools"
+ ];
+
+ doCheck = true; # needs bcachefs module loaded on builder
+
+ checkInputs = [
+ python39Packages.pytest
+ python39Packages.pytest-xdist
+ ] ++ lib.optional testWithValgrind valgrind;
+
+ checkFlags = [
+ "BCACHEFS_TEST_USE_VALGRIND=${if testWithValgrind then "yes" else "no"}"
+ # cannot escape spaces within make flags, quotes are stripped
+ "PYTEST_CMD=pytest" # "PYTEST_ARGS='-n4 --version'"
+ ];
+
+ preCheck =
+ ''
+ makeFlagsArray+=(PYTEST_ARGS="--verbose -n2")
+ '' +
+ lib.optionalString fuseSupport ''
+ rm tests/test_fuse.py
+ '';
+
+ dontStrip = debugMode == true;
+ passthru = {
+ bcachefs_revision = let
+ file = builtins.readFile ./.bcachefs_revision;
+ removeLineFeeds = str: lib.lists.foldr (lib.strings.removeSuffix) str ["\r" "\n"];
+ in removeLineFeeds file;
+
+ tests = {
+ smoke-test = nixosTests.bcachefs;
+ };
+ };
+
+ enableParallelBuilding = true;
+ meta = with lib; {
+ description = "Userspace tools for bcachefs";
+ homepage = http://bcachefs.org;
+ license = licenses.gpl2;
+ platforms = platforms.linux;
+ maintainers =
+ [ "Kent Overstreet <kent.overstreet@gmail.com>"
+ ];
+
+ };
}
--- /dev/null
+{
+ "nodes": {
+ "filter": {
+ "locked": {
+ "lastModified": 1620202920,
+ "narHash": "sha256-BOkm3eKT45Dk4NNxJT0xL9NnyYeZcF+t79zPnJkggac=",
+ "owner": "numtide",
+ "repo": "nix-filter",
+ "rev": "3c9e33ed627e009428197b07216613206f06ed80",
+ "type": "github"
+ },
+ "original": {
+ "owner": "numtide",
+ "repo": "nix-filter",
+ "type": "github"
+ }
+ },
+ "nixpkgs": {
+ "locked": {
+ "lastModified": 1633351077,
+ "narHash": "sha256-z38JG4Bb0GtM1aF1pANVdp1dniMP23Yb3HnRoJRy2uU=",
+ "owner": "nixos",
+ "repo": "nixpkgs",
+ "rev": "14aef06d9b3ad1d07626bdbb16083b83f92dc6c1",
+ "type": "github"
+ },
+ "original": {
+ "owner": "nixos",
+ "ref": "nixos-unstable",
+ "repo": "nixpkgs",
+ "type": "github"
+ }
+ },
+ "root": {
+ "inputs": {
+ "filter": "filter",
+ "nixpkgs": "nixpkgs",
+ "utils": "utils"
+ }
+ },
+ "utils": {
+ "locked": {
+ "lastModified": 1629481132,
+ "narHash": "sha256-JHgasjPR0/J1J3DRm4KxM4zTyAj4IOJY8vIl75v/kPI=",
+ "owner": "numtide",
+ "repo": "flake-utils",
+ "rev": "997f7efcb746a9c140ce1f13c72263189225f482",
+ "type": "github"
+ },
+ "original": {
+ "owner": "numtide",
+ "repo": "flake-utils",
+ "type": "github"
+ }
+ }
+ },
+ "root": "root",
+ "version": 7
+}
--- /dev/null
+{
+ description = "Userspace tools for bcachefs";
+
+ # Nixpkgs / NixOS version to use.
+ inputs.nixpkgs.url = "github:nixos/nixpkgs/nixos-unstable";
+ inputs.utils.url = "github:numtide/flake-utils";
+ inputs.filter.url = "github:numtide/nix-filter";
+
+ outputs = { self, nixpkgs, utils, filter, ... }@inputs:
+ let
+ # System types to support.
+ supportedSystems = [ "x86_64-linux" ];
+ in
+ {
+ version = "${builtins.substring 0 8 self.lastModifiedDate}-${self.shortRev or "dirty"}";
+
+ overlay = import ./nix/overlay.nix inputs;
+ nixosModule = self.nixosModules.bcachefs;
+ nixosModules.bcachefs = import ./rust-src/mount/module.nix;
+ nixosModules.bcachefs-enable-boot = ({config, pkgs, lib, ... }:{
+ # Disable Upstream NixOS Module when this is in use
+ disabledModules = [ "tasks/filesystems/bcachefs.nix" ];
+ # Import needed packages
+ nixpkgs.overlays = [ self.overlay ];
+
+ # Add bcachefs to boot and kernel
+ boot.initrd.supportedFilesystems = [ "bcachefs" ];
+ boot.supportedFilesystems = [ "bcachefs" ];
+ });
+
+ nixosConfigurations.netboot-bcachefs = self.systems.netboot-bcachefs "x86_64-linux";
+ systems.netboot-bcachefs = system: (nixpkgs.lib.nixosSystem {
+ inherit system; modules = [
+ self.nixosModule
+ self.nixosModules.bcachefs-enable-boot
+ ("${nixpkgs}/nixos/modules/installer/netboot/netboot-minimal.nix")
+ ({ lib, pkgs, config, ... }: {
+ # installation disk autologin
+ services.getty.autologinUser = lib.mkForce "root";
+ users.users.root.initialPassword = "toor";
+
+ # Symlink everything together
+ system.build.netboot = pkgs.symlinkJoin {
+ name = "netboot";
+ paths = with config.system.build; [
+ netbootRamdisk
+ kernel
+ netbootIpxeScript
+ ];
+ preferLocalBuild = true;
+ };
+ })
+ ];
+ });
+ }
+ // utils.lib.eachSystem supportedSystems (system:
+ let pkgs = import nixpkgs {
+ inherit system;
+ overlays = [ self.overlay ];
+ };
+ in rec {
+
+ # A Nixpkgs overlay.
+
+ # Provide some binary packages for selected system types.
+ defaultPackage = pkgs.bcachefs.tools;
+ packages = {
+ inherit (pkgs.bcachefs)
+ tools
+ toolsValgrind
+ toolsDebug
+ mount
+ bch_bindgen
+ kernel;
+
+ tools-musl = pkgs.pkgsMusl.bcachefs.tools;
+ mount-musl = pkgs.pkgsMusl.bcachefs.mount;
+ };
+
+ checks = {
+ kernelSrc = packages.kernel.src;
+ inherit (packages)
+ mount
+ bch_bindgen
+ toolsValgrind;
+
+ # Build and test initrd with bcachefs and bcachefs.mount installed
+ # Disabled Test because it takes a while to build the kernel
+ # bootStage1Module = self.nixosConfigurations.netboot-bcachefs.config.system.build.bootStage1;
+ };
+
+ devShell = devShells.tools;
+ devShells.tools = pkgs.bcachefs.tools.override { inShell = true; };
+ devShells.mount = pkgs.bcachefs.mount.override { inShell = true; };
+ });
+}
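With flakes enabled, these outputs are consumed through the standard nix CLI;
illustrative invocations:

nix build .#tools
nix develop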
};
struct gendisk {
+ struct backing_dev_info *bdi;
+ struct backing_dev_info __bdi;
};
struct hd_struct {
struct gendisk __bd_disk;
int bd_fd;
int bd_sync_fd;
-
- struct backing_dev_info *bd_bdi;
- struct backing_dev_info __bd_bdi;
};
#define bdev_kobj(_bdev) (&((_bdev)->kobj))
#include <linux/kobject.h>
#include <linux/types.h>
-#define BIO_MAX_VECS 256
+#define BIO_MAX_VECS 256U
typedef unsigned fmode_t;
#define bdev_get_queue(bdev) (&((bdev)->queue))
+#ifndef SECTOR_SHIFT
+#define SECTOR_SHIFT 9
+#endif
+#ifndef SECTOR_SIZE
+#define SECTOR_SIZE (1 << SECTOR_SHIFT)
+#endif
+
+#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
+#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT)
+#define SECTOR_MASK (PAGE_SECTORS - 1)
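+
+/* e.g. with 4 KiB pages (PAGE_SHIFT == 12): PAGE_SECTORS_SHIFT == 3,
+ * PAGE_SECTORS == 8 and SECTOR_MASK == 7 -- eight 512-byte sectors per page */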
+
#define blk_queue_discard(q) ((void) (q), 0)
#define blk_queue_nonrot(q) ((void) (q), 0)
#define unlikely(x) __builtin_expect(!!(x), 0)
#define unreachable() __builtin_unreachable()
#define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
+#define fallthrough __attribute__((__fallthrough__))
#define ___PASTE(a,b) a##b
#define __PASTE(a,b) ___PASTE(a,b)
/********** security/ **********/
#define KEY_DESTROY 0xbd
+/********** net/core/page_pool.c **********/
+#define PP_SIGNATURE (0x40 + POISON_POINTER_DELTA)
+
#endif
--- /dev/null
+/* Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.
+ *
+ * SipHash: a fast short-input PRF
+ * https://131002.net/siphash/
+ *
+ * This implementation is specifically for SipHash2-4 for a secure PRF
+ * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for
+ * hashtables.
+ */
+
+#ifndef _LINUX_SIPHASH_H
+#define _LINUX_SIPHASH_H
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+
+#define SIPHASH_ALIGNMENT __alignof__(u64)
+typedef struct {
+ u64 key[2];
+} siphash_key_t;
+
+static inline bool siphash_key_is_zero(const siphash_key_t *key)
+{
+ return !(key->key[0] | key->key[1]);
+}
+
+u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key);
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key);
+#endif
+
+u64 siphash_1u64(const u64 a, const siphash_key_t *key);
+u64 siphash_2u64(const u64 a, const u64 b, const siphash_key_t *key);
+u64 siphash_3u64(const u64 a, const u64 b, const u64 c,
+ const siphash_key_t *key);
+u64 siphash_4u64(const u64 a, const u64 b, const u64 c, const u64 d,
+ const siphash_key_t *key);
+u64 siphash_1u32(const u32 a, const siphash_key_t *key);
+u64 siphash_3u32(const u32 a, const u32 b, const u32 c,
+ const siphash_key_t *key);
+
+static inline u64 siphash_2u32(const u32 a, const u32 b,
+ const siphash_key_t *key)
+{
+ return siphash_1u64((u64)b << 32 | a, key);
+}
+static inline u64 siphash_4u32(const u32 a, const u32 b, const u32 c,
+ const u32 d, const siphash_key_t *key)
+{
+ return siphash_2u64((u64)b << 32 | a, (u64)d << 32 | c, key);
+}
+
+
+static inline u64 ___siphash_aligned(const __le64 *data, size_t len,
+ const siphash_key_t *key)
+{
+ if (__builtin_constant_p(len) && len == 4)
+ return siphash_1u32(le32_to_cpup((const __le32 *)data), key);
+ if (__builtin_constant_p(len) && len == 8)
+ return siphash_1u64(le64_to_cpu(data[0]), key);
+ if (__builtin_constant_p(len) && len == 16)
+ return siphash_2u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]),
+ key);
+ if (__builtin_constant_p(len) && len == 24)
+ return siphash_3u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]),
+ le64_to_cpu(data[2]), key);
+ if (__builtin_constant_p(len) && len == 32)
+ return siphash_4u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]),
+ le64_to_cpu(data[2]), le64_to_cpu(data[3]),
+ key);
+ return __siphash_aligned(data, len, key);
+}
+
+/**
+ * siphash - compute 64-bit siphash PRF value
+ * @data: buffer to hash
+ * @size: size of @data
+ * @key: the siphash key
+ */
+static inline u64 siphash(const void *data, size_t len,
+ const siphash_key_t *key)
+{
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ if (!IS_ALIGNED((unsigned long)data, SIPHASH_ALIGNMENT))
+ return __siphash_unaligned(data, len, key);
+#endif
+ return ___siphash_aligned(data, len, key);
+}
+
+#define HSIPHASH_ALIGNMENT __alignof__(unsigned long)
+typedef struct {
+ unsigned long key[2];
+} hsiphash_key_t;
+
+u32 __hsiphash_aligned(const void *data, size_t len,
+ const hsiphash_key_t *key);
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+u32 __hsiphash_unaligned(const void *data, size_t len,
+ const hsiphash_key_t *key);
+#endif
+
+u32 hsiphash_1u32(const u32 a, const hsiphash_key_t *key);
+u32 hsiphash_2u32(const u32 a, const u32 b, const hsiphash_key_t *key);
+u32 hsiphash_3u32(const u32 a, const u32 b, const u32 c,
+ const hsiphash_key_t *key);
+u32 hsiphash_4u32(const u32 a, const u32 b, const u32 c, const u32 d,
+ const hsiphash_key_t *key);
+
+static inline u32 ___hsiphash_aligned(const __le32 *data, size_t len,
+ const hsiphash_key_t *key)
+{
+ if (__builtin_constant_p(len) && len == 4)
+ return hsiphash_1u32(le32_to_cpu(data[0]), key);
+ if (__builtin_constant_p(len) && len == 8)
+ return hsiphash_2u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]),
+ key);
+ if (__builtin_constant_p(len) && len == 12)
+ return hsiphash_3u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]),
+ le32_to_cpu(data[2]), key);
+ if (__builtin_constant_p(len) && len == 16)
+ return hsiphash_4u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]),
+ le32_to_cpu(data[2]), le32_to_cpu(data[3]),
+ key);
+ return __hsiphash_aligned(data, len, key);
+}
+
+/**
+ * hsiphash - compute 32-bit hsiphash PRF value
+ * @data: buffer to hash
+ * @size: size of @data
+ * @key: the hsiphash key
+ */
+static inline u32 hsiphash(const void *data, size_t len,
+ const hsiphash_key_t *key)
+{
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ if (!IS_ALIGNED((unsigned long)data, HSIPHASH_ALIGNMENT))
+ return __hsiphash_unaligned(data, len, key);
+#endif
+ return ___hsiphash_aligned(data, len, key);
+}
+
+#endif /* _LINUX_SIPHASH_H */
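___siphash_aligned() and ___hsiphash_aligned() lean on __builtin_constant_p()
so that a compile-time-known length collapses to one of the unrolled
specializations, with the generic routine as fallback. A self-contained toy
showing just that dispatch trick (hash32/hash_buf are stand-ins, not real
SipHash rounds):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t hash32(uint32_t x)		/* toy specialization */
{
	return x * 2654435761u;
}

static uint32_t hash_buf(const void *p, size_t len)	/* generic fallback */
{
	uint32_t h = 0;
	const unsigned char *c = p;

	while (len--)
		h = h * 31 + *c++;
	return h;
}

static inline uint32_t hash(const void *p, size_t len)
{
	if (__builtin_constant_p(len) && len == 4) {
		uint32_t v;

		memcpy(&v, p, 4);
		return hash32(v);	/* resolved at compile time */
	}
	return hash_buf(p, len);
}

int main(void)
{
	uint32_t x = 42;

	printf("%u\n", hash(&x, sizeof(x)));	/* takes the fast path */
	return 0;
}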
static inline void *kmalloc(size_t size, gfp_t flags)
{
+ unsigned i = 0;
void *p;
- run_shrinkers();
-
- if (size) {
- size_t alignment = min(rounddown_pow_of_two(size), (size_t)PAGE_SIZE);
- alignment = max(sizeof(void *), alignment);
- if (posix_memalign(&p, alignment, size))
- p = NULL;
- } else {
- p = malloc(0);
- }
- if (p && (flags & __GFP_ZERO))
- memset(p, 0, size);
+ do {
+ run_shrinkers();
+
+ if (size) {
+ size_t alignment = min(rounddown_pow_of_two(size), (size_t)PAGE_SIZE);
+ alignment = max(sizeof(void *), alignment);
+ if (posix_memalign(&p, alignment, size))
+ p = NULL;
+ } else {
+ p = malloc(0);
+ }
+ if (p && (flags & __GFP_ZERO))
+ memset(p, 0, size);
+ } while (!p && i++ < 10);
return p;
}
{
void *new;
- run_shrinkers();
-
new = kmalloc(size, flags);
if (!new)
return NULL;
((size) != 0 && (n) > SIZE_MAX / (size) \
? NULL : kmalloc((n) * (size), flags))
+#define kvmalloc_array(n, size, flags) \
+ ((size) != 0 && (n) > SIZE_MAX / (size) \
+ ? NULL : kmalloc((n) * (size), flags))
+
#define kcalloc(n, size, flags) kmalloc_array(n, size, flags|__GFP_ZERO)
#define kfree(p) free(p)
static inline struct page *alloc_pages(gfp_t flags, unsigned int order)
{
size_t size = PAGE_SIZE << order;
+ unsigned i = 0;
void *p;
- run_shrinkers();
+ do {
+ run_shrinkers();
- p = aligned_alloc(PAGE_SIZE, size);
- if (p && (flags & __GFP_ZERO))
- memset(p, 0, size);
+ p = aligned_alloc(PAGE_SIZE, size);
+ if (p && (flags & __GFP_ZERO))
+ memset(p, 0, size);
+ } while (!p && i++ < 10);
return p;
}
static inline void *__vmalloc(unsigned long size, gfp_t gfp_mask)
{
+ unsigned i = 0;
void *p;
size = round_up(size, PAGE_SIZE);
- run_shrinkers();
+ do {
+ run_shrinkers();
- p = aligned_alloc(PAGE_SIZE, size);
- if (!p)
- return NULL;
-
- if (gfp_mask & __GFP_ZERO)
- memset(p, 0, size);
+ p = aligned_alloc(PAGE_SIZE, size);
+ if (p && gfp_mask & __GFP_ZERO)
+ memset(p, 0, size);
+ } while (!p && i++ < 10);
return p;
}
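kmalloc(), alloc_pages() and __vmalloc() above now share one shape: a bounded
retry loop that runs the shrinkers before each attempt, giving caches a chance
to release memory under pressure. A standalone sketch of that shape
(run_shrinkers_stub stands in for the shim's run_shrinkers()):

#include <stdlib.h>

/* stand-in for run_shrinkers(): in the shim this nudges the btree node
 * cache etc. to free memory before the next attempt */
static void run_shrinkers_stub(void) {}

static void *alloc_retry(size_t size)
{
	void *p;
	unsigned i = 0;

	do {
		run_shrinkers_stub();
		p = malloc(size);
	} while (!p && i++ < 10);	/* at most 11 attempts */

	return p;
}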
__entry->required, __entry->cl)
);
-TRACE_EVENT(btree_insert_key,
- TP_PROTO(struct bch_fs *c, struct btree *b, struct bkey_i *k),
- TP_ARGS(c, b, k),
-
- TP_STRUCT__entry(
- __field(u8, id )
- __field(u64, inode )
- __field(u64, offset )
- __field(u32, size )
- ),
-
- TP_fast_assign(
- __entry->id = b->c.btree_id;
- __entry->inode = k->k.p.inode;
- __entry->offset = k->k.p.offset;
- __entry->size = k->k.size;
- ),
-
- TP_printk("btree %u: %llu:%llu len %u", __entry->id,
- __entry->inode, __entry->offset, __entry->size)
-);
-
DEFINE_EVENT(btree_node, btree_split,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
TP_ARGS(c, b)
);
+TRACE_EVENT(btree_cache_scan,
+ TP_PROTO(unsigned long nr_to_scan_pages,
+ unsigned long nr_to_scan_nodes,
+ unsigned long can_free_nodes,
+ long ret),
+ TP_ARGS(nr_to_scan_pages, nr_to_scan_nodes, can_free_nodes, ret),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, nr_to_scan_pages )
+ __field(unsigned long, nr_to_scan_nodes )
+ __field(unsigned long, can_free_nodes )
+ __field(long, ret )
+ ),
+
+ TP_fast_assign(
+ __entry->nr_to_scan_pages = nr_to_scan_pages;
+ __entry->nr_to_scan_nodes = nr_to_scan_nodes;
+ __entry->can_free_nodes = can_free_nodes;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("scanned for %lu pages, %lu nodes, can free %lu nodes, ret %li",
+ __entry->nr_to_scan_pages,
+ __entry->nr_to_scan_nodes,
+ __entry->can_free_nodes,
+ __entry->ret)
+);
+
+TRACE_EVENT(btree_node_relock_fail,
+ TP_PROTO(const char *trans_fn,
+ unsigned long caller_ip,
+ enum btree_id btree_id,
+ struct bpos *pos,
+ unsigned long node,
+ u32 iter_lock_seq,
+ u32 node_lock_seq),
+ TP_ARGS(trans_fn, caller_ip, btree_id, pos, node, iter_lock_seq, node_lock_seq),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 24 )
+ __field(unsigned long, caller_ip )
+ __field(u8, btree_id )
+ __field(u64, pos_inode )
+ __field(u64, pos_offset )
+ __field(u32, pos_snapshot )
+ __field(unsigned long, node )
+ __field(u32, iter_lock_seq )
+ __field(u32, node_lock_seq )
+ ),
+
+ TP_fast_assign(
+ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->btree_id = btree_id;
+ __entry->pos_inode = pos->inode;
+ __entry->pos_offset = pos->offset;
+ __entry->pos_snapshot = pos->snapshot;
+ __entry->node = node;
+ __entry->iter_lock_seq = iter_lock_seq;
+ __entry->node_lock_seq = node_lock_seq;
+ ),
+
+ TP_printk("%s %pS btree %u pos %llu:%llu:%u, node %lu iter seq %u lock seq %u",
+ __entry->trans_fn,
+ (void *) __entry->caller_ip,
+ __entry->btree_id,
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot,
+ __entry->node,
+ __entry->iter_lock_seq,
+ __entry->node_lock_seq)
+);
+
/* Garbage collection */
DEFINE_EVENT(btree_node, btree_gc_rewrite_node,
),
TP_fast_assign(
- __entry->dev = ca->disk_sb.bdev->bd_dev;
+ __entry->dev = ca->dev;
__entry->found = found;
__entry->inc_gen = inc_gen;
__entry->inc_gen_skipped = inc_gen_skipped;
),
TP_fast_assign(
- __entry->dev = ca->disk_sb.bdev->bd_dev;
+ __entry->dev = ca->dev;
		__entry->offset		= offset;
__entry->sectors = sectors;
),
),
TP_fast_assign(
- __entry->dev = ca->disk_sb.bdev->bd_dev;
+ __entry->dev = ca->dev;
__entry->reserve = reserve;
),
__entry->wait_amount, __entry->until)
);
-TRACE_EVENT(trans_get_iter,
- TP_PROTO(unsigned long trans_ip,
- unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *got_pos,
- unsigned got_locks,
- unsigned got_uptodate,
- struct bpos *src_pos,
- unsigned src_locks,
- unsigned src_uptodate),
- TP_ARGS(trans_ip, caller_ip, btree_id,
- got_pos, got_locks, got_uptodate,
- src_pos, src_locks, src_uptodate),
-
- TP_STRUCT__entry(
- __field(unsigned long, trans_ip )
- __field(unsigned long, caller_ip )
- __field(u8, btree_id )
- __field(u64, got_pos_inode )
- __field(u64, got_pos_offset )
- __field(u32, got_pos_snapshot )
- __field(u8, got_locks )
- __field(u8, got_uptodate )
- __field(u64, src_pos_inode )
- __field(u64, src_pos_offset )
- __field(u32, src_pos_snapshot )
- __field(u8, src_locks )
- __field(u8, src_uptodate )
- ),
-
- TP_fast_assign(
- __entry->trans_ip = trans_ip;
- __entry->caller_ip = caller_ip;
- __entry->btree_id = btree_id;
- __entry->got_pos_inode = got_pos->inode;
- __entry->got_pos_offset = got_pos->offset;
- __entry->got_pos_snapshot = got_pos->snapshot;
- __entry->got_locks = got_locks;
- __entry->got_uptodate = got_uptodate;
- __entry->src_pos_inode = src_pos->inode;
- __entry->src_pos_offset = src_pos->offset;
- __entry->src_pos_snapshot = src_pos->snapshot;
- __entry->src_locks = src_locks;
- __entry->src_uptodate = src_uptodate;
- ),
-
- TP_printk("%ps %pS btree %u got %llu:%llu:%u l %u u %u "
- "src %llu:%llu:%u l %u u %u",
- (void *) __entry->trans_ip,
- (void *) __entry->caller_ip,
- __entry->btree_id,
- __entry->got_pos_inode,
- __entry->got_pos_offset,
- __entry->got_pos_snapshot,
- __entry->got_locks,
- __entry->got_uptodate,
- __entry->src_pos_inode,
- __entry->src_pos_offset,
- __entry->src_pos_snapshot,
- __entry->src_locks,
- __entry->src_uptodate)
-);
-
-TRACE_EVENT(transaction_restart_ip,
- TP_PROTO(unsigned long caller, unsigned long ip),
- TP_ARGS(caller, ip),
-
- TP_STRUCT__entry(
- __field(unsigned long, caller )
- __field(unsigned long, ip )
- ),
-
- TP_fast_assign(
- __entry->caller = caller;
- __entry->ip = ip;
- ),
-
- TP_printk("%ps %pS", (void *) __entry->caller, (void *) __entry->ip)
-);
-
DECLARE_EVENT_CLASS(transaction_restart,
- TP_PROTO(unsigned long trans_ip,
+ TP_PROTO(const char *trans_fn,
unsigned long caller_ip),
- TP_ARGS(trans_ip, caller_ip),
+ TP_ARGS(trans_fn, caller_ip),
TP_STRUCT__entry(
- __field(unsigned long, trans_ip )
+ __array(char, trans_fn, 24 )
__field(unsigned long, caller_ip )
),
TP_fast_assign(
- __entry->trans_ip = trans_ip;
+ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
__entry->caller_ip = caller_ip;
),
- TP_printk("%ps %pS",
- (void *) __entry->trans_ip,
- (void *) __entry->caller_ip)
+ TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip)
+);
+
+DEFINE_EVENT(transaction_restart, transaction_restart_ip,
+ TP_PROTO(const char *trans_fn,
+ unsigned long caller_ip),
+ TP_ARGS(trans_fn, caller_ip)
);
DEFINE_EVENT(transaction_restart, trans_blocked_journal_reclaim,
- TP_PROTO(unsigned long trans_ip,
+ TP_PROTO(const char *trans_fn,
unsigned long caller_ip),
- TP_ARGS(trans_ip, caller_ip)
+ TP_ARGS(trans_fn, caller_ip)
);
DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get,
- TP_PROTO(unsigned long trans_ip,
+ TP_PROTO(const char *trans_fn,
unsigned long caller_ip),
- TP_ARGS(trans_ip, caller_ip)
+ TP_ARGS(trans_fn, caller_ip)
);
DEFINE_EVENT(transaction_restart, trans_restart_journal_preres_get,
- TP_PROTO(unsigned long trans_ip,
+ TP_PROTO(const char *trans_fn,
unsigned long caller_ip),
- TP_ARGS(trans_ip, caller_ip)
+ TP_ARGS(trans_fn, caller_ip)
);
DEFINE_EVENT(transaction_restart, trans_restart_journal_reclaim,
- TP_PROTO(unsigned long trans_ip,
+ TP_PROTO(const char *trans_fn,
unsigned long caller_ip),
- TP_ARGS(trans_ip, caller_ip)
+ TP_ARGS(trans_fn, caller_ip)
);
DEFINE_EVENT(transaction_restart, trans_restart_fault_inject,
- TP_PROTO(unsigned long trans_ip,
+ TP_PROTO(const char *trans_fn,
unsigned long caller_ip),
- TP_ARGS(trans_ip, caller_ip)
+ TP_ARGS(trans_fn, caller_ip)
);
DEFINE_EVENT(transaction_restart, trans_traverse_all,
- TP_PROTO(unsigned long trans_ip,
+ TP_PROTO(const char *trans_fn,
unsigned long caller_ip),
- TP_ARGS(trans_ip, caller_ip)
+ TP_ARGS(trans_fn, caller_ip)
);
DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas,
- TP_PROTO(unsigned long trans_ip,
+ TP_PROTO(const char *trans_fn,
+ unsigned long caller_ip),
+ TP_ARGS(trans_fn, caller_ip)
+);
+
+DEFINE_EVENT(transaction_restart, trans_restart_key_cache_raced,
+ TP_PROTO(const char *trans_fn,
unsigned long caller_ip),
- TP_ARGS(trans_ip, caller_ip)
+ TP_ARGS(trans_fn, caller_ip)
);
DECLARE_EVENT_CLASS(transaction_restart_iter,
- TP_PROTO(unsigned long trans_ip,
+ TP_PROTO(const char *trans_fn,
unsigned long caller_ip,
enum btree_id btree_id,
struct bpos *pos),
- TP_ARGS(trans_ip, caller_ip, btree_id, pos),
+ TP_ARGS(trans_fn, caller_ip, btree_id, pos),
TP_STRUCT__entry(
- __field(unsigned long, trans_ip )
+ __array(char, trans_fn, 24 )
__field(unsigned long, caller_ip )
__field(u8, btree_id )
__field(u64, pos_inode )
),
TP_fast_assign(
- __entry->trans_ip = trans_ip;
+ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
__entry->caller_ip = caller_ip;
__entry->btree_id = btree_id;
__entry->pos_inode = pos->inode;
__entry->pos_snapshot = pos->snapshot;
),
- TP_printk("%ps %pS btree %u pos %llu:%llu:%u",
- (void *) __entry->trans_ip,
+ TP_printk("%s %pS btree %u pos %llu:%llu:%u",
+ __entry->trans_fn,
(void *) __entry->caller_ip,
__entry->btree_id,
__entry->pos_inode,
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_reused,
- TP_PROTO(unsigned long trans_ip,
+ TP_PROTO(const char *trans_fn,
unsigned long caller_ip,
enum btree_id btree_id,
struct bpos *pos),
- TP_ARGS(trans_ip, caller_ip, btree_id, pos)
+ TP_ARGS(trans_fn, caller_ip, btree_id, pos)
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split,
- TP_PROTO(unsigned long trans_ip,
+ TP_PROTO(const char *trans_fn,
unsigned long caller_ip,
enum btree_id btree_id,
struct bpos *pos),
- TP_ARGS(trans_ip, caller_ip, btree_id, pos)
+ TP_ARGS(trans_fn, caller_ip, btree_id, pos)
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_mark,
- TP_PROTO(unsigned long trans_ip,
+ TP_PROTO(const char *trans_fn,
unsigned long caller_ip,
enum btree_id btree_id,
struct bpos *pos),
- TP_ARGS(trans_ip, caller_ip, btree_id, pos)
+ TP_ARGS(trans_fn, caller_ip, btree_id, pos)
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_upgrade,
- TP_PROTO(unsigned long trans_ip,
+ TP_PROTO(const char *trans_fn,
unsigned long caller_ip,
enum btree_id btree_id,
struct bpos *pos),
- TP_ARGS(trans_ip, caller_ip, btree_id, pos)
+ TP_ARGS(trans_fn, caller_ip, btree_id, pos)
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_iter_upgrade,
- TP_PROTO(unsigned long trans_ip,
+ TP_PROTO(const char *trans_fn,
unsigned long caller_ip,
enum btree_id btree_id,
struct bpos *pos),
- TP_ARGS(trans_ip, caller_ip, btree_id, pos)
+ TP_ARGS(trans_fn, caller_ip, btree_id, pos)
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_relock,
- TP_PROTO(unsigned long trans_ip,
+ TP_PROTO(const char *trans_fn,
unsigned long caller_ip,
enum btree_id btree_id,
struct bpos *pos),
- TP_ARGS(trans_ip, caller_ip, btree_id, pos)
+ TP_ARGS(trans_fn, caller_ip, btree_id, pos)
);
-DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse,
- TP_PROTO(unsigned long trans_ip,
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node,
+ TP_PROTO(const char *trans_fn,
unsigned long caller_ip,
enum btree_id btree_id,
struct bpos *pos),
- TP_ARGS(trans_ip, caller_ip, btree_id, pos)
+ TP_ARGS(trans_fn, caller_ip, btree_id, pos)
);
-TRACE_EVENT(iter_traverse,
- TP_PROTO(unsigned long trans_ip,
- unsigned long caller_ip,
- bool key_cache,
- enum btree_id btree_id,
- struct bpos *pos,
- int ret),
- TP_ARGS(trans_ip, caller_ip, key_cache, btree_id, pos, ret),
-
- TP_STRUCT__entry(
- __field(unsigned long, trans_ip )
- __field(unsigned long, caller_ip )
- __field(u8, key_cache )
- __field(u8, btree_id )
- __field(u64, pos_inode )
- __field(u64, pos_offset )
- __field(u32, pos_snapshot )
- __field(s32, ret )
- ),
-
- TP_fast_assign(
- __entry->trans_ip = trans_ip;
- __entry->caller_ip = caller_ip;
- __entry->key_cache = key_cache;
- __entry->btree_id = btree_id;
- __entry->pos_inode = pos->inode;
- __entry->pos_offset = pos->offset;
- __entry->pos_snapshot = pos->snapshot;
- __entry->ret = ret;
- ),
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_parent_for_fill,
+ TP_PROTO(const char *trans_fn,
+ unsigned long caller_ip,
+ enum btree_id btree_id,
+ struct bpos *pos),
+ TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+);
- TP_printk("%ps %pS key cache %u btree %u %llu:%llu:%u ret %i",
- (void *) __entry->trans_ip,
- (void *) __entry->caller_ip,
- __entry->key_cache,
- __entry->btree_id,
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot,
- __entry->ret)
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_after_fill,
+ TP_PROTO(const char *trans_fn,
+ unsigned long caller_ip,
+ enum btree_id btree_id,
+ struct bpos *pos),
+ TP_ARGS(trans_fn, caller_ip, btree_id, pos)
);
-TRACE_EVENT(iter_set_search_pos,
- TP_PROTO(unsigned long trans_ip,
- unsigned long caller_ip,
- enum btree_id btree_id,
- struct bpos *old_pos,
- struct bpos *new_pos,
- unsigned good_level),
- TP_ARGS(trans_ip, caller_ip, btree_id, old_pos, new_pos, good_level),
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill,
+ TP_PROTO(const char *trans_fn,
+ unsigned long caller_ip,
+ enum btree_id btree_id,
+ struct bpos *pos),
+ TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+);
- TP_STRUCT__entry(
- __field(unsigned long, trans_ip )
- __field(unsigned long, caller_ip )
- __field(u8, btree_id )
- __field(u64, old_pos_inode )
- __field(u64, old_pos_offset )
- __field(u32, old_pos_snapshot )
- __field(u64, new_pos_inode )
- __field(u64, new_pos_offset )
- __field(u32, new_pos_snapshot )
- __field(u8, good_level )
- ),
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path,
+ TP_PROTO(const char *trans_fn,
+ unsigned long caller_ip,
+ enum btree_id btree_id,
+ struct bpos *pos),
+ TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+);
- TP_fast_assign(
- __entry->trans_ip = trans_ip;
- __entry->caller_ip = caller_ip;
- __entry->btree_id = btree_id;
- __entry->old_pos_inode = old_pos->inode;
- __entry->old_pos_offset = old_pos->offset;
- __entry->old_pos_snapshot = old_pos->snapshot;
- __entry->new_pos_inode = new_pos->inode;
- __entry->new_pos_offset = new_pos->offset;
- __entry->new_pos_snapshot = new_pos->snapshot;
- __entry->good_level = good_level;
- ),
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent,
+ TP_PROTO(const char *trans_fn,
+ unsigned long caller_ip,
+ enum btree_id btree_id,
+ struct bpos *pos),
+ TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+);
- TP_printk("%ps %pS btree %u old pos %llu:%llu:%u new pos %llu:%llu:%u l %u",
- (void *) __entry->trans_ip,
- (void *) __entry->caller_ip,
- __entry->btree_id,
- __entry->old_pos_inode,
- __entry->old_pos_offset,
- __entry->old_pos_snapshot,
- __entry->new_pos_inode,
- __entry->new_pos_offset,
- __entry->new_pos_snapshot,
- __entry->good_level)
+DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse,
+ TP_PROTO(const char *trans_fn,
+ unsigned long caller_ip,
+ enum btree_id btree_id,
+ struct bpos *pos),
+ TP_ARGS(trans_fn, caller_ip, btree_id, pos)
);
TRACE_EVENT(trans_restart_would_deadlock,
- TP_PROTO(unsigned long trans_ip,
+ TP_PROTO(const char *trans_fn,
unsigned long caller_ip,
bool in_traverse_all,
unsigned reason,
enum btree_id want_btree_id,
unsigned want_iter_type,
struct bpos *want_pos),
- TP_ARGS(trans_ip, caller_ip, in_traverse_all, reason,
+ TP_ARGS(trans_fn, caller_ip, in_traverse_all, reason,
have_btree_id, have_iter_type, have_pos,
want_btree_id, want_iter_type, want_pos),
TP_STRUCT__entry(
- __field(unsigned long, trans_ip )
+ __array(char, trans_fn, 24 )
__field(unsigned long, caller_ip )
__field(u8, in_traverse_all )
__field(u8, reason )
),
TP_fast_assign(
- __entry->trans_ip = trans_ip;
+ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
__entry->caller_ip = caller_ip;
__entry->in_traverse_all = in_traverse_all;
__entry->reason = reason;
__entry->want_pos_snapshot = want_pos->snapshot;
),
- TP_printk("%ps %pS traverse_all %u because %u have %u:%u %llu:%llu:%u want %u:%u %llu:%llu:%u",
- (void *) __entry->trans_ip,
+ TP_printk("%s %pS traverse_all %u because %u have %u:%u %llu:%llu:%u want %u:%u %llu:%llu:%u",
+ __entry->trans_fn,
(void *) __entry->caller_ip,
__entry->in_traverse_all,
__entry->reason,
__entry->want_pos_snapshot)
);
-TRACE_EVENT(trans_restart_mem_realloced,
- TP_PROTO(unsigned long trans_ip, unsigned long caller_ip,
- unsigned long bytes),
- TP_ARGS(trans_ip, caller_ip, bytes),
+TRACE_EVENT(trans_restart_would_deadlock_write,
+ TP_PROTO(const char *trans_fn),
+ TP_ARGS(trans_fn),
TP_STRUCT__entry(
- __field(unsigned long, trans_ip )
- __field(unsigned long, caller_ip )
- __field(unsigned long, bytes )
+ __array(char, trans_fn, 24 )
),
TP_fast_assign(
- __entry->trans_ip = trans_ip;
- __entry->caller_ip = caller_ip;
- __entry->bytes = bytes;
+ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
),
- TP_printk("%ps %pS bytes %lu",
- (void *) __entry->trans_ip,
- (void *) __entry->caller_ip,
- __entry->bytes)
+ TP_printk("%s", __entry->trans_fn)
);
-DECLARE_EVENT_CLASS(node_lock_fail,
- TP_PROTO(unsigned long trans_ip,
+TRACE_EVENT(trans_restart_mem_realloced,
+ TP_PROTO(const char *trans_fn,
unsigned long caller_ip,
- bool key_cache,
- enum btree_id btree_id,
- struct bpos *pos,
- unsigned level, u32 iter_seq, unsigned node, u32 node_seq),
- TP_ARGS(trans_ip, caller_ip, key_cache, btree_id, pos,
- level, iter_seq, node, node_seq),
+ unsigned long bytes),
+ TP_ARGS(trans_fn, caller_ip, bytes),
TP_STRUCT__entry(
- __field(unsigned long, trans_ip )
+ __array(char, trans_fn, 24 )
__field(unsigned long, caller_ip )
- __field(u8, key_cache )
- __field(u8, btree_id )
- __field(u64, pos_inode )
- __field(u64, pos_offset )
- __field(u32, pos_snapshot )
- __field(u32, level )
- __field(u32, iter_seq )
- __field(u32, node )
- __field(u32, node_seq )
+ __field(unsigned long, bytes )
),
TP_fast_assign(
- __entry->trans_ip = trans_ip;
- __entry->caller_ip = caller_ip;
- __entry->key_cache = key_cache;
- __entry->btree_id = btree_id;
- __entry->pos_inode = pos->inode;
- __entry->pos_offset = pos->offset;
- __entry->pos_snapshot = pos->snapshot;
- __entry->level = level;
- __entry->iter_seq = iter_seq;
- __entry->node = node;
- __entry->node_seq = node_seq;
+ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __entry->bytes = bytes;
),
- TP_printk("%ps %pS key cache %u btree %u pos %llu:%llu:%u level %u iter seq %u node %u node seq %u",
- (void *) __entry->trans_ip,
+ TP_printk("%s %pS bytes %lu",
+ __entry->trans_fn,
(void *) __entry->caller_ip,
- __entry->key_cache,
- __entry->btree_id,
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot,
- __entry->level, __entry->iter_seq,
- __entry->node, __entry->node_seq)
-);
-
-DEFINE_EVENT(node_lock_fail, node_upgrade_fail,
- TP_PROTO(unsigned long trans_ip,
- unsigned long caller_ip,
- bool key_cache,
- enum btree_id btree_id,
- struct bpos *pos,
- unsigned level, u32 iter_seq, unsigned node, u32 node_seq),
- TP_ARGS(trans_ip, caller_ip, key_cache, btree_id, pos,
- level, iter_seq, node, node_seq)
-);
-
-DEFINE_EVENT(node_lock_fail, node_relock_fail,
- TP_PROTO(unsigned long trans_ip,
- unsigned long caller_ip,
- bool key_cache,
- enum btree_id btree_id,
- struct bpos *pos,
- unsigned level, u32 iter_seq, unsigned node, u32 node_seq),
- TP_ARGS(trans_ip, caller_ip, key_cache, btree_id, pos,
- level, iter_seq, node, node_seq)
+ __entry->bytes)
);
#endif /* _TRACE_BCACHE_H */
#define NSEC_PER_SEC 1000000000L
-/* minimum size filesystem we can create, given a bucket size: */
-static u64 min_size(unsigned bucket_size)
-{
- return BCH_MIN_NR_NBUCKETS * bucket_size;
-}
-
static void init_layout(struct bch_sb_layout *l,
unsigned block_size,
unsigned sb_size,
u64 sb_start, u64 sb_end)
{
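+ /* sb_start is left unmodified so the error message below can report it: */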
+ u64 sb_pos = sb_start;
unsigned i;
memset(l, 0, sizeof(*l));
/* Create two superblocks in the allowed range: */
for (i = 0; i < l->nr_superblocks; i++) {
- if (sb_start != BCH_SB_SECTOR)
- sb_start = round_up(sb_start, block_size);
+ if (sb_pos != BCH_SB_SECTOR)
+ sb_pos = round_up(sb_pos, block_size);
- l->sb_offset[i] = cpu_to_le64(sb_start);
- sb_start += sb_size;
+ l->sb_offset[i] = cpu_to_le64(sb_pos);
+ sb_pos += sb_size;
}
- if (sb_start >= sb_end)
- die("insufficient space for superblocks");
+ if (sb_pos > sb_end)
+ die("insufficient space for superblocks: start %llu end %llu > %llu size %u",
+ sb_start, sb_pos, sb_end, sb_size);
+}
+
+/* minimum size filesystem we can create, given a bucket size: */
+static u64 min_size(unsigned bucket_size)
+{
+ return BCH_MIN_NR_NBUCKETS * bucket_size;
}
void bch2_pick_bucket_size(struct bch_opts opts, struct dev_opts *dev)
{
if (!dev->size)
- dev->size = get_size(dev->path, dev->fd) >> 9;
+ dev->size = get_size(dev->path, dev->fd);
if (!dev->bucket_size) {
if (dev->size < min_size(opts.block_size))
- die("cannot format %s, too small (%llu sectors, min %llu)",
+ die("cannot format %s, too small (%llu bytes, min %llu)",
dev->path, dev->size, min_size(opts.block_size));
/* Bucket size must be >= block size: */
opts.btree_node_size);
/* Want a bucket size of at least 128k, if possible: */
- dev->bucket_size = max(dev->bucket_size, 256U);
+ dev->bucket_size = max(dev->bucket_size, 128ULL << 10);
if (dev->size >= min_size(dev->bucket_size)) {
unsigned scale = max(1,
- ilog2(dev->size / min_size(dev->bucket_size)) / 4);
+ ilog2(dev->size / min_size(dev->bucket_size)) / 4);
scale = rounddown_pow_of_two(scale);
/* max bucket size 1 mb */
- dev->bucket_size = min(dev->bucket_size * scale, 1U << 11);
+ dev->bucket_size = min(dev->bucket_size * scale, 1ULL << 20);
} else {
do {
dev->bucket_size /= 2;
}
}
- dev->nbuckets = dev->size / dev->bucket_size;
+ dev->nbuckets = dev->size / dev->bucket_size;
if (dev->bucket_size < opts.block_size)
- die("Bucket size cannot be smaller than block size");
+ die("Bucket size (%llu) cannot be smaller than block size (%u)",
+ dev->bucket_size, opts.block_size);
if (opt_defined(opts, btree_node_size) &&
dev->bucket_size < opts.btree_node_size)
- die("Bucket size cannot be smaller than btree node size");
+ die("Bucket size (%llu) cannot be smaller than btree node size (%u)",
+ dev->bucket_size, opts.btree_node_size);
if (dev->nbuckets < BCH_MIN_NR_NBUCKETS)
- die("Not enough buckets: %llu, need %u (bucket size %u)",
+ die("Not enough buckets: %llu, need %u (bucket size %llu)",
dev->nbuckets, BCH_MIN_NR_NBUCKETS, dev->bucket_size);
+ if (dev->bucket_size > (u32) U16_MAX << 9)
+ die("Bucket size (%llu) too big (max %u)",
+ dev->bucket_size, (u32) U16_MAX << 9);
}
static unsigned parse_target(struct bch_sb_handle *sb,
/* calculate btree node size: */
if (!opt_defined(fs_opts, btree_node_size)) {
/* 256k default btree node size */
- opt_set(fs_opts, btree_node_size, 512);
+ opt_set(fs_opts, btree_node_size, 256 << 10);
for (i = devs; i < devs + nr_devs; i++)
fs_opts.btree_node_size =
i->bucket_size);
}
- if (!is_power_of_2(fs_opts.block_size))
- die("block size must be power of 2");
-
- if (!is_power_of_2(fs_opts.btree_node_size))
- die("btree node size must be power of 2");
-
if (uuid_is_null(opts.uuid.b))
uuid_generate(opts.uuid.b);
sb.sb->version = le16_to_cpu(opts.version);
sb.sb->version_min = le16_to_cpu(opts.version);
sb.sb->magic = BCACHE_MAGIC;
- sb.sb->block_size = cpu_to_le16(fs_opts.block_size);
sb.sb->user_uuid = opts.uuid;
sb.sb->nr_devices = nr_devs;
for (opt_id = 0;
opt_id < bch2_opts_nr;
opt_id++) {
- const struct bch_option *opt = &bch2_opt_table[opt_id];
u64 v;
- if (opt->set_sb == SET_NO_SB_OPT)
- continue;
-
v = bch2_opt_defined_by_id(&fs_opts, opt_id)
? bch2_opt_get_by_id(&fs_opts, opt_id)
: bch2_opt_get_by_id(&bch2_opts_default, opt_id);
- opt->set_sb(sb.sb, v);
+ __bch2_opt_set_sb(sb.sb, &bch2_opt_table[opt_id], v);
}
- SET_BCH_SB_ENCODED_EXTENT_MAX_BITS(sb.sb,
- ilog2(opts.encoded_extent_max));
-
struct timespec now;
if (clock_gettime(CLOCK_REALTIME, &now))
die("error getting current time: %m");
/* Member info: */
mi = bch2_sb_resize_members(&sb,
(sizeof(*mi) + sizeof(struct bch_member) *
- nr_devs) / sizeof(u64));
+ nr_devs) / sizeof(u64));
for (i = devs; i < devs + nr_devs; i++) {
struct bch_member *m = mi->members + (i - devs);
uuid_generate(m->uuid.b);
m->nbuckets = cpu_to_le64(i->nbuckets);
m->first_bucket = 0;
- m->bucket_size = cpu_to_le16(i->bucket_size);
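+ /* dev_opts sizes are in bytes; the on-disk member field is in 512-byte sectors: */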
+ m->bucket_size = cpu_to_le16(i->bucket_size >> 9);
- SET_BCH_MEMBER_REPLACEMENT(m, BCH_CACHE_REPLACEMENT_lru);
SET_BCH_MEMBER_DISCARD(m, i->discard);
SET_BCH_MEMBER_DATA_ALLOWED(m, i->data_allowed);
SET_BCH_MEMBER_DURABILITY(m, i->durability + 1);
}
- /* Disk groups */
+ /* Disk labels */
for (i = devs; i < devs + nr_devs; i++) {
- struct bch_member *m = mi->members + (i - devs);
+ struct bch_member *m;
int idx;
- if (!i->group)
+ if (!i->label)
continue;
- idx = bch2_disk_path_find_or_create(&sb, i->group);
+ idx = bch2_disk_path_find_or_create(&sb, i->label);
if (idx < 0)
- die("error creating disk path: %s", idx);
+ die("error creating disk path: %s", strerror(-idx));
+
+ /*
+ * Recompute mi and m after each superblock modification: the
+ * superblock buffer may have been reallocated, invalidating the
+ * old pointers.
+ */
+ mi = bch2_sb_get_members(sb.sb);
+ m = mi->members + (i - devs);
SET_BCH_MEMBER_GROUP(m, idx + 1);
}
}
for (i = devs; i < devs + nr_devs; i++) {
+ u64 size_sectors = i->size >> 9;
+
sb.sb->dev_idx = i - devs;
if (!i->sb_offset) {
i->sb_offset = BCH_SB_SECTOR;
- i->sb_end = i->size;
+ i->sb_end = size_sectors;
}
init_layout(&sb.sb->layout, fs_opts.block_size,
*/
if (i->sb_offset == BCH_SB_SECTOR) {
struct bch_sb_layout *l = &sb.sb->layout;
- u64 backup_sb = i->size - (1 << l->sb_max_size_bits);
+ u64 backup_sb = size_sectors - (1 << l->sb_max_size_bits);
- backup_sb = rounddown(backup_sb, i->bucket_size);
+ backup_sb = rounddown(backup_sb, i->bucket_size >> 9);
l->sb_offset[l->nr_superblocks++] = cpu_to_le64(backup_sb);
}
/* Zero start of disk */
static const char zeroes[BCH_SB_SECTOR << 9];
- xpwrite(i->fd, zeroes, BCH_SB_SECTOR << 9, 0);
+ xpwrite(i->fd, zeroes, BCH_SB_SECTOR << 9, 0,
+ "zeroing start of disk");
}
bch2_super_write(i->fd, sb.sb);
if (sb->offset == BCH_SB_SECTOR) {
/* Write backup layout */
xpwrite(fd, &sb->layout, sizeof(sb->layout),
- BCH_SB_LAYOUT_SECTOR << 9);
+ BCH_SB_LAYOUT_SECTOR << 9,
+ "backup layout");
}
sb->csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb), nonce, sb);
xpwrite(fd, sb, vstruct_bytes(sb),
- le64_to_cpu(sb->offset) << 9);
+ le64_to_cpu(sb->offset) << 9,
+ "superblock");
}
fsync(fd);
struct bch_disk_group *g = gi->entries + t.group;
if (t.group < disk_groups_nr(gi) && !BCH_GROUP_DELETED(g)) {
- ret = scnprintf(buf, len, "Group %u (%.*s)", t.group,
+ ret = scnprintf(buf, len, "Label %u (%.*s)", t.group,
BCH_SB_LABEL_SIZE, g->label);
} else {
- ret = scnprintf(buf, len, "Bad group %u", t.group);
+ ret = scnprintf(buf, len, "Bad label %u", t.group);
}
break;
}
char member_uuid_str[40];
char data_allowed_str[100];
char data_has_str[100];
- char group[BCH_SB_LABEL_SIZE+10];
+ char label[BCH_SB_LABEL_SIZE+10];
char time_str[64];
if (!bch2_member_exists(m))
unsigned idx = BCH_MEMBER_GROUP(m) - 1;
if (idx < disk_groups_nr(gi)) {
- snprintf(group, sizeof(group), "%.*s (%u)",
+ scnprintf(label, sizeof(label), "%.*s (%u)",
BCH_SB_LABEL_SIZE,
gi->entries[idx].label, idx);
} else {
- strcpy(group, "(bad disk groups section)");
+ strcpy(label, "(bad disk labels section)");
}
} else {
- strcpy(group, "(none)");
+ strcpy(label, "(none)");
}
bch2_flags_to_text(&PBUF(data_allowed_str),
" Has data: %s\n"
- " Replacement policy: %s\n"
" Discard: %llu\n",
i, member_uuid_str,
pr_units(le16_to_cpu(m->bucket_size) *
? bch2_member_states[BCH_MEMBER_STATE(m)]
: "unknown",
- group,
+ label,
data_allowed_str,
data_has_str,
- BCH_MEMBER_REPLACEMENT(m) < BCH_CACHE_REPLACEMENT_NR
- ? bch2_cache_replacement_policies[BCH_MEMBER_REPLACEMENT(m)]
- : "unknown",
-
BCH_MEMBER_DISCARD(m));
}
}
}
static void bch2_sb_print_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f,
- enum units units)
+ enum units units)
{
struct bch_sb_field_replicas_v0 *replicas = field_to_type(f, replicas_v0);
struct bch_replicas_entry_v0 *e;
}
static void bch2_sb_print_journal_seq_blacklist(struct bch_sb *sb, struct bch_sb_field *f,
- enum units units)
+ enum units units)
{
struct bch_sb_field_journal_seq_blacklist *bl = field_to_type(f, journal_seq_blacklist);
unsigned i, nr = blacklist_nr_entries(bl);
free(ctl);
} else {
/* It's a path: */
- ret.ioctl_fd = xopen(path, O_RDONLY);
+ ret.ioctl_fd = open(path, O_RDONLY);
+ if (ret.ioctl_fd < 0)
+ die("Error opening filesystem at %s: %m", path);
struct bch_ioctl_query_uuid uuid;
if (ioctl(ret.ioctl_fd, BCH_IOCTL_QUERY_UUID, &uuid) < 0)
optid = bch2_opt_lookup(optstr);
if (optid < 0 ||
- !(bch2_opt_table[optid].mode & opt_types)) {
+ !(bch2_opt_table[optid].flags & opt_types)) {
i++;
goto next;
}
bch2_opt_table[i].type == BCH_OPT_FN)
continue;
- ret = bch2_opt_parse(NULL, &bch2_opt_table[i],
+ ret = bch2_opt_parse(NULL, "option",
+ &bch2_opt_table[i],
strs.by_id[i], &v);
if (ret < 0)
die("Invalid %s: %s",
for (opt = bch2_opt_table;
opt < bch2_opt_table + bch2_opts_nr;
opt++) {
- if (!(opt->mode & opt_types))
+ if (!(opt->flags & opt_types))
continue;
c += printf(" --%s", opt->attr.name);
uuid_le uuid;
unsigned version;
unsigned superblock_size;
- unsigned encoded_extent_max;
bool encrypted;
char *passphrase;
};
return (struct format_opts) {
.version = bcachefs_metadata_version_current,
.superblock_size = SUPERBLOCK_SIZE_DEFAULT,
- .encoded_extent_max = 128,
};
}
struct dev_opts {
int fd;
char *path;
- u64 size; /* 512 byte sectors */
- unsigned bucket_size;
- const char *group;
+ u64 size; /* bytes */
+ u64 bucket_size; /* bytes */
+ const char *label;
unsigned data_allowed;
unsigned durability;
bool discard;
return xattr;
}
-struct posix_acl *bch2_get_acl(struct inode *vinode, int type)
+struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu)
{
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter = { NULL };
struct bkey_s_c_xattr xattr;
struct posix_acl *acl = NULL;
struct bkey_s_c k;
int ret;
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
bch2_trans_init(&trans, c, 0, 0);
retry:
bch2_trans_begin(&trans);
- iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc,
- &hash, inode->v.i_ino,
+ ret = bch2_hash_lookup(&trans, &iter, bch2_xattr_hash_desc,
+ &hash, inode_inum(inode),
&X_SEARCH(acl_to_xattr_type(type), "", 0),
0);
- if (IS_ERR(iter)) {
- if (PTR_ERR(iter) == -EINTR)
+ if (ret) {
+ if (ret == -EINTR)
goto retry;
-
- if (PTR_ERR(iter) != -ENOENT)
- acl = ERR_CAST(iter);
+ if (ret != -ENOENT)
+ acl = ERR_PTR(ret);
goto out;
}
- k = bch2_btree_iter_peek_slot(iter);
+ k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret) {
acl = ERR_PTR(ret);
if (!IS_ERR(acl))
set_cached_acl(&inode->v, type, acl);
- bch2_trans_iter_put(&trans, iter);
out:
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
return acl;
}
-int bch2_set_acl_trans(struct btree_trans *trans,
+int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum,
struct bch_inode_unpacked *inode_u,
- const struct bch_hash_info *hash_info,
struct posix_acl *acl, int type)
{
+ struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode_u);
int ret;
if (type == ACL_TYPE_DEFAULT &&
if (IS_ERR(xattr))
return PTR_ERR(xattr);
- ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
- inode_u->bi_inum, &xattr->k_i, 0);
+ ret = bch2_hash_set(trans, bch2_xattr_hash_desc, &hash_info,
+ inum, &xattr->k_i, 0);
} else {
struct xattr_search_key search =
X_SEARCH(acl_to_xattr_type(type), "", 0);
- ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, hash_info,
- inode_u->bi_inum, &search);
+ ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, &hash_info,
+ inum, &search);
}
return ret == -ENOENT ? 0 : ret;
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct btree_trans trans;
- struct btree_iter *inode_iter;
+ struct btree_iter inode_iter = { NULL };
struct bch_inode_unpacked inode_u;
- struct bch_hash_info hash_info;
struct posix_acl *acl;
umode_t mode;
int ret;
bch2_trans_begin(&trans);
acl = _acl;
- inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino,
- BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(inode_iter);
+ ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode),
+ BTREE_ITER_INTENT);
if (ret)
goto btree_err;
goto btree_err;
}
- hash_info = bch2_hash_info_init(c, &inode_u);
-
- ret = bch2_set_acl_trans(&trans, &inode_u, &hash_info, acl, type);
+ ret = bch2_set_acl_trans(&trans, inode_inum(inode), &inode_u, acl, type);
if (ret)
goto btree_err;
inode_u.bi_ctime = bch2_current_time(c);
inode_u.bi_mode = mode;
- ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?:
- bch2_trans_commit(&trans, NULL,
- &inode->ei_journal_seq, 0);
+ ret = bch2_inode_write(&trans, &inode_iter, &inode_u) ?:
+ bch2_trans_commit(&trans, NULL, NULL, 0);
btree_err:
- bch2_trans_iter_put(&trans, inode_iter);
+ bch2_trans_iter_exit(&trans, &inode_iter);
if (ret == -EINTR)
goto retry;
if (unlikely(ret))
goto err;
- bch2_inode_update_after_write(c, inode, &inode_u,
+ bch2_inode_update_after_write(&trans, inode, &inode_u,
ATTR_CTIME|ATTR_MODE);
set_cached_acl(&inode->v, type, acl);
return ret;
}
-int bch2_acl_chmod(struct btree_trans *trans,
+int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
struct bch_inode_unpacked *inode,
umode_t mode,
struct posix_acl **new_acl)
{
struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode);
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c_xattr xattr;
struct bkey_i_xattr *new;
struct posix_acl *acl;
struct bkey_s_c k;
int ret;
- iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc,
- &hash_info, inode->bi_inum,
+ ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
+ &hash_info, inum,
&X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0),
BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(iter);
if (ret)
return ret == -ENOENT ? 0 : ret;
- k = bch2_btree_iter_peek_slot(iter);
+ k = bch2_btree_iter_peek_slot(&iter);
xattr = bkey_s_c_to_xattr(k);
if (ret)
goto err;
goto err;
}
- new->k.p = iter->pos;
- ret = bch2_trans_update(trans, iter, &new->k_i, 0);
+ new->k.p = iter.pos;
+ ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
*new_acl = acl;
acl = NULL;
err:
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
if (!IS_ERR_OR_NULL(acl))
kfree(acl);
return ret;
__le32 a_version;
} bch_acl_header;
-struct posix_acl *bch2_get_acl(struct inode *, int);
+struct posix_acl *bch2_get_acl(struct inode *, int, bool);
-int bch2_set_acl_trans(struct btree_trans *,
+int bch2_set_acl_trans(struct btree_trans *, subvol_inum,
struct bch_inode_unpacked *,
- const struct bch_hash_info *,
struct posix_acl *, int);
int bch2_set_acl(struct user_namespace *, struct inode *, struct posix_acl *, int);
-int bch2_acl_chmod(struct btree_trans *, struct bch_inode_unpacked *,
+int bch2_acl_chmod(struct btree_trans *, subvol_inum,
+ struct bch_inode_unpacked *,
umode_t, struct posix_acl **);
#else
-static inline int bch2_set_acl_trans(struct btree_trans *trans,
+static inline int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum,
struct bch_inode_unpacked *inode_u,
- const struct bch_hash_info *hash_info,
struct posix_acl *acl, int type)
{
return 0;
}
-static inline int bch2_acl_chmod(struct btree_trans *trans,
+static inline int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
struct bch_inode_unpacked *inode,
umode_t mode,
struct posix_acl **new_acl)
#include "btree_update_interior.h"
#include "btree_gc.h"
#include "buckets.h"
+#include "buckets_waiting_for_journal.h"
#include "clock.h"
#include "debug.h"
#include "ec.h"
return 0;
}
-static void bch2_alloc_pack_v2(struct bkey_alloc_buf *dst,
+static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k);
+ const u8 *in = a.v->data;
+ const u8 *end = bkey_val_end(a);
+ unsigned fieldnr = 0;
+ int ret;
+ u64 v;
+
+ out->gen = a.v->gen;
+ out->oldest_gen = a.v->oldest_gen;
+ out->data_type = a.v->data_type;
+ out->journal_seq = le64_to_cpu(a.v->journal_seq);
+
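+ /*
+ * Decode fields in BCH_ALLOC_FIELDS_V2() order; fields not present
+ * on disk default to 0, and the v != out->_name check catches
+ * values that overflow the (narrower) unpacked field.
+ */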
+#define x(_name, _bits) \
+ if (fieldnr < a.v->nr_fields) { \
+ ret = bch2_varint_decode_fast(in, end, &v); \
+ if (ret < 0) \
+ return ret; \
+ in += ret; \
+ } else { \
+ v = 0; \
+ } \
+ out->_name = v; \
+ if (v != out->_name) \
+ return -1; \
+ fieldnr++;
+
+ BCH_ALLOC_FIELDS_V2()
+#undef x
+ return 0;
+}
+
+static void bch2_alloc_pack_v3(struct bkey_alloc_buf *dst,
const struct bkey_alloc_unpacked src)
{
- struct bkey_i_alloc_v2 *a = bkey_alloc_v2_init(&dst->k);
+ struct bkey_i_alloc_v3 *a = bkey_alloc_v3_init(&dst->k);
unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
u8 *out = a->v.data;
u8 *end = (void *) &dst[1];
a->v.gen = src.gen;
a->v.oldest_gen = src.oldest_gen;
a->v.data_type = src.data_type;
+ a->v.journal_seq = cpu_to_le64(src.journal_seq);
#define x(_name, _bits) \
nr_fields++; \
.gen = 0,
};
- if (k.k->type == KEY_TYPE_alloc_v2)
- bch2_alloc_unpack_v2(&ret, k);
- else if (k.k->type == KEY_TYPE_alloc)
+ switch (k.k->type) {
+ case KEY_TYPE_alloc:
bch2_alloc_unpack_v1(&ret, k);
+ break;
+ case KEY_TYPE_alloc_v2:
+ bch2_alloc_unpack_v2(&ret, k);
+ break;
+ case KEY_TYPE_alloc_v3:
+ bch2_alloc_unpack_v3(&ret, k);
+ break;
+ }
return ret;
}
-void bch2_alloc_pack(struct bch_fs *c,
- struct bkey_alloc_buf *dst,
- const struct bkey_alloc_unpacked src)
+struct bkey_alloc_buf *bch2_alloc_pack(struct btree_trans *trans,
+ const struct bkey_alloc_unpacked src)
{
- bch2_alloc_pack_v2(dst, src);
+ struct bkey_alloc_buf *dst;
+
+ dst = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
+ if (!IS_ERR(dst))
+ bch2_alloc_pack_v3(dst, src);
+
+ return dst;
+}
+
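+/*
+ * Convenience wrapper: pack @u into a transaction-owned buffer and
+ * queue the btree update for it.
+ */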
+int bch2_alloc_write(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_alloc_unpacked *u, unsigned trigger_flags)
+{
+ struct bkey_alloc_buf *a = bch2_alloc_pack(trans, *u);
+
+ return PTR_ERR_OR_ZERO(a) ?:
+ bch2_trans_update(trans, iter, &a->k, trigger_flags);
}
static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
return NULL;
}
-void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
- struct bkey_s_c k)
-{
- struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
-
- pr_buf(out, "gen %u oldest_gen %u data_type %s",
- u.gen, u.oldest_gen, bch2_data_types[u.data_type]);
-#define x(_name, ...) pr_buf(out, " " #_name " %llu", (u64) u._name);
- BCH_ALLOC_FIELDS_V2()
-#undef x
-}
-
-static int bch2_alloc_read_fn(struct bch_fs *c, struct bkey_s_c k)
+const char *bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
- struct bch_dev *ca;
- struct bucket *g;
struct bkey_alloc_unpacked u;
- if (k.k->type != KEY_TYPE_alloc &&
- k.k->type != KEY_TYPE_alloc_v2)
- return 0;
-
- ca = bch_dev_bkey_exists(c, k.k->p.inode);
- g = bucket(ca, k.k->p.offset);
- u = bch2_alloc_unpack(k);
+ if (k.k->p.inode >= c->sb.nr_devices ||
+ !c->devs[k.k->p.inode])
+ return "invalid device";
- g->_mark.gen = u.gen;
- g->_mark.data_type = u.data_type;
- g->_mark.dirty_sectors = u.dirty_sectors;
- g->_mark.cached_sectors = u.cached_sectors;
- g->io_time[READ] = u.read_time;
- g->io_time[WRITE] = u.write_time;
- g->oldest_gen = u.oldest_gen;
- g->gen_valid = 1;
+ if (bch2_alloc_unpack_v3(&u, k))
+ return "unpack error";
- return 0;
+ return NULL;
}
-int bch2_alloc_read(struct bch_fs *c)
+void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
{
- int ret;
-
- down_read(&c->gc_lock);
- ret = bch2_btree_and_journal_walk(c, BTREE_ID_alloc, bch2_alloc_read_fn);
- up_read(&c->gc_lock);
- if (ret) {
- bch_err(c, "error reading alloc info: %i", ret);
- return ret;
- }
+ struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
- return 0;
+ pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu",
+ u.gen, u.oldest_gen, bch2_data_types[u.data_type],
+ u.journal_seq);
+#define x(_name, ...) pr_buf(out, " " #_name " %llu", (u64) u._name);
+ BCH_ALLOC_FIELDS_V2()
+#undef x
}
-static int bch2_alloc_write_key(struct btree_trans *trans,
- struct btree_iter *iter,
- unsigned flags)
+int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only)
{
- struct bch_fs *c = trans->c;
+ struct btree_trans trans;
+ struct btree_iter iter;
struct bkey_s_c k;
struct bch_dev *ca;
struct bucket *g;
- struct bucket_mark m;
- struct bkey_alloc_unpacked old_u, new_u;
- struct bkey_alloc_buf a;
+ struct bkey_alloc_unpacked u;
int ret;
-retry:
- bch2_trans_begin(trans);
-
- ret = bch2_btree_key_cache_flush(trans,
- BTREE_ID_alloc, iter->pos);
- if (ret)
- goto err;
-
- k = bch2_btree_iter_peek_slot(iter);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- old_u = bch2_alloc_unpack(k);
-
- percpu_down_read(&c->mark_lock);
- ca = bch_dev_bkey_exists(c, iter->pos.inode);
- g = bucket(ca, iter->pos.offset);
- m = READ_ONCE(g->mark);
- new_u = alloc_mem_to_key(iter, g, m);
- percpu_up_read(&c->mark_lock);
-
- if (!bkey_alloc_unpacked_cmp(old_u, new_u))
- return 0;
- bch2_alloc_pack(c, &a, new_u);
- ret = bch2_trans_update(trans, iter, &a.k,
- BTREE_TRIGGER_NORUN) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|flags);
-err:
- if (ret == -EINTR)
- goto retry;
- return ret;
-}
-
-int bch2_alloc_write(struct bch_fs *c, unsigned flags)
-{
- struct btree_trans trans;
- struct btree_iter *iter;
- struct bch_dev *ca;
- unsigned i;
- int ret = 0;
+ bch2_trans_init(&trans, c, 0, 0);
+
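+ /*
+ * Initialize in-memory bucket state from the alloc btree. When
+ * called from gc, sector counts are left for gc to recompute,
+ * except in metadata_only mode, where counts for data types gc
+ * won't walk are carried over.
+ */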
+ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ ca = bch_dev_bkey_exists(c, k.k->p.inode);
+ g = __bucket(ca, k.k->p.offset, gc);
+ u = bch2_alloc_unpack(k);
+
+ if (!gc)
+ *bucket_gen(ca, k.k->p.offset) = u.gen;
+
+ g->_mark.gen = u.gen;
+ g->io_time[READ] = u.read_time;
+ g->io_time[WRITE] = u.write_time;
+ g->oldest_gen = !gc ? u.oldest_gen : u.gen;
+ g->gen_valid = 1;
+
+ if (!gc ||
+ (metadata_only &&
+ (u.data_type == BCH_DATA_user ||
+ u.data_type == BCH_DATA_cached ||
+ u.data_type == BCH_DATA_parity))) {
+ g->_mark.data_type = u.data_type;
+ g->_mark.dirty_sectors = u.dirty_sectors;
+ g->_mark.cached_sectors = u.cached_sectors;
+ g->_mark.stripe = u.stripe != 0;
+ g->stripe = u.stripe;
+ g->stripe_redundancy = u.stripe_redundancy;
+ }
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ }
+ bch2_trans_iter_exit(&trans, &iter);
- for_each_member_device(ca, c, i) {
- bch2_btree_iter_set_pos(iter,
- POS(ca->dev_idx, ca->mi.first_bucket));
+ bch2_trans_exit(&trans);
- while (iter->pos.offset < ca->mi.nbuckets) {
- bch2_trans_cond_resched(&trans);
+ if (ret)
+ bch_err(c, "error reading alloc info: %i", ret);
- ret = bch2_alloc_write_key(&trans, iter, flags);
- if (ret) {
- percpu_ref_put(&ca->ref);
- goto err;
- }
- bch2_btree_iter_advance(iter);
- }
- }
-err:
- bch2_trans_iter_put(&trans, iter);
- bch2_trans_exit(&trans);
return ret;
}
size_t bucket_nr, int rw)
{
struct bch_fs *c = trans->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, dev);
- struct btree_iter *iter;
- struct bucket *g;
- struct bkey_alloc_buf *a;
+ struct btree_iter iter;
+ struct bkey_s_c k;
struct bkey_alloc_unpacked u;
u64 *time, now;
int ret = 0;
- iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, POS(dev, bucket_nr),
- BTREE_ITER_CACHED|
- BTREE_ITER_CACHED_NOFILL|
- BTREE_ITER_INTENT);
- ret = bch2_btree_iter_traverse(iter);
- if (ret)
- goto out;
-
- a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
- ret = PTR_ERR_OR_ZERO(a);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(dev, bucket_nr),
+ BTREE_ITER_CACHED|
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
if (ret)
goto out;
- percpu_down_read(&c->mark_lock);
- g = bucket(ca, bucket_nr);
- u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark));
- percpu_up_read(&c->mark_lock);
+ u = bch2_alloc_unpack(k);
time = rw == READ ? &u.read_time : &u.write_time;
now = atomic64_read(&c->io_clock[rw].now);
*time = now;
- bch2_alloc_pack(c, a, u);
- ret = bch2_trans_update(trans, iter, &a->k, 0) ?:
+ ret = bch2_alloc_write(trans, &iter, &u, 0) ?:
bch2_trans_commit(trans, NULL, NULL, 0);
out:
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
test_bit(b, ca->buckets_nouse))
return false;
+ if (ca->new_fs_bucket_idx) {
+ /*
+ * Device or filesystem is still being initialized, and we
+ * haven't fully marked superblocks & journal:
+ */
+ if (is_superblock_bucket(ca, b))
+ return false;
+
+ if (b < ca->new_fs_bucket_idx)
+ return false;
+ }
+
gc_gen = bucket_gc_gen(bucket(ca, b));
ca->inc_gen_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX / 2;
static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
u64 now, u64 last_seq_ondisk)
{
- unsigned used = bucket_sectors_used(m);
+ unsigned used = m.cached_sectors;
if (used) {
/*
* keys when there's only a small difference, so that we can
* keep sequential buckets together:
*/
- return (bucket_needs_journal_commit(m, last_seq_ondisk) << 4)|
- (bucket_gc_gen(g) >> 4);
+ return bucket_gc_gen(g) >> 4;
}
}
buckets = bucket_array(ca);
ca->alloc_heap.used = 0;
now = atomic64_read(&c->io_clock[READ].now);
- last_seq_ondisk = c->journal.last_seq_ondisk;
+ last_seq_ondisk = c->journal.flushed_seq_ondisk;
/*
* Find buckets with lowest read priority, by building a maxheap sorted
if (!bch2_can_invalidate_bucket(ca, b, m))
continue;
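+ /*
+ * Empty buckets whose last update hasn't been committed to
+ * the journal yet can't be reused; count them so we know a
+ * journal flush would make progress:
+ */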
+ if (!m.data_type &&
+ bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+ last_seq_ondisk,
+ ca->dev_idx, b)) {
+ ca->buckets_waiting_on_journal++;
+ continue;
+ }
+
if (e.nr && e.bucket + e.nr == b && e.key == key) {
e.nr++;
} else {
up_read(&ca->bucket_lock);
}
-static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
-{
- struct bucket_array *buckets = bucket_array(ca);
- struct bucket_mark m;
- size_t b, start;
-
- if (ca->fifo_last_bucket < ca->mi.first_bucket ||
- ca->fifo_last_bucket >= ca->mi.nbuckets)
- ca->fifo_last_bucket = ca->mi.first_bucket;
-
- start = ca->fifo_last_bucket;
-
- do {
- ca->fifo_last_bucket++;
- if (ca->fifo_last_bucket == ca->mi.nbuckets)
- ca->fifo_last_bucket = ca->mi.first_bucket;
-
- b = ca->fifo_last_bucket;
- m = READ_ONCE(buckets->b[b].mark);
-
- if (bch2_can_invalidate_bucket(ca, b, m)) {
- struct alloc_heap_entry e = { .bucket = b, .nr = 1, };
-
- heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
- if (heap_full(&ca->alloc_heap))
- break;
- }
-
- cond_resched();
- } while (ca->fifo_last_bucket != start);
-}
-
-static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca)
-{
- struct bucket_array *buckets = bucket_array(ca);
- struct bucket_mark m;
- size_t checked, i;
-
- for (checked = 0;
- checked < ca->mi.nbuckets / 2;
- checked++) {
- size_t b = bch2_rand_range(ca->mi.nbuckets -
- ca->mi.first_bucket) +
- ca->mi.first_bucket;
-
- m = READ_ONCE(buckets->b[b].mark);
-
- if (bch2_can_invalidate_bucket(ca, b, m)) {
- struct alloc_heap_entry e = { .bucket = b, .nr = 1, };
-
- heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
- if (heap_full(&ca->alloc_heap))
- break;
- }
-
- cond_resched();
- }
-
- sort(ca->alloc_heap.data,
- ca->alloc_heap.used,
- sizeof(ca->alloc_heap.data[0]),
- bucket_idx_cmp, NULL);
-
- /* remove duplicates: */
- for (i = 0; i + 1 < ca->alloc_heap.used; i++)
- if (ca->alloc_heap.data[i].bucket ==
- ca->alloc_heap.data[i + 1].bucket)
- ca->alloc_heap.data[i].nr = 0;
-}
-
static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
{
size_t i, nr = 0;
ca->inc_gen_needs_gc = 0;
ca->inc_gen_really_needs_gc = 0;
+ ca->buckets_waiting_on_journal = 0;
- switch (ca->mi.replacement) {
- case BCH_CACHE_REPLACEMENT_lru:
- find_reclaimable_buckets_lru(c, ca);
- break;
- case BCH_CACHE_REPLACEMENT_fifo:
- find_reclaimable_buckets_fifo(c, ca);
- break;
- case BCH_CACHE_REPLACEMENT_random:
- find_reclaimable_buckets_random(c, ca);
- break;
- }
+ find_reclaimable_buckets_lru(c, ca);
heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL);
return nr;
}
-/*
- * returns sequence number of most recent journal entry that updated this
- * bucket:
- */
-static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m)
-{
- if (m.journal_seq_valid) {
- u64 journal_seq = atomic64_read(&c->journal.seq);
- u64 bucket_seq = journal_seq;
-
- bucket_seq &= ~((u64) U16_MAX);
- bucket_seq |= m.journal_seq;
-
- if (bucket_seq > journal_seq)
- bucket_seq -= 1 << 16;
-
- return bucket_seq;
- } else {
- return 0;
- }
-}
-
static int bucket_invalidate_btree(struct btree_trans *trans,
- struct bch_dev *ca, u64 b)
+ struct bch_dev *ca, u64 b,
+ struct bkey_alloc_unpacked *u)
{
struct bch_fs *c = trans->c;
- struct bkey_alloc_buf *a;
- struct bkey_alloc_unpacked u;
- struct bucket *g;
- struct bucket_mark m;
- struct btree_iter *iter =
- bch2_trans_get_iter(trans, BTREE_ID_alloc,
- POS(ca->dev_idx, b),
- BTREE_ITER_CACHED|
- BTREE_ITER_CACHED_NOFILL|
- BTREE_ITER_INTENT);
+ struct btree_iter iter;
+ struct bkey_s_c k;
int ret;
- a = bch2_trans_kmalloc(trans, sizeof(*a));
- ret = PTR_ERR_OR_ZERO(a);
- if (ret)
- goto err;
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+ POS(ca->dev_idx, b),
+ BTREE_ITER_CACHED|
+ BTREE_ITER_INTENT);
- ret = bch2_btree_iter_traverse(iter);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
if (ret)
goto err;
- percpu_down_read(&c->mark_lock);
- g = bucket(ca, b);
- m = READ_ONCE(g->mark);
- u = alloc_mem_to_key(iter, g, m);
- percpu_up_read(&c->mark_lock);
-
- u.gen++;
- u.data_type = 0;
- u.dirty_sectors = 0;
- u.cached_sectors = 0;
- u.read_time = atomic64_read(&c->io_clock[READ].now);
- u.write_time = atomic64_read(&c->io_clock[WRITE].now);
+ *u = bch2_alloc_unpack(k);
+ u->gen++;
+ u->data_type = 0;
+ u->dirty_sectors = 0;
+ u->cached_sectors = 0;
+ u->read_time = atomic64_read(&c->io_clock[READ].now);
+ u->write_time = atomic64_read(&c->io_clock[WRITE].now);
- bch2_alloc_pack(c, a, u);
- ret = bch2_trans_update(trans, iter, &a->k,
- BTREE_TRIGGER_BUCKET_INVALIDATE);
+ ret = bch2_alloc_write(trans, &iter, u,
+ BTREE_TRIGGER_BUCKET_INVALIDATE);
err:
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
u64 *journal_seq, unsigned flags)
{
- struct bucket *g;
- struct bucket_mark m;
+ struct bkey_alloc_unpacked u;
size_t b;
+ u64 commit_seq = 0;
int ret = 0;
+ /*
+ * If the read-only path is trying to shut down, we can't be generating
+ * new btree updates:
+ */
+ if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags))
+ return 1;
+
BUG_ON(!ca->alloc_heap.used ||
!ca->alloc_heap.data[0].nr);
b = ca->alloc_heap.data[0].bucket;
/* first, put on free_inc and mark as owned by allocator: */
percpu_down_read(&c->mark_lock);
- g = bucket(ca, b);
- m = READ_ONCE(g->mark);
-
- BUG_ON(m.dirty_sectors);
bch2_mark_alloc_bucket(c, ca, b, true);
BUG_ON(!fifo_push(&ca->free_inc, b));
spin_unlock(&c->freelist_lock);
- /*
- * If we're not invalidating cached data, we only increment the bucket
- * gen in memory here, the incremented gen will be updated in the btree
- * by bch2_trans_mark_pointer():
- */
- if (!m.cached_sectors &&
- !bucket_needs_journal_commit(m, c->journal.last_seq_ondisk)) {
- BUG_ON(m.data_type);
- bucket_cmpxchg(g, m, m.gen++);
- percpu_up_read(&c->mark_lock);
- goto out;
- }
-
percpu_up_read(&c->mark_lock);
- /*
- * If the read-only path is trying to shut down, we can't be generating
- * new btree updates:
- */
- if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) {
- ret = 1;
- goto out;
- }
-
- ret = bch2_trans_do(c, NULL, journal_seq,
+ ret = bch2_trans_do(c, NULL, &commit_seq,
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL|
BTREE_INSERT_JOURNAL_RESERVED|
flags,
- bucket_invalidate_btree(&trans, ca, b));
-out:
+ bucket_invalidate_btree(&trans, ca, b, &u));
+
if (!ret) {
/* remove from alloc_heap: */
struct alloc_heap_entry e, *top = ca->alloc_heap.data;
heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
/*
- * Make sure we flush the last journal entry that updated this
- * bucket (i.e. deleting the last reference) before writing to
- * this bucket again:
+ * If we're invalidating cached data then we need to wait on the
+ * journal commit:
*/
- *journal_seq = max(*journal_seq, bucket_journal_seq(c, m));
+ if (u.data_type)
+ *journal_seq = max(*journal_seq, commit_seq);
+
+ /*
+ * We already waited on u.journal_seq when we filtered out
+ * buckets that need a journal commit:
+ */
+ BUG_ON(*journal_seq > u.journal_seq);
} else {
size_t b2;
/* If we used NOWAIT, don't return the error: */
if (!fifo_empty(&ca->free_inc))
ret = 0;
- if (ret) {
+ if (ret < 0)
bch_err(ca, "error invalidating buckets: %i", ret);
+ if (ret)
return ret;
- }
if (journal_seq)
ret = bch2_journal_flush_seq(&c->journal, journal_seq);
gc_count = c->gc_count;
nr = find_reclaimable_buckets(c, ca);
- trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc,
- ca->inc_gen_really_needs_gc);
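+ /*
+ * If reclaim found nothing but buckets are waiting on the
+ * journal, flush it synchronously; if we're merely running
+ * low, kick off an async flush:
+ */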
+ if (!nr && ca->buckets_waiting_on_journal) {
+ ret = bch2_journal_flush(&c->journal);
+ if (ret)
+ goto stop;
+ } else if (nr < (ca->mi.nbuckets >> 6) &&
+ ca->buckets_waiting_on_journal >= nr / 2) {
+ bch2_journal_flush_async(&c->journal, NULL);
+ }
if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) ||
ca->inc_gen_really_needs_gc) &&
atomic_inc(&c->kick_gc);
wake_up_process(c->gc_thread);
}
+
+ trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc,
+ ca->inc_gen_really_needs_gc);
}
ret = bch2_invalidate_buckets(c, ca);
lockdep_assert_held(&c->state_lock);
for_each_online_member(ca, c, i) {
- struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_bdi;
+ struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi;
ra_pages += bdi->ra_pages;
}
ob++) {
spin_lock(&ob->lock);
if (ob->valid && !ob->on_partial_list &&
- ob->ptr.dev == ca->dev_idx)
+ ob->dev == ca->dev_idx)
ret = true;
spin_unlock(&ob->lock);
}
{
spin_lock_init(&c->freelist_lock);
}
-
-void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
-{
- struct open_bucket *ob;
-
- for (ob = c->open_buckets;
- ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
- ob++) {
- spin_lock(&ob->lock);
- if (ob->valid && !ob->on_partial_list) {
- pr_buf(out, "%zu ref %u type %s\n",
- ob - c->open_buckets,
- atomic_read(&ob->pin),
- bch2_data_types[ob->type]);
- }
- spin_unlock(&ob->lock);
- }
-
-}
#include "bcachefs.h"
#include "alloc_types.h"
+#include "buckets.h"
#include "debug.h"
+#include "super.h"
extern const char * const bch2_allocator_states[];
struct bkey_alloc_unpacked {
+ u64 journal_seq;
u64 bucket;
u8 dev;
u8 gen;
#undef x
};
-struct bkey_alloc_buf {
- struct bkey_i k;
-
- union {
- struct {
-#define x(_name, _bits) + _bits / 8
- u8 _pad[8 + BCH_ALLOC_FIELDS_V1()];
-#undef x
- } _v1;
- struct {
-#define x(_name, _bits) + 8 + _bits / 8
- u8 _pad[8 + BCH_ALLOC_FIELDS_V2()];
-#undef x
- } _v2;
- };
-} __attribute__((packed, aligned(8)));
-
/* How out of date a pointer gen is allowed to be: */
#define BUCKET_GC_GEN_MAX 96U
;
}
+struct bkey_alloc_buf {
+ struct bkey_i k;
+ struct bch_alloc_v3 v;
+
+#define x(_name, _bits) + _bits / 8
+ u8 _pad[0 + BCH_ALLOC_FIELDS_V2()];
+#undef x
+} __attribute__((packed, aligned(8)));
+
struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c);
-void bch2_alloc_pack(struct bch_fs *, struct bkey_alloc_buf *,
- const struct bkey_alloc_unpacked);
+struct bkey_alloc_buf *bch2_alloc_pack(struct btree_trans *,
+ const struct bkey_alloc_unpacked);
+int bch2_alloc_write(struct btree_trans *, struct btree_iter *,
+ struct bkey_alloc_unpacked *, unsigned);
int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
-static inline struct bkey_alloc_unpacked
-alloc_mem_to_key(struct btree_iter *iter,
- struct bucket *g, struct bucket_mark m)
-{
- return (struct bkey_alloc_unpacked) {
- .dev = iter->pos.inode,
- .bucket = iter->pos.offset,
- .gen = m.gen,
- .oldest_gen = g->oldest_gen,
- .data_type = m.data_type,
- .dirty_sectors = m.dirty_sectors,
- .cached_sectors = m.cached_sectors,
- .read_time = g->io_time[READ],
- .write_time = g->io_time[WRITE],
- };
-}
-
#define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9)
const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c);
const char *bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c);
+const char *bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_alloc (struct bkey_ops) { \
.val_to_text = bch2_alloc_to_text, \
}
-int bch2_alloc_read(struct bch_fs *);
+#define bch2_bkey_ops_alloc_v3 (struct bkey_ops) { \
+ .key_invalid = bch2_alloc_v3_invalid, \
+ .val_to_text = bch2_alloc_to_text, \
+}
+
+static inline bool bkey_is_alloc(const struct bkey *k)
+{
+ return k->type == KEY_TYPE_alloc ||
+ k->type == KEY_TYPE_alloc_v2 ||
+ k->type == KEY_TYPE_alloc_v3;
+}
+
+int bch2_alloc_read(struct bch_fs *, bool, bool);
static inline void bch2_wake_allocator(struct bch_dev *ca)
{
void bch2_dev_allocator_stop(struct bch_dev *);
int bch2_dev_allocator_start(struct bch_dev *);
-int bch2_alloc_write(struct bch_fs *, unsigned);
void bch2_fs_allocator_background_init(struct bch_fs *);
-void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *);
-
#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
* reference _after_ doing the index update that makes its allocation reachable.
*/
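+/*
+ * Open buckets are kept in a hash table keyed by (dev, bucket), chained
+ * through each open_bucket's hash field, so lookups can quickly tell
+ * whether a given bucket is currently open.
+ */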
+static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
+{
+ open_bucket_idx_t idx = ob - c->open_buckets;
+ open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
+
+ ob->hash = *slot;
+ *slot = idx;
+}
+
+static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob)
+{
+ open_bucket_idx_t idx = ob - c->open_buckets;
+ open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
+
+ while (*slot != idx) {
+ BUG_ON(!*slot);
+ slot = &c->open_buckets[*slot].hash;
+ }
+
+ *slot = ob->hash;
+ ob->hash = 0;
+}
+
void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
if (ob->ec) {
bch2_ec_bucket_written(c, ob);
percpu_down_read(&c->mark_lock);
spin_lock(&ob->lock);
- bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), false);
+ bch2_mark_alloc_bucket(c, ca, ob->bucket, false);
ob->valid = false;
- ob->type = 0;
+ ob->data_type = 0;
spin_unlock(&ob->lock);
percpu_up_read(&c->mark_lock);
spin_lock(&c->freelist_lock);
+ bch2_open_bucket_hash_remove(c, ob);
+
ob->freelist = c->open_buckets_freelist;
c->open_buckets_freelist = ob - c->open_buckets;
unsigned i;
open_bucket_for_each(c, obs, ob, i)
- if (ob->ptr.dev == dev &&
- ob->ec)
+ if (ob->dev == dev && ob->ec)
bch2_ec_bucket_cancel(c, ob);
}
ob = c->open_buckets + c->open_buckets_freelist;
c->open_buckets_freelist = ob->freelist;
atomic_set(&ob->pin, 1);
- ob->type = 0;
+ ob->data_type = 0;
c->open_buckets_nr_free--;
return ob;
struct write_point *wp,
struct open_bucket *ob)
{
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
- bool may_realloc = wp->type == BCH_DATA_user;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+ bool may_realloc = wp->data_type == BCH_DATA_user;
BUG_ON(ca->open_buckets_partial_nr >
ARRAY_SIZE(ca->open_buckets_partial));
}
}
-static void verify_not_stale(struct bch_fs *c, const struct open_buckets *obs)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- struct open_bucket *ob;
- unsigned i;
-
- open_bucket_for_each(c, obs, ob, i) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
-
- BUG_ON(ptr_stale(ca, &ob->ptr));
- }
-#endif
-}
-
/* _only_ for allocating the journal on a new device: */
long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
{
- struct bucket_array *buckets;
- ssize_t b;
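+ /*
+ * Allocate sequentially: buckets below new_fs_bucket_idx have
+ * already been handed out; skip superblock and nouse buckets.
+ */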
+ while (ca->new_fs_bucket_idx < ca->mi.nbuckets) {
+ u64 b = ca->new_fs_bucket_idx++;
- rcu_read_lock();
- buckets = bucket_array(ca);
-
- for (b = buckets->first_bucket; b < buckets->nbuckets; b++)
- if (is_available_bucket(buckets->b[b].mark) &&
- !buckets->b[b].mark.owned_by_allocator)
- goto success;
- b = -1;
-success:
- rcu_read_unlock();
- return b;
+ if (!is_superblock_bucket(ca, b) &&
+ (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse)))
+ return b;
+ }
+
+ return -1;
}
static inline unsigned open_buckets_reserved(enum alloc_reserve reserve)
ob->valid = true;
ob->sectors_free = ca->mi.bucket_size;
ob->alloc_reserve = reserve;
- ob->ptr = (struct bch_extent_ptr) {
- .type = 1 << BCH_EXTENT_ENTRY_ptr,
- .gen = bucket(ca, b)->mark.gen,
- .offset = bucket_to_sector(ca, b),
- .dev = ca->dev_idx,
- };
-
+ ob->dev = ca->dev_idx;
+ ob->gen = *bucket_gen(ca, b);
+ ob->bucket = b;
spin_unlock(&ob->lock);
+ ca->nr_open_buckets++;
+ bch2_open_bucket_hash_add(c, ob);
+
if (c->blocked_allocate_open_bucket) {
bch2_time_stats_update(
&c->times[BCH_TIME_blocked_allocate_open_bucket],
c->blocked_allocate = 0;
}
- ca->nr_open_buckets++;
spin_unlock(&c->freelist_lock);
bch2_wake_allocator(ca);
struct open_bucket *ob)
{
unsigned durability =
- bch_dev_bkey_exists(c, ob->ptr.dev)->mi.durability;
+ bch_dev_bkey_exists(c, ob->dev)->mi.durability;
- __clear_bit(ob->ptr.dev, devs_may_alloc->d);
+ __clear_bit(ob->dev, devs_may_alloc->d);
*nr_effective += (flags & BUCKET_ALLOC_USE_DURABILITY)
? durability : 1;
*have_cache |= !durability;
ob_push(c, ptrs, ob);
}
-enum bucket_alloc_ret
-bch2_bucket_alloc_set(struct bch_fs *c,
+int bch2_bucket_alloc_set(struct bch_fs *c,
struct open_buckets *ptrs,
struct dev_stripe_state *stripe,
struct bch_devs_mask *devs_may_alloc,
struct dev_alloc_list devs_sorted =
bch2_dev_alloc_list(c, stripe, devs_may_alloc);
struct bch_dev *ca;
- enum bucket_alloc_ret ret = INSUFFICIENT_DEVICES;
+ int ret = -INSUFFICIENT_DEVICES;
unsigned i;
BUG_ON(*nr_effective >= nr_replicas);
ob = bch2_bucket_alloc(c, ca, reserve,
flags & BUCKET_MAY_ALLOC_PARTIAL, cl);
if (IS_ERR(ob)) {
- ret = -PTR_ERR(ob);
+ ret = PTR_ERR(ob);
if (cl)
return ret;
bch2_dev_stripe_increment(ca, stripe);
if (*nr_effective >= nr_replicas)
- return ALLOC_SUCCESS;
+ return 0;
}
return ret;
* it's to a device we don't want:
*/
-static enum bucket_alloc_ret
-bucket_alloc_from_stripe(struct bch_fs *c,
+static int bucket_alloc_from_stripe(struct bch_fs *c,
struct open_buckets *ptrs,
struct write_point *wp,
struct bch_devs_mask *devs_may_alloc,
continue;
ob = c->open_buckets + h->s->blocks[ec_idx];
- if (ob->ptr.dev == devs_sorted.devs[i] &&
+ if (ob->dev == devs_sorted.devs[i] &&
!test_and_set_bit(ec_idx, h->s->blocks_allocated))
goto got_bucket;
}
goto out_put_head;
got_bucket:
- ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+ ca = bch_dev_bkey_exists(c, ob->dev);
ob->ec_idx = ec_idx;
ob->ec = h->s;
unsigned i;
open_bucket_for_each(c, &wp->ptrs, ob, i) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
if (*nr_effective < nr_replicas &&
- test_bit(ob->ptr.dev, devs_may_alloc->d) &&
+ test_bit(ob->dev, devs_may_alloc->d) &&
(ca->mi.durability ||
- (wp->type == BCH_DATA_user && !*have_cache)) &&
+ (wp->data_type == BCH_DATA_user && !*have_cache)) &&
(ob->ec || !need_ec)) {
add_new_bucket(c, ptrs, devs_may_alloc,
nr_effective, have_cache,
wp->ptrs = ptrs_skip;
}
-static enum bucket_alloc_ret
-open_bucket_add_buckets(struct bch_fs *c,
+static int open_bucket_add_buckets(struct bch_fs *c,
struct open_buckets *ptrs,
struct write_point *wp,
struct bch_devs_list *devs_have,
struct bch_devs_mask devs;
struct open_bucket *ob;
struct closure *cl = NULL;
- enum bucket_alloc_ret ret;
+ int ret;
unsigned i;
rcu_read_lock();
- devs = target_rw_devs(c, wp->type, target);
+ devs = target_rw_devs(c, wp->data_type, target);
rcu_read_unlock();
/* Don't allocate from devices we already have pointers to: */
__clear_bit(devs_have->devs[i], devs.d);
open_bucket_for_each(c, ptrs, ob, i)
- __clear_bit(ob->ptr.dev, devs.d);
+ __clear_bit(ob->dev, devs.d);
if (erasure_code) {
if (!ec_open_bucket(c, ptrs)) {
target, erasure_code,
nr_replicas, nr_effective,
have_cache, flags, _cl);
- if (ret == FREELIST_EMPTY ||
- ret == OPEN_BUCKETS_EMPTY)
+ if (ret == -FREELIST_EMPTY ||
+ ret == -OPEN_BUCKETS_EMPTY)
return ret;
if (*nr_effective >= nr_replicas)
return 0;
ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs,
nr_replicas, nr_effective, have_cache,
reserve, flags, cl);
- if (ret && ret != INSUFFICIENT_DEVICES && !cl && _cl) {
+ if (ret && ret != -INSUFFICIENT_DEVICES && !cl && _cl) {
cl = _cl;
goto retry_blocking;
}
unsigned i, j;
open_bucket_for_each(c, obs, ob, i) {
- bool drop = !ca || ob->ptr.dev == ca->dev_idx;
+ bool drop = !ca || ob->dev == ca->dev_idx;
if (!drop && ob->ec) {
mutex_lock(&ob->ec->lock);
continue;
ob2 = c->open_buckets + ob->ec->blocks[j];
- drop |= ob2->ptr.dev == ca->dev_idx;
+ drop |= ob2->dev == ca->dev_idx;
}
mutex_unlock(&ob->ec->lock);
}
unsigned nr_effective, write_points_nr;
unsigned ob_flags = 0;
bool have_cache;
- enum bucket_alloc_ret ret;
+ int ret;
int i;
if (!(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS))
wp = writepoint_find(c, write_point.v);
- if (wp->type == BCH_DATA_user)
+ if (wp->data_type == BCH_DATA_user)
ob_flags |= BUCKET_MAY_ALLOC_PARTIAL;
/* metadata may not allocate on cache devices: */
- if (wp->type != BCH_DATA_user)
+ if (wp->data_type != BCH_DATA_user)
have_cache = true;
if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
if (erasure_code && !ec_open_bucket(c, &ptrs))
pr_debug("failed to get ec bucket: ret %u", ret);
- if (ret == INSUFFICIENT_DEVICES &&
+ if (ret == -INSUFFICIENT_DEVICES &&
nr_effective >= nr_replicas_required)
ret = 0;
BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
- verify_not_stale(c, &wp->ptrs);
-
return wp;
err:
open_bucket_for_each(c, &wp->ptrs, ob, i)
mutex_unlock(&wp->lock);
- if (ret == FREELIST_EMPTY &&
+ if (ret == -FREELIST_EMPTY &&
try_decrease_writepoints(c, write_points_nr))
goto retry;
switch (ret) {
- case OPEN_BUCKETS_EMPTY:
- case FREELIST_EMPTY:
+ case -OPEN_BUCKETS_EMPTY:
+ case -FREELIST_EMPTY:
return cl ? ERR_PTR(-EAGAIN) : ERR_PTR(-ENOSPC);
- case INSUFFICIENT_DEVICES:
+ case -INSUFFICIENT_DEVICES:
return ERR_PTR(-EROFS);
default:
BUG();
}
}
+struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+
+ return (struct bch_extent_ptr) {
+ .type = 1 << BCH_EXTENT_ENTRY_ptr,
+ .gen = ob->gen,
+ .dev = ob->dev,
+ .offset = bucket_to_sector(ca, ob->bucket) +
+ ca->mi.bucket_size -
+ ob->sectors_free,
+ };
+}
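
bch2_ob_ptr() above reconstructs the extent pointer on demand from the open bucket's (dev, gen, bucket) fields instead of caching a struct bch_extent_ptr; the offset lands wherever the bucket has been filled up to, i.e. bucket start plus (bucket size - sectors_free). A toy check of that arithmetic, assuming bucket_to_sector() reduces to a multiply for uniformly sized buckets:

#include <assert.h>
#include <stdint.h>

static uint64_t toy_ob_ptr_offset(uint64_t bucket, uint32_t bucket_size,
				  uint32_t sectors_free)
{
	/* start of bucket + how much of it has already been handed out */
	return bucket * bucket_size + (bucket_size - sectors_free);
}

int main(void)
{
	/* bucket 10, 128-sector buckets, 96 sectors still unwritten: */
	assert(toy_ob_ptr_offset(10, 128, 96) == 10 * 128 + 32);
	return 0;
}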
+
/*
* Append pointers to the space we just allocated to @k, and mark @sectors space
* as allocated out of @ob
*/
void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
- struct bkey_i *k, unsigned sectors)
+ struct bkey_i *k, unsigned sectors,
+ bool cached)
{
struct open_bucket *ob;
wp->sectors_free -= sectors;
open_bucket_for_each(c, &wp->ptrs, ob, i) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
- struct bch_extent_ptr tmp = ob->ptr;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+ struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob);
- tmp.cached = !ca->mi.durability &&
- wp->type == BCH_DATA_user;
+ ptr.cached = cached ||
+ (!ca->mi.durability &&
+ wp->data_type == BCH_DATA_user);
- tmp.offset += ca->mi.bucket_size - ob->sectors_free;
- bch2_bkey_append_ptr(k, tmp);
+ bch2_bkey_append_ptr(k, ptr);
BUG_ON(sectors > ob->sectors_free);
ob->sectors_free -= sectors;
enum bch_data_type type)
{
mutex_init(&wp->lock);
- wp->type = type;
+ wp->data_type = type;
}
void bch2_fs_allocator_foreground_init(struct bch_fs *c)
writepoint_hash(c, wp->write_point));
}
}
+
+void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct open_bucket *ob;
+
+ for (ob = c->open_buckets;
+ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
+ ob++) {
+ spin_lock(&ob->lock);
+ if (ob->valid && !ob->on_partial_list) {
+ pr_buf(out, "%zu ref %u type %s\n",
+ ob - c->open_buckets,
+ atomic_read(&ob->pin),
+ bch2_data_types[ob->data_type]);
+ }
+ spin_unlock(&ob->lock);
+ }
+}
struct bch_fs;
-struct bch_devs_List;
+struct bch_devs_list;
-enum bucket_alloc_ret {
- ALLOC_SUCCESS,
- OPEN_BUCKETS_EMPTY,
- FREELIST_EMPTY, /* Allocator thread not keeping up */
- INSUFFICIENT_DEVICES,
-};
-
struct dev_alloc_list {
unsigned nr;
u8 devs[BCH_SB_MEMBERS_MAX];
unsigned i;
open_bucket_for_each(c, &wp->ptrs, ob, i) {
- ob->type = wp->type;
+ ob->data_type = wp->data_type;
atomic_inc(&ob->pin);
ob_push(c, ptrs, ob);
}
}
-enum bucket_alloc_ret
-bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *,
+static inline open_bucket_idx_t *open_bucket_hashslot(struct bch_fs *c,
+ unsigned dev, u64 bucket)
+{
+ return c->open_buckets_hash +
+ (jhash_3words(dev, bucket, bucket >> 32, 0) &
+ (OPEN_BUCKETS_COUNT - 1));
+}
+
+static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucket)
+{
+ open_bucket_idx_t slot = *open_bucket_hashslot(c, dev, bucket);
+
+ while (slot) {
+ struct open_bucket *ob = &c->open_buckets[slot];
+
+ if (ob->dev == dev && ob->bucket == bucket)
+ return true;
+
+ slot = ob->hash;
+ }
+
+ return false;
+}
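
The lookup above is a chained hash table built out of array indices: open_buckets_hash holds the head index for each slot, each open_bucket chains to the next via its new ->hash field, and index 0 doubles as the end-of-chain sentinel (hence the "0 is never a valid open_bucket_idx_t" comment further down). A self-contained sketch of the idiom, with a hypothetical mixing function standing in for jhash_3words():

#include <stdbool.h>
#include <stdint.h>

#define TOY_COUNT 1024			/* power of two, like OPEN_BUCKETS_COUNT */

struct toy_entry {
	unsigned	dev;
	uint64_t	bucket;
	uint16_t	hash;		/* next entry in this chain, 0 = end */
};

struct toy_table {
	struct toy_entry entries[TOY_COUNT];	/* entry 0 is never used */
	uint16_t	 slots[TOY_COUNT];
};

static uint16_t *toy_hashslot(struct toy_table *t, unsigned dev, uint64_t bucket)
{
	/* stand-in mix; the real code uses jhash_3words(dev, lo, hi, 0) */
	uint64_t h = (dev * 0x9e3779b97f4a7c15ULL) ^ (bucket * 0xff51afd7ed558ccdULL);

	return t->slots + (h & (TOY_COUNT - 1));
}

static void toy_hash_add(struct toy_table *t, uint16_t idx)
{
	struct toy_entry *e = &t->entries[idx];
	uint16_t *slot = toy_hashslot(t, e->dev, e->bucket);

	e->hash = *slot;	/* push onto the front of the chain */
	*slot = idx;
}

static bool toy_is_open(struct toy_table *t, unsigned dev, uint64_t bucket)
{
	uint16_t slot = *toy_hashslot(t, dev, bucket);

	while (slot) {
		struct toy_entry *e = &t->entries[slot];

		if (e->dev == dev && e->bucket == bucket)
			return true;
		slot = e->hash;
	}

	return false;
}

int main(void)
{
	static struct toy_table t;

	t.entries[1] = (struct toy_entry) { .dev = 0, .bucket = 42 };
	toy_hash_add(&t, 1);

	return toy_is_open(&t, 0, 42) && !toy_is_open(&t, 0, 43) ? 0 : 1;
}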
+
+int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *,
struct dev_stripe_state *, struct bch_devs_mask *,
unsigned, unsigned *, bool *, enum alloc_reserve,
unsigned, struct closure *);
unsigned,
struct closure *);
+struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *);
void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
- struct bkey_i *, unsigned);
+ struct bkey_i *, unsigned, bool);
void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *,
void bch2_fs_allocator_foreground_init(struct bch_fs *);
+void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *);
+
#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */
#define WRITE_POINT_HASH_NR 32
#define WRITE_POINT_MAX 32
+/*
+ * 0 is never a valid open_bucket_idx_t:
+ */
typedef u16 open_bucket_idx_t;
struct open_bucket {
spinlock_t lock;
atomic_t pin;
open_bucket_idx_t freelist;
+ open_bucket_idx_t hash;
/*
* When an open bucket has an ec_stripe attached, this is the index of
* the block in the stripe this open_bucket corresponds to:
*/
u8 ec_idx;
- u8 type;
+ enum bch_data_type data_type:3;
unsigned valid:1;
unsigned on_partial_list:1;
int alloc_reserve:3;
+
unsigned sectors_free;
- struct bch_extent_ptr ptr;
+ u8 dev;
+ u8 gen;
+ u64 bucket;
struct ec_stripe_new *ec;
};
struct mutex lock;
u64 last_used;
unsigned long write_point;
- enum bch_data_type type;
+ enum bch_data_type data_type;
/* calculated based on how many pointers we're actually going to use: */
unsigned sectors_free;
*/
#undef pr_fmt
+#ifdef __KERNEL__
#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__
+#else
+#define pr_fmt(fmt) "%s() " fmt "\n", __func__
+#endif
+#include <linux/backing-dev-defs.h>
#include <linux/bug.h>
#include <linux/bio.h>
#include <linux/closure.h>
#include <linux/zstd.h>
#include "bcachefs_format.h"
+#include "errcode.h"
#include "fifo.h"
#include "opts.h"
#include "util.h"
"significantly affect performance") \
BCH_DEBUG_PARAM(debug_check_iterators, \
"Enables extra verification for btree iterators") \
- BCH_DEBUG_PARAM(debug_check_bkeys, \
- "Run bkey_debugcheck (primarily checking GC/allocation "\
- "information) when iterating over keys") \
BCH_DEBUG_PARAM(debug_check_btree_accounting, \
"Verify btree accounting for keys within a node") \
BCH_DEBUG_PARAM(journal_seq_verify, \
#define BCH_TIME_STATS() \
x(btree_node_mem_alloc) \
x(btree_node_split) \
+ x(btree_node_compact) \
+ x(btree_node_merge) \
x(btree_node_sort) \
x(btree_node_read) \
+ x(btree_interior_update_foreground) \
+ x(btree_interior_update_total) \
x(btree_gc) \
x(btree_lock_contended_read) \
x(btree_lock_contended_intent) \
x(data_write) \
x(data_read) \
x(data_promote) \
- x(journal_write) \
- x(journal_delay) \
+ x(journal_flush_write) \
+ x(journal_noflush_write) \
x(journal_flush_seq) \
x(blocked_journal) \
x(blocked_allocate) \
#include "alloc_types.h"
#include "btree_types.h"
#include "buckets_types.h"
+#include "buckets_waiting_for_journal_types.h"
#include "clock_types.h"
#include "ec_types.h"
#include "journal_types.h"
#include "quota_types.h"
#include "rebalance_types.h"
#include "replicas_types.h"
+#include "subvolume_types.h"
#include "super_types.h"
/* Number of nodes btree coalesce will try to coalesce at once */
GC_PHASE_BTREE_alloc,
GC_PHASE_BTREE_quotas,
GC_PHASE_BTREE_reflink,
+ GC_PHASE_BTREE_subvolumes,
+ GC_PHASE_BTREE_snapshots,
GC_PHASE_PENDING_DELETE,
};
struct bch_sb_handle disk_sb;
struct bch_sb *sb_read_scratch;
int sb_write_error;
+ dev_t dev;
struct bch_devs_mask self;
* Or rcu_read_lock(), but only for ptr_stale():
*/
struct bucket_array __rcu *buckets[2];
+ struct bucket_gens __rcu *bucket_gens;
+ u8 *oldest_gen;
unsigned long *buckets_nouse;
struct rw_semaphore bucket_lock;
struct bch_dev_usage __percpu *usage_gc;
/* Allocator: */
+ u64 new_fs_bucket_idx;
struct task_struct __rcu *alloc_thread;
/*
size_t inc_gen_needs_gc;
size_t inc_gen_really_needs_gc;
+ size_t buckets_waiting_on_journal;
enum allocator_states allocator_state;
enum {
/* startup: */
+ BCH_FS_INITIALIZED,
BCH_FS_ALLOC_READ_DONE,
BCH_FS_ALLOC_CLEAN,
BCH_FS_ALLOCATOR_RUNNING,
BCH_FS_INITIAL_GC_DONE,
BCH_FS_INITIAL_GC_UNFIXED,
BCH_FS_TOPOLOGY_REPAIR_DONE,
- BCH_FS_BTREE_INTERIOR_REPLAY_DONE,
BCH_FS_FSCK_DONE,
BCH_FS_STARTED,
BCH_FS_RW,
/* misc: */
BCH_FS_NEED_ANOTHER_GC,
BCH_FS_DELETED_NODES,
- BCH_FS_NEED_ALLOC_WRITE,
BCH_FS_REBUILD_REPLICAS,
BCH_FS_HOLD_BTREE_WRITES,
};
enum btree_id btree_id:8;
unsigned level:8;
bool allocated;
+ bool overwritten;
struct bkey_i *k;
u32 journal_seq;
u32 journal_offset;
u64 journal_seq_base;
};
-struct btree_iter_buf {
- struct btree_iter *iter;
+struct btree_path_buf {
+ struct btree_path *path;
};
#define REPLICAS_DELTA_LIST_MAX (1U << 16)
+struct snapshot_t {
+ u32 parent;
+ u32 children[2];
+ u32 subvol; /* Nonzero only if a subvolume points to this node: */
+ u32 equiv;
+};
+
+typedef struct {
+ u32 subvol;
+ u64 inum;
+} subvol_inum;
+
+#define BCACHEFS_ROOT_SUBVOL_INUM \
+ ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO })
+
struct bch_fs {
struct closure cl;
u16 version;
u16 version_min;
- u16 encoded_extent_max;
u8 nr_devices;
u8 clean;
struct closure sb_write;
struct mutex sb_lock;
+ /* snapshot.c: */
+ GENRADIX(struct snapshot_t) snapshots;
+ struct bch_snapshot_table __rcu *snapshot_table;
+ struct mutex snapshot_table_lock;
+ struct work_struct snapshot_delete_work;
+ struct work_struct snapshot_wait_for_pagecache_and_delete_work;
+ struct snapshot_id_list snapshots_unlinked;
+ struct mutex snapshots_unlinked_lock;
+
/* BTREE CACHE */
struct bio_set btree_bio;
struct workqueue_struct *io_complete_wq;
/* btree_iter.c: */
struct mutex btree_trans_lock;
struct list_head btree_trans_list;
- mempool_t btree_iters_pool;
+ mempool_t btree_paths_pool;
mempool_t btree_trans_mem_pool;
- struct btree_iter_buf __percpu *btree_iters_bufs;
+ struct btree_path_buf __percpu *btree_paths_bufs;
struct srcu_struct btree_trans_barrier;
+ bool btree_trans_barrier_initialized;
struct btree_key_cache btree_key_cache;
+ unsigned btree_key_cache_btrees;
struct workqueue_struct *btree_update_wq;
struct workqueue_struct *btree_io_complete_wq;
struct closure_waitlist freelist_wait;
u64 blocked_allocate;
u64 blocked_allocate_open_bucket;
+
open_bucket_idx_t open_buckets_freelist;
open_bucket_idx_t open_buckets_nr_free;
struct closure_waitlist open_buckets_wait;
struct open_bucket open_buckets[OPEN_BUCKETS_COUNT];
+ open_bucket_idx_t open_buckets_hash[OPEN_BUCKETS_COUNT];
struct write_point btree_write_point;
struct write_point rebalance_write_point;
struct mutex write_points_hash_lock;
unsigned write_points_nr;
+ struct buckets_waiting_for_journal buckets_waiting_for_journal;
+
/* GARBAGE COLLECTION */
struct task_struct *gc_thread;
atomic_t kick_gc;
* it's not while a gc is in progress.
*/
struct rw_semaphore gc_lock;
+ struct mutex gc_gens_lock;
/* IO PATH */
struct semaphore io_in_flight;
struct write_point copygc_write_point;
s64 copygc_wait;
+ /* DATA PROGRESS STATS */
+ struct list_head data_progress_list;
+ struct mutex data_progress_lock;
+
/* STRIPES: */
- GENRADIX(struct stripe) stripes[2];
+ GENRADIX(struct stripe) stripes;
+ GENRADIX(struct gc_stripe) gc_stripes;
ec_stripes_heap ec_stripes_heap;
spinlock_t ec_stripes_heap_lock;
u64 reflink_hint;
reflink_gc_table reflink_gc_table;
size_t reflink_gc_nr;
- size_t reflink_gc_idx;
/* VFS IO PATH - fs-io.c */
struct bio_set writepage_bioset;
static inline unsigned block_bytes(const struct bch_fs *c)
{
- return c->opts.block_size << 9;
+ return c->opts.block_size;
+}
+
+static inline unsigned block_sectors(const struct bch_fs *c)
+{
+ return c->opts.block_size >> 9;
+}
+
+static inline size_t btree_sectors(const struct bch_fs *c)
+{
+ return c->opts.btree_node_size >> 9;
+}
+
+static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree)
+{
+ return c->btree_key_cache_btrees & (1U << btree);
}
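
block_bytes() and the new block_sectors()/btree_sectors() helpers above reflect that opts.block_size and opts.btree_node_size are now stored in bytes rather than 512-byte sectors, so byte values come back unshifted and sector counts are derived with >> 9. A trivial sanity check of the conversion:

#include <assert.h>

int main(void)
{
	unsigned block_size = 4096;		/* bytes, as now stored in opts */

	assert((block_size >> 9) == 8);		/* 4096-byte block = 8 sectors */
	return 0;
}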
-static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, s64 time)
+static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time)
{
struct timespec64 t;
s32 rem;
return t;
}
-static inline s64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts)
+static inline s64 timespec_to_bch2_time(const struct bch_fs *c, struct timespec64 ts)
{
return (ts.tv_sec * c->sb.time_units_per_sec +
(int) ts.tv_nsec / c->sb.nsec_per_time_unit) - c->sb.time_base_lo;
}
-static inline s64 bch2_current_time(struct bch_fs *c)
+static inline s64 bch2_current_time(const struct bch_fs *c)
{
struct timespec64 now;
#include <asm/byteorder.h>
#include <linux/kernel.h>
#include <linux/uuid.h>
+#include "vstructs.h"
#define LE_BITMASK(_bits, name, type, field, offset, end) \
static const unsigned name##_OFFSET = offset; \
*/
#define BCH_BKEY_TYPES() \
x(deleted, 0) \
- x(discard, 1) \
+ x(whiteout, 1) \
x(error, 2) \
x(cookie, 3) \
x(hash_whiteout, 4) \
x(inline_data, 17) \
x(btree_ptr_v2, 18) \
x(indirect_inline_data, 19) \
- x(alloc_v2, 20)
+ x(alloc_v2, 20) \
+ x(subvolume, 21) \
+ x(snapshot, 22) \
+ x(inode_v2, 23) \
+ x(alloc_v3, 24)
enum bch_bkey_type {
#define x(name, nr) KEY_TYPE_##name = nr,
struct bch_val v;
};
-struct bch_discard {
+struct bch_whiteout {
struct bch_val v;
};
__u8 fields[0];
} __attribute__((packed, aligned(8)));
+struct bch_inode_v2 {
+ struct bch_val v;
+
+ __le64 bi_journal_seq;
+ __le64 bi_hash_seed;
+ __le64 bi_flags;
+ __le16 bi_mode;
+ __u8 fields[0];
+} __attribute__((packed, aligned(8)));
+
struct bch_inode_generation {
struct bch_val v;
__le32 pad;
} __attribute__((packed, aligned(8)));
+/*
+ * bi_subvol and bi_parent_subvol are only set for subvolume roots:
+ */
+
#define BCH_INODE_FIELDS() \
x(bi_atime, 96) \
x(bi_ctime, 96) \
x(bi_erasure_code, 16) \
x(bi_fields_set, 16) \
x(bi_dir, 64) \
- x(bi_dir_offset, 64)
+ x(bi_dir_offset, 64) \
+ x(bi_subvol, 32) \
+ x(bi_parent_subvol, 32)
/* subset of BCH_INODE_FIELDS */
#define BCH_INODE_OPTS() \
LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32);
+LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24);
+LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31);
+
/* Dirents */
/*
struct bch_val v;
/* Target inode number: */
+ union {
__le64 d_inum;
+ struct { /* DT_SUBVOL */
+ __le32 d_child_subvol;
+ __le32 d_parent_subvol;
+ };
+ };
/*
* Copy of mode bits 12-15 from the target inode - so userspace can get
__u8 d_name[];
} __attribute__((packed, aligned(8)));
+#define DT_SUBVOL 16
+#define BCH_DT_MAX 17
+
#define BCH_NAME_MAX (U8_MAX * sizeof(u64) - \
sizeof(struct bkey) - \
offsetof(struct bch_dirent, d_name))
x(stripe, 32) \
x(stripe_redundancy, 8)
+struct bch_alloc_v3 {
+ struct bch_val v;
+ __le64 journal_seq;
+ __le32 flags;
+ __u8 nr_fields;
+ __u8 gen;
+ __u8 oldest_gen;
+ __u8 data_type;
+ __u8 data[];
+} __attribute__((packed, aligned(8)));
+
enum {
#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
BCH_ALLOC_FIELDS_V1()
struct bch_reflink_p {
struct bch_val v;
__le64 idx;
-
- __le32 reservation_generation;
- __u8 nr_replicas;
- __u8 pad[3];
-};
+ /*
+ * A reflink pointer might point to an indirect extent which is then
+ * later split (by copygc or rebalance). If we only pointed to part of
+ * the original indirect extent, and then one of the fragments is
+ * outside the range we point to, we'd leak a refcount: so when creating
+ * reflink pointers, we need to store pad values to remember the full
+ * range we were taking a reference on.
+ */
+ __le32 front_pad;
+ __le32 back_pad;
+} __attribute__((packed, aligned(8)));
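
The front_pad/back_pad fields make the comment above concrete: when a reflink pointer covers only part of an indirect extent, the pads record how far the referenced range extends before and after the pointer's own range, so the full reference can still be dropped correctly after the indirect extent is split. A toy illustration with a hypothetical helper, not the kernel's actual update path:

#include <assert.h>
#include <stdint.h>

struct toy_reflink_p {
	uint64_t idx;		/* start of the range pointed to, in sectors */
	uint32_t front_pad;	/* referenced sectors before ->idx */
	uint32_t back_pad;	/* referenced sectors past the pointer's end */
};

static void toy_set_pads(struct toy_reflink_p *p,
			 uint64_t ref_start, uint64_t ref_end, /* range referenced */
			 uint64_t ptr_start, uint64_t ptr_end) /* range pointed to */
{
	p->idx		= ptr_start;
	p->front_pad	= ptr_start - ref_start;
	p->back_pad	= ref_end - ptr_end;
}

int main(void)
{
	struct toy_reflink_p p;

	/* indirect extent spans [1000, 1128); pointer covers [1016, 1048): */
	toy_set_pads(&p, 1000, 1128, 1016, 1048);
	assert(p.front_pad == 16 && p.back_pad == 80);
	return 0;
}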
struct bch_reflink_v {
struct bch_val v;
__le64 refcount;
union bch_extent_entry start[0];
__u64 _data[0];
-};
+} __attribute__((packed, aligned(8)));
struct bch_indirect_inline_data {
struct bch_val v;
u8 data[0];
};
+/* Subvolumes: */
+
+#define SUBVOL_POS_MIN POS(0, 1)
+#define SUBVOL_POS_MAX POS(0, S32_MAX)
+#define BCACHEFS_ROOT_SUBVOL 1
+
+struct bch_subvolume {
+ struct bch_val v;
+ __le32 flags;
+ __le32 snapshot;
+ __le64 inode;
+};
+
+LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1)
+/*
+ * We need to know whether a subvolume is a snapshot, so we know whether it
+ * can be deleted directly (or whether it should just be rm -rf'd)
+ */
+LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2)
+LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3)
+
+/* Snapshots */
+
+struct bch_snapshot {
+ struct bch_val v;
+ __le32 flags;
+ __le32 parent;
+ __le32 children[2];
+ __le32 subvol;
+ __le32 pad;
+};
+
+LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1)
+
+/* True if a subvolume points to this snapshot node: */
+LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2)
+
/* Optional/variable size superblock sections: */
struct bch_sb_field {
};
LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4)
-/* 4-10 unused, was TIER, HAS_(META)DATA */
-LE64_BITMASK(BCH_MEMBER_REPLACEMENT, struct bch_member, flags[0], 10, 14)
+/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */
LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15)
LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20)
LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28)
LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags[0], 28, 30)
-#define BCH_TIER_MAX 4U
-
#if 0
LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20);
LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40);
BCH_MEMBER_STATE_NR
};
-#define BCH_CACHE_REPLACEMENT_POLICIES() \
- x(lru, 0) \
- x(fifo, 1) \
- x(random, 2)
-
-enum bch_cache_replacement_policies {
-#define x(t, n) BCH_CACHE_REPLACEMENT_##t = n,
- BCH_CACHE_REPLACEMENT_POLICIES()
-#undef x
- BCH_CACHE_REPLACEMENT_NR
-};
-
struct bch_sb_field_members {
struct bch_sb_field field;
struct bch_member members[0];
bcachefs_metadata_version_snapshot = 12,
bcachefs_metadata_version_inode_backpointers = 13,
bcachefs_metadata_version_btree_ptr_sectors_written = 14,
- bcachefs_metadata_version_max = 15,
+ bcachefs_metadata_version_snapshot_2 = 15,
+ bcachefs_metadata_version_reflink_p_fix = 16,
+ bcachefs_metadata_version_subvol_dirent = 17,
+ bcachefs_metadata_version_inode_v2 = 18,
+ bcachefs_metadata_version_max = 19,
};
#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1)
LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28);
LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29);
LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
+LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
+LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
+LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
+LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
/*
* Features:
* journal_seq_blacklist_v3: gates BCH_SB_FIELD_journal_seq_blacklist
* reflink: gates KEY_TYPE_reflink
* inline_data: gates KEY_TYPE_inline_data
- * new_siphash: gates BCH_STR_HASH_SIPHASH
+ * new_siphash: gates BCH_STR_HASH_siphash
* new_extent_overwrite: gates BTREE_NODE_NEW_EXTENT_OVERWRITE
*/
#define BCH_SB_FEATURES() \
BCH_ON_ERROR_NR
};
+#define BCH_STR_HASH_TYPES() \
+ x(crc32c, 0) \
+ x(crc64, 1) \
+ x(siphash_old, 2) \
+ x(siphash, 3)
+
enum bch_str_hash_type {
- BCH_STR_HASH_CRC32C = 0,
- BCH_STR_HASH_CRC64 = 1,
- BCH_STR_HASH_SIPHASH_OLD = 2,
- BCH_STR_HASH_SIPHASH = 3,
- BCH_STR_HASH_NR = 4,
+#define x(t, n) BCH_STR_HASH_##t = n,
+ BCH_STR_HASH_TYPES()
+#undef x
+ BCH_STR_HASH_NR
};
#define BCH_STR_HASH_OPTS() \
BCH_STR_HASH_OPT_NR
};
+#define BCH_CSUM_TYPES() \
+ x(none, 0) \
+ x(crc32c_nonzero, 1) \
+ x(crc64_nonzero, 2) \
+ x(chacha20_poly1305_80, 3) \
+ x(chacha20_poly1305_128, 4) \
+ x(crc32c, 5) \
+ x(crc64, 6) \
+ x(xxhash, 7)
+
enum bch_csum_type {
- BCH_CSUM_NONE = 0,
- BCH_CSUM_CRC32C_NONZERO = 1,
- BCH_CSUM_CRC64_NONZERO = 2,
- BCH_CSUM_CHACHA20_POLY1305_80 = 3,
- BCH_CSUM_CHACHA20_POLY1305_128 = 4,
- BCH_CSUM_CRC32C = 5,
- BCH_CSUM_CRC64 = 6,
- BCH_CSUM_XXHASH = 7,
- BCH_CSUM_NR = 8,
+#define x(t, n) BCH_CSUM_##t = n,
+ BCH_CSUM_TYPES()
+#undef x
+ BCH_CSUM_NR
};
static const unsigned bch_crc_bytes[] = {
- [BCH_CSUM_NONE] = 0,
- [BCH_CSUM_CRC32C_NONZERO] = 4,
- [BCH_CSUM_CRC32C] = 4,
- [BCH_CSUM_CRC64_NONZERO] = 8,
- [BCH_CSUM_CRC64] = 8,
- [BCH_CSUM_XXHASH] = 8,
- [BCH_CSUM_CHACHA20_POLY1305_80] = 10,
- [BCH_CSUM_CHACHA20_POLY1305_128] = 16,
+ [BCH_CSUM_none] = 0,
+ [BCH_CSUM_crc32c_nonzero] = 4,
+ [BCH_CSUM_crc32c] = 4,
+ [BCH_CSUM_crc64_nonzero] = 8,
+ [BCH_CSUM_crc64] = 8,
+ [BCH_CSUM_xxhash] = 8,
+ [BCH_CSUM_chacha20_poly1305_80] = 10,
+ [BCH_CSUM_chacha20_poly1305_128] = 16,
};
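
BCH_STR_HASH_TYPES() and BCH_CSUM_TYPES() above follow the tree's usual x-macro idiom: one list macro is expanded once to generate the enum and again to generate a parallel name table, so the two can never drift apart. A generic, self-contained sketch of the idiom:

#include <stdio.h>

#define TOY_TYPES()	\
	x(none,  0)	\
	x(crc32, 1)	\
	x(crc64, 2)

enum toy_type {
#define x(t, n) TOY_##t = n,
	TOY_TYPES()
#undef x
	TOY_NR
};

static const char * const toy_type_names[] = {
#define x(t, n) [n] = #t,
	TOY_TYPES()
#undef x
};

int main(void)
{
	printf("%d checksum types, e.g. %s\n", TOY_NR, toy_type_names[TOY_crc64]);
	return 0;
}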
static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
{
switch (type) {
- case BCH_CSUM_CHACHA20_POLY1305_80:
- case BCH_CSUM_CHACHA20_POLY1305_128:
+ case BCH_CSUM_chacha20_poly1305_80:
+ case BCH_CSUM_chacha20_poly1305_128:
return true;
default:
return false;
x(usage, 5) \
x(data_usage, 6) \
x(clock, 7) \
- x(dev_usage, 8)
+ x(dev_usage, 8) \
+ x(log, 9)
enum {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
__le64 end;
};
+#define BCH_FS_USAGE_TYPES() \
+ x(reserved, 0) \
+ x(inodes, 1) \
+ x(key_version, 2)
+
enum {
- FS_USAGE_RESERVED = 0,
- FS_USAGE_INODES = 1,
- FS_USAGE_KEY_VERSION = 2,
- FS_USAGE_NR = 3
+#define x(f, nr) BCH_FS_USAGE_##f = nr,
+ BCH_FS_USAGE_TYPES()
+#undef x
+ BCH_FS_USAGE_NR
};
struct jset_entry_usage {
struct jset_entry_dev_usage_type d[];
} __attribute__((packed));
+static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u)
+{
+ return (vstruct_bytes(&u->entry) - sizeof(struct jset_entry_dev_usage)) /
+ sizeof(struct jset_entry_dev_usage_type);
+}
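
jset_entry_dev_usage_nr_types() recovers the element count of the trailing flexible array from the entry's total on-disk size: subtract the fixed header, divide by the element size. The same computation with toy types:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct toy_entry {
	uint32_t hdr;
	uint32_t d[];		/* flexible array member */
};

static size_t toy_nr_elems(size_t total_bytes)
{
	return (total_bytes - sizeof(struct toy_entry)) / sizeof(uint32_t);
}

int main(void)
{
	/* a 24-byte entry with a 4-byte header holds 5 elements: */
	assert(toy_nr_elems(24) == 5);
	return 0;
}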
+
+struct jset_entry_log {
+ struct jset_entry entry;
+ u8 d[];
+} __attribute__((packed));
+
/*
* On disk format for a journal entry:
* seq is monotonically increasing; every journal entry has its own unique
x(alloc, 4) \
x(quotas, 5) \
x(stripes, 6) \
- x(reflink, 7)
+ x(reflink, 7) \
+ x(subvolumes, 8) \
+ x(snapshots, 9)
enum btree_id {
#define x(kwd, val) BTREE_ID_##kwd = val,
#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize)
#define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc,15, struct bch_ioctl_disk_resize_journal)
+#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume)
+#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17, struct bch_ioctl_subvolume)
+
/* ioctl below act on a particular file, not the filesystem as a whole: */
#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *)
__u64 nbuckets;
};
+struct bch_ioctl_subvolume {
+ __u32 flags;
+ __u32 dirfd;
+ __u16 mode;
+ __u16 pad[3];
+ __u64 dst_ptr;
+ __u64 src_ptr;
+};
+
+#define BCH_SUBVOL_SNAPSHOT_CREATE (1U << 0)
+#define BCH_SUBVOL_SNAPSHOT_RO (1U << 1)
+
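Both subvolume ioctls take a struct bch_ioctl_subvolume. Below is a hedged userspace sketch of a create call; the field semantics assumed here (dst_ptr as a userspace pointer to the new subvolume's path, src_ptr as the snapshot source path or 0, dirfd as the base directory) are illustrative assumptions, not something this patch itself spells out:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>

/* local copy of the structure and ioctl number defined above: */
struct bch_ioctl_subvolume {
	uint32_t flags;
	uint32_t dirfd;
	uint16_t mode;
	uint16_t pad[3];
	uint64_t dst_ptr;
	uint64_t src_ptr;
};

#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume)

int main(void)
{
	/* hypothetical mount point, for illustration only: */
	int fs_fd = open("/mnt/bcachefs", O_RDONLY);
	struct bch_ioctl_subvolume arg;

	memset(&arg, 0, sizeof(arg));
	arg.dirfd   = AT_FDCWD;
	arg.mode    = 0755;
	arg.dst_ptr = (uint64_t)(uintptr_t) "/mnt/bcachefs/subvol1";

	if (fs_fd < 0 || ioctl(fs_fd, BCH_IOCTL_SUBVOLUME_CREATE, &arg) < 0)
		perror("BCH_IOCTL_SUBVOLUME_CREATE");
	return 0;
}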
#endif /* _BCACHEFS_IOCTL_H */
#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted)
#define bkey_whiteout(_k) \
- ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_discard)
+ ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)
enum bkey_lr_packed {
BKEY_PACKED_BOTH,
return bpos_cmp(l, r) > 0 ? l : r;
}
-#define sbb(a, b, borrow) \
-do { \
- typeof(a) d1, d2; \
- \
- d1 = a - borrow; \
- borrow = d1 > a; \
- \
- d2 = d1 - b; \
- borrow += d2 > d1; \
- a = d2; \
-} while (0)
-
-/* returns a - b: */
-static inline struct bpos bpos_sub(struct bpos a, struct bpos b)
-{
- int borrow = 0;
-
- sbb(a.snapshot, b.snapshot, borrow);
- sbb(a.offset, b.offset, borrow);
- sbb(a.inode, b.inode, borrow);
- return a;
-}
-
-static inline struct bpos bpos_diff(struct bpos l, struct bpos r)
-{
- if (bpos_cmp(l, r) > 0)
- swap(l, r);
-
- return bpos_sub(r, l);
-}
-
void bch2_bpos_swab(struct bpos *);
void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *);
#include "inode.h"
#include "quota.h"
#include "reflink.h"
+#include "subvolume.h"
#include "xattr.h"
const char * const bch2_bkey_types[] = {
.key_invalid = deleted_key_invalid, \
}
-#define bch2_bkey_ops_discard (struct bkey_ops) { \
+#define bch2_bkey_ops_whiteout (struct bkey_ops) { \
.key_invalid = deleted_key_invalid, \
}
static unsigned bch2_key_types_allowed[] = {
[BKEY_TYPE_extents] =
+ (1U << KEY_TYPE_deleted)|
+ (1U << KEY_TYPE_whiteout)|
(1U << KEY_TYPE_error)|
(1U << KEY_TYPE_cookie)|
(1U << KEY_TYPE_extent)|
(1U << KEY_TYPE_reflink_p)|
(1U << KEY_TYPE_inline_data),
[BKEY_TYPE_inodes] =
+ (1U << KEY_TYPE_deleted)|
+ (1U << KEY_TYPE_whiteout)|
(1U << KEY_TYPE_inode)|
+ (1U << KEY_TYPE_inode_v2)|
(1U << KEY_TYPE_inode_generation),
[BKEY_TYPE_dirents] =
+ (1U << KEY_TYPE_deleted)|
+ (1U << KEY_TYPE_whiteout)|
(1U << KEY_TYPE_hash_whiteout)|
(1U << KEY_TYPE_dirent),
[BKEY_TYPE_xattrs] =
+ (1U << KEY_TYPE_deleted)|
+ (1U << KEY_TYPE_whiteout)|
(1U << KEY_TYPE_cookie)|
(1U << KEY_TYPE_hash_whiteout)|
(1U << KEY_TYPE_xattr),
[BKEY_TYPE_alloc] =
+ (1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_alloc)|
- (1U << KEY_TYPE_alloc_v2),
+ (1U << KEY_TYPE_alloc_v2)|
+ (1U << KEY_TYPE_alloc_v3),
[BKEY_TYPE_quotas] =
+ (1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_quota),
[BKEY_TYPE_stripes] =
+ (1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_stripe),
[BKEY_TYPE_reflink] =
+ (1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_reflink_v)|
(1U << KEY_TYPE_indirect_inline_data),
+ [BKEY_TYPE_subvolumes] =
+ (1U << KEY_TYPE_deleted)|
+ (1U << KEY_TYPE_subvolume),
+ [BKEY_TYPE_snapshots] =
+ (1U << KEY_TYPE_deleted)|
+ (1U << KEY_TYPE_snapshot),
[BKEY_TYPE_btree] =
+ (1U << KEY_TYPE_deleted)|
(1U << KEY_TYPE_btree_ptr)|
(1U << KEY_TYPE_btree_ptr_v2),
};
const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
enum btree_node_type type)
{
- unsigned key_types_allowed = (1U << KEY_TYPE_deleted)|
- bch2_key_types_allowed[type] ;
-
if (k.k->u64s < BKEY_U64s)
return "u64s too small";
- if (!(key_types_allowed & (1U << k.k->type)))
+ if (!(bch2_key_types_allowed[type] & (1U << k.k->type)))
return "invalid key type for this btree";
if (type == BKEY_TYPE_btree &&
bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
return "value too big";
- if (btree_node_type_is_extents(type)) {
- if ((k.k->size == 0) != bkey_deleted(k.k))
+ if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
+ if (k.k->size == 0)
return "bad size field";
if (k.k->size > k.k->p.offset)
if (type != BKEY_TYPE_btree &&
btree_type_has_snapshots(type) &&
- k.k->p.snapshot != U32_MAX)
+ !k.k->p.snapshot)
return "invalid snapshot field";
if (type != BKEY_TYPE_btree &&
return NULL;
}
-void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
-{
- const char *invalid;
-
- BUG_ON(!k.k->u64s);
-
- invalid = bch2_bkey_invalid(c, k, btree_node_type(b)) ?:
- bch2_bkey_in_btree_node(b, k);
- if (invalid) {
- char buf[160];
-
- bch2_bkey_val_to_text(&PBUF(buf), c, k);
- bch2_fs_inconsistent(c, "invalid bkey %s: %s", buf, invalid);
- }
-}
-
void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)
{
if (!bpos_cmp(pos, POS_MIN))
pr_buf(out, "POS_MIN");
else if (!bpos_cmp(pos, POS_MAX))
pr_buf(out, "POS_MAX");
+ else if (!bpos_cmp(pos, SPOS_MAX))
+ pr_buf(out, "SPOS_MAX");
else {
if (pos.inode == U64_MAX)
pr_buf(out, "U64_MAX");
enum btree_node_type);
const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c);
-void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
-
void bch2_bpos_to_text(struct printbuf *, struct bpos);
void bch2_bkey_to_text(struct printbuf *, const struct bkey *);
void bch2_val_to_text(struct printbuf *, struct bch_fs *,
return nr;
}
-static void extent_sort_append(struct bch_fs *c,
- struct bkey_format *f,
- struct btree_nr_keys *nr,
- struct bkey_packed **out,
- struct bkey_s k)
-{
- if (!bkey_deleted(k.k)) {
- if (!bch2_bkey_pack_key(*out, k.k, f))
- memcpy_u64s_small(*out, k.k, BKEY_U64s);
-
- memcpy_u64s_small(bkeyp_val(f, *out), k.v, bkey_val_u64s(k.k));
-
- btree_keys_account_key_add(nr, 0, *out);
- *out = bkey_next(*out);
- }
-}
-
/* Sort + repack in a new format: */
struct btree_nr_keys
bch2_sort_repack(struct bset *dst, struct btree *src,
struct bkey_format *in_f = &src->format;
struct bkey_packed *in, *out = vstruct_last(dst);
struct btree_nr_keys nr;
+ bool transform = memcmp(out_f, &src->format, sizeof(*out_f));
memset(&nr, 0, sizeof(nr));
if (filter_whiteouts && bkey_deleted(in))
continue;
- if (bch2_bkey_transform(out_f, out, bkey_packed(in)
- ? in_f : &bch2_bkey_format_current, in))
+ if (!transform)
+ bkey_copy(out, in);
+ else if (bch2_bkey_transform(out_f, out, bkey_packed(in)
+ ? in_f : &bch2_bkey_format_current, in))
out->format = KEY_FORMAT_LOCAL_BTREE;
else
bch2_bkey_unpack(src, (void *) out, in);
return nr;
}
-/* Sort, repack, and call bch2_bkey_normalize() to drop stale pointers: */
-struct btree_nr_keys
-bch2_sort_repack_merge(struct bch_fs *c,
- struct bset *dst, struct btree *src,
- struct btree_node_iter *iter,
- struct bkey_format *out_f,
- bool filter_whiteouts)
-{
- struct bkey_packed *out = vstruct_last(dst), *k_packed;
- struct bkey_buf k;
- struct btree_nr_keys nr;
-
- memset(&nr, 0, sizeof(nr));
- bch2_bkey_buf_init(&k);
-
- while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) {
- if (filter_whiteouts && bkey_deleted(k_packed))
- continue;
-
- /*
- * NOTE:
- * bch2_bkey_normalize may modify the key we pass it (dropping
- * stale pointers) and we don't have a write lock on the src
- * node; we have to make a copy of the entire key before calling
- * normalize
- */
- bch2_bkey_buf_realloc(&k, c, k_packed->u64s + BKEY_U64s);
- bch2_bkey_unpack(src, k.k, k_packed);
-
- if (filter_whiteouts &&
- bch2_bkey_normalize(c, bkey_i_to_s(k.k)))
- continue;
-
- extent_sort_append(c, out_f, &nr, &out, bkey_i_to_s(k.k));
- }
-
- dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
- bch2_bkey_buf_exit(&k, c);
- return nr;
-}
-
static inline int sort_keys_cmp(struct btree *b,
struct bkey_packed *l,
struct bkey_packed *r)
bch2_sort_repack(struct bset *, struct btree *,
struct btree_node_iter *,
struct bkey_format *, bool);
-struct btree_nr_keys
-bch2_sort_repack_merge(struct bch_fs *,
- struct bset *, struct btree *,
- struct btree_node_iter *,
- struct bkey_format *, bool);
unsigned bch2_sort_keys(struct bkey_packed *,
struct sort_iter *, bool);
return;
/* Verify no duplicates: */
- btree_node_iter_for_each(iter, set)
+ btree_node_iter_for_each(iter, set) {
+ BUG_ON(set->k > set->end);
btree_node_iter_for_each(iter, s2)
BUG_ON(set != s2 && set->end == s2->end);
+ }
/* Verify that set->end is correct: */
btree_node_iter_for_each(iter, set) {
unsigned j)
{
return cacheline_to_bkey(b, t,
- __eytzinger1_to_inorder(j, t->size, t->extra),
+ __eytzinger1_to_inorder(j, t->size - 1, t->extra),
bkey_float(b, t, j)->key_offset);
}
}
__always_inline
-static inline void __make_bfloat(struct btree *b, struct bset_tree *t,
- unsigned j,
- struct bkey_packed *min_key,
- struct bkey_packed *max_key)
+static inline void make_bfloat(struct btree *b, struct bset_tree *t,
+ unsigned j,
+ struct bkey_packed *min_key,
+ struct bkey_packed *max_key)
{
struct bkey_float *f = bkey_float(b, t, j);
struct bkey_packed *m = tree_to_bkey(b, t, j);
f->mantissa = mantissa;
}
-static void make_bfloat(struct btree *b, struct bset_tree *t,
- unsigned j,
- struct bkey_packed *min_key,
- struct bkey_packed *max_key)
-{
- struct bkey_i *k;
-
- if (is_power_of_2(j) &&
- !min_key->u64s) {
- if (!bkey_pack_pos(min_key, b->data->min_key, b)) {
- k = (void *) min_key;
- bkey_init(&k->k);
- k->k.p = b->data->min_key;
- }
- }
-
- if (is_power_of_2(j + 1) &&
- !max_key->u64s) {
- if (!bkey_pack_pos(max_key, b->data->max_key, b)) {
- k = (void *) max_key;
- bkey_init(&k->k);
- k->k.p = b->data->max_key;
- }
- }
-
- __make_bfloat(b, t, j, min_key, max_key);
-}
-
/* bytes remaining - only valid for last bset: */
static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t)
{
t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;
/* First we figure out where the first key in each cacheline is */
- eytzinger1_for_each(j, t->size) {
+ eytzinger1_for_each(j, t->size - 1) {
while (bkey_to_cacheline(b, t, k) < cacheline)
prev = k, k = bkey_next(k);
}
/* Then we build the tree */
- eytzinger1_for_each(j, t->size)
- __make_bfloat(b, t, j,
- bkey_to_packed(&min_key),
- bkey_to_packed(&max_key));
+ eytzinger1_for_each(j, t->size - 1)
+ make_bfloat(b, t, j,
+ bkey_to_packed(&min_key),
+ bkey_to_packed(&max_key));
}
static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
do {
p = j ? tree_to_bkey(b, t,
__inorder_to_eytzinger1(j--,
- t->size, t->extra))
+ t->size - 1, t->extra))
: btree_bkey_first(b, t);
} while (p >= k);
break;
/* Insert */
-static void rw_aux_tree_fix_invalidated_key(struct btree *b,
- struct bset_tree *t,
- struct bkey_packed *k)
-{
- unsigned offset = __btree_node_key_to_offset(b, k);
- unsigned j = rw_aux_tree_bsearch(b, t, offset);
-
- if (j < t->size &&
- rw_aux_tree(b, t)[j].offset == offset)
- rw_aux_tree_set(b, t, j, k);
-
- bch2_bset_verify_rw_aux_tree(b, t);
-}
-
-static void ro_aux_tree_fix_invalidated_key(struct btree *b,
- struct bset_tree *t,
- struct bkey_packed *k)
-{
- struct bkey_packed min_key, max_key;
- unsigned inorder, j;
-
- EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
-
- /* signal to make_bfloat() that they're uninitialized: */
- min_key.u64s = max_key.u64s = 0;
-
- if (bkey_next(k) == btree_bkey_last(b, t)) {
- for (j = 1; j < t->size; j = j * 2 + 1)
- make_bfloat(b, t, j, &min_key, &max_key);
- }
-
- inorder = bkey_to_cacheline(b, t, k);
-
- if (inorder &&
- inorder < t->size) {
- j = __inorder_to_eytzinger1(inorder, t->size, t->extra);
-
- if (k == tree_to_bkey(b, t, j)) {
- /* Fix the node this key corresponds to */
- make_bfloat(b, t, j, &min_key, &max_key);
-
- /* Children for which this key is the right boundary */
- for (j = eytzinger1_left_child(j);
- j < t->size;
- j = eytzinger1_right_child(j))
- make_bfloat(b, t, j, &min_key, &max_key);
- }
- }
-
- if (inorder + 1 < t->size) {
- j = __inorder_to_eytzinger1(inorder + 1, t->size, t->extra);
-
- if (k == tree_to_prev_bkey(b, t, j)) {
- make_bfloat(b, t, j, &min_key, &max_key);
-
- /* Children for which this key is the left boundary */
- for (j = eytzinger1_right_child(j);
- j < t->size;
- j = eytzinger1_left_child(j))
- make_bfloat(b, t, j, &min_key, &max_key);
- }
- }
-}
-
-/**
- * bch2_bset_fix_invalidated_key() - given an existing key @k that has been
- * modified, fix any auxiliary search tree by remaking all the nodes in the
- * auxiliary search tree that @k corresponds to
- */
-void bch2_bset_fix_invalidated_key(struct btree *b, struct bkey_packed *k)
-{
- struct bset_tree *t = bch2_bkey_to_bset(b, k);
-
- switch (bset_aux_tree_type(t)) {
- case BSET_NO_AUX_TREE:
- break;
- case BSET_RO_AUX_TREE:
- ro_aux_tree_fix_invalidated_key(b, t, k);
- break;
- case BSET_RW_AUX_TREE:
- rw_aux_tree_fix_invalidated_key(b, t, k);
- break;
- }
-}
-
static void bch2_bset_fix_lookup_table(struct btree *b,
struct bset_tree *t,
struct bkey_packed *_where,
n = n * 2 + (cmp < 0);
} while (n < t->size);
- inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra);
+ inorder = __eytzinger1_to_inorder(n >> 1, t->size - 1, t->extra);
/*
* n would have been the node we recursed to - the low bit tells us if
if (unlikely(!inorder))
return btree_bkey_first(b, t);
- f = &base->f[eytzinger1_prev(n >> 1, t->size)];
+ f = &base->f[eytzinger1_prev(n >> 1, t->size - 1)];
}
return cacheline_to_bkey(b, t, inorder, f->key_offset);
EBUG_ON(iter->data->k > iter->data->end);
- while (!__btree_node_iter_set_end(iter, 0) &&
- !__bch2_btree_node_iter_peek_all(iter, b)->u64s)
- iter->data->k++;
-
if (unlikely(__btree_node_iter_set_end(iter, 0))) {
bch2_btree_node_iter_set_drop(iter, iter->data);
return;
if (!inorder || inorder >= t->size)
return;
- j = __inorder_to_eytzinger1(inorder, t->size, t->extra);
+ j = __inorder_to_eytzinger1(inorder, t->size - 1, t->extra);
if (k != tree_to_bkey(b, t, j))
return;
void bch2_bset_init_next(struct bch_fs *, struct btree *,
struct btree_node_entry *);
void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
-void bch2_bset_fix_invalidated_key(struct btree *, struct bkey_packed *);
void bch2_bset_insert(struct btree *, struct btree_node_iter *,
struct bkey_packed *, struct bkey_i *, unsigned);
b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
PROT_READ|PROT_WRITE|PROT_EXEC,
MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
+ if (b->aux_data == MAP_FAILED)
+ b->aux_data = NULL;
#endif
if (!b->aux_data) {
kvpfree(b->data, btree_bytes(c));
void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
- rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
+ int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
+ BUG_ON(ret);
/* Cause future lookups for this node to fail: */
b->hash_val = 0;
unsigned long touched = 0;
unsigned long freed = 0;
unsigned i, flags;
+ unsigned long ret = SHRINK_STOP;
if (bch2_btree_shrinker_disabled)
return SHRINK_STOP;
if (sc->gfp_mask & __GFP_FS)
mutex_lock(&bc->lock);
else if (!mutex_trylock(&bc->lock))
- return -1;
+ goto out_norestore;
flags = memalloc_nofs_save();
i = 0;
list_for_each_entry_safe(b, t, &bc->freeable, list) {
+ /*
+ * Leave a few nodes on the freeable list, so that a btree split
+ * won't have to hit the system allocator:
+ */
+ if (++i <= 3)
+ continue;
+
touched++;
- if (freed >= nr)
+ if (touched >= nr)
break;
- if (++i > 3 &&
- !btree_node_reclaim(c, b)) {
+ if (!btree_node_reclaim(c, b)) {
btree_node_data_free(c, b);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
list_for_each_entry_safe(b, t, &bc->live, list) {
touched++;
- if (freed >= nr) {
+ if (touched >= nr) {
/* Save position */
if (&t->list != &bc->live)
list_move_tail(&bc->live, &t->list);
mutex_unlock(&bc->lock);
out:
+ ret = (unsigned long) freed * btree_pages(c);
memalloc_nofs_restore(flags);
- return (unsigned long) freed * btree_pages(c);
+out_norestore:
+ trace_btree_cache_scan(sc->nr_to_scan,
+ sc->nr_to_scan / btree_pages(c),
+ btree_cache_can_free(bc),
+ ret);
+ return ret;
}
static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
/* Slowpath, don't want it inlined into btree_iter_traverse() */
static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
- struct btree_iter *iter,
+ struct btree_trans *trans,
+ struct btree_path *path,
const struct bkey_i *k,
enum btree_id btree_id,
unsigned level,
* Parent node must be locked, else we could read in a btree node that's
* been freed:
*/
- if (iter && !bch2_btree_node_relock(iter, level + 1)) {
- btree_trans_restart(iter->trans);
+ if (trans && !bch2_btree_node_relock(trans, path, level + 1)) {
+ trace_trans_restart_relock_parent_for_fill(trans->fn,
+ _THIS_IP_, btree_id, &path->pos);
+ btree_trans_restart(trans);
return ERR_PTR(-EINTR);
}
six_unlock_intent(&b->c.lock);
/* Unlock before doing IO: */
- if (iter && sync)
- bch2_trans_unlock(iter->trans);
+ if (trans && sync)
+ bch2_trans_unlock(trans);
bch2_btree_node_read(c, b, sync);
if (!sync)
return NULL;
- if (iter &&
- (!bch2_trans_relock(iter->trans) ||
- !bch2_btree_iter_relock_intent(iter))) {
- BUG_ON(!iter->trans->restarted);
+ if (trans &&
+ (!bch2_trans_relock(trans) ||
+ !bch2_btree_path_relock_intent(trans, path))) {
+ BUG_ON(!trans->restarted);
return ERR_PTR(-EINTR);
}
if (!six_relock_type(&b->c.lock, lock_type, seq)) {
- btree_trans_restart(iter->trans);
+ trace_trans_restart_relock_after_fill(trans->fn, _THIS_IP_,
+ btree_id, &path->pos);
+ btree_trans_restart(trans);
return ERR_PTR(-EINTR);
}
* The btree node will have either a read or a write lock held, depending on
* the @write parameter.
*/
-struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_iter *iter,
+struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
const struct bkey_i *k, unsigned level,
enum six_lock_type lock_type,
unsigned long trace_ip)
EBUG_ON(level >= BTREE_MAX_DEPTH);
- if (c->opts.btree_node_mem_ptr_optimization) {
- b = btree_node_mem_ptr(k);
- if (b)
+ b = btree_node_mem_ptr(k);
+
+ /*
+ * Check b->hash_val _before_ calling btree_node_lock() - this might not
+ * be the node we want anymore, and trying to lock the wrong node could
+	 * cause an unnecessary transaction restart:
+ */
+ if (likely(c->opts.btree_node_mem_ptr_optimization &&
+ b &&
+ b->hash_val == btree_ptr_hash_val(k)))
goto lock_node;
- }
retry:
b = btree_cache_find(bc, k);
if (unlikely(!b)) {
* else we could read in a btree node from disk that's been
* freed:
*/
- b = bch2_btree_node_fill(c, iter, k, iter->btree_id,
+ b = bch2_btree_node_fill(c, trans, path, k, path->btree_id,
level, lock_type, true);
/* We raced and found the btree node in the cache */
* the parent was modified, when the pointer to the node we want
* was removed - and we'll bail out:
*/
- if (btree_node_read_locked(iter, level + 1))
- btree_node_unlock(iter, level + 1);
+ if (btree_node_read_locked(path, level + 1))
+ btree_node_unlock(path, level + 1);
- if (!btree_node_lock(b, k->k.p, level, iter, lock_type,
+ if (!btree_node_lock(trans, path, b, k->k.p, level, lock_type,
lock_node_check_fn, (void *) k, trace_ip)) {
if (!trans->restarted)
goto retry;
b->c.level != level ||
race_fault())) {
six_unlock_type(&b->c.lock, lock_type);
- if (bch2_btree_node_relock(iter, level + 1))
+ if (bch2_btree_node_relock(trans, path, level + 1))
goto retry;
- trace_trans_restart_btree_node_reused(trans->ip,
+ trace_trans_restart_btree_node_reused(trans->fn,
trace_ip,
- iter->btree_id,
- &iter->real_pos);
+ path->btree_id,
+ &path->pos);
btree_trans_restart(trans);
return ERR_PTR(-EINTR);
}
bch2_btree_node_wait_on_read(b);
/*
- * should_be_locked is not set on this iterator yet, so we need
- * to relock it specifically:
+ * should_be_locked is not set on this path yet, so we need to
+ * relock it specifically:
*/
- if (iter &&
+ if (trans &&
(!bch2_trans_relock(trans) ||
- !bch2_btree_iter_relock_intent(iter))) {
+ !bch2_btree_path_relock_intent(trans, path))) {
BUG_ON(!trans->restarted);
return ERR_PTR(-EINTR);
}
return ERR_PTR(-EIO);
}
- EBUG_ON(b->c.btree_id != iter->btree_id);
+ EBUG_ON(b->c.btree_id != path->btree_id);
EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
btree_check_header(c, b);
if (nofill)
goto out;
- b = bch2_btree_node_fill(c, NULL, k, btree_id,
+ b = bch2_btree_node_fill(c, NULL, NULL, k, btree_id,
level, SIX_LOCK_read, true);
/* We raced and found the btree node in the cache */
return b;
}
-int bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter,
+int bch2_btree_node_prefetch(struct bch_fs *c,
+ struct btree_trans *trans,
+ struct btree_path *path,
const struct bkey_i *k,
enum btree_id btree_id, unsigned level)
{
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
- BUG_ON(iter && !btree_node_locked(iter, level + 1));
+ BUG_ON(trans && !btree_node_locked(path, level + 1));
BUG_ON(level >= BTREE_MAX_DEPTH);
b = btree_cache_find(bc, k);
if (b)
return 0;
- b = bch2_btree_node_fill(c, iter, k, btree_id, level, SIX_LOCK_read, false);
+ b = bch2_btree_node_fill(c, trans, path, k, btree_id,
+ level, SIX_LOCK_read, false);
return PTR_ERR_OR_ZERO(b);
}
struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
struct btree *bch2_btree_node_mem_alloc(struct bch_fs *);
-struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_iter *,
+struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *,
const struct bkey_i *, unsigned,
enum six_lock_type, unsigned long);
struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *,
enum btree_id, unsigned, bool);
-int bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *,
+int bch2_btree_node_prefetch(struct bch_fs *, struct btree_trans *, struct btree_path *,
const struct bkey_i *, enum btree_id, unsigned);
void bch2_btree_node_evict(struct bch_fs *, const struct bkey_i *);
static inline size_t btree_bytes(struct bch_fs *c)
{
- return c->opts.btree_node_size << 9;
+ return c->opts.btree_node_size;
}
static inline size_t btree_max_u64s(struct bch_fs *c)
static inline unsigned btree_blocks(struct bch_fs *c)
{
- return c->opts.btree_node_size >> c->block_bits;
+ return btree_sectors(c) >> c->block_bits;
}
#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 2 / 3)
#include "alloc_foreground.h"
#include "bkey_methods.h"
#include "bkey_buf.h"
+#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update_interior.h"
#include "btree_io.h"
}
}
+static void bch2_btree_node_update_key_early(struct bch_fs *c,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c old, struct bkey_i *new)
+{
+ struct btree *b;
+ struct bkey_buf tmp;
+ int ret;
+
+ bch2_bkey_buf_init(&tmp);
+ bch2_bkey_buf_reassemble(&tmp, c, old);
+
+ b = bch2_btree_node_get_noiter(c, tmp.k, btree, level, true);
+ if (!IS_ERR_OR_NULL(b)) {
+ mutex_lock(&c->btree_cache.lock);
+
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+ bkey_copy(&b->key, new);
+ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
+ BUG_ON(ret);
+
+ mutex_unlock(&c->btree_cache.lock);
+ six_unlock_read(&b->c.lock);
+ }
+
+ bch2_bkey_buf_exit(&tmp, c);
+}
+
static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
{
struct bkey_i_btree_ptr_v2 *new;
new->v.min_key = new_min;
SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
- ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level + 1, &new->k_i);
+ ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
if (ret) {
kfree(new);
return ret;
new->k.p = new_max;
SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
- ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level + 1, &new->k_i);
+ ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
if (ret) {
kfree(new);
return ret;
char buf[200];
int ret = 0;
+ /*
+ * XXX
+ * use check_bucket_ref here
+ */
bkey_for_each_ptr_decode(k->k, ptrs, p, entry) {
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
- struct bucket *g = PTR_BUCKET(ca, &p.ptr, true);
- struct bucket *g2 = PTR_BUCKET(ca, &p.ptr, false);
+ struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr);
- if (fsck_err_on(g->mark.data_type &&
- g->mark.data_type != data_type, c,
- "bucket %u:%zu different types of data in same bucket: %s, %s\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_types[g->mark.data_type],
- bch2_data_types[data_type],
- (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
- if (data_type == BCH_DATA_btree) {
- g2->_mark.data_type = g->_mark.data_type = data_type;
- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
- } else {
- do_update = true;
- }
- }
-
if (fsck_err_on(!g->gen_valid, c,
"bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
"while marking %s",
p.ptr.gen,
(bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
if (!p.ptr.cached) {
- g2->_mark.gen = g->_mark.gen = p.ptr.gen;
- g2->gen_valid = g->gen_valid = true;
- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
+ g->_mark.gen = p.ptr.gen;
+ g->gen_valid = true;
} else {
do_update = true;
}
p.ptr.gen, g->mark.gen,
(bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
if (!p.ptr.cached) {
- g2->_mark.gen = g->_mark.gen = p.ptr.gen;
- g2->gen_valid = g->gen_valid = true;
- g2->_mark.data_type = 0;
- g2->_mark.dirty_sectors = 0;
- g2->_mark.cached_sectors = 0;
+ g->_mark.gen = p.ptr.gen;
+ g->gen_valid = true;
+ g->_mark.data_type = 0;
+ g->_mark.dirty_sectors = 0;
+ g->_mark.cached_sectors = 0;
set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
} else {
do_update = true;
}
}
+ if (fsck_err_on(gen_cmp(g->mark.gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c,
+ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->mark.gen,
+ bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ p.ptr.gen,
+ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf)))
+ do_update = true;
+
if (fsck_err_on(!p.ptr.cached &&
gen_cmp(p.ptr.gen, g->mark.gen) < 0, c,
"bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
(bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf)))
do_update = true;
+ if (data_type != BCH_DATA_btree && p.ptr.gen != g->mark.gen)
+ continue;
+
+ if (fsck_err_on(g->mark.data_type &&
+ g->mark.data_type != data_type, c,
+ "bucket %u:%zu different types of data in same bucket: %s, %s\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_types[g->mark.data_type],
+ bch2_data_types[data_type],
+ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
+ if (data_type == BCH_DATA_btree) {
+ g->_mark.data_type = data_type;
+ set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+ } else {
+ do_update = true;
+ }
+ }
+
if (p.has_ec) {
- struct stripe *m = genradix_ptr(&c->stripes[true], p.ec.idx);
+ struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx);
if (fsck_err_on(!m || !m->alive, c,
"pointer to nonexistent stripe %llu\n"
ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
bkey_for_each_ptr(ptrs, ptr) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g = PTR_BUCKET(ca, ptr, true);
+ struct bucket *g = PTR_GC_BUCKET(ca, ptr);
ptr->gen = g->mark.gen;
}
} else {
bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g = PTR_BUCKET(ca, ptr, true);
+ struct bucket *g = PTR_GC_BUCKET(ca, ptr);
enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr);
(ptr->cached &&
(!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) ||
(!ptr->cached &&
gen_cmp(ptr->gen, g->mark.gen) < 0) ||
+ gen_cmp(g->mark.gen, ptr->gen) > BUCKET_GC_GEN_MAX ||
(g->mark.data_type &&
g->mark.data_type != data_type);
}));
ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
bkey_extent_entry_for_each(ptrs, entry) {
if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) {
- struct stripe *m = genradix_ptr(&c->stripes[true],
+ struct gc_stripe *m = genradix_ptr(&c->gc_stripes,
entry->stripe_ptr.idx);
union bch_extent_entry *next_ptr;
}
}
- ret = bch2_journal_key_insert(c, btree_id, level, new);
- if (ret)
+ ret = bch2_journal_key_insert_take(c, btree_id, level, new);
+ if (ret) {
kfree(new);
- else
- *k = bkey_i_to_s_c(new);
+ return ret;
+ }
+
+ if (level)
+ bch2_btree_node_update_key_early(c, btree_id, level - 1, *k, new);
+
+ bch2_bkey_val_to_text(&PBUF(buf), c, *k);
+ bch_info(c, "updated %s", buf);
+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(new));
+ bch_info(c, "new key %s", buf);
+ *k = bkey_i_to_s_c(new);
}
fsck_err:
return ret;
/* marking of btree keys/nodes: */
-static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id,
+static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
unsigned level, bool is_root,
struct bkey_s_c *k,
- u8 *max_stale, bool initial)
+ bool initial)
{
- struct bkey_ptrs_c ptrs;
- const struct bch_extent_ptr *ptr;
+ struct bch_fs *c = trans->c;
+ struct bkey deleted = KEY(0, 0, 0);
+ struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
unsigned flags =
- BTREE_TRIGGER_INSERT|
BTREE_TRIGGER_GC|
(initial ? BTREE_TRIGGER_NOATOMIC : 0);
int ret = 0;
+ deleted.p = k->k->p;
+
if (initial) {
BUG_ON(bch2_journal_seq_verify &&
k->k->version.lo > journal_cur_seq(&c->journal));
k->k->version.lo,
atomic64_read(&c->key_version)))
atomic64_set(&c->key_version, k->k->version.lo);
-
- if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
- fsck_err_on(!bch2_bkey_replicas_marked(c, *k), c,
- "superblock not marked as containing replicas (type %u)",
- k->k->type)) {
- ret = bch2_mark_bkey_replicas(c, *k);
- if (ret) {
- bch_err(c, "error marking bkey replicas: %i", ret);
- goto err;
- }
- }
}
- ptrs = bch2_bkey_ptrs_c(*k);
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g = PTR_BUCKET(ca, ptr, true);
-
- if (gen_after(g->oldest_gen, ptr->gen))
- g->oldest_gen = ptr->gen;
-
- *max_stale = max(*max_stale, ptr_stale(ca, ptr));
- }
-
- bch2_mark_key(c, *k, flags);
+ ret = bch2_mark_key(trans, old, *k, flags);
fsck_err:
err:
if (ret)
return ret;
}
-static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
- bool initial)
+static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial)
{
+ struct bch_fs *c = trans->c;
struct btree_node_iter iter;
struct bkey unpacked;
struct bkey_s_c k;
struct bkey_buf prev, cur;
int ret = 0;
- *max_stale = 0;
-
if (!btree_node_type_needs_gc(btree_node_type(b)))
return 0;
bkey_init(&prev.k->k);
while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) {
- ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false,
- &k, max_stale, initial);
+ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false,
+ &k, initial);
if (ret)
break;
return ret;
}
-static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
+static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id,
bool initial, bool metadata_only)
{
- struct btree_trans trans;
- struct btree_iter *iter;
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
struct btree *b;
unsigned depth = metadata_only ? 1
: bch2_expensive_debug_checks ? 0
: !btree_node_type_needs_gc(btree_id) ? 1
: 0;
- u8 max_stale = 0;
int ret = 0;
- bch2_trans_init(&trans, c, 0, 0);
-
gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));
- __for_each_btree_node(&trans, iter, btree_id, POS_MIN,
- 0, depth, BTREE_ITER_PREFETCH, b) {
+ __for_each_btree_node(trans, iter, btree_id, POS_MIN,
+ 0, depth, BTREE_ITER_PREFETCH, b, ret) {
bch2_verify_btree_nr_keys(b);
gc_pos_set(c, gc_pos_btree_node(b));
- ret = btree_gc_mark_node(c, b, &max_stale, initial);
+ ret = btree_gc_mark_node(trans, b, initial);
if (ret)
break;
-
- if (!initial) {
- if (max_stale > 64)
- bch2_btree_node_rewrite(&trans, iter,
- b->data->keys.seq,
- BTREE_INSERT_NOWAIT|
- BTREE_INSERT_GC_LOCK_HELD);
- else if (!bch2_btree_gc_rewrite_disabled &&
- (bch2_btree_gc_always_rewrite || max_stale > 16))
- bch2_btree_node_rewrite(&trans, iter,
- b->data->keys.seq,
- BTREE_INSERT_NOWAIT|
- BTREE_INSERT_GC_LOCK_HELD);
- }
-
- bch2_trans_cond_resched(&trans);
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
- ret = bch2_trans_exit(&trans) ?: ret;
if (ret)
return ret;
if (!btree_node_fake(b)) {
struct bkey_s_c k = bkey_i_to_s_c(&b->key);
- ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true,
- &k, &max_stale, initial);
+ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level,
+ true, &k, initial);
}
gc_pos_set(c, gc_pos_btree_root(b->c.btree_id));
mutex_unlock(&c->btree_root_lock);
return ret;
}
-static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
+static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b,
unsigned target_depth)
{
+ struct bch_fs *c = trans->c;
struct btree_and_journal_iter iter;
struct bkey_s_c k;
struct bkey_buf cur, prev;
- u8 max_stale = 0;
char buf[200];
int ret = 0;
BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0);
BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0);
- ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false,
- &k, &max_stale, true);
+ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level,
+ false, &k, true);
if (ret) {
bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret);
goto fsck_err;
break;
}
- ret = bch2_gc_btree_init_recurse(c, child,
+ ret = bch2_gc_btree_init_recurse(trans, child,
target_depth);
six_unlock_read(&child->c.lock);
return ret;
}
-static int bch2_gc_btree_init(struct bch_fs *c,
+static int bch2_gc_btree_init(struct btree_trans *trans,
enum btree_id btree_id,
bool metadata_only)
{
+ struct bch_fs *c = trans->c;
struct btree *b;
unsigned target_depth = metadata_only ? 1
: bch2_expensive_debug_checks ? 0
: !btree_node_type_needs_gc(btree_id) ? 1
: 0;
- u8 max_stale = 0;
char buf[100];
int ret = 0;
}
if (b->c.level >= target_depth)
- ret = bch2_gc_btree_init_recurse(c, b, target_depth);
+ ret = bch2_gc_btree_init_recurse(trans, b, target_depth);
if (!ret) {
struct bkey_s_c k = bkey_i_to_s_c(&b->key);
- ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true,
- &k, &max_stale, true);
+ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, true,
+ &k, true);
}
fsck_err:
six_unlock_read(&b->c.lock);
static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
{
+ struct btree_trans trans;
enum btree_id ids[BTREE_ID_NR];
unsigned i;
int ret = 0;
+ bch2_trans_init(&trans, c, 0, 0);
+
for (i = 0; i < BTREE_ID_NR; i++)
ids[i] = i;
bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
for (i = 0; i < BTREE_ID_NR && !ret; i++)
ret = initial
- ? bch2_gc_btree_init(c, ids[i], metadata_only)
- : bch2_gc_btree(c, ids[i], initial, metadata_only);
+ ? bch2_gc_btree_init(&trans, ids[i], metadata_only)
+ : bch2_gc_btree(&trans, ids[i], initial, metadata_only);
if (ret < 0)
bch_err(c, "%s: ret %i", __func__, ret);
+
+ bch2_trans_exit(&trans);
return ret;
}
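
A side note on the loop above: GC has to mark the btrees in gc-phase order, which is why the IDs are bubble-sorted with btree_id_gc_phase_cmp() before the pass. The standalone sketch below shows just that sort-by-comparator shape; the two-phase mapping in gc_phase() is invented for illustration and is not the kernel's real phase table.

#include <stdio.h>

#define cmp_int(l, r)	(((l) > (r)) - ((l) < (r)))

/* hypothetical phase mapping: alloc-like btrees first, data btrees after */
static int gc_phase(int btree_id)
{
	return btree_id == 0 ? 0 : 1;
}

static int btree_id_gc_phase_cmp(int l, int r)
{
	return cmp_int(gc_phase(l), gc_phase(r));
}

int main(void)
{
	int ids[] = { 3, 0, 2, 1 };
	int n = sizeof(ids) / sizeof(ids[0]), i, j;

	/* bubble sort: stable, and perfectly adequate for a handful of IDs */
	for (i = 0; i < n; i++)
		for (j = 0; j + 1 < n - i; j++)
			if (btree_id_gc_phase_cmp(ids[j], ids[j + 1]) > 0) {
				int t = ids[j];
				ids[j] = ids[j + 1];
				ids[j + 1] = t;
			}

	for (i = 0; i < n; i++)
		printf("%d ", ids[i]);	/* 0 3 2 1: the phase-0 tree first */
	printf("\n");
	return 0;
}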
} while (start < end);
}
-void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
- unsigned flags)
+static void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
+ unsigned flags)
{
struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
unsigned i;
u64 b;
- /*
- * This conditional is kind of gross, but we may be called from the
- * device add path, before the new device has actually been added to the
- * running filesystem:
- */
- if (c) {
- lockdep_assert_held(&c->sb_lock);
- percpu_down_read(&c->mark_lock);
- }
-
for (i = 0; i < layout->nr_superblocks; i++) {
u64 offset = le64_to_cpu(layout->sb_offset[i]);
ca->mi.bucket_size,
gc_phase(GC_PHASE_SB), flags);
}
-
- if (c)
- percpu_up_read(&c->mark_lock);
}
static void bch2_mark_superblocks(struct bch_fs *c)
for_each_pending_btree_node_free(c, as, d)
if (d->index_update_done)
- bch2_mark_key(c, bkey_i_to_s_c(&d->key),
- BTREE_TRIGGER_INSERT|BTREE_TRIGGER_GC);
+ bch2_mark_key(c, bkey_i_to_s_c(&d->key), BTREE_TRIGGER_GC);
mutex_unlock(&c->btree_interior_update_lock);
}
struct bch_dev *ca;
unsigned i;
- genradix_free(&c->stripes[1]);
+ genradix_free(&c->reflink_gc_table);
+ genradix_free(&c->gc_stripes);
for_each_member_device(ca, c, i) {
kvpfree(rcu_dereference_protected(ca->buckets[1], 1),
unsigned i, dev;
int ret = 0;
+ percpu_down_write(&c->mark_lock);
+
#define copy_field(_f, _msg, ...) \
if (dst->_f != src->_f) { \
if (verify) \
fsck_err(c, _msg ": got %llu, should be %llu" \
, ##__VA_ARGS__, dst->_f, src->_f); \
dst->_f = src->_f; \
- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \
}
#define copy_stripe_field(_f, _msg, ...) \
if (dst->_f != src->_f) { \
iter.pos, ##__VA_ARGS__, \
dst->_f, src->_f); \
dst->_f = src->_f; \
- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \
- }
-#define copy_bucket_field(_f) \
- if (dst->b[b].mark._f != src->b[b].mark._f) { \
- if (verify) \
- fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f \
- ": got %u, should be %u", dev, b, \
- dst->b[b].mark.gen, \
- bch2_data_types[dst->b[b].mark.data_type],\
- dst->b[b].mark._f, src->b[b].mark._f); \
- dst->b[b]._mark._f = src->b[b].mark._f; \
- set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \
}
#define copy_dev_field(_f, _msg, ...) \
copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__)
#define copy_fs_field(_f, _msg, ...) \
copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__)
- if (!metadata_only) {
- struct genradix_iter iter = genradix_iter_init(&c->stripes[1], 0);
- struct stripe *dst, *src;
-
- while ((src = genradix_iter_peek(&iter, &c->stripes[1]))) {
- dst = genradix_ptr_alloc(&c->stripes[0], iter.pos, GFP_KERNEL);
-
- if (dst->alive != src->alive ||
- dst->sectors != src->sectors ||
- dst->algorithm != src->algorithm ||
- dst->nr_blocks != src->nr_blocks ||
- dst->nr_redundant != src->nr_redundant) {
- bch_err(c, "unexpected stripe inconsistency at bch2_gc_done, confused");
- ret = -EINVAL;
- goto fsck_err;
- }
-
- for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++)
- copy_stripe_field(block_sectors[i],
- "block_sectors[%u]", i);
-
- dst->blocks_nonempty = 0;
- for (i = 0; i < dst->nr_blocks; i++)
- dst->blocks_nonempty += dst->block_sectors[i] != 0;
-
- genradix_iter_advance(&iter, &c->stripes[1]);
- }
- }
-
for (i = 0; i < ARRAY_SIZE(c->usage); i++)
bch2_fs_usage_acc_to_base(c, i);
for_each_member_device(ca, c, dev) {
- struct bucket_array *dst = __bucket_array(ca, 0);
- struct bucket_array *src = __bucket_array(ca, 1);
- size_t b;
-
- for (b = 0; b < src->nbuckets; b++) {
- copy_bucket_field(gen);
- copy_bucket_field(data_type);
- copy_bucket_field(stripe);
- copy_bucket_field(dirty_sectors);
- copy_bucket_field(cached_sectors);
-
- dst->b[b].oldest_gen = src->b[b].oldest_gen;
- }
-
- {
- struct bch_dev_usage *dst = ca->usage_base;
- struct bch_dev_usage *src = (void *)
- bch2_acc_percpu_u64s((void *) ca->usage_gc,
- dev_usage_u64s());
-
- copy_dev_field(buckets_ec, "buckets_ec");
- copy_dev_field(buckets_unavailable, "buckets_unavailable");
-
- for (i = 0; i < BCH_DATA_NR; i++) {
- copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]);
- copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]);
- copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]);
- }
+ struct bch_dev_usage *dst = ca->usage_base;
+ struct bch_dev_usage *src = (void *)
+ bch2_acc_percpu_u64s((void *) ca->usage_gc,
+ dev_usage_u64s());
+
+ copy_dev_field(buckets_ec, "buckets_ec");
+ copy_dev_field(buckets_unavailable, "buckets_unavailable");
+
+ for (i = 0; i < BCH_DATA_NR; i++) {
+ copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]);
+ copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]);
+ copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]);
}
};
#undef copy_fs_field
#undef copy_dev_field
-#undef copy_bucket_field
#undef copy_stripe_field
#undef copy_field
fsck_err:
percpu_ref_put(&ca->ref);
if (ret)
bch_err(c, "%s: ret %i", __func__, ret);
+
+ percpu_up_write(&c->mark_lock);
return ret;
}
{
struct bch_dev *ca = NULL;
unsigned i;
- int ret;
BUG_ON(c->usage_gc);
BUG_ON(ca->buckets[1]);
BUG_ON(ca->usage_gc);
- ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) +
- ca->mi.nbuckets * sizeof(struct bucket),
- GFP_KERNEL|__GFP_ZERO);
- if (!ca->buckets[1]) {
- percpu_ref_put(&ca->ref);
- bch_err(c, "error allocating ca->buckets[gc]");
- return -ENOMEM;
- }
-
ca->usage_gc = alloc_percpu(struct bch_dev_usage);
if (!ca->usage_gc) {
bch_err(c, "error allocating ca->usage_gc");
}
}
- ret = bch2_ec_mem_alloc(c, true);
- if (ret) {
- bch_err(c, "error allocating ec gc mem");
- return ret;
- }
+ return 0;
+}
- percpu_down_write(&c->mark_lock);
+static int bch2_alloc_write_key(struct btree_trans *trans,
+ struct btree_iter *iter,
+ bool initial, bool metadata_only)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode);
+ struct bucket *g;
+ struct bkey_s_c k;
+ struct bkey_alloc_unpacked old_u, new_u, gc_u;
+ struct bkey_alloc_buf *a;
+ int ret;
- /*
- * indicate to stripe code that we need to allocate for the gc stripes
- * radix tree, too
- */
- gc_pos_set(c, gc_phase(GC_PHASE_START));
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
- for_each_member_device(ca, c, i) {
- struct bucket_array *dst = __bucket_array(ca, 1);
- struct bucket_array *src = __bucket_array(ca, 0);
- size_t b;
+ old_u = new_u = bch2_alloc_unpack(k);
- dst->first_bucket = src->first_bucket;
- dst->nbuckets = src->nbuckets;
+ percpu_down_read(&c->mark_lock);
+ g = gc_bucket(ca, iter->pos.offset);
+ gc_u = (struct bkey_alloc_unpacked) {
+ .dev = iter->pos.inode,
+ .bucket = iter->pos.offset,
+ .gen = g->mark.gen,
+ .data_type = g->mark.data_type,
+ .dirty_sectors = g->mark.dirty_sectors,
+ .cached_sectors = g->mark.cached_sectors,
+ .read_time = g->io_time[READ],
+ .write_time = g->io_time[WRITE],
+ .stripe = g->stripe,
+ .stripe_redundancy = g->stripe_redundancy,
+ };
+ percpu_up_read(&c->mark_lock);
- for (b = 0; b < src->nbuckets; b++) {
- struct bucket *d = &dst->b[b];
- struct bucket *s = &src->b[b];
+ if (metadata_only &&
+ gc_u.data_type != BCH_DATA_sb &&
+ gc_u.data_type != BCH_DATA_journal &&
+ gc_u.data_type != BCH_DATA_btree)
+ return 0;
- d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen;
- d->gen_valid = s->gen_valid;
+ if (gen_after(old_u.gen, gc_u.gen))
+ return 0;
- if (metadata_only &&
- (s->mark.data_type == BCH_DATA_user ||
- s->mark.data_type == BCH_DATA_cached))
- d->_mark = s->mark;
- }
- };
+#define copy_bucket_field(_f) \
+ if (fsck_err_on(new_u._f != gc_u._f, c, \
+ "bucket %llu:%llu gen %u data type %s has wrong " #_f \
+ ": got %u, should be %u", \
+ iter->pos.inode, iter->pos.offset, \
+ new_u.gen, \
+ bch2_data_types[new_u.data_type], \
+ new_u._f, gc_u._f)) \
+ new_u._f = gc_u._f; \
+
+ copy_bucket_field(gen);
+ copy_bucket_field(data_type);
+ copy_bucket_field(stripe);
+ copy_bucket_field(dirty_sectors);
+ copy_bucket_field(cached_sectors);
+ copy_bucket_field(stripe_redundancy);
+#undef copy_bucket_field
- percpu_up_write(&c->mark_lock);
+ if (!bkey_alloc_unpacked_cmp(old_u, new_u))
+ return 0;
- return 0;
+ a = bch2_alloc_pack(trans, new_u);
+ if (IS_ERR(a))
+ return PTR_ERR(a);
+
+ ret = initial
+ ? bch2_journal_key_insert(c, BTREE_ID_alloc, 0, &a->k)
+ : bch2_trans_update(trans, iter, &a->k, BTREE_TRIGGER_NORUN);
+fsck_err:
+ return ret;
}
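
The copy_bucket_field() macro above follows the compare/report/repair pattern this file uses for fsck: when the on-disk field disagrees with what GC computed, report the mismatch via fsck_err_on() and take the GC value. A minimal userspace sketch of the same macro shape, with fsck_err_on() replaced by an invented report() helper:

#include <stdio.h>

struct alloc_u { unsigned gen, dirty_sectors, cached_sectors; };

/* stand-in for fsck_err_on(): log the mismatch, tell the caller to repair */
static int report(const char *field, unsigned got, unsigned want)
{
	fprintf(stderr, "bucket has wrong %s: got %u, should be %u\n",
		field, got, want);
	return 1;
}

#define copy_bucket_field(_f)						\
	if (new_u._f != gc_u._f &&					\
	    report(#_f, new_u._f, gc_u._f))				\
		new_u._f = gc_u._f;

int main(void)
{
	struct alloc_u new_u = { .gen = 3, .dirty_sectors = 8 };
	struct alloc_u gc_u  = { .gen = 4, .dirty_sectors = 8,
				 .cached_sectors = 16 };

	copy_bucket_field(gen);
	copy_bucket_field(dirty_sectors);
	copy_bucket_field(cached_sectors);

	printf("repaired: gen=%u dirty=%u cached=%u\n",
	       new_u.gen, new_u.dirty_sectors, new_u.cached_sectors);
	return 0;
}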
-static int bch2_gc_reflink_done_initial_fn(struct bch_fs *c, struct bkey_s_c k)
+static int bch2_gc_alloc_done(struct bch_fs *c, bool initial, bool metadata_only)
{
- struct reflink_gc *r;
- const __le64 *refcount = bkey_refcount_c(k);
- char buf[200];
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_dev *ca;
+ unsigned i;
int ret = 0;
- if (!refcount)
- return 0;
+ bch2_trans_init(&trans, c, 0, 0);
- r = genradix_ptr(&c->reflink_gc_table, c->reflink_gc_idx++);
- if (!r)
- return -ENOMEM;
+ for_each_member_device(ca, c, i) {
+ for_each_btree_key(&trans, iter, BTREE_ID_alloc,
+ POS(ca->dev_idx, ca->mi.first_bucket),
+ BTREE_ITER_SLOTS|
+ BTREE_ITER_PREFETCH, k, ret) {
+ if (bkey_cmp(iter.pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0)
+ break;
- if (!r ||
- r->offset != k.k->p.offset ||
- r->size != k.k->size) {
- bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
- return -EINVAL;
+ ret = __bch2_trans_do(&trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW,
+ bch2_alloc_write_key(&trans, &iter,
+ initial, metadata_only));
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+
+ if (ret) {
+ bch_err(c, "error writing alloc info: %i", ret);
+ percpu_ref_put(&ca->ref);
+ break;
+ }
}
- if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
- "reflink key has wrong refcount:\n"
- " %s\n"
- " should be %u",
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
- r->refcount)) {
- struct bkey_i *new;
+ bch2_trans_exit(&trans);
+ return ret;
+}
- new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
- if (!new) {
- ret = -ENOMEM;
- goto fsck_err;
+static int bch2_gc_alloc_start(struct bch_fs *c, bool initial, bool metadata_only)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ for_each_member_device(ca, c, i) {
+ struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) +
+ ca->mi.nbuckets * sizeof(struct bucket),
+ GFP_KERNEL|__GFP_ZERO);
+ if (!buckets) {
+ percpu_ref_put(&ca->ref);
+ bch_err(c, "error allocating ca->buckets[gc]");
+ return -ENOMEM;
}
- bkey_reassemble(new, k);
+ buckets->first_bucket = ca->mi.first_bucket;
+ buckets->nbuckets = ca->mi.nbuckets;
+ rcu_assign_pointer(ca->buckets[1], buckets);
+ };
- if (!r->refcount) {
- new->k.type = KEY_TYPE_deleted;
- new->k.size = 0;
- } else {
- *bkey_refcount(new) = cpu_to_le64(r->refcount);
- }
+ return bch2_alloc_read(c, true, metadata_only);
+}
- ret = bch2_journal_key_insert(c, BTREE_ID_reflink, 0, new);
- if (ret)
- kfree(new);
- }
-fsck_err:
- return ret;
+static void bch2_gc_alloc_reset(struct bch_fs *c, bool initial, bool metadata_only)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ for_each_member_device(ca, c, i) {
+ struct bucket_array *buckets = __bucket_array(ca, true);
+ struct bucket *g;
+
+ for_each_bucket(g, buckets) {
+ if (metadata_only &&
+ (g->mark.data_type == BCH_DATA_user ||
+ g->mark.data_type == BCH_DATA_cached ||
+ g->mark.data_type == BCH_DATA_parity))
+ continue;
+ g->_mark.dirty_sectors = 0;
+ g->_mark.cached_sectors = 0;
+ }
+ };
}
static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
bool metadata_only)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
struct reflink_gc *r;
size_t idx = 0;
if (metadata_only)
return 0;
- if (initial) {
- c->reflink_gc_idx = 0;
-
- ret = bch2_btree_and_journal_walk(c, BTREE_ID_reflink,
- bch2_gc_reflink_done_initial_fn);
- goto out;
- }
-
bch2_trans_init(&trans, c, 0, 0);
for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
if (!refcount)
continue;
- r = genradix_ptr(&c->reflink_gc_table, idx);
+ r = genradix_ptr(&c->reflink_gc_table, idx++);
if (!r ||
r->offset != k.k->p.offset ||
r->size != k.k->size) {
bkey_reassemble(new, k);
- if (!r->refcount)
+ if (!r->refcount) {
new->k.type = KEY_TYPE_deleted;
- else
+ /*
+ * XXX ugly: bch2_journal_key_insert() queues up
+ * the key for the journal replay code, which
+ * doesn't run the extent overwrite pass
+ */
+ if (initial)
+ new->k.size = 0;
+ } else {
*bkey_refcount(new) = cpu_to_le64(r->refcount);
+ }
- ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ ret = initial
+ ? bch2_journal_key_insert(c, BTREE_ID_reflink, 0, new)
+ : __bch2_trans_do(&trans, NULL, NULL, 0,
__bch2_btree_insert(&trans, BTREE_ID_reflink, new));
kfree(new);
}
}
fsck_err:
- bch2_trans_iter_put(&trans, iter);
- bch2_trans_exit(&trans);
-out:
- genradix_free(&c->reflink_gc_table);
+ bch2_trans_iter_exit(&trans, &iter);
c->reflink_gc_nr = 0;
+ bch2_trans_exit(&trans);
return ret;
}
-static int bch2_gc_reflink_start_initial_fn(struct bch_fs *c, struct bkey_s_c k)
-{
-
- struct reflink_gc *r;
- const __le64 *refcount = bkey_refcount_c(k);
-
- if (!refcount)
- return 0;
-
- r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
- GFP_KERNEL);
- if (!r)
- return -ENOMEM;
-
- r->offset = k.k->p.offset;
- r->size = k.k->size;
- r->refcount = 0;
- return 0;
-}
-
static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
bool metadata_only)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
struct reflink_gc *r;
- int ret;
+ int ret = 0;
if (metadata_only)
return 0;
- genradix_free(&c->reflink_gc_table);
- c->reflink_gc_nr = 0;
-
- if (initial)
- return bch2_btree_and_journal_walk(c, BTREE_ID_reflink,
- bch2_gc_reflink_start_initial_fn);
-
bch2_trans_init(&trans, c, 0, 0);
+ c->reflink_gc_nr = 0;
for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
BTREE_ITER_PREFETCH, k, ret) {
r->size = k.k->size;
r->refcount = 0;
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
- return 0;
+ return ret;
+}
+
+static void bch2_gc_reflink_reset(struct bch_fs *c, bool initial,
+ bool metadata_only)
+{
+ struct genradix_iter iter;
+ struct reflink_gc *r;
+
+ genradix_for_each(&c->reflink_gc_table, iter, r)
+ r->refcount = 0;
+}
+
+static int bch2_gc_stripes_done(struct bch_fs *c, bool initial,
+ bool metadata_only)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct gc_stripe *m;
+ const struct bch_stripe *s;
+ char buf[200];
+ unsigned i;
+ int ret = 0;
+
+ if (metadata_only)
+ return 0;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ if (k.k->type != KEY_TYPE_stripe)
+ continue;
+
+ s = bkey_s_c_to_stripe(k).v;
+ m = genradix_ptr(&c->gc_stripes, k.k->p.offset);
+
+ for (i = 0; i < s->nr_blocks; i++)
+ if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0))
+ goto inconsistent;
+ continue;
+inconsistent:
+ if (fsck_err_on(true, c,
+ "stripe has wrong block sector count %u:\n"
+ " %s\n"
+ " should be %u", i,
+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
+ m ? m->block_sectors[i] : 0)) {
+ struct bkey_i_stripe *new;
+
+ new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
+ if (!new) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ bkey_reassemble(&new->k_i, k);
+
+ for (i = 0; i < new->v.nr_blocks; i++)
+ stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
+
+ ret = initial
+ ? bch2_journal_key_insert(c, BTREE_ID_stripes, 0, &new->k_i)
+ : __bch2_trans_do(&trans, NULL, NULL, 0,
+ __bch2_btree_insert(&trans, BTREE_ID_stripes, &new->k_i));
+ kfree(new);
+ }
+ }
+fsck_err:
+ bch2_trans_iter_exit(&trans, &iter);
+
+ bch2_trans_exit(&trans);
+ return ret;
+}
+
+static void bch2_gc_stripes_reset(struct bch_fs *c, bool initial,
+ bool metadata_only)
+{
+ genradix_free(&c->gc_stripes);
}
/**
/* flush interior btree updates: */
closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c));
-again:
+
ret = bch2_gc_start(c, metadata_only) ?:
+ bch2_gc_alloc_start(c, initial, metadata_only) ?:
bch2_gc_reflink_start(c, initial, metadata_only);
if (ret)
goto out;
+again:
+ gc_pos_set(c, gc_phase(GC_PHASE_START));
bch2_mark_superblocks(c);
- if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags) &&
+ if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb) &&
!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags) &&
c->opts.fix_errors != FSCK_OPT_NO) {
bch_info(c, "starting topology repair pass");
if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) ||
(!iter && bch2_test_restart_gc)) {
+ if (iter++ > 2) {
+ bch_info(c, "Unable to fix bucket gens, looping");
+ ret = -EINVAL;
+ goto out;
+ }
+
/*
* XXX: make sure gens we fixed got saved
*/
- if (iter++ <= 2) {
- bch_info(c, "Second GC pass needed, restarting:");
- clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
- __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
+ bch_info(c, "Second GC pass needed, restarting:");
+ clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
- percpu_down_write(&c->mark_lock);
- bch2_gc_free(c);
- percpu_up_write(&c->mark_lock);
- /* flush fsck errors, reset counters */
- bch2_flush_fsck_errs(c);
-
- goto again;
- }
+ bch2_gc_stripes_reset(c, initial, metadata_only);
+ bch2_gc_alloc_reset(c, initial, metadata_only);
+ bch2_gc_reflink_reset(c, initial, metadata_only);
- bch_info(c, "Unable to fix bucket gens, looping");
- ret = -EINVAL;
+ /* flush fsck errors, reset counters */
+ bch2_flush_fsck_errs(c);
+ goto again;
}
out:
if (!ret) {
bch2_journal_block(&c->journal);
- percpu_down_write(&c->mark_lock);
- ret = bch2_gc_reflink_done(c, initial, metadata_only) ?:
+ ret = bch2_gc_stripes_done(c, initial, metadata_only) ?:
+ bch2_gc_reflink_done(c, initial, metadata_only) ?:
+ bch2_gc_alloc_done(c, initial, metadata_only) ?:
bch2_gc_done(c, initial, metadata_only);
bch2_journal_unblock(&c->journal);
- } else {
- percpu_down_write(&c->mark_lock);
}
+ percpu_down_write(&c->mark_lock);
/* Indicates that gc is no longer in progress: */
__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
percpu_down_read(&c->mark_lock);
bkey_for_each_ptr(ptrs, ptr) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g = PTR_BUCKET(ca, ptr, false);
- if (gen_after(g->mark.gen, ptr->gen) > 16) {
+ if (ptr_stale(ca, ptr) > 16) {
percpu_up_read(&c->mark_lock);
return true;
}
bkey_for_each_ptr(ptrs, ptr) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g = PTR_BUCKET(ca, ptr, false);
+ u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
- if (gen_after(g->gc_gen, ptr->gen))
- g->gc_gen = ptr->gen;
+ if (gen_after(*gen, ptr->gen))
+ *gen = ptr->gen;
}
percpu_up_read(&c->mark_lock);
* For recalculating oldest gen, we only need to walk keys in leaf nodes; btree
* node pointers currently never have cached pointers that can become stale:
*/
-static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
+static int bch2_gc_btree_gens(struct btree_trans *trans, enum btree_id btree_id)
{
- struct btree_trans trans;
- struct btree_iter *iter;
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
struct bkey_s_c k;
struct bkey_buf sk;
int ret = 0, commit_err = 0;
bch2_bkey_buf_init(&sk);
- bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN,
- BTREE_ITER_PREFETCH|
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_ALL_SNAPSHOTS);
+ bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN,
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_ALL_SNAPSHOTS);
- while ((k = bch2_btree_iter_peek(iter)).k &&
- !(ret = bkey_err(k))) {
- c->gc_gens_pos = iter->pos;
+ while ((bch2_trans_begin(trans),
+ k = bch2_btree_iter_peek(&iter)).k) {
+ ret = bkey_err(k);
+
+ if (ret == -EINTR)
+ continue;
+ if (ret)
+ break;
+
+ c->gc_gens_pos = iter.pos;
if (gc_btree_gens_key(c, k) && !commit_err) {
bch2_bkey_buf_reassemble(&sk, c, k);
bch2_extent_normalize(c, bkey_i_to_s(sk.k));
-
commit_err =
- bch2_trans_update(&trans, iter, sk.k, 0) ?:
- bch2_trans_commit(&trans, NULL, NULL,
- BTREE_INSERT_NOWAIT|
- BTREE_INSERT_NOFAIL);
+ bch2_trans_update(trans, &iter, sk.k, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOWAIT|
+ BTREE_INSERT_NOFAIL);
if (commit_err == -EINTR) {
commit_err = 0;
continue;
}
}
- bch2_btree_iter_advance(iter);
+ bch2_btree_iter_advance(&iter);
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
- bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&sk, c);
return ret;
}
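
ptr_stale() and gen_after() in the hunks above compare 8-bit bucket generation numbers, which wrap, so a plain `>` would misorder gens once a bucket's counter passes 255. The self-contained sketch below shows the usual signed-difference idiom for wraparound-safe comparison; treat it as an illustration of the idea rather than a copy of the kernel's helpers.

#include <stdio.h>
#include <stdint.h>

/* signed difference: correct as long as |a - b| < 128 */
static int gen_cmp(uint8_t a, uint8_t b)
{
	return (int8_t) (a - b);
}

static int gen_after(uint8_t a, uint8_t b)
{
	int r = gen_cmp(a, b);
	return r > 0 ? r : 0;
}

int main(void)
{
	/* 2 is "after" 250: the 8-bit counter wrapped around */
	printf("gen_after(2, 250) = %d\n", gen_after(2, 250));	/* 8 */
	printf("gen_after(250, 2) = %d\n", gen_after(250, 2));	/* 0 */
	printf("gen_after(10, 4)  = %d\n", gen_after(10, 4));	/* 6 */
	return 0;
}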
+static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter)
+{
+ struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode);
+ struct bkey_s_c k;
+ struct bkey_alloc_unpacked u;
+ int ret;
+
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ u = bch2_alloc_unpack(k);
+
+ if (u.oldest_gen == ca->oldest_gen[iter->pos.offset])
+ return 0;
+
+ u.oldest_gen = ca->oldest_gen[iter->pos.offset];
+
+ return bch2_alloc_write(trans, iter, &u, BTREE_TRIGGER_NORUN);
+}
+
int bch2_gc_gens(struct bch_fs *c)
{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
struct bch_dev *ca;
- struct bucket_array *buckets;
- struct bucket *g;
+ u64 b, start_time = local_clock();
unsigned i;
int ret;
* introduces a deadlock in the RO path - we currently take the state
* lock at the start of going RO, thus the gc thread may get stuck:
*/
+ if (!mutex_trylock(&c->gc_gens_lock))
+ return 0;
+
down_read(&c->gc_lock);
+ bch2_trans_init(&trans, c, 0, 0);
for_each_member_device(ca, c, i) {
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
+ struct bucket_gens *gens;
+
+ BUG_ON(ca->oldest_gen);
- for_each_bucket(g, buckets)
- g->gc_gen = g->mark.gen;
- up_read(&ca->bucket_lock);
+ ca->oldest_gen = kvmalloc(ca->mi.nbuckets, GFP_KERNEL);
+ if (!ca->oldest_gen) {
+ percpu_ref_put(&ca->ref);
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ gens = bucket_gens(ca);
+
+ for (b = gens->first_bucket;
+ b < gens->nbuckets; b++)
+ ca->oldest_gen[b] = gens->b[b];
}
for (i = 0; i < BTREE_ID_NR; i++)
if ((1 << i) & BTREE_ID_HAS_PTRS) {
c->gc_gens_btree = i;
c->gc_gens_pos = POS_MIN;
- ret = bch2_gc_btree_gens(c, i);
+ ret = bch2_gc_btree_gens(&trans, i);
if (ret) {
bch_err(c, "error recalculating oldest_gen: %i", ret);
goto err;
}
}
- for_each_member_device(ca, c, i) {
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
-
- for_each_bucket(g, buckets)
- g->oldest_gen = g->gc_gen;
- up_read(&ca->bucket_lock);
+ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ ret = __bch2_trans_do(&trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL,
+ bch2_alloc_write_oldest_gen(&trans, &iter));
+ if (ret) {
+ bch_err(c, "error writing oldest_gen: %i", ret);
+ break;
+ }
}
+ bch2_trans_iter_exit(&trans, &iter);
c->gc_gens_btree = 0;
c->gc_gens_pos = POS_MIN;
c->gc_count++;
+
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
err:
+ for_each_member_device(ca, c, i) {
+ kvfree(ca->oldest_gen);
+ ca->oldest_gen = NULL;
+ }
+
+ bch2_trans_exit(&trans);
up_read(&c->gc_lock);
+ mutex_unlock(&c->gc_gens_lock);
return ret;
}
int bch2_gc_gens(struct bch_fs *);
void bch2_gc_thread_stop(struct bch_fs *);
int bch2_gc_thread_start(struct bch_fs *);
-void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned);
/*
* For concurrent mark and sweep (with other index updates), we define a total
bch2_btree_node_iter_init_from_start(&src_iter, src);
- if (btree_node_is_extents(src))
- nr = bch2_sort_repack_merge(c, btree_bset_first(dst),
- src, &src_iter,
- &dst->format,
- true);
- else
- nr = bch2_sort_repack(btree_bset_first(dst),
- src, &src_iter,
- &dst->format,
- true);
+ nr = bch2_sort_repack(btree_bset_first(dst),
+ src, &src_iter,
+ &dst->format,
+ true);
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
start_time);
*
* Returns true if we sorted (i.e. invalidated iterators)
*/
-void bch2_btree_init_next(struct btree_trans *trans,
- struct btree_iter *iter,
- struct btree *b)
+void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
{
struct bch_fs *c = trans->c;
struct btree_node_entry *bne;
bool reinit_iter = false;
EBUG_ON(!(b->c.lock.state.seq & 1));
- EBUG_ON(iter && iter->l[b->c.level].b != b);
BUG_ON(bset_written(b, bset(b, &b->set[1])));
if (b->nsets == MAX_BSETS &&
bch2_btree_build_aux_trees(b);
- if (iter && reinit_iter)
- bch2_btree_iter_reinit_node(iter, b);
+ if (reinit_iter)
+ bch2_trans_node_reinit_iter(trans, b);
}
static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c,
\
switch (write) { \
case READ: \
- bch_err(c, "%s", _buf2); \
+ if (_buf2) \
+ bch_err(c, "%s", _buf2); \
\
switch (type) { \
case BTREE_ERR_FIXABLE: \
BTREE_ERR_FATAL, c, ca, b, i,
"BSET_SEPARATE_WHITEOUTS no longer supported");
- if (btree_err_on(offset + sectors > c->opts.btree_node_size,
+ if (btree_err_on(offset + sectors > btree_sectors(c),
BTREE_ERR_FIXABLE, c, ca, b, i,
"bset past end of btree node")) {
i->u64s = 0;
b->data->keys.seq, bp->seq);
}
- while (b->written < (ptr_written ?: c->opts.btree_node_size)) {
+ while (b->written < (ptr_written ?: btree_sectors(c))) {
unsigned sectors, whiteout_u64s = 0;
struct nonce nonce;
struct bch_csum csum;
SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
- b->written += sectors;
-
blacklisted = bch2_journal_seq_is_blacklisted(c,
le64_to_cpu(i->journal_seq),
true);
btree_err_on(blacklisted && first,
BTREE_ERR_FIXABLE, c, ca, b, i,
- "first btree node bset has blacklisted journal seq");
+ "first btree node bset has blacklisted journal seq (%llu)",
+ le64_to_cpu(i->journal_seq));
btree_err_on(blacklisted && ptr_written,
BTREE_ERR_FIXABLE, c, ca, b, i,
- "found blacklisted bset in btree node with sectors_written");
+ "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u",
+ le64_to_cpu(i->journal_seq),
+ b->written, b->written + sectors, ptr_written);
+
+ b->written += sectors;
+
if (blacklisted && !first)
continue;
if (le64_to_cpu(bn->magic) != bset_magic(c))
return 0;
- while (offset < c->opts.btree_node_size) {
+ while (offset < btree_sectors(c)) {
if (!offset) {
offset += vstruct_sectors(bn, c->block_bits);
} else {
if (!offset)
return false;
- while (offset < c->opts.btree_node_size) {
+ while (offset < btree_sectors(c)) {
bne = data + (offset << 9);
if (bne->keys.seq == bn->keys.seq)
return true;
bool dump_bset_maps = false;
bool have_retry = false;
int ret = 0, best = -1, write = READ;
- unsigned i, written, written2;
+ unsigned i, written = 0, written2 = 0;
__le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0;
if (ra->err[i])
continue;
- while (offset < c->opts.btree_node_size) {
+ while (offset < btree_sectors(c)) {
if (!offset) {
sectors = vstruct_sectors(bn, c->block_bits);
} else {
offset += sectors;
}
- while (offset < c->opts.btree_node_size) {
+ while (offset < btree_sectors(c)) {
bne = ra->buf[i] + (offset << 9);
if (bne->keys.seq == bn->keys.seq) {
if (!gap)
BUG_ON(btree_node_fake(b));
BUG_ON((b->will_make_reachable != 0) != !b->written);
- BUG_ON(b->written >= c->opts.btree_node_size);
- BUG_ON(b->written & (c->opts.block_size - 1));
+ BUG_ON(b->written >= btree_sectors(c));
+ BUG_ON(b->written & (block_sectors(c) - 1));
BUG_ON(bset_written(b, btree_bset_last(b)));
BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));
memset(data + bytes_to_write, 0,
(sectors_to_write << 9) - bytes_to_write);
- BUG_ON(b->written + sectors_to_write > c->opts.btree_node_size);
+ BUG_ON(b->written + sectors_to_write > btree_sectors(c));
BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
BUG_ON(i->seq != b->data->keys.seq);
void bch2_btree_node_drop_keys_outside_node(struct btree *);
void bch2_btree_build_aux_trees(struct btree *);
-void bch2_btree_init_next(struct btree_trans *, struct btree_iter *,
- struct btree *);
+void bch2_btree_init_next(struct btree_trans *, struct btree *);
int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *,
struct btree *, bool);
#include "error.h"
#include "extents.h"
#include "journal.h"
+#include "recovery.h"
#include "replicas.h"
+#include "subvolume.h"
#include <linux/prefetch.h>
#include <trace/events/bcachefs.h>
-static void btree_iter_set_search_pos(struct btree_iter *, struct bpos);
-static void btree_trans_sort_iters(struct btree_trans *);
-static void btree_iter_check_sort(struct btree_trans *, struct btree_iter *);
-static struct btree_iter *btree_iter_child_alloc(struct btree_iter *, unsigned long);
-static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *,
- struct btree_iter *);
-static void btree_iter_copy(struct btree_iter *, struct btree_iter *);
+static void btree_trans_verify_sorted(struct btree_trans *);
+static void btree_path_check_sort(struct btree_trans *, struct btree_path *, int);
-static inline int btree_iter_cmp(const struct btree_iter *l,
- const struct btree_iter *r)
+static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
+static inline void btree_path_list_add(struct btree_trans *, struct btree_path *,
+ struct btree_path *);
+
+static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter)
{
- return cmp_int(l->btree_id, r->btree_id) ?:
- -cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?:
- bkey_cmp(l->real_pos, r->real_pos);
+#ifdef CONFIG_BCACHEFS_DEBUG
+ return iter->ip_allocated;
+#else
+ return 0;
+#endif
}
-static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
+static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *);
+
+/*
+ * Unlocks before scheduling
+ * Note: does not revalidate iterator
+ */
+static inline int bch2_trans_cond_resched(struct btree_trans *trans)
+{
+ if (need_resched() || race_fault()) {
+ bch2_trans_unlock(trans);
+ schedule();
+ return bch2_trans_relock(trans) ? 0 : -EINTR;
+ } else {
+ return 0;
+ }
+}
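
bch2_trans_cond_resched() above encodes a pattern that recurs throughout this patch: drop every node lock before doing anything that can sleep, relock afterwards, and return -EINTR when the relock fails so the caller restarts the transaction. A userspace sketch of that control flow, with the scheduler and locking calls stubbed out (the stub names are invented):

#include <stdio.h>
#include <errno.h>
#include <stdbool.h>

struct trans { bool restarted; };

/* invented stubs standing in for bch2_trans_unlock()/bch2_trans_relock(),
 * need_resched() and schedule() */
static void trans_unlock(struct trans *t) { (void) t; }
static bool trans_relock(struct trans *t) { return !t->restarted; }
static bool need_resched_stub(void)       { return true; }
static void schedule_stub(void)           { }

static int trans_cond_resched(struct trans *t)
{
	if (need_resched_stub()) {
		trans_unlock(t);	/* never sleep while holding node locks */
		schedule_stub();
		return trans_relock(t) ? 0 : -EINTR;
	}
	return 0;
}

int main(void)
{
	struct trans ok  = { .restarted = false };
	struct trans bad = { .restarted = true };

	printf("relock ok:     %d\n", trans_cond_resched(&ok));	/* 0 */
	printf("relock failed: %d\n", trans_cond_resched(&bad));	/* -EINTR */
	return 0;
}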
+
+static inline int __btree_path_cmp(const struct btree_path *l,
+ enum btree_id r_btree_id,
+ bool r_cached,
+ struct bpos r_pos,
+ unsigned r_level)
+{
+ /*
+ * Must match lock ordering as defined by __bch2_btree_node_lock:
+ */
+ return cmp_int(l->btree_id, r_btree_id) ?:
+ cmp_int((int) l->cached, (int) r_cached) ?:
+ bpos_cmp(l->pos, r_pos) ?:
+ -cmp_int(l->level, r_level);
+}
+
+static inline int btree_path_cmp(const struct btree_path *l,
+ const struct btree_path *r)
{
- EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES);
+ return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level);
+}
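
The chained `cmp_int(...) ?: cmp_int(...) ?: ...` expression above works because cmp_int() yields 0 on a tie, so GNU C's `?:` extension falls through to the next criterion; the negated level comparison makes interior nodes sort before their descendants. A standalone sketch of the idiom on a simplified path struct:

#include <stdio.h>

#define cmp_int(l, r)	(((l) > (r)) - ((l) < (r)))

struct path { int btree_id; int cached; long pos; int level; };

static int path_cmp(const struct path *l, const struct path *r)
{
	/* ties fall through to the next criterion; level sorts descending */
	return cmp_int(l->btree_id, r->btree_id) ?:
	       cmp_int(l->cached, r->cached) ?:
	       cmp_int(l->pos, r->pos) ?:
	       -cmp_int(l->level, r->level);
}

int main(void)
{
	struct path a = { .btree_id = 1, .cached = 0, .pos = 10, .level = 2 };
	struct path b = { .btree_id = 1, .cached = 0, .pos = 10, .level = 0 };

	/* same btree and position: the higher (interior) level sorts first */
	printf("path_cmp(a, b) = %d\n", path_cmp(&a, &b));	/* -1 */
	return 0;
}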
+static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
+{
/* Are we iterating over keys in all snapshots? */
if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
p = bpos_successor(p);
static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
{
- EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES);
-
/* Are we iterating over keys in all snapshots? */
if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
p = bpos_predecessor(p);
return p;
}
-static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
+static inline bool is_btree_node(struct btree_path *path, unsigned l)
{
return l < BTREE_MAX_DEPTH &&
- (unsigned long) iter->l[l].b >= 128;
+ (unsigned long) path->l[l].b >= 128;
}
static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
return pos;
}
-static inline bool btree_iter_pos_before_node(struct btree_iter *iter,
+static inline bool btree_path_pos_before_node(struct btree_path *path,
struct btree *b)
{
- return bpos_cmp(iter->real_pos, b->data->min_key) < 0;
+ return bpos_cmp(path->pos, b->data->min_key) < 0;
}
-static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
+static inline bool btree_path_pos_after_node(struct btree_path *path,
struct btree *b)
{
- return bpos_cmp(b->key.k.p, iter->real_pos) < 0;
+ return bpos_cmp(b->key.k.p, path->pos) < 0;
}
-static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
+static inline bool btree_path_pos_in_node(struct btree_path *path,
struct btree *b)
{
- return iter->btree_id == b->c.btree_id &&
- !btree_iter_pos_before_node(iter, b) &&
- !btree_iter_pos_after_node(iter, b);
+ return path->btree_id == b->c.btree_id &&
+ !btree_path_pos_before_node(path, b) &&
+ !btree_path_pos_after_node(path, b);
}
/* Btree node locking: */
-void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
+void bch2_btree_node_unlock_write(struct btree_trans *trans,
+ struct btree_path *path, struct btree *b)
{
- bch2_btree_node_unlock_write_inlined(b, iter);
+ bch2_btree_node_unlock_write_inlined(trans, path, b);
}
-void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
+void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b)
{
- struct btree_iter *linked;
+ struct btree_path *linked;
unsigned readers = 0;
- EBUG_ON(!btree_node_intent_locked(iter, b->c.level));
-
- trans_for_each_iter(iter->trans, linked)
+ trans_for_each_path(trans, linked)
if (linked->l[b->c.level].b == b &&
btree_node_read_locked(linked, b->c.level))
readers++;
* goes to 0, and it's safe because we have the node intent
* locked:
*/
- atomic64_sub(__SIX_VAL(read_lock, readers),
- &b->c.lock.state.counter);
- btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write);
- atomic64_add(__SIX_VAL(read_lock, readers),
- &b->c.lock.state.counter);
+ if (!b->c.lock.readers)
+ atomic64_sub(__SIX_VAL(read_lock, readers),
+ &b->c.lock.state.counter);
+ else
+ this_cpu_sub(*b->c.lock.readers, readers);
+
+ six_lock_write(&b->c.lock, NULL, NULL);
+
+ if (!b->c.lock.readers)
+ atomic64_add(__SIX_VAL(read_lock, readers),
+ &b->c.lock.state.counter);
+ else
+ this_cpu_add(*b->c.lock.readers, readers);
}
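
The read-count juggling above exists because the transaction may itself hold read locks on the node it now wants to write-lock; blocking until readers reach zero would self-deadlock. So the code subtracts its own reader count, takes the write lock (which then only waits for *other* readers), and restores the count afterwards. Below is a toy model of that accounting using a plain atomic counter; it illustrates the bookkeeping only and is not the six-lock implementation:

#include <stdio.h>
#include <stdatomic.h>

/* toy lock state: a "write lock" is available only when readers == 0 */
static atomic_int readers;

static int write_lock_available(void)
{
	return atomic_load(&readers) == 0;
}

int main(void)
{
	int ours = 2;			/* read locks held by *this* transaction */

	atomic_store(&readers, 3);	/* our 2 plus 1 from another task */

	/* naive attempt: our own read locks block our own write lock */
	printf("naive:           %s\n", write_lock_available() ? "ok" : "blocked");

	/* subtract our own readers first, as the code above does */
	atomic_fetch_sub(&readers, ours);
	printf("after discount:  %s\n", write_lock_available() ? "ok" : "blocked");

	atomic_fetch_sub(&readers, 1);	/* the other reader drops its lock */
	printf("others released: %s\n", write_lock_available() ? "ok" : "blocked");

	atomic_fetch_add(&readers, ours);	/* restore our count, as above */
	return 0;
}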
-bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
+bool __bch2_btree_node_relock(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
{
- struct btree *b = btree_iter_node(iter, level);
- int want = __btree_lock_want(iter, level);
+ struct btree *b = btree_path_node(path, level);
+ int want = __btree_lock_want(path, level);
- if (!is_btree_node(iter, level))
- return false;
+ if (!is_btree_node(path, level))
+ goto fail;
if (race_fault())
- return false;
+ goto fail;
- if (six_relock_type(&b->c.lock, want, iter->l[level].lock_seq) ||
- (btree_node_lock_seq_matches(iter, b, level) &&
- btree_node_lock_increment(iter->trans, b, level, want))) {
- mark_btree_node_locked(iter, level, want);
+ if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
+ (btree_node_lock_seq_matches(path, b, level) &&
+ btree_node_lock_increment(trans, b, level, want))) {
+ mark_btree_node_locked(path, level, want);
return true;
- } else {
- return false;
}
+fail:
+ trace_btree_node_relock_fail(trans->fn, _RET_IP_,
+ path->btree_id,
+ &path->pos,
+ (unsigned long) b,
+ path->l[level].lock_seq,
+ is_btree_node(path, level) ? b->c.lock.state.seq : 0);
+ return false;
}
-static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level)
+bool bch2_btree_node_upgrade(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
{
- struct btree *b = iter->l[level].b;
+ struct btree *b = path->l[level].b;
- EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED);
-
- if (!is_btree_node(iter, level))
+ if (!is_btree_node(path, level))
return false;
- if (btree_node_intent_locked(iter, level))
+ switch (btree_lock_want(path, level)) {
+ case BTREE_NODE_UNLOCKED:
+ BUG_ON(btree_node_locked(path, level));
+ return true;
+ case BTREE_NODE_READ_LOCKED:
+ BUG_ON(btree_node_intent_locked(path, level));
+ return bch2_btree_node_relock(trans, path, level);
+ case BTREE_NODE_INTENT_LOCKED:
+ break;
+ }
+
+ if (btree_node_intent_locked(path, level))
return true;
if (race_fault())
return false;
- if (btree_node_locked(iter, level)
+ if (btree_node_locked(path, level)
? six_lock_tryupgrade(&b->c.lock)
- : six_relock_type(&b->c.lock, SIX_LOCK_intent, iter->l[level].lock_seq))
+ : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq))
goto success;
- if (btree_node_lock_seq_matches(iter, b, level) &&
- btree_node_lock_increment(iter->trans, b, level, BTREE_NODE_INTENT_LOCKED)) {
- btree_node_unlock(iter, level);
+ if (btree_node_lock_seq_matches(path, b, level) &&
+ btree_node_lock_increment(trans, b, level, BTREE_NODE_INTENT_LOCKED)) {
+ btree_node_unlock(path, level);
goto success;
}
return false;
success:
- mark_btree_node_intent_locked(iter, level);
+ mark_btree_node_intent_locked(path, level);
return true;
}
-static inline bool btree_iter_get_locks(struct btree_iter *iter, bool upgrade,
- unsigned long trace_ip)
+static inline bool btree_path_get_locks(struct btree_trans *trans,
+ struct btree_path *path,
+ bool upgrade)
{
- unsigned l = iter->level;
+ unsigned l = path->level;
int fail_idx = -1;
do {
- if (!btree_iter_node(iter, l))
+ if (!btree_path_node(path, l))
break;
if (!(upgrade
- ? bch2_btree_node_upgrade(iter, l)
- : bch2_btree_node_relock(iter, l))) {
- (upgrade
- ? trace_node_upgrade_fail
- : trace_node_relock_fail)(iter->trans->ip, trace_ip,
- btree_iter_type(iter) == BTREE_ITER_CACHED,
- iter->btree_id, &iter->real_pos,
- l, iter->l[l].lock_seq,
- is_btree_node(iter, l)
- ? 0
- : (unsigned long) iter->l[l].b,
- is_btree_node(iter, l)
- ? iter->l[l].b->c.lock.state.seq
- : 0);
+ ? bch2_btree_node_upgrade(trans, path, l)
+ : bch2_btree_node_relock(trans, path, l)))
fail_idx = l;
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
- }
l++;
- } while (l < iter->locks_want);
+ } while (l < path->locks_want);
/*
* When we fail to get a lock, we have to ensure that any child nodes
- * can't be relocked so bch2_btree_iter_traverse has to walk back up to
+ * can't be relocked so bch2_btree_path_traverse has to walk back up to
* the node that we failed to relock:
*/
- while (fail_idx >= 0) {
- btree_node_unlock(iter, fail_idx);
- iter->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS;
- --fail_idx;
+ if (fail_idx >= 0) {
+ __bch2_btree_path_unlock(path);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+
+ do {
+ path->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS;
+ --fail_idx;
+ } while (fail_idx >= 0);
}
- if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
- iter->uptodate = BTREE_ITER_NEED_PEEK;
+ if (path->uptodate == BTREE_ITER_NEED_RELOCK)
+ path->uptodate = BTREE_ITER_UPTODATE;
- bch2_btree_trans_verify_locks(iter->trans);
+ bch2_trans_verify_locks(trans);
- return iter->uptodate < BTREE_ITER_NEED_RELOCK;
+ return path->uptodate < BTREE_ITER_NEED_RELOCK;
}
static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b,
- enum btree_iter_type type)
+ bool cached)
{
- return type != BTREE_ITER_CACHED
+ return !cached
? container_of(_b, struct btree, c)->key.k.p
: container_of(_b, struct bkey_cached, c)->key.pos;
}
/* Slowpath: */
-bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
- unsigned level, struct btree_iter *iter,
+bool __bch2_btree_node_lock(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ struct bpos pos, unsigned level,
enum six_lock_type type,
six_lock_should_sleep_fn should_sleep_fn, void *p,
unsigned long ip)
{
- struct btree_trans *trans = iter->trans;
- struct btree_iter *linked, *deadlock_iter = NULL;
- u64 start_time = local_clock();
- unsigned reason = 9;
- bool ret;
+ struct btree_path *linked;
+ unsigned reason;
/* Check if it's safe to block: */
- trans_for_each_iter(trans, linked) {
+ trans_for_each_path(trans, linked) {
if (!linked->nodes_locked)
continue;
*/
if (type == SIX_LOCK_intent &&
linked->nodes_locked != linked->nodes_intent_locked) {
- deadlock_iter = linked;
reason = 1;
+ goto deadlock;
}
- if (linked->btree_id != iter->btree_id) {
- if (linked->btree_id > iter->btree_id) {
- deadlock_iter = linked;
- reason = 3;
- }
- continue;
+ if (linked->btree_id != path->btree_id) {
+ if (linked->btree_id < path->btree_id)
+ continue;
+
+ reason = 3;
+ goto deadlock;
}
/*
- * Within the same btree, cached iterators come before non
- * cached iterators:
+ * Within the same btree, non-cached paths come before cached
+ * paths:
*/
- if (btree_iter_is_cached(linked) != btree_iter_is_cached(iter)) {
- if (btree_iter_is_cached(iter)) {
- deadlock_iter = linked;
- reason = 4;
- }
- continue;
+ if (linked->cached != path->cached) {
+ if (!linked->cached)
+ continue;
+
+ reason = 4;
+ goto deadlock;
}
/*
* Interior nodes must be locked before their descendants: if
- * another iterator has possible descendants locked of the node
+ * another path has possible descendants locked of the node
* we're about to lock, it must have the ancestors locked too:
*/
if (level > __fls(linked->nodes_locked)) {
- deadlock_iter = linked;
reason = 5;
+ goto deadlock;
}
/* Must lock btree nodes in key order: */
if (btree_node_locked(linked, level) &&
bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b,
- btree_iter_type(linked))) <= 0) {
- deadlock_iter = linked;
- reason = 7;
+ linked->cached)) <= 0) {
BUG_ON(trans->in_traverse_all);
+ reason = 7;
+ goto deadlock;
}
}
- if (unlikely(deadlock_iter)) {
- trace_trans_restart_would_deadlock(trans->ip, ip,
- trans->in_traverse_all, reason,
- deadlock_iter->btree_id,
- btree_iter_type(deadlock_iter),
- &deadlock_iter->real_pos,
- iter->btree_id,
- btree_iter_type(iter),
- &pos);
- btree_trans_restart(trans);
- return false;
- }
-
- if (six_trylock_type(&b->c.lock, type))
- return true;
-
-#ifdef CONFIG_BCACHEFS_DEBUG
- trans->locking_iter_idx = iter->idx;
- trans->locking_pos = pos;
- trans->locking_btree_id = iter->btree_id;
- trans->locking_level = level;
- trans->locking = b;
-#endif
-
- ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0;
-
-#ifdef CONFIG_BCACHEFS_DEBUG
- trans->locking = NULL;
-#endif
- if (ret)
- bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)],
- start_time);
- return ret;
+ return btree_node_lock_type(trans, path, b, pos, level,
+ type, should_sleep_fn, p);
+deadlock:
+ trace_trans_restart_would_deadlock(trans->fn, ip,
+ trans->in_traverse_all, reason,
+ linked->btree_id,
+ linked->cached,
+ &linked->pos,
+ path->btree_id,
+ path->cached,
+ &pos);
+ btree_trans_restart(trans);
+ return false;
}
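
The deadlock checks above all enforce one rule: never block on a lock that sorts at-or-before something this transaction already holds in the global lock order; restart the transaction instead. A heavily simplified sketch of that scan, ordering only by btree id and position (the real check also orders cached vs. btree paths and interior levels, as the hunks above show):

#include <stdio.h>
#include <stdbool.h>

#define cmp_int(l, r)	(((l) > (r)) - ((l) < (r)))

struct held { int btree_id; long pos; };

static bool would_deadlock(const struct held *held, int nr,
			   int want_btree, long want_pos)
{
	for (int i = 0; i < nr; i++) {
		int c = cmp_int(held[i].btree_id, want_btree) ?:
			cmp_int(held[i].pos, want_pos);

		if (c >= 0)	/* held lock sorts at/after the wanted one */
			return true;
	}
	return false;
}

int main(void)
{
	struct held held[] = { { .btree_id = 0, .pos = 5 } };

	/* acquiring in key order is fine; going backwards must restart */
	printf("%d\n", would_deadlock(held, 1, 0, 9));	/* 0 */
	printf("%d\n", would_deadlock(held, 1, 0, 3));	/* 1 */
	return 0;
}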
/* Btree iterator locking: */
#ifdef CONFIG_BCACHEFS_DEBUG
-static void bch2_btree_iter_verify_locks(struct btree_iter *iter)
+
+static void bch2_btree_path_verify_locks(struct btree_path *path)
{
unsigned l;
- if (!(iter->trans->iters_linked & (1ULL << iter->idx))) {
- BUG_ON(iter->nodes_locked);
+ if (!path->nodes_locked) {
+ BUG_ON(path->uptodate == BTREE_ITER_UPTODATE &&
+ btree_path_node(path, path->level));
return;
}
- for (l = 0; btree_iter_node(iter, l); l++) {
- if (iter->uptodate >= BTREE_ITER_NEED_RELOCK &&
- !btree_node_locked(iter, l))
- continue;
-
- BUG_ON(btree_lock_want(iter, l) !=
- btree_node_locked_type(iter, l));
- }
+ for (l = 0; btree_path_node(path, l); l++)
+ BUG_ON(btree_lock_want(path, l) !=
+ btree_node_locked_type(path, l));
}
-void bch2_btree_trans_verify_locks(struct btree_trans *trans)
+void bch2_trans_verify_locks(struct btree_trans *trans)
{
- struct btree_iter *iter;
+ struct btree_path *path;
- trans_for_each_iter(trans, iter)
- bch2_btree_iter_verify_locks(iter);
+ trans_for_each_path(trans, path)
+ bch2_btree_path_verify_locks(path);
}
#else
-static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {}
+static inline void bch2_btree_path_verify_locks(struct btree_path *path) {}
#endif
+/* Btree path locking: */
+
/*
* Only for btree_cache.c - only relocks intent locks
*/
-bool bch2_btree_iter_relock_intent(struct btree_iter *iter)
+bool bch2_btree_path_relock_intent(struct btree_trans *trans,
+ struct btree_path *path)
{
unsigned l;
- for (l = iter->level;
- l < iter->locks_want && btree_iter_node(iter, l);
+ for (l = path->level;
+ l < path->locks_want && btree_path_node(path, l);
l++) {
- if (!bch2_btree_node_relock(iter, l)) {
- trace_node_relock_fail(iter->trans->ip, _RET_IP_,
- btree_iter_type(iter) == BTREE_ITER_CACHED,
- iter->btree_id, &iter->real_pos,
- l, iter->l[l].lock_seq,
- is_btree_node(iter, l)
- ? 0
- : (unsigned long) iter->l[l].b,
- is_btree_node(iter, l)
- ? iter->l[l].b->c.lock.state.seq
- : 0);
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
- btree_trans_restart(iter->trans);
+ if (!bch2_btree_node_relock(trans, path, l)) {
+ __bch2_btree_path_unlock(path);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ trace_trans_restart_relock_path_intent(trans->fn, _RET_IP_,
+ path->btree_id, &path->pos);
+ btree_trans_restart(trans);
return false;
}
}
}
__flatten
-bool bch2_btree_iter_relock(struct btree_iter *iter, unsigned long trace_ip)
+static bool bch2_btree_path_relock(struct btree_trans *trans,
+ struct btree_path *path, unsigned long trace_ip)
{
- bool ret = btree_iter_get_locks(iter, false, trace_ip);
+ bool ret = btree_path_get_locks(trans, path, false);
- if (!ret)
- btree_trans_restart(iter->trans);
+ if (!ret) {
+ trace_trans_restart_relock_path(trans->fn, trace_ip,
+ path->btree_id, &path->pos);
+ btree_trans_restart(trans);
+ }
return ret;
}
-bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
+bool __bch2_btree_path_upgrade(struct btree_trans *trans,
+ struct btree_path *path,
unsigned new_locks_want)
{
- struct btree_iter *linked;
+ struct btree_path *linked;
- EBUG_ON(iter->locks_want >= new_locks_want);
+ EBUG_ON(path->locks_want >= new_locks_want);
- iter->locks_want = new_locks_want;
+ path->locks_want = new_locks_want;
- if (btree_iter_get_locks(iter, true, _THIS_IP_))
+ if (btree_path_get_locks(trans, path, true))
return true;
/*
* iterators in the btree_trans here.
*
* On failure to upgrade the iterator, setting iter->locks_want and
- * calling get_locks() is sufficient to make bch2_btree_iter_traverse()
+ * calling get_locks() is sufficient to make bch2_btree_path_traverse()
* get the locks we want on transaction restart.
*
* But if this iterator was a clone, on transaction restart what we did
*
* The code below used to be needed to ensure ancestor nodes get locked
* before interior nodes - now that's handled by
- * bch2_btree_iter_traverse_all().
+ * bch2_btree_path_traverse_all().
*/
- trans_for_each_iter(iter->trans, linked)
- if (linked != iter &&
- btree_iter_type(linked) == btree_iter_type(iter) &&
- linked->btree_id == iter->btree_id &&
+ trans_for_each_path(trans, linked)
+ if (linked != path &&
+ linked->cached == path->cached &&
+ linked->btree_id == path->btree_id &&
linked->locks_want < new_locks_want) {
linked->locks_want = new_locks_want;
- btree_iter_get_locks(linked, true, _THIS_IP_);
+ btree_path_get_locks(trans, linked, true);
}
- if (iter->should_be_locked)
- btree_trans_restart(iter->trans);
return false;
}
-void __bch2_btree_iter_downgrade(struct btree_iter *iter,
+void __bch2_btree_path_downgrade(struct btree_path *path,
unsigned new_locks_want)
{
unsigned l;
- EBUG_ON(iter->locks_want < new_locks_want);
+ EBUG_ON(path->locks_want < new_locks_want);
- iter->locks_want = new_locks_want;
+ path->locks_want = new_locks_want;
- while (iter->nodes_locked &&
- (l = __fls(iter->nodes_locked)) >= iter->locks_want) {
- if (l > iter->level) {
- btree_node_unlock(iter, l);
+ while (path->nodes_locked &&
+ (l = __fls(path->nodes_locked)) >= path->locks_want) {
+ if (l > path->level) {
+ btree_node_unlock(path, l);
} else {
- if (btree_node_intent_locked(iter, l)) {
- six_lock_downgrade(&iter->l[l].b->c.lock);
- iter->nodes_intent_locked ^= 1 << l;
+ if (btree_node_intent_locked(path, l)) {
+ six_lock_downgrade(&path->l[l].b->c.lock);
+ path->nodes_intent_locked ^= 1 << l;
}
break;
}
}
- bch2_btree_trans_verify_locks(iter->trans);
+ bch2_btree_path_verify_locks(path);
}
void bch2_trans_downgrade(struct btree_trans *trans)
{
- struct btree_iter *iter;
+ struct btree_path *path;
- trans_for_each_iter(trans, iter)
- bch2_btree_iter_downgrade(iter);
+ trans_for_each_path(trans, path)
+ bch2_btree_path_downgrade(path);
}
/* Btree transaction locking: */
-static inline bool btree_iter_should_be_locked(struct btree_iter *iter)
-{
- return (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) ||
- iter->should_be_locked;
-}
-
bool bch2_trans_relock(struct btree_trans *trans)
{
- struct btree_iter *iter;
+ struct btree_path *path;
if (unlikely(trans->restarted))
return false;
- trans_for_each_iter(trans, iter)
- if (btree_iter_should_be_locked(iter) &&
- !bch2_btree_iter_relock(iter, _RET_IP_)) {
- trace_trans_restart_relock(trans->ip, _RET_IP_,
- iter->btree_id, &iter->real_pos);
+ trans_for_each_path(trans, path)
+ if (path->should_be_locked &&
+ !bch2_btree_path_relock(trans, path, _RET_IP_)) {
+ trace_trans_restart_relock(trans->fn, _RET_IP_,
+ path->btree_id, &path->pos);
BUG_ON(!trans->restarted);
return false;
}
void bch2_trans_unlock(struct btree_trans *trans)
{
- struct btree_iter *iter;
+ struct btree_path *path;
- trans_for_each_iter(trans, iter)
- __bch2_btree_iter_unlock(iter);
+ trans_for_each_path(trans, path)
+ __bch2_btree_path_unlock(path);
BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
}
#ifdef CONFIG_BCACHEFS_DEBUG
-static void bch2_btree_iter_verify_cached(struct btree_iter *iter)
+static void bch2_btree_path_verify_cached(struct btree_trans *trans,
+ struct btree_path *path)
{
struct bkey_cached *ck;
- bool locked = btree_node_locked(iter, 0);
+ bool locked = btree_node_locked(path, 0);
- if (!bch2_btree_node_relock(iter, 0))
+ if (!bch2_btree_node_relock(trans, path, 0))
return;
- ck = (void *) iter->l[0].b;
- BUG_ON(ck->key.btree_id != iter->btree_id ||
- bkey_cmp(ck->key.pos, iter->pos));
+ ck = (void *) path->l[0].b;
+ BUG_ON(ck->key.btree_id != path->btree_id ||
+ bkey_cmp(ck->key.pos, path->pos));
if (!locked)
- btree_node_unlock(iter, 0);
+ btree_node_unlock(path, 0);
}
-static void bch2_btree_iter_verify_level(struct btree_iter *iter,
- unsigned level)
+static void bch2_btree_path_verify_level(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
{
- struct btree_iter_level *l;
+ struct btree_path_level *l;
struct btree_node_iter tmp;
bool locked;
struct bkey_packed *p, *k;
if (!bch2_debug_check_iterators)
return;
- l = &iter->l[level];
+ l = &path->l[level];
tmp = l->iter;
- locked = btree_node_locked(iter, level);
+ locked = btree_node_locked(path, level);
- if (btree_iter_type(iter) == BTREE_ITER_CACHED) {
+ if (path->cached) {
if (!level)
- bch2_btree_iter_verify_cached(iter);
+ bch2_btree_path_verify_cached(trans, path);
return;
}
- BUG_ON(iter->level < iter->min_depth);
-
- if (!btree_iter_node(iter, level))
+ if (!btree_path_node(path, level))
return;
- if (!bch2_btree_node_relock(iter, level))
+ if (!bch2_btree_node_relock(trans, path, level))
return;
- BUG_ON(!btree_iter_pos_in_node(iter, l->b));
-
- /*
- * node iterators don't use leaf node iterator:
- */
- if (btree_iter_type(iter) == BTREE_ITER_NODES &&
- level <= iter->min_depth)
- goto unlock;
+ BUG_ON(!btree_path_pos_in_node(path, l->b));
bch2_btree_node_iter_verify(&l->iter, l->b);
/*
- * For interior nodes, the iterator will have skipped past
- * deleted keys:
- *
- * For extents, the iterator may have skipped past deleted keys (but not
- * whiteouts)
+ * For interior nodes, the iterator will have skipped past deleted keys:
*/
- p = level || btree_node_type_is_extents(iter->btree_id)
+ p = level
? bch2_btree_node_iter_prev(&tmp, l->b)
: bch2_btree_node_iter_prev_all(&tmp, l->b);
k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
- if (p && bkey_iter_pos_cmp(l->b, p, &iter->real_pos) >= 0) {
+ if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) {
msg = "before";
goto err;
}
- if (k && bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) {
+ if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
msg = "after";
goto err;
}
-unlock:
+
if (!locked)
- btree_node_unlock(iter, level);
+ btree_node_unlock(path, level);
return;
err:
strcpy(buf2, "(none)");
strcpy(buf3, "(none)");
- bch2_bpos_to_text(&PBUF(buf1), iter->real_pos);
+ bch2_bpos_to_text(&PBUF(buf1), path->pos);
if (p) {
struct bkey uk = bkey_unpack_key(l->b, p);
bch2_bkey_to_text(&PBUF(buf3), &uk);
}
- panic("iterator should be %s key at level %u:\n"
- "iter pos %s\n"
+ panic("path should be %s key at level %u:\n"
+ "path pos %s\n"
"prev key %s\n"
"cur key %s\n",
msg, level, buf1, buf2, buf3);
}
-static void bch2_btree_iter_verify(struct btree_iter *iter)
+static void bch2_btree_path_verify(struct btree_trans *trans,
+ struct btree_path *path)
{
- struct btree_trans *trans = iter->trans;
struct bch_fs *c = trans->c;
- enum btree_iter_type type = btree_iter_type(iter);
unsigned i;
- EBUG_ON(iter->btree_id >= BTREE_ID_NR);
+ EBUG_ON(path->btree_id >= BTREE_ID_NR);
- BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
- iter->pos.snapshot != iter->snapshot);
+ for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
+ if (!path->l[i].b) {
+ BUG_ON(!path->cached &&
+ c->btree_roots[path->btree_id].b->c.level > i);
+ break;
+ }
+
+ bch2_btree_path_verify_level(trans, path, i);
+ }
+
+ bch2_btree_path_verify_locks(path);
+}
+
+void bch2_trans_verify_paths(struct btree_trans *trans)
+{
+ struct btree_path *path;
+
+ trans_for_each_path(trans, path)
+ bch2_btree_path_verify(trans, path);
+}
+
+static void bch2_btree_iter_verify(struct btree_iter *iter)
+{
+ struct btree_trans *trans = iter->trans;
+
+ BUG_ON(iter->btree_id >= BTREE_ID_NR);
+
+ BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached);
BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
- BUG_ON(type == BTREE_ITER_NODES &&
- !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
-
- BUG_ON(type != BTREE_ITER_NODES &&
+ BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
!btree_type_has_snapshots(iter->btree_id));
- for (i = 0; i < (type != BTREE_ITER_CACHED ? BTREE_MAX_DEPTH : 1); i++) {
- if (!iter->l[i].b) {
- BUG_ON(c->btree_roots[iter->btree_id].b->c.level > i);
- break;
- }
-
- bch2_btree_iter_verify_level(iter, i);
- }
-
- bch2_btree_iter_verify_locks(iter);
+ if (iter->update_path)
+ bch2_btree_path_verify(trans, iter->update_path);
+ bch2_btree_path_verify(trans, iter->path);
}
static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
{
- enum btree_iter_type type = btree_iter_type(iter);
+ BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
+ !iter->pos.snapshot);
BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
iter->pos.snapshot != iter->snapshot);
- BUG_ON((type == BTREE_ITER_KEYS ||
- type == BTREE_ITER_CACHED) &&
- (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 ||
- bkey_cmp(iter->pos, iter->k.p) > 0));
+ BUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 ||
+ bkey_cmp(iter->pos, iter->k.p) > 0);
}
-void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b)
+static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
{
- struct btree_iter *iter;
+ struct btree_trans *trans = iter->trans;
+ struct btree_iter copy;
+ struct bkey_s_c prev;
+ int ret = 0;
if (!bch2_debug_check_iterators)
- return;
+ return 0;
+
+ if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS))
+ return 0;
+
+ if (bkey_err(k) || !k.k)
+ return 0;
+
+ BUG_ON(!bch2_snapshot_is_ancestor(trans->c,
+ iter->snapshot,
+ k.k->p.snapshot));
+
+ bch2_trans_iter_init(trans, ©, iter->btree_id, iter->pos,
+ BTREE_ITER_NOPRESERVE|
+ BTREE_ITER_ALL_SNAPSHOTS);
+ prev = bch2_btree_iter_prev(©);
+ if (!prev.k)
+ goto out;
+
+ ret = bkey_err(prev);
+ if (ret)
+ goto out;
+
+ if (!bkey_cmp(prev.k->p, k.k->p) &&
+ bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
+ prev.k->p.snapshot) > 0) {
+ char buf1[100], buf2[200];
+
+ bch2_bkey_to_text(&PBUF(buf1), k.k);
+ bch2_bkey_to_text(&PBUF(buf2), prev.k);
+
+ panic("iter snap %u\n"
+ "k %s\n"
+ "prev %s\n",
+ iter->snapshot,
+ buf1, buf2);
+ }
+out:
+ bch2_trans_iter_exit(trans, ©);
+ return ret;
+}
+
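+
+/*
+ * Assert that some path in @trans holds a lock covering @pos in btree @id:
+ * a node lock spanning @pos for btree paths, an exact position match for
+ * key cache paths. Dumps the transaction's paths before panicking if not:
+ */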
+void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
+ struct bpos pos, bool key_cache)
+{
+ struct btree_path *path;
+ unsigned idx;
+ char buf[100];
+
+ trans_for_each_path_inorder(trans, path, idx) {
+ int cmp = cmp_int(path->btree_id, id) ?:
+ cmp_int(path->cached, key_cache);
+
+ if (cmp > 0)
+ break;
+ if (cmp < 0)
+ continue;
+
+ if (!(path->nodes_locked & 1) ||
+ !path->should_be_locked)
+ continue;
+
+ if (!key_cache) {
+ if (bkey_cmp(pos, path->l[0].b->data->min_key) >= 0 &&
+ bkey_cmp(pos, path->l[0].b->key.k.p) <= 0)
+ return;
+ } else {
+ if (!bkey_cmp(pos, path->pos))
+ return;
+ }
+ }
- trans_for_each_iter_with_node(trans, b, iter)
- bch2_btree_iter_verify_level(iter, b->c.level);
+ bch2_dump_trans_paths_updates(trans);
+ panic("not locked: %s %s%s\n",
+ bch2_btree_ids[id],
+ (bch2_bpos_to_text(&PBUF(buf), pos), buf),
+ key_cache ? " cached" : "");
}
#else
-static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned l) {}
+static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
+ struct btree_path *path, unsigned l) {}
+static inline void bch2_btree_path_verify(struct btree_trans *trans,
+ struct btree_path *path) {}
static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
+static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; }
#endif
+/* Btree path: fixups after btree updates */
+
static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
struct btree *b,
struct bset_tree *t,
bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
}
-static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
+static void __bch2_btree_path_fix_key_modified(struct btree_path *path,
struct btree *b,
struct bkey_packed *where)
{
- struct btree_iter_level *l = &iter->l[b->c.level];
+ struct btree_path_level *l = &path->l[b->c.level];
if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
return;
- if (bkey_iter_pos_cmp(l->b, where, &iter->real_pos) < 0)
+ if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0)
bch2_btree_node_iter_advance(&l->iter, l->b);
-
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
}
-void bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
+void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
struct btree *b,
struct bkey_packed *where)
{
- struct btree_iter *linked;
+ struct btree_path *path;
- trans_for_each_iter_with_node(iter->trans, b, linked) {
- __bch2_btree_iter_fix_key_modified(linked, b, where);
- bch2_btree_iter_verify_level(linked, b->c.level);
+ trans_for_each_path_with_node(trans, b, path) {
+ __bch2_btree_path_fix_key_modified(path, b, where);
+ bch2_btree_path_verify_level(trans, path, b->c.level);
}
}
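+/*
+ * A key was inserted or overwritten in @b at @where, replacing @clobber_u64s
+ * u64s with @new_u64s: fix up any node iterator sets pointing into the
+ * affected bset so they still reference the same position:
+ */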
-static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
- struct btree *b,
- struct btree_node_iter *node_iter,
- struct bset_tree *t,
- struct bkey_packed *where,
- unsigned clobber_u64s,
- unsigned new_u64s)
+static void __bch2_btree_node_iter_fix(struct btree_path *path,
+ struct btree *b,
+ struct btree_node_iter *node_iter,
+ struct bset_tree *t,
+ struct bkey_packed *where,
+ unsigned clobber_u64s,
+ unsigned new_u64s)
{
const struct bkey_packed *end = btree_bkey_last(b, t);
struct btree_node_iter_set *set;
/* didn't find the bset in the iterator - might have to readd it: */
if (new_u64s &&
- bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) {
+ bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
bch2_btree_node_iter_push(node_iter, b, where, end);
goto fixup_done;
} else {
return;
if (new_u64s &&
- bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) {
+ bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
set->k = offset;
} else if (set->k < offset + clobber_u64s) {
set->k = offset + new_u64s;
*/
if (!bch2_btree_node_iter_end(node_iter) &&
iter_current_key_modified &&
- (b->c.level ||
- btree_node_type_is_extents(iter->btree_id))) {
+ b->c.level) {
struct bset_tree *t;
struct bkey_packed *k, *k2, *p;
b, t, k2);
}
}
-
- if (!b->c.level &&
- node_iter == &iter->l[0].iter &&
- iter_current_key_modified)
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
}
-void bch2_btree_node_iter_fix(struct btree_iter *iter,
+void bch2_btree_node_iter_fix(struct btree_trans *trans,
+ struct btree_path *path,
struct btree *b,
struct btree_node_iter *node_iter,
struct bkey_packed *where,
unsigned new_u64s)
{
struct bset_tree *t = bch2_bkey_to_bset(b, where);
- struct btree_iter *linked;
+ struct btree_path *linked;
- if (node_iter != &iter->l[b->c.level].iter) {
- __bch2_btree_node_iter_fix(iter, b, node_iter, t,
+ if (node_iter != &path->l[b->c.level].iter) {
+ __bch2_btree_node_iter_fix(path, b, node_iter, t,
where, clobber_u64s, new_u64s);
if (bch2_debug_check_iterators)
bch2_btree_node_iter_verify(node_iter, b);
}
- trans_for_each_iter_with_node(iter->trans, b, linked) {
+ trans_for_each_path_with_node(trans, b, linked) {
__bch2_btree_node_iter_fix(linked, b,
&linked->l[b->c.level].iter, t,
where, clobber_u64s, new_u64s);
- bch2_btree_iter_verify_level(linked, b->c.level);
+ bch2_btree_path_verify_level(trans, linked, b->c.level);
}
}
-static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter,
- struct btree_iter_level *l,
+/* Btree path level: pointer to a particular btree node and node iter */
+
+static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c,
+ struct btree_path_level *l,
struct bkey *u,
struct bkey_packed *k)
{
- struct bkey_s_c ret;
-
if (unlikely(!k)) {
/*
* signal to bch2_btree_iter_peek_slot() that we're currently at
* a hole:
*/
u->type = KEY_TYPE_deleted;
return bkey_s_c_null;
}
- ret = bkey_disassemble(l->b, k, u);
-
- /*
- * XXX: bch2_btree_bset_insert_key() generates invalid keys when we
- * overwrite extents - it sets k->type = KEY_TYPE_deleted on the key
- * being overwritten but doesn't change k->size. But this is ok, because
- * those keys are never written out, we just have to avoid a spurious
- * assertion here:
- */
- if (bch2_debug_check_bkeys && !bkey_deleted(ret.k))
- bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
-
- return ret;
+ return bkey_disassemble(l->b, k, u);
}
-/* peek_all() doesn't skip deleted keys */
-static inline struct bkey_s_c btree_iter_level_peek_all(struct btree_iter *iter,
- struct btree_iter_level *l)
+static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
+ struct btree_path_level *l,
+ struct bkey *u)
{
- return __btree_iter_unpack(iter, l, &iter->k,
+ return __btree_iter_unpack(c, l, u,
bch2_btree_node_iter_peek_all(&l->iter, l->b));
}
-static inline struct bkey_s_c btree_iter_level_peek(struct btree_iter *iter,
- struct btree_iter_level *l)
+static inline struct bkey_s_c btree_path_level_peek(struct bch_fs *c,
+ struct btree_path *path,
+ struct btree_path_level *l,
+ struct bkey *u)
{
- struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k,
+ struct bkey_s_c k = __btree_iter_unpack(c, l, u,
bch2_btree_node_iter_peek(&l->iter, l->b));
- iter->real_pos = k.k ? k.k->p : l->b->key.k.p;
+ path->pos = k.k ? k.k->p : l->b->key.k.p;
return k;
}
-static inline struct bkey_s_c btree_iter_level_prev(struct btree_iter *iter,
- struct btree_iter_level *l)
+static inline struct bkey_s_c btree_path_level_prev(struct bch_fs *c,
+ struct btree_path *path,
+ struct btree_path_level *l,
+ struct bkey *u)
{
- struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k,
+ struct bkey_s_c k = __btree_iter_unpack(c, l, u,
bch2_btree_node_iter_prev(&l->iter, l->b));
- iter->real_pos = k.k ? k.k->p : l->b->data->min_key;
+ path->pos = k.k ? k.k->p : l->b->data->min_key;
return k;
}
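+/*
+ * Try to step the node iterator forwards to path->pos; gives up and returns
+ * false after @max_advance keys, so that the caller can fall back to
+ * reinitializing the node iterator, which is cheaper for big jumps:
+ */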
-static inline bool btree_iter_advance_to_pos(struct btree_iter *iter,
- struct btree_iter_level *l,
+static inline bool btree_path_advance_to_pos(struct btree_path *path,
+ struct btree_path_level *l,
int max_advance)
{
struct bkey_packed *k;
int nr_advanced = 0;
while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
- bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) {
+ bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
if (max_advance > 0 && nr_advanced >= max_advance)
return false;
/*
* Verify that the iterator for the parent node points to the child node:
*/
-static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
+static void btree_path_verify_new_node(struct btree_trans *trans,
+ struct btree_path *path, struct btree *b)
{
- struct btree_iter_level *l;
+ struct bch_fs *c = trans->c;
+ struct btree_path_level *l;
unsigned plevel;
bool parent_locked;
struct bkey_packed *k;
if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
return;
+ if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
+ return;
+
plevel = b->c.level + 1;
- if (!btree_iter_node(iter, plevel))
+ if (!btree_path_node(path, plevel))
return;
- parent_locked = btree_node_locked(iter, plevel);
+ parent_locked = btree_node_locked(path, plevel);
- if (!bch2_btree_node_relock(iter, plevel))
+ if (!bch2_btree_node_relock(trans, path, plevel))
return;
- l = &iter->l[plevel];
+ l = &path->l[plevel];
k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
if (!k ||
bkey_deleted(k) ||
char buf4[100];
struct bkey uk = bkey_unpack_key(b, k);
- bch2_dump_btree_node(iter->trans->c, l->b);
- bch2_bpos_to_text(&PBUF(buf1), iter->real_pos);
+ bch2_dump_btree_node(c, l->b);
+ bch2_bpos_to_text(&PBUF(buf1), path->pos);
bch2_bkey_to_text(&PBUF(buf2), &uk);
bch2_bpos_to_text(&PBUF(buf3), b->data->min_key);
bch2_bpos_to_text(&PBUF(buf4), b->data->max_key);
"iter pos %s %s\n"
"iter key %s\n"
"new node %s-%s\n",
- bch2_btree_ids[iter->btree_id], buf1,
+ bch2_btree_ids[path->btree_id], buf1,
buf2, buf3, buf4);
}
if (!parent_locked)
- btree_node_unlock(iter, b->c.level + 1);
+ btree_node_unlock(path, plevel);
}
-static inline void __btree_iter_init(struct btree_iter *iter,
- unsigned level)
+static inline void __btree_path_level_init(struct btree_path *path,
+ unsigned level)
{
- struct btree_iter_level *l = &iter->l[level];
+ struct btree_path_level *l = &path->l[level];
- bch2_btree_node_iter_init(&l->iter, l->b, &iter->real_pos);
+ bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
/*
* Iterators to interior nodes should always be pointed at the first non
* whiteout:
*/
if (level)
bch2_btree_node_iter_peek(&l->iter, l->b);
-
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
}
-static inline void btree_iter_node_set(struct btree_iter *iter,
- struct btree *b)
+static inline void btree_path_level_init(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b)
{
- BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED);
+ BUG_ON(path->cached);
- btree_iter_verify_new_node(iter, b);
+ btree_path_verify_new_node(trans, path, b);
- EBUG_ON(!btree_iter_pos_in_node(iter, b));
+ EBUG_ON(!btree_path_pos_in_node(path, b));
EBUG_ON(b->c.lock.state.seq & 1);
- iter->l[b->c.level].lock_seq = b->c.lock.state.seq;
- iter->l[b->c.level].b = b;
- __btree_iter_init(iter, b->c.level);
+ path->l[b->c.level].lock_seq = b->c.lock.state.seq;
+ path->l[b->c.level].b = b;
+ __btree_path_level_init(path, b->c.level);
}
+/* Btree path: fixups after btree node updates: */
+
/*
* A btree node is being replaced - update the iterator to point to the new
* node:
*/
-void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
+void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
{
- enum btree_node_locked_type t;
- struct btree_iter *linked;
+ struct btree_path *path;
- trans_for_each_iter(iter->trans, linked)
- if (btree_iter_type(linked) != BTREE_ITER_CACHED &&
- btree_iter_pos_in_node(linked, b)) {
- /*
- * bch2_btree_iter_node_drop() has already been called -
- * the old node we're replacing has already been
- * unlocked and the pointer invalidated
- */
- BUG_ON(btree_node_locked(linked, b->c.level));
+ trans_for_each_path(trans, path)
+ if (!path->cached &&
+ btree_path_pos_in_node(path, b)) {
+ enum btree_node_locked_type t =
+ btree_lock_want(path, b->c.level);
- t = btree_lock_want(linked, b->c.level);
- if (t != BTREE_NODE_UNLOCKED) {
+ if (path->nodes_locked &&
+ t != BTREE_NODE_UNLOCKED) {
+ btree_node_unlock(path, b->c.level);
six_lock_increment(&b->c.lock, t);
- mark_btree_node_locked(linked, b->c.level, t);
+ mark_btree_node_locked(path, b->c.level, t);
}
- btree_iter_node_set(linked, b);
- }
-}
-
-void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
-{
- struct btree_iter *linked;
- unsigned level = b->c.level;
-
- trans_for_each_iter(iter->trans, linked)
- if (linked->l[level].b == b) {
- btree_node_unlock(linked, level);
- linked->l[level].b = BTREE_ITER_NO_NODE_DROP;
+ btree_path_level_init(trans, path, b);
}
}
* A btree node has been modified in such a way as to invalidate iterators - fix
* them:
*/
-void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b)
+void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b)
{
- struct btree_iter *linked;
+ struct btree_path *path;
- trans_for_each_iter_with_node(iter->trans, b, linked)
- __btree_iter_init(linked, b->c.level);
+ trans_for_each_path_with_node(trans, b, path)
+ __btree_path_level_init(path, b->c.level);
}
+/* Btree path: traverse, set_pos: */
+
static int lock_root_check_fn(struct six_lock *lock, void *p)
{
struct btree *b = container_of(lock, struct btree, c.lock);
return b == *rootp ? 0 : -1;
}
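+/*
+ * Lock the btree root: the root may be freed and replaced while we wait on
+ * the lock, so retry until the node we locked is still the current root:
+ */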
-static inline int btree_iter_lock_root(struct btree_trans *trans,
- struct btree_iter *iter,
+static inline int btree_path_lock_root(struct btree_trans *trans,
+ struct btree_path *path,
unsigned depth_want,
unsigned long trace_ip)
{
struct bch_fs *c = trans->c;
- struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b;
+ struct btree *b, **rootp = &c->btree_roots[path->btree_id].b;
enum six_lock_type lock_type;
unsigned i;
- EBUG_ON(iter->nodes_locked);
+ EBUG_ON(path->nodes_locked);
while (1) {
b = READ_ONCE(*rootp);
- iter->level = READ_ONCE(b->c.level);
+ path->level = READ_ONCE(b->c.level);
- if (unlikely(iter->level < depth_want)) {
+ if (unlikely(path->level < depth_want)) {
/*
* the root is at a lower depth than the depth we want:
* got to the end of the btree, or we're walking nodes
* greater than some depth and there are no nodes >=
* that depth
*/
- iter->level = depth_want;
- for (i = iter->level; i < BTREE_MAX_DEPTH; i++)
- iter->l[i].b = NULL;
+ path->level = depth_want;
+ for (i = path->level; i < BTREE_MAX_DEPTH; i++)
+ path->l[i].b = NULL;
return 1;
}
- lock_type = __btree_lock_want(iter, iter->level);
- if (unlikely(!btree_node_lock(b, SPOS_MAX, iter->level,
- iter, lock_type,
+ lock_type = __btree_lock_want(path, path->level);
+ if (unlikely(!btree_node_lock(trans, path, b, SPOS_MAX,
+ path->level, lock_type,
lock_root_check_fn, rootp,
trace_ip))) {
if (trans->restarted)
}
if (likely(b == READ_ONCE(*rootp) &&
- b->c.level == iter->level &&
+ b->c.level == path->level &&
!race_fault())) {
- for (i = 0; i < iter->level; i++)
- iter->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT;
- iter->l[iter->level].b = b;
- for (i = iter->level + 1; i < BTREE_MAX_DEPTH; i++)
- iter->l[i].b = NULL;
-
- mark_btree_node_locked(iter, iter->level, lock_type);
- btree_iter_node_set(iter, b);
+ for (i = 0; i < path->level; i++)
+ path->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT;
+ path->l[path->level].b = b;
+ for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
+ path->l[i].b = NULL;
+
+ mark_btree_node_locked(path, path->level, lock_type);
+ btree_path_level_init(trans, path, b);
return 0;
}
}
noinline
-static int btree_iter_prefetch(struct btree_iter *iter)
+static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path)
{
- struct bch_fs *c = iter->trans->c;
- struct btree_iter_level *l = &iter->l[iter->level];
+ struct bch_fs *c = trans->c;
+ struct btree_path_level *l = path_l(path);
struct btree_node_iter node_iter = l->iter;
struct bkey_packed *k;
struct bkey_buf tmp;
unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
- ? (iter->level > 1 ? 0 : 2)
- : (iter->level > 1 ? 1 : 16);
- bool was_locked = btree_node_locked(iter, iter->level);
+ ? (path->level > 1 ? 0 : 2)
+ : (path->level > 1 ? 1 : 16);
+ bool was_locked = btree_node_locked(path, path->level);
int ret = 0;
bch2_bkey_buf_init(&tmp);
while (nr && !ret) {
- if (!bch2_btree_node_relock(iter, iter->level))
+ if (!bch2_btree_node_relock(trans, path, path->level))
break;
bch2_btree_node_iter_advance(&node_iter, l->b);
break;
bch2_bkey_buf_unpack(&tmp, c, l->b, k);
- ret = bch2_btree_node_prefetch(c, iter, tmp.k, iter->btree_id,
- iter->level - 1);
+ ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id,
+ path->level - 1);
+ }
+
+ if (!was_locked)
+ btree_node_unlock(path, path->level);
+
+ bch2_bkey_buf_exit(&tmp, c);
+ return ret;
+}
+
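+/*
+ * Same as btree_path_prefetch(), but using the combined btree node and
+ * journal key iterator, for use before journal replay has finished:
+ */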
+static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path,
+ struct btree_and_journal_iter *jiter)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k;
+ struct bkey_buf tmp;
+ unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
+ ? (path->level > 1 ? 0 : 2)
+ : (path->level > 1 ? 1 : 16);
+ bool was_locked = btree_node_locked(path, path->level);
+ int ret = 0;
+
+ bch2_bkey_buf_init(&tmp);
+
+ while (nr && !ret) {
+ if (!bch2_btree_node_relock(trans, path, path->level))
+ break;
+
+ bch2_btree_and_journal_iter_advance(jiter);
+ k = bch2_btree_and_journal_iter_peek(jiter);
+ if (!k.k)
+ break;
+
+ bch2_bkey_buf_reassemble(&tmp, c, k);
+ ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id,
+ path->level - 1);
}
if (!was_locked)
- btree_node_unlock(iter, iter->level);
+ btree_node_unlock(path, path->level);
bch2_bkey_buf_exit(&tmp, c);
return ret;
}
-static noinline void btree_node_mem_ptr_set(struct btree_iter *iter,
+static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
+ struct btree_path *path,
unsigned plevel, struct btree *b)
{
- struct btree_iter_level *l = &iter->l[plevel];
- bool locked = btree_node_locked(iter, plevel);
+ struct btree_path_level *l = &path->l[plevel];
+ bool locked = btree_node_locked(path, plevel);
struct bkey_packed *k;
struct bch_btree_ptr_v2 *bp;
- if (!bch2_btree_node_relock(iter, plevel))
+ if (!bch2_btree_node_relock(trans, path, plevel))
return;
k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
bp->mem_ptr = (unsigned long)b;
if (!locked)
- btree_node_unlock(iter, plevel);
+ btree_node_unlock(path, plevel);
+}
+
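+/*
+ * Before journal replay is complete the btree on disk may be missing updates
+ * that only exist in the journal: peek the next key (a child node pointer)
+ * from the node iterator merged with the journal keys:
+ */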
+static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned flags,
+ struct bkey_buf *out)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_path_level *l = path_l(path);
+ struct btree_and_journal_iter jiter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos);
+
+ k = bch2_btree_and_journal_iter_peek(&jiter);
+
+ bch2_bkey_buf_reassemble(out, c, k);
+
+ if (flags & BTREE_ITER_PREFETCH)
+ ret = btree_path_prefetch_j(trans, path, &jiter);
+
+ bch2_btree_and_journal_iter_exit(&jiter);
+ return ret;
}
-static __always_inline int btree_iter_down(struct btree_trans *trans,
- struct btree_iter *iter,
+static __always_inline int btree_path_down(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned flags,
unsigned long trace_ip)
{
struct bch_fs *c = trans->c;
- struct btree_iter_level *l = &iter->l[iter->level];
+ struct btree_path_level *l = path_l(path);
struct btree *b;
- unsigned level = iter->level - 1;
- enum six_lock_type lock_type = __btree_lock_want(iter, level);
+ unsigned level = path->level - 1;
+ enum six_lock_type lock_type = __btree_lock_want(path, level);
+ bool replay_done = test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
struct bkey_buf tmp;
int ret;
- EBUG_ON(!btree_node_locked(iter, iter->level));
+ EBUG_ON(!btree_node_locked(path, path->level));
bch2_bkey_buf_init(&tmp);
- bch2_bkey_buf_unpack(&tmp, c, l->b,
- bch2_btree_node_iter_peek(&l->iter, l->b));
- b = bch2_btree_node_get(trans, iter, tmp.k, level, lock_type, trace_ip);
+ if (unlikely(!replay_done)) {
+ ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp);
+ if (ret)
+ goto err;
+ } else {
+ bch2_bkey_buf_unpack(&tmp, c, l->b,
+ bch2_btree_node_iter_peek(&l->iter, l->b));
+
+ if (flags & BTREE_ITER_PREFETCH) {
+ ret = btree_path_prefetch(trans, path);
+ if (ret)
+ goto err;
+ }
+ }
+
+ b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
ret = PTR_ERR_OR_ZERO(b);
if (unlikely(ret))
goto err;
- mark_btree_node_locked(iter, level, lock_type);
- btree_iter_node_set(iter, b);
+ mark_btree_node_locked(path, level, lock_type);
+ btree_path_level_init(trans, path, b);
- if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 &&
+ if (likely(replay_done && tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
unlikely(b != btree_node_mem_ptr(tmp.k)))
- btree_node_mem_ptr_set(iter, level + 1, b);
-
- if (iter->flags & BTREE_ITER_PREFETCH)
- ret = btree_iter_prefetch(iter);
+ btree_node_mem_ptr_set(trans, path, level + 1, b);
- if (btree_node_read_locked(iter, level + 1))
- btree_node_unlock(iter, level + 1);
- iter->level = level;
+ if (btree_node_read_locked(path, level + 1))
+ btree_node_unlock(path, level + 1);
+ path->level = level;
- bch2_btree_iter_verify_locks(iter);
+ bch2_btree_path_verify_locks(path);
err:
bch2_bkey_buf_exit(&tmp, c);
return ret;
}
-static int btree_iter_traverse_one(struct btree_iter *, unsigned long);
+static int btree_path_traverse_one(struct btree_trans *, struct btree_path *,
+ unsigned, unsigned long);
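+
+/*
+ * Traverse every path in the transaction from scratch, in sorted order,
+ * after first propagating locks_want from later to earlier paths on the
+ * same btree:
+ */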
-static int __btree_iter_traverse_all(struct btree_trans *trans, int ret,
+static int __btree_path_traverse_all(struct btree_trans *trans, int ret,
unsigned long trace_ip)
{
struct bch_fs *c = trans->c;
- struct btree_iter *iter;
+ struct btree_path *path;
int i;
if (trans->in_traverse_all)
retry_all:
trans->restarted = false;
- trans_for_each_iter(trans, iter)
- iter->should_be_locked = false;
+ trans_for_each_path(trans, path)
+ path->should_be_locked = false;
- btree_trans_sort_iters(trans);
+ btree_trans_verify_sorted(trans);
for (i = trans->nr_sorted - 2; i >= 0; --i) {
- struct btree_iter *iter1 = trans->iters + trans->sorted[i];
- struct btree_iter *iter2 = trans->iters + trans->sorted[i + 1];
-
- if (iter1->btree_id == iter2->btree_id &&
- iter1->locks_want < iter2->locks_want)
- __bch2_btree_iter_upgrade(iter1, iter2->locks_want);
- else if (!iter1->locks_want && iter2->locks_want)
- __bch2_btree_iter_upgrade(iter1, 1);
+ struct btree_path *path1 = trans->paths + trans->sorted[i];
+ struct btree_path *path2 = trans->paths + trans->sorted[i + 1];
+
+ if (path1->btree_id == path2->btree_id &&
+ path1->locks_want < path2->locks_want)
+ __bch2_btree_path_upgrade(trans, path1, path2->locks_want);
+ else if (!path1->locks_want && path2->locks_want)
+ __bch2_btree_path_upgrade(trans, path1, 1);
}
bch2_trans_unlock(trans);
} while (ret);
}
- if (unlikely(ret == -EIO)) {
- trans->error = true;
+ if (unlikely(ret == -EIO))
goto out;
- }
BUG_ON(ret && ret != -EINTR);
/* Now, redo traversals in correct order: */
- trans_for_each_iter_inorder(trans, iter) {
- EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx)));
-
- ret = btree_iter_traverse_one(iter, _THIS_IP_);
- if (ret)
- goto retry_all;
+ i = 0;
+ while (i < trans->nr_sorted) {
+ path = trans->paths + trans->sorted[i];
- EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx)));
+ /*
+ * Traversing a path can cause another path to be added at about
+ * the same position:
+ */
+ if (path->uptodate) {
+ ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_);
+ if (ret)
+ goto retry_all;
+ } else {
+ i++;
+ }
}
- trans_for_each_iter(trans, iter)
- BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
+ /*
+ * BTREE_ITER_NEED_RELOCK is ok here - if we called bch2_trans_unlock()
+ * and relock(), relock() won't relock since path->should_be_locked
+ * isn't set yet, which is all fine
+ */
+ trans_for_each_path(trans, path)
+ BUG_ON(path->uptodate >= BTREE_ITER_NEED_TRAVERSE);
out:
bch2_btree_cache_cannibalize_unlock(c);
trans->in_traverse_all = false;
- trace_trans_traverse_all(trans->ip, trace_ip);
+ trace_trans_traverse_all(trans->fn, trace_ip);
return ret;
}
-static int bch2_btree_iter_traverse_all(struct btree_trans *trans)
+static int bch2_btree_path_traverse_all(struct btree_trans *trans)
{
- return __btree_iter_traverse_all(trans, 0, _RET_IP_);
+ return __btree_path_traverse_all(trans, 0, _RET_IP_);
}
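+/*
+ * A "good" node is one we can relock and whose key range still covers the
+ * position we're searching for, in the direction given by @check_pos:
+ */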
-static inline bool btree_iter_good_node(struct btree_iter *iter,
+static inline bool btree_path_good_node(struct btree_trans *trans,
+ struct btree_path *path,
unsigned l, int check_pos)
{
- if (!is_btree_node(iter, l) ||
- !bch2_btree_node_relock(iter, l))
+ if (!is_btree_node(path, l) ||
+ !bch2_btree_node_relock(trans, path, l))
return false;
- if (check_pos < 0 && btree_iter_pos_before_node(iter, iter->l[l].b))
+ if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b))
return false;
- if (check_pos > 0 && btree_iter_pos_after_node(iter, iter->l[l].b))
+ if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b))
return false;
return true;
}
-static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter,
+static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
+ struct btree_path *path,
int check_pos)
{
- unsigned l = iter->level;
+ unsigned i, l = path->level;
- while (btree_iter_node(iter, l) &&
- !btree_iter_good_node(iter, l, check_pos)) {
- btree_node_unlock(iter, l);
- iter->l[l].b = BTREE_ITER_NO_NODE_UP;
+ while (btree_path_node(path, l) &&
+ !btree_path_good_node(trans, path, l, check_pos)) {
+ btree_node_unlock(path, l);
+ path->l[l].b = BTREE_ITER_NO_NODE_UP;
l++;
}
+ /* If we need intent locks, take them too: */
+ for (i = l + 1;
+ i < path->locks_want && btree_path_node(path, i);
+ i++)
+ if (!bch2_btree_node_relock(trans, path, i))
+ while (l <= i) {
+ btree_node_unlock(path, l);
+ path->l[l].b = BTREE_ITER_NO_NODE_UP;
+ l++;
+ }
+
return l;
}
* On error, caller (peek_node()/peek_key()) must return NULL; the error is
* stashed in the iterator and returned from bch2_trans_exit().
*/
-static int btree_iter_traverse_one(struct btree_iter *iter,
+static int btree_path_traverse_one(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned flags,
unsigned long trace_ip)
{
- struct btree_trans *trans = iter->trans;
- unsigned l, depth_want = iter->level;
+ unsigned depth_want = path->level;
int ret = 0;
+ if (unlikely(trans->restarted)) {
+ ret = -EINTR;
+ goto out;
+ }
+
/*
- * Ensure we obey iter->should_be_locked: if it's set, we can't unlock
- * and re-traverse the iterator without a transaction restart:
+ * Ensure we obey path->should_be_locked: if it's set, we can't unlock
+ * and re-traverse the path without a transaction restart:
*/
- if (iter->should_be_locked) {
- ret = bch2_btree_iter_relock(iter, trace_ip) ? 0 : -EINTR;
+ if (path->should_be_locked) {
+ ret = bch2_btree_path_relock(trans, path, trace_ip) ? 0 : -EINTR;
goto out;
}
- if (btree_iter_type(iter) == BTREE_ITER_CACHED) {
- ret = bch2_btree_iter_traverse_cached(iter);
+ if (path->cached) {
+ ret = bch2_btree_path_traverse_cached(trans, path, flags);
goto out;
}
- if (unlikely(iter->level >= BTREE_MAX_DEPTH))
+ if (unlikely(path->level >= BTREE_MAX_DEPTH))
goto out;
- iter->level = btree_iter_up_until_good_node(iter, 0);
-
- /* If we need intent locks, take them too: */
- for (l = iter->level + 1;
- l < iter->locks_want && btree_iter_node(iter, l);
- l++)
- if (!bch2_btree_node_relock(iter, l))
- while (iter->level <= l) {
- btree_node_unlock(iter, iter->level);
- iter->l[iter->level].b = BTREE_ITER_NO_NODE_UP;
- iter->level++;
- }
+ path->level = btree_path_up_until_good_node(trans, path, 0);
/*
- * Note: iter->nodes[iter->level] may be temporarily NULL here - that
+ * Note: path->nodes[path->level] may be temporarily NULL here - that
* would indicate to other code that we got to the end of the btree,
* here it indicates that relocking the root failed - it's critical that
- * btree_iter_lock_root() comes next and that it can't fail
+ * btree_path_lock_root() comes next and that it can't fail
*/
- while (iter->level > depth_want) {
- ret = btree_iter_node(iter, iter->level)
- ? btree_iter_down(trans, iter, trace_ip)
- : btree_iter_lock_root(trans, iter, depth_want, trace_ip);
+ while (path->level > depth_want) {
+ ret = btree_path_node(path, path->level)
+ ? btree_path_down(trans, path, flags, trace_ip)
+ : btree_path_lock_root(trans, path, depth_want, trace_ip);
if (unlikely(ret)) {
if (ret == 1) {
/*
- * Got to the end of the btree (in
- * BTREE_ITER_NODES mode)
+ * No nodes at this level - got to the end of
+ * the btree:
*/
ret = 0;
goto out;
}
- __bch2_btree_iter_unlock(iter);
- iter->level = depth_want;
+ __bch2_btree_path_unlock(path);
+ path->level = depth_want;
- if (ret == -EIO) {
- iter->flags |= BTREE_ITER_ERROR;
- iter->l[iter->level].b =
+ if (ret == -EIO)
+ path->l[path->level].b =
BTREE_ITER_NO_NODE_ERROR;
- } else {
- iter->l[iter->level].b =
+ else
+ path->l[path->level].b =
BTREE_ITER_NO_NODE_DOWN;
- }
goto out;
}
}
- iter->uptodate = BTREE_ITER_NEED_PEEK;
+ path->uptodate = BTREE_ITER_UPTODATE;
out:
BUG_ON((ret == -EINTR) != !!trans->restarted);
- trace_iter_traverse(trans->ip, trace_ip,
- btree_iter_type(iter) == BTREE_ITER_CACHED,
- iter->btree_id, &iter->real_pos, ret);
- bch2_btree_iter_verify(iter);
+ bch2_btree_path_verify(trans, path);
return ret;
}
-static int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
-{
- struct btree_trans *trans = iter->trans;
- int ret;
+static int __btree_path_traverse_all(struct btree_trans *, int, unsigned long);
- ret = bch2_trans_cond_resched(trans) ?:
- btree_iter_traverse_one(iter, _RET_IP_);
- if (unlikely(ret) && hweight64(trans->iters_linked) == 1) {
- ret = __btree_iter_traverse_all(trans, ret, _RET_IP_);
- BUG_ON(ret == -EINTR);
- }
+int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
+ struct btree_path *path, unsigned flags)
+{
+ if (path->uptodate < BTREE_ITER_NEED_RELOCK)
+ return 0;
- return ret;
+ return bch2_trans_cond_resched(trans) ?:
+ btree_path_traverse_one(trans, path, flags, _RET_IP_);
}
-/*
- * Note:
- * bch2_btree_iter_traverse() is for external users, btree_iter_traverse() is
- * for internal btree iterator users
- *
- * bch2_btree_iter_traverse sets iter->real_pos to iter->pos,
- * btree_iter_traverse() does not:
- */
-static inline int __must_check
-btree_iter_traverse(struct btree_iter *iter)
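+/*
+ * Duplicate @src into @dst: the copy refers to the same nodes, so take an
+ * extra reference on every node lock @src holds, then re-sort the paths:
+ */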
+static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
+ struct btree_path *src)
{
- return iter->uptodate >= BTREE_ITER_NEED_RELOCK
- ? __bch2_btree_iter_traverse(iter)
- : 0;
+ unsigned i;
+
+ memcpy(&dst->pos, &src->pos,
+ sizeof(struct btree_path) - offsetof(struct btree_path, pos));
+
+ for (i = 0; i < BTREE_MAX_DEPTH; i++)
+ if (btree_node_locked(dst, i))
+ six_lock_increment(&dst->l[i].b->c.lock,
+ __btree_lock_want(dst, i));
+
+ btree_path_check_sort(trans, dst, 0);
}
-int __must_check
-bch2_btree_iter_traverse(struct btree_iter *iter)
+static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src,
+ bool intent)
{
- int ret;
+ struct btree_path *new = btree_path_alloc(trans, src);
- btree_iter_set_search_pos(iter, btree_iter_search_key(iter));
+ btree_path_copy(trans, new, src);
+ __btree_path_get(new, intent);
+ return new;
+}
+
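+/*
+ * Copy-on-write: if @path is also in use elsewhere (ref > 1), or must be
+ * preserved for its iterator, clone it and modify the clone so other users
+ * are unaffected:
+ */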
+inline struct btree_path * __must_check
+bch2_btree_path_make_mut(struct btree_trans *trans,
+ struct btree_path *path, bool intent,
+ unsigned long ip)
+{
+ if (path->ref > 1 || path->preserve) {
+ __btree_path_put(path, intent);
+ path = btree_path_clone(trans, path, intent);
+ path->preserve = false;
+#ifdef CONFIG_BCACHEFS_DEBUG
+ path->ip_allocated = ip;
+#endif
+ btree_trans_verify_sorted(trans);
+ }
+
+ return path;
+}
+
+struct btree_path * __must_check
+bch2_btree_path_set_pos(struct btree_trans *trans,
+ struct btree_path *path, struct bpos new_pos,
+ bool intent, unsigned long ip)
+{
+ int cmp = bpos_cmp(new_pos, path->pos);
+ unsigned l = path->level;
+
+ EBUG_ON(trans->restarted);
+ EBUG_ON(!path->ref);
+
+ if (!cmp)
+ return path;
+
+ path = bch2_btree_path_make_mut(trans, path, intent, ip);
+
+ path->pos = new_pos;
+ path->should_be_locked = false;
+
+ btree_path_check_sort(trans, path, cmp);
+
+ if (unlikely(path->cached)) {
+ btree_node_unlock(path, 0);
+ path->l[0].b = BTREE_ITER_NO_NODE_CACHED;
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ goto out;
+ }
- ret = btree_iter_traverse(iter);
+ l = btree_path_up_until_good_node(trans, path, cmp);
+
+ if (btree_path_node(path, l)) {
+ /*
+ * We might have to skip over many keys, or just a few: try
+ * advancing the node iterator, and if we have to skip over too
+ * many keys just reinit it - and always reinit when rewinding, since
+ * the node iterator can only be advanced forwards.
+ */
+ if (cmp < 0 ||
+ !btree_path_advance_to_pos(path, &path->l[l], 8))
+ __btree_path_level_init(path, l);
+ }
+
+ if (l != path->level) {
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ __bch2_btree_path_unlock(path);
+ }
+out:
+ bch2_btree_path_verify(trans, path);
+ return path;
+}
+
+/* Btree path: main interface: */
+
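+/*
+ * Since trans->paths is kept sorted, a duplicate of @path can only be one of
+ * its immediate neighbours in that order - check both:
+ */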
+static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path)
+{
+ struct btree_path *next;
+
+ next = prev_btree_path(trans, path);
+ if (next && !btree_path_cmp(next, path))
+ return next;
+
+ next = next_btree_path(trans, path);
+ if (next && !btree_path_cmp(next, path))
+ return next;
+
+ return NULL;
+}
+
+static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path)
+{
+ struct btree_path *next;
+
+ next = prev_btree_path(trans, path);
+ if (next && next->level == path->level && path_l(next)->b == path_l(path)->b)
+ return next;
+
+ next = next_btree_path(trans, path);
+ if (next && next->level == path->level && path_l(next)->b == path_l(path)->b)
+ return next;
+
+ return NULL;
+}
+
+static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path)
+{
+ __bch2_btree_path_unlock(path);
+ btree_path_list_remove(trans, path);
+ trans->paths_allocated &= ~(1ULL << path->idx);
+}
+
+void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent)
+{
+ struct btree_path *dup;
+
+ EBUG_ON(trans->paths + path->idx != path);
+ EBUG_ON(!path->ref);
+
+ if (!__btree_path_put(path, intent))
+ return;
+
+ /*
+ * Perhaps instead we should check for duplicate paths in traverse_all:
+ */
+ if (path->preserve &&
+ (dup = have_path_at_pos(trans, path))) {
+ dup->preserve = true;
+ path->preserve = false;
+ goto free;
+ }
+
+ if (!path->preserve &&
+ (dup = have_node_at_pos(trans, path)))
+ goto free;
+ return;
+free:
+ if (path->should_be_locked &&
+ !btree_node_locked(dup, path->level))
+ return;
+
+ dup->should_be_locked |= path->should_be_locked;
+ __bch2_path_free(trans, path);
+}
+
+noinline __cold
+void bch2_dump_trans_paths_updates(struct btree_trans *trans)
+{
+ struct btree_path *path;
+ struct btree_insert_entry *i;
+ unsigned idx;
+ char buf1[300], buf2[300];
+
+ btree_trans_verify_sorted(trans);
+
+ trans_for_each_path_inorder(trans, path, idx)
+ printk(KERN_ERR "path: idx %u ref %u:%u%s%s btree %s pos %s locks %u %pS\n",
+ path->idx, path->ref, path->intent_ref,
+ path->should_be_locked ? " S" : "",
+ path->preserve ? " P" : "",
+ bch2_btree_ids[path->btree_id],
+ (bch2_bpos_to_text(&PBUF(buf1), path->pos), buf1),
+ path->nodes_locked,
+#ifdef CONFIG_BCACHEFS_DEBUG
+ (void *) path->ip_allocated
+#else
+ NULL
+#endif
+ );
+
+ trans_for_each_update(trans, i) {
+ struct bkey u;
+ struct bkey_s_c old = bch2_btree_path_peek_slot(i->path, &u);
+
+ printk(KERN_ERR "update: btree %s %pS\n old %s\n new %s",
+ bch2_btree_ids[i->btree_id],
+ (void *) i->ip_allocated,
+ (bch2_bkey_val_to_text(&PBUF(buf1), trans->c, old), buf1),
+ (bch2_bkey_val_to_text(&PBUF(buf2), trans->c, bkey_i_to_s_c(i->k)), buf2));
+ }
+}
+
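+/*
+ * Allocate a path from the transaction's fixed-size array;
+ * trans->paths_allocated is the bitmap of slots in use:
+ */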
+static struct btree_path *btree_path_alloc(struct btree_trans *trans,
+ struct btree_path *pos)
+{
+ struct btree_path *path;
+ unsigned idx;
+
+ if (unlikely(trans->paths_allocated ==
+ ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) {
+ bch2_dump_trans_paths_updates(trans);
+ panic("trans path oveflow\n");
+ }
+
+ idx = __ffs64(~trans->paths_allocated);
+ trans->paths_allocated |= 1ULL << idx;
+
+ path = &trans->paths[idx];
+
+ path->idx = idx;
+ path->ref = 0;
+ path->intent_ref = 0;
+ path->nodes_locked = 0;
+ path->nodes_intent_locked = 0;
+
+ btree_path_list_add(trans, pos, path);
+ return path;
+}
+
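+/*
+ * Get a path positioned at @pos: an existing path with the same btree,
+ * cached-ness and level is reused (and repositioned) in preference to
+ * allocating a new one:
+ */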
+struct btree_path *bch2_path_get(struct btree_trans *trans,
+ enum btree_id btree_id, struct bpos pos,
+ unsigned locks_want, unsigned level,
+ unsigned flags, unsigned long ip)
+{
+ struct btree_path *path, *path_pos = NULL;
+ bool cached = flags & BTREE_ITER_CACHED;
+ bool intent = flags & BTREE_ITER_INTENT;
+ int i;
+
+ BUG_ON(trans->restarted);
+
+ trans_for_each_path_inorder(trans, path, i) {
+ if (__btree_path_cmp(path,
+ btree_id,
+ cached,
+ pos,
+ level) > 0)
+ break;
+
+ path_pos = path;
+ }
+
+ if (path_pos &&
+ path_pos->cached == cached &&
+ path_pos->btree_id == btree_id &&
+ path_pos->level == level) {
+ __btree_path_get(path_pos, intent);
+ path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
+ } else {
+ path = btree_path_alloc(trans, path_pos);
+ path_pos = NULL;
+
+ __btree_path_get(path, intent);
+ path->pos = pos;
+ path->btree_id = btree_id;
+ path->cached = cached;
+ path->uptodate = BTREE_ITER_NEED_TRAVERSE;
+ path->should_be_locked = false;
+ path->level = level;
+ path->locks_want = locks_want;
+ path->nodes_locked = 0;
+ path->nodes_intent_locked = 0;
+ for (i = 0; i < ARRAY_SIZE(path->l); i++)
+ path->l[i].b = BTREE_ITER_NO_NODE_INIT;
+#ifdef CONFIG_BCACHEFS_DEBUG
+ path->ip_allocated = ip;
+#endif
+ btree_trans_verify_sorted(trans);
+ }
+
+ if (!(flags & BTREE_ITER_NOPRESERVE))
+ path->preserve = true;
+
+ if (path->intent_ref)
+ locks_want = max(locks_want, level + 1);
+
+ /*
+ * If the path has locks_want greater than requested, we don't downgrade
+ * it here: after a transaction restart caused by a btree node split that
+ * needed to upgrade locks, we might be putting/getting the iterator again.
+ * Downgrading iterators only happens via bch2_trans_downgrade(), after
+ * a successful transaction commit.
+ */
+
+ locks_want = min(locks_want, BTREE_MAX_DEPTH);
+ if (locks_want > path->locks_want) {
+ path->locks_want = locks_want;
+ btree_path_get_locks(trans, path, true);
+ }
+
+ return path;
+}
+
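+/*
+ * Returns the key @path is currently pointed at. For btree (non-cached)
+ * paths, a hole - no key at exactly path->pos - is returned as a
+ * synthesized deleted key in @u:
+ */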
+inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
+{
+ struct bkey_s_c k;
+
+ if (!path->cached) {
+ struct btree_path_level *l = path_l(path);
+ struct bkey_packed *_k;
+
+ EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
+
+ _k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
+ k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null;
+
+ EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, path->pos) == 0);
+
+ if (!k.k || bpos_cmp(path->pos, k.k->p))
+ goto hole;
+ } else {
+ struct bkey_cached *ck = (void *) path->l[0].b;
+
+ EBUG_ON(ck &&
+ (path->btree_id != ck->key.btree_id ||
+ bkey_cmp(path->pos, ck->key.pos)));
+
+ /* BTREE_ITER_CACHED_NOFILL|BTREE_ITER_CACHED_NOCREATE? */
+ if (unlikely(!ck || !ck->valid))
+ return bkey_s_c_null;
+
+ EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
+
+ k = bkey_i_to_s_c(ck->k);
+ }
+
+ return k;
+hole:
+ bkey_init(u);
+ u->p = path->pos;
+ return (struct bkey_s_c) { u, NULL };
+}
+
+/* Btree iterators: */
+
+int __must_check
+__bch2_btree_iter_traverse(struct btree_iter *iter)
+{
+ return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
+}
+
+int __must_check
+bch2_btree_iter_traverse(struct btree_iter *iter)
+{
+ int ret;
+
+ iter->path = bch2_btree_path_set_pos(iter->trans, iter->path,
+ btree_iter_search_key(iter),
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+
+ ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
if (ret)
return ret;
- iter->should_be_locked = true;
+ iter->path->should_be_locked = true;
return 0;
}
struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
{
- struct btree *b;
+ struct btree_trans *trans = iter->trans;
+ struct btree *b = NULL;
int ret;
- EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES);
+ EBUG_ON(iter->path->cached);
bch2_btree_iter_verify(iter);
- ret = btree_iter_traverse(iter);
+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (ret)
- return NULL;
+ goto err;
- b = btree_iter_node(iter, iter->level);
+ b = btree_path_node(iter->path, iter->path->level);
if (!b)
- return NULL;
+ goto out;
BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0);
- iter->pos = iter->real_pos = b->key.k.p;
+ bkey_init(&iter->k);
+ iter->k.p = iter->pos = b->key.k.p;
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+ iter->path->should_be_locked = true;
+ BUG_ON(iter->path->uptodate);
+out:
+ bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
- iter->should_be_locked = true;
return b;
+err:
+ b = ERR_PTR(ret);
+ goto out;
}
struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
{
- struct btree *b;
+ struct btree_trans *trans = iter->trans;
+ struct btree_path *path = iter->path;
+ struct btree *b = NULL;
+ unsigned l;
int ret;
- EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES);
+ BUG_ON(trans->restarted);
+ EBUG_ON(iter->path->cached);
bch2_btree_iter_verify(iter);
- /* already got to end? */
- if (!btree_iter_node(iter, iter->level))
- return NULL;
-
- bch2_trans_cond_resched(iter->trans);
-
- btree_node_unlock(iter, iter->level);
- iter->l[iter->level].b = BTREE_ITER_NO_NODE_UP;
- iter->level++;
-
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
- ret = btree_iter_traverse(iter);
- if (ret)
+ /* already at end? */
+ if (!btree_path_node(path, path->level))
return NULL;
/* got to end? */
- b = btree_iter_node(iter, iter->level);
- if (!b)
+ if (!btree_path_node(path, path->level + 1)) {
+ btree_node_unlock(path, path->level);
+ path->l[path->level].b = BTREE_ITER_NO_NODE_UP;
+ path->level++;
return NULL;
+ }
+
+ if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
+ __bch2_btree_path_unlock(path);
+ path->l[path->level].b = BTREE_ITER_NO_NODE_GET_LOCKS;
+ path->l[path->level + 1].b = BTREE_ITER_NO_NODE_GET_LOCKS;
+ trace_trans_restart_relock_next_node(trans->fn, _THIS_IP_,
+ path->btree_id, &path->pos);
+ btree_trans_restart(trans);
+ ret = -EINTR;
+ goto err;
+ }
+
+ b = btree_path_node(path, path->level + 1);
- if (bpos_cmp(iter->pos, b->key.k.p) < 0) {
+ if (!bpos_cmp(iter->pos, b->key.k.p)) {
+ btree_node_unlock(path, path->level);
+ path->l[path->level].b = BTREE_ITER_NO_NODE_UP;
+ path->level++;
+ } else {
/*
* Haven't gotten to the end of the parent node: go back down to
* the next child node
*/
- btree_iter_set_search_pos(iter, bpos_successor(iter->pos));
+ path = iter->path =
+ bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos),
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+
+ path->level = iter->min_depth;
- /* Unlock to avoid screwing up our lock invariants: */
- btree_node_unlock(iter, iter->level);
+ for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++)
+ if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
+ btree_node_unlock(path, l);
- iter->level = iter->min_depth;
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
bch2_btree_iter_verify(iter);
- ret = btree_iter_traverse(iter);
+ ret = bch2_btree_path_traverse(trans, path, iter->flags);
if (ret)
- return NULL;
+ goto err;
- b = iter->l[iter->level].b;
+ b = path->l[path->level].b;
}
- iter->pos = iter->real_pos = b->key.k.p;
+ bkey_init(&iter->k);
+ iter->k.p = iter->pos = b->key.k.p;
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+ iter->path->should_be_locked = true;
+ BUG_ON(iter->path->uptodate);
+out:
+ bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
- iter->should_be_locked = true;
return b;
+err:
+ b = ERR_PTR(ret);
+ goto out;
}
/* Iterate across keys (in leaf nodes only) */
-static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- struct bpos old_pos = iter->real_pos;
-#endif
- int cmp = bpos_cmp(new_pos, iter->real_pos);
- unsigned l = iter->level;
-
- EBUG_ON(iter->trans->restarted);
-
- if (!cmp)
- goto out;
-
- iter->real_pos = new_pos;
- iter->should_be_locked = false;
-
- btree_iter_check_sort(iter->trans, iter);
-
- if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) {
- btree_node_unlock(iter, 0);
- iter->l[0].b = BTREE_ITER_NO_NODE_CACHED;
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
- return;
- }
-
- l = btree_iter_up_until_good_node(iter, cmp);
-
- if (btree_iter_node(iter, l)) {
- /*
- * We might have to skip over many keys, or just a few: try
- * advancing the node iterator, and if we have to skip over too
- * many keys just reinit it (or if we're rewinding, since that
- * is expensive).
- */
- if (cmp < 0 ||
- !btree_iter_advance_to_pos(iter, &iter->l[l], 8))
- __btree_iter_init(iter, l);
-
- /* Don't leave it locked if we're not supposed to: */
- if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED)
- btree_node_unlock(iter, l);
- }
-out:
- if (l != iter->level)
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
- else
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
-
- bch2_btree_iter_verify(iter);
-#ifdef CONFIG_BCACHEFS_DEBUG
- trace_iter_set_search_pos(iter->trans->ip, _RET_IP_,
- iter->btree_id,
- &old_pos, &new_pos, l);
-#endif
-}
-
inline bool bch2_btree_iter_advance(struct btree_iter *iter)
{
struct bpos pos = iter->k.p;
- bool ret = bpos_cmp(pos, SPOS_MAX) != 0;
+ bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS
+ ? bpos_cmp(pos, SPOS_MAX)
+ : bkey_cmp(pos, SPOS_MAX)) != 0;
if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
pos = bkey_successor(iter, pos);
return ret;
}
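+/*
+ * trans->updates is kept sorted by btree and position: return the first
+ * pending update in @btree_id at or after @pos, if any:
+ */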
-static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter)
+static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans,
+ enum btree_id btree_id,
+ struct bpos pos)
{
- struct bpos next_pos = iter->l[0].b->key.k.p;
- bool ret = bpos_cmp(next_pos, SPOS_MAX) != 0;
+ struct btree_insert_entry *i;
- /*
- * Typically, we don't want to modify iter->pos here, since that
- * indicates where we searched from - unless we got to the end of the
- * btree, in that case we want iter->pos to reflect that:
- */
- if (ret)
- btree_iter_set_search_pos(iter, bpos_successor(next_pos));
- else
- bch2_btree_iter_set_pos(iter, SPOS_MAX);
+ trans_for_each_update(trans, i)
+ if ((cmp_int(btree_id, i->btree_id) ?:
+ bpos_cmp(pos, i->k->k.p)) <= 0) {
+ if (btree_id == i->btree_id)
+ return i->k;
+ break;
+ }
- return ret;
+ return NULL;
}
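+/*
+ * Return the next journal key at or after path->pos that hasn't been
+ * overwritten, for iteration before journal replay has finished:
+ */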
-static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter)
+static noinline
+struct bkey_i *__btree_trans_peek_journal(struct btree_trans *trans,
+ struct btree_path *path)
{
- struct bpos next_pos = iter->l[0].b->data->min_key;
- bool ret = bpos_cmp(next_pos, POS_MIN) != 0;
+ struct journal_keys *keys = &trans->c->journal_keys;
+ size_t idx = bch2_journal_key_search(keys, path->btree_id,
+ path->level, path->pos);
- if (ret)
- btree_iter_set_search_pos(iter, bpos_predecessor(next_pos));
- else
- bch2_btree_iter_set_pos(iter, POS_MIN);
+ while (idx < keys->nr && keys->d[idx].overwritten)
+ idx++;
- return ret;
+ return (idx < keys->nr &&
+ keys->d[idx].btree_id == path->btree_id &&
+ keys->d[idx].level == path->level)
+ ? keys->d[idx].k
+ : NULL;
}
-static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter,
- struct bpos pos)
+static noinline
+struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
{
- struct btree_insert_entry *i;
+ struct bkey_i *next_journal =
+ __btree_trans_peek_journal(trans, iter->path);
- if (!(iter->flags & BTREE_ITER_WITH_UPDATES))
- return NULL;
+ if (next_journal &&
+ bpos_cmp(next_journal->k.p,
+ k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) {
+ iter->k = next_journal->k;
+ k = bkey_i_to_s_c(next_journal);
+ }
- trans_for_each_update(iter->trans, i)
- if ((cmp_int(iter->btree_id, i->iter->btree_id) ?:
- bkey_cmp(pos, i->k->k.p)) <= 0) {
- if (iter->btree_id == i->iter->btree_id)
- return i->k;
+ return k;
+}
+
+/*
+ * Checks the btree key cache for a key at @pos and returns it if present, or
+ * bkey_s_c_null:
+ */
+static noinline
+struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
+{
+ struct btree_trans *trans = iter->trans;
+ struct bch_fs *c = trans->c;
+ struct bkey u;
+ int ret;
+
+ if (!bch2_btree_key_cache_find(c, iter->btree_id, pos))
+ return bkey_s_c_null;
+
+ if (!iter->key_cache_path)
+ iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos,
+ iter->flags & BTREE_ITER_INTENT, 0,
+ iter->flags|BTREE_ITER_CACHED,
+ _THIS_IP_);
+
+ iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos,
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+
+ ret = bch2_btree_path_traverse(trans, iter->key_cache_path, iter->flags|BTREE_ITER_CACHED);
+ if (unlikely(ret))
+ return bkey_s_c_err(ret);
+
+ iter->key_cache_path->should_be_locked = true;
+
+ return bch2_btree_path_peek_slot(iter->key_cache_path, &u);
+}
+
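+/*
+ * Core of peek: return the first key >= @search_key, merging in pending
+ * updates, journal keys and the key cache according to iter->flags, and
+ * stepping past whiteouts:
+ */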
+static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
+{
+ struct btree_trans *trans = iter->trans;
+ struct bkey_i *next_update;
+ struct bkey_s_c k, k2;
+ int ret;
+
+ EBUG_ON(iter->path->cached || iter->path->level);
+ bch2_btree_iter_verify(iter);
+
+ while (1) {
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+
+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
+ if (unlikely(ret)) {
+ /* ensure that iter->k is consistent with iter->pos: */
+ bch2_btree_iter_set_pos(iter, iter->pos);
+ k = bkey_s_c_err(ret);
+ goto out;
+ }
+
+ iter->path->should_be_locked = true;
+
+ k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k);
+
+ if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
+ k.k &&
+ (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
+ ret = bkey_err(k2);
+ if (ret) {
+ k = k2;
+ bch2_btree_iter_set_pos(iter, iter->pos);
+ goto out;
+ }
+
+ k = k2;
+ iter->k = *k.k;
+ }
+
+ if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL))
+ k = btree_trans_peek_journal(trans, iter, k);
+
+ next_update = iter->flags & BTREE_ITER_WITH_UPDATES
+ ? btree_trans_peek_updates(trans, iter->btree_id, search_key)
+ : NULL;
+ if (next_update &&
+ bpos_cmp(next_update->k.p,
+ k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) {
+ iter->k = next_update->k;
+ k = bkey_i_to_s_c(next_update);
+ }
+
+ if (k.k && bkey_deleted(k.k)) {
+ /*
+ * If we've got a whiteout, and it's after the search
+ * key, advance the search key to the whiteout instead
+ * of just after the whiteout - it might be a btree
+ * whiteout, with a real key at the same position, since
+ * in the btree, deleted keys sort before non-deleted keys.
+ */
+ search_key = bpos_cmp(search_key, k.k->p)
+ ? k.k->p
+ : bpos_successor(k.k->p);
+ continue;
+ }
+
+ if (likely(k.k)) {
break;
+ } else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) {
+ /* Advance to next leaf node: */
+ search_key = bpos_successor(iter->path->l[0].b->key.k.p);
+ } else {
+ /* End of btree: */
+ bch2_btree_iter_set_pos(iter, SPOS_MAX);
+ k = bkey_s_c_null;
+ goto out;
}
+ }
+out:
+ bch2_btree_iter_verify(iter);
- return NULL;
+ return k;
}
/**
*/
struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
{
+ struct btree_trans *trans = iter->trans;
struct bpos search_key = btree_iter_search_key(iter);
- struct bkey_i *next_update;
struct bkey_s_c k;
int ret;
- EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
- bch2_btree_iter_verify(iter);
+ if (iter->update_path) {
+ bch2_path_put(trans, iter->update_path,
+ iter->flags & BTREE_ITER_INTENT);
+ iter->update_path = NULL;
+ }
+
bch2_btree_iter_verify_entry_exit(iter);
-start:
- next_update = btree_trans_peek_updates(iter, search_key);
- btree_iter_set_search_pos(iter, search_key);
while (1) {
- ret = btree_iter_traverse(iter);
- if (unlikely(ret))
- return bkey_s_c_err(ret);
+ k = __bch2_btree_iter_peek(iter, search_key);
+ if (!k.k || bkey_err(k))
+ goto out;
- k = btree_iter_level_peek(iter, &iter->l[0]);
+ if (iter->update_path &&
+ bkey_cmp(iter->update_path->pos, k.k->p)) {
+ bch2_path_put(trans, iter->update_path,
+ iter->flags & BTREE_ITER_INTENT);
+ iter->update_path = NULL;
+ }
- if (next_update &&
- bpos_cmp(next_update->k.p, iter->real_pos) <= 0) {
- iter->k = next_update->k;
- k = bkey_i_to_s_c(next_update);
+ if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
+ (iter->flags & BTREE_ITER_INTENT) &&
+ !(iter->flags & BTREE_ITER_IS_EXTENTS) &&
+ !iter->update_path) {
+ struct bpos pos = k.k->p;
+
+ if (pos.snapshot < iter->snapshot) {
+ search_key = bpos_successor(k.k->p);
+ continue;
+ }
+
+ pos.snapshot = iter->snapshot;
+
+ /*
+ * Same position that iter->path will be set to on exit,
+ * but in the snapshot we're searching in rather than the
+ * snapshot of the key we found:
+ */
+ __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT);
+ iter->update_path = iter->path;
+
+ iter->update_path = bch2_btree_path_set_pos(trans,
+ iter->update_path, pos,
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+
+ BUG_ON(!(iter->update_path->nodes_locked & 1));
+ iter->update_path->should_be_locked = true;
+ }
+
+ /*
+ * We can never have a key in a leaf node at POS_MAX, so
+ * we don't have to check these successor() calls:
+ */
+ if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
+ !bch2_snapshot_is_ancestor(trans->c,
+ iter->snapshot,
+ k.k->p.snapshot)) {
+ search_key = bpos_successor(k.k->p);
+ continue;
}
- if (likely(k.k)) {
- if (bkey_deleted(k.k)) {
- search_key = bkey_successor(iter, k.k->p);
- goto start;
- }
-
- break;
+ if (bkey_whiteout(k.k) &&
+ !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
+ search_key = bkey_successor(iter, k.k->p);
+ continue;
}
- if (!btree_iter_set_pos_to_next_leaf(iter))
- return bkey_s_c_null;
+ break;
}
/*
else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
iter->pos = bkey_start_pos(k.k);
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p,
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+ BUG_ON(!iter->path->nodes_locked);
+out:
+ if (iter->update_path) {
+ BUG_ON(!(iter->update_path->nodes_locked & 1));
+ iter->update_path->should_be_locked = true;
+ }
+ iter->path->should_be_locked = true;
+
+ if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
+ iter->pos.snapshot = iter->snapshot;
+
+ ret = bch2_btree_iter_verify_ret(iter, k);
+ if (unlikely(ret)) {
+ bch2_btree_iter_set_pos(iter, iter->pos);
+ k = bkey_s_c_err(ret);
+ }
+
bch2_btree_iter_verify_entry_exit(iter);
- bch2_btree_iter_verify(iter);
- iter->should_be_locked = true;
+
return k;
}
*/
struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
{
- struct btree_iter_level *l = &iter->l[0];
+ struct btree_trans *trans = iter->trans;
+ struct bpos search_key = iter->pos;
+ struct btree_path *saved_path = NULL;
struct bkey_s_c k;
+ struct bkey saved_k;
+ const struct bch_val *saved_v;
int ret;
- EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
+ EBUG_ON(iter->path->cached || iter->path->level);
EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES);
+
+ if (iter->flags & BTREE_ITER_WITH_JOURNAL)
+ return bkey_s_c_err(-EIO);
+
bch2_btree_iter_verify(iter);
bch2_btree_iter_verify_entry_exit(iter);
- btree_iter_set_search_pos(iter, iter->pos);
+ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+ search_key.snapshot = U32_MAX;
while (1) {
- ret = btree_iter_traverse(iter);
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+
+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (unlikely(ret)) {
+ /* ensure that iter->k is consistent with iter->pos: */
+ bch2_btree_iter_set_pos(iter, iter->pos);
k = bkey_s_c_err(ret);
- goto no_key;
+ goto out;
}
- k = btree_iter_level_peek(iter, l);
+ k = btree_path_level_peek(trans->c, iter->path,
+ &iter->path->l[0], &iter->k);
if (!k.k ||
((iter->flags & BTREE_ITER_IS_EXTENTS)
- ? bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0
- : bkey_cmp(k.k->p, iter->pos) > 0))
- k = btree_iter_level_prev(iter, l);
+ ? bpos_cmp(bkey_start_pos(k.k), search_key) >= 0
+ : bpos_cmp(k.k->p, search_key) > 0))
+ k = btree_path_level_prev(trans->c, iter->path,
+ &iter->path->l[0], &iter->k);
- if (likely(k.k))
- break;
+ btree_path_check_sort(trans, iter->path, 0);
+
+ if (likely(k.k)) {
+ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) {
+ if (k.k->p.snapshot == iter->snapshot)
+ goto got_key;
+
+ /*
+ * If we have a saved candidate, and we're no
+ * longer at the same _key_ (not pos), return
+ * that candidate
+ */
+ if (saved_path && bkey_cmp(k.k->p, saved_k.p)) {
+ bch2_path_put(trans, iter->path,
+ iter->flags & BTREE_ITER_INTENT);
+ iter->path = saved_path;
+ saved_path = NULL;
+ iter->k = saved_k;
+ k.v = saved_v;
+ goto got_key;
+ }
+
+ if (bch2_snapshot_is_ancestor(iter->trans->c,
+ iter->snapshot,
+ k.k->p.snapshot)) {
+ if (saved_path)
+ bch2_path_put(trans, saved_path,
+ iter->flags & BTREE_ITER_INTENT);
+ saved_path = btree_path_clone(trans, iter->path,
+ iter->flags & BTREE_ITER_INTENT);
+ saved_k = *k.k;
+ saved_v = k.v;
+ }
+
+ search_key = bpos_predecessor(k.k->p);
+ continue;
+ }
+got_key:
+ if (bkey_whiteout(k.k) &&
+ !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
+ search_key = bkey_predecessor(iter, k.k->p);
+ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+ search_key.snapshot = U32_MAX;
+ continue;
+ }
- if (!btree_iter_set_pos_to_prev_leaf(iter)) {
+ break;
+ } else if (likely(bpos_cmp(iter->path->l[0].b->data->min_key, POS_MIN))) {
+ /* Advance to previous leaf node: */
+ search_key = bpos_predecessor(iter->path->l[0].b->data->min_key);
+ } else {
+ /* Start of btree: */
+ bch2_btree_iter_set_pos(iter, POS_MIN);
k = bkey_s_c_null;
- goto no_key;
+ goto out;
}
}
/* Extents can straddle iter->pos: */
if (bkey_cmp(k.k->p, iter->pos) < 0)
iter->pos = k.k->p;
+
+ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+ iter->pos.snapshot = iter->snapshot;
out:
+ if (saved_path)
+ bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_INTENT);
+ iter->path->should_be_locked = true;
+
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
- iter->should_be_locked = true;
+
return k;
-no_key:
- /*
- * btree_iter_level_peek() may have set iter->k to a key we didn't want, and
- * then we errored going to the previous leaf - make sure it's
- * consistent with iter->pos:
- */
- bkey_init(&iter->k);
- iter->k.p = iter->pos;
- goto out;
}
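
The subtle part above is the snapshot filtering on the backwards scan: a version of the key in an ancestor snapshot is saved as a candidate (with its path cloned so the locks survive), and the candidate is only returned once the scan has moved off that key entirely. A standalone sketch of the idea, with bcachefs types reduced to a toy (pos, snapshot) pair and the ancestor check left as a stub — illustrative only, not the actual API:

	#include <stdbool.h>

	struct kv { unsigned pos, snapshot; };

	/* stub: true if snapshot @anc is an ancestor of @desc */
	static bool is_ancestor(unsigned anc, unsigned desc);

	/*
	 * keys[] sorted by (pos, snapshot); walk it in reverse and return
	 * the version visible in @snapshot:
	 */
	static const struct kv *peek_prev_filtered(const struct kv *keys,
						   int n, unsigned snapshot)
	{
		const struct kv *saved = NULL;
		int i;

		for (i = n - 1; i >= 0; i--) {
			if (keys[i].snapshot == snapshot)
				return &keys[i];

			/* scan has left the saved candidate's key: it wins */
			if (saved && keys[i].pos != saved->pos)
				return saved;

			if (is_ancestor(keys[i].snapshot, snapshot))
				saved = &keys[i];
		}
		return saved;
	}
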
/**
struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
{
+ struct btree_trans *trans = iter->trans;
struct bpos search_key;
struct bkey_s_c k;
int ret;
- EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS &&
- btree_iter_type(iter) != BTREE_ITER_CACHED);
+ EBUG_ON(iter->path->level);
bch2_btree_iter_verify(iter);
bch2_btree_iter_verify_entry_exit(iter);
}
search_key = btree_iter_search_key(iter);
- btree_iter_set_search_pos(iter, search_key);
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
- ret = btree_iter_traverse(iter);
+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (unlikely(ret))
return bkey_s_c_err(ret);
- if (btree_iter_type(iter) == BTREE_ITER_CACHED ||
- !(iter->flags & BTREE_ITER_IS_EXTENTS)) {
+ if ((iter->flags & BTREE_ITER_CACHED) ||
+ !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
struct bkey_i *next_update;
- struct bkey_cached *ck;
- switch (btree_iter_type(iter)) {
- case BTREE_ITER_KEYS:
- k = btree_iter_level_peek_all(iter, &iter->l[0]);
- EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, iter->pos) == 0);
- break;
- case BTREE_ITER_CACHED:
- ck = (void *) iter->l[0].b;
- EBUG_ON(iter->btree_id != ck->key.btree_id ||
- bkey_cmp(iter->pos, ck->key.pos));
- BUG_ON(!ck->valid);
-
- k = bkey_i_to_s_c(ck->k);
- break;
- case BTREE_ITER_NODES:
- BUG();
+ if ((iter->flags & BTREE_ITER_WITH_UPDATES) &&
+ (next_update = btree_trans_peek_updates(trans,
+ iter->btree_id, search_key)) &&
+ !bpos_cmp(next_update->k.p, iter->pos)) {
+ iter->k = next_update->k;
+ k = bkey_i_to_s_c(next_update);
+ goto out;
}
- next_update = btree_trans_peek_updates(iter, search_key);
- if (next_update &&
- (!k.k || bpos_cmp(next_update->k.p, k.k->p) <= 0)) {
+ if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) &&
+ (next_update = __btree_trans_peek_journal(trans, iter->path)) &&
+ !bpos_cmp(next_update->k.p, iter->pos)) {
iter->k = next_update->k;
k = bkey_i_to_s_c(next_update);
+ goto out;
}
+
+ if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
+ (k = btree_trans_peek_key_cache(iter, iter->pos)).k) {
+ if (!bkey_err(k))
+ iter->k = *k.k;
+ goto out;
+ }
+
+ k = bch2_btree_path_peek_slot(iter->path, &iter->k);
} else {
- if ((iter->flags & BTREE_ITER_INTENT)) {
- struct btree_iter *child =
- btree_iter_child_alloc(iter, _THIS_IP_);
+ struct bpos next;
- btree_iter_copy(child, iter);
- k = bch2_btree_iter_peek(child);
+ if (iter->flags & BTREE_ITER_INTENT) {
+ struct btree_iter iter2;
- if (k.k && !bkey_err(k))
- iter->k = child->k;
+ bch2_trans_copy_iter(&iter2, iter);
+ k = bch2_btree_iter_peek(&iter2);
+
+ if (k.k && !bkey_err(k)) {
+ iter->k = iter2.k;
+ k.k = &iter->k;
+ }
+ bch2_trans_iter_exit(trans, &iter2);
} else {
struct bpos pos = iter->pos;
if (unlikely(bkey_err(k)))
return k;
- }
- if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) {
- if (!k.k ||
- ((iter->flags & BTREE_ITER_ALL_SNAPSHOTS)
- ? bpos_cmp(iter->pos, k.k->p)
- : bkey_cmp(iter->pos, k.k->p))) {
- bkey_init(&iter->k);
- iter->k.p = iter->pos;
- k = (struct bkey_s_c) { &iter->k, NULL };
- }
- } else {
- struct bpos next = k.k ? bkey_start_pos(k.k) : POS_MAX;
+ next = k.k ? bkey_start_pos(k.k) : POS_MAX;
if (bkey_cmp(iter->pos, next) < 0) {
bkey_init(&iter->k);
iter->k.p = iter->pos;
- bch2_key_resize(&iter->k,
- min_t(u64, KEY_SIZE_MAX,
- (next.inode == iter->pos.inode
- ? next.offset
- : KEY_OFFSET_MAX) -
- iter->pos.offset));
+
+ if (iter->flags & BTREE_ITER_IS_EXTENTS) {
+ bch2_key_resize(&iter->k,
+ min_t(u64, KEY_SIZE_MAX,
+ (next.inode == iter->pos.inode
+ ? next.offset
+ : KEY_OFFSET_MAX) -
+ iter->pos.offset));
+ EBUG_ON(!iter->k.size);
+ }
k = (struct bkey_s_c) { &iter->k, NULL };
- EBUG_ON(!k.k->size);
}
}
+out:
+ iter->path->should_be_locked = true;
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
- iter->should_be_locked = true;
+ ret = bch2_btree_iter_verify_ret(iter, k);
+ if (unlikely(ret))
+ return bkey_s_c_err(ret);
return k;
}
return bch2_btree_iter_peek_slot(iter);
}
-static inline void bch2_btree_iter_init(struct btree_trans *trans,
- struct btree_iter *iter, enum btree_id btree_id)
-{
- struct bch_fs *c = trans->c;
- unsigned i;
-
- iter->trans = trans;
- iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
- iter->btree_id = btree_id;
- iter->real_pos = POS_MIN;
- iter->level = 0;
- iter->min_depth = 0;
- iter->locks_want = 0;
- iter->nodes_locked = 0;
- iter->nodes_intent_locked = 0;
- for (i = 0; i < ARRAY_SIZE(iter->l); i++)
- iter->l[i].b = BTREE_ITER_NO_NODE_INIT;
-
- prefetch(c->btree_roots[btree_id].b);
-}
-
/* new transactional stuff: */
-static inline void btree_iter_verify_sorted_ref(struct btree_trans *trans,
- struct btree_iter *iter)
+static inline void btree_path_verify_sorted_ref(struct btree_trans *trans,
+ struct btree_path *path)
{
- EBUG_ON(iter->sorted_idx >= trans->nr_sorted);
- EBUG_ON(trans->sorted[iter->sorted_idx] != iter->idx);
- EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx)));
+ EBUG_ON(path->sorted_idx >= trans->nr_sorted);
+ EBUG_ON(trans->sorted[path->sorted_idx] != path->idx);
+ EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
}
static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans)
unsigned i;
for (i = 0; i < trans->nr_sorted; i++)
- btree_iter_verify_sorted_ref(trans, trans->iters + trans->sorted[i]);
+ btree_path_verify_sorted_ref(trans, trans->paths + trans->sorted[i]);
#endif
}
-static inline void btree_trans_verify_sorted(struct btree_trans *trans)
+static void btree_trans_verify_sorted(struct btree_trans *trans)
{
#ifdef CONFIG_BCACHEFS_DEBUG
- struct btree_iter *iter, *prev = NULL;
+ struct btree_path *path, *prev = NULL;
+ unsigned i;
- trans_for_each_iter_inorder(trans, iter)
- BUG_ON(prev && btree_iter_cmp(prev, iter) > 0);
+ trans_for_each_path_inorder(trans, path, i) {
+ BUG_ON(prev && btree_path_cmp(prev, path) > 0);
+ prev = path;
+ }
#endif
}
-static inline void btree_iter_swap(struct btree_trans *trans,
- struct btree_iter *l, struct btree_iter *r)
+static inline void btree_path_swap(struct btree_trans *trans,
+ struct btree_path *l, struct btree_path *r)
{
swap(l->sorted_idx, r->sorted_idx);
swap(trans->sorted[l->sorted_idx],
trans->sorted[r->sorted_idx]);
- btree_iter_verify_sorted_ref(trans, l);
- btree_iter_verify_sorted_ref(trans, r);
+ btree_path_verify_sorted_ref(trans, l);
+ btree_path_verify_sorted_ref(trans, r);
}
-static void btree_trans_sort_iters(struct btree_trans *trans)
+static void btree_path_check_sort(struct btree_trans *trans, struct btree_path *path,
+ int cmp)
{
- bool swapped = false;
- int i, l = 0, r = trans->nr_sorted;
-
- while (1) {
- for (i = l; i + 1 < r; i++) {
- if (btree_iter_cmp(trans->iters + trans->sorted[i],
- trans->iters + trans->sorted[i + 1]) > 0) {
- swap(trans->sorted[i], trans->sorted[i + 1]);
- trans->iters[trans->sorted[i]].sorted_idx = i;
- trans->iters[trans->sorted[i + 1]].sorted_idx = i + 1;
- swapped = true;
- }
- }
+ struct btree_path *n;
- if (!swapped)
- break;
+ if (cmp <= 0) {
+ n = prev_btree_path(trans, path);
+ if (n && btree_path_cmp(n, path) > 0) {
+ do {
+ btree_path_swap(trans, n, path);
+ n = prev_btree_path(trans, path);
+ } while (n && btree_path_cmp(n, path) > 0);
- r--;
- swapped = false;
-
- for (i = r - 2; i >= l; --i) {
- if (btree_iter_cmp(trans->iters + trans->sorted[i],
- trans->iters + trans->sorted[i + 1]) > 0) {
- swap(trans->sorted[i],
- trans->sorted[i + 1]);
- trans->iters[trans->sorted[i]].sorted_idx = i;
- trans->iters[trans->sorted[i + 1]].sorted_idx = i + 1;
- swapped = true;
- }
+ goto out;
}
-
- if (!swapped)
- break;
-
- l++;
- swapped = false;
- }
-
- btree_trans_verify_sorted_refs(trans);
- btree_trans_verify_sorted(trans);
-}
-
-static void btree_iter_check_sort(struct btree_trans *trans, struct btree_iter *iter)
-{
- struct btree_iter *n;
-
- EBUG_ON(iter->sorted_idx == U8_MAX);
-
- n = next_btree_iter(trans, iter);
- if (n && btree_iter_cmp(iter, n) > 0) {
- do {
- btree_iter_swap(trans, iter, n);
- n = next_btree_iter(trans, iter);
- } while (n && btree_iter_cmp(iter, n) > 0);
-
- return;
}
- n = prev_btree_iter(trans, iter);
- if (n && btree_iter_cmp(n, iter) > 0) {
- do {
- btree_iter_swap(trans, n, iter);
- n = prev_btree_iter(trans, iter);
- } while (n && btree_iter_cmp(n, iter) > 0);
+ if (cmp >= 0) {
+ n = next_btree_path(trans, path);
+ if (n && btree_path_cmp(path, n) > 0) {
+ do {
+ btree_path_swap(trans, path, n);
+ n = next_btree_path(trans, path);
+ } while (n && btree_path_cmp(path, n) > 0);
+ }
}
-
+out:
btree_trans_verify_sorted(trans);
}
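
Since only @path moved, btree_path_check_sort() restores order by bubbling the one displaced entry past its neighbours in the direction indicated by @cmp, instead of re-sorting the whole array as the deleted btree_trans_sort_iters() did. The same technique on a plain int array, as a self-contained toy:

	#include <stdio.h>

	static void swap_ints(int *a, int *b)
	{
		int t = *a; *a = *b; *b = t;
	}

	/* a[] is sorted except that a[i] just changed: bubble it into place */
	static void bubble_into_place(int *a, int n, int i)
	{
		while (i > 0 && a[i - 1] > a[i]) {	/* moved left? */
			swap_ints(&a[i - 1], &a[i]);
			i--;
		}
		while (i + 1 < n && a[i] > a[i + 1]) {	/* moved right? */
			swap_ints(&a[i], &a[i + 1]);
			i++;
		}
	}

	int main(void)
	{
		int a[] = { 1, 9, 3, 5, 7 }, i;		/* a[1] changed to 9 */

		bubble_into_place(a, 5, 1);
		for (i = 0; i < 5; i++)
			printf("%d ", a[i]);		/* prints: 1 3 5 7 9 */
		printf("\n");
		return 0;
	}
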
-static inline void btree_iter_list_remove(struct btree_trans *trans,
- struct btree_iter *iter)
+static inline void btree_path_list_remove(struct btree_trans *trans,
+ struct btree_path *path)
{
unsigned i;
- EBUG_ON(iter->sorted_idx >= trans->nr_sorted);
+ EBUG_ON(path->sorted_idx >= trans->nr_sorted);
- array_remove_item(trans->sorted, trans->nr_sorted, iter->sorted_idx);
+ array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx);
- for (i = iter->sorted_idx; i < trans->nr_sorted; i++)
- trans->iters[trans->sorted[i]].sorted_idx = i;
+ for (i = path->sorted_idx; i < trans->nr_sorted; i++)
+ trans->paths[trans->sorted[i]].sorted_idx = i;
- iter->sorted_idx = U8_MAX;
+ path->sorted_idx = U8_MAX;
btree_trans_verify_sorted_refs(trans);
}
-static inline void btree_iter_list_add(struct btree_trans *trans,
- struct btree_iter *pos,
- struct btree_iter *iter)
+static inline void btree_path_list_add(struct btree_trans *trans,
+ struct btree_path *pos,
+ struct btree_path *path)
{
unsigned i;
btree_trans_verify_sorted_refs(trans);
- iter->sorted_idx = pos ? pos->sorted_idx : trans->nr_sorted;
+ path->sorted_idx = pos ? pos->sorted_idx + 1 : 0;
- array_insert_item(trans->sorted, trans->nr_sorted, iter->sorted_idx, iter->idx);
+ array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx);
- for (i = iter->sorted_idx; i < trans->nr_sorted; i++)
- trans->iters[trans->sorted[i]].sorted_idx = i;
+ for (i = path->sorted_idx; i < trans->nr_sorted; i++)
+ trans->paths[trans->sorted[i]].sorted_idx = i;
btree_trans_verify_sorted_refs(trans);
}
-static void btree_iter_child_free(struct btree_iter *iter)
-{
- struct btree_iter *child = btree_iter_child(iter);
-
- if (child) {
- bch2_trans_iter_free(iter->trans, child);
- iter->child_idx = U8_MAX;
- }
-}
-
-static struct btree_iter *btree_iter_child_alloc(struct btree_iter *iter,
- unsigned long ip)
-{
- struct btree_trans *trans = iter->trans;
- struct btree_iter *child = btree_iter_child(iter);
-
- if (!child) {
- child = btree_trans_iter_alloc(trans, iter);
- child->ip_allocated = ip;
- iter->child_idx = child->idx;
-
- trans->iters_live |= 1ULL << child->idx;
- trans->iters_touched |= 1ULL << child->idx;
- }
-
- return child;
-}
-
-static inline void __bch2_trans_iter_free(struct btree_trans *trans,
- unsigned idx)
-{
- btree_iter_child_free(&trans->iters[idx]);
-
- btree_iter_list_remove(trans, &trans->iters[idx]);
-
- __bch2_btree_iter_unlock(&trans->iters[idx]);
- trans->iters_linked &= ~(1ULL << idx);
- trans->iters_live &= ~(1ULL << idx);
- trans->iters_touched &= ~(1ULL << idx);
-}
-
-int bch2_trans_iter_put(struct btree_trans *trans,
- struct btree_iter *iter)
-{
- int ret;
-
- if (IS_ERR_OR_NULL(iter))
- return 0;
-
- BUG_ON(trans->iters + iter->idx != iter);
- BUG_ON(!btree_iter_live(trans, iter));
-
- ret = btree_iter_err(iter);
-
- if (!(trans->iters_touched & (1ULL << iter->idx)) &&
- !(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT))
- __bch2_trans_iter_free(trans, iter->idx);
-
- trans->iters_live &= ~(1ULL << iter->idx);
- return ret;
-}
-
-int bch2_trans_iter_free(struct btree_trans *trans,
- struct btree_iter *iter)
-{
- if (IS_ERR_OR_NULL(iter))
- return 0;
-
- set_btree_iter_dontneed(trans, iter);
-
- return bch2_trans_iter_put(trans, iter);
-}
-
-noinline __cold
-static void btree_trans_iter_alloc_fail(struct btree_trans *trans)
-{
-
- struct btree_iter *iter;
- struct btree_insert_entry *i;
- char buf[100];
-
- btree_trans_sort_iters(trans);
-
- trans_for_each_iter_inorder(trans, iter)
- printk(KERN_ERR "iter: btree %s pos %s%s%s%s %pS\n",
- bch2_btree_ids[iter->btree_id],
- (bch2_bpos_to_text(&PBUF(buf), iter->real_pos), buf),
- btree_iter_live(trans, iter) ? " live" : "",
- (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "",
- iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "",
- (void *) iter->ip_allocated);
-
- trans_for_each_update(trans, i) {
- char buf[300];
-
- bch2_bkey_val_to_text(&PBUF(buf), trans->c, bkey_i_to_s_c(i->k));
- printk(KERN_ERR "update: btree %s %s\n",
- bch2_btree_ids[i->iter->btree_id], buf);
- }
- panic("trans iter oveflow\n");
-}
-
-static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans,
- struct btree_iter *pos)
-{
- struct btree_iter *iter;
- unsigned idx;
-
- if (unlikely(trans->iters_linked ==
- ~((~0ULL << 1) << (BTREE_ITER_MAX - 1))))
- btree_trans_iter_alloc_fail(trans);
-
- idx = __ffs64(~trans->iters_linked);
- iter = &trans->iters[idx];
-
- iter->trans = trans;
- iter->idx = idx;
- iter->child_idx = U8_MAX;
- iter->sorted_idx = U8_MAX;
- iter->flags = 0;
- iter->nodes_locked = 0;
- iter->nodes_intent_locked = 0;
- trans->iters_linked |= 1ULL << idx;
-
- btree_iter_list_add(trans, pos, iter);
- return iter;
-}
-
-static void btree_iter_copy(struct btree_iter *dst, struct btree_iter *src)
+void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
{
- unsigned i;
-
- __bch2_btree_iter_unlock(dst);
- btree_iter_child_free(dst);
-
- memcpy(&dst->flags, &src->flags,
- sizeof(struct btree_iter) - offsetof(struct btree_iter, flags));
-
- for (i = 0; i < BTREE_MAX_DEPTH; i++)
- if (btree_node_locked(dst, i))
- six_lock_increment(&dst->l[i].b->c.lock,
- __btree_lock_want(dst, i));
-
- dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
- dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT;
-
- btree_iter_check_sort(dst->trans, dst);
+ if (iter->path)
+ bch2_path_put(trans, iter->path,
+ iter->flags & BTREE_ITER_INTENT);
+ if (iter->update_path)
+ bch2_path_put(trans, iter->update_path,
+ iter->flags & BTREE_ITER_INTENT);
+ if (iter->key_cache_path)
+ bch2_path_put(trans, iter->key_cache_path,
+ iter->flags & BTREE_ITER_INTENT);
+ iter->path = NULL;
+ iter->update_path = NULL;
+ iter->key_cache_path = NULL;
}
-struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
- unsigned btree_id, struct bpos pos,
- unsigned locks_want,
- unsigned depth,
- unsigned flags)
+static void __bch2_trans_iter_init(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ unsigned locks_want,
+ unsigned depth,
+ unsigned flags,
+ unsigned long ip)
{
- struct btree_iter *iter, *best = NULL;
- struct bpos real_pos, pos_min = POS_MIN;
-
EBUG_ON(trans->restarted);
- if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES &&
- btree_node_type_is_extents(btree_id) &&
- !(flags & BTREE_ITER_NOT_EXTENTS) &&
- !(flags & BTREE_ITER_ALL_SNAPSHOTS))
+ if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) &&
+ btree_node_type_is_extents(btree_id))
flags |= BTREE_ITER_IS_EXTENTS;
- if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES &&
+ if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
!btree_type_has_snapshots(btree_id))
flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
- if (!(flags & BTREE_ITER_ALL_SNAPSHOTS))
- pos.snapshot = btree_type_has_snapshots(btree_id)
- ? U32_MAX : 0;
-
- real_pos = pos;
-
- if ((flags & BTREE_ITER_IS_EXTENTS) &&
- bkey_cmp(pos, POS_MAX))
- real_pos = bpos_nosnap_successor(pos);
-
- trans_for_each_iter(trans, iter) {
- if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE))
- continue;
-
- if (iter->btree_id != btree_id)
- continue;
-
- if (best) {
- int cmp = bkey_cmp(bpos_diff(best->real_pos, real_pos),
- bpos_diff(iter->real_pos, real_pos));
-
- if (cmp < 0 ||
- ((cmp == 0 && btree_iter_keep(trans, iter))))
- continue;
- }
-
- best = iter;
- }
-
- if (!best) {
- iter = btree_trans_iter_alloc(trans, NULL);
- bch2_btree_iter_init(trans, iter, btree_id);
- } else if (btree_iter_keep(trans, best)) {
- iter = btree_trans_iter_alloc(trans, best);
- btree_iter_copy(iter, best);
- } else {
- iter = best;
- }
-
- trans->iters_live |= 1ULL << iter->idx;
- trans->iters_touched |= 1ULL << iter->idx;
-
- iter->flags = flags;
-
- iter->snapshot = pos.snapshot;
+ if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+ btree_type_has_snapshots(btree_id))
+ flags |= BTREE_ITER_FILTER_SNAPSHOTS;
- /*
- * If the iterator has locks_want greater than requested, we explicitly
- * do not downgrade it here - on transaction restart because btree node
- * split needs to upgrade locks, we might be putting/getting the
- * iterator again. Downgrading iterators only happens via an explicit
- * bch2_trans_downgrade().
- */
-
- locks_want = min(locks_want, BTREE_MAX_DEPTH);
- if (locks_want > iter->locks_want) {
- iter->locks_want = locks_want;
- btree_iter_get_locks(iter, true, _THIS_IP_);
- }
+ if (!test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags))
+ flags |= BTREE_ITER_WITH_JOURNAL;
- while (iter->level != depth) {
- btree_node_unlock(iter, iter->level);
- iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT;
- iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
- if (iter->level < depth)
- iter->level++;
- else
- iter->level--;
- }
+ if (!btree_id_cached(trans->c, btree_id)) {
+ flags &= ~BTREE_ITER_CACHED;
+ flags &= ~BTREE_ITER_WITH_KEY_CACHE;
+ } else if (!(flags & BTREE_ITER_CACHED))
+ flags |= BTREE_ITER_WITH_KEY_CACHE;
+ iter->trans = trans;
+ iter->path = NULL;
+ iter->update_path = NULL;
+ iter->key_cache_path = NULL;
+ iter->btree_id = btree_id;
iter->min_depth = depth;
+ iter->flags = flags;
+ iter->snapshot = pos.snapshot;
+ iter->pos = pos;
+ iter->k.type = KEY_TYPE_deleted;
+ iter->k.p = pos;
+ iter->k.size = 0;
+#ifdef CONFIG_BCACHEFS_DEBUG
+ iter->ip_allocated = ip;
+#endif
- bch2_btree_iter_set_pos(iter, pos);
- btree_iter_set_search_pos(iter, real_pos);
-
- trace_trans_get_iter(_RET_IP_, trans->ip,
- btree_id,
- &real_pos, locks_want, iter->uptodate,
- best ? &best->real_pos : &pos_min,
- best ? best->locks_want : U8_MAX,
- best ? best->uptodate : U8_MAX);
-
- return iter;
+ iter->path = bch2_path_get(trans, btree_id, iter->pos,
+ locks_want, depth, flags, ip);
}
-struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans,
- enum btree_id btree_id,
- struct bpos pos,
- unsigned locks_want,
- unsigned depth,
- unsigned flags)
+void bch2_trans_iter_init(struct btree_trans *trans,
+ struct btree_iter *iter,
+ unsigned btree_id, struct bpos pos,
+ unsigned flags)
{
- struct btree_iter *iter =
- __bch2_trans_get_iter(trans, btree_id, pos,
- locks_want, depth,
- BTREE_ITER_NODES|
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_ALL_SNAPSHOTS|
- flags);
-
- BUG_ON(bkey_cmp(iter->pos, pos));
- BUG_ON(iter->locks_want != min(locks_want, BTREE_MAX_DEPTH));
- BUG_ON(iter->level != depth);
- BUG_ON(iter->min_depth != depth);
- iter->ip_allocated = _RET_IP_;
-
- return iter;
+ __bch2_trans_iter_init(trans, iter, btree_id, pos,
+ 0, 0, flags, _RET_IP_);
}
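
With this interface, iterators are plain stack objects: bch2_trans_iter_init() pairs with bch2_trans_iter_exit(), which drops the path references taken above. A minimal usage sketch (the enclosing transaction and inum are assumed; error handling elided):

	struct btree_iter iter;
	struct bkey_s_c k;

	bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
			     POS(inum, 0), BTREE_ITER_INTENT);

	k = bch2_btree_iter_peek(&iter);
	if (!bkey_err(k) && k.k) {
		/* use k; iter.path holds the locks */
	}

	bch2_trans_iter_exit(&trans, &iter);
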
-struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans,
- struct btree_iter *src)
+void bch2_trans_node_iter_init(struct btree_trans *trans,
+ struct btree_iter *iter,
+ enum btree_id btree_id,
+ struct bpos pos,
+ unsigned locks_want,
+ unsigned depth,
+ unsigned flags)
{
- struct btree_iter *iter;
-
- iter = btree_trans_iter_alloc(trans, src);
- btree_iter_copy(iter, src);
-
- trans->iters_live |= 1ULL << iter->idx;
- /*
- * We don't need to preserve this iter since it's cheap to copy it
- * again - this will cause trans_iter_put() to free it right away:
- */
- set_btree_iter_dontneed(trans, iter);
+ __bch2_trans_iter_init(trans, iter, btree_id, pos, locks_want, depth,
+ BTREE_ITER_NOT_EXTENTS|
+ __BTREE_ITER_ALL_SNAPSHOTS|
+ BTREE_ITER_ALL_SNAPSHOTS|
+ flags, _RET_IP_);
+ BUG_ON(iter->path->locks_want < min(locks_want, BTREE_MAX_DEPTH));
+ BUG_ON(iter->path->level != depth);
+ BUG_ON(iter->min_depth != depth);
+}
- return iter;
+void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
+{
+ *dst = *src;
+ if (src->path)
+ __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT);
+ if (src->update_path)
+ __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT);
+ dst->key_cache_path = NULL;
}
void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
trans->mem_bytes = new_bytes;
if (old_bytes) {
- trace_trans_restart_mem_realloced(trans->ip, _RET_IP_, new_bytes);
+ trace_trans_restart_mem_realloced(trans->fn, _RET_IP_, new_bytes);
btree_trans_restart(trans);
return ERR_PTR(-EINTR);
}
return p;
}
-inline void bch2_trans_unlink_iters(struct btree_trans *trans)
-{
- u64 iters = trans->iters_linked &
- ~trans->iters_touched &
- ~trans->iters_live;
-
- while (iters) {
- unsigned idx = __ffs64(iters);
-
- iters &= ~(1ULL << idx);
- __bch2_trans_iter_free(trans, idx);
- }
-}
-
/**
* bch2_trans_begin() - reset a transaction after an interrupted attempt
* @trans: transaction to reset
*/
void bch2_trans_begin(struct btree_trans *trans)
{
- struct btree_iter *iter;
-
- trans_for_each_iter(trans, iter)
- iter->flags &= ~(BTREE_ITER_KEEP_UNTIL_COMMIT|
- BTREE_ITER_SET_POS_AFTER_COMMIT);
+ struct btree_insert_entry *i;
+ struct btree_path *path;
- /*
- * XXX: we shouldn't be doing this if the transaction was restarted, but
- * currently we still overflow transaction iterators if we do that
- * */
- bch2_trans_unlink_iters(trans);
- trans->iters_touched &= trans->iters_live;
+ trans_for_each_update(trans, i)
+ __btree_path_put(i->path, true);
+ memset(&trans->journal_res, 0, sizeof(trans->journal_res));
trans->extra_journal_res = 0;
trans->nr_updates = 0;
trans->mem_top = 0;
(void *) &trans->fs_usage_deltas->memset_start);
}
+ trans_for_each_path(trans, path) {
+ path->should_be_locked = false;
+
+ /*
+ * XXX: we probably shouldn't be doing this if the transaction
+ * was restarted, but currently we still overflow transaction
+ * iterators if we do that
+ */
+ if (!path->ref && !path->preserve)
+ __bch2_path_free(trans, path);
+ else if (!path->ref)
+ path->preserve = false;
+ }
+
bch2_trans_cond_resched(trans);
if (trans->restarted)
- bch2_btree_iter_traverse_all(trans);
+ bch2_btree_path_traverse_all(trans);
trans->restarted = false;
}
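
The retry contract is unchanged by the renames: when anything inside the transaction returns -EINTR, the whole transaction body must be retried from the top, with bch2_trans_begin() resetting update/path state between attempts. A minimal retry loop, where do_work() is a placeholder for the caller's transaction body:

	int ret;

	do {
		bch2_trans_begin(&trans);
		ret = do_work(&trans);	/* placeholder: the actual txn body */
	} while (ret == -EINTR);
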
-static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c)
+static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c)
{
- size_t iters_bytes = sizeof(struct btree_iter) * BTREE_ITER_MAX;
+ size_t paths_bytes = sizeof(struct btree_path) * BTREE_ITER_MAX;
size_t updates_bytes = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX;
- size_t sorted_bytes = sizeof(u8) * BTREE_ITER_MAX;
void *p = NULL;
BUG_ON(trans->used_mempool);
#ifdef __KERNEL__
- p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL);
+ p = this_cpu_xchg(c->btree_paths_bufs->path, NULL);
#endif
if (!p)
- p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
+ p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS);
- trans->iters = p; p += iters_bytes;
+ trans->paths = p; p += paths_bytes;
trans->updates = p; p += updates_bytes;
- trans->sorted = p; p += sorted_bytes;
}
-void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
- unsigned expected_nr_iters,
- size_t expected_mem_bytes)
+void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
+ unsigned expected_nr_iters,
+ size_t expected_mem_bytes,
+ const char *fn)
__acquires(&c->btree_trans_barrier)
{
+ BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
+
memset(trans, 0, sizeof(*trans));
trans->c = c;
- trans->ip = _RET_IP_;
+ trans->fn = fn;
- /*
- * reallocating iterators currently completely breaks
- * bch2_trans_iter_put(), we always allocate the max:
- */
- bch2_trans_alloc_iters(trans, c);
+ bch2_trans_alloc_paths(trans, c);
if (expected_mem_bytes) {
trans->mem_bytes = roundup_pow_of_two(expected_mem_bytes);
trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
-#ifdef CONFIG_BCACHEFS_DEBUG
trans->pid = current->pid;
mutex_lock(&c->btree_trans_lock);
list_add(&trans->list, &c->btree_trans_list);
mutex_unlock(&c->btree_trans_lock);
+}
+
+static void check_btree_paths_leaked(struct btree_trans *trans)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ struct bch_fs *c = trans->c;
+ struct btree_path *path;
+
+ trans_for_each_path(trans, path)
+ if (path->ref)
+ goto leaked;
+ return;
+leaked:
+ bch_err(c, "btree paths leaked from %s!", trans->fn);
+ trans_for_each_path(trans, path)
+ if (path->ref)
+ printk(KERN_ERR " btree %s %pS\n",
+ bch2_btree_ids[path->btree_id],
+ (void *) path->ip_allocated);
+ /* Be noisy about this: */
+ bch2_fatal_error(c);
#endif
}
-int bch2_trans_exit(struct btree_trans *trans)
+void bch2_trans_exit(struct btree_trans *trans)
__releases(&c->btree_trans_barrier)
{
+ struct btree_insert_entry *i;
struct bch_fs *c = trans->c;
bch2_trans_unlock(trans);
-#ifdef CONFIG_BCACHEFS_DEBUG
- if (trans->iters_live) {
- struct btree_iter *iter;
-
- trans_for_each_iter(trans, iter)
- btree_iter_child_free(iter);
- }
-
- if (trans->iters_live) {
- struct btree_iter *iter;
+ trans_for_each_update(trans, i)
+ __btree_path_put(i->path, true);
+ trans->nr_updates = 0;
- bch_err(c, "btree iterators leaked!");
- trans_for_each_iter(trans, iter)
- if (btree_iter_live(trans, iter))
- printk(KERN_ERR " btree %s allocated at %pS\n",
- bch2_btree_ids[iter->btree_id],
- (void *) iter->ip_allocated);
- /* Be noisy about this: */
- bch2_fatal_error(c);
- }
+ check_btree_paths_leaked(trans);
- mutex_lock(&trans->c->btree_trans_lock);
+ mutex_lock(&c->btree_trans_lock);
list_del(&trans->list);
- mutex_unlock(&trans->c->btree_trans_lock);
-#endif
+ mutex_unlock(&c->btree_trans_lock);
srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
- bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres);
+ bch2_journal_preres_put(&c->journal, &trans->journal_preres);
if (trans->fs_usage_deltas) {
if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) ==
REPLICAS_DELTA_LIST_MAX)
mempool_free(trans->fs_usage_deltas,
- &trans->c->replicas_delta_pool);
+ &c->replicas_delta_pool);
else
kfree(trans->fs_usage_deltas);
}
if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
- mempool_free(trans->mem, &trans->c->btree_trans_mem_pool);
+ mempool_free(trans->mem, &c->btree_trans_mem_pool);
else
kfree(trans->mem);
/*
* Userspace doesn't have a real percpu implementation:
*/
- trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters);
+ trans->paths = this_cpu_xchg(c->btree_paths_bufs->path, trans->paths);
#endif
- if (trans->iters)
- mempool_free(trans->iters, &trans->c->btree_iters_pool);
+ if (trans->paths)
+ mempool_free(trans->paths, &c->btree_paths_pool);
trans->mem = (void *) 0x1;
- trans->iters = (void *) 0x1;
-
- return trans->error ? -EIO : 0;
+ trans->paths = (void *) 0x1;
}
static void __maybe_unused
-bch2_btree_iter_node_to_text(struct printbuf *out,
+bch2_btree_path_node_to_text(struct printbuf *out,
struct btree_bkey_cached_common *_b,
- enum btree_iter_type type)
+ bool cached)
{
pr_buf(out, " l=%u %s:",
_b->level, bch2_btree_ids[_b->btree_id]);
- bch2_bpos_to_text(out, btree_node_pos(_b, type));
+ bch2_bpos_to_text(out, btree_node_pos(_b, cached));
}
-#ifdef CONFIG_BCACHEFS_DEBUG
-static bool trans_has_btree_nodes_locked(struct btree_trans *trans)
+static bool trans_has_locks(struct btree_trans *trans)
{
- struct btree_iter *iter;
+ struct btree_path *path;
- trans_for_each_iter(trans, iter)
- if (btree_iter_type(iter) != BTREE_ITER_CACHED &&
- iter->nodes_locked)
+ trans_for_each_path(trans, path)
+ if (path->nodes_locked)
return true;
return false;
}
-#endif
void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
{
-#ifdef CONFIG_BCACHEFS_DEBUG
struct btree_trans *trans;
- struct btree_iter *iter;
+ struct btree_path *path;
struct btree *b;
+ static char lock_types[] = { 'r', 'i', 'w' };
unsigned l;
mutex_lock(&c->btree_trans_lock);
list_for_each_entry(trans, &c->btree_trans_list, list) {
- if (!trans_has_btree_nodes_locked(trans))
+ if (!trans_has_locks(trans))
continue;
- pr_buf(out, "%i %ps\n", trans->pid, (void *) trans->ip);
+ pr_buf(out, "%i %s\n", trans->pid, trans->fn);
- trans_for_each_iter(trans, iter) {
- if (!iter->nodes_locked)
+ trans_for_each_path(trans, path) {
+ if (!path->nodes_locked)
continue;
- pr_buf(out, " iter %u %c %s:",
- iter->idx,
- btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b',
- bch2_btree_ids[iter->btree_id]);
- bch2_bpos_to_text(out, iter->pos);
+ pr_buf(out, " path %u %c l=%u %s:",
+ path->idx,
+ path->cached ? 'c' : 'b',
+ path->level,
+ bch2_btree_ids[path->btree_id]);
+ bch2_bpos_to_text(out, path->pos);
pr_buf(out, "\n");
for (l = 0; l < BTREE_MAX_DEPTH; l++) {
- if (btree_node_locked(iter, l)) {
+ if (btree_node_locked(path, l)) {
pr_buf(out, " %s l=%u ",
- btree_node_intent_locked(iter, l) ? "i" : "r", l);
- bch2_btree_iter_node_to_text(out,
- (void *) iter->l[l].b,
- btree_iter_type(iter));
+ btree_node_intent_locked(path, l) ? "i" : "r", l);
+ bch2_btree_path_node_to_text(out,
+ (void *) path->l[l].b,
+ path->cached);
pr_buf(out, "\n");
}
}
b = READ_ONCE(trans->locking);
if (b) {
- iter = &trans->iters[trans->locking_iter_idx];
- pr_buf(out, " locking iter %u %c l=%u %s:",
- trans->locking_iter_idx,
- btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b',
+ path = &trans->paths[trans->locking_path_idx];
+ pr_buf(out, " locking path %u %c l=%u %c %s:",
+ trans->locking_path_idx,
+ path->cached ? 'c' : 'b',
trans->locking_level,
+ lock_types[trans->locking_lock_type],
bch2_btree_ids[trans->locking_btree_id]);
bch2_bpos_to_text(out, trans->locking_pos);
pr_buf(out, " node ");
- bch2_btree_iter_node_to_text(out,
- (void *) b,
- btree_iter_type(iter));
+ bch2_btree_path_node_to_text(out,
+ (void *) b, path->cached);
pr_buf(out, "\n");
}
}
mutex_unlock(&c->btree_trans_lock);
-#endif
}
void bch2_fs_btree_iter_exit(struct bch_fs *c)
{
+ if (c->btree_trans_barrier_initialized)
+ cleanup_srcu_struct(&c->btree_trans_barrier);
mempool_exit(&c->btree_trans_mem_pool);
- mempool_exit(&c->btree_iters_pool);
- cleanup_srcu_struct(&c->btree_trans_barrier);
+ mempool_exit(&c->btree_paths_pool);
}
int bch2_fs_btree_iter_init(struct bch_fs *c)
{
unsigned nr = BTREE_ITER_MAX;
+ int ret;
INIT_LIST_HEAD(&c->btree_trans_list);
mutex_init(&c->btree_trans_lock);
- return init_srcu_struct(&c->btree_trans_barrier) ?:
- mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
- sizeof(u8) * nr +
- sizeof(struct btree_iter) * nr +
+ ret = mempool_init_kmalloc_pool(&c->btree_paths_pool, 1,
+ sizeof(struct btree_path) * nr +
sizeof(struct btree_insert_entry) * nr) ?:
mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
- BTREE_TRANS_MEM_MAX);
+ BTREE_TRANS_MEM_MAX) ?:
+ init_srcu_struct(&c->btree_trans_barrier);
+ if (!ret)
+ c->btree_trans_barrier_initialized = true;
+ return ret;
}
#include "bset.h"
#include "btree_types.h"
-static inline void btree_iter_set_dirty(struct btree_iter *iter,
- enum btree_iter_uptodate u)
+static inline void __btree_path_get(struct btree_path *path, bool intent)
{
- iter->uptodate = max_t(unsigned, iter->uptodate, u);
+ path->ref++;
+ path->intent_ref += intent;
}
-static inline struct btree *btree_iter_node(struct btree_iter *iter,
- unsigned level)
+static inline bool __btree_path_put(struct btree_path *path, bool intent)
{
- return level < BTREE_MAX_DEPTH ? iter->l[level].b : NULL;
+ EBUG_ON(!path->ref);
+ EBUG_ON(!path->intent_ref && intent);
+ path->intent_ref -= intent;
+ return --path->ref == 0;
}
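
Paths are shared, with intent references counted separately from plain ones so a path knows whether any holder still needs intent locks; gets and puts must pair with the same intent flag. Sketched pairing (the free helper is the static one used by bch2_trans_begin() above):

	__btree_path_get(path, true);		/* ref++, intent_ref++ */
	/* ... */
	if (__btree_path_put(path, true))	/* true: that was the last ref */
		__bch2_path_free(trans, path);
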
-static inline bool btree_node_lock_seq_matches(const struct btree_iter *iter,
- const struct btree *b, unsigned level)
+static inline void btree_path_set_dirty(struct btree_path *path,
+ enum btree_path_uptodate u)
{
- /*
- * We don't compare the low bits of the lock sequence numbers because
- * @iter might have taken a write lock on @b, and we don't want to skip
- * the linked iterator if the sequence numbers were equal before taking
- * that write lock. The lock sequence number is incremented by taking
- * and releasing write locks and is even when unlocked:
- */
- return iter->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1;
+ path->uptodate = max_t(unsigned, path->uptodate, u);
}
-static inline struct btree *btree_node_parent(struct btree_iter *iter,
- struct btree *b)
+static inline struct btree *btree_path_node(struct btree_path *path,
+ unsigned level)
{
- return btree_iter_node(iter, b->c.level + 1);
+ return level < BTREE_MAX_DEPTH ? path->l[level].b : NULL;
}
-static inline bool btree_trans_has_multiple_iters(const struct btree_trans *trans)
+static inline bool btree_node_lock_seq_matches(const struct btree_path *path,
+ const struct btree *b, unsigned level)
{
- return hweight64(trans->iters_linked) > 1;
+ /*
+ * We don't compare the low bits of the lock sequence numbers because
+ * @path might have taken a write lock on @b, and we don't want to skip
+ * the linked path if the sequence numbers were equal before taking that
+ * write lock. The lock sequence number is incremented by taking and
+ * releasing write locks and is even when unlocked:
+ */
+ return path->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1;
}
-static inline int btree_iter_err(const struct btree_iter *iter)
+static inline struct btree *btree_node_parent(struct btree_path *path,
+ struct btree *b)
{
- return iter->flags & BTREE_ITER_ERROR ? -EIO : 0;
+ return btree_path_node(path, b->c.level + 1);
}
-/* Iterate over iters within a transaction: */
+/* Iterate over paths within a transaction: */
-static inline struct btree_iter *
-__trans_next_iter(struct btree_trans *trans, unsigned idx)
+static inline struct btree_path *
+__trans_next_path(struct btree_trans *trans, unsigned idx)
{
u64 l;
if (idx == BTREE_ITER_MAX)
return NULL;
- l = trans->iters_linked >> idx;
+ l = trans->paths_allocated >> idx;
if (!l)
return NULL;
idx += __ffs64(l);
EBUG_ON(idx >= BTREE_ITER_MAX);
- EBUG_ON(trans->iters[idx].idx != idx);
- return &trans->iters[idx];
+ EBUG_ON(trans->paths[idx].idx != idx);
+ return &trans->paths[idx];
}
-#define trans_for_each_iter(_trans, _iter) \
- for (_iter = __trans_next_iter((_trans), 0); \
- (_iter); \
- _iter = __trans_next_iter((_trans), (_iter)->idx + 1))
+#define trans_for_each_path(_trans, _path) \
+ for (_path = __trans_next_path((_trans), 0); \
+ (_path); \
+ _path = __trans_next_path((_trans), (_path)->idx + 1))
-static inline struct btree_iter *next_btree_iter(struct btree_trans *trans, struct btree_iter *iter)
+static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path)
{
- unsigned idx = iter ? iter->sorted_idx + 1 : 0;
+ unsigned idx = path ? path->sorted_idx + 1 : 0;
EBUG_ON(idx > trans->nr_sorted);
return idx < trans->nr_sorted
- ? trans->iters + trans->sorted[idx]
+ ? trans->paths + trans->sorted[idx]
: NULL;
}
-static inline struct btree_iter *prev_btree_iter(struct btree_trans *trans, struct btree_iter *iter)
+static inline struct btree_path *prev_btree_path(struct btree_trans *trans, struct btree_path *path)
{
- EBUG_ON(iter->sorted_idx >= trans->nr_sorted);
- return iter->sorted_idx
- ? trans->iters + trans->sorted[iter->sorted_idx - 1]
+ EBUG_ON(path->sorted_idx >= trans->nr_sorted);
+ return path->sorted_idx
+ ? trans->paths + trans->sorted[path->sorted_idx - 1]
: NULL;
}
-#define trans_for_each_iter_inorder(_trans, _iter) \
- for (_iter = next_btree_iter(trans, NULL); \
- (_iter); \
- _iter = next_btree_iter((_trans), (_iter)))
+#define trans_for_each_path_inorder(_trans, _path, _i) \
+ for (_i = 0; \
+ ((_path) = (_trans)->paths + (_trans)->sorted[_i]), (_i) < (_trans)->nr_sorted;\
+ _i++)
-static inline bool __iter_has_node(const struct btree_iter *iter,
+static inline bool __path_has_node(const struct btree_path *path,
const struct btree *b)
{
- return iter->l[b->c.level].b == b &&
- btree_node_lock_seq_matches(iter, b, b->c.level);
+ return path->l[b->c.level].b == b &&
+ btree_node_lock_seq_matches(path, b, b->c.level);
}
-static inline struct btree_iter *
-__trans_next_iter_with_node(struct btree_trans *trans, struct btree *b,
+static inline struct btree_path *
+__trans_next_path_with_node(struct btree_trans *trans, struct btree *b,
unsigned idx)
{
- struct btree_iter *iter = __trans_next_iter(trans, idx);
+ struct btree_path *path = __trans_next_path(trans, idx);
- while (iter && !__iter_has_node(iter, b))
- iter = __trans_next_iter(trans, iter->idx + 1);
+ while (path && !__path_has_node(path, b))
+ path = __trans_next_path(trans, path->idx + 1);
- return iter;
+ return path;
}
-#define trans_for_each_iter_with_node(_trans, _b, _iter) \
- for (_iter = __trans_next_iter_with_node((_trans), (_b), 0); \
- (_iter); \
- _iter = __trans_next_iter_with_node((_trans), (_b), \
- (_iter)->idx + 1))
+#define trans_for_each_path_with_node(_trans, _b, _path) \
+ for (_path = __trans_next_path_with_node((_trans), (_b), 0); \
+ (_path); \
+ _path = __trans_next_path_with_node((_trans), (_b), \
+ (_path)->idx + 1))
+
+struct btree_path * __must_check
+bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *,
+ bool, unsigned long);
+struct btree_path * __must_check
+bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *,
+ struct bpos, bool, unsigned long);
+int __must_check bch2_btree_path_traverse(struct btree_trans *,
+ struct btree_path *, unsigned);
+struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos,
+ unsigned, unsigned, unsigned, unsigned long);
+inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *);
#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_btree_trans_verify_iters(struct btree_trans *, struct btree *);
-void bch2_btree_trans_verify_locks(struct btree_trans *);
+void bch2_trans_verify_paths(struct btree_trans *);
+void bch2_trans_verify_locks(struct btree_trans *);
+void bch2_assert_pos_locked(struct btree_trans *, enum btree_id,
+ struct bpos, bool);
#else
-static inline void bch2_btree_trans_verify_iters(struct btree_trans *trans,
- struct btree *b) {}
-static inline void bch2_btree_trans_verify_locks(struct btree_trans *iter) {}
+static inline void bch2_trans_verify_paths(struct btree_trans *trans) {}
+static inline void bch2_trans_verify_locks(struct btree_trans *trans) {}
+static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
+ struct bpos pos, bool key_cache) {}
#endif
-void bch2_btree_iter_fix_key_modified(struct btree_iter *, struct btree *,
- struct bkey_packed *);
-void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *,
- struct btree_node_iter *, struct bkey_packed *,
- unsigned, unsigned);
+void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
+ struct btree *, struct bkey_packed *);
+void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_path *,
+ struct btree *, struct btree_node_iter *,
+ struct bkey_packed *, unsigned, unsigned);
-bool bch2_btree_iter_relock_intent(struct btree_iter *);
-bool bch2_btree_iter_relock(struct btree_iter *, unsigned long);
+bool bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *);
+
+void bch2_path_put(struct btree_trans *, struct btree_path *, bool);
bool bch2_trans_relock(struct btree_trans *);
void bch2_trans_unlock(struct btree_trans *);
return -EINTR;
}
-bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned);
+bool bch2_btree_node_upgrade(struct btree_trans *,
+ struct btree_path *, unsigned);
+
+bool __bch2_btree_path_upgrade(struct btree_trans *,
+ struct btree_path *, unsigned);
-static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter,
+static inline bool bch2_btree_path_upgrade(struct btree_trans *trans,
+ struct btree_path *path,
unsigned new_locks_want)
{
new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
- return iter->locks_want < new_locks_want
- ? __bch2_btree_iter_upgrade(iter, new_locks_want)
- : iter->uptodate <= BTREE_ITER_NEED_PEEK;
+ return path->locks_want < new_locks_want
+ ? __bch2_btree_path_upgrade(trans, path, new_locks_want)
+ : path->uptodate == BTREE_ITER_UPTODATE;
}
-void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned);
+void __bch2_btree_path_downgrade(struct btree_path *, unsigned);
-static inline void bch2_btree_iter_downgrade(struct btree_iter *iter)
+static inline void bch2_btree_path_downgrade(struct btree_path *path)
{
- unsigned new_locks_want = iter->level + !!(iter->flags & BTREE_ITER_INTENT);
+ unsigned new_locks_want = path->level + !!path->intent_ref;
- if (iter->locks_want > new_locks_want)
- __bch2_btree_iter_downgrade(iter, new_locks_want);
+ if (path->locks_want > new_locks_want)
+ __bch2_btree_path_downgrade(path, new_locks_want);
}
void bch2_trans_downgrade(struct btree_trans *);
-void bch2_btree_iter_node_replace(struct btree_iter *, struct btree *);
-void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *);
-
-void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *);
+void bch2_trans_node_add(struct btree_trans *trans, struct btree *);
+void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *);
+int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter);
int __must_check bch2_btree_iter_traverse(struct btree_iter *);
struct btree *bch2_btree_iter_peek_node(struct btree_iter *);
bool bch2_btree_iter_advance(struct btree_iter *);
bool bch2_btree_iter_rewind(struct btree_iter *);
-static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
+static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
{
- if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
- new_pos.snapshot = iter->snapshot;
-
iter->k.type = KEY_TYPE_deleted;
iter->k.p.inode = iter->pos.inode = new_pos.inode;
iter->k.p.offset = iter->pos.offset = new_pos.offset;
iter->k.p.snapshot = iter->pos.snapshot = new_pos.snapshot;
iter->k.size = 0;
- iter->should_be_locked = false;
+}
+
+static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
+{
+ if (unlikely(iter->update_path))
+ bch2_path_put(iter->trans, iter->update_path,
+ iter->flags & BTREE_ITER_INTENT);
+ iter->update_path = NULL;
+
+ if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
+ new_pos.snapshot = iter->snapshot;
+
+ __bch2_btree_iter_set_pos(iter, new_pos);
}
static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter)
iter->pos = bkey_start_pos(&iter->k);
}
-static inline struct btree_iter *idx_to_btree_iter(struct btree_trans *trans, unsigned idx)
+static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 snapshot)
{
- return idx != U8_MAX ? trans->iters + idx : NULL;
+ struct bpos pos = iter->pos;
+
+ iter->snapshot = snapshot;
+ pos.snapshot = snapshot;
+ bch2_btree_iter_set_pos(iter, pos);
}
-static inline struct btree_iter *btree_iter_child(struct btree_iter *iter)
+void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *);
+void bch2_trans_iter_init(struct btree_trans *, struct btree_iter *,
+ unsigned, struct bpos, unsigned);
+void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *,
+ enum btree_id, struct bpos,
+ unsigned, unsigned, unsigned);
+void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *);
+
+static inline void set_btree_iter_dontneed(struct btree_iter *iter)
{
- return idx_to_btree_iter(iter->trans, iter->child_idx);
+ iter->path->preserve = false;
}
-/*
- * Unlocks before scheduling
- * Note: does not revalidate iterator
- */
-static inline int bch2_trans_cond_resched(struct btree_trans *trans)
+void *bch2_trans_kmalloc(struct btree_trans *, size_t);
+void bch2_trans_begin(struct btree_trans *);
+
+static inline struct btree *
+__btree_iter_peek_node_and_restart(struct btree_trans *trans, struct btree_iter *iter)
{
- if (need_resched() || race_fault()) {
- bch2_trans_unlock(trans);
- schedule();
- return bch2_trans_relock(trans) ? 0 : -EINTR;
- } else {
- return 0;
- }
+ struct btree *b;
+
+ while (b = bch2_btree_iter_peek_node(iter),
+ PTR_ERR_OR_ZERO(b) == -EINTR)
+ bch2_trans_begin(trans);
+
+ return b;
}
-#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \
- _locks_want, _depth, _flags, _b) \
- for (iter = bch2_trans_get_node_iter((_trans), (_btree_id), \
- _start, _locks_want, _depth, _flags), \
- _b = bch2_btree_iter_peek_node(_iter); \
- (_b); \
- (_b) = bch2_btree_iter_next_node(_iter))
+#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \
+ _locks_want, _depth, _flags, _b, _ret) \
+ for (bch2_trans_node_iter_init((_trans), &(_iter), (_btree_id), \
+ _start, _locks_want, _depth, _flags); \
+ (_b) = __btree_iter_peek_node_and_restart((_trans), &(_iter)),\
+ !((_ret) = PTR_ERR_OR_ZERO(_b)) && (_b); \
+ (_b) = bch2_btree_iter_next_node(&(_iter)))
#define for_each_btree_node(_trans, _iter, _btree_id, _start, \
- _flags, _b) \
+ _flags, _b, _ret) \
__for_each_btree_node(_trans, _iter, _btree_id, _start, \
- 0, 0, _flags, _b)
+ 0, 0, _flags, _b, _ret)
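
Node iteration follows the same pattern: the macro now takes the iterator and an explicit error out-parameter, and handles -EINTR restarts through __btree_iter_peek_node_and_restart(). A usage sketch (btree id chosen arbitrarily):

	struct btree_iter iter;
	struct btree *b;
	int ret;

	for_each_btree_node(&trans, iter, BTREE_ID_extents, POS_MIN, 0, b, ret) {
		/* b visits each leaf node in order */
	}
	bch2_trans_iter_exit(&trans, &iter);
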
+
+static inline int bkey_err(struct bkey_s_c k)
+{
+ return PTR_ERR_OR_ZERO(k.k);
+}
-static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter,
+static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter,
unsigned flags)
{
return flags & BTREE_ITER_SLOTS
: bch2_btree_iter_peek(iter);
}
-static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter,
- unsigned flags)
+static inline int btree_trans_too_many_iters(struct btree_trans *trans)
{
- return flags & BTREE_ITER_SLOTS
- ? bch2_btree_iter_next_slot(iter)
- : bch2_btree_iter_next(iter);
+ return hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2
+ ? -EINTR : 0;
}
-static inline int bkey_err(struct bkey_s_c k)
+static inline struct bkey_s_c
+__bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
+ struct btree_iter *iter, unsigned flags)
{
- return PTR_ERR_OR_ZERO(k.k);
+ struct bkey_s_c k;
+
+ while (btree_trans_too_many_iters(trans) ||
+ (k = bch2_btree_iter_peek_type(iter, flags),
+ bkey_err(k) == -EINTR))
+ bch2_trans_begin(trans);
+
+ return k;
}
#define for_each_btree_key(_trans, _iter, _btree_id, \
_start, _flags, _k, _ret) \
- for ((_iter) = bch2_trans_get_iter((_trans), (_btree_id), \
- (_start), (_flags)), \
- (_k) = __bch2_btree_iter_peek(_iter, _flags); \
+ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\
!((_ret) = bkey_err(_k)) && (_k).k; \
- (_k) = __bch2_btree_iter_next(_iter, _flags))
+ bch2_btree_iter_advance(&(_iter)))
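
for_each_btree_key() now initializes the iterator itself and, via __bch2_btree_iter_peek_and_restart(), retries -EINTR from the lookup with bch2_trans_begin(); the caller's only remaining obligation is to exit the iterator afterwards. Usage sketch:

	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, 0, k, ret) {
		/* each visible key in turn */
	}
	bch2_trans_iter_exit(&trans, &iter);
	/* ret holds any terminal error */
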
-#define for_each_btree_key_continue(_iter, _flags, _k, _ret) \
- for ((_k) = __bch2_btree_iter_peek(_iter, _flags); \
+#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \
+ _start, _flags, _k, _ret) \
+ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \
!((_ret) = bkey_err(_k)) && (_k).k; \
- (_k) = __bch2_btree_iter_next(_iter, _flags))
-
-/* new multiple iterator interface: */
-
-int bch2_trans_iter_put(struct btree_trans *, struct btree_iter *);
-int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *);
+ bch2_btree_iter_advance(&(_iter)))
-void bch2_trans_unlink_iters(struct btree_trans *);
-
-struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id,
- struct bpos, unsigned,
- unsigned, unsigned);
-
-static inline struct btree_iter *
-bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id,
- struct bpos pos, unsigned flags)
-{
- struct btree_iter *iter =
- __bch2_trans_get_iter(trans, btree_id, pos,
- (flags & BTREE_ITER_INTENT) != 0, 0,
- flags);
- iter->ip_allocated = _THIS_IP_;
- return iter;
-}
-
-struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *,
- struct btree_iter *);
-static inline struct btree_iter *
-bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src)
-{
- struct btree_iter *iter =
- __bch2_trans_copy_iter(trans, src);
-
- iter->ip_allocated = _THIS_IP_;
- return iter;
-}
-
-struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *,
- enum btree_id, struct bpos,
- unsigned, unsigned, unsigned);
-
-static inline bool btree_iter_live(struct btree_trans *trans, struct btree_iter *iter)
-{
- return (trans->iters_live & (1ULL << iter->idx)) != 0;
-}
+#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _ret) \
+ for (; \
+ (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\
+ !((_ret) = bkey_err(_k)) && (_k).k; \
+ bch2_btree_iter_advance(&(_iter)))
-static inline bool btree_iter_keep(struct btree_trans *trans, struct btree_iter *iter)
-{
- return btree_iter_live(trans, iter) ||
- (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT);
-}
+#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \
+ for (; \
+ (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \
+ !((_ret) = bkey_err(_k)) && (_k).k; \
+ bch2_btree_iter_advance(&(_iter)))
-static inline void set_btree_iter_dontneed(struct btree_trans *trans, struct btree_iter *iter)
-{
- trans->iters_touched &= ~(1ULL << iter->idx);
-}
+/* new multiple iterator interface: */
-void bch2_trans_begin(struct btree_trans *);
+void bch2_dump_trans_paths_updates(struct btree_trans *);
+void __bch2_trans_init(struct btree_trans *, struct bch_fs *,
+ unsigned, size_t, const char *);
+void bch2_trans_exit(struct btree_trans *);
-void *bch2_trans_kmalloc(struct btree_trans *, size_t);
-void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t);
-int bch2_trans_exit(struct btree_trans *);
+#define bch2_trans_init(...) __bch2_trans_init(__VA_ARGS__, __func__)
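
The variadic wrapper appends __func__, which is how trans->fn gets the caller's name for the "btree paths leaked from %s!" report and the restart tracepoints above. Typical call site:

	struct btree_trans trans;

	bch2_trans_init(&trans, c, 0, 0);	/* passes __func__ as trans->fn */
	/* ... */
	bch2_trans_exit(&trans);
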
void bch2_btree_trans_to_text(struct printbuf *, struct bch_fs *);
}
static struct bkey_cached *
-btree_key_cache_create(struct btree_key_cache *c,
+btree_key_cache_create(struct bch_fs *c,
enum btree_id btree_id,
struct bpos pos)
{
+ struct btree_key_cache *bc = &c->btree_key_cache;
struct bkey_cached *ck;
bool was_new = true;
- ck = bkey_cached_alloc(c);
+ ck = bkey_cached_alloc(bc);
if (unlikely(!ck)) {
- ck = bkey_cached_reuse(c);
- if (unlikely(!ck))
+ ck = bkey_cached_reuse(bc);
+ if (unlikely(!ck)) {
+ bch_err(c, "error allocating memory for key cache item, btree %s",
+ bch2_btree_ids[btree_id]);
return ERR_PTR(-ENOMEM);
+ }
was_new = false;
}
+ if (btree_id == BTREE_ID_subvolumes)
+ six_lock_pcpu_alloc(&ck->c.lock);
+ else
+ six_lock_pcpu_free(&ck->c.lock);
+
ck->c.level = 0;
ck->c.btree_id = btree_id;
ck->key.btree_id = btree_id;
ck->valid = false;
ck->flags = 1U << BKEY_CACHED_ACCESSED;
- if (unlikely(rhashtable_lookup_insert_fast(&c->table,
+ if (unlikely(rhashtable_lookup_insert_fast(&bc->table,
&ck->hash,
bch2_btree_key_cache_params))) {
/* We raced with another fill: */
six_unlock_intent(&ck->c.lock);
kfree(ck);
} else {
- mutex_lock(&c->lock);
- bkey_cached_free(c, ck);
- mutex_unlock(&c->lock);
+ mutex_lock(&bc->lock);
+ bkey_cached_free(bc, ck);
+ mutex_unlock(&bc->lock);
}
return NULL;
}
- atomic_long_inc(&c->nr_keys);
+ atomic_long_inc(&bc->nr_keys);
six_unlock_write(&ck->c.lock);
}
static int btree_key_cache_fill(struct btree_trans *trans,
- struct btree_iter *ck_iter,
+ struct btree_path *ck_path,
struct bkey_cached *ck)
{
- struct btree_iter *iter;
+ struct btree_path *path;
struct bkey_s_c k;
unsigned new_u64s = 0;
struct bkey_i *new_k = NULL;
+ struct bkey u;
int ret;
- iter = bch2_trans_get_iter(trans, ck->key.btree_id,
- ck->key.pos, BTREE_ITER_SLOTS);
- k = bch2_btree_iter_peek_slot(iter);
- ret = bkey_err(k);
+ path = bch2_path_get(trans, ck->key.btree_id,
+ ck->key.pos, 0, 0, 0, _THIS_IP_);
+ ret = bch2_btree_path_traverse(trans, path, 0);
if (ret)
goto err;
- if (!bch2_btree_node_relock(ck_iter, 0)) {
- trace_transaction_restart_ip(trans->ip, _THIS_IP_);
+ k = bch2_btree_path_peek_slot(path, &u);
+
+ if (!bch2_btree_node_relock(trans, ck_path, 0)) {
+ trace_trans_restart_relock_key_cache_fill(trans->fn,
+ _THIS_IP_, ck_path->btree_id, &ck_path->pos);
ret = btree_trans_restart(trans);
goto err;
}
new_u64s = roundup_pow_of_two(new_u64s);
new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS);
if (!new_k) {
+ bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
+ bch2_btree_ids[ck->key.btree_id], new_u64s);
ret = -ENOMEM;
goto err;
}
* XXX: not allowed to be holding read locks when we take a write lock,
* currently
*/
- bch2_btree_node_lock_write(ck_iter->l[0].b, ck_iter);
+ bch2_btree_node_lock_write(trans, ck_path, ck_path->l[0].b);
if (new_k) {
kfree(ck->k);
ck->u64s = new_u64s;
bkey_reassemble(ck->k, k);
ck->valid = true;
- bch2_btree_node_unlock_write(ck_iter->l[0].b, ck_iter);
+ bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b);
/* We're not likely to need this iterator again: */
- set_btree_iter_dontneed(trans, iter);
+ path->preserve = false;
err:
- bch2_trans_iter_put(trans, iter);
+ bch2_path_put(trans, path, 0);
return ret;
}
static int bkey_cached_check_fn(struct six_lock *lock, void *p)
{
struct bkey_cached *ck = container_of(lock, struct bkey_cached, c.lock);
- const struct btree_iter *iter = p;
+ const struct btree_path *path = p;
- return ck->key.btree_id == iter->btree_id &&
- !bpos_cmp(ck->key.pos, iter->pos) ? 0 : -1;
+ return ck->key.btree_id == path->btree_id &&
+ !bpos_cmp(ck->key.pos, path->pos) ? 0 : -1;
}
__flatten
-int bch2_btree_iter_traverse_cached(struct btree_iter *iter)
+int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
+ unsigned flags)
{
- struct btree_trans *trans = iter->trans;
struct bch_fs *c = trans->c;
struct bkey_cached *ck;
int ret = 0;
- BUG_ON(iter->level);
+ BUG_ON(path->level);
- iter->l[1].b = NULL;
+ path->l[1].b = NULL;
- if (bch2_btree_node_relock(iter, 0)) {
- ck = (void *) iter->l[0].b;
+ if (bch2_btree_node_relock(trans, path, 0)) {
+ ck = (void *) path->l[0].b;
goto fill;
}
retry:
- ck = bch2_btree_key_cache_find(c, iter->btree_id, iter->pos);
+ ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
if (!ck) {
- if (iter->flags & BTREE_ITER_CACHED_NOCREATE) {
- iter->l[0].b = NULL;
+ if (flags & BTREE_ITER_CACHED_NOCREATE) {
+ path->l[0].b = NULL;
return 0;
}
- ck = btree_key_cache_create(&c->btree_key_cache,
- iter->btree_id, iter->pos);
+ ck = btree_key_cache_create(c, path->btree_id, path->pos);
ret = PTR_ERR_OR_ZERO(ck);
if (ret)
goto err;
if (!ck)
goto retry;
- mark_btree_node_locked(iter, 0, SIX_LOCK_intent);
- iter->locks_want = 1;
+ mark_btree_node_locked(path, 0, SIX_LOCK_intent);
+ path->locks_want = 1;
} else {
- enum six_lock_type lock_want = __btree_lock_want(iter, 0);
+ enum six_lock_type lock_want = __btree_lock_want(path, 0);
- if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want,
- bkey_cached_check_fn, iter, _THIS_IP_)) {
+ if (!btree_node_lock(trans, path, (void *) ck, path->pos, 0,
+ lock_want,
+ bkey_cached_check_fn, path, _THIS_IP_)) {
if (!trans->restarted)
goto retry;
- trace_transaction_restart_ip(trans->ip, _THIS_IP_);
ret = -EINTR;
goto err;
}
- if (ck->key.btree_id != iter->btree_id ||
- bpos_cmp(ck->key.pos, iter->pos)) {
+ if (ck->key.btree_id != path->btree_id ||
+ bpos_cmp(ck->key.pos, path->pos)) {
six_unlock_type(&ck->c.lock, lock_want);
goto retry;
}
- mark_btree_node_locked(iter, 0, lock_want);
+ mark_btree_node_locked(path, 0, lock_want);
}
- iter->l[0].lock_seq = ck->c.lock.state.seq;
- iter->l[0].b = (void *) ck;
+ path->l[0].lock_seq = ck->c.lock.state.seq;
+ path->l[0].b = (void *) ck;
fill:
- if (!ck->valid && !(iter->flags & BTREE_ITER_CACHED_NOFILL)) {
- if (!iter->locks_want &&
- !!__bch2_btree_iter_upgrade(iter, 1)) {
- trace_transaction_restart_ip(trans->ip, _THIS_IP_);
- BUG_ON(!trans->restarted);
- ret = -EINTR;
+ if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) {
+ if (!path->locks_want &&
+ !__bch2_btree_path_upgrade(trans, path, 1)) {
+ trace_transaction_restart_ip(trans->fn, _THIS_IP_);
+ ret = btree_trans_restart(trans);
goto err;
}
- ret = btree_key_cache_fill(trans, iter, ck);
+ ret = btree_key_cache_fill(trans, path, ck);
if (ret)
goto err;
}
if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
- iter->uptodate = BTREE_ITER_NEED_PEEK;
-
- if ((iter->flags & BTREE_ITER_INTENT) &&
- !bch2_btree_iter_upgrade(iter, 1)) {
- BUG_ON(!trans->restarted);
- ret = -EINTR;
- }
-
- BUG_ON(!ret && !btree_node_locked(iter, 0));
+ path->uptodate = BTREE_ITER_UPTODATE;
+ BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
return ret;
err:
if (ret != -EINTR) {
- btree_node_unlock(iter, 0);
- iter->flags |= BTREE_ITER_ERROR;
- iter->l[0].b = BTREE_ITER_NO_NODE_ERROR;
+ btree_node_unlock(path, 0);
+ path->l[0].b = BTREE_ITER_NO_NODE_ERROR;
}
return ret;
}
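/*
 * A minimal sketch of the new calling convention (hypothetical caller;
 * btree id and flags chosen for illustration, -EINTR means the
 * transaction must be restarted):
 *
 *	struct btree_path *path;
 *	int ret;
 *
 *	path = bch2_path_get(trans, BTREE_ID_inodes, pos, 1, 0,
 *			     BTREE_ITER_CACHED|BTREE_ITER_INTENT, _THIS_IP_);
 *	ret = bch2_btree_path_traverse_cached(trans, path, 0);
 *	if (!ret) {
 *		struct bkey_cached *ck = (void *) path->l[0].b;
 *		...
 *	}
 *	bch2_path_put(trans, path, true);
 */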
{
struct bch_fs *c = trans->c;
struct journal *j = &c->journal;
- struct btree_iter *c_iter = NULL, *b_iter = NULL;
+ struct btree_iter c_iter, b_iter;
struct bkey_cached *ck = NULL;
int ret;
- b_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos,
- BTREE_ITER_SLOTS|
- BTREE_ITER_INTENT);
- c_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos,
- BTREE_ITER_CACHED|
- BTREE_ITER_CACHED_NOFILL|
- BTREE_ITER_CACHED_NOCREATE|
- BTREE_ITER_INTENT);
- ret = bch2_btree_iter_traverse(c_iter);
+ bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
+ BTREE_ITER_SLOTS|
+ BTREE_ITER_INTENT|
+ BTREE_ITER_ALL_SNAPSHOTS);
+ bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
+ BTREE_ITER_CACHED|
+ BTREE_ITER_CACHED_NOFILL|
+ BTREE_ITER_CACHED_NOCREATE|
+ BTREE_ITER_INTENT);
+ b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE;
+
+ ret = bch2_btree_iter_traverse(&c_iter);
if (ret)
goto out;
- ck = (void *) c_iter->l[0].b;
- if (!ck ||
- (journal_seq && ck->journal.seq != journal_seq))
+ ck = (void *) c_iter.path->l[0].b;
+ if (!ck)
goto out;
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- if (!evict)
- goto out;
- goto evict;
+ if (evict)
+ goto evict;
+ goto out;
}
+ BUG_ON(!ck->valid);
+
+ if (journal_seq && ck->journal.seq != journal_seq)
+ goto out;
+
/*
* Since journal reclaim depends on us making progress here, and the
* allocator/copygc depend on journal reclaim making progress, we need
* to be using alloc reserves:
 */
- ret = bch2_btree_iter_traverse(b_iter) ?:
- bch2_trans_update(trans, b_iter, ck->k,
+ ret = bch2_btree_iter_traverse(&b_iter) ?:
+ bch2_trans_update(trans, &b_iter, ck->k,
+ BTREE_UPDATE_KEY_CACHE_RECLAIM|
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
BTREE_TRIGGER_NORUN) ?:
bch2_trans_commit(trans, NULL, NULL,
bch2_journal_pin_drop(j, &ck->journal);
bch2_journal_preres_put(j, &ck->res);
- BUG_ON(!btree_node_locked(c_iter, 0));
+ BUG_ON(!btree_node_locked(c_iter.path, 0));
if (!evict) {
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
}
} else {
evict:
- BUG_ON(!btree_node_intent_locked(c_iter, 0));
+ BUG_ON(!btree_node_intent_locked(c_iter.path, 0));
- mark_btree_node_unlocked(c_iter, 0);
- c_iter->l[0].b = NULL;
+ mark_btree_node_unlocked(c_iter.path, 0);
+ c_iter.path->l[0].b = NULL;
six_lock_write(&ck->c.lock, NULL, NULL);
mutex_unlock(&c->btree_key_cache.lock);
}
out:
- bch2_trans_iter_put(trans, b_iter);
- bch2_trans_iter_put(trans, c_iter);
+ bch2_trans_iter_exit(trans, &b_iter);
+ bch2_trans_iter_exit(trans, &c_iter);
return ret;
}
}
bool bch2_btree_insert_key_cached(struct btree_trans *trans,
- struct btree_iter *iter,
+ struct btree_path *path,
struct bkey_i *insert)
{
struct bch_fs *c = trans->c;
- struct bkey_cached *ck = (void *) iter->l[0].b;
+ struct bkey_cached *ck = (void *) path->l[0].b;
bool kick_reclaim = false;
BUG_ON(insert->u64s > ck->u64s);
rcu_read_lock();
tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
- for (i = 0; i < tbl->size; i++)
- rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
- bkey_cached_evict(bc, ck);
- list_add(&ck->list, &bc->freed);
- }
+ if (tbl)
+ for (i = 0; i < tbl->size; i++)
+ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
+ bkey_cached_evict(bc, ck);
+ list_add(&ck->list, &bc->freed);
+ }
rcu_read_unlock();
list_for_each_entry_safe(ck, n, &bc->freed, list) {
size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
size_t max_dirty = 4096 + (nr_keys * 3) / 4;
- return nr_dirty > max_dirty &&
- test_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags);
+ return nr_dirty > max_dirty;
}
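/*
 * i.e. the predicate fires once dirty keys exceed a 4096-key floor plus
 * 3/4 of all cached keys; with 1M cached keys that threshold is
 * 4096 + 750000 = 754096 dirty keys.
 */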
int bch2_btree_key_cache_journal_flush(struct journal *,
struct bkey_cached *
bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos);
-int bch2_btree_iter_traverse_cached(struct btree_iter *);
+int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *,
+ unsigned);
bool bch2_btree_insert_key_cached(struct btree_trans *,
- struct btree_iter *, struct bkey_i *);
+ struct btree_path *, struct bkey_i *);
int bch2_btree_key_cache_flush(struct btree_trans *,
enum btree_id, struct bpos);
#ifdef CONFIG_BCACHEFS_DEBUG
BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent,
};
-static inline int btree_node_locked_type(struct btree_iter *iter,
+static inline int btree_node_locked_type(struct btree_path *path,
unsigned level)
{
/*
* branches:
*/
return BTREE_NODE_UNLOCKED +
- ((iter->nodes_locked >> level) & 1) +
- ((iter->nodes_intent_locked >> level) & 1);
+ ((path->nodes_locked >> level) & 1) +
+ ((path->nodes_intent_locked >> level) & 1);
}
-static inline bool btree_node_intent_locked(struct btree_iter *iter,
+static inline bool btree_node_intent_locked(struct btree_path *path,
unsigned level)
{
- return btree_node_locked_type(iter, level) == BTREE_NODE_INTENT_LOCKED;
+ return btree_node_locked_type(path, level) == BTREE_NODE_INTENT_LOCKED;
}
-static inline bool btree_node_read_locked(struct btree_iter *iter,
+static inline bool btree_node_read_locked(struct btree_path *path,
unsigned level)
{
- return btree_node_locked_type(iter, level) == BTREE_NODE_READ_LOCKED;
+ return btree_node_locked_type(path, level) == BTREE_NODE_READ_LOCKED;
}
-static inline bool btree_node_locked(struct btree_iter *iter, unsigned level)
+static inline bool btree_node_locked(struct btree_path *path, unsigned level)
{
- return iter->nodes_locked & (1 << level);
+ return path->nodes_locked & (1 << level);
}
-static inline void mark_btree_node_unlocked(struct btree_iter *iter,
+static inline void mark_btree_node_unlocked(struct btree_path *path,
unsigned level)
{
- iter->nodes_locked &= ~(1 << level);
- iter->nodes_intent_locked &= ~(1 << level);
+ path->nodes_locked &= ~(1 << level);
+ path->nodes_intent_locked &= ~(1 << level);
}
-static inline void mark_btree_node_locked(struct btree_iter *iter,
+static inline void mark_btree_node_locked(struct btree_path *path,
unsigned level,
enum six_lock_type type)
{
BUILD_BUG_ON(SIX_LOCK_read != 0);
BUILD_BUG_ON(SIX_LOCK_intent != 1);
- iter->nodes_locked |= 1 << level;
- iter->nodes_intent_locked |= type << level;
+ path->nodes_locked |= 1 << level;
+ path->nodes_intent_locked |= type << level;
}
-static inline void mark_btree_node_intent_locked(struct btree_iter *iter,
+static inline void mark_btree_node_intent_locked(struct btree_path *path,
unsigned level)
{
- mark_btree_node_locked(iter, level, SIX_LOCK_intent);
+ mark_btree_node_locked(path, level, SIX_LOCK_intent);
}
-static inline enum six_lock_type __btree_lock_want(struct btree_iter *iter, int level)
+static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level)
{
- return level < iter->locks_want
+ return level < path->locks_want
? SIX_LOCK_intent
: SIX_LOCK_read;
}
static inline enum btree_node_locked_type
-btree_lock_want(struct btree_iter *iter, int level)
+btree_lock_want(struct btree_path *path, int level)
{
- if (level < iter->level)
+ if (level < path->level)
return BTREE_NODE_UNLOCKED;
- if (level < iter->locks_want)
+ if (level < path->locks_want)
return BTREE_NODE_INTENT_LOCKED;
- if (level == iter->level)
+ if (level == path->level)
return BTREE_NODE_READ_LOCKED;
return BTREE_NODE_UNLOCKED;
}
-static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
+static inline void btree_node_unlock(struct btree_path *path, unsigned level)
{
- int lock_type = btree_node_locked_type(iter, level);
+ int lock_type = btree_node_locked_type(path, level);
EBUG_ON(level >= BTREE_MAX_DEPTH);
if (lock_type != BTREE_NODE_UNLOCKED)
- six_unlock_type(&iter->l[level].b->c.lock, lock_type);
- mark_btree_node_unlocked(iter, level);
+ six_unlock_type(&path->l[level].b->c.lock, lock_type);
+ mark_btree_node_unlocked(path, level);
}
-static inline void __bch2_btree_iter_unlock(struct btree_iter *iter)
+static inline void __bch2_btree_path_unlock(struct btree_path *path)
{
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK);
- while (iter->nodes_locked)
- btree_node_unlock(iter, __ffs(iter->nodes_locked));
+ while (path->nodes_locked)
+ btree_node_unlock(path, __ffs(path->nodes_locked));
}
static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type)
}
}
-/*
- * wrapper around six locks that just traces lock contended time
- */
-static inline void __btree_node_lock_type(struct bch_fs *c, struct btree *b,
- enum six_lock_type type)
+static inline bool btree_node_lock_type(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ struct bpos pos, unsigned level,
+ enum six_lock_type type,
+ six_lock_should_sleep_fn should_sleep_fn, void *p)
{
- u64 start_time = local_clock();
+ struct bch_fs *c = trans->c;
+ u64 start_time;
+ bool ret;
- six_lock_type(&b->c.lock, type, NULL, NULL);
- bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time);
-}
+ if (six_trylock_type(&b->c.lock, type))
+ return true;
-static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b,
- enum six_lock_type type)
-{
- if (!six_trylock_type(&b->c.lock, type))
- __btree_node_lock_type(c, b, type);
+ start_time = local_clock();
+
+ trans->locking_path_idx = path->idx;
+ trans->locking_pos = pos;
+ trans->locking_btree_id = path->btree_id;
+ trans->locking_level = level;
+ trans->locking_lock_type = type;
+ trans->locking = b;
+ ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0;
+ trans->locking = NULL;
+
+ if (ret)
+ bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time);
+
+ return ret;
}
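/*
 * The should_sleep_fn callback follows the six lock contract: it is
 * consulted before blocking, and a nonzero return aborts the attempt,
 * making six_lock_type() return nonzero as well. bkey_cached_check_fn()
 * earlier in this patch is one such callback, failing the lock if the
 * cached key was reused for a different position while we waited.
 */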
/*
struct btree *b, unsigned level,
enum btree_node_locked_type want)
{
- struct btree_iter *iter;
+ struct btree_path *path;
- trans_for_each_iter(trans, iter)
- if (iter->l[level].b == b &&
- btree_node_locked_type(iter, level) >= want) {
+ trans_for_each_path(trans, path)
+ if (path->l[level].b == b &&
+ btree_node_locked_type(path, level) >= want) {
six_lock_increment(&b->c.lock, want);
return true;
}
return false;
}
-bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned,
- struct btree_iter *, enum six_lock_type,
+bool __bch2_btree_node_lock(struct btree_trans *, struct btree_path *,
+ struct btree *, struct bpos, unsigned,
+ enum six_lock_type,
six_lock_should_sleep_fn, void *,
unsigned long);
-static inline bool btree_node_lock(struct btree *b,
- struct bpos pos, unsigned level,
- struct btree_iter *iter,
+static inline bool btree_node_lock(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b, struct bpos pos, unsigned level,
enum six_lock_type type,
six_lock_should_sleep_fn should_sleep_fn, void *p,
unsigned long ip)
{
- struct btree_trans *trans = iter->trans;
-
EBUG_ON(level >= BTREE_MAX_DEPTH);
- EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx)));
+ EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
return likely(six_trylock_type(&b->c.lock, type)) ||
btree_node_lock_increment(trans, b, level, type) ||
- __bch2_btree_node_lock(b, pos, level, iter, type,
+ __bch2_btree_node_lock(trans, path, b, pos, level, type,
should_sleep_fn, p, ip);
}
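/*
 * Lock acquisition tries three paths in order of decreasing cheapness:
 * an uncontended trylock, then taking an extra reference on a lock this
 * transaction already holds on @b (btree_node_lock_increment()), and
 * only then the blocking slow path, which records what we are waiting
 * on in @trans (see btree_node_lock_type() above) and can fail when the
 * transaction has to restart.
 */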
-bool __bch2_btree_node_relock(struct btree_iter *, unsigned);
+bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned);
-static inline bool bch2_btree_node_relock(struct btree_iter *iter,
- unsigned level)
+static inline bool bch2_btree_node_relock(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
{
- EBUG_ON(btree_node_locked(iter, level) &&
- btree_node_locked_type(iter, level) !=
- __btree_lock_want(iter, level));
+ EBUG_ON(btree_node_locked(path, level) &&
+ btree_node_locked_type(path, level) !=
+ __btree_lock_want(path, level));
- return likely(btree_node_locked(iter, level)) ||
- __bch2_btree_node_relock(iter, level);
+ return likely(btree_node_locked(path, level)) ||
+ __bch2_btree_node_relock(trans, path, level);
}
/*
* succeed:
*/
static inline void
-bch2_btree_node_unlock_write_inlined(struct btree *b, struct btree_iter *iter)
+bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path,
+ struct btree *b)
{
- struct btree_iter *linked;
+ struct btree_path *linked;
- EBUG_ON(iter->l[b->c.level].b != b);
- EBUG_ON(iter->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq);
+ EBUG_ON(path->l[b->c.level].b != b);
+ EBUG_ON(path->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq);
- trans_for_each_iter_with_node(iter->trans, b, linked)
+ trans_for_each_path_with_node(trans, b, linked)
linked->l[b->c.level].lock_seq += 2;
six_unlock_write(&b->c.lock);
}
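/*
 * The += 2 above keeps linked paths' cached lock sequence numbers in
 * step with the lock itself: taking the write lock bumped the sequence
 * once (hence the lock_seq + 1 assertion) and six_unlock_write() bumps
 * it again, so paths held across the update stay valid without
 * relocking.
 */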
-void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *);
+void bch2_btree_node_unlock_write(struct btree_trans *,
+ struct btree_path *, struct btree *);
-void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *);
+void __bch2_btree_node_lock_write(struct btree_trans *, struct btree *);
-static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
+static inline void bch2_btree_node_lock_write(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b)
{
- EBUG_ON(iter->l[b->c.level].b != b);
- EBUG_ON(iter->l[b->c.level].lock_seq != b->c.lock.state.seq);
+ EBUG_ON(path->l[b->c.level].b != b);
+ EBUG_ON(path->l[b->c.level].lock_seq != b->c.lock.state.seq);
+ EBUG_ON(!btree_node_intent_locked(path, b->c.level));
if (unlikely(!six_trylock_write(&b->c.lock)))
- __bch2_btree_node_lock_write(b, iter);
+ __bch2_btree_node_lock_write(trans, b);
}
#endif /* _BCACHEFS_BTREE_LOCKING_H */
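/*
 * A minimal sketch of the write lock pairing, following
 * btree_key_cache_fill() above (error handling elided):
 *
 *	bch2_btree_node_lock_write(trans, path, path->l[0].b);
 *	... modify the node's keys in place ...
 *	bch2_btree_node_unlock_write(trans, path, path->l[0].b);
 */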
} data[MAX_BSETS];
};
-enum btree_iter_type {
- BTREE_ITER_KEYS,
- BTREE_ITER_NODES,
- BTREE_ITER_CACHED,
-};
-
-#define BTREE_ITER_TYPE ((1 << 2) - 1)
-
/*
* Iterate over all possible positions, synthesizing deleted keys for holes:
*/
-#define BTREE_ITER_SLOTS (1 << 2)
+#define BTREE_ITER_SLOTS (1 << 0)
/*
* Indicates that intent locks should be taken on leaf nodes, because we expect
* to be doing updates:
*/
-#define BTREE_ITER_INTENT (1 << 3)
+#define BTREE_ITER_INTENT (1 << 1)
/*
* Causes the btree iterator code to prefetch additional btree nodes from disk:
*/
-#define BTREE_ITER_PREFETCH (1 << 4)
+#define BTREE_ITER_PREFETCH (1 << 2)
/*
* Indicates that this iterator should not be reused until transaction commit,
* either because a pending update references it or because the update depends
* on that particular key being locked (e.g. by the str_hash code, for hash
* table consistency)
*/
-#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 5)
+#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 3)
/*
* Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
* @pos or the first key strictly greater than @pos
*/
-#define BTREE_ITER_IS_EXTENTS (1 << 6)
-#define BTREE_ITER_NOT_EXTENTS (1 << 7)
-#define BTREE_ITER_ERROR (1 << 8)
-#define BTREE_ITER_SET_POS_AFTER_COMMIT (1 << 9)
-#define BTREE_ITER_CACHED_NOFILL (1 << 10)
-#define BTREE_ITER_CACHED_NOCREATE (1 << 11)
-#define BTREE_ITER_WITH_UPDATES (1 << 12)
+#define BTREE_ITER_IS_EXTENTS (1 << 4)
+#define BTREE_ITER_NOT_EXTENTS (1 << 5)
+#define BTREE_ITER_CACHED (1 << 6)
+#define BTREE_ITER_CACHED_NOFILL (1 << 7)
+#define BTREE_ITER_CACHED_NOCREATE (1 << 8)
+#define BTREE_ITER_WITH_KEY_CACHE (1 << 9)
+#define BTREE_ITER_WITH_UPDATES (1 << 10)
+#define BTREE_ITER_WITH_JOURNAL (1 << 11)
+#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 12)
#define BTREE_ITER_ALL_SNAPSHOTS (1 << 13)
+#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 14)
+#define BTREE_ITER_NOPRESERVE (1 << 15)
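/*
 * Iterator flags combine by OR; for example, the key cache flush path
 * earlier in this patch initializes its cache iterator as:
 *
 *	bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
 *			     BTREE_ITER_CACHED|
 *			     BTREE_ITER_CACHED_NOFILL|
 *			     BTREE_ITER_CACHED_NOCREATE|
 *			     BTREE_ITER_INTENT);
 */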
-enum btree_iter_uptodate {
+enum btree_path_uptodate {
BTREE_ITER_UPTODATE = 0,
- BTREE_ITER_NEED_PEEK = 1,
- BTREE_ITER_NEED_RELOCK = 2,
- BTREE_ITER_NEED_TRAVERSE = 3,
+ BTREE_ITER_NEED_RELOCK = 1,
+ BTREE_ITER_NEED_TRAVERSE = 2,
};
#define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1)
#define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7)
#define BTREE_ITER_NO_NODE_CACHED ((struct btree *) 8)
-/*
- * @pos - iterator's current position
- * @level - current btree depth
- * @locks_want - btree level below which we start taking intent locks
- * @nodes_locked - bitmask indicating which nodes in @nodes are locked
- * @nodes_intent_locked - bitmask indicating which locks are intent locks
- */
-struct btree_iter {
- struct btree_trans *trans;
- unsigned long ip_allocated;
-
+struct btree_path {
u8 idx;
- u8 child_idx;
u8 sorted_idx;
+ u8 ref;
+ u8 intent_ref;
/* btree_iter_copy starts here: */
- u16 flags;
-
- /* When we're filtering by snapshot, the snapshot ID we're looking for: */
- unsigned snapshot;
-
struct bpos pos;
- struct bpos real_pos;
- struct bpos pos_after_commit;
enum btree_id btree_id:4;
- enum btree_iter_uptodate uptodate:3;
+ bool cached:1;
+ bool preserve:1;
+ enum btree_path_uptodate uptodate:2;
/*
- * True if we've returned a key (and thus are expected to keep it
- * locked), false after set_pos - for avoiding spurious transaction
- * restarts in bch2_trans_relock():
+ * When true, failing to relock this path will cause the transaction to
+ * restart:
*/
bool should_be_locked:1;
- unsigned level:4,
- min_depth:4,
+ unsigned level:3,
locks_want:4,
nodes_locked:4,
nodes_intent_locked:4;
- struct btree_iter_level {
+ struct btree_path_level {
struct btree *b;
struct btree_node_iter iter;
u32 lock_seq;
} l[BTREE_MAX_DEPTH];
+#ifdef CONFIG_BCACHEFS_DEBUG
+ unsigned long ip_allocated;
+#endif
+};
+
+static inline struct btree_path_level *path_l(struct btree_path *path)
+{
+ return path->l + path->level;
+}
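/*
 * path_l() takes over from the old iter_l() helper: the node a path
 * currently points at is path_l(path)->b, as used by
 * bch2_btree_split_leaf() later in this patch.
 */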
+
+/*
+ * @pos - iterator's current position
+ * (@level, @locks_want, @nodes_locked and @nodes_intent_locked now
+ * live in struct btree_path, above)
+ */
+struct btree_iter {
+ struct btree_trans *trans;
+ struct btree_path *path;
+ struct btree_path *update_path;
+ struct btree_path *key_cache_path;
+ enum btree_id btree_id:4;
+ unsigned min_depth:4;
+
+ /* btree_iter_copy starts here: */
+ u16 flags;
+
+ /* When we're filtering by snapshot, the snapshot ID we're looking for: */
+ unsigned snapshot;
+
+ struct bpos pos;
+ struct bpos pos_after_commit;
/*
* Current unpacked key - so that bch2_btree_iter_next()/
* bch2_btree_iter_next_slot() can correctly advance pos.
*/
struct bkey k;
+#ifdef CONFIG_BCACHEFS_DEBUG
+ unsigned long ip_allocated;
+#endif
};
-static inline enum btree_iter_type
-btree_iter_type(const struct btree_iter *iter)
-{
- return iter->flags & BTREE_ITER_TYPE;
-}
-
-static inline bool btree_iter_is_cached(const struct btree_iter *iter)
-{
- return btree_iter_type(iter) == BTREE_ITER_CACHED;
-}
-
-static inline struct btree_iter_level *iter_l(struct btree_iter *iter)
-{
- return iter->l + iter->level;
-}
-
struct btree_key_cache {
struct mutex lock;
struct rhashtable table;
u8 bkey_type;
enum btree_id btree_id:8;
u8 level;
- unsigned trans_triggers_run:1;
+ bool cached:1;
+ bool insert_trigger_run:1;
+ bool overwrite_trigger_run:1;
struct bkey_i *k;
- struct btree_iter *iter;
+ struct btree_path *path;
+ unsigned long ip_allocated;
};
#ifndef CONFIG_LOCKDEP
struct btree_trans {
struct bch_fs *c;
-#ifdef CONFIG_BCACHEFS_DEBUG
+ const char *fn;
struct list_head list;
struct btree *locking;
- unsigned locking_iter_idx;
+ unsigned locking_path_idx;
struct bpos locking_pos;
u8 locking_btree_id;
u8 locking_level;
+ u8 locking_lock_type;
pid_t pid;
-#endif
- unsigned long ip;
int srcu_idx;
u8 nr_sorted;
u8 nr_updates;
bool used_mempool:1;
- bool error:1;
bool in_traverse_all:1;
bool restarted:1;
+ bool journal_transaction_names:1;
/*
* For when bch2_trans_update notices we'll be splitting a compressed
* extent:
*/
unsigned extra_journal_res;
- u64 iters_linked;
- u64 iters_live;
- u64 iters_touched;
+ u64 paths_allocated;
unsigned mem_top;
unsigned mem_bytes;
void *mem;
- u8 *sorted;
- struct btree_iter *iters;
+ u8 sorted[BTREE_ITER_MAX];
+ struct btree_path *paths;
struct btree_insert_entry *updates;
/* update path: */
return btree_node_type_is_extents(btree_node_type(b));
}
-static inline enum btree_node_type btree_iter_key_type(struct btree_iter *iter)
-{
- return __btree_node_type(iter->level, iter->btree_id);
-}
-
-static inline bool btree_iter_is_extents(struct btree_iter *iter)
-{
- return btree_node_type_is_extents(btree_iter_key_type(iter));
-}
-
#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \
((1U << BKEY_TYPE_extents)| \
(1U << BKEY_TYPE_inodes)| \
#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \
((1U << BKEY_TYPE_alloc)| \
- (1U << BKEY_TYPE_stripes))
+ (1U << BKEY_TYPE_inodes)| \
+ (1U << BKEY_TYPE_stripes)| \
+ (1U << BKEY_TYPE_snapshots))
#define BTREE_NODE_TYPE_HAS_TRIGGERS \
(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \
enum btree_update_flags {
__BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
+ __BTREE_UPDATE_KEY_CACHE_RECLAIM,
__BTREE_TRIGGER_NORUN, /* Don't run triggers at all */
};
#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)
+#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM)
#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN)
#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC)
#define BTREE_TRIGGER_WANTS_OLD_AND_NEW \
- ((1U << KEY_TYPE_stripe)| \
- (1U << KEY_TYPE_inode))
+ ((1U << KEY_TYPE_alloc)| \
+ (1U << KEY_TYPE_alloc_v2)| \
+ (1U << KEY_TYPE_alloc_v3)| \
+ (1U << KEY_TYPE_stripe)| \
+ (1U << KEY_TYPE_inode)| \
+ (1U << KEY_TYPE_inode_v2)| \
+ (1U << KEY_TYPE_snapshot))
static inline bool btree_node_type_needs_gc(enum btree_node_type type)
{
s8 error;
};
-/*
- * Optional hook that will be called just prior to a btree node update, when
- * we're holding the write lock and we know what key is about to be overwritten:
- */
-
enum btree_insert_ret {
BTREE_INSERT_OK,
/* leaf node needs to be split */
btree_next_sib,
};
-typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *,
- struct btree *,
- struct btree_node_iter *);
-
#endif /* _BCACHEFS_BTREE_TYPES_H */
struct bch_fs;
struct btree;
-void bch2_btree_node_lock_for_insert(struct btree_trans *, struct btree_iter *,
+void bch2_btree_node_lock_for_insert(struct btree_trans *, struct btree_path *,
struct btree *);
-bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *,
- struct btree_node_iter *, struct bkey_i *);
+bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *,
+ struct btree *, struct btree_node_iter *,
+ struct bkey_i *);
void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
enum btree_insert_flags {
struct disk_reservation *, u64 *, int flags);
int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id,
- struct bpos, struct bpos, u64 *);
+ struct bpos, struct bpos, unsigned, u64 *);
int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
- struct bpos, struct bpos, u64 *);
+ struct bpos, struct bpos, unsigned, u64 *);
int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *,
- __le64, unsigned);
+ struct btree *, unsigned);
void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *);
int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *,
struct btree *, struct bkey_i *, bool);
int bch2_btree_node_update_key_get_iter(struct btree_trans *,
struct btree *, struct bkey_i *, bool);
-int bch2_trans_update(struct btree_trans *, struct btree_iter *,
- struct bkey_i *, enum btree_update_flags);
+int bch2_trans_update_extent(struct btree_trans *, struct btree_iter *,
+ struct bkey_i *, enum btree_update_flags);
+
+int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *,
+ struct bkey_i *, enum btree_update_flags);
+
void bch2_trans_commit_hook(struct btree_trans *,
struct btree_trans_commit_hook *);
int __bch2_trans_commit(struct btree_trans *);
#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \
({ \
struct btree_trans trans; \
- int _ret, _ret2; \
+ int _ret; \
\
bch2_trans_init(&trans, (_c), 0, 0); \
_ret = __bch2_trans_do(&trans, _disk_res, _journal_seq, _flags, \
_do); \
- _ret2 = bch2_trans_exit(&trans); \
+ bch2_trans_exit(&trans); \
\
- _ret ?: _ret2; \
+ _ret; \
})
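/*
 * Since bch2_trans_exit() no longer returns an error, bch2_trans_do()
 * yields _ret alone; typical usage, as in the async rewrite path later
 * in this patch:
 *
 *	bch2_trans_do(c, NULL, NULL, 0,
 *		      async_btree_node_rewrite_trans(&trans, a));
 */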
#define trans_for_each_update(_trans, _i) \
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
+#include "recovery.h"
#include "replicas.h"
#include "super-io.h"
#include <trace/events/bcachefs.h>
static void bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
- struct btree_iter *, struct btree *,
+ struct btree_path *, struct btree *,
struct keylist *, unsigned);
+static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
/* Debug code: */
BUG_ON(!b->c.level);
- if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags))
+ if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
return;
bch2_btree_node_iter_init_from_start(&iter, b);
clear_btree_node_noevict(b);
- bch2_btree_node_hash_remove(&c->btree_cache, b);
-
mutex_lock(&c->btree_cache.lock);
list_move(&b->list, &c->btree_cache.freeable);
mutex_unlock(&c->btree_cache.lock);
}
-void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
+static void bch2_btree_node_free_inmem(struct btree_trans *trans,
+ struct btree *b)
{
- struct open_buckets ob = b->ob;
+ struct bch_fs *c = trans->c;
+ struct btree_path *path;
- b->ob.nr = 0;
+ trans_for_each_path(trans, path)
+ BUG_ON(path->l[b->c.level].b == b &&
+ path->l[b->c.level].lock_seq == b->c.lock.state.seq);
- clear_btree_node_dirty(c, b);
+ six_lock_write(&b->c.lock, NULL, NULL);
- btree_node_lock_type(c, b, SIX_LOCK_write);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
__btree_node_free(c, b);
- six_unlock_write(&b->c.lock);
- bch2_open_buckets_put(c, &ob);
-}
-
-void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
- struct btree_iter *iter)
-{
- struct btree_iter *linked;
-
- trans_for_each_iter(iter->trans, linked)
- BUG_ON(linked->l[b->c.level].b == b);
-
- six_lock_write(&b->c.lock, NULL, NULL);
- __btree_node_free(c, b);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
}
if (IS_ERR(wp))
return ERR_CAST(wp);
- if (wp->sectors_free < c->opts.btree_node_size) {
+ if (wp->sectors_free < btree_sectors(c)) {
struct open_bucket *ob;
unsigned i;
open_bucket_for_each(c, &wp->ptrs, ob, i)
- if (ob->sectors_free < c->opts.btree_node_size)
+ if (ob->sectors_free < btree_sectors(c))
ob->sectors_free = 0;
bch2_alloc_sectors_done(c, wp);
}
bkey_btree_ptr_v2_init(&tmp.k);
- bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size);
+ bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false);
bch2_open_bucket_get(c, wp, &ob);
bch2_alloc_sectors_done(c, wp);
mem_alloc:
b = bch2_btree_node_mem_alloc(c);
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
/* we hold cannibalize_lock: */
BUG_ON(IS_ERR(b));
b = as->prealloc_nodes[--as->nr_prealloc_nodes];
+ six_lock_intent(&b->c.lock, NULL, NULL);
+ six_lock_write(&b->c.lock, NULL, NULL);
+
set_btree_node_accessed(b);
set_btree_node_dirty(c, b);
set_btree_node_need_write(b);
while (as->nr_prealloc_nodes) {
struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes];
- six_unlock_write(&b->c.lock);
+ six_lock_intent(&b->c.lock, NULL, NULL);
+ six_lock_write(&b->c.lock, NULL, NULL);
if (c->btree_reserve_cache_nr <
ARRAY_SIZE(c->btree_reserve_cache)) {
bch2_open_buckets_put(c, &b->ob);
}
- btree_node_lock_type(c, b, SIX_LOCK_write);
__btree_node_free(c, b);
six_unlock_write(&b->c.lock);
-
six_unlock_intent(&b->c.lock);
}
}
static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes,
- unsigned flags, struct closure *cl)
+ unsigned flags)
{
struct bch_fs *c = as->c;
+ struct closure cl;
struct btree *b;
int ret;
+ closure_init_stack(&cl);
+retry:
BUG_ON(nr_nodes > BTREE_RESERVE_MAX);
/*
* Protects reaping from the btree node cache and using the btree node
* open bucket reserve:
+ *
+ * BTREE_INSERT_NOWAIT only applies to btree node allocation, not
+ * blocking on this lock:
*/
- ret = bch2_btree_cache_cannibalize_lock(c, cl);
+ ret = bch2_btree_cache_cannibalize_lock(c, &cl);
if (ret)
- return ret;
+ goto err;
while (as->nr_prealloc_nodes < nr_nodes) {
b = __bch2_btree_node_alloc(c, &as->disk_res,
flags & BTREE_INSERT_NOWAIT
- ? NULL : cl, flags);
+ ? NULL : &cl, flags);
if (IS_ERR(b)) {
ret = PTR_ERR(b);
- goto err_free;
+ goto err;
}
as->prealloc_nodes[as->nr_prealloc_nodes++] = b;
}
bch2_btree_cache_cannibalize_unlock(c);
+ closure_sync(&cl);
return 0;
-err_free:
+err:
bch2_btree_cache_cannibalize_unlock(c);
- trace_btree_reserve_get_fail(c, nr_nodes, cl);
+ closure_sync(&cl);
+
+ if (ret == -EAGAIN)
+ goto retry;
+
+ trace_btree_reserve_get_fail(c, nr_nodes, &cl);
return ret;
}
bch2_disk_reservation_put(c, &as->disk_res);
bch2_btree_reserve_put(as);
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
+ as->start_time);
+
mutex_lock(&c->btree_interior_update_lock);
list_del(&as->unwritten_list);
list_del(&as->list);
- mutex_unlock(&c->btree_interior_update_lock);
closure_debug_destroy(&as->cl);
mempool_free(as, &c->btree_interior_update_pool);
+ /*
+ * Have to do the wakeup with btree_interior_update_lock still held,
+ * since being on btree_interior_update_list is our ref on @c:
+ */
closure_wake_up(&c->btree_interior_update_wait);
+
+ mutex_unlock(&c->btree_interior_update_lock);
}
static void btree_update_will_delete_key(struct btree_update *as,
* we're in journal error state:
*/
- btree_node_lock_type(c, b, SIX_LOCK_intent);
- btree_node_lock_type(c, b, SIX_LOCK_write);
+ six_lock_intent(&b->c.lock, NULL, NULL);
+ six_lock_write(&b->c.lock, NULL, NULL);
mutex_lock(&c->btree_interior_update_lock);
list_del(&as->write_blocked_list);
for (i = 0; i < as->nr_new_nodes; i++) {
b = as->new_nodes[i];
- btree_node_lock_type(c, b, SIX_LOCK_read);
+ six_lock_read(&b->c.lock, NULL, NULL);
btree_node_write_if_need(c, b, SIX_LOCK_read);
six_unlock_read(&b->c.lock);
}
* And it adds @b to the list of @as's new nodes, so that we can update sector
* counts in bch2_btree_update_nodes_written:
*/
-void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b)
+static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b)
{
struct bch_fs *c = as->c;
closure_put(&as->cl);
}
-void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b)
+static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b)
{
while (b->ob.nr)
as->open_buckets[as->nr_open_buckets++] =
* nodes and thus outstanding btree_updates - redirect @b's
* btree_updates to point to this btree_update:
*/
-void bch2_btree_interior_update_will_free_node(struct btree_update *as,
+static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
struct btree *b)
{
struct bch_fs *c = as->c;
as->nr_old_nodes++;
}
-void bch2_btree_update_done(struct btree_update *as)
+static void bch2_btree_update_done(struct btree_update *as)
{
+ struct bch_fs *c = as->c;
+ u64 start_time = as->start_time;
+
BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE);
if (as->took_gc_lock)
continue_at(&as->cl, btree_update_set_nodes_written,
as->c->btree_interior_update_worker);
+
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground],
+ start_time);
}
-struct btree_update *
-bch2_btree_update_start(struct btree_iter *iter, unsigned level,
- unsigned nr_nodes, unsigned flags)
+static struct btree_update *
+bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
+ unsigned level, unsigned nr_nodes, unsigned flags)
{
- struct btree_trans *trans = iter->trans;
struct bch_fs *c = trans->c;
struct btree_update *as;
- struct closure cl;
+ u64 start_time = local_clock();
int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
? BCH_DISK_RESERVATION_NOFAIL : 0;
int journal_flags = 0;
int ret = 0;
- BUG_ON(!iter->should_be_locked);
+ BUG_ON(!path->should_be_locked);
if (flags & BTREE_INSERT_JOURNAL_RESERVED)
journal_flags |= JOURNAL_RES_GET_RESERVED;
-
- closure_init_stack(&cl);
-retry:
+ if (flags & BTREE_INSERT_JOURNAL_RECLAIM)
+ journal_flags |= JOURNAL_RES_GET_NONBLOCK;
/*
* XXX: figure out how far we might need to split,
* instead of locking/reserving all the way to the root:
*/
- if (!bch2_btree_iter_upgrade(iter, U8_MAX)) {
- trace_trans_restart_iter_upgrade(trans->ip, _RET_IP_,
- iter->btree_id,
- &iter->real_pos);
- return ERR_PTR(-EINTR);
+ if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) {
+ trace_trans_restart_iter_upgrade(trans->fn, _RET_IP_,
+ path->btree_id, &path->pos);
+ ret = btree_trans_restart(trans);
+ return ERR_PTR(ret);
}
if (flags & BTREE_INSERT_GC_LOCK_HELD)
memset(as, 0, sizeof(*as));
closure_init(&as->cl, NULL);
as->c = c;
+ as->start_time = start_time;
as->mode = BTREE_INTERIOR_NO_UPDATE;
as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD);
- as->btree_id = iter->btree_id;
+ as->btree_id = path->btree_id;
INIT_LIST_HEAD(&as->list);
INIT_LIST_HEAD(&as->unwritten_list);
INIT_LIST_HEAD(&as->write_blocked_list);
if (ret)
goto err;
+ bch2_trans_unlock(trans);
+
ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
BTREE_UPDATE_JOURNAL_RES,
- journal_flags|JOURNAL_RES_GET_NONBLOCK);
- if (ret == -EAGAIN) {
- bch2_trans_unlock(trans);
-
- if (flags & BTREE_INSERT_JOURNAL_RECLAIM) {
- bch2_btree_update_free(as);
- btree_trans_restart(trans);
- return ERR_PTR(ret);
- }
-
- ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
- BTREE_UPDATE_JOURNAL_RES,
- journal_flags);
- if (ret) {
- trace_trans_restart_journal_preres_get(trans->ip, _RET_IP_);
- goto err;
- }
-
- if (!bch2_trans_relock(trans)) {
- ret = -EINTR;
- goto err;
- }
+ journal_flags);
+ if (ret) {
+ bch2_btree_update_free(as);
+ trace_trans_restart_journal_preres_get(trans->fn, _RET_IP_);
+ btree_trans_restart(trans);
+ return ERR_PTR(ret);
}
ret = bch2_disk_reservation_get(c, &as->disk_res,
- nr_nodes * c->opts.btree_node_size,
+ nr_nodes * btree_sectors(c),
c->opts.metadata_replicas,
disk_res_flags);
if (ret)
goto err;
- ret = bch2_btree_reserve_get(as, nr_nodes, flags, &cl);
+ ret = bch2_btree_reserve_get(as, nr_nodes, flags);
if (ret)
goto err;
+ if (!bch2_trans_relock(trans)) {
+ ret = -EINTR;
+ goto err;
+ }
+
bch2_journal_pin_add(&c->journal,
atomic64_read(&c->journal.seq),
&as->journal, NULL);
return as;
err:
bch2_btree_update_free(as);
-
- if (ret == -EAGAIN) {
- bch2_trans_unlock(trans);
- closure_sync(&cl);
- ret = -EINTR;
- }
-
- if (ret == -EINTR && bch2_trans_relock(trans))
- goto retry;
-
return ERR_PTR(ret);
}
* is nothing new to be done. This just guarantees that there is a
* journal write.
*/
-static void bch2_btree_set_root(struct btree_update *as, struct btree *b,
- struct btree_iter *iter)
+static void bch2_btree_set_root(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b)
{
struct bch_fs *c = as->c;
struct btree *old;
* Ensure no one is using the old root while we switch to the
* new root:
*/
- bch2_btree_node_lock_write(old, iter);
+ bch2_btree_node_lock_write(trans, path, old);
bch2_btree_set_root_inmem(c, b);
* an intent lock on the new root, and any updates that would
* depend on the new root would have to update the new root.
*/
- bch2_btree_node_unlock_write(old, iter);
+ bch2_btree_node_unlock_write(trans, path, old);
}
/* Interior node updates: */
-static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b,
- struct btree_iter *iter,
- struct bkey_i *insert,
- struct btree_node_iter *node_iter)
+static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ struct btree_node_iter *node_iter,
+ struct bkey_i *insert)
{
struct bch_fs *c = as->c;
struct bkey_packed *k;
BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
!btree_ptr_sectors_written(insert));
+ if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)))
+ bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);
+
invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?:
bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert));
if (invalid) {
bkey_iter_pos_cmp(b, k, &insert->k.p) < 0)
bch2_btree_node_iter_advance(node_iter, b);
- bch2_btree_bset_insert_key(iter, b, node_iter, insert);
+ bch2_btree_bset_insert_key(trans, path, b, node_iter, insert);
set_btree_node_dirty(c, b);
set_btree_node_need_write(b);
}
static void
-__bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
- struct btree_iter *iter, struct keylist *keys,
- struct btree_node_iter node_iter)
+__bch2_btree_insert_keys_interior(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ struct btree_node_iter node_iter,
+ struct keylist *keys)
{
struct bkey_i *insert = bch2_keylist_front(keys);
struct bkey_packed *k;
;
while (!bch2_keylist_empty(keys)) {
- bch2_insert_fixup_btree_ptr(as, b, iter,
- bch2_keylist_front(keys), &node_iter);
+ bch2_insert_fixup_btree_ptr(as, trans, path, b,
+ &node_iter, bch2_keylist_front(keys));
bch2_keylist_pop_front(keys);
}
}
* node)
*/
static struct btree *__btree_split_node(struct btree_update *as,
- struct btree *n1,
- struct btree_iter *iter)
+ struct btree *n1)
{
struct bkey_format_state s;
size_t nr_packed = 0, nr_unpacked = 0;
* nodes that were coalesced, and thus in the middle of a child node post
* coalescing:
*/
-static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
- struct btree_iter *iter,
+static void btree_split_insert_keys(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
struct keylist *keys)
{
struct btree_node_iter node_iter;
bch2_btree_node_iter_init(&node_iter, b, &k->k.p);
- __bch2_btree_insert_keys_interior(as, b, iter, keys, node_iter);
+ __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
/*
* We can't tolerate whiteouts here - with whiteouts there can be
btree_node_interior_verify(as->c, b);
}
-static void btree_split(struct btree_update *as,
- struct btree_trans *trans, struct btree_iter *iter,
- struct btree *b, struct keylist *keys,
- unsigned flags)
+static void btree_split(struct btree_update *as, struct btree_trans *trans,
+ struct btree_path *path, struct btree *b,
+ struct keylist *keys, unsigned flags)
{
struct bch_fs *c = as->c;
- struct btree *parent = btree_node_parent(iter, b);
+ struct btree *parent = btree_node_parent(path, b);
struct btree *n1, *n2 = NULL, *n3 = NULL;
u64 start_time = local_clock();
BUG_ON(!parent && (b != btree_node_root(c, b)));
- BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level));
+ BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level));
bch2_btree_interior_update_will_free_node(as, b);
bch2_btree_update_add_new_node(as, n1);
if (keys)
- btree_split_insert_keys(as, n1, iter, keys);
+ btree_split_insert_keys(as, trans, path, n1, keys);
if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) {
trace_btree_split(c, b);
- n2 = __btree_split_node(as, n1, iter);
+ n2 = __btree_split_node(as, n1);
bch2_btree_build_aux_trees(n2);
bch2_btree_build_aux_trees(n1);
n3->sib_u64s[0] = U16_MAX;
n3->sib_u64s[1] = U16_MAX;
- btree_split_insert_keys(as, n3, iter, &as->parent_keys);
+ btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
bch2_btree_node_write(c, n3, SIX_LOCK_intent);
}
if (parent) {
/* Split a non root node */
- bch2_btree_insert_node(as, trans, iter, parent, &as->parent_keys, flags);
+ bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
} else if (n3) {
- bch2_btree_set_root(as, n3, iter);
+ bch2_btree_set_root(as, trans, path, n3);
} else {
/* Root filled up but didn't need to be split */
- bch2_btree_set_root(as, n1, iter);
+ bch2_btree_set_root(as, trans, path, n1);
}
bch2_btree_update_get_open_buckets(as, n1);
if (n3)
bch2_btree_update_get_open_buckets(as, n3);
- /* Successful split, update the iterator to point to the new nodes: */
+ /* Successful split, update the path to point to the new nodes: */
six_lock_increment(&b->c.lock, SIX_LOCK_intent);
- bch2_btree_iter_node_drop(iter, b);
if (n3)
- bch2_btree_iter_node_replace(iter, n3);
+ bch2_trans_node_add(trans, n3);
if (n2)
- bch2_btree_iter_node_replace(iter, n2);
- bch2_btree_iter_node_replace(iter, n1);
+ bch2_trans_node_add(trans, n2);
+ bch2_trans_node_add(trans, n1);
/*
 * The old node must be freed (in memory) _before_ unlocking the new
 * nodes - else another thread could re-acquire a read lock on the old
 * node after another thread has locked and updated the new node, thus
 * seeing stale data:
*/
- bch2_btree_node_free_inmem(c, b, iter);
+ bch2_btree_node_free_inmem(trans, b);
if (n3)
six_unlock_intent(&n3->c.lock);
six_unlock_intent(&n2->c.lock);
six_unlock_intent(&n1->c.lock);
- bch2_btree_trans_verify_locks(trans);
+ bch2_trans_verify_locks(trans);
- bch2_time_stats_update(&c->times[BCH_TIME_btree_node_split],
+ bch2_time_stats_update(&c->times[n2
+ ? BCH_TIME_btree_node_split
+ : BCH_TIME_btree_node_compact],
start_time);
}
static void
-bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
- struct btree_iter *iter, struct keylist *keys)
+bch2_btree_insert_keys_interior(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ struct keylist *keys)
{
- struct btree_iter *linked;
+ struct btree_path *linked;
- __bch2_btree_insert_keys_interior(as, b, iter, keys, iter->l[b->c.level].iter);
+ __bch2_btree_insert_keys_interior(as, trans, path, b,
+ path->l[b->c.level].iter, keys);
btree_update_updated_node(as, b);
- trans_for_each_iter_with_node(iter->trans, b, linked)
+ trans_for_each_path_with_node(trans, b, linked)
bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
- bch2_btree_trans_verify_iters(iter->trans, b);
+ bch2_trans_verify_paths(trans);
}
/**
* If a split occurred, this function will return early. This can only happen
* for leaf nodes -- inserts into interior nodes have to be atomic.
*/
-static void bch2_btree_insert_node(struct btree_update *as,
- struct btree_trans *trans, struct btree_iter *iter,
- struct btree *b, struct keylist *keys,
- unsigned flags)
+static void bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
+ struct btree_path *path, struct btree *b,
+ struct keylist *keys, unsigned flags)
{
struct bch_fs *c = as->c;
int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
int live_u64s_added, u64s_added;
lockdep_assert_held(&c->gc_lock);
- BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level));
+ BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level));
BUG_ON(!b->c.level);
BUG_ON(!as || as->b);
bch2_verify_keylist_sorted(keys);
- bch2_btree_node_lock_for_insert(trans, iter, b);
+ bch2_btree_node_lock_for_insert(trans, path, b);
if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) {
- bch2_btree_node_unlock_write(b, iter);
+ bch2_btree_node_unlock_write(trans, path, b);
goto split;
}
btree_node_interior_verify(c, b);
- bch2_btree_insert_keys_interior(as, b, iter, keys);
+ bch2_btree_insert_keys_interior(as, trans, path, b, keys);
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
if (u64s_added > live_u64s_added &&
bch2_maybe_compact_whiteouts(c, b))
- bch2_btree_iter_reinit_node(iter, b);
+ bch2_trans_node_reinit_iter(trans, b);
- bch2_btree_node_unlock_write(b, iter);
+ bch2_btree_node_unlock_write(trans, path, b);
btree_node_interior_verify(c, b);
return;
split:
- btree_split(as, trans, iter, b, keys, flags);
+ btree_split(as, trans, path, b, keys, flags);
}
int bch2_btree_split_leaf(struct btree_trans *trans,
- struct btree_iter *iter,
+ struct btree_path *path,
unsigned flags)
{
struct bch_fs *c = trans->c;
- struct btree *b = iter_l(iter)->b;
+ struct btree *b = path_l(path)->b;
struct btree_update *as;
unsigned l;
int ret = 0;
- as = bch2_btree_update_start(iter, iter->level,
+ as = bch2_btree_update_start(trans, path, path->level,
btree_update_reserve_required(c, b), flags);
if (IS_ERR(as))
return PTR_ERR(as);
- btree_split(as, trans, iter, b, NULL, flags);
+ btree_split(as, trans, path, b, NULL, flags);
bch2_btree_update_done(as);
- for (l = iter->level + 1; btree_iter_node(iter, l) && !ret; l++)
- ret = bch2_foreground_maybe_merge(trans, iter, l, flags);
+ for (l = path->level + 1; btree_path_node(path, l) && !ret; l++)
+ ret = bch2_foreground_maybe_merge(trans, path, l, flags);
return ret;
}
int __bch2_foreground_maybe_merge(struct btree_trans *trans,
- struct btree_iter *iter,
+ struct btree_path *path,
unsigned level,
unsigned flags,
enum btree_node_sibling sib)
{
struct bch_fs *c = trans->c;
- struct btree_iter *sib_iter = NULL;
+ struct btree_path *sib_path = NULL;
struct btree_update *as;
struct bkey_format_state new_s;
struct bkey_format new_f;
struct btree *b, *m, *n, *prev, *next, *parent;
struct bpos sib_pos;
size_t sib_u64s;
- int ret = 0, ret2 = 0;
-
-retry:
- ret = bch2_btree_iter_traverse(iter);
- if (ret)
- return ret;
+ u64 start_time = local_clock();
+ int ret = 0;
- BUG_ON(!iter->should_be_locked);
- BUG_ON(!btree_node_locked(iter, level));
+ BUG_ON(!path->should_be_locked);
+ BUG_ON(!btree_node_locked(path, level));
- b = iter->l[level].b;
+ b = path->l[level].b;
if ((sib == btree_prev_sib && !bpos_cmp(b->data->min_key, POS_MIN)) ||
(sib == btree_next_sib && !bpos_cmp(b->data->max_key, SPOS_MAX))) {
b->sib_u64s[sib] = U16_MAX;
- goto out;
+ return 0;
}
sib_pos = sib == btree_prev_sib
? bpos_predecessor(b->data->min_key)
: bpos_successor(b->data->max_key);
- sib_iter = bch2_trans_get_node_iter(trans, iter->btree_id,
- sib_pos, U8_MAX, level,
- BTREE_ITER_INTENT);
- ret = bch2_btree_iter_traverse(sib_iter);
+ sib_path = bch2_path_get(trans, path->btree_id, sib_pos,
+ U8_MAX, level, BTREE_ITER_INTENT, _THIS_IP_);
+ ret = bch2_btree_path_traverse(trans, sib_path, false);
if (ret)
goto err;
- m = sib_iter->l[level].b;
+ sib_path->should_be_locked = true;
- if (btree_node_parent(iter, b) !=
- btree_node_parent(sib_iter, m)) {
+ m = sib_path->l[level].b;
+
+ if (btree_node_parent(path, b) !=
+ btree_node_parent(sib_path, m)) {
b->sib_u64s[sib] = U16_MAX;
goto out;
}
if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
goto out;
- parent = btree_node_parent(iter, b);
- as = bch2_btree_update_start(iter, level,
+ parent = btree_node_parent(path, b);
+ as = bch2_btree_update_start(trans, path, level,
btree_update_reserve_required(c, parent) + 1,
flags|
BTREE_INSERT_NOFAIL|
bch2_keylist_add(&as->parent_keys, &delete);
bch2_keylist_add(&as->parent_keys, &n->key);
- bch2_btree_insert_node(as, trans, iter, parent, &as->parent_keys, flags);
+ bch2_trans_verify_paths(trans);
+
+ bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
+
+ bch2_trans_verify_paths(trans);
bch2_btree_update_get_open_buckets(as, n);
six_lock_increment(&b->c.lock, SIX_LOCK_intent);
six_lock_increment(&m->c.lock, SIX_LOCK_intent);
- bch2_btree_iter_node_drop(iter, b);
- bch2_btree_iter_node_drop(iter, m);
- bch2_btree_iter_node_replace(iter, n);
+ bch2_trans_node_add(trans, n);
- bch2_btree_trans_verify_iters(trans, n);
+ bch2_trans_verify_paths(trans);
- bch2_btree_node_free_inmem(c, b, iter);
- bch2_btree_node_free_inmem(c, m, iter);
+ bch2_btree_node_free_inmem(trans, b);
+ bch2_btree_node_free_inmem(trans, m);
six_unlock_intent(&n->c.lock);
bch2_btree_update_done(as);
-out:
- bch2_btree_trans_verify_locks(trans);
- bch2_trans_iter_free(trans, sib_iter);
- /*
- * Don't downgrade locks here: we're called after successful insert,
- * and the caller will downgrade locks after a successful insert
- * anyways (in case e.g. a split was required first)
- *
- * And we're also called when inserting into interior nodes in the
- * split path, and downgrading to read locks in there is potentially
- * confusing:
- */
- return ret ?: ret2;
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
+out:
err:
- bch2_trans_iter_put(trans, sib_iter);
- sib_iter = NULL;
-
- if (ret == -EINTR && bch2_trans_relock(trans))
- goto retry;
-
- goto out;
+ bch2_path_put(trans, sib_path, true);
+ bch2_trans_verify_locks(trans);
+ return ret;
}
/**
*/
int bch2_btree_node_rewrite(struct btree_trans *trans,
struct btree_iter *iter,
- __le64 seq, unsigned flags)
+ struct btree *b,
+ unsigned flags)
{
struct bch_fs *c = trans->c;
- struct btree *b, *n, *parent;
+ struct btree *n, *parent;
struct btree_update *as;
int ret;
flags |= BTREE_INSERT_NOFAIL;
-retry:
- ret = bch2_btree_iter_traverse(iter);
- if (ret)
- goto out;
- b = bch2_btree_iter_peek_node(iter);
- if (!b || b->data->keys.seq != seq)
- goto out;
-
- parent = btree_node_parent(iter, b);
- as = bch2_btree_update_start(iter, b->c.level,
+ parent = btree_node_parent(iter->path, b);
+ as = bch2_btree_update_start(trans, iter->path, b->c.level,
(parent
? btree_update_reserve_required(c, parent)
: 0) + 1,
flags);
ret = PTR_ERR_OR_ZERO(as);
- if (ret == -EINTR)
- goto retry;
if (ret) {
trace_btree_gc_rewrite_node_fail(c, b);
goto out;
if (parent) {
bch2_keylist_add(&as->parent_keys, &n->key);
- bch2_btree_insert_node(as, trans, iter, parent,
+ bch2_btree_insert_node(as, trans, iter->path, parent,
&as->parent_keys, flags);
} else {
- bch2_btree_set_root(as, n, iter);
+ bch2_btree_set_root(as, trans, iter->path, n);
}
bch2_btree_update_get_open_buckets(as, n);
six_lock_increment(&b->c.lock, SIX_LOCK_intent);
- bch2_btree_iter_node_drop(iter, b);
- bch2_btree_iter_node_replace(iter, n);
- bch2_btree_node_free_inmem(c, b, iter);
+ bch2_trans_node_add(trans, n);
+ bch2_btree_node_free_inmem(trans, b);
six_unlock_intent(&n->c.lock);
bch2_btree_update_done(as);
out:
- bch2_btree_iter_downgrade(iter);
+ bch2_btree_path_downgrade(iter->path);
return ret;
}
__le64 seq;
};
+static int async_btree_node_rewrite_trans(struct btree_trans *trans,
+ struct async_btree_rewrite *a)
+{
+ struct btree_iter iter;
+ struct btree *b;
+ int ret;
+
+ bch2_trans_node_iter_init(trans, &iter, a->btree_id, a->pos,
+ BTREE_MAX_DEPTH, a->level, 0);
+ b = bch2_btree_iter_peek_node(&iter);
+ ret = PTR_ERR_OR_ZERO(b);
+ if (ret)
+ goto out;
+
+ if (!b || b->data->keys.seq != a->seq)
+ goto out;
+
+ ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}
+
void async_btree_node_rewrite_work(struct work_struct *work)
{
struct async_btree_rewrite *a =
container_of(work, struct async_btree_rewrite, work);
struct bch_fs *c = a->c;
- struct btree_trans trans;
- struct btree_iter *iter;
- bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_node_iter(&trans, a->btree_id, a->pos,
- BTREE_MAX_DEPTH, a->level, 0);
- bch2_btree_node_rewrite(&trans, iter, a->seq, 0);
- bch2_trans_iter_put(&trans, iter);
- bch2_trans_exit(&trans);
+ bch2_trans_do(c, NULL, NULL, 0,
+ async_btree_node_rewrite_trans(&trans, a));
percpu_ref_put(&c->writes);
kfree(a);
}
{
struct async_btree_rewrite *a;
- if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags))
- return;
-
if (!percpu_ref_tryget(&c->writes))
return;
bool skip_triggers)
{
struct bch_fs *c = trans->c;
- struct btree_iter *iter2 = NULL;
+ struct btree_iter iter2 = { NULL };
struct btree *parent;
u64 journal_entries[BKEY_BTREE_PTR_U64s_MAX];
int ret;
BUG_ON(ret);
}
- parent = btree_node_parent(iter, b);
+ parent = btree_node_parent(iter->path, b);
if (parent) {
- iter2 = bch2_trans_copy_iter(trans, iter);
+ bch2_trans_copy_iter(&iter2, iter);
- BUG_ON(iter2->level != b->c.level);
- BUG_ON(bpos_cmp(iter2->pos, new_key->k.p));
+ iter2.path = bch2_btree_path_make_mut(trans, iter2.path,
+ iter2.flags & BTREE_ITER_INTENT,
+ _THIS_IP_);
- btree_node_unlock(iter2, iter2->level);
- iter2->l[iter2->level].b = BTREE_ITER_NO_NODE_UP;
- iter2->level++;
+ BUG_ON(iter2.path->level != b->c.level);
+ BUG_ON(bpos_cmp(iter2.path->pos, new_key->k.p));
- ret = bch2_btree_iter_traverse(iter2) ?:
- bch2_trans_update(trans, iter2, new_key, BTREE_TRIGGER_NORUN);
+ btree_node_unlock(iter2.path, iter2.path->level);
+ path_l(iter2.path)->b = BTREE_ITER_NO_NODE_UP;
+ iter2.path->level++;
+
+ ret = bch2_btree_iter_traverse(&iter2) ?:
+ bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN);
if (ret)
goto err;
} else {
ret = bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_JOURNAL_RECLAIM|
BTREE_INSERT_JOURNAL_RESERVED);
if (ret)
goto err;
- bch2_btree_node_lock_write(b, iter);
+ bch2_btree_node_lock_write(trans, iter->path, b);
if (new_hash) {
mutex_lock(&c->btree_cache.lock);
bkey_copy(&b->key, new_key);
}
- bch2_btree_node_unlock_write(b, iter);
+ bch2_btree_node_unlock_write(trans, iter->path, b);
out:
- bch2_trans_iter_put(trans, iter2);
+ bch2_trans_iter_exit(trans, &iter2);
return ret;
err:
if (new_hash) {
{
struct bch_fs *c = trans->c;
struct btree *new_hash = NULL;
+ struct btree_path *path = iter->path;
struct closure cl;
int ret = 0;
+ if (!btree_node_intent_locked(path, b->c.level) &&
+ !bch2_btree_path_upgrade(trans, path, b->c.level + 1)) {
+ btree_trans_restart(trans);
+ return -EINTR;
+ }
+
closure_init_stack(&cl);
/*
new_hash = bch2_btree_node_mem_alloc(c);
}
+ path->intent_ref++;
ret = __bch2_btree_node_update_key(trans, iter, b, new_hash,
new_key, skip_triggers);
+ --path->intent_ref;
if (new_hash) {
mutex_lock(&c->btree_cache.lock);
struct btree *b, struct bkey_i *new_key,
bool skip_triggers)
{
- struct btree_iter *iter;
+ struct btree_iter iter;
int ret;
- iter = bch2_trans_get_node_iter(trans, b->c.btree_id, b->key.k.p,
- BTREE_MAX_DEPTH, b->c.level,
- BTREE_ITER_INTENT);
- ret = bch2_btree_iter_traverse(iter);
+ bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p,
+ BTREE_MAX_DEPTH, b->c.level,
+ BTREE_ITER_INTENT);
+ ret = bch2_btree_iter_traverse(&iter);
if (ret)
goto out;
/* has node been freed? */
- if (iter->l[b->c.level].b != b) {
+ if (iter.path->l[b->c.level].b != b) {
/* node has been freed: */
BUG_ON(!btree_node_dying(b));
goto out;
BUG_ON(!btree_node_hashed(b));
- ret = bch2_btree_node_update_key(trans, iter, b, new_key, skip_triggers);
+ ret = bch2_btree_node_update_key(trans, &iter, b, new_key, skip_triggers);
out:
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
struct btree_update {
struct closure cl;
struct bch_fs *c;
+ u64 start_time;
struct list_head list;
struct list_head unwritten_list;
/* Nodes being freed: */
struct keylist old_keys;
u64 _old_keys[BTREE_UPDATE_NODES_MAX *
- BKEY_BTREE_PTR_VAL_U64s_MAX];
+ BKEY_BTREE_PTR_U64s_MAX];
/* Nodes being added: */
struct keylist new_keys;
u64 _new_keys[BTREE_UPDATE_NODES_MAX *
- BKEY_BTREE_PTR_VAL_U64s_MAX];
+ BKEY_BTREE_PTR_U64s_MAX];
/* New nodes, that will be made reachable by this update: */
struct btree *new_nodes[BTREE_UPDATE_NODES_MAX];
u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
};
-void bch2_btree_node_free_inmem(struct bch_fs *, struct btree *,
- struct btree_iter *);
-void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *);
-
-void bch2_btree_update_get_open_buckets(struct btree_update *, struct btree *);
-
struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
struct btree *,
struct bkey_format);
-void bch2_btree_update_done(struct btree_update *);
-struct btree_update *
-bch2_btree_update_start(struct btree_iter *, unsigned, unsigned, unsigned);
-
-void bch2_btree_interior_update_will_free_node(struct btree_update *,
- struct btree *);
-void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
-
-int bch2_btree_split_leaf(struct btree_trans *, struct btree_iter *, unsigned);
+int bch2_btree_split_leaf(struct btree_trans *, struct btree_path *, unsigned);
-int __bch2_foreground_maybe_merge(struct btree_trans *, struct btree_iter *,
+int __bch2_foreground_maybe_merge(struct btree_trans *, struct btree_path *,
unsigned, unsigned, enum btree_node_sibling);
static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans,
- struct btree_iter *iter,
+ struct btree_path *path,
unsigned level, unsigned flags,
enum btree_node_sibling sib)
{
struct btree *b;
- if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE)
- return 0;
-
- if (!bch2_btree_node_relock(iter, level))
- return 0;
+ EBUG_ON(!btree_node_locked(path, level));
- b = iter->l[level].b;
+ b = path->l[level].b;
if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold)
return 0;
- return __bch2_foreground_maybe_merge(trans, iter, level, flags, sib);
+ return __bch2_foreground_maybe_merge(trans, path, level, flags, sib);
}
static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
- struct btree_iter *iter,
+ struct btree_path *path,
unsigned level,
unsigned flags)
{
- return bch2_foreground_maybe_merge_sibling(trans, iter, level, flags,
+ return bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
btree_prev_sib) ?:
- bch2_foreground_maybe_merge_sibling(trans, iter, level, flags,
+ bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
btree_next_sib);
}
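
/*
 * i.e. try merging with the previous sibling first; only if that returns
 * 0 (the ?: short-circuits on nonzero) do we also try the next sibling.
 */
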
{
ssize_t used = bset_byte_offset(b, end) / sizeof(u64) +
b->whiteout_u64s;
- ssize_t total = c->opts.btree_node_size << 6;
+ ssize_t total = c->opts.btree_node_size >> 3;
/* Always leave one extra u64 for bch2_varint_decode: */
used++;
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
+#include "recovery.h"
+#include "subvolume.h"
#include "replicas.h"
#include <linux/prefetch.h>
#include <linux/sort.h>
#include <trace/events/bcachefs.h>
+static int __must_check
+bch2_trans_update_by_path(struct btree_trans *, struct btree_path *,
+ struct bkey_i *, enum btree_update_flags);
+
static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
const struct btree_insert_entry *r)
{
bpos_cmp(l->k->k.p, r->k->k.p);
}
+static inline struct btree_path_level *insert_l(struct btree_insert_entry *i)
+{
+ return i->path->l + i->level;
+}
+
static inline bool same_leaf_as_prev(struct btree_trans *trans,
struct btree_insert_entry *i)
{
return i != trans->updates &&
- iter_l(i[0].iter)->b == iter_l(i[-1].iter)->b;
+ insert_l(&i[0])->b == insert_l(&i[-1])->b;
}
-inline void bch2_btree_node_lock_for_insert(struct btree_trans *trans,
- struct btree_iter *iter,
- struct btree *b)
+static inline bool same_leaf_as_next(struct btree_trans *trans,
+ struct btree_insert_entry *i)
{
- struct bch_fs *c = trans->c;
+ return i + 1 < trans->updates + trans->nr_updates &&
+ insert_l(&i[0])->b == insert_l(&i[1])->b;
+}
- bch2_btree_node_lock_write(b, iter);
+static inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b)
+{
+ struct bch_fs *c = trans->c;
- if (btree_iter_type(iter) == BTREE_ITER_CACHED)
+ if (path->cached)
return;
if (unlikely(btree_node_just_written(b)) &&
bch2_btree_post_write_cleanup(c, b))
- bch2_btree_iter_reinit_node(iter, b);
+ bch2_trans_node_reinit_iter(trans, b);
/*
* If the last bset has been written, or if it's gotten too big - start
* a new bset to insert into:
*/
if (want_new_bset(c, b))
- bch2_btree_init_next(trans, iter, b);
+ bch2_btree_init_next(trans, b);
+}
+
+void bch2_btree_node_lock_for_insert(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b)
+{
+ bch2_btree_node_lock_write(trans, path, b);
+ bch2_btree_node_prep_for_write(trans, path, b);
}
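+
+/*
+ * Note that locking and bset preparation are separate steps: the commit
+ * path's trans_lock_write() (below) takes the write locks itself, in
+ * sorted path order, and then calls bch2_btree_node_prep_for_write() on
+ * each node.
+ */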
/* Inserting into a given leaf node (last stage of insert): */
/* Handle overwrites and do insert, for non extents: */
-bool bch2_btree_bset_insert_key(struct btree_iter *iter,
+bool bch2_btree_bset_insert_key(struct btree_trans *trans,
+ struct btree_path *path,
struct btree *b,
struct btree_node_iter *node_iter,
struct bkey_i *insert)
EBUG_ON(bpos_cmp(insert->k.p, b->data->min_key) < 0);
EBUG_ON(bpos_cmp(insert->k.p, b->data->max_key) > 0);
EBUG_ON(insert->k.u64s >
- bch_btree_keys_u64s_remaining(iter->trans->c, b));
- EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
+ bch_btree_keys_u64s_remaining(trans->c, b));
k = bch2_btree_node_iter_peek_all(node_iter, b);
if (k && bkey_cmp_left_packed(b, k, &insert->k.p))
k->type = KEY_TYPE_deleted;
if (k->needs_whiteout)
- push_whiteout(iter->trans->c, b, insert->k.p);
+ push_whiteout(trans->c, b, insert->k.p);
k->needs_whiteout = false;
if (k >= btree_bset_last(b)->start) {
bch2_bset_delete(b, k, clobber_u64s);
goto fix_iter;
} else {
- bch2_btree_iter_fix_key_modified(iter, b, k);
+ bch2_btree_path_fix_key_modified(trans, b, k);
}
return true;
clobber_u64s = k->u64s;
goto overwrite;
} else {
- bch2_btree_iter_fix_key_modified(iter, b, k);
+ bch2_btree_path_fix_key_modified(trans, b, k);
}
}
new_u64s = k->u64s;
fix_iter:
if (clobber_u64s != new_u64s)
- bch2_btree_node_iter_fix(iter, b, node_iter, k,
+ bch2_btree_node_iter_fix(trans, path, b, node_iter, k,
clobber_u64s, new_u64s);
return true;
}
struct btree_write *w = container_of(pin, struct btree_write, journal);
struct btree *b = container_of(w, struct btree, writes[i]);
- btree_node_lock_type(c, b, SIX_LOCK_read);
+ six_lock_read(&b->c.lock, NULL, NULL);
bch2_btree_node_write_cond(c, b,
(btree_current_write(b) == w && w->journal.seq == seq));
six_unlock_read(&b->c.lock);
* btree_insert_key - insert one key into a leaf node
*/
static bool btree_insert_key_leaf(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i *insert)
+ struct btree_insert_entry *insert)
{
struct bch_fs *c = trans->c;
- struct btree *b = iter_l(iter)->b;
+ struct btree *b = insert_l(insert)->b;
struct bset_tree *t = bset_tree_last(b);
struct bset *i = bset(b, t);
int old_u64s = bset_u64s(t);
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
- EBUG_ON(!iter->level &&
- !test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags));
-
- if (unlikely(!bch2_btree_bset_insert_key(iter, b,
- &iter_l(iter)->iter, insert)))
+ if (unlikely(!bch2_btree_bset_insert_key(trans, insert->path, b,
+ &insert_l(insert)->iter, insert->k)))
return false;
i->journal_seq = cpu_to_le64(max(trans->journal_res.seq,
if (u64s_added > live_u64s_added &&
bch2_maybe_compact_whiteouts(c, b))
- bch2_btree_iter_reinit_node(iter, b);
+ bch2_trans_node_reinit_iter(trans, b);
- trace_btree_insert_key(c, b, insert);
return true;
}
static inline void btree_insert_entry_checks(struct btree_trans *trans,
struct btree_insert_entry *i)
{
- BUG_ON(bpos_cmp(i->k->k.p, i->iter->real_pos));
- BUG_ON(i->level != i->iter->level);
- BUG_ON(i->btree_id != i->iter->btree_id);
+ BUG_ON(bpos_cmp(i->k->k.p, i->path->pos));
+ BUG_ON(i->cached != i->path->cached);
+ BUG_ON(i->level != i->path->level);
+ BUG_ON(i->btree_id != i->path->btree_id);
+ EBUG_ON(!i->level &&
+ !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) &&
+ test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) &&
+ i->k->k.p.snapshot &&
+ bch2_snapshot_internal_node(trans->c, i->k->k.p.snapshot));
}
static noinline int
return ret;
if (!bch2_trans_relock(trans)) {
- trace_trans_restart_journal_preres_get(trans->ip, trace_ip);
+ trace_trans_restart_journal_preres_get(trans->fn, trace_ip);
return -EINTR;
}
return ret == -EAGAIN ? BTREE_INSERT_NEED_JOURNAL_RES : ret;
}
-static enum btree_insert_ret
+#define JSET_ENTRY_LOG_U64s 4
+
+static noinline void journal_transaction_name(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+ struct jset_entry *entry = journal_res_entry(&c->journal, &trans->journal_res);
+ struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);
+ unsigned u64s = JSET_ENTRY_LOG_U64s - 1;
+ unsigned b, buflen = u64s * sizeof(u64);
+
+ l->entry.u64s = cpu_to_le16(u64s);
+ l->entry.btree_id = 0;
+ l->entry.level = 0;
+ l->entry.type = BCH_JSET_ENTRY_log;
+ l->entry.pad[0] = 0;
+ l->entry.pad[1] = 0;
+ l->entry.pad[2] = 0;
+ b = min_t(unsigned, strlen(trans->fn), buflen);
+ memcpy(l->d, trans->fn, b);
+ while (b < buflen)
+ l->d[b++] = '\0';
+
+ trans->journal_res.offset += JSET_ENTRY_LOG_U64s;
+ trans->journal_res.u64s -= JSET_ENTRY_LOG_U64s;
+}
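+
+/*
+ * Worked example: with JSET_ENTRY_LOG_U64s = 4, one u64 is consumed by
+ * the jset_entry header, leaving u64s = 3 and buflen = 24 bytes for the
+ * name; trans->fn is truncated to fit, or zero-padded up to buflen.
+ */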
+
+static inline enum btree_insert_ret
btree_key_can_insert(struct btree_trans *trans,
- struct btree_iter *iter,
+ struct btree *b,
unsigned u64s)
{
struct bch_fs *c = trans->c;
- struct btree *b = iter_l(iter)->b;
if (!bch2_btree_node_insert_fits(c, b, u64s))
return BTREE_INSERT_BTREE_NODE_FULL;
static enum btree_insert_ret
btree_key_can_insert_cached(struct btree_trans *trans,
- struct btree_iter *iter,
+ struct btree_path *path,
unsigned u64s)
{
- struct bkey_cached *ck = (void *) iter->l[0].b;
+ struct bch_fs *c = trans->c;
+ struct bkey_cached *ck = (void *) path->l[0].b;
unsigned new_u64s;
struct bkey_i *new_k;
- BUG_ON(iter->level);
+ EBUG_ON(path->level);
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
- bch2_btree_key_cache_must_wait(trans->c) &&
+ bch2_btree_key_cache_must_wait(c) &&
!(trans->flags & BTREE_INSERT_JOURNAL_RECLAIM))
return BTREE_INSERT_NEED_JOURNAL_RECLAIM;
new_u64s = roundup_pow_of_two(u64s);
new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS);
- if (!new_k)
+ if (!new_k) {
+ bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
+ bch2_btree_ids[path->btree_id], new_u64s);
return -ENOMEM;
+ }
ck->u64s = new_u64s;
ck->k = new_k;
i->k->k.needs_whiteout = false;
- did_work = (btree_iter_type(i->iter) != BTREE_ITER_CACHED)
- ? btree_insert_key_leaf(trans, i->iter, i->k)
- : bch2_btree_insert_key_cached(trans, i->iter, i->k);
+ did_work = !i->cached
+ ? btree_insert_key_leaf(trans, i)
+ : bch2_btree_insert_key_cached(trans, i->path, i->k);
if (!did_work)
return;
i->level,
i->k);
- bch2_journal_set_has_inode(j, &trans->journal_res,
- i->k->k.p.inode);
-
if (trans->journal_seq)
*trans->journal_seq = trans->journal_res.seq;
}
}
-static noinline void bch2_trans_mark_gc(struct btree_trans *trans)
+static noinline int bch2_trans_mark_gc(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
+ int ret = 0;
trans_for_each_update(trans, i) {
/*
* XXX: synchronization of cached update triggers with gc
+ * XXX: synchronization of interior node updates with gc
*/
- BUG_ON(btree_iter_type(i->iter) == BTREE_ITER_CACHED);
+ BUG_ON(i->cached || i->level);
- if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b)))
- bch2_mark_update(trans, i->iter, i->k,
- i->flags|BTREE_TRIGGER_GC);
+ if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) {
+ ret = bch2_mark_update(trans, i->path, i->k,
+ i->flags|BTREE_TRIGGER_GC);
+ if (ret)
+ break;
+ }
}
+
+ return ret;
}
static inline int
int ret;
if (race_fault()) {
- trace_trans_restart_fault_inject(trans->ip, trace_ip);
+ trace_trans_restart_fault_inject(trans->fn, trace_ip);
trans->restarted = true;
return -EINTR;
}
u64s = 0;
u64s += i->k->k.u64s;
- ret = btree_iter_type(i->iter) != BTREE_ITER_CACHED
- ? btree_key_can_insert(trans, i->iter, u64s)
- : btree_key_can_insert_cached(trans, i->iter, u64s);
+ ret = !i->cached
+ ? btree_key_can_insert(trans, insert_l(i)->b, u64s)
+ : btree_key_can_insert_cached(trans, i->path, u64s);
if (ret) {
*stopped_at = i;
return ret;
marking = true;
}
- if (marking) {
- percpu_down_read(&c->mark_lock);
- }
-
- /* Must be called under mark_lock: */
- if (marking && trans->fs_usage_deltas &&
- !bch2_replicas_delta_list_marked(c, trans->fs_usage_deltas)) {
- ret = BTREE_INSERT_NEED_MARK_REPLICAS;
- goto err;
- }
-
/*
* Don't get journal reservation until after we know insert will
* succeed:
ret = bch2_trans_journal_res_get(trans,
JOURNAL_RES_GET_NONBLOCK);
if (ret)
- goto err;
+ return ret;
+
+ if (unlikely(trans->journal_transaction_names))
+ journal_transaction_name(trans);
} else {
trans->journal_res.seq = c->journal.replay_journal_seq;
}
i->k->k.version = MAX_VERSION;
}
- trans_for_each_update(trans, i)
- if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type))
- bch2_mark_update(trans, i->iter, i->k,
- i->flags);
+ if (trans->fs_usage_deltas &&
+ bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas))
+ return BTREE_INSERT_NEED_MARK_REPLICAS;
- if (marking && trans->fs_usage_deltas)
- bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas);
+ trans_for_each_update(trans, i)
+ if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) {
+ ret = bch2_mark_update(trans, i->path, i->k, i->flags);
+ if (ret)
+ return ret;
+ }
- if (unlikely(c->gc_pos.phase))
- bch2_trans_mark_gc(trans);
+ if (unlikely(c->gc_pos.phase)) {
+ ret = bch2_trans_mark_gc(trans);
+ if (ret)
+ return ret;
+ }
trans_for_each_update(trans, i)
do_btree_insert_one(trans, i);
-err:
- if (marking) {
- percpu_up_read(&c->mark_lock);
- }
return ret;
}
-static noinline int maybe_do_btree_merge(struct btree_trans *trans, struct btree_iter *iter)
+static inline void path_upgrade_readers(struct btree_trans *trans, struct btree_path *path)
{
- struct btree_insert_entry *i;
- struct btree *b = iter_l(iter)->b;
- struct bkey_s_c old;
- int u64s_delta = 0;
- int ret;
+ unsigned l;
- /*
- * Inserting directly into interior nodes is an uncommon operation with
- * various weird edge cases: also, a lot of things about
- * BTREE_ITER_NODES iters need to be audited
- */
- if (unlikely(btree_iter_type(iter) != BTREE_ITER_KEYS))
- return 0;
+ for (l = 0; l < BTREE_MAX_DEPTH; l++)
+ if (btree_node_read_locked(path, l))
+ BUG_ON(!bch2_btree_node_upgrade(trans, path, l));
+}
+
+static inline void upgrade_readers(struct btree_trans *trans, struct btree_path *path)
+{
+ struct btree *b = path_l(path)->b;
+
+ do {
+ if (path->nodes_locked &&
+ path->nodes_locked != path->nodes_intent_locked)
+ path_upgrade_readers(trans, path);
+ } while ((path = prev_btree_path(trans, path)) &&
+ path_l(path)->b == b);
+}
+
+/*
+ * Check for nodes that we have both read and intent locks on, and upgrade the
+ * readers to intent:
+ */
+static inline void normalize_read_intent_locks(struct btree_trans *trans)
+{
+ struct btree_path *path;
+ unsigned i, nr_read = 0, nr_intent = 0;
+
+ trans_for_each_path_inorder(trans, path, i) {
+ struct btree_path *next = i + 1 < trans->nr_sorted
+ ? trans->paths + trans->sorted[i + 1]
+ : NULL;
+
+ if (path->nodes_locked) {
+ if (path->nodes_intent_locked)
+ nr_intent++;
+ else
+ nr_read++;
+ }
+
+ if (!next || path_l(path)->b != path_l(next)->b) {
+ if (nr_read && nr_intent)
+ upgrade_readers(trans, path);
+
+ nr_read = nr_intent = 0;
+ }
+ }
+
+ bch2_trans_verify_locks(trans);
+}
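+
+/*
+ * Example: two of our paths point into the same leaf, one holding a read
+ * lock and one an intent lock. Taking the write lock later would block
+ * on our own read lock, so the reader is upgraded to intent first.
+ */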
+
+static inline bool have_conflicting_read_lock(struct btree_trans *trans, struct btree_path *pos)
+{
+ struct btree_path *path;
+ unsigned i;
+
+ trans_for_each_path_inorder(trans, path, i) {
+ //if (path == pos)
+ // break;
- BUG_ON(iter->level);
+ if (path->nodes_locked != path->nodes_intent_locked &&
+ !bch2_btree_path_upgrade(trans, path, path->level + 1))
+ return true;
+ }
+
+ return false;
+}
+
+static inline int trans_lock_write(struct btree_trans *trans)
+{
+ struct btree_insert_entry *i;
trans_for_each_update(trans, i) {
- if (iter_l(i->iter)->b != b)
+ if (same_leaf_as_prev(trans, i))
continue;
- old = bch2_btree_iter_peek_slot(i->iter);
- ret = bkey_err(old);
- if (ret)
- return ret;
+ if (!six_trylock_write(&insert_l(i)->b->c.lock)) {
+ if (have_conflicting_read_lock(trans, i->path))
+ goto fail;
- u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0;
- u64s_delta -= !bkey_deleted(old.k) ? old.k->u64s : 0;
+ btree_node_lock_type(trans, i->path,
+ insert_l(i)->b,
+ i->path->pos, i->level,
+ SIX_LOCK_write, NULL, NULL);
+ }
+
+ bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
}
- if (u64s_delta > 0)
- return 0;
+ return 0;
+fail:
+ while (--i >= trans->updates) {
+ if (same_leaf_as_prev(trans, i))
+ continue;
- return bch2_foreground_maybe_merge(trans, iter,
- iter->level, trans->flags);
+ bch2_btree_node_unlock_write_inlined(trans, i->path, insert_l(i)->b);
+ }
+
+ trace_trans_restart_would_deadlock_write(trans->fn);
+ return btree_trans_restart(trans);
+}
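+
+/*
+ * The trylock-then-check pattern above avoids self-deadlock: if
+ * six_trylock_write() fails while we also hold read locks that can't be
+ * upgraded, blocking on the write lock could mean waiting on ourselves -
+ * so we restart the transaction instead of blocking.
+ */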
+
+static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
+{
+ struct btree_insert_entry *i;
+
+ trans_for_each_update(trans, i)
+ bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p);
}
/*
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
- struct btree_iter *iter;
- int ret;
+ struct bkey_s_c old;
+ int ret, u64s_delta = 0;
trans_for_each_update(trans, i) {
- struct btree *b;
+ const char *invalid = bch2_bkey_invalid(c,
+ bkey_i_to_s_c(i->k), i->bkey_type);
+ if (invalid) {
+ char buf[200];
+
+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
+ bch2_fs_fatal_error(c, "invalid bkey %s on insert from %s -> %ps: %s\n",
+ buf, trans->fn, (void *) i->ip_allocated, invalid);
+ return -EINVAL;
+ }
+ btree_insert_entry_checks(trans, i);
+ }
- BUG_ON(!btree_node_intent_locked(i->iter, i->level));
+ trans_for_each_update(trans, i) {
+ struct bkey u;
- if (btree_iter_type(i->iter) == BTREE_ITER_CACHED)
+ /*
+ * peek_slot() doesn't yet work on iterators that point to
+ * interior nodes:
+ */
+ if (i->cached || i->level)
continue;
- b = iter_l(i->iter)->b;
- if (b->sib_u64s[0] < c->btree_foreground_merge_threshold ||
- b->sib_u64s[1] < c->btree_foreground_merge_threshold) {
- ret = maybe_do_btree_merge(trans, i->iter);
- if (unlikely(ret))
- return ret;
+ old = bch2_btree_path_peek_slot(i->path, &u);
+ ret = bkey_err(old);
+ if (unlikely(ret))
+ return ret;
+
+ u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0;
+ u64s_delta -= !bkey_deleted(old.k) ? old.k->u64s : 0;
+
+ if (!same_leaf_as_next(trans, i)) {
+ if (u64s_delta <= 0) {
+ ret = bch2_foreground_maybe_merge(trans, i->path,
+ i->level, trans->flags);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ u64s_delta = 0;
}
}
- trans_for_each_update(trans, i)
- BUG_ON(!btree_node_intent_locked(i->iter, i->level));
-
ret = bch2_journal_preres_get(&c->journal,
&trans->journal_preres, trans->journal_preres_u64s,
JOURNAL_RES_GET_NONBLOCK|
if (unlikely(ret))
return ret;
- /*
- * Can't be holding any read locks when we go to take write locks:
- * another thread could be holding an intent lock on the same node we
- * have a read lock on, and it'll block trying to take a write lock
- * (because we hold a read lock) and it could be blocking us by holding
- * its own read lock (while we're trying to to take write locks).
- *
- * note - this must be done after bch2_trans_journal_preres_get_cold()
- * or anything else that might call bch2_trans_relock(), since that
- * would just retake the read locks:
- */
- trans_for_each_iter(trans, iter)
- if (iter->nodes_locked != iter->nodes_intent_locked &&
- !bch2_btree_iter_upgrade(iter, 1)) {
- trace_trans_restart_upgrade(trans->ip, trace_ip,
- iter->btree_id,
- &iter->real_pos);
- trans->restarted = true;
- return -EINTR;
- }
+ normalize_read_intent_locks(trans);
- trans_for_each_update(trans, i) {
- const char *invalid = bch2_bkey_invalid(c,
- bkey_i_to_s_c(i->k), i->bkey_type);
- if (invalid) {
- char buf[200];
-
- bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
- bch_err(c, "invalid bkey %s on insert: %s\n", buf, invalid);
- bch2_fatal_error(c);
- }
- btree_insert_entry_checks(trans, i);
- }
- bch2_btree_trans_verify_locks(trans);
-
- trans_for_each_update(trans, i)
- if (!same_leaf_as_prev(trans, i))
- bch2_btree_node_lock_for_insert(trans, i->iter,
- iter_l(i->iter)->b);
+ ret = trans_lock_write(trans);
+ if (unlikely(ret))
+ return ret;
ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip);
+ if (!ret && unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)))
+ bch2_drop_overwrites_from_journal(trans);
+
trans_for_each_update(trans, i)
if (!same_leaf_as_prev(trans, i))
- bch2_btree_node_unlock_write_inlined(iter_l(i->iter)->b,
- i->iter);
+ bch2_btree_node_unlock_write_inlined(trans, i->path,
+ insert_l(i)->b);
if (!ret && trans->journal_pin)
bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
switch (ret) {
case BTREE_INSERT_BTREE_NODE_FULL:
- ret = bch2_btree_split_leaf(trans, i->iter, trans->flags);
+ ret = bch2_btree_split_leaf(trans, i->path, trans->flags);
if (!ret)
return 0;
if (ret == -EINTR)
- trace_trans_restart_btree_node_split(trans->ip, trace_ip,
- i->iter->btree_id,
- &i->iter->real_pos);
+ trace_trans_restart_btree_node_split(trans->fn, trace_ip,
+ i->btree_id, &i->path->pos);
break;
case BTREE_INSERT_NEED_MARK_REPLICAS:
bch2_trans_unlock(trans);
if (bch2_trans_relock(trans))
return 0;
- trace_trans_restart_mark_replicas(trans->ip, trace_ip);
+ trace_trans_restart_mark_replicas(trans->fn, trace_ip);
ret = -EINTR;
break;
case BTREE_INSERT_NEED_JOURNAL_RES:
if (bch2_trans_relock(trans))
return 0;
- trace_trans_restart_journal_res_get(trans->ip, trace_ip);
+ trace_trans_restart_journal_res_get(trans->fn, trace_ip);
ret = -EINTR;
break;
case BTREE_INSERT_NEED_JOURNAL_RECLAIM:
bch2_trans_unlock(trans);
- trace_trans_blocked_journal_reclaim(trans->ip, trace_ip);
+ trace_trans_blocked_journal_reclaim(trans->fn, trace_ip);
wait_event_freezable(c->journal.reclaim_wait,
(ret = journal_reclaim_wait_done(c)));
if (bch2_trans_relock(trans))
return 0;
- trace_trans_restart_journal_reclaim(trans->ip, trace_ip);
+ trace_trans_restart_journal_reclaim(trans->fn, trace_ip);
ret = -EINTR;
break;
default:
}
BUG_ON((ret == -EINTR || ret == -EAGAIN) && !trans->restarted);
- BUG_ON(ret == -ENOSPC && (trans->flags & BTREE_INSERT_NOFAIL));
+ BUG_ON(ret == -ENOSPC &&
+ !(trans->flags & BTREE_INSERT_NOWAIT) &&
+ (trans->flags & BTREE_INSERT_NOFAIL));
return ret;
}
struct bch_fs *c = trans->c;
int ret;
- if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW)))
+ if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW)) ||
+ test_bit(BCH_FS_STARTED, &c->flags))
return -EROFS;
bch2_trans_unlock(trans);
if (ret)
return ret;
+ if (!bch2_trans_relock(trans))
+ return -EINTR;
+
percpu_ref_get(&c->writes);
return 0;
}
-static int extent_handle_overwrites(struct btree_trans *trans,
- struct btree_insert_entry *i)
+static int run_one_trigger(struct btree_trans *trans, struct btree_insert_entry *i,
+ bool overwrite)
{
- struct bch_fs *c = trans->c;
- struct btree_iter *iter, *update_iter;
- struct bpos start = bkey_start_pos(&i->k->k);
- struct bkey_i *update;
- struct bkey_s_c k;
- int ret = 0, compressed_sectors;
-
- iter = bch2_trans_get_iter(trans, i->btree_id, start,
- BTREE_ITER_INTENT|
- BTREE_ITER_WITH_UPDATES|
- BTREE_ITER_NOT_EXTENTS);
- k = bch2_btree_iter_peek(iter);
- if (!k.k || (ret = bkey_err(k)))
- goto out;
-
- if (bch2_bkey_maybe_mergable(k.k, &i->k->k)) {
- update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
- if ((ret = PTR_ERR_OR_ZERO(update)))
- goto out;
+ struct bkey _deleted = KEY(0, 0, 0);
+ struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL };
+ struct bkey_s_c old;
+ struct bkey unpacked;
+ int ret = 0;
- bkey_reassemble(update, k);
+ if ((i->flags & BTREE_TRIGGER_NORUN) ||
+ !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
+ return 0;
- if (bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(i->k))) {
- update_iter = bch2_trans_copy_iter(trans, iter);
- ret = bch2_btree_delete_at(trans, update_iter, i->flags);
- bch2_trans_iter_put(trans, update_iter);
+ if (!overwrite) {
+ if (i->insert_trigger_run)
+ return 0;
- if (ret)
- goto out;
+ BUG_ON(i->overwrite_trigger_run);
+ i->insert_trigger_run = true;
+ } else {
+ if (i->overwrite_trigger_run)
+ return 0;
- i->k = update;
- goto next;
- }
+ BUG_ON(!i->insert_trigger_run);
+ i->overwrite_trigger_run = true;
}
- if (!bkey_cmp(k.k->p, bkey_start_pos(&i->k->k)))
- goto next;
-
- while (bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) > 0) {
- /*
- * If we're going to be splitting a compressed extent, note it
- * so that __bch2_trans_commit() can increase our disk
- * reservation:
- */
- if (bkey_cmp(bkey_start_pos(k.k), start) < 0 &&
- bkey_cmp(k.k->p, i->k->k.p) > 0 &&
- (compressed_sectors = bch2_bkey_sectors_compressed(k)))
- trans->extra_journal_res += compressed_sectors;
+ old = bch2_btree_path_peek_slot(i->path, &unpacked);
+ _deleted.p = i->path->pos;
+
+ if (overwrite) {
+ ret = bch2_trans_mark_key(trans, old, deleted,
+ BTREE_TRIGGER_OVERWRITE|i->flags);
+ } else if (old.k->type == i->k->k.type &&
+ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
+ i->overwrite_trigger_run = true;
+ ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(i->k),
+ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags);
+ } else {
+ ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(i->k),
+ BTREE_TRIGGER_INSERT|i->flags);
+ }
- if (bkey_cmp(bkey_start_pos(k.k), start) < 0) {
- update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
- if ((ret = PTR_ERR_OR_ZERO(update)))
- goto out;
+ if (ret == -EINTR)
+ trace_trans_restart_mark(trans->fn, _RET_IP_,
+ i->btree_id, &i->path->pos);
+ return ret ?: 1;
+}
- bkey_reassemble(update, k);
+static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
+ struct btree_insert_entry *btree_id_start)
+{
+ struct btree_insert_entry *i;
+ bool trans_trigger_run;
+ int ret, overwrite;
- bch2_cut_back(start, update);
+ for (overwrite = 0; overwrite < 2; overwrite++) {
- update_iter = bch2_trans_get_iter(trans, i->btree_id, update->k.p,
- BTREE_ITER_NOT_EXTENTS|
- BTREE_ITER_INTENT);
- ret = bch2_btree_iter_traverse(update_iter);
- if (ret) {
- bch2_trans_iter_put(trans, update_iter);
- goto out;
+ /*
+ * Running triggers will append more updates to the list of updates as
+ * we're walking it:
+ */
+ do {
+ trans_trigger_run = false;
+
+ for (i = btree_id_start;
+ i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
+ i++) {
+ ret = run_one_trigger(trans, i, overwrite);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ trans_trigger_run = true;
}
+ } while (trans_trigger_run);
+ }
- bch2_trans_update(trans, update_iter, update,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
- i->flags);
- bch2_trans_iter_put(trans, update_iter);
- }
-
- if (bkey_cmp(k.k->p, i->k->k.p) <= 0) {
- update_iter = bch2_trans_copy_iter(trans, iter);
- ret = bch2_btree_delete_at(trans, update_iter,
- i->flags);
- bch2_trans_iter_put(trans, update_iter);
-
- if (ret)
- goto out;
- }
+ return 0;
+}
- if (bkey_cmp(k.k->p, i->k->k.p) > 0) {
- update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
- if ((ret = PTR_ERR_OR_ZERO(update)))
- goto out;
+static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
+{
+ struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates;
+ unsigned btree_id = 0;
+ int ret = 0;
- bkey_reassemble(update, k);
- bch2_cut_front(i->k->k.p, update);
+ /*
+ * For a given btree, this algorithm runs insert triggers before
+ * overwrite triggers: this is so that when extents are being moved
+ * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before
+ * they are re-added.
+ */
+ for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
+ while (btree_id_start < trans->updates + trans->nr_updates &&
+ btree_id_start->btree_id < btree_id)
+ btree_id_start++;
- bch2_trans_update(trans, iter, update, i->flags);
- goto out;
- }
-next:
- k = bch2_btree_iter_next(iter);
- if (!k.k || (ret = bkey_err(k)))
- goto out;
+ ret = run_btree_triggers(trans, btree_id, btree_id_start);
+ if (ret)
+ return ret;
}
- bch2_bkey_merge(c, bkey_i_to_s(i->k), k);
-out:
- bch2_trans_iter_put(trans, iter);
+ trans_for_each_update(trans, i)
+ BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) &&
+ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) &&
+ (!i->insert_trigger_run || !i->overwrite_trigger_run));
- return ret;
+ return 0;
}
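
/*
 * Example of why this ordering matters: FALLOCATE_FL_INSERT_RANGE moves
 * an extent from position A to position B within a single transaction.
 * Running the insert trigger for B before the overwrite trigger for A
 * increments the pointers' reference counts before they are decremented,
 * so they never transiently drop to zero and get freed.
 */
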
int __bch2_trans_commit(struct btree_trans *trans)
{
+ struct bch_fs *c = trans->c;
struct btree_insert_entry *i = NULL;
- struct btree_iter *iter;
- bool trans_trigger_run;
unsigned u64s;
int ret = 0;
goto out_reset;
if (trans->flags & BTREE_INSERT_GC_LOCK_HELD)
- lockdep_assert_held(&trans->c->gc_lock);
+ lockdep_assert_held(&c->gc_lock);
memset(&trans->journal_preres, 0, sizeof(trans->journal_preres));
trans->journal_u64s = trans->extra_journal_entry_u64s;
trans->journal_preres_u64s = 0;
+ trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
+
+ if (trans->journal_transaction_names)
+ trans->journal_u64s += JSET_ENTRY_LOG_U64s;
+
if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) &&
- unlikely(!percpu_ref_tryget(&trans->c->writes))) {
+ unlikely(!percpu_ref_tryget(&c->writes))) {
ret = bch2_trans_commit_get_rw_cold(trans);
if (ret)
goto out_reset;
}
#ifdef CONFIG_BCACHEFS_DEBUG
+ /*
+ * if BTREE_TRIGGER_NORUN is set, it means we're probably being called
+ * from the key cache flush code:
+ */
trans_for_each_update(trans, i)
- if (btree_iter_type(i->iter) != BTREE_ITER_CACHED &&
+ if (!i->cached &&
!(i->flags & BTREE_TRIGGER_NORUN))
bch2_btree_key_cache_verify_clean(trans,
i->btree_id, i->k->k.p);
#endif
- /*
- * Running triggers will append more updates to the list of updates as
- * we're walking it:
- */
- do {
- trans_trigger_run = false;
-
- trans_for_each_update(trans, i) {
- if ((BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) &&
- !i->trans_triggers_run) {
- i->trans_triggers_run = true;
- trans_trigger_run = true;
-
- ret = bch2_trans_mark_update(trans, i->iter,
- i->k, i->flags);
- if (unlikely(ret)) {
- if (ret == -EINTR)
- trace_trans_restart_mark(trans->ip, _RET_IP_,
- i->iter->btree_id,
- &i->iter->pos);
- goto out;
- }
- }
- }
- } while (trans_trigger_run);
+ ret = bch2_trans_commit_run_triggers(trans);
+ if (ret)
+ goto out;
trans_for_each_update(trans, i) {
- BUG_ON(!i->iter->should_be_locked);
+ BUG_ON(!i->path->should_be_locked);
- if (unlikely(!bch2_btree_iter_upgrade(i->iter, i->level + 1))) {
- trace_trans_restart_upgrade(trans->ip, _RET_IP_,
- i->iter->btree_id,
- &i->iter->pos);
- trans->restarted = true;
- ret = -EINTR;
+ if (unlikely(!bch2_btree_path_upgrade(trans, i->path, i->level + 1))) {
+ trace_trans_restart_upgrade(trans->fn, _RET_IP_,
+ i->btree_id, &i->path->pos);
+ ret = btree_trans_restart(trans);
goto out;
}
- BUG_ON(!btree_node_intent_locked(i->iter, i->level));
+ BUG_ON(!btree_node_intent_locked(i->path, i->level));
u64s = jset_u64s(i->k->k.u64s);
- if (btree_iter_type(i->iter) == BTREE_ITER_CACHED &&
+ if (i->cached &&
likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)))
trans->journal_preres_u64s += u64s;
trans->journal_u64s += u64s;
}
if (trans->extra_journal_res) {
- ret = bch2_disk_reservation_add(trans->c, trans->disk_res,
+ ret = bch2_disk_reservation_add(c, trans->disk_res,
trans->extra_journal_res,
(trans->flags & BTREE_INSERT_NOFAIL)
? BCH_DISK_RESERVATION_NOFAIL : 0);
ret = do_bch2_trans_commit(trans, &i, _RET_IP_);
/* make sure we didn't drop or screw up locks: */
- bch2_btree_trans_verify_locks(trans);
+ bch2_trans_verify_locks(trans);
if (ret)
goto err;
-
- trans_for_each_iter(trans, iter)
- if (btree_iter_live(trans, iter) &&
- (iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT))
- bch2_btree_iter_set_pos(iter, iter->pos_after_commit);
out:
- bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres);
+ bch2_journal_preres_put(&c->journal, &trans->journal_preres);
if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
- percpu_ref_put(&trans->c->writes);
+ percpu_ref_put(&c->writes);
out_reset:
+ trans_for_each_update(trans, i)
+ bch2_path_put(trans, i->path, true);
+
trans->extra_journal_res = 0;
trans->nr_updates = 0;
trans->hooks = NULL;
goto retry;
}
-int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_i *k, enum btree_update_flags flags)
+static int check_pos_snapshot_overwritten(struct btree_trans *trans,
+ enum btree_id id,
+ struct bpos pos)
{
- struct btree_insert_entry *i, n = (struct btree_insert_entry) {
- .flags = flags,
- .bkey_type = __btree_node_type(iter->level, iter->btree_id),
- .btree_id = iter->btree_id,
- .level = iter->level,
- .iter = iter,
- .k = k
- };
- bool is_extent = (iter->flags & BTREE_ITER_IS_EXTENTS) != 0;
- int ret = 0;
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
- BUG_ON(trans->nr_updates >= BTREE_ITER_MAX);
- BUG_ON(!iter->should_be_locked);
+ if (!btree_type_has_snapshots(id))
+ return 0;
-#ifdef CONFIG_BCACHEFS_DEBUG
- trans_for_each_update(trans, i)
- BUG_ON(i != trans->updates &&
- btree_insert_entry_cmp(i - 1, i) >= 0);
-#endif
+ if (!snapshot_t(c, pos.snapshot)->children[0])
+ return 0;
- if (is_extent) {
- ret = extent_handle_overwrites(trans, &n);
+ bch2_trans_iter_init(trans, &iter, id, pos,
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_ALL_SNAPSHOTS);
+ while (1) {
+ k = bch2_btree_iter_prev(&iter);
+ ret = bkey_err(k);
if (ret)
- return ret;
+ break;
- iter->pos_after_commit = k->k.p;
- iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT;
+ if (!k.k)
+ break;
- if (bkey_deleted(&n.k->k))
- return 0;
+ if (bkey_cmp(pos, k.k->p))
+ break;
+
+ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
+ ret = 1;
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}
+
+int bch2_trans_update_extent(struct btree_trans *trans,
+ struct btree_iter *orig_iter,
+ struct bkey_i *insert,
+ enum btree_update_flags flags)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter, update_iter;
+ struct bpos start = bkey_start_pos(&insert->k);
+ struct bkey_i *update;
+ struct bkey_s_c k;
+ enum btree_id btree_id = orig_iter->btree_id;
+ int ret = 0, compressed_sectors;
- n.iter = bch2_trans_get_iter(trans, n.btree_id, n.k->k.p,
- BTREE_ITER_INTENT|
- BTREE_ITER_NOT_EXTENTS);
- ret = bch2_btree_iter_traverse(n.iter);
- bch2_trans_iter_put(trans, n.iter);
+ bch2_trans_iter_init(trans, &iter, btree_id, start,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_WITH_UPDATES|
+ BTREE_ITER_NOT_EXTENTS);
+ k = bch2_btree_iter_peek(&iter);
+ if ((ret = bkey_err(k)))
+ goto err;
+ if (!k.k)
+ goto out;
+ if (bch2_bkey_maybe_mergable(k.k, &insert->k)) {
+ /*
+ * We can't merge extents if they belong to interior snapshot
+ * tree nodes, and there's a snapshot in which one extent is
+ * visible and the other is not - i.e. if visibility is
+ * different.
+ *
+ * Instead of checking if visibility of the two extents is
+ * different, for now we just check if either has been
+ * overwritten:
+ */
+ ret = check_pos_snapshot_overwritten(trans, btree_id, insert->k.p);
+ if (ret < 0)
+ goto err;
if (ret)
- return ret;
+ goto nomerge1;
+
+ ret = check_pos_snapshot_overwritten(trans, btree_id, k.k->p);
+ if (ret < 0)
+ goto err;
+ if (ret)
+ goto nomerge1;
+
+ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ goto err;
+
+ bkey_reassemble(update, k);
+
+ if (bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(insert))) {
+ ret = bch2_btree_delete_at(trans, &iter, flags);
+ if (ret)
+ goto err;
+
+ insert = update;
+ goto next;
+ }
+ }
+nomerge1:
+ ret = 0;
+ if (!bkey_cmp(k.k->p, start))
+ goto next;
+
+ while (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) > 0) {
+ bool front_split = bkey_cmp(bkey_start_pos(k.k), start) < 0;
+ bool back_split = bkey_cmp(k.k->p, insert->k.p) > 0;
+
+ /*
+ * If we're going to be splitting a compressed extent, note it
+ * so that __bch2_trans_commit() can increase our disk
+ * reservation:
+ */
+ if (((front_split && back_split) ||
+ ((front_split || back_split) && k.k->p.snapshot != insert->k.p.snapshot)) &&
+ (compressed_sectors = bch2_bkey_sectors_compressed(k)))
+ trans->extra_journal_res += compressed_sectors;
+
+ if (front_split) {
+ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ goto err;
+
+ bkey_reassemble(update, k);
+
+ bch2_cut_back(start, update);
+
+ bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p,
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_ALL_SNAPSHOTS|
+ BTREE_ITER_INTENT);
+ ret = bch2_btree_iter_traverse(&update_iter) ?:
+ bch2_trans_update(trans, &update_iter, update,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+ flags);
+ bch2_trans_iter_exit(trans, &update_iter);
+
+ if (ret)
+ goto err;
+ }
+
+ if (k.k->p.snapshot != insert->k.p.snapshot &&
+ (front_split || back_split)) {
+ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ goto err;
+
+ bkey_reassemble(update, k);
+
+ bch2_cut_front(start, update);
+ bch2_cut_back(insert->k.p, update);
+
+ bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p,
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_ALL_SNAPSHOTS|
+ BTREE_ITER_INTENT);
+ ret = bch2_btree_iter_traverse(&update_iter) ?:
+ bch2_trans_update(trans, &update_iter, update,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+ flags);
+ bch2_trans_iter_exit(trans, &update_iter);
+ if (ret)
+ goto err;
+ }
+
+ if (bkey_cmp(k.k->p, insert->k.p) <= 0) {
+ update = bch2_trans_kmalloc(trans, sizeof(*update));
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ goto err;
+
+ bkey_init(&update->k);
+ update->k.p = k.k->p;
+
+ if (insert->k.p.snapshot != k.k->p.snapshot) {
+ update->k.p.snapshot = insert->k.p.snapshot;
+ update->k.type = KEY_TYPE_whiteout;
+ }
+
+ bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p,
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_INTENT);
+ ret = bch2_btree_iter_traverse(&update_iter) ?:
+ bch2_trans_update(trans, &update_iter, update,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+ flags);
+ bch2_trans_iter_exit(trans, &update_iter);
+
+ if (ret)
+ goto err;
+ }
+
+ if (back_split) {
+ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ goto err;
+
+ bkey_reassemble(update, k);
+ bch2_cut_front(insert->k.p, update);
+
+ ret = bch2_trans_update_by_path(trans, iter.path, update,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+ flags);
+ if (ret)
+ goto err;
+ goto out;
+ }
+next:
+ k = bch2_btree_iter_next(&iter);
+ if ((ret = bkey_err(k)))
+ goto err;
+ if (!k.k)
+ goto out;
+ }
+
+ if (bch2_bkey_maybe_mergable(&insert->k, k.k)) {
+ ret = check_pos_snapshot_overwritten(trans, btree_id, insert->k.p);
+ if (ret < 0)
+ goto err;
+ if (ret)
+ goto nomerge2;
+
+ ret = check_pos_snapshot_overwritten(trans, btree_id, k.k->p);
+ if (ret < 0)
+ goto err;
+ if (ret)
+ goto nomerge2;
+
+ bch2_bkey_merge(c, bkey_i_to_s(insert), k);
+ }
+nomerge2:
+ ret = 0;
+out:
+ if (!bkey_deleted(&insert->k)) {
+ /*
+ * Rewinding iterators is expensive: instead of rewinding, get a new
+ * iterator at the insert's position - its path will be cloned from
+ * the one that already points to the start of insert:
+ */
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_iter_init(trans, &iter, btree_id, insert->k.p,
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_INTENT);
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(trans, &iter, insert, flags);
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}
+
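+/*
+ * The overlap cases handled above, pictorially (same-snapshot case):
+ *
+ *	existing k:	|-------------------|
+ *	insert:		      |=======|
+ *	front_split:	|-----|			(bch2_cut_back)
+ *	back_split:		      |-----|	(bch2_cut_front)
+ *
+ * with the overlapped middle deleted - or replaced by a whiteout when the
+ * snapshots differ - and the insert itself issued last.
+ */
+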
+/*
+ * When deleting, check if we need to emit a whiteout (because we're overwriting
+ * something in an ancestor snapshot)
+ */
+static int need_whiteout_for_snapshot(struct btree_trans *trans,
+ enum btree_id btree_id, struct bpos pos)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u32 snapshot = pos.snapshot;
+ int ret;
+
+ if (!bch2_snapshot_parent(trans->c, pos.snapshot))
+ return 0;
+
+ pos.snapshot++;
+
+ for_each_btree_key_norestart(trans, iter, btree_id, pos,
+ BTREE_ITER_ALL_SNAPSHOTS|
+ BTREE_ITER_NOPRESERVE, k, ret) {
+ if (bkey_cmp(k.k->p, pos))
+ break;
+
+ if (bch2_snapshot_is_ancestor(trans->c, snapshot,
+ k.k->p.snapshot)) {
+ ret = !bkey_whiteout(k.k);
+ break;
+ }
}
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}
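+
+/*
+ * Example: snapshot C is a child of P, and P has a key at pos X.
+ * Deleting X in C can't simply drop the key - lookups in C would then
+ * fall through and see P's version - so the deletion is written as a
+ * KEY_TYPE_whiteout at X in C's snapshot.
+ */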
+
+static int __must_check
+bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
+ struct bkey_i *k, enum btree_update_flags flags)
+{
+ struct btree_insert_entry *i, n;
- BUG_ON(n.iter->flags & BTREE_ITER_IS_EXTENTS);
+ BUG_ON(!path->should_be_locked);
- n.iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
+ BUG_ON(trans->nr_updates >= BTREE_ITER_MAX);
+ BUG_ON(bpos_cmp(k->k.p, path->pos));
+
+ n = (struct btree_insert_entry) {
+ .flags = flags,
+ .bkey_type = __btree_node_type(path->level, path->btree_id),
+ .btree_id = path->btree_id,
+ .level = path->level,
+ .cached = path->cached,
+ .path = path,
+ .k = k,
+ .ip_allocated = _RET_IP_,
+ };
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+ trans_for_each_update(trans, i)
+ BUG_ON(i != trans->updates &&
+ btree_insert_entry_cmp(i - 1, i) >= 0);
+#endif
/*
* Pending updates are kept sorted: first, find position of new update,
if (i < trans->updates + trans->nr_updates &&
!btree_insert_entry_cmp(&n, i)) {
- BUG_ON(i->trans_triggers_run);
+ BUG_ON(i->insert_trigger_run || i->overwrite_trigger_run);
- /*
- * This is a hack to ensure that inode creates update the btree,
- * not the key cache, which helps with cache coherency issues in
- * other areas:
- */
- if (btree_iter_type(n.iter) == BTREE_ITER_CACHED &&
- btree_iter_type(i->iter) != BTREE_ITER_CACHED) {
- i->k = n.k;
- i->flags = n.flags;
- } else {
- *i = n;
- }
+ bch2_path_put(trans, i->path, true);
+ *i = n;
} else
array_insert_item(trans->updates, trans->nr_updates,
i - trans->updates, n);
+ __btree_path_get(n.path, true);
return 0;
}
+int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_i *k, enum btree_update_flags flags)
+{
+ struct btree_path *path = iter->update_path ?: iter->path;
+ struct bkey_cached *ck;
+ int ret;
+
+ if (iter->flags & BTREE_ITER_IS_EXTENTS)
+ return bch2_trans_update_extent(trans, iter, k, flags);
+
+ if (bkey_deleted(&k->k) &&
+ !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
+ (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) {
+ ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p);
+ if (unlikely(ret < 0))
+ return ret;
+
+ if (ret)
+ k->k.type = KEY_TYPE_whiteout;
+ }
+
+ if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
+ !path->cached &&
+ !path->level &&
+ btree_id_cached(trans->c, path->btree_id)) {
+ if (!iter->key_cache_path ||
+ !iter->key_cache_path->should_be_locked ||
+ bpos_cmp(iter->key_cache_path->pos, k->k.p)) {
+ if (!iter->key_cache_path)
+ iter->key_cache_path =
+ bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_CACHED, _THIS_IP_);
+
+ iter->key_cache_path =
+ bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos,
+ iter->flags & BTREE_ITER_INTENT,
+ _THIS_IP_);
+
+ ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
+ BTREE_ITER_CACHED|
+ BTREE_ITER_CACHED_NOFILL);
+ if (unlikely(ret))
+ return ret;
+
+ ck = (void *) iter->key_cache_path->l[0].b;
+
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+ trace_trans_restart_key_cache_raced(trans->fn, _RET_IP_);
+ btree_trans_restart(trans);
+ return -EINTR;
+ }
+
+ iter->key_cache_path->should_be_locked = true;
+ }
+
+ path = iter->key_cache_path;
+ }
+
+ return bch2_trans_update_by_path(trans, path, k, flags);
+}
+
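+/*
+ * In other words, non-extent updates to cached btrees are transparently
+ * redirected to a BTREE_ITER_CACHED path at the same position; if the
+ * cached key is found already dirty we raced with key cache flushing,
+ * and the transaction restarts.
+ */
+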
void bch2_trans_commit_hook(struct btree_trans *trans,
struct btree_trans_commit_hook *h)
{
int __bch2_btree_insert(struct btree_trans *trans,
enum btree_id id, struct bkey_i *k)
{
- struct btree_iter *iter;
+ struct btree_iter iter;
int ret;
- iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k),
- BTREE_ITER_INTENT);
-
- ret = bch2_btree_iter_traverse(iter) ?:
- bch2_trans_update(trans, iter, k, 0);
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k),
+ BTREE_ITER_INTENT);
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(trans, &iter, k, 0);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
struct bpos start, struct bpos end,
+ unsigned iter_flags,
u64 *journal_seq)
{
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
int ret = 0;
- iter = bch2_trans_get_iter(trans, id, start, BTREE_ITER_INTENT);
+ bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT|iter_flags);
retry:
while ((bch2_trans_begin(trans),
- (k = bch2_btree_iter_peek(iter)).k) &&
+ (k = bch2_btree_iter_peek(&iter)).k) &&
!(ret = bkey_err(k)) &&
- bkey_cmp(iter->pos, end) < 0) {
+ bkey_cmp(iter.pos, end) < 0) {
+ struct disk_reservation disk_res =
+ bch2_disk_reservation_init(trans->c, 0);
struct bkey_i delete;
bkey_init(&delete.k);
* (bch2_btree_iter_peek() does guarantee that iter.pos >=
* bkey_start_pos(k.k)).
*/
- delete.k.p = iter->pos;
+ delete.k.p = iter.pos;
- if (btree_node_type_is_extents(iter->btree_id)) {
+ if (iter.flags & BTREE_ITER_IS_EXTENTS) {
unsigned max_sectors =
KEY_SIZE_MAX & (~0 << trans->c->block_bits);
bch2_key_resize(&delete.k, max_sectors);
bch2_cut_back(end, &delete);
- ret = bch2_extent_trim_atomic(&delete, iter);
+ ret = bch2_extent_trim_atomic(trans, &iter, &delete);
if (ret)
break;
}
- ret = bch2_trans_update(trans, iter, &delete, 0) ?:
- bch2_trans_commit(trans, NULL, journal_seq,
+ ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
+ bch2_trans_commit(trans, &disk_res, journal_seq,
BTREE_INSERT_NOFAIL);
+ bch2_disk_reservation_put(trans->c, &disk_res);
if (ret)
break;
-
- bch2_trans_cond_resched(trans);
}
if (ret == -EINTR) {
goto retry;
}
- bch2_trans_iter_free(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
*/
int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
struct bpos start, struct bpos end,
+ unsigned iter_flags,
u64 *journal_seq)
{
return bch2_trans_do(c, NULL, journal_seq, 0,
- bch2_btree_delete_range_trans(&trans, id, start, end, journal_seq));
+ bch2_btree_delete_range_trans(&trans, id, start, end,
+ iter_flags, journal_seq));
}
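
/*
 * Usage sketch (illustrative arguments): delete every extent belonging to
 * one inode, collecting the journal sequence number:
 *
 *	ret = bch2_btree_delete_range(c, BTREE_ID_extents,
 *				      POS(inum, 0), POS(inum, U64_MAX),
 *				      0, &journal_seq);
 */
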
#include "btree_gc.h"
#include "btree_update.h"
#include "buckets.h"
+#include "buckets_waiting_for_journal.h"
#include "ec.h"
#include "error.h"
+#include "inode.h"
#include "movinggc.h"
+#include "recovery.h"
#include "reflink.h"
#include "replicas.h"
+#include "subvolume.h"
#include <linux/preempt.h>
#include <trace/events/bcachefs.h>
}
}
-/*
- * Clear journal_seq_valid for buckets for which it's not needed, to prevent
- * wraparound:
- */
-void bch2_bucket_seq_cleanup(struct bch_fs *c)
-{
- u64 journal_seq = atomic64_read(&c->journal.seq);
- u16 last_seq_ondisk = c->journal.last_seq_ondisk;
- struct bch_dev *ca;
- struct bucket_array *buckets;
- struct bucket *g;
- struct bucket_mark m;
- unsigned i;
-
- if (journal_seq - c->last_bucket_seq_cleanup <
- (1U << (BUCKET_JOURNAL_SEQ_BITS - 2)))
- return;
-
- c->last_bucket_seq_cleanup = journal_seq;
-
- for_each_member_device(ca, c, i) {
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
-
- for_each_bucket(g, buckets) {
- bucket_cmpxchg(g, m, ({
- if (!m.journal_seq_valid ||
- bucket_needs_journal_commit(m, last_seq_ondisk))
- break;
-
- m.journal_seq_valid = 0;
- }));
- }
- up_read(&ca->bucket_lock);
- }
-}
-
void bch2_fs_usage_initialize(struct bch_fs *c)
{
struct bch_fs_usage *usage;
unsigned journal_seq,
bool gc)
{
+ BUG_ON(!gc && !journal_seq);
+
return this_cpu_ptr(gc
? ca->usage_gc
: ca->usage[journal_seq & JOURNAL_BUF_MASK]);
unsigned journal_seq,
bool gc)
{
+ percpu_rwsem_assert_held(&c->mark_lock);
+ BUG_ON(!gc && !journal_seq);
+
return this_cpu_ptr(gc
? c->usage_gc
: c->usage[journal_seq & JOURNAL_BUF_MASK]);
static inline int bucket_sectors_fragmented(struct bch_dev *ca,
struct bucket_mark m)
{
- return bucket_sectors_used(m)
- ? max(0, (int) ca->mi.bucket_size - (int) bucket_sectors_used(m))
+ return m.dirty_sectors
+ ? max(0, (int) ca->mi.bucket_size - (int) m.dirty_sectors)
: 0;
}
: m.data_type;
}
-static bool bucket_became_unavailable(struct bucket_mark old,
- struct bucket_mark new)
-{
- return is_available_bucket(old) &&
- !is_available_bucket(new);
-}
-
static inline void account_bucket(struct bch_fs_usage *fs_usage,
struct bch_dev_usage *dev_usage,
enum bch_data_type type,
struct bch_fs_usage *fs_usage;
struct bch_dev_usage *u;
- percpu_rwsem_assert_held(&c->mark_lock);
-
preempt_disable();
fs_usage = fs_usage_ptr(c, journal_seq, gc);
u = dev_usage_ptr(ca, journal_seq, gc);
return 0;
}
-static inline int update_replicas(struct bch_fs *c,
+static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k,
struct bch_replicas_entry *r, s64 sectors,
unsigned journal_seq, bool gc)
{
struct bch_fs_usage __percpu *fs_usage;
- int idx = bch2_replicas_entry_idx(c, r);
+ int idx, ret = 0;
+ char buf[200];
- if (idx < 0)
- return -1;
+ percpu_down_read(&c->mark_lock);
+
+ idx = bch2_replicas_entry_idx(c, r);
+ if (idx < 0 &&
+ (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
+ fsck_err(c, "no replicas entry\n"
+ " while marking %s",
+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))) {
+ percpu_up_read(&c->mark_lock);
+ ret = bch2_mark_replicas(c, r);
+ if (ret)
+ return ret;
+
+ percpu_down_read(&c->mark_lock);
+ idx = bch2_replicas_entry_idx(c, r);
+ }
+ if (idx < 0) {
+ ret = -1;
+ goto err;
+ }
preempt_disable();
fs_usage = fs_usage_ptr(c, journal_seq, gc);
fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
fs_usage->replicas[idx] += sectors;
preempt_enable();
- return 0;
+err:
+fsck_err:
+ percpu_up_read(&c->mark_lock);
+ return ret;
}
static inline int update_cached_sectors(struct bch_fs *c,
+ struct bkey_s_c k,
unsigned dev, s64 sectors,
unsigned journal_seq, bool gc)
{
bch2_replicas_entry_cached(&r.e, dev);
- return update_replicas(c, &r.e, sectors, journal_seq, gc);
+ return update_replicas(c, k, &r.e, sectors, journal_seq, gc);
}
static struct replicas_delta_list *
update_replicas_list(trans, &r.e, sectors);
}
-#define do_mark_fn(fn, c, pos, flags, ...) \
-({ \
- int gc, ret = 0; \
- \
- percpu_rwsem_assert_held(&c->mark_lock); \
- \
- for (gc = 0; gc < 2 && !ret; gc++) \
- if (!gc == !(flags & BTREE_TRIGGER_GC) || \
- (gc && gc_visited(c, pos))) \
- ret = fn(c, __VA_ARGS__, gc); \
- ret; \
-})
-
void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, bool owned_by_allocator)
{
BUG_ON(owned_by_allocator == old.owned_by_allocator);
}
-static int bch2_mark_alloc(struct bch_fs *c,
+static int bch2_mark_alloc(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_s_c new,
- u64 journal_seq, unsigned flags)
+ unsigned flags)
{
bool gc = flags & BTREE_TRIGGER_GC;
- struct bkey_alloc_unpacked u;
+ u64 journal_seq = trans->journal_res.seq;
+ struct bch_fs *c = trans->c;
+ struct bkey_alloc_unpacked old_u = bch2_alloc_unpack(old);
+ struct bkey_alloc_unpacked new_u = bch2_alloc_unpack(new);
struct bch_dev *ca;
struct bucket *g;
struct bucket_mark old_m, m;
-
- /* We don't do anything for deletions - do we?: */
- if (new.k->type != KEY_TYPE_alloc &&
- new.k->type != KEY_TYPE_alloc_v2)
- return 0;
+ int ret = 0;
/*
* alloc btree is read in by bch2_alloc_read, not gc:
!(flags & BTREE_TRIGGER_BUCKET_INVALIDATE))
return 0;
- ca = bch_dev_bkey_exists(c, new.k->p.inode);
+ if ((flags & BTREE_TRIGGER_INSERT) &&
+ !old_u.data_type != !new_u.data_type &&
+ new.k->type == KEY_TYPE_alloc_v3) {
+ struct bch_alloc_v3 *v = (struct bch_alloc_v3 *) new.v;
+ u64 old_journal_seq = le64_to_cpu(v->journal_seq);
+
+ BUG_ON(!journal_seq);
- if (new.k->p.offset >= ca->mi.nbuckets)
+ /*
+ * If the btree updates referring to a bucket weren't flushed
+ * before the bucket became empty again, then we don't have
+ * to wait on a journal flush before we can reuse the bucket:
+ */
+ new_u.journal_seq = !new_u.data_type &&
+ (journal_seq == old_journal_seq ||
+ bch2_journal_noflush_seq(&c->journal, old_journal_seq))
+ ? 0 : journal_seq;
+ v->journal_seq = cpu_to_le64(new_u.journal_seq);
+ }
+
+ if (old_u.data_type && !new_u.data_type && new_u.journal_seq) {
+ ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+ c->journal.flushed_seq_ondisk,
+ new_u.dev, new_u.bucket,
+ new_u.journal_seq);
+ if (ret) {
+ bch2_fs_fatal_error(c,
+ "error setting bucket_needs_journal_commit: %i", ret);
+ return ret;
+ }
+ }
+
+ ca = bch_dev_bkey_exists(c, new_u.dev);
+
+ if (new_u.bucket >= ca->mi.nbuckets)
return 0;
- g = __bucket(ca, new.k->p.offset, gc);
- u = bch2_alloc_unpack(new);
+ percpu_down_read(&c->mark_lock);
+ if (!gc && new_u.gen != old_u.gen)
+ *bucket_gen(ca, new_u.bucket) = new_u.gen;
+
+ g = __bucket(ca, new_u.bucket, gc);
old_m = bucket_cmpxchg(g, m, ({
- m.gen = u.gen;
- m.data_type = u.data_type;
- m.dirty_sectors = u.dirty_sectors;
- m.cached_sectors = u.cached_sectors;
- m.stripe = u.stripe != 0;
-
- if (journal_seq) {
- m.journal_seq_valid = 1;
- m.journal_seq = journal_seq;
- }
+ m.gen = new_u.gen;
+ m.data_type = new_u.data_type;
+ m.dirty_sectors = new_u.dirty_sectors;
+ m.cached_sectors = new_u.cached_sectors;
+ m.stripe = new_u.stripe != 0;
}));
bch2_dev_usage_update(c, ca, old_m, m, journal_seq, gc);
- g->io_time[READ] = u.read_time;
- g->io_time[WRITE] = u.write_time;
- g->oldest_gen = u.oldest_gen;
+ g->io_time[READ] = new_u.read_time;
+ g->io_time[WRITE] = new_u.write_time;
+ g->oldest_gen = new_u.oldest_gen;
g->gen_valid = 1;
- g->stripe = u.stripe;
- g->stripe_redundancy = u.stripe_redundancy;
+ g->stripe = new_u.stripe;
+ g->stripe_redundancy = new_u.stripe_redundancy;
+ percpu_up_read(&c->mark_lock);
/*
* need to know if we're getting called from the invalidate path or
if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
old_m.cached_sectors) {
- if (update_cached_sectors(c, ca->dev_idx, -old_m.cached_sectors,
- journal_seq, gc)) {
+ ret = update_cached_sectors(c, new, ca->dev_idx,
+ -old_m.cached_sectors,
+ journal_seq, gc);
+ if (ret) {
bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors");
- return -1;
+ return ret;
}
- trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset),
+ trace_invalidate(ca, bucket_to_sector(ca, new_u.bucket),
old_m.cached_sectors);
}
overflow; \
})
-static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
- size_t b, enum bch_data_type data_type,
- unsigned sectors, bool gc)
+void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
+ size_t b, enum bch_data_type data_type,
+ unsigned sectors, struct gc_pos pos,
+ unsigned flags)
{
- struct bucket *g = __bucket(ca, b, gc);
+ struct bucket *g;
struct bucket_mark old, new;
bool overflow;
+ BUG_ON(!(flags & BTREE_TRIGGER_GC));
BUG_ON(data_type != BCH_DATA_sb &&
data_type != BCH_DATA_journal);
+ /*
+ * Backup superblock might be past the end of our normal usable space:
+ */
+ if (b >= ca->mi.nbuckets)
+ return;
+
+ percpu_down_read(&c->mark_lock);
+ g = gc_bucket(ca, b);
old = bucket_cmpxchg(g, new, ({
new.data_type = data_type;
overflow = checked_add(new.dirty_sectors, sectors);
bch2_data_types[old.data_type ?: data_type],
old.dirty_sectors, sectors);
- if (c)
- bch2_dev_usage_update(c, ca, old, new, 0, gc);
-
- return 0;
-}
-
-void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
- size_t b, enum bch_data_type type,
- unsigned sectors, struct gc_pos pos,
- unsigned flags)
-{
- BUG_ON(type != BCH_DATA_sb &&
- type != BCH_DATA_journal);
-
- /*
- * Backup superblock might be past the end of our normal usable space:
- */
- if (b >= ca->mi.nbuckets)
- return;
-
- if (likely(c)) {
- do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags,
- ca, b, type, sectors);
- } else {
- __bch2_mark_metadata_bucket(c, ca, b, type, sectors, 0);
- }
+ bch2_dev_usage_update(c, ca, old, new, 0, true);
+ percpu_up_read(&c->mark_lock);
}
static s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p)
{
- return p.crc.compression_type
- ? DIV_ROUND_UP(sectors * p.crc.compressed_size,
+ EBUG_ON(sectors < 0);
+
+ return p.crc.compression_type &&
+ p.crc.compression_type != BCH_COMPRESSION_TYPE_incompressible
+ ? DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size,
p.crc.uncompressed_size)
: sectors;
}
-static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k,
+static int check_bucket_ref(struct bch_fs *c,
+ struct bkey_s_c k,
const struct bch_extent_ptr *ptr,
s64 sectors, enum bch_data_type ptr_data_type,
- u8 bucket_gen, u8 bucket_data_type,
+ u8 b_gen, u8 bucket_data_type,
u16 dirty_sectors, u16 cached_sectors)
{
- size_t bucket_nr = PTR_BUCKET_NR(bch_dev_bkey_exists(c, ptr->dev), ptr);
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
u16 bucket_sectors = !ptr->cached
? dirty_sectors
: cached_sectors;
char buf[200];
- if (gen_after(ptr->gen, bucket_gen)) {
+ if (gen_after(ptr->gen, b_gen)) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
"while marking %s",
- ptr->dev, bucket_nr, bucket_gen,
+ ptr->dev, bucket_nr, b_gen,
bch2_data_types[bucket_data_type ?: ptr_data_type],
ptr->gen,
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
return -EIO;
}
- if (gen_cmp(bucket_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
+ if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
"while marking %s",
- ptr->dev, bucket_nr, bucket_gen,
+ ptr->dev, bucket_nr, b_gen,
bch2_data_types[bucket_data_type ?: ptr_data_type],
ptr->gen,
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
return -EIO;
}
- if (bucket_gen != ptr->gen && !ptr->cached) {
+ if (b_gen != ptr->gen && !ptr->cached) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
- "bucket %u:%zu gen %u data type %s: stale dirty ptr (gen %u)\n"
+ "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n"
"while marking %s",
- ptr->dev, bucket_nr, bucket_gen,
+ ptr->dev, bucket_nr, b_gen,
+ *bucket_gen(ca, bucket_nr),
bch2_data_types[bucket_data_type ?: ptr_data_type],
ptr->gen,
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
return -EIO;
}
- if (bucket_gen != ptr->gen)
+ if (b_gen != ptr->gen)
return 1;
if (bucket_data_type && ptr_data_type &&
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
- ptr->dev, bucket_nr, bucket_gen,
+ ptr->dev, bucket_nr, b_gen,
bch2_data_types[bucket_data_type],
bch2_data_types[ptr_data_type],
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n"
"while marking %s",
- ptr->dev, bucket_nr, bucket_gen,
+ ptr->dev, bucket_nr, b_gen,
bch2_data_types[bucket_data_type ?: ptr_data_type],
bucket_sectors, sectors,
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
return 0;
}
-static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k,
- unsigned ptr_idx,
- u64 journal_seq, unsigned flags)
+static int mark_stripe_bucket(struct btree_trans *trans,
+ struct bkey_s_c k,
+ unsigned ptr_idx,
+ unsigned flags)
{
+ struct bch_fs *c = trans->c;
+ u64 journal_seq = trans->journal_res.seq;
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
unsigned nr_data = s->nr_blocks - s->nr_redundant;
bool parity = ptr_idx >= nr_data;
+ enum bch_data_type data_type = parity ? BCH_DATA_parity : 0;
+ s64 sectors = parity ? le16_to_cpu(s->sectors) : 0;
const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx;
- bool gc = flags & BTREE_TRIGGER_GC;
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket *g = PTR_BUCKET(ca, ptr, gc);
+ struct bucket *g;
struct bucket_mark new, old;
char buf[200];
- int ret;
+ int ret = 0;
+
+ BUG_ON(!(flags & BTREE_TRIGGER_GC));
- if (g->stripe && g->stripe != k.k->p.offset) {
+ /* XXX: doesn't handle deletion */
+
+ percpu_down_read(&c->mark_lock);
+ g = PTR_GC_BUCKET(ca, ptr);
+
+ if (g->mark.dirty_sectors ||
+ (g->stripe && g->stripe != k.k->p.offset)) {
bch2_fs_inconsistent(c,
"bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
ptr->dev, PTR_BUCKET_NR(ca, ptr), g->mark.gen,
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
- return -EINVAL;
+ ret = -EINVAL;
+ goto err;
}
old = bucket_cmpxchg(g, new, ({
- ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type,
+ ret = check_bucket_ref(c, k, ptr, sectors, data_type,
+ new.gen, new.data_type,
new.dirty_sectors, new.cached_sectors);
if (ret)
- return ret;
+ goto err;
- if (parity) {
- new.data_type = BCH_DATA_parity;
- new.dirty_sectors = le16_to_cpu(s->sectors);
- }
+ new.dirty_sectors += sectors;
+ if (data_type)
+ new.data_type = data_type;
- if (journal_seq) {
- new.journal_seq_valid = 1;
- new.journal_seq = journal_seq;
- }
+ new.stripe = true;
}));
g->stripe = k.k->p.offset;
g->stripe_redundancy = s->nr_redundant;
- bch2_dev_usage_update(c, ca, old, new, journal_seq, gc);
+ bch2_dev_usage_update(c, ca, old, new, journal_seq, true);
+err:
+ percpu_up_read(&c->mark_lock);
+
- return 0;
+ return ret;
}
-static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k,
+static int __mark_pointer(struct btree_trans *trans,
+ struct bkey_s_c k,
const struct bch_extent_ptr *ptr,
s64 sectors, enum bch_data_type ptr_data_type,
u8 bucket_gen, u8 *bucket_data_type,
u16 *dst_sectors = !ptr->cached
? dirty_sectors
: cached_sectors;
- int ret = check_bucket_ref(c, k, ptr, sectors, ptr_data_type,
+ int ret = check_bucket_ref(trans->c, k, ptr, sectors, ptr_data_type,
bucket_gen, *bucket_data_type,
*dirty_sectors, *cached_sectors);
return 0;
}
-static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k,
+static int bch2_mark_pointer(struct btree_trans *trans,
+ struct bkey_s_c k,
struct extent_ptr_decoded p,
s64 sectors, enum bch_data_type data_type,
- u64 journal_seq, unsigned flags)
+ unsigned flags)
{
- bool gc = flags & BTREE_TRIGGER_GC;
+ u64 journal_seq = trans->journal_res.seq;
+ struct bch_fs *c = trans->c;
struct bucket_mark old, new;
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
- struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc);
+ struct bucket *g;
u8 bucket_data_type;
u64 v;
- int ret;
+ int ret = 0;
+
+ BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
+ percpu_down_read(&c->mark_lock);
+ g = PTR_GC_BUCKET(ca, &p.ptr);
v = atomic64_read(&g->_mark.v);
do {
new.v.counter = old.v.counter = v;
bucket_data_type = new.data_type;
- ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, new.gen,
+ ret = __mark_pointer(trans, k, &p.ptr, sectors,
+ data_type, new.gen,
&bucket_data_type,
&new.dirty_sectors,
&new.cached_sectors);
if (ret)
- return ret;
+ goto err;
new.data_type = bucket_data_type;
- if (journal_seq) {
- new.journal_seq_valid = 1;
- new.journal_seq = journal_seq;
- }
-
if (flags & BTREE_TRIGGER_NOATOMIC) {
g->_mark = new;
break;
old.v.counter,
new.v.counter)) != old.v.counter);
- bch2_dev_usage_update(c, ca, old, new, journal_seq, gc);
-
- BUG_ON(!gc && bucket_became_unavailable(old, new));
+ bch2_dev_usage_update(c, ca, old, new, journal_seq, true);
+err:
+ percpu_up_read(&c->mark_lock);
- return 0;
+ return ret;
}
-static int bch2_mark_stripe_ptr(struct bch_fs *c,
+static int bch2_mark_stripe_ptr(struct btree_trans *trans,
+ struct bkey_s_c k,
struct bch_extent_stripe_ptr p,
enum bch_data_type data_type,
s64 sectors,
- unsigned journal_seq, unsigned flags)
+ unsigned flags)
{
- bool gc = flags & BTREE_TRIGGER_GC;
+ struct bch_fs *c = trans->c;
struct bch_replicas_padded r;
- struct stripe *m;
- unsigned i, blocks_nonempty = 0;
+ struct gc_stripe *m;
- m = genradix_ptr(&c->stripes[gc], p.idx);
+ BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
+ m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL);
+ if (!m) {
+ bch_err(c, "error allocating memory for gc_stripes, idx %llu",
+ (u64) p.idx);
+ return -ENOMEM;
+ }
spin_lock(&c->ec_stripes_heap_lock);
m->block_sectors[p.block] += sectors;
r = m->r;
-
- for (i = 0; i < m->nr_blocks; i++)
- blocks_nonempty += m->block_sectors[i] != 0;
-
- if (m->blocks_nonempty != blocks_nonempty) {
- m->blocks_nonempty = blocks_nonempty;
- if (!gc)
- bch2_stripes_heap_update(c, m, p.idx);
- }
-
spin_unlock(&c->ec_stripes_heap_lock);
r.e.data_type = data_type;
- update_replicas(c, &r.e, sectors, journal_seq, gc);
+ update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true);
return 0;
}
-static int bch2_mark_extent(struct bch_fs *c,
+static int bch2_mark_extent(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_s_c new,
- unsigned journal_seq, unsigned flags)
+ unsigned flags)
{
- bool gc = flags & BTREE_TRIGGER_GC;
- struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old;
+ u64 journal_seq = trans->journal_res.seq;
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
? BCH_DATA_btree
: BCH_DATA_user;
s64 sectors = bkey_is_btree_ptr(k.k)
- ? c->opts.btree_node_size
+ ? btree_sectors(c)
: k.k->size;
s64 dirty_sectors = 0;
bool stale;
int ret;
- BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) ==
- (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE));
-
- if (flags & BTREE_TRIGGER_OVERWRITE)
- sectors = -sectors;
+ BUG_ON(!(flags & BTREE_TRIGGER_GC));
r.e.data_type = data_type;
r.e.nr_devs = 0;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
s64 disk_sectors = ptr_disk_sectors(sectors, p);
- ret = bch2_mark_pointer(c, k, p, disk_sectors, data_type,
- journal_seq, flags);
+ if (flags & BTREE_TRIGGER_OVERWRITE)
+ disk_sectors = -disk_sectors;
+
+ ret = bch2_mark_pointer(trans, k, p, disk_sectors,
+ data_type, flags);
if (ret < 0)
return ret;
stale = ret > 0;
if (p.ptr.cached) {
- if (!stale)
- if (update_cached_sectors(c, p.ptr.dev, disk_sectors,
- journal_seq, gc)) {
+ if (!stale) {
+ ret = update_cached_sectors(c, k, p.ptr.dev,
+ disk_sectors, journal_seq, true);
+ if (ret) {
bch2_fs_fatal_error(c, "bch2_mark_extent(): no replicas entry while updating cached sectors");
- return -1;
-
+ return ret;
}
+ }
} else if (!p.has_ec) {
dirty_sectors += disk_sectors;
r.e.devs[r.e.nr_devs++] = p.ptr.dev;
} else {
- ret = bch2_mark_stripe_ptr(c, p.ec, data_type,
- disk_sectors, journal_seq, flags);
+ ret = bch2_mark_stripe_ptr(trans, k, p.ec, data_type,
+ disk_sectors, flags);
if (ret)
return ret;
}
if (r.e.nr_devs) {
- if (update_replicas(c, &r.e, dirty_sectors, journal_seq, gc)) {
+ ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, true);
+ if (ret) {
char buf[200];
bch2_bkey_val_to_text(&PBUF(buf), c, k);
bch2_fs_fatal_error(c, "no replicas entry for %s", buf);
- return -1;
+ return ret;
}
}
return 0;
}
-static int bch2_mark_stripe(struct bch_fs *c,
- struct bkey_s_c old, struct bkey_s_c new,
- u64 journal_seq, unsigned flags)
+static int bch2_mark_stripe(struct btree_trans *trans,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
{
bool gc = flags & BTREE_TRIGGER_GC;
- size_t idx = new.k->p.offset;
+ u64 journal_seq = trans->journal_res.seq;
+ struct bch_fs *c = trans->c;
+ u64 idx = new.k->p.offset;
const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
? bkey_s_c_to_stripe(old).v : NULL;
const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
? bkey_s_c_to_stripe(new).v : NULL;
- struct stripe *m = genradix_ptr(&c->stripes[gc], idx);
unsigned i;
int ret;
BUG_ON(gc && old_s);
- if (!m || (old_s && !m->alive)) {
- bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
- idx);
- bch2_inconsistent_error(c);
- return -1;
- }
+ if (!gc) {
+ struct stripe *m = genradix_ptr(&c->stripes, idx);
- if (!new_s) {
- spin_lock(&c->ec_stripes_heap_lock);
- bch2_stripes_heap_del(c, m, idx);
- spin_unlock(&c->ec_stripes_heap_lock);
+ if (!m || (old_s && !m->alive)) {
+ char buf1[200], buf2[200];
- memset(m, 0, sizeof(*m));
+ bch2_bkey_val_to_text(&PBUF(buf1), c, old);
+ bch2_bkey_val_to_text(&PBUF(buf2), c, new);
+ bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
+ "old %s\n"
+ "new %s", idx, buf1, buf2);
+ bch2_inconsistent_error(c);
+ return -1;
+ }
+
+ if (!new_s) {
+ spin_lock(&c->ec_stripes_heap_lock);
+ bch2_stripes_heap_del(c, m, idx);
+ spin_unlock(&c->ec_stripes_heap_lock);
+
+ memset(m, 0, sizeof(*m));
+ } else {
+ m->alive = true;
+ m->sectors = le16_to_cpu(new_s->sectors);
+ m->algorithm = new_s->algorithm;
+ m->nr_blocks = new_s->nr_blocks;
+ m->nr_redundant = new_s->nr_redundant;
+ m->blocks_nonempty = 0;
+
+ for (i = 0; i < new_s->nr_blocks; i++)
+ m->blocks_nonempty += !!stripe_blockcount_get(new_s, i);
+
+ spin_lock(&c->ec_stripes_heap_lock);
+ bch2_stripes_heap_update(c, m, idx);
+ spin_unlock(&c->ec_stripes_heap_lock);
+ }
} else {
+ struct gc_stripe *m =
+ genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
+
+ if (!m) {
+ bch_err(c, "error allocating memory for gc_stripes, idx %llu",
+ idx);
+ return -ENOMEM;
+ }
+ /*
+ * This will be wrong when we bring back runtime gc: we should
+ * be unmarking the old key and then marking the new key
+ */
m->alive = true;
m->sectors = le16_to_cpu(new_s->sectors);
- m->algorithm = new_s->algorithm;
m->nr_blocks = new_s->nr_blocks;
m->nr_redundant = new_s->nr_redundant;
- m->blocks_nonempty = 0;
-
- for (i = 0; i < new_s->nr_blocks; i++) {
- m->block_sectors[i] =
- stripe_blockcount_get(new_s, i);
- m->blocks_nonempty += !!m->block_sectors[i];
+ for (i = 0; i < new_s->nr_blocks; i++)
m->ptrs[i] = new_s->ptrs[i];
- }
bch2_bkey_to_replicas(&m->r.e, new);
- if (!gc) {
- spin_lock(&c->ec_stripes_heap_lock);
- bch2_stripes_heap_update(c, m, idx);
- spin_unlock(&c->ec_stripes_heap_lock);
- }
- }
-
- if (gc) {
/*
* gc recalculates this field from stripe ptr
* references:
*/
memset(m->block_sectors, 0, sizeof(m->block_sectors));
- m->blocks_nonempty = 0;
for (i = 0; i < new_s->nr_blocks; i++) {
- ret = mark_stripe_bucket(c, new, i, journal_seq, flags);
+ ret = mark_stripe_bucket(trans, new, i, flags);
if (ret)
return ret;
}
- if (update_replicas(c, &m->r.e,
- ((s64) m->sectors * m->nr_redundant),
- journal_seq, gc)) {
+ ret = update_replicas(c, new, &m->r.e,
+ ((s64) m->sectors * m->nr_redundant),
+ journal_seq, gc);
+ if (ret) {
char buf[200];
bch2_bkey_val_to_text(&PBUF(buf), c, new);
bch2_fs_fatal_error(c, "no replicas entry for %s", buf);
- return -1;
+ return ret;
}
}
return 0;
}
-static int bch2_mark_inode(struct bch_fs *c,
- struct bkey_s_c old, struct bkey_s_c new,
- u64 journal_seq, unsigned flags)
+static int bch2_mark_inode(struct btree_trans *trans,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
{
+ struct bch_fs *c = trans->c;
struct bch_fs_usage __percpu *fs_usage;
+ u64 journal_seq = trans->journal_res.seq;
- preempt_disable();
- fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC);
- fs_usage->nr_inodes += new.k->type == KEY_TYPE_inode;
- fs_usage->nr_inodes -= old.k->type == KEY_TYPE_inode;
- preempt_enable();
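+ /* inode_v2 keys record the journal seq of their most recent update: */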
+ if (flags & BTREE_TRIGGER_INSERT) {
+ struct bch_inode_v2 *v = (struct bch_inode_v2 *) new.v;
+
+ BUG_ON(!journal_seq);
+ BUG_ON(new.k->type != KEY_TYPE_inode_v2);
+
+ v->bi_journal_seq = cpu_to_le64(journal_seq);
+ }
+
+ if (flags & BTREE_TRIGGER_GC) {
+ percpu_down_read(&c->mark_lock);
+ preempt_disable();
+
+ fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC);
+ fs_usage->nr_inodes += bkey_is_inode(new.k);
+ fs_usage->nr_inodes -= bkey_is_inode(old.k);
+
+ preempt_enable();
+ percpu_up_read(&c->mark_lock);
+ }
return 0;
}
-static int bch2_mark_reservation(struct bch_fs *c,
- struct bkey_s_c old, struct bkey_s_c new,
- u64 journal_seq, unsigned flags)
+static int bch2_mark_reservation(struct btree_trans *trans,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
{
- struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old;
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
struct bch_fs_usage __percpu *fs_usage;
unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
s64 sectors = (s64) k.k->size;
+ BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
if (flags & BTREE_TRIGGER_OVERWRITE)
sectors = -sectors;
sectors *= replicas;
+ percpu_down_read(&c->mark_lock);
preempt_disable();
- fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC);
+
+ fs_usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC);
replicas = clamp_t(unsigned, replicas, 1,
ARRAY_SIZE(fs_usage->persistent_reserved));
fs_usage->reserved += sectors;
fs_usage->persistent_reserved[replicas - 1] += sectors;
+
preempt_enable();
+ percpu_up_read(&c->mark_lock);
return 0;
}
static s64 __bch2_mark_reflink_p(struct bch_fs *c, struct bkey_s_c_reflink_p p,
- u64 idx, unsigned flags, size_t *r_idx)
+ u64 *idx, unsigned flags, size_t r_idx)
{
struct reflink_gc *r;
int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
+ s64 ret = 0;
- while (1) {
- if (*r_idx >= c->reflink_gc_nr)
- goto not_found;
- r = genradix_ptr(&c->reflink_gc_table, *r_idx);
- BUG_ON(!r);
+ if (r_idx >= c->reflink_gc_nr)
+ goto not_found;
- if (idx < r->offset)
- break;
- (*r_idx)++;
- }
+ r = genradix_ptr(&c->reflink_gc_table, r_idx);
+ if (*idx < r->offset - r->size)
+ goto not_found;
BUG_ON((s64) r->refcount + add < 0);
r->refcount += add;
- return r->offset - idx;
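+ /* advance the caller's index past this indirect extent: */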
+ *idx = r->offset;
+ return 0;
not_found:
- bch2_fs_inconsistent(c,
- "%llu:%llu len %u points to nonexistent indirect extent %llu",
- p.k->p.inode, p.k->p.offset, p.k->size, idx);
- bch2_inconsistent_error(c);
- return -EIO;
+ ret = -EIO;
+
+ /*
+ * XXX: we're replacing the entire reflink pointer with an error
+ * key, we should just be replacing the part that was missing:
+ */
+ if (fsck_err(c, "%llu:%llu len %u points to nonexistent indirect extent %llu",
+ p.k->p.inode, p.k->p.offset, p.k->size, *idx)) {
+ struct bkey_i_error new;
+
+ bkey_init(&new.k);
+ new.k.type = KEY_TYPE_error;
+ new.k.p = p.k->p;
+ new.k.size = p.k->size;
+ ret = bch2_journal_key_insert(c, BTREE_ID_extents, 0, &new.k_i);
+ }
+ *idx = U64_MAX;
+fsck_err:
+ return ret;
}
-static int bch2_mark_reflink_p(struct bch_fs *c,
- struct bkey_s_c old, struct bkey_s_c new,
- u64 journal_seq, unsigned flags)
+static int bch2_mark_reflink_p(struct btree_trans *trans,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
{
- struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old;
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
struct reflink_gc *ref;
size_t l, r, m;
u64 idx = le64_to_cpu(p.v->idx);
- unsigned sectors = p.k->size;
- s64 ret = 0;
+ u64 end = le64_to_cpu(p.v->idx) + p.k->size;
+ int ret = 0;
- BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) ==
- (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE));
+ BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
+ if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) {
+ idx -= le32_to_cpu(p.v->front_pad);
+ end += le32_to_cpu(p.v->back_pad);
+ }
l = 0;
r = c->reflink_gc_nr;
r = m;
}
- while (sectors) {
- ret = __bch2_mark_reflink_p(c, p, idx, flags, &l);
- if (ret < 0)
- return ret;
+ while (idx < end && !ret)
+ ret = __bch2_mark_reflink_p(c, p, &idx, flags, l++);
- ret = min_t(s64, ret, sectors);
- idx += ret;
- sectors -= ret;
- }
-
- return 0;
+ return ret;
}
-static int bch2_mark_key_locked(struct bch_fs *c,
- struct bkey_s_c old,
- struct bkey_s_c new,
- u64 journal_seq, unsigned flags)
+int bch2_mark_key(struct btree_trans *trans,
+ struct bkey_s_c old,
+ struct bkey_s_c new,
+ unsigned flags)
{
- struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old;
-
- BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)));
+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
switch (k.k->type) {
case KEY_TYPE_alloc:
case KEY_TYPE_alloc_v2:
- return bch2_mark_alloc(c, old, new, journal_seq, flags);
+ case KEY_TYPE_alloc_v3:
+ return bch2_mark_alloc(trans, old, new, flags);
case KEY_TYPE_btree_ptr:
case KEY_TYPE_btree_ptr_v2:
case KEY_TYPE_extent:
case KEY_TYPE_reflink_v:
- return bch2_mark_extent(c, old, new, journal_seq, flags);
+ return bch2_mark_extent(trans, old, new, flags);
case KEY_TYPE_stripe:
- return bch2_mark_stripe(c, old, new, journal_seq, flags);
+ return bch2_mark_stripe(trans, old, new, flags);
case KEY_TYPE_inode:
- return bch2_mark_inode(c, old, new, journal_seq, flags);
+ case KEY_TYPE_inode_v2:
+ return bch2_mark_inode(trans, old, new, flags);
case KEY_TYPE_reservation:
- return bch2_mark_reservation(c, old, new, journal_seq, flags);
+ return bch2_mark_reservation(trans, old, new, flags);
case KEY_TYPE_reflink_p:
- return bch2_mark_reflink_p(c, old, new, journal_seq, flags);
+ return bch2_mark_reflink_p(trans, old, new, flags);
+ case KEY_TYPE_snapshot:
+ return bch2_mark_snapshot(trans, old, new, flags);
default:
return 0;
}
}
-int bch2_mark_key(struct bch_fs *c, struct bkey_s_c new, unsigned flags)
-{
- struct bkey deleted = KEY(0, 0, 0);
- struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
- int ret;
-
- percpu_down_read(&c->mark_lock);
- ret = bch2_mark_key_locked(c, old, new, 0, flags);
- percpu_up_read(&c->mark_lock);
-
- return ret;
-}
-
-int bch2_mark_update(struct btree_trans *trans, struct btree_iter *iter,
+int bch2_mark_update(struct btree_trans *trans, struct btree_path *path,
struct bkey_i *new, unsigned flags)
{
- struct bch_fs *c = trans->c;
struct bkey _deleted = KEY(0, 0, 0);
struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL };
struct bkey_s_c old;
- int iter_flags, ret;
+ struct bkey unpacked;
+ int ret;
+
+ _deleted.p = path->pos;
if (unlikely(flags & BTREE_TRIGGER_NORUN))
return 0;
- if (!btree_node_type_needs_gc(iter->btree_id))
+ if (!btree_node_type_needs_gc(path->btree_id))
return 0;
- if (likely(!(iter->flags & BTREE_ITER_CACHED_NOFILL))) {
- iter_flags = iter->flags & BTREE_ITER_WITH_UPDATES;
- iter->flags &= ~BTREE_ITER_WITH_UPDATES;
-
- old = bch2_btree_iter_peek_slot(iter);
- iter->flags |= iter_flags;
-
- ret = bkey_err(old);
- if (ret)
- return ret;
- } else {
- /*
- * If BTREE_ITER_CACHED_NOFILL was used, we better not be
- * running triggers that do anything on removal (alloc btree):
- */
- old = deleted;
- }
+ old = bch2_btree_path_peek_slot(path, &unpacked);
if (old.k->type == new->k.type &&
((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
- ret = bch2_mark_key_locked(c, old, bkey_i_to_s_c(new),
- trans->journal_res.seq,
+ ret = bch2_mark_key(trans, old, bkey_i_to_s_c(new),
BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
} else {
- ret = bch2_mark_key_locked(c, deleted, bkey_i_to_s_c(new),
- trans->journal_res.seq,
+ ret = bch2_mark_key(trans, deleted, bkey_i_to_s_c(new),
BTREE_TRIGGER_INSERT|flags) ?:
- bch2_mark_key_locked(c, old, deleted,
- trans->journal_res.seq,
+ bch2_mark_key(trans, old, deleted,
BTREE_TRIGGER_OVERWRITE|flags);
}
pr_err("%s", buf);
pr_err("overlapping with");
- if (btree_iter_type(i->iter) != BTREE_ITER_CACHED) {
- struct btree_iter *copy = bch2_trans_copy_iter(trans, i->iter);
- struct bkey_s_c k;
- int ret;
-
- for_each_btree_key_continue(copy, 0, k, ret) {
- if (btree_node_type_is_extents(i->iter->btree_id)
- ? bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) <= 0
- : bkey_cmp(i->k->k.p, k.k->p))
- break;
+ if (!i->cached) {
+ struct bkey u;
+ struct bkey_s_c k = bch2_btree_path_peek_slot(i->path, &u);
- bch2_bkey_val_to_text(&PBUF(buf), c, k);
- pr_err("%s", buf);
- }
- bch2_trans_iter_put(trans, copy);
+ bch2_bkey_val_to_text(&PBUF(buf), c, k);
+ pr_err("%s", buf);
} else {
- struct bkey_cached *ck = (void *) i->iter->l[0].b;
+ struct bkey_cached *ck = (void *) i->path->l[0].b;
if (ck->valid) {
bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k));
__WARN();
}
-void bch2_trans_fs_usage_apply(struct btree_trans *trans,
- struct replicas_delta_list *deltas)
+int bch2_trans_fs_usage_apply(struct btree_trans *trans,
+ struct replicas_delta_list *deltas)
{
struct bch_fs *c = trans->c;
static int warned_disk_usage = 0;
bool warn = false;
unsigned disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
- struct replicas_delta *d = deltas->d;
+ struct replicas_delta *d = deltas->d, *d2;
struct replicas_delta *top = (void *) deltas->d + deltas->used;
struct bch_fs_usage *dst;
s64 added = 0, should_not_have_added;
unsigned i;
- percpu_rwsem_assert_held(&c->mark_lock);
-
+ percpu_down_read(&c->mark_lock);
preempt_disable();
dst = fs_usage_ptr(c, trans->journal_res.seq, false);
added += d->delta;
}
- BUG_ON(__update_replicas(c, dst, &d->r, d->delta));
+ if (__update_replicas(c, dst, &d->r, d->delta))
+ goto need_mark;
}
dst->nr_inodes += deltas->nr_inodes;
}
preempt_enable();
+ percpu_up_read(&c->mark_lock);
if (unlikely(warn) && !xchg(&warned_disk_usage, 1))
fs_usage_apply_warn(trans, disk_res_sectors, should_not_have_added);
+ return 0;
+need_mark:
+ /* revert changes: */
+ for (d2 = deltas->d; d2 != d; d2 = replicas_delta_next(d2))
+ BUG_ON(__update_replicas(c, dst, &d2->r, -d2->delta));
+
+ preempt_enable();
+ percpu_up_read(&c->mark_lock);
+ return -1;
}
/* trans_mark: */
-static struct btree_iter *trans_get_update(struct btree_trans *trans,
- enum btree_id btree_id, struct bpos pos,
- struct bkey_s_c *k)
-{
- struct btree_insert_entry *i;
-
- trans_for_each_update(trans, i)
- if (i->iter->btree_id == btree_id &&
- (btree_node_type_is_extents(btree_id)
- ? bkey_cmp(pos, bkey_start_pos(&i->k->k)) >= 0 &&
- bkey_cmp(pos, i->k->k.p) < 0
- : !bkey_cmp(pos, i->iter->pos))) {
- *k = bkey_i_to_s_c(i->k);
-
- /* ugly hack.. */
- BUG_ON(btree_iter_live(trans, i->iter));
- trans->iters_live |= 1ULL << i->iter->idx;
- return i->iter;
- }
-
- return NULL;
-}
-
-static struct bkey_alloc_buf *
-bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter,
+static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter,
const struct bch_extent_ptr *ptr,
struct bkey_alloc_unpacked *u)
{
struct bch_fs *c = trans->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr));
- struct bucket *g;
- struct btree_iter *iter;
struct bkey_s_c k;
- struct bkey_alloc_buf *a;
int ret;
- a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
- if (IS_ERR(a))
- return a;
-
- iter = trans_get_update(trans, BTREE_ID_alloc, pos, &k);
- if (iter) {
- *u = bch2_alloc_unpack(k);
- } else {
- iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, pos,
- BTREE_ITER_CACHED|
- BTREE_ITER_CACHED_NOFILL|
- BTREE_ITER_INTENT);
- ret = bch2_btree_iter_traverse(iter);
- if (ret) {
- bch2_trans_iter_put(trans, iter);
- return ERR_PTR(ret);
- }
-
- percpu_down_read(&c->mark_lock);
- g = bucket(ca, pos.offset);
- *u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark));
- percpu_up_read(&c->mark_lock);
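+ /*
+ * BTREE_ITER_WITH_UPDATES makes the iterator see keys from this
+ * transaction's own pending updates, replacing the old manual scan
+ * of the update list:
+ */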
+ bch2_trans_iter_init(trans, iter, BTREE_ID_alloc,
+ POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)),
+ BTREE_ITER_WITH_UPDATES|
+ BTREE_ITER_CACHED|
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret) {
+ bch2_trans_iter_exit(trans, iter);
+ return ret;
}
- *_iter = iter;
- return a;
+ *u = bch2_alloc_unpack(k);
+ return 0;
}
static int bch2_trans_mark_pointer(struct btree_trans *trans,
struct bkey_s_c k, struct extent_ptr_decoded p,
s64 sectors, enum bch_data_type data_type)
{
- struct bch_fs *c = trans->c;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_alloc_unpacked u;
- struct bkey_alloc_buf *a;
int ret;
- a = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u);
- if (IS_ERR(a))
- return PTR_ERR(a);
+ ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u);
+ if (ret)
+ return ret;
- ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, u.gen, &u.data_type,
+ ret = __mark_pointer(trans, k, &p.ptr, sectors, data_type,
+ u.gen, &u.data_type,
&u.dirty_sectors, &u.cached_sectors);
if (ret)
goto out;
- bch2_alloc_pack(c, a, u);
- bch2_trans_update(trans, iter, &a->k, 0);
+ ret = bch2_alloc_write(trans, &iter, &u, 0);
out:
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
s64 sectors, enum bch_data_type data_type)
{
struct bch_fs *c = trans->c;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
struct bkey_i_stripe *s;
struct bch_replicas_padded r;
int ret = 0;
- iter = bch2_trans_get_iter(trans, BTREE_ID_stripes, POS(0, p.ec.idx),
- BTREE_ITER_INTENT|
- BTREE_ITER_WITH_UPDATES);
- k = bch2_btree_iter_peek_slot(iter);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes, POS(0, p.ec.idx),
+ BTREE_ITER_INTENT|
+ BTREE_ITER_WITH_UPDATES);
+ k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
goto err;
stripe_blockcount_set(&s->v, p.ec.block,
stripe_blockcount_get(&s->v, p.ec.block) +
sectors);
- bch2_trans_update(trans, iter, &s->k_i, 0);
+
+ ret = bch2_trans_update(trans, &iter, &s->k_i, 0);
+ if (ret)
+ goto err;
bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i));
r.e.data_type = data_type;
update_replicas_list(trans, &r.e, sectors);
err:
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
? BCH_DATA_btree
: BCH_DATA_user;
s64 sectors = bkey_is_btree_ptr(k.k)
- ? c->opts.btree_node_size
+ ? btree_sectors(c)
: k.k->size;
s64 dirty_sectors = 0;
bool stale;
int ret;
- BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) ==
- (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE));
-
- if (flags & BTREE_TRIGGER_OVERWRITE)
- sectors = -sectors;
-
r.e.data_type = data_type;
r.e.nr_devs = 0;
r.e.nr_required = 1;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
s64 disk_sectors = ptr_disk_sectors(sectors, p);
+ if (flags & BTREE_TRIGGER_OVERWRITE)
+ disk_sectors = -disk_sectors;
+
ret = bch2_trans_mark_pointer(trans, k, p,
disk_sectors, data_type);
if (ret < 0)
return 0;
}
-static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans,
- struct bkey_s_c_stripe s,
- unsigned idx, bool deleting)
+static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
+ struct bkey_s_c_stripe s,
+ unsigned idx, bool deleting)
{
struct bch_fs *c = trans->c;
const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
- struct bkey_alloc_buf *a;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_alloc_unpacked u;
- bool parity = idx >= s.v->nr_blocks - s.v->nr_redundant;
+ enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant
+ ? BCH_DATA_parity : 0;
+ s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0;
int ret = 0;
- a = bch2_trans_start_alloc_update(trans, &iter, ptr, &u);
- if (IS_ERR(a))
- return PTR_ERR(a);
-
- if (parity) {
- s64 sectors = le16_to_cpu(s.v->sectors);
+ if (deleting)
+ sectors = -sectors;
- if (deleting)
- sectors = -sectors;
+ ret = bch2_trans_start_alloc_update(trans, &iter, ptr, &u);
+ if (ret)
+ return ret;
- u.dirty_sectors += sectors;
- u.data_type = u.dirty_sectors
- ? BCH_DATA_parity
- : 0;
- }
+ ret = check_bucket_ref(c, s.s_c, ptr, sectors, data_type,
+ u.gen, u.data_type,
+ u.dirty_sectors, u.cached_sectors);
+ if (ret)
+ goto err;
if (!deleting) {
- if (bch2_fs_inconsistent_on(u.stripe && u.stripe != s.k->p.offset, c,
- "bucket %llu:%llu gen %u: multiple stripes using same bucket (%u, %llu)",
- iter->pos.inode, iter->pos.offset, u.gen,
+ if (bch2_fs_inconsistent_on(u.stripe ||
+ u.stripe_redundancy, c,
+ "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)",
+ iter.pos.inode, iter.pos.offset, u.gen,
+ bch2_data_types[u.data_type],
+ u.dirty_sectors,
u.stripe, s.k->p.offset)) {
ret = -EIO;
goto err;
}
+ if (bch2_fs_inconsistent_on(data_type && u.dirty_sectors, c,
+ "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu",
+ iter.pos.inode, iter.pos.offset, u.gen,
+ bch2_data_types[u.data_type],
+ u.dirty_sectors,
+ s.k->p.offset)) {
+ ret = -EIO;
+ goto err;
+ }
+
u.stripe = s.k->p.offset;
u.stripe_redundancy = s.v->nr_redundant;
} else {
+ if (bch2_fs_inconsistent_on(u.stripe != s.k->p.offset ||
+ u.stripe_redundancy != s.v->nr_redundant, c,
+ "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)",
+ iter.pos.inode, iter.pos.offset, u.gen,
+ s.k->p.offset, u.stripe)) {
+ ret = -EIO;
+ goto err;
+ }
+
u.stripe = 0;
u.stripe_redundancy = 0;
}
- bch2_alloc_pack(c, a, u);
- bch2_trans_update(trans, iter, &a->k, 0);
+ u.dirty_sectors += sectors;
+ if (data_type)
+ u.data_type = !deleting ? data_type : 0;
+
+ ret = bch2_alloc_write(trans, &iter, &u, 0);
err:
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
struct bkey_s_c_stripe old_s = { .k = NULL };
struct bkey_s_c_stripe new_s = { .k = NULL };
struct bch_replicas_padded r;
- unsigned i;
+ unsigned i, nr_blocks;
int ret = 0;
if (old.k->type == KEY_TYPE_stripe)
new_s.v->nr_blocks * sizeof(struct bch_extent_ptr)))
return 0;
+ BUG_ON(new_s.k && old_s.k &&
+ (new_s.v->nr_blocks != old_s.v->nr_blocks ||
+ new_s.v->nr_redundant != old_s.v->nr_redundant));
+
+ nr_blocks = new_s.k ? new_s.v->nr_blocks : old_s.v->nr_blocks;
+
if (new_s.k) {
s64 sectors = le16_to_cpu(new_s.v->sectors);
bch2_bkey_to_replicas(&r.e, new);
update_replicas_list(trans, &r.e, sectors * new_s.v->nr_redundant);
-
- for (i = 0; i < new_s.v->nr_blocks; i++) {
- ret = bch2_trans_mark_stripe_alloc_ref(trans, new_s,
- i, false);
- if (ret)
- return ret;
- }
}
if (old_s.k) {
bch2_bkey_to_replicas(&r.e, old);
update_replicas_list(trans, &r.e, sectors * old_s.v->nr_redundant);
+ }
- for (i = 0; i < old_s.v->nr_blocks; i++) {
- ret = bch2_trans_mark_stripe_alloc_ref(trans, old_s,
- i, true);
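+ /* only touch buckets whose stripe pointer actually changed: */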
+ for (i = 0; i < nr_blocks; i++) {
+ if (new_s.k && old_s.k &&
+ !memcmp(&new_s.v->ptrs[i],
+ &old_s.v->ptrs[i],
+ sizeof(new_s.v->ptrs[i])))
+ continue;
+
+ if (new_s.k) {
+ ret = bch2_trans_mark_stripe_bucket(trans, new_s, i, false);
if (ret)
- return ret;
+ break;
+ }
+
+ if (old_s.k) {
+ ret = bch2_trans_mark_stripe_bucket(trans, old_s, i, true);
+ if (ret)
+ break;
}
}
struct bkey_s_c new,
unsigned flags)
{
- int nr = (new.k->type == KEY_TYPE_inode) -
- (old.k->type == KEY_TYPE_inode);
+ int nr = bkey_is_inode(new.k) - bkey_is_inode(old.k);
if (nr) {
struct replicas_delta_list *d =
s64 sectors = (s64) k.k->size;
struct replicas_delta_list *d;
- BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) ==
- (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE));
-
if (flags & BTREE_TRIGGER_OVERWRITE)
sectors = -sectors;
sectors *= replicas;
static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
struct bkey_s_c_reflink_p p,
- u64 idx, unsigned flags)
+ u64 *idx, unsigned flags)
{
struct bch_fs *c = trans->c;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
struct bkey_i *n;
__le64 *refcount;
int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
- s64 ret;
+ char buf[200];
+ int ret;
- iter = bch2_trans_get_iter(trans, BTREE_ID_reflink, POS(0, idx),
- BTREE_ITER_INTENT|
- BTREE_ITER_WITH_UPDATES);
- k = bch2_btree_iter_peek_slot(iter);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, POS(0, *idx),
+ BTREE_ITER_INTENT|
+ BTREE_ITER_WITH_UPDATES);
+ k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
goto err;
refcount = bkey_refcount(n);
if (!refcount) {
+ bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c);
bch2_fs_inconsistent(c,
- "%llu:%llu len %u points to nonexistent indirect extent %llu",
- p.k->p.inode, p.k->p.offset, p.k->size, idx);
- bch2_inconsistent_error(c);
+ "nonexistent indirect extent at %llu while marking\n %s",
+ *idx, buf);
ret = -EIO;
goto err;
}
- BUG_ON(!*refcount && (flags & BTREE_TRIGGER_OVERWRITE));
+ if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) {
+ bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c);
+ bch2_fs_inconsistent(c,
+ "indirect extent refcount underflow at %llu while marking\n %s",
+ *idx, buf);
+ ret = -EIO;
+ goto err;
+ }
+
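+ /*
+ * Grow front_pad/back_pad to cover however far the referenced
+ * indirect extents extend past [idx, idx + size), so overwrites
+ * update refcounts over the same padded range:
+ */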
+ if (flags & BTREE_TRIGGER_INSERT) {
+ struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
+ u64 pad;
+
+ pad = max_t(s64, le32_to_cpu(v->front_pad),
+ le64_to_cpu(v->idx) - bkey_start_offset(k.k));
+ BUG_ON(pad > U32_MAX);
+ v->front_pad = cpu_to_le32(pad);
+
+ pad = max_t(s64, le32_to_cpu(v->back_pad),
+ k.k->p.offset - p.k->size - le64_to_cpu(v->idx));
+ BUG_ON(pad > U32_MAX);
+ v->back_pad = cpu_to_le32(pad);
+ }
+
le64_add_cpu(refcount, add);
if (!*refcount) {
set_bkey_val_u64s(&n->k, 0);
}
- bch2_btree_iter_set_pos_to_extent_start(iter);
- ret = bch2_trans_update(trans, iter, n, 0);
+ bch2_btree_iter_set_pos_to_extent_start(&iter);
+ ret = bch2_trans_update(trans, &iter, n, 0);
if (ret)
goto err;
- ret = k.k->p.offset - idx;
+ *idx = k.k->p.offset;
err:
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
struct bkey_s_c k, unsigned flags)
{
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
- u64 idx = le64_to_cpu(p.v->idx);
- unsigned sectors = p.k->size;
- s64 ret = 0;
+ u64 idx, end_idx;
+ int ret = 0;
- while (sectors) {
- ret = __bch2_trans_mark_reflink_p(trans, p, idx, flags);
- if (ret < 0)
- return ret;
+ if (flags & BTREE_TRIGGER_INSERT) {
+ struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
- ret = min_t(s64, ret, sectors);
- idx += ret;
- sectors -= ret;
+ v->front_pad = v->back_pad = 0;
}
- return 0;
+ idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad);
+ end_idx = le64_to_cpu(p.v->idx) + p.k->size +
+ le32_to_cpu(p.v->back_pad);
+
+ while (idx < end_idx && !ret)
+ ret = __bch2_trans_mark_reflink_p(trans, p, &idx, flags);
+
+ return ret;
}
int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old,
struct bkey_s_c new, unsigned flags)
{
- struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old;
-
- BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)));
+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
switch (k.k->type) {
case KEY_TYPE_btree_ptr:
case KEY_TYPE_stripe:
return bch2_trans_mark_stripe(trans, old, new, flags);
case KEY_TYPE_inode:
+ case KEY_TYPE_inode_v2:
return bch2_trans_mark_inode(trans, old, new, flags);
case KEY_TYPE_reservation:
return bch2_trans_mark_reservation(trans, k, flags);
}
}
-int bch2_trans_mark_update(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_i *new,
- unsigned flags)
-{
- struct bkey _deleted = KEY(0, 0, 0);
- struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL };
- struct bkey_s_c old;
- int iter_flags, ret;
-
- if (unlikely(flags & BTREE_TRIGGER_NORUN))
- return 0;
-
- if (!btree_node_type_needs_gc(iter->btree_id))
- return 0;
-
-
- if (likely(!(iter->flags & BTREE_ITER_CACHED_NOFILL))) {
- iter_flags = iter->flags & BTREE_ITER_WITH_UPDATES;
- iter->flags &= ~BTREE_ITER_WITH_UPDATES;
-
- old = bch2_btree_iter_peek_slot(iter);
- iter->flags |= iter_flags;
-
- ret = bkey_err(old);
- if (ret)
- return ret;
- } else {
- /*
- * If BTREE_ITER_CACHED_NOFILL was used, we better not be
- * running triggers that do anything on removal (alloc btree):
- */
- old = deleted;
- }
-
- if (old.k->type == new->k.type &&
- ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
- ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new),
- BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
- } else {
- ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(new),
- BTREE_TRIGGER_INSERT|flags) ?:
- bch2_trans_mark_key(trans, old, deleted,
- BTREE_TRIGGER_OVERWRITE|flags);
- }
-
- return ret;
-}
-
static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
struct bch_dev *ca, size_t b,
enum bch_data_type type,
unsigned sectors)
{
struct bch_fs *c = trans->c;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_alloc_unpacked u;
- struct bkey_alloc_buf *a;
struct bch_extent_ptr ptr = {
.dev = ca->dev_idx,
.offset = bucket_to_sector(ca, b),
if (b >= ca->mi.nbuckets)
return 0;
- a = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u);
- if (IS_ERR(a))
- return PTR_ERR(a);
+ ret = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u);
+ if (ret)
+ return ret;
if (u.data_type && u.data_type != type) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
- iter->pos.inode, iter->pos.offset, u.gen,
+ iter.pos.inode, iter.pos.offset, u.gen,
bch2_data_types[u.data_type],
bch2_data_types[type],
bch2_data_types[type]);
u.data_type = type;
u.dirty_sectors = sectors;
- bch2_alloc_pack(c, a, u);
- bch2_trans_update(trans, iter, &a->k, 0);
+ ret = bch2_alloc_write(trans, &iter, &u, 0);
out:
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
container_of(rcu, struct bucket_array, rcu);
kvpfree(buckets,
- sizeof(struct bucket_array) +
+ sizeof(*buckets) +
buckets->nbuckets * sizeof(struct bucket));
}
+static void bucket_gens_free_rcu(struct rcu_head *rcu)
+{
+ struct bucket_gens *buckets =
+ container_of(rcu, struct bucket_gens, rcu);
+
+ kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets);
+}
+
int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
{
struct bucket_array *buckets = NULL, *old_buckets = NULL;
+ struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL;
unsigned long *buckets_nouse = NULL;
alloc_fifo free[RESERVE_NR];
alloc_fifo free_inc;
alloc_heap alloc_heap;
size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
- ca->mi.bucket_size / c->opts.btree_node_size);
+ ca->mi.bucket_size / btree_sectors(c));
/* XXX: these should be tunable */
size_t reserve_none = max_t(size_t, 1, nbuckets >> 9);
size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 6);
if (!(buckets = kvpmalloc(sizeof(struct bucket_array) +
nbuckets * sizeof(struct bucket),
GFP_KERNEL|__GFP_ZERO)) ||
- !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) *
- sizeof(unsigned long),
+ !(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets,
GFP_KERNEL|__GFP_ZERO)) ||
+ (c->opts.buckets_nouse &&
+ !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) *
+ sizeof(unsigned long),
+ GFP_KERNEL|__GFP_ZERO))) ||
!init_fifo(&free[RESERVE_MOVINGGC],
copygc_reserve, GFP_KERNEL) ||
!init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
buckets->first_bucket = ca->mi.first_bucket;
buckets->nbuckets = nbuckets;
+ bucket_gens->first_bucket = ca->mi.first_bucket;
+ bucket_gens->nbuckets = nbuckets;
bch2_copygc_stop(c);
}
old_buckets = bucket_array(ca);
+ old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1);
if (resize) {
size_t n = min(buckets->nbuckets, old_buckets->nbuckets);
memcpy(buckets->b,
old_buckets->b,
n * sizeof(struct bucket));
- memcpy(buckets_nouse,
- ca->buckets_nouse,
- BITS_TO_LONGS(n) * sizeof(unsigned long));
+ memcpy(bucket_gens->b,
+ old_bucket_gens->b,
+ n);
+ if (buckets_nouse)
+ memcpy(buckets_nouse,
+ ca->buckets_nouse,
+ BITS_TO_LONGS(n) * sizeof(unsigned long));
}
rcu_assign_pointer(ca->buckets[0], buckets);
- buckets = old_buckets;
+ rcu_assign_pointer(ca->bucket_gens, bucket_gens);
+ buckets = old_buckets;
+ bucket_gens = old_bucket_gens;
swap(ca->buckets_nouse, buckets_nouse);
free_fifo(&free[i]);
kvpfree(buckets_nouse,
BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
+ if (bucket_gens)
+ call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu);
if (buckets)
- call_rcu(&old_buckets->rcu, buckets_free_rcu);
+ call_rcu(&buckets->rcu, buckets_free_rcu);
return ret;
}
free_fifo(&ca->free[i]);
kvpfree(ca->buckets_nouse,
BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
+ kvpfree(rcu_dereference_protected(ca->bucket_gens, 1),
+ sizeof(struct bucket_gens) + ca->mi.nbuckets);
kvpfree(rcu_dereference_protected(ca->buckets[0], 1),
sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket));
return buckets->b + b;
}
+static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
+{
+ return __bucket(ca, b, true);
+}
+
static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
{
return __bucket(ca, b, false);
}
+static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
+{
+ return rcu_dereference_check(ca->bucket_gens,
+ !ca->fs ||
+ percpu_rwsem_is_held(&ca->fs->mark_lock) ||
+ lockdep_is_held(&ca->fs->gc_lock) ||
+ lockdep_is_held(&ca->bucket_lock));
+}
+
+static inline u8 *bucket_gen(struct bch_dev *ca, size_t b)
+{
+ struct bucket_gens *gens = bucket_gens(ca);
+
+ BUG_ON(b < gens->first_bucket || b >= gens->nbuckets);
+ return gens->b + b;
+}
+
/*
* bucket_gc_gen() returns the difference between the bucket's current gen and
* the oldest gen of any pointer into that bucket in the btree.
return sector_to_bucket(ca, ptr->offset);
}
-static inline struct bucket *PTR_BUCKET(struct bch_dev *ca,
- const struct bch_extent_ptr *ptr,
- bool gc)
+static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca,
+ const struct bch_extent_ptr *ptr)
{
- return __bucket(ca, PTR_BUCKET_NR(ca, ptr), gc);
+ return gc_bucket(ca, PTR_BUCKET_NR(ca, ptr));
}
static inline enum bch_data_type ptr_data_type(const struct bkey *k,
return ptr->cached ? BCH_DATA_cached : BCH_DATA_user;
}
-static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca,
- const struct bch_extent_ptr *ptr)
-{
- struct bucket_mark m;
-
- rcu_read_lock();
- m = READ_ONCE(PTR_BUCKET(ca, ptr, 0)->mark);
- rcu_read_unlock();
-
- return m;
-}
-
static inline int gen_cmp(u8 a, u8 b)
{
return (s8) (a - b);
static inline u8 ptr_stale(struct bch_dev *ca,
const struct bch_extent_ptr *ptr)
{
- return gen_after(ptr_bucket_mark(ca, ptr).gen, ptr->gen);
-}
+ u8 ret;
-/* bucket gc marks */
+ rcu_read_lock();
+ ret = gen_after(*bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)), ptr->gen);
+ rcu_read_unlock();
-static inline unsigned bucket_sectors_used(struct bucket_mark mark)
-{
- return mark.dirty_sectors + mark.cached_sectors;
+ return ret;
}
+/* bucket gc marks */
+
static inline bool is_available_bucket(struct bucket_mark mark)
{
return !mark.dirty_sectors && !mark.stripe;
}
-static inline bool bucket_needs_journal_commit(struct bucket_mark m,
- u16 last_seq_ondisk)
-{
- return m.journal_seq_valid &&
- ((s16) m.journal_seq - (s16) last_seq_ondisk > 0);
-}
-
/* Device usage: */
struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *);
/* key/bucket marking: */
-void bch2_bucket_seq_cleanup(struct bch_fs *);
void bch2_fs_usage_initialize(struct bch_fs *);
void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool);
size_t, enum bch_data_type, unsigned,
struct gc_pos, unsigned);
-int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned);
+int bch2_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
-int bch2_mark_update(struct btree_trans *, struct btree_iter *,
+int bch2_mark_update(struct btree_trans *, struct btree_path *,
struct bkey_i *, unsigned);
int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c,
struct bkey_s_c, unsigned);
-int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter,
- struct bkey_i *insert, unsigned);
-void bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
+int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *,
size_t, enum bch_data_type, unsigned);
u8 gen;
u8 data_type:3,
owned_by_allocator:1,
- journal_seq_valid:1,
stripe:1;
u16 dirty_sectors;
u16 cached_sectors;
-
- /*
- * low bits of journal sequence number when this bucket was most
- * recently modified: if journal_seq_valid is set, this bucket can't be
- * reused until the journal sequence number written to disk is >= the
- * bucket's journal sequence number:
- */
- u16 journal_seq;
};
};
};
u64 io_time[2];
u8 oldest_gen;
- u8 gc_gen;
unsigned gen_valid:1;
u8 stripe_redundancy;
u32 stripe;
struct bucket b[];
};
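+/*
+ * Compact RCU-managed array holding just the 8-bit generation number of each
+ * bucket, so paths like ptr_stale() can check gens without touching the much
+ * larger struct bucket:
+ */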
+struct bucket_gens {
+ struct rcu_head rcu;
+ u16 first_bucket;
+ size_t nbuckets;
+ u8 b[];
+};
+
struct bch_dev_usage {
u64 buckets_ec;
u64 buckets_unavailable;
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "buckets_waiting_for_journal.h"
+#include <linux/random.h>
+
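+/*
+ * Tracks the journal sequence number of the most recent update to each
+ * bucket: a small hash table with cuckoo-style eviction across three
+ * independent siphash seeds. Entries whose journal_seq has already been
+ * flushed count as empty.
+ */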
+static inline struct bucket_hashed *
+bucket_hash(struct buckets_waiting_for_journal_table *t,
+ unsigned hash_seed_idx, u64 dev_bucket)
+{
+ unsigned h = siphash_1u64(dev_bucket, &t->hash_seeds[hash_seed_idx]);
+
+ BUG_ON(!is_power_of_2(t->size));
+
+ return t->d + (h & (t->size - 1));
+}
+
+static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_t size)
+{
+ unsigned i;
+
+ t->size = size;
+ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++)
+ get_random_bytes(&t->hash_seeds[i], sizeof(t->hash_seeds[i]));
+ memset(t->d, 0, sizeof(t->d[0]) * size);
+}
+
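+/*
+ * Returns true if the bucket was dirtied in a journal entry that hasn't yet
+ * been flushed, and so can't safely be reused (the successor to the old
+ * journal_seq_valid tracking in struct bucket_mark):
+ */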
+bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
+ u64 flushed_seq,
+ unsigned dev, u64 bucket)
+{
+ struct buckets_waiting_for_journal_table *t;
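+ /* device index in the top 8 bits, bucket number in the low 56: */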
+ u64 dev_bucket = (u64) dev << 56 | bucket;
+ bool ret = false;
+ unsigned i;
+
+ mutex_lock(&b->lock);
+ t = b->t;
+
+ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
+ struct bucket_hashed *h = bucket_hash(t, i, dev_bucket);
+
+ if (h->dev_bucket == dev_bucket) {
+ ret = h->journal_seq > flushed_seq;
+ break;
+ }
+ }
+
+ mutex_unlock(&b->lock);
+
+ return ret;
+}
+
+static bool bucket_table_insert(struct buckets_waiting_for_journal_table *t,
+ struct bucket_hashed *new,
+ u64 flushed_seq)
+{
+ struct bucket_hashed *last_evicted = NULL;
+ unsigned tries, i;
+
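+ /*
+ * Cuckoo-style insert: try each hash seed in turn, evicting an
+ * existing entry when no slot is free, for a bounded number of
+ * rounds:
+ */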
+ for (tries = 0; tries < 10; tries++) {
+ struct bucket_hashed *old, *victim = NULL;
+
+ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
+ old = bucket_hash(t, i, new->dev_bucket);
+
+ if (old->dev_bucket == new->dev_bucket ||
+ old->journal_seq <= flushed_seq) {
+ *old = *new;
+ return true;
+ }
+
+ if (last_evicted != old)
+ victim = old;
+ }
+
+ /* hashed to same slot 3 times: */
+ if (!victim)
+ break;
+
+ /* Failed to find an empty slot: */
+ swap(*new, *victim);
+ last_evicted = victim;
+ }
+
+ return false;
+}
+
+int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
+ u64 flushed_seq,
+ unsigned dev, u64 bucket,
+ u64 journal_seq)
+{
+ struct buckets_waiting_for_journal_table *t, *n;
+ struct bucket_hashed tmp, new = {
+ .dev_bucket = (u64) dev << 56 | bucket,
+ .journal_seq = journal_seq,
+ };
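+ /* nr_elements starts at 1 to count the entry being inserted: */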
+ size_t i, new_size, nr_elements = 1, nr_rehashes = 0;
+ int ret = 0;
+
+ mutex_lock(&b->lock);
+
+ if (likely(bucket_table_insert(b->t, &new, flushed_seq)))
+ goto out;
+
+ t = b->t;
+ for (i = 0; i < t->size; i++)
+ nr_elements += t->d[i].journal_seq > flushed_seq;
+
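+ /*
+ * Double only when at least a third of the slots are still live;
+ * otherwise rehash at the same size with fresh seeds:
+ */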
+ new_size = nr_elements < t->size / 3 ? t->size : t->size * 2;
+
+ n = kvmalloc(sizeof(*n) + sizeof(n->d[0]) * new_size, GFP_KERNEL);
+ if (!n) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+retry_rehash:
+ nr_rehashes++;
+ bucket_table_init(n, new_size);
+
+ tmp = new;
+ BUG_ON(!bucket_table_insert(n, &tmp, flushed_seq));
+
+ for (i = 0; i < t->size; i++) {
+ if (t->d[i].journal_seq <= flushed_seq)
+ continue;
+
+ tmp = t->d[i];
+ if (!bucket_table_insert(n, &tmp, flushed_seq))
+ goto retry_rehash;
+ }
+
+ b->t = n;
+ kvfree(t);
+
+ pr_debug("took %zu rehashes, table at %zu/%zu elements",
+ nr_rehashes, nr_elements, b->t->size);
+out:
+ mutex_unlock(&b->lock);
+
+ return ret;
+}
+
+void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c)
+{
+ struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
+
+ kvfree(b->t);
+}
+
+#define INITIAL_TABLE_SIZE 8
+
+int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c)
+{
+ struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
+
+ mutex_init(&b->lock);
+
+ b->t = kvmalloc(sizeof(*b->t) + sizeof(b->t->d[0]) * INITIAL_TABLE_SIZE, GFP_KERNEL);
+ if (!b->t)
+ return -ENOMEM;
+
+ bucket_table_init(b->t, INITIAL_TABLE_SIZE);
+ return 0;
+}
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BUCKETS_WAITING_FOR_JOURNAL_H
+#define _BUCKETS_WAITING_FOR_JOURNAL_H
+
+#include "buckets_waiting_for_journal_types.h"
+
+bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *,
+ u64, unsigned, u64);
+int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *,
+ u64, unsigned, u64, u64);
+
+void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *);
+int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *);
+
+#endif /* _BUCKETS_WAITING_FOR_JOURNAL_H */
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H
+#define _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H
+
+#include <linux/siphash.h>
+
+struct bucket_hashed {
+ u64 dev_bucket;
+ u64 journal_seq;
+};
+
+struct buckets_waiting_for_journal_table {
+ size_t size;
+ siphash_key_t hash_seeds[3];
+ struct bucket_hashed d[];
+};
+
+struct buckets_waiting_for_journal {
+ struct mutex lock;
+ struct buckets_waiting_for_journal_table *t;
+};
+
+#endif /* _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H */
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (!dev)
+ return -EINVAL;
+
for_each_online_member(ca, c, i)
- if (ca->disk_sb.bdev->bd_dev == dev) {
+ if (ca->dev == dev) {
percpu_ref_put(&ca->io_ref);
return i;
}
static void bch2_checksum_init(struct bch2_checksum_state *state)
{
switch (state->type) {
- case BCH_CSUM_NONE:
- case BCH_CSUM_CRC32C:
- case BCH_CSUM_CRC64:
+ case BCH_CSUM_none:
+ case BCH_CSUM_crc32c:
+ case BCH_CSUM_crc64:
state->seed = 0;
break;
- case BCH_CSUM_CRC32C_NONZERO:
+ case BCH_CSUM_crc32c_nonzero:
state->seed = U32_MAX;
break;
- case BCH_CSUM_CRC64_NONZERO:
+ case BCH_CSUM_crc64_nonzero:
state->seed = U64_MAX;
break;
- case BCH_CSUM_XXHASH:
+ case BCH_CSUM_xxhash:
xxh64_reset(&state->h64state, 0);
break;
default:
static u64 bch2_checksum_final(const struct bch2_checksum_state *state)
{
switch (state->type) {
- case BCH_CSUM_NONE:
- case BCH_CSUM_CRC32C:
- case BCH_CSUM_CRC64:
+ case BCH_CSUM_none:
+ case BCH_CSUM_crc32c:
+ case BCH_CSUM_crc64:
return state->seed;
- case BCH_CSUM_CRC32C_NONZERO:
+ case BCH_CSUM_crc32c_nonzero:
return state->seed ^ U32_MAX;
- case BCH_CSUM_CRC64_NONZERO:
+ case BCH_CSUM_crc64_nonzero:
return state->seed ^ U64_MAX;
- case BCH_CSUM_XXHASH:
+ case BCH_CSUM_xxhash:
return xxh64_digest(&state->h64state);
default:
BUG();
static void bch2_checksum_update(struct bch2_checksum_state *state, const void *data, size_t len)
{
switch (state->type) {
- case BCH_CSUM_NONE:
+ case BCH_CSUM_none:
return;
- case BCH_CSUM_CRC32C_NONZERO:
- case BCH_CSUM_CRC32C:
+ case BCH_CSUM_crc32c_nonzero:
+ case BCH_CSUM_crc32c:
state->seed = crc32c(state->seed, data, len);
break;
- case BCH_CSUM_CRC64_NONZERO:
- case BCH_CSUM_CRC64:
+ case BCH_CSUM_crc64_nonzero:
+ case BCH_CSUM_crc64:
state->seed = crc64_be(state->seed, data, len);
break;
- case BCH_CSUM_XXHASH:
+ case BCH_CSUM_xxhash:
xxh64_update(&state->h64state, data, len);
break;
default:
struct nonce nonce, const void *data, size_t len)
{
switch (type) {
- case BCH_CSUM_NONE:
- case BCH_CSUM_CRC32C_NONZERO:
- case BCH_CSUM_CRC64_NONZERO:
- case BCH_CSUM_CRC32C:
- case BCH_CSUM_XXHASH:
- case BCH_CSUM_CRC64: {
+ case BCH_CSUM_none:
+ case BCH_CSUM_crc32c_nonzero:
+ case BCH_CSUM_crc64_nonzero:
+ case BCH_CSUM_crc32c:
+ case BCH_CSUM_xxhash:
+ case BCH_CSUM_crc64: {
struct bch2_checksum_state state;
state.type = type;
return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) };
}
- case BCH_CSUM_CHACHA20_POLY1305_80:
- case BCH_CSUM_CHACHA20_POLY1305_128: {
+ case BCH_CSUM_chacha20_poly1305_80:
+ case BCH_CSUM_chacha20_poly1305_128: {
SHASH_DESC_ON_STACK(desc, c->poly1305);
u8 digest[POLY1305_DIGEST_SIZE];
struct bch_csum ret = { 0 };
struct bio_vec bv;
switch (type) {
- case BCH_CSUM_NONE:
+ case BCH_CSUM_none:
return (struct bch_csum) { 0 };
- case BCH_CSUM_CRC32C_NONZERO:
- case BCH_CSUM_CRC64_NONZERO:
- case BCH_CSUM_CRC32C:
- case BCH_CSUM_XXHASH:
- case BCH_CSUM_CRC64: {
+ case BCH_CSUM_crc32c_nonzero:
+ case BCH_CSUM_crc64_nonzero:
+ case BCH_CSUM_crc32c:
+ case BCH_CSUM_xxhash:
+ case BCH_CSUM_crc64: {
struct bch2_checksum_state state;
state.type = type;
return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) };
}
- case BCH_CSUM_CHACHA20_POLY1305_80:
- case BCH_CSUM_CHACHA20_POLY1305_128: {
+ case BCH_CSUM_chacha20_poly1305_80:
+ case BCH_CSUM_chacha20_poly1305_128: {
SHASH_DESC_ON_STACK(desc, c->poly1305);
u8 digest[POLY1305_DIGEST_SIZE];
struct bch_csum ret = { 0 };
}
#ifdef __KERNEL__
-int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
+static int __bch2_request_key(char *key_description, struct bch_key *key)
{
- char key_description[60];
struct key *keyring_key;
const struct user_key_payload *ukp;
int ret;
- snprintf(key_description, sizeof(key_description),
- "bcachefs:%pUb", &sb->user_uuid);
-
keyring_key = request_key(&key_type_logon, key_description, NULL);
if (IS_ERR(keyring_key))
return PTR_ERR(keyring_key);
}
#else
#include <keyutils.h>
-#include <uuid/uuid.h>
-int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
+static int __bch2_request_key(char *key_description, struct bch_key *key)
{
key_serial_t key_id;
- char key_description[60];
- char uuid[40];
-
- uuid_unparse_lower(sb->user_uuid.b, uuid);
- sprintf(key_description, "bcachefs:%s", uuid);
key_id = request_key("user", key_description, NULL,
KEY_SPEC_USER_KEYRING);
}
#endif
+int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
+{
+ char key_description[60];
+ char uuid[40];
+
+ uuid_unparse_lower(sb->user_uuid.b, uuid);
+ sprintf(key_description, "bcachefs:%s", uuid);
+
+ return __bch2_request_key(key_description, key);
+}
+
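With the description now built in exactly one place, the kernel and userspace
halves can no longer drift apart on the key format. A standalone sketch of the
string both hand to request_key() (link with -luuid; uuid_generate() just
fabricates an example UUID here):

    #include <stdio.h>
    #include <uuid/uuid.h>

    int main(void)
    {
        uuid_t u;
        char uuid_str[40], desc[60];

        uuid_generate(u);                 /* any UUID works for the demo */
        uuid_unparse_lower(u, uuid_str);  /* 36 chars + NUL */
        snprintf(desc, sizeof(desc), "bcachefs:%s", uuid_str);

        printf("%s\n", desc);             /* e.g. bcachefs:5e8c...-... */
        return 0;
    }
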
int bch2_decrypt_sb_key(struct bch_fs *c,
struct bch_sb_field_crypt *crypt,
struct bch_key *key)
{
switch (type) {
- case BCH_CSUM_NONE:
- case BCH_CSUM_CRC32C:
- case BCH_CSUM_CRC64:
+ case BCH_CSUM_none:
+ case BCH_CSUM_crc32c:
+ case BCH_CSUM_crc64:
return true;
default:
return false;
{
switch (type) {
case BCH_CSUM_OPT_none:
- return BCH_CSUM_NONE;
+ return BCH_CSUM_none;
case BCH_CSUM_OPT_crc32c:
- return data ? BCH_CSUM_CRC32C : BCH_CSUM_CRC32C_NONZERO;
+ return data ? BCH_CSUM_crc32c : BCH_CSUM_crc32c_nonzero;
case BCH_CSUM_OPT_crc64:
- return data ? BCH_CSUM_CRC64 : BCH_CSUM_CRC64_NONZERO;
+ return data ? BCH_CSUM_crc64 : BCH_CSUM_crc64_nonzero;
case BCH_CSUM_OPT_xxhash:
- return BCH_CSUM_XXHASH;
+ return BCH_CSUM_xxhash;
default:
BUG();
}
{
if (c->sb.encryption_type)
return c->opts.wide_macs
- ? BCH_CSUM_CHACHA20_POLY1305_128
- : BCH_CSUM_CHACHA20_POLY1305_80;
+ ? BCH_CSUM_chacha20_poly1305_128
+ : BCH_CSUM_chacha20_poly1305_80;
return bch2_csum_opt_to_type(opt, true);
}
static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
{
if (c->sb.encryption_type)
- return BCH_CSUM_CHACHA20_POLY1305_128;
+ return BCH_CSUM_chacha20_poly1305_128;
return bch2_csum_opt_to_type(c->opts.metadata_checksum, false);
}
{
void *b;
- BUG_ON(size > c->sb.encoded_extent_max << 9);
+ BUG_ON(size > c->opts.encoded_extent_max);
b = kmalloc(size, GFP_NOIO|__GFP_NOWARN);
if (b)
struct page **pages = NULL;
void *data;
- BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max);
+ BUG_ON(start.bi_size > c->opts.encoded_extent_max);
if (!PageHighMem(bio_iter_page(bio, start)) &&
bio_phys_contig(bio, start))
BUG_ON(!bio->bi_vcnt);
BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs);
- if (crc->uncompressed_size > c->sb.encoded_extent_max ||
- crc->compressed_size > c->sb.encoded_extent_max) {
+ if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max ||
+ crc->compressed_size << 9 > c->opts.encoded_extent_max) {
bch_err(c, "error rewriting existing data: extent too big");
return -EIO;
}
size_t dst_len = crc.uncompressed_size << 9;
int ret = -ENOMEM;
- if (crc.uncompressed_size > c->sb.encoded_extent_max ||
- crc.compressed_size > c->sb.encoded_extent_max)
+ if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max ||
+ crc.compressed_size << 9 > c->opts.encoded_extent_max)
return -EIO;
dst_data = dst_len == dst_iter.bi_size
BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type]));
/* If it's only one block, don't bother trying to compress: */
- if (bio_sectors(src) <= c->opts.block_size)
+ if (src->bi_iter.bi_size <= c->opts.block_size)
return 0;
dst_data = bio_map_or_bounce(c, dst, WRITE);
/* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */
src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size,
- c->sb.encoded_extent_max << 9);
+ c->opts.encoded_extent_max);
/* Don't generate a bigger output than input: */
dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
{
- size_t max_extent = c->sb.encoded_extent_max << 9;
size_t decompress_workspace_size = 0;
bool decompress_workspace_needed;
- ZSTD_parameters params = ZSTD_getParams(0, max_extent, 0);
+ ZSTD_parameters params = ZSTD_getParams(0, c->opts.encoded_extent_max, 0);
struct {
unsigned feature;
unsigned type;
if (!mempool_initialized(&c->compression_bounce[READ])) {
ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[READ],
- 1, max_extent);
+ 1, c->opts.encoded_extent_max);
if (ret)
goto out;
}
if (!mempool_initialized(&c->compression_bounce[WRITE])) {
ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE],
- 1, max_extent);
+ 1, c->opts.encoded_extent_max);
if (ret)
goto out;
}
{
struct dump_iter *i = file->private_data;
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
int err;
bch2_trans_init(&trans, i->c, 0, 0);
- iter = bch2_trans_get_iter(&trans, i->id, i->from,
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS);
- k = bch2_btree_iter_peek(iter);
+ bch2_trans_iter_init(&trans, &iter, i->id, i->from,
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS);
+ k = bch2_btree_iter_peek(&iter);
while (k.k && !(err = bkey_err(k))) {
bch2_bkey_val_to_text(&PBUF(i->buf), i->c, k);
i->buf[i->bytes] = '\n';
i->bytes++;
- k = bch2_btree_iter_next(iter);
- i->from = iter->pos;
+ k = bch2_btree_iter_next(&iter);
+ i->from = iter.pos;
err = flush_buf(i);
if (err)
if (!i->size)
break;
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
{
struct dump_iter *i = file->private_data;
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct btree *b;
int err;
bch2_trans_init(&trans, i->c, 0, 0);
- for_each_btree_node(&trans, iter, i->id, i->from, 0, b) {
+ for_each_btree_node(&trans, iter, i->id, i->from, 0, b, err) {
bch2_btree_node_to_text(&PBUF(i->buf), i->c, b);
i->bytes = strlen(i->buf);
err = flush_buf(i);
if (!i->size)
break;
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
{
struct dump_iter *i = file->private_data;
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
struct btree *prev_node = NULL;
int err;
bch2_trans_init(&trans, i->c, 0, 0);
- iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH);
+ bch2_trans_iter_init(&trans, &iter, i->id, i->from,
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS);
- while ((k = bch2_btree_iter_peek(iter)).k &&
+ while ((k = bch2_btree_iter_peek(&iter)).k &&
!(err = bkey_err(k))) {
- struct btree_iter_level *l = &iter->l[0];
+ struct btree_path_level *l = &iter.path->l[0];
struct bkey_packed *_k =
bch2_btree_node_iter_peek(&l->iter, l->b);
if (err)
break;
- bch2_btree_iter_advance(iter);
- i->from = iter->pos;
+ bch2_btree_iter_advance(&iter);
+ i->from = iter.pos;
err = flush_buf(i);
if (err)
if (!i->size)
break;
}
+ bch2_trans_iter_exit(&trans, &iter);
+
bch2_trans_exit(&trans);
return err < 0 ? err : i->ret;
#include "fs.h"
#include "keylist.h"
#include "str_hash.h"
+#include "subvolume.h"
#include <linux/dcache.h>
return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len);
}
+static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k)
+{
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+
+ if (d.v->d_type == DT_SUBVOL)
+ return le32_to_cpu(d.v->d_parent_subvol) == inum.subvol;
+ return true;
+}
+
const struct bch_hash_desc bch2_dirent_hash_desc = {
.btree_id = BTREE_ID_dirents,
.key_type = KEY_TYPE_dirent,
.hash_bkey = dirent_hash_bkey,
.cmp_key = dirent_cmp_key,
.cmp_bkey = dirent_cmp_bkey,
+ .is_visible = dirent_is_visible,
};
const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k)
if (memchr(d.v->d_name, '/', len))
return "invalid name";
- if (le64_to_cpu(d.v->d_inum) == d.k->p.inode)
+ if (d.v->d_type != DT_SUBVOL &&
+ le64_to_cpu(d.v->d_inum) == d.k->p.inode)
return "dirent points to own directory";
return NULL;
bch_scnmemcpy(out, d.v->d_name,
bch2_dirent_name_bytes(d));
- pr_buf(out, " -> %llu type %s", d.v->d_inum,
- d.v->d_type < DT_MAX
- ? bch2_d_types[d.v->d_type]
- : "(bad d_type)");
+ pr_buf(out, " -> %llu type %s",
+ d.v->d_type != DT_SUBVOL
+ ? le64_to_cpu(d.v->d_inum)
+ : le32_to_cpu(d.v->d_child_subvol),
+ bch2_d_type_str(d.v->d_type));
}
static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
- u8 type, const struct qstr *name, u64 dst)
+ subvol_inum dir, u8 type,
+ const struct qstr *name, u64 dst)
{
struct bkey_i_dirent *dirent;
unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len);
bkey_dirent_init(&dirent->k_i);
dirent->k.u64s = u64s;
- dirent->v.d_inum = cpu_to_le64(dst);
+
+ if (type != DT_SUBVOL) {
+ dirent->v.d_inum = cpu_to_le64(dst);
+ } else {
+ dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol);
+ dirent->v.d_child_subvol = cpu_to_le32(dst);
+ }
+
dirent->v.d_type = type;
memcpy(dirent->v.d_name, name->name, name->len);
return dirent;
}
-int bch2_dirent_create(struct btree_trans *trans,
- u64 dir_inum, const struct bch_hash_info *hash_info,
+int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
+ const struct bch_hash_info *hash_info,
u8 type, const struct qstr *name, u64 dst_inum,
u64 *dir_offset, int flags)
{
struct bkey_i_dirent *dirent;
int ret;
- dirent = dirent_create_key(trans, type, name, dst_inum);
+ dirent = dirent_create_key(trans, dir, type, name, dst_inum);
ret = PTR_ERR_OR_ZERO(dirent);
if (ret)
return ret;
ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
- dir_inum, &dirent->k_i, flags);
+ dir, &dirent->k_i, flags);
*dir_offset = dirent->k.p.offset;
return ret;
dst->v.d_type = src.v->d_type;
}
+int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
+ struct bkey_s_c_dirent d, subvol_inum *target)
+{
+ struct bch_subvolume s;
+ int ret = 0;
+
+ if (d.v->d_type == DT_SUBVOL &&
+ le32_to_cpu(d.v->d_parent_subvol) != dir.subvol)
+ return 1;
+
+ if (likely(d.v->d_type != DT_SUBVOL)) {
+ target->subvol = dir.subvol;
+ target->inum = le64_to_cpu(d.v->d_inum);
+ } else {
+ target->subvol = le32_to_cpu(d.v->d_child_subvol);
+
+ ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_CACHED, &s);
+
+ target->inum = le64_to_cpu(s.inode);
+ }
+
+ return ret;
+}
+
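bch2_dirent_read_target() is now the single place a dirent resolves to a
(subvolume, inode) pair: ordinary dirents stay in the directory's subvolume,
DT_SUBVOL dirents jump to the child subvolume's root inode, and a return of 1
flags a subvolume dirent that isn't visible from this parent. A plain-types
mirror of that decision (names and the DT_SUBVOL value are simplified
stand-ins; the real root-inode lookup goes through bch2_subvolume_get()):

    #include <stdint.h>
    #include <stdio.h>

    #define TOY_DT_SUBVOL 16

    struct toy_inum { uint32_t subvol; uint64_t inum; };

    struct toy_dirent {
        uint8_t  d_type;
        uint64_t d_inum;              /* ordinary dirents */
        uint32_t d_parent_subvol;     /* DT_SUBVOL dirents */
        uint32_t d_child_subvol;
    };

    static int read_target(struct toy_inum dir, const struct toy_dirent *d,
                           struct toy_inum *target)
    {
        if (d->d_type == TOY_DT_SUBVOL && d->d_parent_subvol != dir.subvol)
            return 1;                 /* not visible from this subvolume */

        if (d->d_type != TOY_DT_SUBVOL) {
            target->subvol = dir.subvol;
            target->inum   = d->d_inum;
        } else {
            target->subvol = d->d_child_subvol;
            target->inum   = 0;       /* real code: subvolume's root inode */
        }
        return 0;
    }

    int main(void)
    {
        struct toy_inum dir = { .subvol = 1, .inum = 4096 }, t;
        struct toy_dirent d = { .d_type = TOY_DT_SUBVOL,
                                .d_parent_subvol = 1, .d_child_subvol = 7 };

        if (!read_target(dir, &d, &t))
            printf("target: subvol %u inum %llu\n",
                   t.subvol, (unsigned long long)t.inum);
        return 0;
    }
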
int bch2_dirent_rename(struct btree_trans *trans,
- u64 src_dir, struct bch_hash_info *src_hash,
- u64 dst_dir, struct bch_hash_info *dst_hash,
- const struct qstr *src_name, u64 *src_inum, u64 *src_offset,
- const struct qstr *dst_name, u64 *dst_inum, u64 *dst_offset,
- enum bch_rename_mode mode)
+ subvol_inum src_dir, struct bch_hash_info *src_hash,
+ subvol_inum dst_dir, struct bch_hash_info *dst_hash,
+ const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset,
+ const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset,
+ enum bch_rename_mode mode)
{
- struct btree_iter *src_iter = NULL, *dst_iter = NULL;
- struct bkey_s_c old_src, old_dst;
+ struct btree_iter src_iter = { NULL };
+ struct btree_iter dst_iter = { NULL };
+ struct bkey_s_c old_src, old_dst = bkey_s_c_null;
struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
struct bpos dst_pos =
- POS(dst_dir, bch2_dirent_hash(dst_hash, dst_name));
+ POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name));
+ unsigned src_type = 0, dst_type = 0, src_update_flags = 0;
int ret = 0;
- *src_inum = *dst_inum = 0;
+ if (src_dir.subvol != dst_dir.subvol)
+ return -EXDEV;
- /*
- * Lookup dst:
- *
- * Note that in BCH_RENAME mode, we're _not_ checking if
- * the target already exists - we're relying on the VFS
- * to do that check for us for correctness:
- */
- dst_iter = mode == BCH_RENAME
- ? bch2_hash_hole(trans, bch2_dirent_hash_desc,
- dst_hash, dst_dir, dst_name)
- : bch2_hash_lookup(trans, bch2_dirent_hash_desc,
- dst_hash, dst_dir, dst_name,
- BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(dst_iter);
- if (ret)
- goto out;
+ memset(src_inum, 0, sizeof(*src_inum));
+ memset(dst_inum, 0, sizeof(*dst_inum));
- old_dst = bch2_btree_iter_peek_slot(dst_iter);
- ret = bkey_err(old_dst);
+ /* Lookup src: */
+ ret = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc,
+ src_hash, src_dir, src_name,
+ BTREE_ITER_INTENT);
if (ret)
goto out;
- if (mode != BCH_RENAME)
- *dst_inum = le64_to_cpu(bkey_s_c_to_dirent(old_dst).v->d_inum);
- if (mode != BCH_RENAME_EXCHANGE)
- *src_offset = dst_iter->pos.offset;
-
- /* Lookup src: */
- src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc,
- src_hash, src_dir, src_name,
- BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(src_iter);
+ old_src = bch2_btree_iter_peek_slot(&src_iter);
+ ret = bkey_err(old_src);
if (ret)
goto out;
- old_src = bch2_btree_iter_peek_slot(src_iter);
- ret = bkey_err(old_src);
+ ret = bch2_dirent_read_target(trans, src_dir,
+ bkey_s_c_to_dirent(old_src), src_inum);
if (ret)
goto out;
- *src_inum = le64_to_cpu(bkey_s_c_to_dirent(old_src).v->d_inum);
+ src_type = bkey_s_c_to_dirent(old_src).v->d_type;
+
+ if (src_type == DT_SUBVOL && mode == BCH_RENAME_EXCHANGE) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ /* Lookup dst: */
+ if (mode == BCH_RENAME) {
+ /*
+ * Note that we're _not_ checking if the target already exists -
+ * we're relying on the VFS to do that check for us for
+ * correctness:
+ */
+ ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc,
+ dst_hash, dst_dir, dst_name);
+ if (ret)
+ goto out;
+ } else {
+ ret = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc,
+ dst_hash, dst_dir, dst_name,
+ BTREE_ITER_INTENT);
+ if (ret)
+ goto out;
+
+ old_dst = bch2_btree_iter_peek_slot(&dst_iter);
+ ret = bkey_err(old_dst);
+ if (ret)
+ goto out;
+
+ ret = bch2_dirent_read_target(trans, dst_dir,
+ bkey_s_c_to_dirent(old_dst), dst_inum);
+ if (ret)
+ goto out;
+
+ dst_type = bkey_s_c_to_dirent(old_dst).v->d_type;
+
+ if (dst_type == DT_SUBVOL) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+ }
+
+ if (mode != BCH_RENAME_EXCHANGE)
+ *src_offset = dst_iter.pos.offset;
/* Create new dst key: */
- new_dst = dirent_create_key(trans, 0, dst_name, 0);
+ new_dst = dirent_create_key(trans, dst_dir, 0, dst_name, 0);
ret = PTR_ERR_OR_ZERO(new_dst);
if (ret)
goto out;
dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
- new_dst->k.p = dst_iter->pos;
+ new_dst->k.p = dst_iter.pos;
/* Create new src key: */
if (mode == BCH_RENAME_EXCHANGE) {
- new_src = dirent_create_key(trans, 0, src_name, 0);
+ new_src = dirent_create_key(trans, src_dir, 0, src_name, 0);
ret = PTR_ERR_OR_ZERO(new_src);
if (ret)
goto out;
dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst));
- new_src->k.p = src_iter->pos;
+ new_src->k.p = src_iter.pos;
} else {
new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
ret = PTR_ERR_OR_ZERO(new_src);
goto out;
bkey_init(&new_src->k);
- new_src->k.p = src_iter->pos;
+ new_src->k.p = src_iter.pos;
- if (bkey_cmp(dst_pos, src_iter->pos) <= 0 &&
- bkey_cmp(src_iter->pos, dst_iter->pos) < 0) {
+ if (bkey_cmp(dst_pos, src_iter.pos) <= 0 &&
+ bkey_cmp(src_iter.pos, dst_iter.pos) < 0) {
/*
* We have a hash collision for the new dst key,
* and new_src - the key we're deleting - is between
* If we're not overwriting, we can just insert
* new_dst at the src position:
*/
- new_dst->k.p = src_iter->pos;
- bch2_trans_update(trans, src_iter,
- &new_dst->k_i, 0);
- goto out_set_offset;
+ new_src = new_dst;
+ new_src->k.p = src_iter.pos;
+ goto out_set_src;
} else {
/* If we're overwriting, we can't insert new_dst
* at a different slot because it has to
} else {
/* Check if we need a whiteout to delete src: */
ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc,
- src_hash, src_iter);
+ src_hash, &src_iter);
if (ret < 0)
goto out;
}
}
- bch2_trans_update(trans, src_iter, &new_src->k_i, 0);
- bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0);
-out_set_offset:
+ ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0);
+ if (ret)
+ goto out;
+out_set_src:
+
+ /*
+ * If we're deleting a subvolume, we need to really delete the dirent,
+ * not just emit a whiteout in the current snapshot:
+ */
+ if (src_type == DT_SUBVOL) {
+ bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot);
+ ret = bch2_btree_iter_traverse(&src_iter);
+ if (ret)
+ goto out;
+
+ new_src->k.p = src_iter.pos;
+ src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE;
+ }
+
+ ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags);
+ if (ret)
+ goto out;
+
if (mode == BCH_RENAME_EXCHANGE)
*src_offset = new_src->k.p.offset;
*dst_offset = new_dst->k.p.offset;
out:
- bch2_trans_iter_put(trans, src_iter);
- bch2_trans_iter_put(trans, dst_iter);
+ bch2_trans_iter_exit(trans, &src_iter);
+ bch2_trans_iter_exit(trans, &dst_iter);
return ret;
}
-int bch2_dirent_delete_at(struct btree_trans *trans,
- const struct bch_hash_info *hash_info,
- struct btree_iter *iter)
-{
- return bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
- hash_info, iter);
-}
-
-struct btree_iter *
-__bch2_dirent_lookup_trans(struct btree_trans *trans, u64 dir_inum,
- const struct bch_hash_info *hash_info,
- const struct qstr *name, unsigned flags)
-{
- return bch2_hash_lookup(trans, bch2_dirent_hash_desc,
- hash_info, dir_inum, name, flags);
-}
-
-u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum,
- const struct bch_hash_info *hash_info,
- const struct qstr *name)
+int __bch2_dirent_lookup_trans(struct btree_trans *trans,
+ struct btree_iter *iter,
+ subvol_inum dir,
+ const struct bch_hash_info *hash_info,
+ const struct qstr *name, subvol_inum *inum,
+ unsigned flags)
{
- struct btree_trans trans;
- struct btree_iter *iter;
struct bkey_s_c k;
- u64 inum = 0;
- int ret = 0;
+ struct bkey_s_c_dirent d;
+ u32 snapshot;
+ int ret;
- bch2_trans_init(&trans, c, 0, 0);
+ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
+ if (ret)
+ return ret;
- iter = __bch2_dirent_lookup_trans(&trans, dir_inum,
- hash_info, name, 0);
- ret = PTR_ERR_OR_ZERO(iter);
+ ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
+ hash_info, dir, name, flags);
if (ret)
- goto out;
+ return ret;
k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
if (ret)
- goto out;
+ goto err;
- inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum);
- bch2_trans_iter_put(&trans, iter);
-out:
- BUG_ON(ret == -EINTR);
+ d = bkey_s_c_to_dirent(k);
+
+ ret = bch2_dirent_read_target(trans, dir, d, inum);
+ if (ret > 0)
+ ret = -ENOENT;
+err:
+ if (ret)
+ bch2_trans_iter_exit(trans, iter);
+
+ return ret;
+}
+
+u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
+ const struct bch_hash_info *hash_info,
+ const struct qstr *name, subvol_inum *inum)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ int ret;
+
+ bch2_trans_init(&trans, c, 0, 0);
+retry:
+ bch2_trans_begin(&trans);
+
+ ret = __bch2_dirent_lookup_trans(&trans, &iter, dir, hash_info,
+ name, inum, 0);
+ if (ret == -EINTR)
+ goto retry;
+ if (!ret)
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
- return inum;
+ return ret;
}
-int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum)
+int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir)
{
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
+ u32 snapshot;
int ret;
- for_each_btree_key(trans, iter, BTREE_ID_dirents,
- POS(dir_inum, 0), 0, k, ret) {
- if (k.k->p.inode > dir_inum)
+ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
+ if (ret)
+ return ret;
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_dirents,
+ SPOS(dir.inum, 0, snapshot), 0, k, ret) {
+ if (k.k->p.inode > dir.inum)
break;
if (k.k->type == KEY_TYPE_dirent) {
break;
}
}
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
-int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx)
+int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
struct bkey_s_c_dirent dirent;
+ subvol_inum target;
+ u32 snapshot;
int ret;
bch2_trans_init(&trans, c, 0, 0);
+retry:
+ bch2_trans_begin(&trans);
+
+ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
- for_each_btree_key(&trans, iter, BTREE_ID_dirents,
- POS(inum, ctx->pos), 0, k, ret) {
- if (k.k->p.inode > inum)
+ for_each_btree_key_norestart(&trans, iter, BTREE_ID_dirents,
+ SPOS(inum.inum, ctx->pos, snapshot), 0, k, ret) {
+ if (k.k->p.inode > inum.inum)
break;
if (k.k->type != KEY_TYPE_dirent)
dirent = bkey_s_c_to_dirent(k);
+ ret = bch2_dirent_read_target(&trans, inum, dirent, &target);
+ if (ret < 0)
+ break;
+ if (ret)
+ continue;
+
/*
* XXX: dir_emit() can fault and block, while we're holding
* locks
ctx->pos = dirent.k->p.offset;
if (!dir_emit(ctx, dirent.v->d_name,
bch2_dirent_name_bytes(dirent),
- le64_to_cpu(dirent.v->d_inum),
- dirent.v->d_type))
+ target.inum,
+ vfs_d_type(dirent.v->d_type)))
break;
ctx->pos = dirent.k->p.offset + 1;
+
+ /*
+ * read_target looks up subvolumes; we can overflow the transaction's
+ * btree paths if the directory has many subvolumes in it
+ */
+ ret = btree_trans_too_many_iters(&trans);
+ if (ret)
+ break;
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
+err:
+ if (ret == -EINTR)
+ goto retry;
- ret = bch2_trans_exit(&trans) ?: ret;
+ bch2_trans_exit(&trans);
return ret;
}
sizeof(u64));
}
-int bch2_dirent_create(struct btree_trans *, u64,
+int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
+ struct bkey_s_c_dirent, subvol_inum *);
+
+int bch2_dirent_create(struct btree_trans *, subvol_inum,
const struct bch_hash_info *, u8,
const struct qstr *, u64, u64 *, int);
-int bch2_dirent_delete_at(struct btree_trans *,
- const struct bch_hash_info *,
- struct btree_iter *);
+static inline unsigned vfs_d_type(unsigned type)
+{
+ return type == DT_SUBVOL ? DT_DIR : type;
+}
enum bch_rename_mode {
BCH_RENAME,
};
int bch2_dirent_rename(struct btree_trans *,
- u64, struct bch_hash_info *,
- u64, struct bch_hash_info *,
- const struct qstr *, u64 *, u64 *,
- const struct qstr *, u64 *, u64 *,
+ subvol_inum, struct bch_hash_info *,
+ subvol_inum, struct bch_hash_info *,
+ const struct qstr *, subvol_inum *, u64 *,
+ const struct qstr *, subvol_inum *, u64 *,
enum bch_rename_mode);
-struct btree_iter *
-__bch2_dirent_lookup_trans(struct btree_trans *, u64,
- const struct bch_hash_info *,
- const struct qstr *, unsigned);
-u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *,
- const struct qstr *);
+int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *,
+ subvol_inum, const struct bch_hash_info *,
+ const struct qstr *, subvol_inum *, unsigned);
+u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum,
+ const struct bch_hash_info *,
+ const struct qstr *, subvol_inum *);
-int bch2_empty_dir_trans(struct btree_trans *, u64);
-int bch2_readdir(struct bch_fs *, u64, struct dir_context *);
+int bch2_empty_dir_trans(struct btree_trans *, subvol_inum);
+int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *);
#endif /* _BCACHEFS_DIRENT_H */
strncmp(l->label, r->label, sizeof(l->label));
}
-static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb,
- struct bch_sb_field *f)
+static int bch2_sb_disk_groups_validate(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
{
struct bch_sb_field_disk_groups *groups =
field_to_type(f, disk_groups);
struct bch_disk_group *g, *sorted = NULL;
- struct bch_sb_field_members *mi;
- struct bch_member *m;
- unsigned i, nr_groups, len;
- const char *err = NULL;
-
- mi = bch2_sb_get_members(sb);
- groups = bch2_sb_get_disk_groups(sb);
- nr_groups = disk_groups_nr(groups);
+ struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
+ unsigned nr_groups = disk_groups_nr(groups);
+ unsigned i, len;
+ int ret = -EINVAL;
- for (m = mi->members;
- m < mi->members + sb->nr_devices;
- m++) {
+ for (i = 0; i < sb->nr_devices; i++) {
+ struct bch_member *m = mi->members + i;
unsigned g;
if (!BCH_MEMBER_GROUP(m))
g = BCH_MEMBER_GROUP(m) - 1;
- if (g >= nr_groups ||
- BCH_GROUP_DELETED(&groups->entries[g]))
- return "disk has invalid group";
+ if (g >= nr_groups) {
+ pr_buf(err, "disk %u has invalid label %u (have %u)",
+ i, g, nr_groups);
+ return -EINVAL;
+ }
+
+ if (BCH_GROUP_DELETED(&groups->entries[g])) {
+ pr_buf(err, "disk %u has deleted label %u", i, g);
+ return -EINVAL;
+ }
}
if (!nr_groups)
- return NULL;
+ return 0;
+
+ for (i = 0; i < nr_groups; i++) {
+ g = groups->entries + i;
- for (g = groups->entries;
- g < groups->entries + nr_groups;
- g++) {
if (BCH_GROUP_DELETED(g))
continue;
len = strnlen(g->label, sizeof(g->label));
if (!len) {
- err = "group with empty label";
- goto err;
+ pr_buf(err, "label %u empty", i);
+ return -EINVAL;
}
}
sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL);
if (!sorted)
- return "cannot allocate memory";
+ return -ENOMEM;
memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted));
sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL);
- for (i = 0; i + 1 < nr_groups; i++)
- if (!BCH_GROUP_DELETED(sorted + i) &&
- !group_cmp(sorted + i, sorted + i + 1)) {
- err = "duplicate groups";
+ for (g = sorted; g + 1 < sorted + nr_groups; g++)
+ if (!BCH_GROUP_DELETED(g) &&
+ !group_cmp(&g[0], &g[1])) {
+ pr_buf(err, "duplicate label %llu.", BCH_GROUP_PARENT(g));
+ bch_scnmemcpy(err, g->label, strnlen(g->label, sizeof(g->label)));
goto err;
}
- err = NULL;
+ ret = 0;
err:
kfree(sorted);
- return err;
+ return ret;
}
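
The rewritten validation reports duplicates by sorting a scratch copy and
comparing neighbours - after sorting, equal labels are always adjacent, so one
linear scan suffices. The same shape in miniature, with plain strings standing
in for struct bch_disk_group:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static int cmp_label(const void *l, const void *r)
    {
        return strcmp(*(const char * const *)l, *(const char * const *)r);
    }

    int main(void)
    {
        const char *labels[] = { "ssd", "hdd", "ssd", "cache" };
        size_t nr = sizeof(labels) / sizeof(labels[0]);

        qsort(labels, nr, sizeof(labels[0]), cmp_label);

        for (size_t i = 0; i + 1 < nr; i++)
            if (!strcmp(labels[i], labels[i + 1]))
                printf("duplicate label %s\n", labels[i]);
        return 0;
    }
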
static void bch2_sb_disk_groups_to_text(struct printbuf *out,
#include "io.h"
#include "keylist.h"
#include "recovery.h"
+#include "replicas.h"
#include "super-io.h"
#include "util.h"
}
/* returns blocknr in stripe that we matched: */
-static int bkey_matches_stripe(struct bch_stripe *s,
- struct bkey_s_c k)
+static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s,
+ struct bkey_s_c k, unsigned *block)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const struct bch_extent_ptr *ptr;
bkey_for_each_ptr(ptrs, ptr)
for (i = 0; i < nr_data; i++)
if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr,
- le16_to_cpu(s->sectors)))
- return i;
+ le16_to_cpu(s->sectors))) {
+ *block = i;
+ return ptr;
+ }
- return -1;
+ return NULL;
}
static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
int ret;
bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS(0, idx), BTREE_ITER_SLOTS);
- k = bch2_btree_iter_peek_slot(iter);
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes,
+ POS(0, idx), BTREE_ITER_SLOTS);
+ k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
goto err;
}
bkey_reassemble(&stripe->key.k_i, k);
err:
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
return ret;
}
free_heap(&n);
}
- if (!genradix_ptr_alloc(&c->stripes[0], idx, gfp))
+ if (!genradix_ptr_alloc(&c->stripes, idx, gfp))
return -ENOMEM;
if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING &&
- !genradix_ptr_alloc(&c->stripes[1], idx, gfp))
+ !genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
return -ENOMEM;
return 0;
}
-static int ec_stripe_mem_alloc(struct bch_fs *c,
+static int ec_stripe_mem_alloc(struct btree_trans *trans,
struct btree_iter *iter)
{
size_t idx = iter->pos.offset;
int ret = 0;
- if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT|__GFP_NOWARN))
+ if (!__ec_stripe_mem_alloc(trans->c, idx, GFP_NOWAIT|__GFP_NOWARN))
return ret;
- bch2_trans_unlock(iter->trans);
+ bch2_trans_unlock(trans);
ret = -EINTR;
- if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL))
+ if (!__ec_stripe_mem_alloc(trans->c, idx, GFP_KERNEL))
return ret;
return -ENOMEM;
{
struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);
- genradix_ptr(&c->stripes[0], h->data[i].idx)->heap_idx = i;
+ genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i;
}
static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
{
ec_stripes_heap *h = &c->ec_stripes_heap;
- struct stripe *m = genradix_ptr(&c->stripes[0], idx);
+ struct stripe *m = genradix_ptr(&c->stripes, idx);
BUG_ON(!m->alive);
BUG_ON(m->heap_idx >= h->used);
return bch2_btree_delete_range(c, BTREE_ID_stripes,
POS(0, idx),
POS(0, idx + 1),
- NULL);
+ 0, NULL);
}
static void ec_stripe_delete_work(struct work_struct *work)
break;
}
- bch2_stripes_heap_del(c, genradix_ptr(&c->stripes[0], idx), idx);
+ bch2_stripes_heap_del(c, genradix_ptr(&c->stripes, idx), idx);
spin_unlock(&c->ec_stripes_heap_lock);
if (ec_stripe_delete(c, idx))
/* stripe creation: */
-static int ec_stripe_bkey_insert(struct bch_fs *c,
+static int ec_stripe_bkey_insert(struct btree_trans *trans,
struct bkey_i_stripe *stripe,
struct disk_reservation *res)
{
- struct btree_trans trans;
- struct btree_iter *iter;
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
struct bkey_s_c k;
struct bpos min_pos = POS(0, 1);
struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
int ret;
- bch2_trans_init(&trans, c, 0, 0);
-retry:
- bch2_trans_begin(&trans);
-
- for_each_btree_key(&trans, iter, BTREE_ID_stripes, start_pos,
+ for_each_btree_key(trans, iter, BTREE_ID_stripes, start_pos,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) {
if (start_pos.offset) {
start_pos = min_pos;
- bch2_btree_iter_set_pos(iter, start_pos);
+ bch2_btree_iter_set_pos(&iter, start_pos);
continue;
}
goto err;
found_slot:
- start_pos = iter->pos;
+ start_pos = iter.pos;
- ret = ec_stripe_mem_alloc(c, iter);
+ ret = ec_stripe_mem_alloc(trans, &iter);
if (ret)
goto err;
- stripe->k.p = iter->pos;
+ stripe->k.p = iter.pos;
- ret = bch2_trans_update(&trans, iter, &stripe->k_i, 0) ?:
- bch2_trans_commit(&trans, res, NULL,
- BTREE_INSERT_NOFAIL);
-err:
- bch2_trans_iter_put(&trans, iter);
+ ret = bch2_trans_update(trans, &iter, &stripe->k_i, 0);
- if (ret == -EINTR)
- goto retry;
-
- c->ec_stripe_hint = ret ? start_pos.offset : start_pos.offset + 1;
- bch2_trans_exit(&trans);
+ c->ec_stripe_hint = start_pos.offset;
+err:
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
static int ec_stripe_bkey_update(struct btree_trans *trans,
- struct bkey_i_stripe *new)
+ struct bkey_i_stripe *new,
+ struct disk_reservation *res)
{
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
const struct bch_stripe *existing;
unsigned i;
int ret;
- iter = bch2_trans_get_iter(trans, BTREE_ID_stripes,
- new->k.p, BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek_slot(iter);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes,
+ new->k.p, BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
goto err;
stripe_blockcount_set(&new->v, i,
stripe_blockcount_get(existing, i));
- ret = bch2_trans_update(trans, iter, &new->k_i, 0);
+ ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
err:
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
struct bkey *pos)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
struct bkey_s_extent e;
struct bkey_buf sk;
+ struct bpos next_pos;
int ret = 0, dev, block;
bch2_bkey_buf_init(&sk);
/* XXX this doesn't support the reflink btree */
- iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
- bkey_start_pos(pos),
- BTREE_ITER_INTENT);
-
- while ((k = bch2_btree_iter_peek(iter)).k &&
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+ bkey_start_pos(pos),
+ BTREE_ITER_INTENT);
+retry:
+ while (bch2_trans_begin(&trans),
+ (k = bch2_btree_iter_peek(&iter)).k &&
!(ret = bkey_err(k)) &&
bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) {
+ const struct bch_extent_ptr *ptr_c;
struct bch_extent_ptr *ptr, *ec_ptr = NULL;
if (extent_has_stripe_ptr(k, s->key.k.p.offset)) {
- bch2_btree_iter_advance(iter);
+ bch2_btree_iter_advance(&iter);
continue;
}
- block = bkey_matches_stripe(&s->key.v, k);
- if (block < 0) {
- bch2_btree_iter_advance(iter);
+ ptr_c = bkey_matches_stripe(&s->key.v, k, &block);
+ /*
+ * It doesn't generally make sense to erasure code cached ptrs:
+ * XXX: should we be incrementing a counter?
+ */
+ if (!ptr_c || ptr_c->cached) {
+ bch2_btree_iter_advance(&iter);
continue;
}
extent_stripe_ptr_add(e, s, ec_ptr, block);
- bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k));
- ret = bch2_btree_iter_traverse(iter) ?:
- bch2_trans_update(&trans, iter, sk.k, 0) ?:
+ bch2_btree_iter_set_pos(&iter, bkey_start_pos(&sk.k->k));
+ next_pos = sk.k->k.p;
+
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(&trans, &iter, sk.k, 0) ?:
bch2_trans_commit(&trans, NULL, NULL,
BTREE_INSERT_NOFAIL);
- if (ret == -EINTR)
- ret = 0;
+ if (!ret)
+ bch2_btree_iter_set_pos(&iter, next_pos);
if (ret)
break;
}
- bch2_trans_iter_put(&trans, iter);
+ if (ret == -EINTR)
+ goto retry;
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&sk, c);
goto err_put_writes;
}
- ret = s->have_existing_stripe
- ? bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL,
- ec_stripe_bkey_update(&trans, &s->new_stripe.key))
- : ec_stripe_bkey_insert(c, &s->new_stripe.key, &s->res);
+ ret = bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL,
+ s->have_existing_stripe
+ ? ec_stripe_bkey_update(&trans, &s->new_stripe.key, &s->res)
+ : ec_stripe_bkey_insert(&trans, &s->new_stripe.key, &s->res));
if (ret) {
bch_err(c, "error creating stripe: error creating stripe key");
goto err_put_writes;
}
spin_lock(&c->ec_stripes_heap_lock);
- m = genradix_ptr(&c->stripes[0], s->new_stripe.key.k.p.offset);
+ m = genradix_ptr(&c->stripes, s->new_stripe.key.k.p.offset);
BUG_ON(m->on_heap);
bch2_stripes_heap_insert(c, m, s->new_stripe.key.k.p.offset);
if (!ob)
return NULL;
- ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+ ca = bch_dev_bkey_exists(c, ob->dev);
offset = ca->mi.bucket_size - ob->sectors_free;
return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
}
-void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp,
- struct bpos pos, unsigned sectors)
+void bch2_ob_add_backpointer(struct bch_fs *c, struct open_bucket *ob,
+ struct bkey *k)
{
- struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
- struct ec_stripe_new *ec;
+ struct ec_stripe_new *ec = ob->ec;
- if (!ob)
+ if (!ec)
return;
- ec = ob->ec;
mutex_lock(&ec->lock);
if (bch2_keylist_realloc(&ec->keys, ec->inline_keys,
}
bkey_init(&ec->keys.top->k);
- ec->keys.top->k.p = pos;
- bch2_key_resize(&ec->keys.top->k, sectors);
+ ec->keys.top->k.p = k->p;
+ ec->keys.top->k.size = k->size;
bch2_keylist_push(&ec->keys);
mutex_unlock(&ec->lock);
s->v.algorithm = 0;
s->v.nr_blocks = nr_data + nr_parity;
s->v.nr_redundant = nr_parity;
- s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max);
- s->v.csum_type = BCH_CSUM_CRC32C;
+ s->v.csum_granularity_bits = ilog2(c->opts.encoded_extent_max >> 9);
+ s->v.csum_type = BCH_CSUM_crc32c;
s->v.pad = 0;
while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
return h;
}
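
encoded_extent_max is now stored in bytes rather than sectors, which is why
these hunks shift by 9 in one direction or the other. A quick arithmetic check
of the csum_granularity_bits computation above, assuming a 64 KiB
encoded_extent_max (an assumed default, not stated in this patch):

    #include <stdio.h>

    int main(void)
    {
        unsigned encoded_extent_max = 64 << 10;       /* bytes (assumed) */
        unsigned sectors = encoded_extent_max >> 9;   /* 128 512-byte sectors */
        unsigned bits = 31 - __builtin_clz(sectors);  /* ilog2() stand-in */

        printf("csum_granularity_bits = %u\n", bits); /* prints 7 */
        return 0;
    }
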
-static enum bucket_alloc_ret
-new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
- struct closure *cl)
+static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
+ struct closure *cl)
{
struct bch_devs_mask devs = h->devs;
struct open_bucket *ob;
struct open_buckets buckets;
unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
bool have_cache = true;
- enum bucket_alloc_ret ret = ALLOC_SUCCESS;
+ int ret = 0;
for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) {
if (test_bit(i, h->s->blocks_gotten)) {
BUG_ON(j >= h->s->nr_data + h->s->nr_parity);
h->s->blocks[j] = buckets.v[i];
- h->s->new_stripe.key.v.ptrs[j] = ob->ptr;
+ h->s->new_stripe.key.v.ptrs[j] = bch2_ob_ptr(c, ob);
__set_bit(j, h->s->blocks_gotten);
}
BUG_ON(j >= h->s->nr_data);
h->s->blocks[j] = buckets.v[i];
- h->s->new_stripe.key.v.ptrs[j] = ob->ptr;
+ h->s->new_stripe.key.v.ptrs[j] = bch2_ob_ptr(c, ob);
__set_bit(j, h->s->blocks_gotten);
}
continue;
stripe_idx = h->data[heap_idx].idx;
- m = genradix_ptr(&c->stripes[0], stripe_idx);
+ m = genradix_ptr(&c->stripes, stripe_idx);
if (m->algorithm == head->algo &&
m->nr_redundant == head->redundancy &&
err:
bch2_ec_stripe_head_put(c, h);
- return ERR_PTR(-ret);
+ return ERR_PTR(ret);
}
void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
continue;
ob = c->open_buckets + h->s->blocks[i];
- if (ob->ptr.dev == ca->dev_idx)
+ if (ob->dev == ca->dev_idx)
goto found;
}
goto unlock;
struct genradix_iter iter;
struct stripe *m;
- genradix_for_each(&c->stripes[0], iter, m)
+ genradix_for_each(&c->stripes, iter, m)
if (m->alive)
bch2_stripes_heap_insert(c, m, iter.pos);
}
-static int __bch2_stripe_write_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct stripe *m,
- size_t idx,
- struct bkey_i_stripe *new_key)
+int bch2_stripes_read(struct bch_fs *c)
{
- const struct bch_stripe *v;
+ struct btree_trans trans;
+ struct btree_iter iter;
struct bkey_s_c k;
+ const struct bch_stripe *s;
+ struct stripe *m;
unsigned i;
int ret;
- bch2_btree_iter_set_pos(iter, POS(0, idx));
-
- k = bch2_btree_iter_peek_slot(iter);
- ret = bkey_err(k);
- if (ret)
- return ret;
-
- if (k.k->type != KEY_TYPE_stripe)
- return -EIO;
-
- v = bkey_s_c_to_stripe(k).v;
- for (i = 0; i < v->nr_blocks; i++)
- if (m->block_sectors[i] != stripe_blockcount_get(v, i))
- goto write;
- return 0;
-write:
- bkey_reassemble(&new_key->k_i, k);
-
- for (i = 0; i < new_key->v.nr_blocks; i++)
- stripe_blockcount_set(&new_key->v, i,
- m->block_sectors[i]);
-
- return bch2_trans_update(trans, iter, &new_key->k_i, 0);
-}
-
-int bch2_stripes_write(struct bch_fs *c, unsigned flags)
-{
- struct btree_trans trans;
- struct btree_iter *iter;
- struct genradix_iter giter;
- struct bkey_i_stripe *new_key;
- struct stripe *m;
- int ret = 0;
-
- new_key = kmalloc(255 * sizeof(u64), GFP_KERNEL);
- BUG_ON(!new_key);
-
bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS_MIN,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-
- genradix_for_each(&c->stripes[0], giter, m) {
- if (!m->alive)
+ for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ if (k.k->type != KEY_TYPE_stripe)
continue;
- ret = __bch2_trans_do(&trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|flags,
- __bch2_stripe_write_key(&trans, iter, m,
- giter.pos, new_key));
-
+ ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
if (ret)
break;
- }
- bch2_trans_iter_put(&trans, iter);
- bch2_trans_exit(&trans);
+ s = bkey_s_c_to_stripe(k).v;
- kfree(new_key);
-
- return ret;
-}
+ m = genradix_ptr(&c->stripes, k.k->p.offset);
+ m->alive = true;
+ m->sectors = le16_to_cpu(s->sectors);
+ m->algorithm = s->algorithm;
+ m->nr_blocks = s->nr_blocks;
+ m->nr_redundant = s->nr_redundant;
+ m->blocks_nonempty = 0;
-static int bch2_stripes_read_fn(struct bch_fs *c, struct bkey_s_c k)
-{
- int ret = 0;
+ for (i = 0; i < s->nr_blocks; i++)
+ m->blocks_nonempty += !!stripe_blockcount_get(s, i);
- if (k.k->type == KEY_TYPE_stripe)
- ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?:
- bch2_mark_key(c, k,
- BTREE_TRIGGER_INSERT|
- BTREE_TRIGGER_NOATOMIC);
+ spin_lock(&c->ec_stripes_heap_lock);
+ bch2_stripes_heap_update(c, m, k.k->p.offset);
+ spin_unlock(&c->ec_stripes_heap_lock);
+ }
+ bch2_trans_iter_exit(&trans, &iter);
- return ret;
-}
+ bch2_trans_exit(&trans);
-int bch2_stripes_read(struct bch_fs *c)
-{
- int ret = bch2_btree_and_journal_walk(c, BTREE_ID_stripes,
- bch2_stripes_read_fn);
if (ret)
bch_err(c, "error reading stripes: %i", ret);
return ret;
}
-int bch2_ec_mem_alloc(struct bch_fs *c, bool gc)
-{
- struct btree_trans trans;
- struct btree_iter *iter;
- struct bkey_s_c k;
- size_t i, idx = 0;
- int ret = 0;
-
- bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS(0, U64_MAX), 0);
-
- k = bch2_btree_iter_prev(iter);
- if (!IS_ERR_OR_NULL(k.k))
- idx = k.k->p.offset + 1;
-
- bch2_trans_iter_put(&trans, iter);
- ret = bch2_trans_exit(&trans);
- if (ret)
- return ret;
-
- if (!idx)
- return 0;
-
- if (!gc &&
- !init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx),
- GFP_KERNEL))
- return -ENOMEM;
-#if 0
- ret = genradix_prealloc(&c->stripes[gc], idx, GFP_KERNEL);
-#else
- for (i = 0; i < idx; i++)
- if (!genradix_ptr_alloc(&c->stripes[gc], i, GFP_KERNEL))
- return -ENOMEM;
-#endif
- return 0;
-}
-
void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
{
ec_stripes_heap *h = &c->ec_stripes_heap;
spin_lock(&c->ec_stripes_heap_lock);
for (i = 0; i < min_t(size_t, h->used, 20); i++) {
- m = genradix_ptr(&c->stripes[0], h->data[i].idx);
+ m = genradix_ptr(&c->stripes, h->data[i].idx);
pr_buf(out, "%zu %u/%u+%u\n", h->data[i].idx,
h->data[i].blocks_nonempty,
BUG_ON(!list_empty(&c->ec_stripe_new_list));
free_heap(&c->ec_stripes_heap);
- genradix_free(&c->stripes[0]);
+ genradix_free(&c->stripes);
bioset_exit(&c->ec_bioset);
}
le16_to_cpu(s->sectors));
}
-static inline bool bch2_ptr_matches_stripe_m(const struct stripe *m,
+static inline bool bch2_ptr_matches_stripe_m(const struct gc_stripe *m,
struct extent_ptr_decoded p)
{
unsigned nr_data = m->nr_blocks - m->nr_redundant;
int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *);
void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
-void bch2_ec_add_backpointer(struct bch_fs *, struct write_point *,
- struct bpos, unsigned);
+void bch2_ob_add_backpointer(struct bch_fs *, struct open_bucket *,
+ struct bkey *);
void bch2_ec_bucket_written(struct bch_fs *, struct open_bucket *);
void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *);
void bch2_stripes_heap_start(struct bch_fs *);
int bch2_stripes_read(struct bch_fs *);
-int bch2_stripes_write(struct bch_fs *, unsigned);
-
-int bch2_ec_mem_alloc(struct bch_fs *, bool);
void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *);
void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *);
unsigned alive:1; /* does a corresponding key exist in stripes btree? */
unsigned on_heap:1;
u8 blocks_nonempty;
+};
+
+struct gc_stripe {
+ u16 sectors;
+
+ u8 nr_blocks;
+ u8 nr_redundant;
+
+ unsigned alive:1; /* does a corresponding key exist in stripes btree? */
u16 block_sectors[BCH_BKEY_PTRS_MAX];
struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX];
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ERRCODE_H
+#define _BCACHEFS_ERRCODE_H
+
+enum {
+ /* Bucket allocator: */
+ OPEN_BUCKETS_EMPTY = 2048,
+ FREELIST_EMPTY, /* Allocator thread not keeping up */
+ INSUFFICIENT_DEVICES,
+};
+
+#endif /* _BCACHEFS_ERRCODE_H */
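
Starting the private codes at 2048 keeps them clear of the standard errno
range, so a negated OPEN_BUCKETS_EMPTY can travel through the same int return
paths as -ENOMEM without ambiguity. A sketch of how a caller might tell the
two apart (the helper is illustrative, not part of the patch):

    #include <stdio.h>
    #include <string.h>
    #include <errno.h>

    enum {
        OPEN_BUCKETS_EMPTY = 2048,
        FREELIST_EMPTY,             /* allocator thread not keeping up */
        INSUFFICIENT_DEVICES,
    };

    static const char *err_str(int ret)
    {
        switch (-ret) {
        case OPEN_BUCKETS_EMPTY:    return "open buckets empty";
        case FREELIST_EMPTY:        return "freelist empty";
        case INSUFFICIENT_DEVICES:  return "insufficient devices";
        default:                    return strerror(-ret);  /* plain errno */
        }
    }

    int main(void)
    {
        printf("%s\n", err_str(-FREELIST_EMPTY));  /* freelist empty */
        printf("%s\n", err_str(-ENOMEM));          /* Cannot allocate memory */
        return 0;
    }
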
return false;
case BCH_ON_ERROR_ro:
if (bch2_fs_emergency_read_only(c))
- bch_err(c, "emergency read only");
+ bch_err(c, "inconsistency detected - emergency read only");
return true;
case BCH_ON_ERROR_panic:
panic(bch2_fmt(c, "panic after error"));
void bch2_fatal_error(struct bch_fs *c)
{
if (bch2_fs_emergency_read_only(c))
- bch_err(c, "emergency read only");
+ bch_err(c, "fatal error - emergency read only");
}
void bch2_io_error_work(struct work_struct *work)
u64 idx = le64_to_cpu(p.v->idx);
unsigned sectors = bpos_min(*end, p.k->p).offset -
bkey_start_offset(p.k);
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c r_k;
- for_each_btree_key(trans, iter,
+ for_each_btree_key_norestart(trans, iter,
BTREE_ID_reflink, POS(0, idx + offset),
BTREE_ITER_SLOTS, r_k, ret2) {
if (bkey_cmp(bkey_start_pos(r_k.k),
break;
}
}
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
break;
}
}
#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3)
-int bch2_extent_atomic_end(struct btree_iter *iter,
+int bch2_extent_atomic_end(struct btree_trans *trans,
+ struct btree_iter *iter,
struct bkey_i *insert,
struct bpos *end)
{
- struct btree_trans *trans = iter->trans;
- struct btree_iter *copy;
+ struct btree_iter copy;
struct bkey_s_c k;
unsigned nr_iters = 0;
int ret;
if (ret < 0)
return ret;
- copy = bch2_trans_copy_iter(trans, iter);
+ bch2_trans_copy_iter(&copy, iter);
- for_each_btree_key_continue(copy, 0, k, ret) {
+ for_each_btree_key_continue_norestart(copy, 0, k, ret) {
unsigned offset = 0;
if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0)
break;
}
- bch2_trans_iter_put(trans, copy);
+ bch2_trans_iter_exit(trans, &copy);
return ret < 0 ? ret : 0;
}
-int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter)
+int bch2_extent_trim_atomic(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_i *k)
{
struct bpos end;
int ret;
- ret = bch2_extent_atomic_end(iter, k, &end);
+ ret = bch2_extent_atomic_end(trans, iter, k, &end);
if (ret)
return ret;
bch2_cut_back(end, k);
return 0;
}
-
-int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter)
-{
- struct bpos end;
- int ret;
-
- ret = bch2_extent_atomic_end(iter, k, &end);
- if (ret)
- return ret;
-
- return !bkey_cmp(end, k->k.p);
-}
#include "bcachefs.h"
-int bch2_extent_atomic_end(struct btree_iter *, struct bkey_i *,
- struct bpos *);
-int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *);
-int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *);
+int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *,
+ struct bkey_i *, struct bpos *);
+int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *,
+ struct bkey_i *);
#endif /* _BCACHEFS_EXTENT_UPDATE_H */
if (lp.crc.csum_type &&
lp.crc.uncompressed_size +
- rp.crc.uncompressed_size > c->sb.encoded_extent_max)
+ rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9))
return false;
if (lp.crc.uncompressed_size + rp.crc.uncompressed_size >
bkey_for_each_ptr_decode(&k->k, ptrs, p, i)
if (can_narrow_crc(p.crc, n)) {
- bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr);
+ __bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr);
p.ptr.offset += p.crc.offset;
p.crc = n;
bch2_extent_ptr_decoded_append(k, &p);
return false;
}
-bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
- unsigned nr_replicas, bool compressed)
-{
- struct btree_trans trans;
- struct btree_iter *iter;
- struct bpos end = pos;
- struct bkey_s_c k;
- bool ret = true;
- int err;
-
- end.offset += size;
-
- bch2_trans_init(&trans, c, 0, 0);
-
- for_each_btree_key(&trans, iter, BTREE_ID_extents, pos,
- BTREE_ITER_SLOTS, k, err) {
- if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
- break;
-
- if (nr_replicas > bch2_bkey_replicas(c, k) ||
- (!compressed && bch2_bkey_sectors_compressed(k))) {
- ret = false;
- break;
- }
- }
- bch2_trans_iter_put(&trans, iter);
-
- bch2_trans_exit(&trans);
-
- return ret;
-}
-
unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
return i;
}
-union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
- struct bch_extent_ptr *ptr)
+static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
+{
+ union bch_extent_entry *next = extent_entry_next(entry);
+
+ /* stripes have ptrs, but their layout doesn't work with this code */
+ BUG_ON(k.k->type == KEY_TYPE_stripe);
+
+ memmove_u64s_down(entry, next,
+ (u64 *) bkey_val_end(k) - (u64 *) next);
+ k.k->u64s -= (u64 *) next - (u64 *) entry;
+}
+
+/*
+ * Returns pointer to the next entry after the one being dropped:
+ */
+union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s k,
+ struct bch_extent_ptr *ptr)
{
struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
- union bch_extent_entry *dst, *src, *prev;
+ union bch_extent_entry *entry = to_entry(ptr), *next;
+ union bch_extent_entry *ret = entry;
bool drop_crc = true;
EBUG_ON(ptr < &ptrs.start->ptr ||
ptr >= &ptrs.end->ptr);
EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
- src = extent_entry_next(to_entry(ptr));
- if (src != ptrs.end &&
- !extent_entry_is_crc(src))
- drop_crc = false;
-
- dst = to_entry(ptr);
- while ((prev = extent_entry_prev(ptrs, dst))) {
- if (extent_entry_is_ptr(prev))
+ for (next = extent_entry_next(entry);
+ next != ptrs.end;
+ next = extent_entry_next(next)) {
+ if (extent_entry_is_crc(next)) {
break;
-
- if (extent_entry_is_crc(prev)) {
- if (drop_crc)
- dst = prev;
+ } else if (extent_entry_is_ptr(next)) {
+ drop_crc = false;
break;
}
+ }
- dst = prev;
+ extent_entry_drop(k, entry);
+
+ while ((entry = extent_entry_prev(ptrs, entry))) {
+ if (extent_entry_is_ptr(entry))
+ break;
+
+ if ((extent_entry_is_crc(entry) && drop_crc) ||
+ extent_entry_is_stripe_ptr(entry)) {
+ ret = (void *) ret - extent_entry_bytes(entry);
+ extent_entry_drop(k, entry);
+ }
}
- memmove_u64s_down(dst, src,
- (u64 *) ptrs.end - (u64 *) src);
- k.k->u64s -= (u64 *) src - (u64 *) dst;
+ return ret;
+}
+
+union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
+ struct bch_extent_ptr *ptr)
+{
+ bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr;
+ union bch_extent_entry *ret =
+ __bch2_bkey_drop_ptr(k, ptr);
+
+ /*
+ * If we deleted all the dirty pointers and there's still cached
+ * pointers, we could set the cached pointers to dirty if they're not
+ * stale - but to do that correctly we'd need to grab an open_bucket
+ * reference so that we don't race with bucket reuse:
+ */
+ if (have_dirty &&
+ !bch2_bkey_dirty_devs(k.s_c).nr) {
+ k.k->type = KEY_TYPE_error;
+ set_bkey_val_u64s(k.k, 0);
+ ret = NULL;
+ } else if (!bch2_bkey_nr_ptrs(k.s_c)) {
+ k.k->type = KEY_TYPE_deleted;
+ set_bkey_val_u64s(k.k, 0);
+ ret = NULL;
+ }
- return dst;
+ return ret;
}
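
extent_entry_drop() compacts the key in place: everything after the dropped
entry slides down by the entry's size in u64s and the key's u64 count shrinks
to match. The same move in a freestanding toy (entry sizes are invented; real
extent entries are variable-length):

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t val[6] = { 1, 2, 3, 4, 5, 6 };
        unsigned u64s = 6;

        uint64_t *entry = &val[2];  /* entry being dropped: val[2..3] */
        uint64_t *next  = &val[4];  /* extent_entry_next(entry) */

        /* the memmove_u64s_down() step: */
        memmove(entry, next, (val + u64s - next) * sizeof(*val));
        u64s -= next - entry;

        for (unsigned i = 0; i < u64s; i++)
            printf("%llu ", (unsigned long long)val[i]);
        printf("\n");               /* 1 2 5 6 */
        return 0;
    }
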
void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
ptr->cached &&
ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr));
- /* will only happen if all pointers were cached: */
- if (!bch2_bkey_nr_ptrs(k.s_c))
- k.k->type = KEY_TYPE_deleted;
-
return bkey_deleted(k.k);
}
case BCH_EXTENT_ENTRY_crc128:
crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
- pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %u compress %u",
+ pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s",
crc.compressed_size,
crc.uncompressed_size,
crc.offset, crc.nonce,
- crc.csum_type,
- crc.compression_type);
+ bch2_csum_types[crc.csum_type],
+ bch2_compression_types[crc.compression_type]);
break;
case BCH_EXTENT_ENTRY_stripe_ptr:
ec = &entry->stripe_ptr;
if (k.k->type == KEY_TYPE_btree_ptr ||
k.k->type == KEY_TYPE_btree_ptr_v2)
- size_ondisk = c->opts.btree_node_size;
+ size_ondisk = btree_sectors(c);
bkey_extent_entry_for_each(ptrs, entry) {
if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
{
- switch (extent_entry_type(e)) {
- case BCH_EXTENT_ENTRY_ptr:
- return true;
- default:
- return false;
- }
+ return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
+}
+
+static inline bool extent_entry_is_stripe_ptr(const union bch_extent_entry *e)
+{
+ return extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr;
}
static inline bool extent_entry_is_crc(const union bch_extent_entry *e)
unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c);
bool bch2_bkey_is_incompressible(struct bkey_s_c);
unsigned bch2_bkey_sectors_compressed(struct bkey_s_c);
-bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned, bool);
unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c);
unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr);
void bch2_extent_ptr_decoded_append(struct bkey_i *,
struct extent_ptr_decoded *);
+union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s,
+ struct bch_extent_ptr *);
union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s,
struct bch_extent_ptr *);
*
* With one based indexing each level of the tree starts at a power of two -
* good for cacheline alignment:
- *
- * Size parameter is treated as if we were using 0 based indexing, however:
- * valid nodes, and inorder indices, are in the range [1..size) - that is, there
- * are actually size - 1 elements
*/
static inline unsigned eytzinger1_child(unsigned i, unsigned child)
static inline unsigned eytzinger1_first(unsigned size)
{
- return rounddown_pow_of_two(size - 1);
+ return rounddown_pow_of_two(size);
}
static inline unsigned eytzinger1_last(unsigned size)
{
- return rounddown_pow_of_two(size) - 1;
+ return rounddown_pow_of_two(size + 1) - 1;
}
/*
static inline unsigned eytzinger1_next(unsigned i, unsigned size)
{
- EBUG_ON(i >= size);
+ EBUG_ON(i > size);
- if (eytzinger1_right_child(i) < size) {
+ if (eytzinger1_right_child(i) <= size) {
i = eytzinger1_right_child(i);
- i <<= __fls(size) - __fls(i);
- i >>= i >= size;
+ i <<= __fls(size + 1) - __fls(i);
+ i >>= i > size;
} else {
i >>= ffz(i) + 1;
}
static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
{
- EBUG_ON(i >= size);
+ EBUG_ON(i > size);
- if (eytzinger1_left_child(i) < size) {
+ if (eytzinger1_left_child(i) <= size) {
i = eytzinger1_left_child(i) + 1;
- i <<= __fls(size) - __fls(i);
+ i <<= __fls(size + 1) - __fls(i);
i -= 1;
- i >>= i >= size;
+ i >>= i > size;
} else {
i >>= __ffs(i) + 1;
}
static inline unsigned eytzinger1_extra(unsigned size)
{
- return (size - rounddown_pow_of_two(size - 1)) << 1;
+ return (size + 1 - rounddown_pow_of_two(size)) << 1;
}
static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
unsigned extra)
{
unsigned b = __fls(i);
- unsigned shift = __fls(size - 1) - b;
+ unsigned shift = __fls(size) - b;
int s;
- EBUG_ON(!i || i >= size);
+ EBUG_ON(!i || i > size);
i ^= 1U << b;
i <<= 1;
unsigned shift;
int s;
- EBUG_ON(!i || i >= size);
+ EBUG_ON(!i || i > size);
/*
* sign bit trick:
shift = __ffs(i);
i >>= shift + 1;
- i |= 1U << (__fls(size - 1) - shift);
+ i |= 1U << (__fls(size) - shift);
return i;
}
static inline unsigned eytzinger0_first(unsigned size)
{
- return eytzinger1_first(size + 1) - 1;
+ return eytzinger1_first(size) - 1;
}
static inline unsigned eytzinger0_last(unsigned size)
{
- return eytzinger1_last(size + 1) - 1;
+ return eytzinger1_last(size) - 1;
}
static inline unsigned eytzinger0_next(unsigned i, unsigned size)
{
- return eytzinger1_next(i + 1, size + 1) - 1;
+ return eytzinger1_next(i + 1, size) - 1;
}
static inline unsigned eytzinger0_prev(unsigned i, unsigned size)
{
- return eytzinger1_prev(i + 1, size + 1) - 1;
+ return eytzinger1_prev(i + 1, size) - 1;
}
static inline unsigned eytzinger0_extra(unsigned size)
{
- return eytzinger1_extra(size + 1);
+ return eytzinger1_extra(size);
}
static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size,
unsigned extra)
{
- return __eytzinger1_to_inorder(i + 1, size + 1, extra) - 1;
+ return __eytzinger1_to_inorder(i + 1, size, extra) - 1;
}
static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size,
unsigned extra)
{
- return __inorder_to_eytzinger1(i + 1, size + 1, extra) - 1;
+ return __inorder_to_eytzinger1(i + 1, size, extra) - 1;
}
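
Under the new convention, size is simply the number of elements and the valid
one-based indices are 1..size inclusive - hence the boundary checks loosening
from i >= size to i > size, and the first in-order element becoming
rounddown_pow_of_two(size). A freestanding check of the traversal order for a
six-element tree (__fls and ffz replaced with GCC builtins):

    #include <stdio.h>

    static unsigned fls_(unsigned x) { return 31 - __builtin_clz(x); } /* __fls */
    static unsigned ffz_(unsigned x) { return __builtin_ctz(~x); }     /* ffz */

    static unsigned first(unsigned size)
    {
        return 1U << fls_(size);     /* rounddown_pow_of_two(size) */
    }

    static unsigned next(unsigned i, unsigned size)
    {
        if (2 * i + 1 <= size) {     /* eytzinger1_right_child(i) */
            i = 2 * i + 1;
            i <<= fls_(size + 1) - fls_(i);
            i >>= i > size;
        } else {
            i >>= ffz_(i) + 1;
        }
        return i;
    }

    int main(void)
    {
        unsigned size = 6, i;

        /* in-order walk of indices 1..6: expect 4 2 5 1 6 3 */
        for (i = first(size); i; i = next(i, size))
            printf("%u ", i);
        printf("\n");
        return 0;
    }
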
static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size)
#include "dirent.h"
#include "fs-common.h"
#include "inode.h"
+#include "subvolume.h"
#include "xattr.h"
#include <linux/posix_acl.h>
-int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
+static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode)
+{
+ return S_ISDIR(inode->bi_mode) && !inode->bi_subvol;
+}
+
+int bch2_create_trans(struct btree_trans *trans,
+ subvol_inum dir,
struct bch_inode_unpacked *dir_u,
struct bch_inode_unpacked *new_inode,
const struct qstr *name,
uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
struct posix_acl *default_acl,
- struct posix_acl *acl)
+ struct posix_acl *acl,
+ subvol_inum snapshot_src,
+ unsigned flags)
{
struct bch_fs *c = trans->c;
- struct btree_iter *dir_iter = NULL;
- struct btree_iter *inode_iter = NULL;
- struct bch_hash_info hash = bch2_hash_info_init(c, new_inode);
+ struct btree_iter dir_iter = { NULL };
+ struct btree_iter inode_iter = { NULL };
+ subvol_inum new_inum = dir;
u64 now = bch2_current_time(c);
u64 cpu = raw_smp_processor_id();
- u64 dir_offset = 0;
+ u64 dir_target;
+ u32 snapshot;
+ unsigned dir_type = mode_to_type(mode);
int ret;
- dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(dir_iter);
+ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
if (ret)
goto err;
- bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
-
- if (!name)
- new_inode->bi_flags |= BCH_INODE_UNLINKED;
-
- inode_iter = bch2_inode_create(trans, new_inode, U32_MAX, cpu);
- ret = PTR_ERR_OR_ZERO(inode_iter);
+ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
if (ret)
goto err;
- if (default_acl) {
- ret = bch2_set_acl_trans(trans, new_inode, &hash,
- default_acl, ACL_TYPE_DEFAULT);
+ if (!(flags & BCH_CREATE_SNAPSHOT)) {
+ /* Normal create path - allocate a new inode: */
+ bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
+
+ if (flags & BCH_CREATE_TMPFILE)
+ new_inode->bi_flags |= BCH_INODE_UNLINKED;
+
+ ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu);
if (ret)
goto err;
+
+ snapshot_src = (subvol_inum) { 0 };
+ } else {
+ /*
+ * Creating a snapshot - we're not allocating a new inode, but
+ * we do have to lookup the root inode of the subvolume we're
+ * snapshotting and update it (in the new snapshot):
+ */
+
+ if (!snapshot_src.inum) {
+ /* Inode wasn't specified, just snapshot: */
+ struct bch_subvolume s;
+
+ ret = bch2_subvolume_get(trans, snapshot_src.subvol, true,
+ BTREE_ITER_CACHED, &s);
+ if (ret)
+ goto err;
+
+ snapshot_src.inum = le64_to_cpu(s.inode);
+ }
+
+ ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src,
+ BTREE_ITER_INTENT);
+ if (ret)
+ goto err;
+
+ if (new_inode->bi_subvol != snapshot_src.subvol) {
+ /* Not a subvolume root: */
+ ret = -EINVAL;
+ goto err;
+ }
+
+ /*
+ * If we're not root, we have to own the subvolume being
+ * snapshotted:
+ */
+ if (uid && new_inode->bi_uid != uid) {
+ ret = -EPERM;
+ goto err;
+ }
+
+ flags |= BCH_CREATE_SUBVOL;
}
- if (acl) {
- ret = bch2_set_acl_trans(trans, new_inode, &hash,
- acl, ACL_TYPE_ACCESS);
+ new_inum.inum = new_inode->bi_inum;
+ dir_target = new_inode->bi_inum;
+
+ if (flags & BCH_CREATE_SUBVOL) {
+ u32 new_subvol, dir_snapshot;
+
+ ret = bch2_subvolume_create(trans, new_inode->bi_inum,
+ snapshot_src.subvol,
+ &new_subvol, &snapshot,
+ (flags & BCH_CREATE_SNAPSHOT_RO) != 0);
+ if (ret)
+ goto err;
+
+ new_inode->bi_parent_subvol = dir.subvol;
+ new_inode->bi_subvol = new_subvol;
+ new_inum.subvol = new_subvol;
+ dir_target = new_subvol;
+ dir_type = DT_SUBVOL;
+
+ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &dir_snapshot);
+ if (ret)
+ goto err;
+
+ bch2_btree_iter_set_snapshot(&dir_iter, dir_snapshot);
+ ret = bch2_btree_iter_traverse(&dir_iter);
if (ret)
goto err;
}
- if (name) {
+ if (!(flags & BCH_CREATE_SNAPSHOT)) {
+ if (default_acl) {
+ ret = bch2_set_acl_trans(trans, new_inum, new_inode,
+ default_acl, ACL_TYPE_DEFAULT);
+ if (ret)
+ goto err;
+ }
+
+ if (acl) {
+ ret = bch2_set_acl_trans(trans, new_inum, new_inode,
+ acl, ACL_TYPE_ACCESS);
+ if (ret)
+ goto err;
+ }
+ }
+
+ if (!(flags & BCH_CREATE_TMPFILE)) {
struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u);
- dir_u->bi_mtime = dir_u->bi_ctime = now;
+ u64 dir_offset;
- if (S_ISDIR(new_inode->bi_mode))
+ if (is_subdir_for_nlink(new_inode))
dir_u->bi_nlink++;
+ dir_u->bi_mtime = dir_u->bi_ctime = now;
- ret = bch2_inode_write(trans, dir_iter, dir_u);
+ ret = bch2_inode_write(trans, &dir_iter, dir_u);
if (ret)
goto err;
- ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
- mode_to_type(new_inode->bi_mode),
- name, new_inode->bi_inum,
+ ret = bch2_dirent_create(trans, dir, &dir_hash,
+ dir_type,
+ name,
+ dir_target,
&dir_offset,
BCH_HASH_SET_MUST_CREATE);
if (ret)
goto err;
- }
- if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
- new_inode->bi_dir = dir_u->bi_inum;
- new_inode->bi_dir_offset = dir_offset;
+ if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
+ new_inode->bi_dir = dir_u->bi_inum;
+ new_inode->bi_dir_offset = dir_offset;
+ }
}
- /* XXX use bch2_btree_iter_set_snapshot() */
- inode_iter->snapshot = U32_MAX;
- bch2_btree_iter_set_pos(inode_iter, SPOS(0, new_inode->bi_inum, U32_MAX));
+ inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
+ bch2_btree_iter_set_snapshot(&inode_iter, snapshot);
- ret = bch2_btree_iter_traverse(inode_iter) ?:
- bch2_inode_write(trans, inode_iter, new_inode);
+ ret = bch2_btree_iter_traverse(&inode_iter) ?:
+ bch2_inode_write(trans, &inode_iter, new_inode);
err:
- bch2_trans_iter_put(trans, inode_iter);
- bch2_trans_iter_put(trans, dir_iter);
+ bch2_trans_iter_exit(trans, &inode_iter);
+ bch2_trans_iter_exit(trans, &dir_iter);
return ret;
}
-int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
- u64 inum, struct bch_inode_unpacked *dir_u,
- struct bch_inode_unpacked *inode_u, const struct qstr *name)
+int bch2_link_trans(struct btree_trans *trans,
+ subvol_inum dir, struct bch_inode_unpacked *dir_u,
+ subvol_inum inum, struct bch_inode_unpacked *inode_u,
+ const struct qstr *name)
{
struct bch_fs *c = trans->c;
- struct btree_iter *dir_iter = NULL, *inode_iter = NULL;
+ struct btree_iter dir_iter = { NULL };
+ struct btree_iter inode_iter = { NULL };
struct bch_hash_info dir_hash;
u64 now = bch2_current_time(c);
u64 dir_offset = 0;
int ret;
- inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(inode_iter);
+ if (dir.subvol != inum.subvol)
+ return -EXDEV;
+
+ ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT);
if (ret)
goto err;
inode_u->bi_ctime = now;
bch2_inode_nlink_inc(inode_u);
- dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0);
- ret = PTR_ERR_OR_ZERO(dir_iter);
+ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
if (ret)
goto err;
dir_hash = bch2_hash_info_init(c, dir_u);
- ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
+ ret = bch2_dirent_create(trans, dir, &dir_hash,
mode_to_type(inode_u->bi_mode),
- name, inum, &dir_offset,
+ name, inum.inum, &dir_offset,
BCH_HASH_SET_MUST_CREATE);
if (ret)
goto err;
if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
- inode_u->bi_dir = dir_inum;
+ inode_u->bi_dir = dir.inum;
inode_u->bi_dir_offset = dir_offset;
}
- ret = bch2_inode_write(trans, dir_iter, dir_u) ?:
- bch2_inode_write(trans, inode_iter, inode_u);
+ ret = bch2_inode_write(trans, &dir_iter, dir_u) ?:
+ bch2_inode_write(trans, &inode_iter, inode_u);
err:
- bch2_trans_iter_put(trans, dir_iter);
- bch2_trans_iter_put(trans, inode_iter);
+ bch2_trans_iter_exit(trans, &dir_iter);
+ bch2_trans_iter_exit(trans, &inode_iter);
return ret;
}
int bch2_unlink_trans(struct btree_trans *trans,
- u64 dir_inum, struct bch_inode_unpacked *dir_u,
+ subvol_inum dir,
+ struct bch_inode_unpacked *dir_u,
struct bch_inode_unpacked *inode_u,
- const struct qstr *name)
+ const struct qstr *name,
+ bool deleting_snapshot)
{
struct bch_fs *c = trans->c;
- struct btree_iter *dir_iter = NULL, *dirent_iter = NULL,
- *inode_iter = NULL;
+ struct btree_iter dir_iter = { NULL };
+ struct btree_iter dirent_iter = { NULL };
+ struct btree_iter inode_iter = { NULL };
struct bch_hash_info dir_hash;
- u64 inum, now = bch2_current_time(c);
+ subvol_inum inum;
+ u64 now = bch2_current_time(c);
struct bkey_s_c k;
int ret;
- dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(dir_iter);
+ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
if (ret)
goto err;
dir_hash = bch2_hash_info_init(c, dir_u);
- dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash,
- name, BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(dirent_iter);
+ ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
+ name, &inum, BTREE_ITER_INTENT);
if (ret)
goto err;
- k = bch2_btree_iter_peek_slot(dirent_iter);
- ret = bkey_err(k);
+ ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum,
+ BTREE_ITER_INTENT);
if (ret)
goto err;
- inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum);
+ if (!deleting_snapshot && S_ISDIR(inode_u->bi_mode)) {
+ ret = bch2_empty_dir_trans(trans, inum);
+ if (ret)
+ goto err;
+ }
- inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(inode_iter);
- if (ret)
+ if (deleting_snapshot && !inode_u->bi_subvol) {
+ ret = -ENOENT;
goto err;
+ }
+
+ if (deleting_snapshot || inode_u->bi_subvol) {
+ ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol);
+ if (ret)
+ goto err;
- if (inode_u->bi_dir == k.k->p.inode &&
- inode_u->bi_dir_offset == k.k->p.offset) {
+ k = bch2_btree_iter_peek_slot(&dirent_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ /*
+ * If we're deleting a subvolume, we need to really delete the
+ * dirent, not just emit a whiteout in the current snapshot:
+ */
+ bch2_btree_iter_set_snapshot(&dirent_iter, k.k->p.snapshot);
+ ret = bch2_btree_iter_traverse(&dirent_iter);
+ if (ret)
+ goto err;
+ } else {
+ bch2_inode_nlink_dec(inode_u);
+ }
+
+ if (inode_u->bi_dir == dirent_iter.pos.inode &&
+ inode_u->bi_dir_offset == dirent_iter.pos.offset) {
inode_u->bi_dir = 0;
inode_u->bi_dir_offset = 0;
}
dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now;
- dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode);
- bch2_inode_nlink_dec(inode_u);
-
- ret = (S_ISDIR(inode_u->bi_mode)
- ? bch2_empty_dir_trans(trans, inum)
- : 0) ?:
- bch2_dirent_delete_at(trans, &dir_hash, dirent_iter) ?:
- bch2_inode_write(trans, dir_iter, dir_u) ?:
- bch2_inode_write(trans, inode_iter, inode_u);
+ dir_u->bi_nlink -= is_subdir_for_nlink(inode_u);
+
+ ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
+ &dir_hash, &dirent_iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ bch2_inode_write(trans, &dir_iter, dir_u) ?:
+ bch2_inode_write(trans, &inode_iter, inode_u);
err:
- bch2_trans_iter_put(trans, inode_iter);
- bch2_trans_iter_put(trans, dirent_iter);
- bch2_trans_iter_put(trans, dir_iter);
+ bch2_trans_iter_exit(trans, &inode_iter);
+ bch2_trans_iter_exit(trans, &dirent_iter);
+ bch2_trans_iter_exit(trans, &dir_iter);
return ret;
}
bool ret = false;
for (id = 0; id < Inode_opt_nr; id++) {
+ /* Skip attributes that were explicitly set on this inode */
if (dst_u->bi_fields_set & (1 << id))
continue;
}
int bch2_rename_trans(struct btree_trans *trans,
- u64 src_dir, struct bch_inode_unpacked *src_dir_u,
- u64 dst_dir, struct bch_inode_unpacked *dst_dir_u,
+ subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u,
+ subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u,
struct bch_inode_unpacked *src_inode_u,
struct bch_inode_unpacked *dst_inode_u,
const struct qstr *src_name,
enum bch_rename_mode mode)
{
struct bch_fs *c = trans->c;
- struct btree_iter *src_dir_iter = NULL, *dst_dir_iter = NULL;
- struct btree_iter *src_inode_iter = NULL, *dst_inode_iter = NULL;
+ struct btree_iter src_dir_iter = { NULL };
+ struct btree_iter dst_dir_iter = { NULL };
+ struct btree_iter src_inode_iter = { NULL };
+ struct btree_iter dst_inode_iter = { NULL };
struct bch_hash_info src_hash, dst_hash;
- u64 src_inode, src_offset, dst_inode, dst_offset;
+ subvol_inum src_inum, dst_inum;
+ u64 src_offset, dst_offset;
u64 now = bch2_current_time(c);
int ret;
- src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir,
- BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(src_dir_iter);
+ ret = bch2_inode_peek(trans, &src_dir_iter, src_dir_u, src_dir,
+ BTREE_ITER_INTENT);
if (ret)
goto err;
src_hash = bch2_hash_info_init(c, src_dir_u);
- if (dst_dir != src_dir) {
- dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir,
- BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(dst_dir_iter);
+ if (dst_dir.inum != src_dir.inum ||
+ dst_dir.subvol != src_dir.subvol) {
+ ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir,
+ BTREE_ITER_INTENT);
if (ret)
goto err;
ret = bch2_dirent_rename(trans,
src_dir, &src_hash,
dst_dir, &dst_hash,
- src_name, &src_inode, &src_offset,
- dst_name, &dst_inode, &dst_offset,
+ src_name, &src_inum, &src_offset,
+ dst_name, &dst_inum, &dst_offset,
mode);
if (ret)
goto err;
- src_inode_iter = bch2_inode_peek(trans, src_inode_u, src_inode,
- BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(src_inode_iter);
+ ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum,
+ BTREE_ITER_INTENT);
if (ret)
goto err;
- if (dst_inode) {
- dst_inode_iter = bch2_inode_peek(trans, dst_inode_u, dst_inode,
- BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(dst_inode_iter);
+ if (dst_inum.inum) {
+ ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum,
+ BTREE_ITER_INTENT);
if (ret)
goto err;
}
}
if (S_ISDIR(dst_inode_u->bi_mode) &&
- bch2_empty_dir_trans(trans, dst_inode)) {
+ bch2_empty_dir_trans(trans, dst_inum)) {
ret = -ENOTEMPTY;
goto err;
}
goto err;
}
- if (S_ISDIR(src_inode_u->bi_mode)) {
+ if (is_subdir_for_nlink(src_inode_u)) {
src_dir_u->bi_nlink--;
dst_dir_u->bi_nlink++;
}
- if (dst_inode && S_ISDIR(dst_inode_u->bi_mode)) {
+ if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) {
dst_dir_u->bi_nlink--;
src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE;
}
src_dir_u->bi_mtime = now;
src_dir_u->bi_ctime = now;
- if (src_dir != dst_dir) {
+ if (src_dir.inum != dst_dir.inum) {
dst_dir_u->bi_mtime = now;
dst_dir_u->bi_ctime = now;
}
src_inode_u->bi_ctime = now;
- if (dst_inode)
+ if (dst_inum.inum)
dst_inode_u->bi_ctime = now;
- ret = bch2_inode_write(trans, src_dir_iter, src_dir_u) ?:
- (src_dir != dst_dir
- ? bch2_inode_write(trans, dst_dir_iter, dst_dir_u)
+ ret = bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?:
+ (src_dir.inum != dst_dir.inum
+ ? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u)
: 0 ) ?:
- bch2_inode_write(trans, src_inode_iter, src_inode_u) ?:
- (dst_inode
- ? bch2_inode_write(trans, dst_inode_iter, dst_inode_u)
+ bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?:
+ (dst_inum.inum
+ ? bch2_inode_write(trans, &dst_inode_iter, dst_inode_u)
: 0 );
err:
- bch2_trans_iter_put(trans, dst_inode_iter);
- bch2_trans_iter_put(trans, src_inode_iter);
- bch2_trans_iter_put(trans, dst_dir_iter);
- bch2_trans_iter_put(trans, src_dir_iter);
+ bch2_trans_iter_exit(trans, &dst_inode_iter);
+ bch2_trans_iter_exit(trans, &src_inode_iter);
+ bch2_trans_iter_exit(trans, &dst_dir_iter);
+ bch2_trans_iter_exit(trans, &src_dir_iter);
return ret;
}
struct posix_acl;
-int bch2_create_trans(struct btree_trans *, u64,
+#define BCH_CREATE_TMPFILE (1U << 0)
+#define BCH_CREATE_SUBVOL (1U << 1)
+#define BCH_CREATE_SNAPSHOT (1U << 2)
+#define BCH_CREATE_SNAPSHOT_RO (1U << 3)
+
+int bch2_create_trans(struct btree_trans *, subvol_inum,
struct bch_inode_unpacked *,
struct bch_inode_unpacked *,
const struct qstr *,
uid_t, gid_t, umode_t, dev_t,
struct posix_acl *,
- struct posix_acl *);
+ struct posix_acl *,
+ subvol_inum, unsigned);
-int bch2_link_trans(struct btree_trans *, u64,
- u64, struct bch_inode_unpacked *,
- struct bch_inode_unpacked *,
+int bch2_link_trans(struct btree_trans *,
+ subvol_inum, struct bch_inode_unpacked *,
+ subvol_inum, struct bch_inode_unpacked *,
const struct qstr *);
-int bch2_unlink_trans(struct btree_trans *,
- u64, struct bch_inode_unpacked *,
+int bch2_unlink_trans(struct btree_trans *, subvol_inum,
+ struct bch_inode_unpacked *,
struct bch_inode_unpacked *,
- const struct qstr *);
+ const struct qstr *, bool);
int bch2_rename_trans(struct btree_trans *,
- u64, struct bch_inode_unpacked *,
- u64, struct bch_inode_unpacked *,
+ subvol_inum, struct bch_inode_unpacked *,
+ subvol_inum, struct bch_inode_unpacked *,
struct bch_inode_unpacked *,
struct bch_inode_unpacked *,
const struct qstr *,
return;
mutex_lock(&inode->ei_quota_lock);
+ BUG_ON((s64) inode->v.i_blocks + sectors < 0);
+ inode->v.i_blocks += sectors;
+
#ifdef CONFIG_BCACHEFS_QUOTA
if (quota_res && sectors > 0) {
BUG_ON(sectors > quota_res->sectors);
bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN);
}
#endif
- inode->v.i_blocks += sectors;
mutex_unlock(&inode->ei_quota_lock);
}
/* stored in page->private: */
struct bch_page_sector {
- /* Uncompressed, fully allocated replicas: */
- unsigned nr_replicas:3;
+ /* Uncompressed, fully allocated replicas (or on-disk reservation): */
+ unsigned nr_replicas:4;
- /* Owns PAGE_SECTORS * replicas_reserved sized reservation: */
- unsigned replicas_reserved:3;
+ /* Owns a PAGE_SECTORS * replicas_reserved sized in-memory reservation: */
+ unsigned replicas_reserved:4;
/* i_sectors: */
enum {
SECTOR_UNALLOCATED,
SECTOR_RESERVED,
SECTOR_DIRTY,
+ SECTOR_DIRTY_RESERVED,
SECTOR_ALLOCATED,
- } state:2;
+ } state:8;
};
struct bch_page_state {
spinlock_t lock;
atomic_t write_count;
+ bool uptodate;
struct bch_page_sector s[PAGE_SECTORS];
};
return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp);
}
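+/* Map an extent key to the page sector state it implies: */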
+static unsigned bkey_to_sector_state(const struct bkey *k)
+{
+ if (k->type == KEY_TYPE_reservation)
+ return SECTOR_RESERVED;
+ if (bkey_extent_is_allocation(k))
+ return SECTOR_ALLOCATED;
+ return SECTOR_UNALLOCATED;
+}
+
+static void __bch2_page_state_set(struct page *page,
+ unsigned pg_offset, unsigned pg_len,
+ unsigned nr_ptrs, unsigned state)
+{
+ struct bch_page_state *s = bch2_page_state_create(page, __GFP_NOFAIL);
+ unsigned i;
+
+ BUG_ON(pg_offset >= PAGE_SECTORS);
+ BUG_ON(pg_offset + pg_len > PAGE_SECTORS);
+
+ spin_lock(&s->lock);
+
+ for (i = pg_offset; i < pg_offset + pg_len; i++) {
+ s->s[i].nr_replicas = nr_ptrs;
+ s->s[i].state = state;
+ }
+
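+ /*
+ * If the range we initialized runs to the end of the page, the page's
+ * state is now up to date:
+ */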
+ if (i == PAGE_SECTORS)
+ s->uptodate = true;
+
+ spin_unlock(&s->lock);
+}
+
+static int bch2_page_state_set(struct bch_fs *c, subvol_inum inum,
+ struct page **pages, unsigned nr_pages)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 offset = pages[0]->index << PAGE_SECTORS_SHIFT;
+ unsigned pg_idx = 0;
+ u32 snapshot;
+ int ret;
+
+ bch2_trans_init(&trans, c, 0, 0);
+retry:
+ bch2_trans_begin(&trans);
+
+ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
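+ /*
+ * Walk the extents covering @pages, seeding each page's per-sector
+ * state (replica counts and allocated/reserved/unallocated) from the
+ * keys we find:
+ */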
+ for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
+ SPOS(inum.inum, offset, snapshot),
+ BTREE_ITER_SLOTS, k, ret) {
+ unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
+ unsigned state = bkey_to_sector_state(k.k);
+
+ while (pg_idx < nr_pages) {
+ struct page *page = pages[pg_idx];
+ u64 pg_start = page->index << PAGE_SECTORS_SHIFT;
+ u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT;
+ unsigned pg_offset = max(bkey_start_offset(k.k), pg_start) - pg_start;
+ unsigned pg_len = min(k.k->p.offset, pg_end) - pg_offset - pg_start;
+
+ BUG_ON(k.k->p.offset < pg_start);
+ BUG_ON(bkey_start_offset(k.k) > pg_end);
+
+ if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate)
+ __bch2_page_state_set(page, pg_offset, pg_len, nr_ptrs, state);
+
+ if (k.k->p.offset < pg_end)
+ break;
+ pg_idx++;
+ }
+
+ if (pg_idx == nr_pages)
+ break;
+ }
+
+ offset = iter.pos.offset;
+ bch2_trans_iter_exit(&trans, &iter);
+err:
+ if (ret == -EINTR)
+ goto retry;
+ bch2_trans_exit(&trans);
+
+ return ret;
+}
+
+static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k)
+{
+ struct bvec_iter iter;
+ struct bio_vec bv;
+ unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
+ ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
+ unsigned state = bkey_to_sector_state(k.k);
+
+ bio_for_each_segment(bv, bio, iter)
+ __bch2_page_state_set(bv.bv_page, bv.bv_offset >> 9,
+ bv.bv_len >> 9, nr_ptrs, state);
+}
+
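+/*
+ * Zero out the cached replica counts for pagecache in [start, end) (in
+ * sectors) - used when the underlying extents may have changed (e.g. after a
+ * reflink copy), so that writes recheck how many replicas need allocating:
+ */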
+static void mark_pagecache_unallocated(struct bch_inode_info *inode,
+ u64 start, u64 end)
+{
+ pgoff_t index = start >> PAGE_SECTORS_SHIFT;
+ pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
+ struct pagevec pvec;
+
+ if (end <= start)
+ return;
+
+ pagevec_init(&pvec);
+
+ do {
+ unsigned nr_pages, i, j;
+
+ nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping,
+ &index, end_index);
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+ u64 pg_start = page->index << PAGE_SECTORS_SHIFT;
+ u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT;
+ unsigned pg_offset = max(start, pg_start) - pg_start;
+ unsigned pg_len = min(end, pg_end) - pg_offset - pg_start;
+ struct bch_page_state *s;
+
+ BUG_ON(end <= pg_start);
+ BUG_ON(pg_offset >= PAGE_SECTORS);
+ BUG_ON(pg_offset + pg_len > PAGE_SECTORS);
+
+ lock_page(page);
+ s = bch2_page_state(page);
+
+ if (s) {
+ spin_lock(&s->lock);
+ for (j = pg_offset; j < pg_offset + pg_len; j++)
+ s->s[j].nr_replicas = 0;
+ spin_unlock(&s->lock);
+ }
+
+ unlock_page(page);
+ }
+ pagevec_release(&pvec);
+ } while (index <= end_index);
+}
+
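+/*
+ * Mark pagecache in [start, end) (in sectors) as covered by an on-disk
+ * reservation: dirty sectors become SECTOR_DIRTY_RESERVED, with i_blocks
+ * adjusted since the reservation now accounts for them:
+ */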
+static void mark_pagecache_reserved(struct bch_inode_info *inode,
+ u64 start, u64 end)
+{
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ pgoff_t index = start >> PAGE_SECTORS_SHIFT;
+ pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
+ struct pagevec pvec;
+ s64 i_sectors_delta = 0;
+
+ if (end <= start)
+ return;
+
+ pagevec_init(&pvec);
+
+ do {
+ unsigned nr_pages, i, j;
+
+ nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping,
+ &index, end_index);
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+ u64 pg_start = page->index << PAGE_SECTORS_SHIFT;
+ u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT;
+ unsigned pg_offset = max(start, pg_start) - pg_start;
+ unsigned pg_len = min(end, pg_end) - pg_offset - pg_start;
+ struct bch_page_state *s;
+
+ BUG_ON(end <= pg_start);
+ BUG_ON(pg_offset >= PAGE_SECTORS);
+ BUG_ON(pg_offset + pg_len > PAGE_SECTORS);
+
+ lock_page(page);
+ s = bch2_page_state(page);
+
+ if (s) {
+ spin_lock(&s->lock);
+ for (j = pg_offset; j < pg_offset + pg_len; j++)
+ switch (s->s[j].state) {
+ case SECTOR_UNALLOCATED:
+ s->s[j].state = SECTOR_RESERVED;
+ break;
+ case SECTOR_DIRTY:
+ s->s[j].state = SECTOR_DIRTY_RESERVED;
+ i_sectors_delta--;
+ break;
+ default:
+ break;
+ }
+ spin_unlock(&s->lock);
+ }
+
+ unlock_page(page);
+ }
+ pagevec_release(&pvec);
+ } while (index <= end_index);
+
+ i_sectors_acct(c, inode, NULL, i_sectors_delta);
+}
+
static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode)
{
/* XXX: this should not be open coded */
if (!s)
return -ENOMEM;
+ BUG_ON(!s->uptodate);
+
for (i = round_down(offset, block_bytes(c)) >> 9;
i < round_up(offset + len, block_bytes(c)) >> 9;
i++) {
disk_res.sectors += s->s[i].replicas_reserved;
s->s[i].replicas_reserved = 0;
- if (s->s[i].state == SECTOR_DIRTY) {
- dirty_sectors++;
+ switch (s->s[i].state) {
+ case SECTOR_DIRTY:
s->s[i].state = SECTOR_UNALLOCATED;
+ --dirty_sectors;
+ break;
+ case SECTOR_DIRTY_RESERVED:
+ s->s[i].state = SECTOR_RESERVED;
+ break;
+ default:
+ break;
}
}
bch2_disk_reservation_put(c, &disk_res);
- if (dirty_sectors)
- i_sectors_acct(c, inode, NULL, -dirty_sectors);
+ i_sectors_acct(c, inode, NULL, dirty_sectors);
bch2_page_state_release(page);
}
s->s[i].replicas_reserved += sectors;
res->disk.sectors -= sectors;
- if (s->s[i].state == SECTOR_UNALLOCATED)
+ switch (s->s[i].state) {
+ case SECTOR_UNALLOCATED:
+ s->s[i].state = SECTOR_DIRTY;
dirty_sectors++;
-
- s->s[i].state = max_t(unsigned, s->s[i].state, SECTOR_DIRTY);
+ break;
+ case SECTOR_RESERVED:
+ s->s[i].state = SECTOR_DIRTY_RESERVED;
+ break;
+ default:
+ break;
+ }
}
spin_unlock(&s->lock);
- if (dirty_sectors)
- i_sectors_acct(c, inode, &res->quota, dirty_sectors);
+ i_sectors_acct(c, inode, &res->quota, dirty_sectors);
if (!PageDirty(page))
__set_page_dirty_nobuffers(page);
struct bch2_page_reservation res;
unsigned len;
loff_t isize;
- int ret = VM_FAULT_LOCKED;
+ int ret;
bch2_page_reservation_init(c, inode, &res);
len = min_t(loff_t, PAGE_SIZE, isize - page_offset(page));
+ if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) {
+ if (bch2_page_state_set(c, inode_inum(inode), &page, 1)) {
+ unlock_page(page);
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ }
+
if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) {
unlock_page(page);
ret = VM_FAULT_SIGBUS;
bch2_page_reservation_put(c, inode, &res);
wait_for_stable_page(page);
+ ret = VM_FAULT_LOCKED;
out:
bch2_pagecache_add_put(&inode->ei_pagecache_lock);
sb_end_pagefault(inode->v.i_sb);
return iter->pages[iter->idx];
}
-static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k)
-{
- struct bvec_iter iter;
- struct bio_vec bv;
- unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
- ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
- unsigned state = k.k->type == KEY_TYPE_reservation
- ? SECTOR_RESERVED
- : SECTOR_ALLOCATED;
-
- bio_for_each_segment(bv, bio, iter) {
- struct bch_page_state *s = bch2_page_state(bv.bv_page);
- unsigned i;
-
- for (i = bv.bv_offset >> 9;
- i < (bv.bv_offset + bv.bv_len) >> 9;
- i++) {
- s->s[i].nr_replicas = nr_ptrs;
- s->s[i].state = state;
- }
- }
-}
-
static bool extent_partial_reads_expensive(struct bkey_s_c k)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
{
while (bio_sectors(bio) < sectors_this_extent &&
bio->bi_vcnt < bio->bi_max_vecs) {
- pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTOR_SHIFT;
+ pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT;
struct page *page = readpage_iter_next(iter);
int ret;
}
}
-static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter,
- struct bch_read_bio *rbio, u64 inum,
+static void bchfs_read(struct btree_trans *trans,
+ struct bch_read_bio *rbio,
+ subvol_inum inum,
struct readpages_iter *readpages_iter)
{
struct bch_fs *c = trans->c;
+ struct btree_iter iter;
struct bkey_buf sk;
int flags = BCH_READ_RETRY_IF_STALE|
BCH_READ_MAY_PROMOTE;
+ u32 snapshot;
int ret = 0;
rbio->c = c;
rbio->start_time = local_clock();
+ rbio->subvol = inum.subvol;
bch2_bkey_buf_init(&sk);
retry:
bch2_trans_begin(trans);
+ iter = (struct btree_iter) { NULL };
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+ SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot),
+ BTREE_ITER_SLOTS);
while (1) {
struct bkey_s_c k;
unsigned bytes, sectors, offset_into_extent;
break;
}
- bch2_btree_iter_set_pos(iter,
- POS(inum, rbio->bio.bi_iter.bi_sector));
+ bch2_btree_iter_set_pos(&iter,
+ POS(inum.inum, rbio->bio.bi_iter.bi_sector));
- k = bch2_btree_iter_peek_slot(iter);
+ k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
break;
- offset_into_extent = iter->pos.offset -
+ offset_into_extent = iter.pos.offset -
bkey_start_offset(k.k);
sectors = k.k->size - offset_into_extent;
sectors = min(sectors, k.k->size - offset_into_extent);
- bch2_trans_unlock(trans);
-
if (readpages_iter)
readpage_bio_extend(readpages_iter, &rbio->bio, sectors,
extent_partial_reads_expensive(k));
if (rbio->bio.bi_iter.bi_size == bytes)
flags |= BCH_READ_LAST_FRAGMENT;
- if (bkey_extent_is_allocation(k.k))
- bch2_add_page_sectors(&rbio->bio, k);
+ bch2_bio_page_state_set(&rbio->bio, k);
- bch2_read_extent(trans, rbio, iter->pos,
+ bch2_read_extent(trans, rbio, iter.pos,
data_btree, k, offset_into_extent, flags);
if (flags & BCH_READ_LAST_FRAGMENT)
swap(rbio->bio.bi_iter.bi_size, bytes);
bio_advance(&rbio->bio, bytes);
+
+ ret = btree_trans_too_many_iters(trans);
+ if (ret)
+ break;
}
+err:
+ bch2_trans_iter_exit(trans, &iter);
if (ret == -EINTR)
goto retry;
if (ret) {
- bch_err_inum_ratelimited(c, inum,
+ bch_err_inum_ratelimited(c, inum.inum,
"read error %i from btree lookup", ret);
rbio->bio.bi_status = BLK_STS_IOERR;
bio_endio(&rbio->bio);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
struct btree_trans trans;
- struct btree_iter *iter;
struct page *page;
struct readpages_iter readpages_iter;
int ret;
BUG_ON(ret);
bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN,
- BTREE_ITER_SLOTS);
bch2_pagecache_add_get(&inode->ei_pagecache_lock);
readpages_iter.idx++;
bio_set_op_attrs(&rbio->bio, REQ_OP_READ, 0);
- rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTOR_SHIFT;
+ rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTORS_SHIFT;
rbio->bio.bi_end_io = bch2_readpages_end_io;
BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));
- bchfs_read(&trans, iter, rbio, inode->v.i_ino,
+ bchfs_read(&trans, rbio, inode_inum(inode),
&readpages_iter);
}
bch2_pagecache_add_put(&inode->ei_pagecache_lock);
- bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
kfree(readpages_iter.pages);
}
static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
- u64 inum, struct page *page)
+ subvol_inum inum, struct page *page)
{
struct btree_trans trans;
- struct btree_iter *iter;
bch2_page_state_create(page, __GFP_NOFAIL);
bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC);
rbio->bio.bi_iter.bi_sector =
- (sector_t) page->index << PAGE_SECTOR_SHIFT;
+ (sector_t) page->index << PAGE_SECTORS_SHIFT;
BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));
bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN,
- BTREE_ITER_SLOTS);
-
- bchfs_read(&trans, iter, rbio, inum, NULL);
-
- bch2_trans_iter_put(&trans, iter);
+ bchfs_read(&trans, rbio, inum, NULL);
bch2_trans_exit(&trans);
}
rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), opts);
rbio->bio.bi_end_io = bch2_readpages_end_io;
- __bchfs_readpage(c, rbio, inode->v.i_ino, page);
+ __bchfs_readpage(c, rbio, inode_inum(inode), page);
return 0;
}
rbio->bio.bi_private = &done;
rbio->bio.bi_end_io = bch2_read_single_page_end_io;
- __bchfs_readpage(c, rbio, inode->v.i_ino, page);
+ __bchfs_readpage(c, rbio, inode_inum(inode), page);
wait_for_completion(&done);
ret = blk_status_to_errno(rbio->bio.bi_status);
* racing with fallocate can cause us to add fewer sectors than
* expected - but we shouldn't add more sectors than expected:
*/
- BUG_ON(io->op.i_sectors_delta > 0);
+ WARN_ON(io->op.i_sectors_delta > 0);
/*
* (error (due to going RO) halfway through a page can screw that up
op = &w->io->op;
bch2_write_op_init(op, c, w->opts);
op->target = w->opts.foreground_target;
- op_journal_seq_set(op, &inode->ei_journal_seq);
op->nr_replicas = nr_replicas;
op->res.nr_replicas = nr_replicas;
op->write_point = writepoint_hashed(inode->ei_last_dirtied);
+ op->subvol = inode->ei_subvol;
op->pos = POS(inode->v.i_ino, sector);
op->wbio.bio.bi_iter.bi_sector = sector;
op->wbio.bio.bi_opf = wbc_to_write_flags(wbc);
do_io:
s = bch2_page_state_create(page, __GFP_NOFAIL);
- ret = bch2_get_page_disk_reservation(c, inode, page, true);
- if (ret) {
- SetPageError(page);
- mapping_set_error(page->mapping, ret);
- unlock_page(page);
- return 0;
- }
+ /*
+ * Things get really hairy with errors during writeback:
+ */
+ ret = bch2_get_page_disk_reservation(c, inode, page, false);
+ BUG_ON(ret);
/* Before unlocking the page, get copy of reservations: */
+ spin_lock(&s->lock);
orig = *s;
+ spin_unlock(&s->lock);
for (i = 0; i < PAGE_SECTORS; i++) {
if (s->s[i].state < SECTOR_DIRTY)
offset = 0;
while (1) {
- unsigned sectors = 1, dirty_sectors = 0, reserved_sectors = 0;
+ unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0;
u64 sector;
while (offset < PAGE_SECTORS &&
if (offset == PAGE_SECTORS)
break;
- sector = ((u64) page->index << PAGE_SECTOR_SHIFT) + offset;
-
while (offset + sectors < PAGE_SECTORS &&
- orig.s[offset + sectors].state >= SECTOR_DIRTY)
+ orig.s[offset + sectors].state >= SECTOR_DIRTY) {
+ reserved_sectors += orig.s[offset + sectors].replicas_reserved;
+ dirty_sectors += orig.s[offset + sectors].state == SECTOR_DIRTY;
sectors++;
-
- for (i = offset; i < offset + sectors; i++) {
- reserved_sectors += orig.s[i].replicas_reserved;
- dirty_sectors += orig.s[i].state == SECTOR_DIRTY;
}
+ BUG_ON(!sectors);
+
+ sector = ((u64) page->index << PAGE_SECTORS_SHIFT) + offset;
if (w->io &&
(w->io->op.res.nr_replicas != nr_replicas_this_write ||
if (ret)
goto err;
out:
+ if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) {
+ ret = bch2_page_state_set(c, inode_inum(inode), &page, 1);
+ if (ret)
+ goto out;
+ }
+
ret = bch2_page_reservation_get(c, inode, page, res,
offset, len, true);
if (ret) {
}
while (reserved < len) {
- struct page *page = pages[(offset + reserved) >> PAGE_SHIFT];
+ unsigned i = (offset + reserved) >> PAGE_SHIFT;
+ struct page *page = pages[i];
unsigned pg_offset = (offset + reserved) & (PAGE_SIZE - 1);
unsigned pg_len = min_t(unsigned, len - reserved,
PAGE_SIZE - pg_offset);
-retry_reservation:
- ret = bch2_page_reservation_get(c, inode, page, &res,
- pg_offset, pg_len, true);
- if (ret && !PageUptodate(page)) {
- ret = bch2_read_single_page(page, mapping);
- if (!ret)
- goto retry_reservation;
+ if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) {
+ ret = bch2_page_state_set(c, inode_inum(inode),
+ pages + i, nr_pages - i);
+ if (ret)
+ goto out;
}
+ ret = bch2_page_reservation_get(c, inode, page, &res,
+ pg_offset, pg_len, true);
if (ret)
goto out;
unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1);
unsigned pg_len = min_t(unsigned, len - copied,
PAGE_SIZE - pg_offset);
- unsigned pg_copied = iov_iter_copy_from_user_atomic(page,
- iter, pg_offset, pg_len);
+ unsigned pg_copied = copy_page_from_iter_atomic(page,
+ pg_offset, pg_len, iter);
if (!pg_copied)
break;
}
flush_dcache_page(page);
- iov_iter_advance(iter, pg_copied);
copied += pg_copied;
if (pg_copied != pg_len)
if (iter->count)
closure_get(&dio->cl);
- bch2_read(c, rbio_init(bio, opts), inode->v.i_ino);
+ bch2_read(c, rbio_init(bio, opts), inode_inum(inode));
}
iter->count += shorten;
/* O_DIRECT writes */
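+/*
+ * Returns true if every extent in [offset, offset + size) is allocated with
+ * at least @nr_replicas replicas (and uncompressed, unless @compressed) -
+ * i.e. whether a dio overwrite can proceed without a new disk reservation:
+ */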
+static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum,
+ u64 offset, u64 size,
+ unsigned nr_replicas, bool compressed)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 end = offset + size;
+ u32 snapshot;
+ bool ret = true;
+ int err;
+
+ bch2_trans_init(&trans, c, 0, 0);
+retry:
+ bch2_trans_begin(&trans);
+
+ err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+ if (err)
+ goto err;
+
+ for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
+ SPOS(inum.inum, offset, snapshot),
+ BTREE_ITER_SLOTS, k, err) {
+ if (bkey_cmp(bkey_start_pos(k.k), POS(inum.inum, end)) >= 0)
+ break;
+
+ if (k.k->p.snapshot != snapshot ||
+ nr_replicas > bch2_bkey_replicas(c, k) ||
+ (!compressed && bch2_bkey_sectors_compressed(k))) {
+ ret = false;
+ break;
+ }
+ }
+
+ offset = iter.pos.offset;
+ bch2_trans_iter_exit(&trans, &iter);
+err:
+ if (err == -EINTR)
+ goto retry;
+ bch2_trans_exit(&trans);
+
+ return err ? false : ret;
+}
+
static void bch2_dio_write_loop_async(struct bch_write_op *);
static long bch2_dio_write_loop(struct dio_write *dio)
bch2_write_op_init(&dio->op, c, io_opts(c, &inode->ei_inode));
dio->op.end_io = bch2_dio_write_loop_async;
dio->op.target = dio->op.opts.foreground_target;
- op_journal_seq_set(&dio->op, &inode->ei_journal_seq);
dio->op.write_point = writepoint_hashed((unsigned long) current);
dio->op.nr_replicas = dio->op.opts.data_replicas;
+ dio->op.subvol = inode->ei_subvol;
dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
if ((req->ki_flags & IOCB_DSYNC) &&
ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio),
dio->op.opts.data_replicas, 0);
if (unlikely(ret) &&
- !bch2_check_range_allocated(c, dio->op.pos,
- bio_sectors(bio),
+ !bch2_check_range_allocated(c, inode_inum(inode),
+ dio->op.pos.offset, bio_sectors(bio),
dio->op.opts.data_replicas,
dio->op.opts.compression != 0))
goto err;
/* fsync: */
-int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+/*
+ * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an
+ * insert trigger: look up the btree inode instead
+ */
+static int bch2_flush_inode(struct bch_fs *c, subvol_inum inum)
{
- struct bch_inode_info *inode = file_bch_inode(file);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- int ret, ret2;
+ struct bch_inode_unpacked inode;
+ int ret;
- ret = file_write_and_wait_range(file, start, end);
+ if (c->opts.journal_flush_disabled)
+ return 0;
+
+ ret = bch2_inode_find_by_inum(c, inum, &inode);
if (ret)
return ret;
- if (datasync && !(inode->v.i_state & I_DIRTY_DATASYNC))
- goto out;
+ return bch2_journal_flush_seq(&c->journal, inode.bi_journal_seq);
+}
- ret = sync_inode_metadata(&inode->v, 1);
- if (ret)
- return ret;
-out:
- if (!c->opts.journal_flush_disabled)
- ret = bch2_journal_flush_seq(&c->journal,
- inode->ei_journal_seq);
- ret2 = file_check_and_advance_wb_err(file);
+int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+ struct bch_inode_info *inode = file_bch_inode(file);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ int ret, ret2, ret3;
- return ret ?: ret2;
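+ /* Flush dirty pages, then dirty inode metadata, then the journal: */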
+ ret = file_write_and_wait_range(file, start, end);
+ ret2 = sync_inode_metadata(&inode->v, 1);
+ ret3 = bch2_flush_inode(c, inode_inum(inode));
+
+ return ret ?: ret2 ?: ret3;
}
/* truncate: */
-static inline int range_has_data(struct bch_fs *c,
- struct bpos start,
- struct bpos end)
+static inline int range_has_data(struct bch_fs *c, u32 subvol,
+ struct bpos start,
+ struct bpos end)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
+retry:
+ bch2_trans_begin(&trans);
+
+ ret = bch2_subvolume_get_snapshot(&trans, subvol, &start.snapshot);
+ if (ret)
+ goto err;
- for_each_btree_key(&trans, iter, BTREE_ID_extents, start, 0, k, ret) {
+ for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, start, 0, k, ret) {
if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
break;
break;
}
}
- bch2_trans_iter_put(&trans, iter);
+ start = iter.pos;
+ bch2_trans_iter_exit(&trans, &iter);
+err:
+ if (ret == -EINTR)
+ goto retry;
- return bch2_trans_exit(&trans) ?: ret;
+ bch2_trans_exit(&trans);
+ return ret;
}
static int __bch2_truncate_page(struct bch_inode_info *inode,
unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1;
unsigned i;
struct page *page;
+ s64 i_sectors_delta = 0;
int ret = 0;
/* Page boundary? Nothing to do */
* XXX: we're doing two index lookups when we end up reading the
* page
*/
- ret = range_has_data(c,
- POS(inode->v.i_ino, index << PAGE_SECTOR_SHIFT),
- POS(inode->v.i_ino, (index + 1) << PAGE_SECTOR_SHIFT));
+ ret = range_has_data(c, inode->ei_subvol,
+ POS(inode->v.i_ino, index << PAGE_SECTORS_SHIFT),
+ POS(inode->v.i_ino, (index + 1) << PAGE_SECTORS_SHIFT));
if (ret <= 0)
return ret;
i < round_down(end_offset, block_bytes(c)) >> 9;
i++) {
s->s[i].nr_replicas = 0;
+ if (s->s[i].state == SECTOR_DIRTY)
+ i_sectors_delta--;
s->s[i].state = SECTOR_UNALLOCATED;
}
+ i_sectors_acct(c, inode, NULL, i_sectors_delta);
+
+ /*
+ * Caller needs to know whether this page will be written out by
+ * writeback - doing an i_size update if necessary - or whether it will
+ * be responsible for the i_size update:
+ */
+ ret = s->s[(min_t(u64, inode->v.i_size - (index << PAGE_SHIFT),
+ PAGE_SIZE) - 1) >> 9].state >= SECTOR_DIRTY;
+
zero_user_segment(page, start_offset, end_offset);
/*
* XXX: because we aren't currently tracking whether the page has actual
* data in it (vs. just 0s, or only partially written) this is wrong. ick.
*/
- ret = bch2_get_page_disk_reservation(c, inode, page, false);
- BUG_ON(ret);
+ BUG_ON(bch2_get_page_disk_reservation(c, inode, page, false));
/*
* This removes any writeable userspace mappings; we need to force
from, round_up(from, PAGE_SIZE));
}
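+/*
+ * Truncate the partial pages at either end of [start, end); returns nonzero
+ * if the page straddling i_size still has dirty (or reserved) sectors, i.e.
+ * writeback will be performing the i_size update:
+ */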
+static int bch2_truncate_pages(struct bch_inode_info *inode,
+ loff_t start, loff_t end)
+{
+ int ret = __bch2_truncate_page(inode, start >> PAGE_SHIFT,
+ start, end);
+
+ if (ret >= 0 &&
+ start >> PAGE_SHIFT != end >> PAGE_SHIFT)
+ ret = __bch2_truncate_page(inode,
+ end >> PAGE_SHIFT,
+ start, end);
+ return ret;
+}
+
static int bch2_extend(struct user_namespace *mnt_userns,
struct bch_inode_info *inode,
struct bch_inode_unpacked *inode_u,
inode_dio_wait(&inode->v);
bch2_pagecache_block_get(&inode->ei_pagecache_lock);
- ret = bch2_inode_find_by_inum(c, inode->v.i_ino, &inode_u);
+ ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u);
if (ret)
goto err;
iattr->ia_valid &= ~ATTR_SIZE;
ret = bch2_truncate_page(inode, iattr->ia_size);
- if (unlikely(ret))
+ if (unlikely(ret < 0))
goto err;
/*
truncate_setsize(&inode->v, iattr->ia_size);
- ret = bch2_fpunch(c, inode->v.i_ino,
+ ret = bch2_fpunch(c, inode_inum(inode),
round_up(iattr->ia_size, block_bytes(c)) >> 9,
- U64_MAX, &inode->ei_journal_seq, &i_sectors_delta);
+ U64_MAX, &i_sectors_delta);
i_sectors_acct(c, inode, NULL, i_sectors_delta);
+ WARN_ON(!inode->v.i_size && inode->v.i_blocks &&
+ !bch2_journal_error(&c->journal));
+
if (unlikely(ret))
goto err;
static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- u64 discard_start = round_up(offset, block_bytes(c)) >> 9;
- u64 discard_end = round_down(offset + len, block_bytes(c)) >> 9;
+ u64 end = offset + len;
+ u64 block_start = round_up(offset, block_bytes(c));
+ u64 block_end = round_down(end, block_bytes(c));
+ bool truncated_last_page;
int ret = 0;
- inode_lock(&inode->v);
- inode_dio_wait(&inode->v);
- bch2_pagecache_block_get(&inode->ei_pagecache_lock);
-
- ret = __bch2_truncate_page(inode,
- offset >> PAGE_SHIFT,
- offset, offset + len);
- if (unlikely(ret))
+ ret = bch2_truncate_pages(inode, offset, end);
+ if (unlikely(ret < 0))
goto err;
- if (offset >> PAGE_SHIFT !=
- (offset + len) >> PAGE_SHIFT) {
- ret = __bch2_truncate_page(inode,
- (offset + len) >> PAGE_SHIFT,
- offset, offset + len);
- if (unlikely(ret))
- goto err;
- }
+ truncated_last_page = ret;
- truncate_pagecache_range(&inode->v, offset, offset + len - 1);
+ truncate_pagecache_range(&inode->v, offset, end - 1);
- if (discard_start < discard_end) {
+ if (block_start < block_end) {
s64 i_sectors_delta = 0;
- ret = bch2_fpunch(c, inode->v.i_ino,
- discard_start, discard_end,
- &inode->ei_journal_seq,
+ ret = bch2_fpunch(c, inode_inum(inode),
+ block_start >> 9, block_end >> 9,
&i_sectors_delta);
i_sectors_acct(c, inode, NULL, i_sectors_delta);
}
mutex_lock(&inode->ei_update_lock);
- ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
- ATTR_MTIME|ATTR_CTIME) ?: ret;
+ if (end >= inode->v.i_size && !truncated_last_page) {
+ ret = bch2_write_inode_size(c, inode, inode->v.i_size,
+ ATTR_MTIME|ATTR_CTIME);
+ } else {
+ ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
+ ATTR_MTIME|ATTR_CTIME);
+ }
mutex_unlock(&inode->ei_update_lock);
err:
- bch2_pagecache_block_put(&inode->ei_pagecache_lock);
- inode_unlock(&inode->v);
-
return ret;
}
struct address_space *mapping = inode->v.i_mapping;
struct bkey_buf copy;
struct btree_trans trans;
- struct btree_iter *src, *dst, *del;
+ struct btree_iter src, dst, del;
loff_t shift, new_size;
u64 src_start;
int ret = 0;
if ((offset | len) & (block_bytes(c) - 1))
return -EINVAL;
- /*
- * We need i_mutex to keep the page cache consistent with the extents
- * btree, and the btree consistent with i_size - we don't need outside
- * locking for the extents btree itself, because we're using linked
- * iterators
- */
- inode_lock(&inode->v);
- inode_dio_wait(&inode->v);
- bch2_pagecache_block_get(&inode->ei_pagecache_lock);
-
if (insert) {
- ret = -EFBIG;
if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len)
- goto err;
+ return -EFBIG;
- ret = -EINVAL;
if (offset >= inode->v.i_size)
- goto err;
+ return -EINVAL;
src_start = U64_MAX;
shift = len;
} else {
- ret = -EINVAL;
if (offset + len >= inode->v.i_size)
- goto err;
+ return -EINVAL;
src_start = offset + len;
shift = -len;
ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX);
if (ret)
- goto err;
+ return ret;
if (insert) {
i_size_write(&inode->v, new_size);
} else {
s64 i_sectors_delta = 0;
- ret = bch2_fpunch(c, inode->v.i_ino,
+ ret = bch2_fpunch(c, inode_inum(inode),
offset >> 9, (offset + len) >> 9,
- &inode->ei_journal_seq,
&i_sectors_delta);
i_sectors_acct(c, inode, NULL, i_sectors_delta);
if (ret)
- goto err;
+ return ret;
}
bch2_bkey_buf_init(&copy);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
- src = bch2_trans_get_iter(&trans, BTREE_ID_extents,
+ bch2_trans_iter_init(&trans, &src, BTREE_ID_extents,
POS(inode->v.i_ino, src_start >> 9),
BTREE_ITER_INTENT);
- dst = bch2_trans_copy_iter(&trans, src);
- del = bch2_trans_copy_iter(&trans, src);
+ bch2_trans_copy_iter(&dst, &src);
+ bch2_trans_copy_iter(&del, &src);
while (ret == 0 || ret == -EINTR) {
struct disk_reservation disk_res =
struct bpos move_pos = POS(inode->v.i_ino, offset >> 9);
struct bpos atomic_end;
unsigned trigger_flags = 0;
+ u32 snapshot;
+
+ bch2_trans_begin(&trans);
+
+ ret = bch2_subvolume_get_snapshot(&trans,
+ inode->ei_subvol, &snapshot);
+ if (ret)
+ continue;
+
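+ /*
+ * The snapshot may change across a transaction restart, so re-point
+ * all three iterators at the subvolume's current snapshot each
+ * iteration:
+ */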
+ bch2_btree_iter_set_snapshot(&src, snapshot);
+ bch2_btree_iter_set_snapshot(&dst, snapshot);
+ bch2_btree_iter_set_snapshot(&del, snapshot);
- bch2_trans_begin(&trans);
k = insert
- ? bch2_btree_iter_peek_prev(src)
- : bch2_btree_iter_peek(src);
+ ? bch2_btree_iter_peek_prev(&src)
+ : bch2_btree_iter_peek(&src);
if ((ret = bkey_err(k)))
continue;
bch2_cut_front(move_pos, copy.k);
copy.k->k.p.offset += shift >> 9;
- bch2_btree_iter_set_pos(dst, bkey_start_pos(&copy.k->k));
+ bch2_btree_iter_set_pos(&dst, bkey_start_pos(&copy.k->k));
- ret = bch2_extent_atomic_end(dst, copy.k, &atomic_end);
+ ret = bch2_extent_atomic_end(&trans, &dst, copy.k, &atomic_end);
if (ret)
continue;
delete.k.p = copy.k->k.p;
delete.k.size = copy.k->k.size;
delete.k.p.offset -= shift >> 9;
- bch2_btree_iter_set_pos(del, bkey_start_pos(&delete.k));
+ bch2_btree_iter_set_pos(&del, bkey_start_pos(&delete.k));
next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p;
BUG_ON(ret);
}
- ret = bch2_btree_iter_traverse(del) ?:
- bch2_trans_update(&trans, del, &delete, trigger_flags) ?:
- bch2_trans_update(&trans, dst, copy.k, trigger_flags) ?:
- bch2_trans_commit(&trans, &disk_res,
- &inode->ei_journal_seq,
+ ret = bch2_btree_iter_traverse(&del) ?:
+ bch2_trans_update(&trans, &del, &delete, trigger_flags) ?:
+ bch2_trans_update(&trans, &dst, copy.k, trigger_flags) ?:
+ bch2_trans_commit(&trans, &disk_res, NULL,
BTREE_INSERT_NOFAIL);
bch2_disk_reservation_put(c, &disk_res);
if (!ret)
- bch2_btree_iter_set_pos(src, next_pos);
+ bch2_btree_iter_set_pos(&src, next_pos);
}
- bch2_trans_iter_put(&trans, del);
- bch2_trans_iter_put(&trans, dst);
- bch2_trans_iter_put(&trans, src);
+ bch2_trans_iter_exit(&trans, &del);
+ bch2_trans_iter_exit(&trans, &dst);
+ bch2_trans_iter_exit(&trans, &src);
bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&copy, c);
if (ret)
- goto err;
+ return ret;
+ mutex_lock(&inode->ei_update_lock);
if (!insert) {
i_size_write(&inode->v, new_size);
- mutex_lock(&inode->ei_update_lock);
ret = bch2_write_inode_size(c, inode, new_size,
ATTR_MTIME|ATTR_CTIME);
- mutex_unlock(&inode->ei_update_lock);
+ } else {
+ /* We need an inode update to update bi_journal_seq for fsync: */
+ ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
+ ATTR_MTIME|ATTR_CTIME);
}
-err:
- bch2_pagecache_block_put(&inode->ei_pagecache_lock);
- inode_unlock(&inode->v);
+ mutex_unlock(&inode->ei_update_lock);
return ret;
}
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bpos end_pos = POS(inode->v.i_ino, end_sector);
unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas;
int ret = 0;
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
POS(inode->v.i_ino, start_sector),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- while (!ret && bkey_cmp(iter->pos, end_pos) < 0) {
+ while (!ret && bkey_cmp(iter.pos, end_pos) < 0) {
s64 i_sectors_delta = 0;
struct disk_reservation disk_res = { 0 };
struct quota_res quota_res = { 0 };
struct bkey_i_reservation reservation;
struct bkey_s_c k;
unsigned sectors;
+ u32 snapshot;
bch2_trans_begin(&trans);
- k = bch2_btree_iter_peek_slot(iter);
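+ /* Re-resolve the subvolume's current snapshot after the transaction restart: */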
+ ret = bch2_subvolume_get_snapshot(&trans,
+ inode->ei_subvol, &snapshot);
+ if (ret)
+ goto bkey_err;
+
+ bch2_btree_iter_set_snapshot(&iter, snapshot);
+
+ k = bch2_btree_iter_peek_slot(&iter);
if ((ret = bkey_err(k)))
goto bkey_err;
/* already reserved */
if (k.k->type == KEY_TYPE_reservation &&
bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) {
- bch2_btree_iter_advance(iter);
+ bch2_btree_iter_advance(&iter);
continue;
}
if (bkey_extent_is_data(k.k) &&
!(mode & FALLOC_FL_ZERO_RANGE)) {
- bch2_btree_iter_advance(iter);
+ bch2_btree_iter_advance(&iter);
continue;
}
reservation.k.p = k.k->p;
reservation.k.size = k.k->size;
- bch2_cut_front(iter->pos, &reservation.k_i);
+ bch2_cut_front(iter.pos, &reservation.k_i);
bch2_cut_back(end_pos, &reservation.k_i);
sectors = reservation.k.size;
reservation.v.nr_replicas = disk_res.nr_replicas;
}
- ret = bch2_extent_update(&trans, iter, &reservation.k_i,
- &disk_res, &inode->ei_journal_seq,
+ ret = bch2_extent_update(&trans, inode_inum(inode), &iter,
+ &reservation.k_i,
+ &disk_res, NULL,
0, &i_sectors_delta, true);
+ if (ret)
+ goto bkey_err;
i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
bkey_err:
bch2_quota_reservation_put(c, inode, &quota_res);
if (ret == -EINTR)
ret = 0;
}
- bch2_trans_iter_put(&trans, iter);
+
+ bch2_trans_unlock(&trans); /* lock ordering, before taking pagecache locks: */
+ mark_pagecache_reserved(inode, start_sector, iter.pos.offset);
+
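+ /*
+ * If a ZERO_RANGE ran out of space partway through, punch out the
+ * remainder of the range so it still reads back as zeroes:
+ */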
+ if (ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE)) {
+ struct quota_res quota_res = { 0 };
+ s64 i_sectors_delta = 0;
+
+ bch2_fpunch_at(&trans, &iter, inode_inum(inode),
+ end_sector, &i_sectors_delta);
+ i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
+ bch2_quota_reservation_put(c, inode, &quota_res);
+ }
+
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
return ret;
}
static long bchfs_fallocate(struct bch_inode_info *inode, int mode,
loff_t offset, loff_t len)
{
- struct address_space *mapping = inode->v.i_mapping;
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- loff_t end = offset + len;
- loff_t block_start = round_down(offset, block_bytes(c));
- loff_t block_end = round_up(end, block_bytes(c));
- int ret;
-
- inode_lock(&inode->v);
- inode_dio_wait(&inode->v);
- bch2_pagecache_block_get(&inode->ei_pagecache_lock);
+ u64 end = offset + len;
+ u64 block_start = round_down(offset, block_bytes(c));
+ u64 block_end = round_up(end, block_bytes(c));
+ bool truncated_last_page = false;
+ int ret, ret2 = 0;
if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) {
ret = inode_newsize_ok(&inode->v, end);
if (ret)
- goto err;
+ return ret;
}
if (mode & FALLOC_FL_ZERO_RANGE) {
- ret = __bch2_truncate_page(inode,
- offset >> PAGE_SHIFT,
- offset, end);
-
- if (!ret &&
- offset >> PAGE_SHIFT != end >> PAGE_SHIFT)
- ret = __bch2_truncate_page(inode,
- end >> PAGE_SHIFT,
- offset, end);
+ ret = bch2_truncate_pages(inode, offset, end);
+ if (unlikely(ret < 0))
+ return ret;
- if (unlikely(ret))
- goto err;
+ truncated_last_page = ret;
truncate_pagecache_range(&inode->v, offset, end - 1);
+
+ block_start = round_up(offset, block_bytes(c));
+ block_end = round_down(end, block_bytes(c));
}
ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9);
- if (ret)
- goto err;
/*
- * Do we need to extend the file?
- *
- * If we zeroed up to the end of the file, we dropped whatever writes
- * were going to write out the current i_size, so we have to extend
- * manually even if FL_KEEP_SIZE was set:
+ * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update,
+ * so that the VFS cache i_size is consistent with the btree i_size:
*/
- if (end >= inode->v.i_size &&
- (!(mode & FALLOC_FL_KEEP_SIZE) ||
- (mode & FALLOC_FL_ZERO_RANGE))) {
+ if (ret &&
+ !(ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE)))
+ return ret;
- /*
- * Sync existing appends before extending i_size,
- * as in bch2_extend():
- */
- ret = filemap_write_and_wait_range(mapping,
- inode->ei_inode.bi_size, S64_MAX);
- if (ret)
- goto err;
+ if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size)
+ end = inode->v.i_size;
- if (mode & FALLOC_FL_KEEP_SIZE)
- end = inode->v.i_size;
- else
- i_size_write(&inode->v, end);
+ if (end >= inode->v.i_size &&
+ (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) ||
+ !(mode & FALLOC_FL_KEEP_SIZE))) {
+ spin_lock(&inode->v.i_lock);
+ i_size_write(&inode->v, end);
+ spin_unlock(&inode->v.i_lock);
mutex_lock(&inode->ei_update_lock);
- ret = bch2_write_inode_size(c, inode, end, 0);
+ ret2 = bch2_write_inode_size(c, inode, end, 0);
mutex_unlock(&inode->ei_update_lock);
}
-err:
- bch2_pagecache_block_put(&inode->ei_pagecache_lock);
- inode_unlock(&inode->v);
- return ret;
+
+ return ret ?: ret2;
}
long bch2_fallocate_dispatch(struct file *file, int mode,
if (!percpu_ref_tryget(&c->writes))
return -EROFS;
+ inode_lock(&inode->v);
+ inode_dio_wait(&inode->v);
+ bch2_pagecache_block_get(&inode->ei_pagecache_lock);
+
if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE)))
ret = bchfs_fallocate(inode, mode, offset, len);
else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
else
ret = -EOPNOTSUPP;
+
+ bch2_pagecache_block_put(&inode->ei_pagecache_lock);
+ inode_unlock(&inode->v);
percpu_ref_put(&c->writes);
return ret;
}
-static void mark_range_unallocated(struct bch_inode_info *inode,
- loff_t start, loff_t end)
-{
- pgoff_t index = start >> PAGE_SHIFT;
- pgoff_t end_index = (end - 1) >> PAGE_SHIFT;
- struct pagevec pvec;
-
- pagevec_init(&pvec);
-
- do {
- unsigned nr_pages, i, j;
-
- nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping,
- &index, end_index);
- if (nr_pages == 0)
- break;
-
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
- struct bch_page_state *s;
-
- lock_page(page);
- s = bch2_page_state(page);
-
- if (s) {
- spin_lock(&s->lock);
- for (j = 0; j < PAGE_SECTORS; j++)
- s->s[j].nr_replicas = 0;
- spin_unlock(&s->lock);
- }
-
- unlock_page(page);
- }
- pagevec_release(&pvec);
- } while (index <= end_index);
-}
-
loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
struct file *file_dst, loff_t pos_dst,
loff_t len, unsigned remap_flags)
if (ret)
goto err;
- mark_range_unallocated(src, pos_src, pos_src + aligned_len);
+ mark_pagecache_unallocated(src, pos_src >> 9,
+ (pos_src + aligned_len) >> 9);
ret = bch2_remap_range(c,
- POS(dst->v.i_ino, pos_dst >> 9),
- POS(src->v.i_ino, pos_src >> 9),
+ inode_inum(dst), pos_dst >> 9,
+ inode_inum(src), pos_src >> 9,
aligned_len >> 9,
- &dst->ei_journal_seq,
pos_dst + len, &i_sectors_delta);
if (ret < 0)
goto err;
i_size_write(&dst->v, pos_dst + ret);
spin_unlock(&dst->v.i_lock);
- if (((file_dst->f_flags & (__O_SYNC | O_DSYNC)) ||
- IS_SYNC(file_inode(file_dst))) &&
- !c->opts.journal_flush_disabled)
- ret = bch2_journal_flush_seq(&c->journal, dst->ei_journal_seq);
+ if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) ||
+ IS_SYNC(file_inode(file_dst)))
+ ret = bch2_flush_inode(c, inode_inum(dst));
err:
bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
+ subvol_inum inum = inode_inum(inode);
u64 isize, next_data = MAX_LFS_FILESIZE;
+ u32 snapshot;
int ret;
isize = i_size_read(&inode->v);
return -ENXIO;
bch2_trans_init(&trans, c, 0, 0);
+retry:
+ bch2_trans_begin(&trans);
+
+ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
- for_each_btree_key(&trans, iter, BTREE_ID_extents,
- POS(inode->v.i_ino, offset >> 9), 0, k, ret) {
+ for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
+ SPOS(inode->v.i_ino, offset >> 9, snapshot), 0, k, ret) {
if (k.k->p.inode != inode->v.i_ino) {
break;
} else if (bkey_extent_is_data(k.k)) {
} else if (k.k->p.offset >> 9 > isize)
break;
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
+err:
+ if (ret == -EINTR)
+ goto retry;
- ret = bch2_trans_exit(&trans) ?: ret;
+ bch2_trans_exit(&trans);
if (ret)
return ret;
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
+ subvol_inum inum = inode_inum(inode);
u64 isize, next_hole = MAX_LFS_FILESIZE;
+ u32 snapshot;
int ret;
isize = i_size_read(&inode->v);
return -ENXIO;
bch2_trans_init(&trans, c, 0, 0);
+retry:
+ bch2_trans_begin(&trans);
+
+ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
- for_each_btree_key(&trans, iter, BTREE_ID_extents,
- POS(inode->v.i_ino, offset >> 9),
+ for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
+ SPOS(inode->v.i_ino, offset >> 9, snapshot),
BTREE_ITER_SLOTS, k, ret) {
if (k.k->p.inode != inode->v.i_ino) {
next_hole = bch2_seek_pagecache_hole(&inode->v,
offset = max(offset, bkey_start_offset(k.k) << 9);
}
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
+err:
+ if (ret == -EINTR)
+ goto retry;
- ret = bch2_trans_exit(&trans) ?: ret;
+ bch2_trans_exit(&trans);
if (ret)
return ret;
#include "quota.h"
#include <linux/compat.h>
+#include <linux/fsnotify.h>
#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/security.h>
+#include <linux/writeback.h>
#define FS_IOC_GOINGDOWN _IOR('X', 125, __u32)
#define FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */
char *kname = NULL;
struct qstr qstr;
int ret = 0;
- u64 inum;
+ subvol_inum inum;
kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL);
if (!kname)
qstr.len = ret;
qstr.name = kname;
- ret = -ENOENT;
- inum = bch2_dirent_lookup(c, src->v.i_ino, &hash,
- &qstr);
- if (!inum)
+ ret = bch2_dirent_lookup(c, inode_inum(src), &hash, &qstr, &inum);
+ if (ret)
goto err1;
vinode = bch2_vfs_inode_get(c, inum);
return ret;
}
+static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
+ struct bch_ioctl_subvolume arg)
+{
+ struct inode *dir;
+ struct bch_inode_info *inode;
+ struct user_namespace *s_user_ns;
+ struct dentry *dst_dentry;
+ struct path src_path, dst_path;
+ int how = LOOKUP_FOLLOW;
+ int error;
+ subvol_inum snapshot_src = { 0 };
+ unsigned lookup_flags = 0;
+ unsigned create_flags = BCH_CREATE_SUBVOL;
+
+ if (arg.flags & ~(BCH_SUBVOL_SNAPSHOT_CREATE|
+ BCH_SUBVOL_SNAPSHOT_RO))
+ return -EINVAL;
+
+ if (!(arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) &&
+ (arg.src_ptr ||
+ (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)))
+ return -EINVAL;
+
+ if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE)
+ create_flags |= BCH_CREATE_SNAPSHOT;
+
+ if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)
+ create_flags |= BCH_CREATE_SNAPSHOT_RO;
+
+ /* why do we need this lock? */
+ down_read(&c->vfs_sb->s_umount);
+
+ if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE)
+ sync_inodes_sb(c->vfs_sb);
+retry:
+ if (arg.src_ptr) {
+ error = user_path_at(arg.dirfd,
+ (const char __user *)(unsigned long)arg.src_ptr,
+ how, &src_path);
+ if (error)
+ goto err1;
+
+ if (src_path.dentry->d_sb->s_fs_info != c) {
+ path_put(&src_path);
+ error = -EXDEV;
+ goto err1;
+ }
+
+ snapshot_src = inode_inum(to_bch_ei(src_path.dentry->d_inode));
+ }
+
+ dst_dentry = user_path_create(arg.dirfd,
+ (const char __user *)(unsigned long)arg.dst_ptr,
+ &dst_path, lookup_flags);
+ error = PTR_ERR_OR_ZERO(dst_dentry);
+ if (error)
+ goto err2;
+
+ if (dst_dentry->d_sb->s_fs_info != c) {
+ error = -EXDEV;
+ goto err3;
+ }
+
+ if (dst_dentry->d_inode) {
+ error = -EEXIST;
+ goto err3;
+ }
+
+ dir = dst_path.dentry->d_inode;
+ if (IS_DEADDIR(dir)) {
+ error = -ENOENT;
+ goto err3;
+ }
+
+ s_user_ns = dir->i_sb->s_user_ns;
+ if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
+ !kgid_has_mapping(s_user_ns, current_fsgid())) {
+ error = -EOVERFLOW;
+ goto err3;
+ }
+
+ error = inode_permission(file_mnt_user_ns(filp),
+ dir, MAY_WRITE | MAY_EXEC);
+ if (error)
+ goto err3;
+
+ if (!IS_POSIXACL(dir))
+ arg.mode &= ~current_umask();
+
+ error = security_path_mkdir(&dst_path, dst_dentry, arg.mode);
+ if (error)
+ goto err3;
+
+ if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) &&
+ !arg.src_ptr)
+ snapshot_src.subvol = to_bch_ei(dir)->ei_inode.bi_subvol;
+
+ inode = __bch2_create(file_mnt_user_ns(filp), to_bch_ei(dir),
+ dst_dentry, arg.mode|S_IFDIR,
+ 0, snapshot_src, create_flags);
+ error = PTR_ERR_OR_ZERO(inode);
+ if (error)
+ goto err3;
+
+ d_instantiate(dst_dentry, &inode->v);
+ fsnotify_mkdir(dir, dst_dentry);
+err3:
+ done_path_create(&dst_path, dst_dentry);
+err2:
+ if (arg.src_ptr)
+ path_put(&src_path);
+
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
+err1:
+ up_read(&c->vfs_sb->s_umount);
+
+ return error;
+}
+
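For reference, a minimal userspace sketch of driving the new ioctl (assumptions: the bch_ioctl_subvolume layout and flag names come from the bcachefs ioctl header, guessed here as libbcachefs/bcachefs_ioctl.h; fs_fd is any open fd on the target filesystem; error handling elided):

#include <fcntl.h>		/* AT_FDCWD */
#include <sys/ioctl.h>
#include "libbcachefs/bcachefs_ioctl.h"	/* assumed header location */

/*
 * Create a read-only snapshot of @src at @dst.  With flags == 0 and
 * src_ptr == 0 this would create a new empty subvolume instead; with
 * BCH_SUBVOL_SNAPSHOT_CREATE and a NULL @src, the handler above
 * snapshots the subvolume containing @dst's parent directory.
 */
static int snapshot_create(int fs_fd, const char *src, const char *dst)
{
	struct bch_ioctl_subvolume i = {
		.flags	 = BCH_SUBVOL_SNAPSHOT_CREATE|BCH_SUBVOL_SNAPSHOT_RO,
		.dirfd	 = AT_FDCWD,
		.mode	 = 0755,
		.src_ptr = (unsigned long) src,
		.dst_ptr = (unsigned long) dst,
	};

	return ioctl(fs_fd, BCH_IOCTL_SUBVOLUME_CREATE, &i);
}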
+static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
+ struct bch_ioctl_subvolume arg)
+{
+ struct path path;
+ struct inode *dir;
+ int ret = 0;
+
+ if (arg.flags)
+ return -EINVAL;
+
+ ret = user_path_at(arg.dirfd,
+ (const char __user *)(unsigned long)arg.dst_ptr,
+ LOOKUP_FOLLOW, &path);
+ if (ret)
+ return ret;
+
+ if (path.dentry->d_sb->s_fs_info != c) {
+ path_put(&path);
+ return -EXDEV;
+ }
+
+ dir = path.dentry->d_parent->d_inode;
+
+ ret = __bch2_unlink(dir, path.dentry, true);
+ if (!ret) {
+ fsnotify_rmdir(dir, path.dentry);
+ d_delete(path.dentry);
+ }
+ path_put(&path);
+
+ return ret;
+}
+
long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
struct bch_inode_info *inode = file_bch_inode(file);
case FS_IOC_GOINGDOWN:
return bch2_ioc_goingdown(c, (u32 __user *) arg);
+ case BCH_IOCTL_SUBVOLUME_CREATE: {
+ struct bch_ioctl_subvolume i;
+
+ if (copy_from_user(&i, (void __user *) arg, sizeof(i)))
+ return -EFAULT;
+ return bch2_ioctl_subvolume_create(c, file, i);
+ }
+
+ case BCH_IOCTL_SUBVOLUME_DESTROY: {
+ struct bch_ioctl_subvolume i;
+
+ if (copy_from_user(&i, (void __user *) arg, sizeof(i)))
+ return -EFAULT;
+ return bch2_ioctl_subvolume_destroy(c, file, i);
+ }
+
default:
return bch2_fs_ioctl(c, cmd, (void __user *) arg);
}
static struct kmem_cache *bch2_inode_cache;
-static void bch2_vfs_inode_init(struct bch_fs *,
+static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
struct bch_inode_info *,
- struct bch_inode_unpacked *);
-
-static void journal_seq_copy(struct bch_fs *c,
- struct bch_inode_info *dst,
- u64 journal_seq)
-{
- /*
- * atomic64_cmpxchg has a fallback for archs that don't support it,
- * cmpxchg does not:
- */
- atomic64_t *dst_seq = (void *) &dst->ei_journal_seq;
- u64 old, v = READ_ONCE(dst->ei_journal_seq);
-
- do {
- old = v;
-
- if (old >= journal_seq)
- break;
- } while ((v = atomic64_cmpxchg(dst_seq, old, journal_seq)) != old);
-
- bch2_journal_set_has_inum(&c->journal, dst->v.i_ino, journal_seq);
-}
+ struct bch_inode_unpacked *,
+ struct bch_subvolume *);
static void __pagecache_lock_put(struct pagecache_lock *lock, long i)
{
__pagecache_lock_get(lock, -1);
}
-void bch2_inode_update_after_write(struct bch_fs *c,
+void bch2_inode_update_after_write(struct btree_trans *trans,
struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
unsigned fields)
{
+ struct bch_fs *c = trans->c;
+
+ BUG_ON(bi->bi_inum != inode->v.i_ino);
+
+ bch2_assert_pos_locked(trans, BTREE_ID_inodes,
+ POS(0, bi->bi_inum),
+ c->opts.inodes_use_key_cache);
+
set_nlink(&inode->v, bch2_inode_nlink_get(bi));
i_uid_write(&inode->v, bi->bi_uid);
i_gid_write(&inode->v, bi->bi_gid);
void *p, unsigned fields)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter = { NULL };
struct bch_inode_unpacked inode_u;
int ret;
retry:
bch2_trans_begin(&trans);
- iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino,
- BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(iter) ?:
+ ret = bch2_inode_peek(&trans, &iter, &inode_u, inode_inum(inode),
+ BTREE_ITER_INTENT) ?:
(set ? set(inode, &inode_u, p) : 0) ?:
- bch2_inode_write(&trans, iter, &inode_u) ?:
- bch2_trans_commit(&trans, NULL,
- &inode->ei_journal_seq,
- BTREE_INSERT_NOFAIL);
+ bch2_inode_write(&trans, &iter, &inode_u) ?:
+ bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL);
/*
* the btree node lock protects inode->ei_inode, not ei_update_lock;
* this is important for inode updates via bchfs_write_index_update
*/
if (!ret)
- bch2_inode_update_after_write(c, inode, &inode_u, fields);
+ bch2_inode_update_after_write(&trans, inode, &inode_u, fields);
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
if (ret == -EINTR)
goto retry;
return ret;
}
-struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum)
+static int bch2_iget5_test(struct inode *vinode, void *p)
+{
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+ subvol_inum *inum = p;
+
+ return inode->ei_subvol == inum->subvol &&
+ inode->ei_inode.bi_inum == inum->inum;
+}
+
+static int bch2_iget5_set(struct inode *vinode, void *p)
+{
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+ subvol_inum *inum = p;
+
+ inode->v.i_ino = inum->inum;
+ inode->ei_subvol = inum->subvol;
+ inode->ei_inode.bi_inum = inum->inum;
+ return 0;
+}
+
+static unsigned bch2_inode_hash(subvol_inum inum)
+{
+ return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
+}
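Note: with subvolumes, an inode number alone no longer identifies a VFS inode, so the cache moves from iget_locked() to iget5_locked() keyed on the (subvol, inum) pair; bch2_inode_hash() folds the subvolume ID and both 32-bit halves of the inode number into the cache hash with jhash_3words(), and the iget5 test/set callbacks above compare and initialize the full pair.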
+
+struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
{
struct bch_inode_unpacked inode_u;
struct bch_inode_info *inode;
+ struct btree_trans trans;
+ struct bch_subvolume subvol;
int ret;
- inode = to_bch_ei(iget_locked(c->vfs_sb, inum));
+ inode = to_bch_ei(iget5_locked(c->vfs_sb,
+ bch2_inode_hash(inum),
+ bch2_iget5_test,
+ bch2_iget5_set,
+ &inum));
if (unlikely(!inode))
return ERR_PTR(-ENOMEM);
if (!(inode->v.i_state & I_NEW))
return &inode->v;
- ret = bch2_inode_find_by_inum(c, inum, &inode_u);
+ bch2_trans_init(&trans, c, 8, 0);
+ ret = lockrestart_do(&trans,
+ bch2_subvolume_get(&trans, inum.subvol, true, 0, &subvol) ?:
+ bch2_inode_find_by_inum_trans(&trans, inum, &inode_u));
+
+ if (!ret)
+ bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol);
+ bch2_trans_exit(&trans);
+
if (ret) {
iget_failed(&inode->v);
return ERR_PTR(ret);
}
- bch2_vfs_inode_init(c, inode, &inode_u);
-
- inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum);
-
unlock_new_inode(&inode->v);
return &inode->v;
}
-static int inum_test(struct inode *inode, void *p)
-{
- unsigned long *ino = p;
-
- return *ino == inode->i_ino;
-}
-
-static struct bch_inode_info *
+struct bch_inode_info *
__bch2_create(struct user_namespace *mnt_userns,
struct bch_inode_info *dir, struct dentry *dentry,
- umode_t mode, dev_t rdev, bool tmpfile)
+ umode_t mode, dev_t rdev, subvol_inum snapshot_src,
+ unsigned flags)
{
struct bch_fs *c = dir->v.i_sb->s_fs_info;
struct btree_trans trans;
struct bch_inode_info *inode, *old;
struct bch_inode_unpacked inode_u;
struct posix_acl *default_acl = NULL, *acl = NULL;
+ subvol_inum inum;
+ struct bch_subvolume subvol;
u64 journal_seq = 0;
int ret;
bch2_inode_init_early(c, &inode_u);
- if (!tmpfile)
+ if (!(flags & BCH_CREATE_TMPFILE))
mutex_lock(&dir->ei_update_lock);
bch2_trans_init(&trans, c, 8,
- 2048 + (!tmpfile ? dentry->d_name.len : 0));
+ 2048 + (!(flags & BCH_CREATE_TMPFILE)
+ ? dentry->d_name.len : 0));
retry:
bch2_trans_begin(&trans);
- ret = bch2_create_trans(&trans, dir->v.i_ino, &dir_u, &inode_u,
- !tmpfile ? &dentry->d_name : NULL,
+ ret = bch2_create_trans(&trans,
+ inode_inum(dir), &dir_u, &inode_u,
+ !(flags & BCH_CREATE_TMPFILE)
+ ? &dentry->d_name : NULL,
from_kuid(mnt_userns, current_fsuid()),
from_kgid(mnt_userns, current_fsgid()),
mode, rdev,
- default_acl, acl) ?:
+ default_acl, acl, snapshot_src, flags) ?:
bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
KEY_TYPE_QUOTA_PREALLOC);
if (unlikely(ret))
goto err_before_quota;
- ret = bch2_trans_commit(&trans, NULL, &journal_seq, 0);
+ inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
+ inum.inum = inode_u.bi_inum;
+
+ ret = bch2_subvolume_get(&trans, inum.subvol, true,
+ BTREE_ITER_WITH_UPDATES, &subvol) ?:
+ bch2_trans_commit(&trans, NULL, &journal_seq, 0);
if (unlikely(ret)) {
bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
KEY_TYPE_QUOTA_WARN);
goto err_trans;
}
- if (!tmpfile) {
- bch2_inode_update_after_write(c, dir, &dir_u,
+ if (!(flags & BCH_CREATE_TMPFILE)) {
+ bch2_inode_update_after_write(&trans, dir, &dir_u,
ATTR_MTIME|ATTR_CTIME);
- journal_seq_copy(c, dir, journal_seq);
mutex_unlock(&dir->ei_update_lock);
}
- bch2_vfs_inode_init(c, inode, &inode_u);
- journal_seq_copy(c, inode, journal_seq);
+ bch2_iget5_set(&inode->v, &inum);
+ bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol);
set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
*/
inode->v.i_state |= I_CREATING;
- old = to_bch_ei(inode_insert5(&inode->v, inode->v.i_ino,
- inum_test, NULL, &inode->v.i_ino));
+
+ old = to_bch_ei(inode_insert5(&inode->v,
+ bch2_inode_hash(inum),
+ bch2_iget5_test,
+ bch2_iget5_set,
+ &inum));
BUG_ON(!old);
if (unlikely(old != inode)) {
* We raced, another process pulled the new inode into cache
* before us:
*/
- journal_seq_copy(c, old, journal_seq);
make_bad_inode(&inode->v);
iput(&inode->v);
posix_acl_release(acl);
return inode;
err_trans:
- if (!tmpfile)
+ if (!(flags & BCH_CREATE_TMPFILE))
mutex_unlock(&dir->ei_update_lock);
bch2_trans_exit(&trans);
struct bch_inode_info *dir = to_bch_ei(vdir);
struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
struct inode *vinode = NULL;
- u64 inum;
+ subvol_inum inum = { .subvol = 1 };
+ int ret;
- inum = bch2_dirent_lookup(c, dir->v.i_ino, &hash,
- &dentry->d_name);
+ ret = bch2_dirent_lookup(c, inode_inum(dir), &hash,
+ &dentry->d_name, &inum);
- if (inum)
+ if (!ret)
vinode = bch2_vfs_inode_get(c, inum);
return d_splice_alias(vinode, dentry);
umode_t mode, dev_t rdev)
{
struct bch_inode_info *inode =
- __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, rdev, false);
+ __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, rdev,
+ (subvol_inum) { 0 }, 0);
if (IS_ERR(inode))
return PTR_ERR(inode);
mutex_lock(&inode->ei_update_lock);
bch2_trans_init(&trans, c, 4, 1024);
- ret = __bch2_trans_do(&trans, NULL, &inode->ei_journal_seq, 0,
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
bch2_link_trans(&trans,
- dir->v.i_ino,
- inode->v.i_ino, &dir_u, &inode_u,
+ inode_inum(dir), &dir_u,
+ inode_inum(inode), &inode_u,
&dentry->d_name));
if (likely(!ret)) {
- BUG_ON(inode_u.bi_inum != inode->v.i_ino);
-
- journal_seq_copy(c, inode, dir->ei_journal_seq);
- bch2_inode_update_after_write(c, dir, &dir_u,
+ bch2_inode_update_after_write(&trans, dir, &dir_u,
ATTR_MTIME|ATTR_CTIME);
- bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME);
+ bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME);
}
bch2_trans_exit(&trans);
return 0;
}
-static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
+int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
+ bool deleting_snapshot)
{
struct bch_fs *c = vdir->i_sb->s_fs_info;
struct bch_inode_info *dir = to_bch_ei(vdir);
bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
bch2_trans_init(&trans, c, 4, 1024);
- ret = __bch2_trans_do(&trans, NULL, &dir->ei_journal_seq,
+ ret = __bch2_trans_do(&trans, NULL, NULL,
BTREE_INSERT_NOFAIL,
bch2_unlink_trans(&trans,
- dir->v.i_ino, &dir_u,
- &inode_u, &dentry->d_name));
+ inode_inum(dir), &dir_u,
+ &inode_u, &dentry->d_name,
+ deleting_snapshot));
if (likely(!ret)) {
- BUG_ON(inode_u.bi_inum != inode->v.i_ino);
-
- journal_seq_copy(c, inode, dir->ei_journal_seq);
- bch2_inode_update_after_write(c, dir, &dir_u,
+ bch2_inode_update_after_write(&trans, dir, &dir_u,
ATTR_MTIME|ATTR_CTIME);
- bch2_inode_update_after_write(c, inode, &inode_u,
+ bch2_inode_update_after_write(&trans, inode, &inode_u,
ATTR_MTIME);
}
return ret;
}
+static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
+{
+ return __bch2_unlink(vdir, dentry, false);
+}
+
static int bch2_symlink(struct user_namespace *mnt_userns,
struct inode *vdir, struct dentry *dentry,
const char *symname)
struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
int ret;
- inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0, true);
+ inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
+ (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
if (unlikely(IS_ERR(inode)))
return PTR_ERR(inode);
if (unlikely(ret))
goto err;
- journal_seq_copy(c, dir, inode->ei_journal_seq);
-
ret = __bch2_link(c, inode, dir, dentry);
if (unlikely(ret))
goto err;
? BCH_RENAME_EXCHANGE
: dst_dentry->d_inode
? BCH_RENAME_OVERWRITE : BCH_RENAME;
- u64 journal_seq = 0;
int ret;
if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
goto err;
}
- ret = __bch2_trans_do(&trans, NULL, &journal_seq, 0,
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
bch2_rename_trans(&trans,
- src_dir->v.i_ino, &src_dir_u,
- dst_dir->v.i_ino, &dst_dir_u,
+ inode_inum(src_dir), &src_dir_u,
+ inode_inum(dst_dir), &dst_dir_u,
&src_inode_u,
&dst_inode_u,
&src_dentry->d_name,
BUG_ON(dst_inode &&
dst_inode->v.i_ino != dst_inode_u.bi_inum);
- bch2_inode_update_after_write(c, src_dir, &src_dir_u,
+ bch2_inode_update_after_write(&trans, src_dir, &src_dir_u,
ATTR_MTIME|ATTR_CTIME);
- journal_seq_copy(c, src_dir, journal_seq);
- if (src_dir != dst_dir) {
- bch2_inode_update_after_write(c, dst_dir, &dst_dir_u,
+ if (src_dir != dst_dir)
+ bch2_inode_update_after_write(&trans, dst_dir, &dst_dir_u,
ATTR_MTIME|ATTR_CTIME);
- journal_seq_copy(c, dst_dir, journal_seq);
- }
- bch2_inode_update_after_write(c, src_inode, &src_inode_u,
+ bch2_inode_update_after_write(&trans, src_inode, &src_inode_u,
ATTR_CTIME);
- journal_seq_copy(c, src_inode, journal_seq);
- if (dst_inode) {
- bch2_inode_update_after_write(c, dst_inode, &dst_inode_u,
+ if (dst_inode)
+ bch2_inode_update_after_write(&trans, dst_inode, &dst_inode_u,
ATTR_CTIME);
- journal_seq_copy(c, dst_inode, journal_seq);
- }
err:
bch2_trans_exit(&trans);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_qid qid;
struct btree_trans trans;
- struct btree_iter *inode_iter;
+ struct btree_iter inode_iter = { NULL };
struct bch_inode_unpacked inode_u;
struct posix_acl *acl = NULL;
int ret;
kfree(acl);
acl = NULL;
- inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino,
- BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(inode_iter);
+ ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode),
+ BTREE_ITER_INTENT);
if (ret)
goto btree_err;
bch2_setattr_copy(mnt_userns, inode, &inode_u, attr);
if (attr->ia_valid & ATTR_MODE) {
- ret = bch2_acl_chmod(&trans, &inode_u, inode_u.bi_mode, &acl);
+ ret = bch2_acl_chmod(&trans, inode_inum(inode), &inode_u,
+ inode_u.bi_mode, &acl);
if (ret)
goto btree_err;
}
- ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?:
- bch2_trans_commit(&trans, NULL,
- &inode->ei_journal_seq,
+ ret = bch2_inode_write(&trans, &inode_iter, &inode_u) ?:
+ bch2_trans_commit(&trans, NULL, NULL,
BTREE_INSERT_NOFAIL);
btree_err:
- bch2_trans_iter_put(&trans, inode_iter);
+ bch2_trans_iter_exit(&trans, &inode_iter);
if (ret == -EINTR)
goto retry;
if (unlikely(ret))
goto err_trans;
- bch2_inode_update_after_write(c, inode, &inode_u, attr->ia_valid);
+ bch2_inode_update_after_write(&trans, inode, &inode_u, attr->ia_valid);
if (acl)
set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
struct inode *vdir, struct dentry *dentry, umode_t mode)
{
struct bch_inode_info *inode =
- __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, 0, true);
+ __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, 0,
+ (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
if (IS_ERR(inode))
return PTR_ERR(inode);
else
offset += p.crc.offset;
- if ((offset & (c->opts.block_size - 1)) ||
- (k.k->size & (c->opts.block_size - 1)))
+ if ((offset & (block_sectors(c) - 1)) ||
+ (k.k->size & (block_sectors(c) - 1)))
flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
ret = fiemap_fill_next_extent(info,
struct bch_fs *c = vinode->i_sb->s_fs_info;
struct bch_inode_info *ei = to_bch_ei(vinode);
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
struct bkey_buf cur, prev;
struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
unsigned offset_into_extent, sectors;
bool have_extent = false;
+ u32 snapshot;
int ret = 0;
ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
if (start + len < start)
return -EINVAL;
+ start >>= 9;
+
bch2_bkey_buf_init(&cur);
bch2_bkey_buf_init(&prev);
bch2_trans_init(&trans, c, 0, 0);
-
- iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
- POS(ei->v.i_ino, start >> 9), 0);
retry:
bch2_trans_begin(&trans);
- while ((k = bch2_btree_iter_peek(iter)).k &&
+ ret = bch2_subvolume_get_snapshot(&trans, ei->ei_subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+ SPOS(ei->v.i_ino, start, snapshot), 0);
+
+ while ((k = bch2_btree_iter_peek(&iter)).k &&
!(ret = bkey_err(k)) &&
- bkey_cmp(iter->pos, end) < 0) {
+ bkey_cmp(iter.pos, end) < 0) {
enum btree_id data_btree = BTREE_ID_extents;
if (!bkey_extent_is_data(k.k) &&
k.k->type != KEY_TYPE_reservation) {
- bch2_btree_iter_advance(iter);
+ bch2_btree_iter_advance(&iter);
continue;
}
- offset_into_extent = iter->pos.offset -
+ offset_into_extent = iter.pos.offset -
bkey_start_offset(k.k);
sectors = k.k->size - offset_into_extent;
offset_into_extent),
cur.k);
bch2_key_resize(&cur.k->k, sectors);
- cur.k->k.p = iter->pos;
+ cur.k->k.p = iter.pos;
cur.k->k.p.offset += cur.k->k.size;
if (have_extent) {
bkey_copy(prev.k, cur.k);
have_extent = true;
- bch2_btree_iter_set_pos(iter,
- POS(iter->pos.inode, iter->pos.offset + sectors));
+ bch2_btree_iter_set_pos(&iter,
+ POS(iter.pos.inode, iter.pos.offset + sectors));
}
-
+ start = iter.pos.offset;
+ bch2_trans_iter_exit(&trans, &iter);
+err:
if (ret == -EINTR)
goto retry;
ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
FIEMAP_EXTENT_LAST);
- bch2_trans_iter_put(&trans, iter);
- ret = bch2_trans_exit(&trans) ?: ret;
+ bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&cur, c);
bch2_bkey_buf_exit(&prev, c);
return ret < 0 ? ret : 0;
if (!dir_emit_dots(file, ctx))
return 0;
- return bch2_readdir(c, inode->v.i_ino, ctx);
+ return bch2_readdir(c, inode_inum(inode), ctx);
}
static const struct file_operations bch_file_operations = {
.error_remove_page = generic_error_remove_page,
};
-static struct inode *bch2_nfs_get_inode(struct super_block *sb,
- u64 ino, u32 generation)
+struct bcachefs_fid {
+ u64 inum;
+ u32 subvol;
+ u32 gen;
+} __packed;
+
+struct bcachefs_fid_with_parent {
+ struct bcachefs_fid fid;
+ struct bcachefs_fid dir;
+} __packed;
+
+static int bcachefs_fid_valid(int fh_len, int fh_type)
{
- struct bch_fs *c = sb->s_fs_info;
- struct inode *vinode;
+ switch (fh_type) {
+ case FILEID_BCACHEFS_WITHOUT_PARENT:
+ return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32);
+ case FILEID_BCACHEFS_WITH_PARENT:
+ return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32);
+ default:
+ return false;
+ }
+}
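As a quick sanity check on the handle layout (a standalone sketch; the typedefs stand in for the kernel's fixed-width types): fh_len is counted in 32-bit words, so the two lengths accepted above work out to 4 and 8:

#include <stdint.h>

typedef uint32_t u32;
typedef uint64_t u64;

struct bcachefs_fid {
	u64 inum;
	u32 subvol;
	u32 gen;
} __attribute__((packed));		/* 16 bytes */

struct bcachefs_fid_with_parent {
	struct bcachefs_fid fid;
	struct bcachefs_fid dir;
} __attribute__((packed));		/* 32 bytes */

/* fh_len is in 32-bit words: */
_Static_assert(sizeof(struct bcachefs_fid) / sizeof(u32) == 4, "");
_Static_assert(sizeof(struct bcachefs_fid_with_parent) / sizeof(u32) == 8, "");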
+
+static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
+{
+ return (struct bcachefs_fid) {
+ .inum = inode->ei_inode.bi_inum,
+ .subvol = inode->ei_subvol,
+ .gen = inode->ei_inode.bi_generation,
+ };
+}
- if (ino < BCACHEFS_ROOT_INO)
- return ERR_PTR(-ESTALE);
+static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len,
+ struct inode *vdir)
+{
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+ struct bch_inode_info *dir = to_bch_ei(vdir);
+
+ if (*len < sizeof(struct bcachefs_fid_with_parent) / sizeof(u32))
+ return FILEID_INVALID;
+
+ if (!S_ISDIR(inode->v.i_mode) && dir) {
+ struct bcachefs_fid_with_parent *fid = (void *) fh;
+
+ fid->fid = bch2_inode_to_fid(inode);
+ fid->dir = bch2_inode_to_fid(dir);
+
+ *len = sizeof(*fid) / sizeof(u32);
+ return FILEID_BCACHEFS_WITH_PARENT;
+ } else {
+ struct bcachefs_fid *fid = (void *) fh;
+
+ *fid = bch2_inode_to_fid(inode);
+
+ *len = sizeof(*fid) / sizeof(u32);
+ return FILEID_BCACHEFS_WITHOUT_PARENT;
+ }
+}
- vinode = bch2_vfs_inode_get(c, ino);
- if (IS_ERR(vinode))
- return ERR_CAST(vinode);
- if (generation && vinode->i_generation != generation) {
- /* we didn't find the right inode.. */
+static struct inode *bch2_nfs_get_inode(struct super_block *sb,
+ struct bcachefs_fid fid)
+{
+ struct bch_fs *c = sb->s_fs_info;
+ struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) {
+ .subvol = fid.subvol,
+ .inum = fid.inum,
+ });
+ if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) {
iput(vinode);
- return ERR_PTR(-ESTALE);
+ vinode = ERR_PTR(-ESTALE);
}
return vinode;
}
-static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *fid,
+static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid,
int fh_len, int fh_type)
{
- return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
- bch2_nfs_get_inode);
+ struct bcachefs_fid *fid = (void *) _fid;
+
+ if (!bcachefs_fid_valid(fh_len, fh_type))
+ return NULL;
+
+ return d_obtain_alias(bch2_nfs_get_inode(sb, *fid));
}
-static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid,
+static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid,
int fh_len, int fh_type)
{
- return generic_fh_to_parent(sb, fid, fh_len, fh_type,
- bch2_nfs_get_inode);
+ struct bcachefs_fid_with_parent *fid = (void *) _fid;
+
+ if (!bcachefs_fid_valid(fh_len, fh_type) ||
+ fh_type != FILEID_BCACHEFS_WITH_PARENT)
+ return NULL;
+
+ return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir));
+}
+
+static struct dentry *bch2_get_parent(struct dentry *child)
+{
+ struct bch_inode_info *inode = to_bch_ei(child->d_inode);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ subvol_inum parent_inum = {
+ .subvol = inode->ei_inode.bi_parent_subvol ?:
+ inode->ei_subvol,
+ .inum = inode->ei_inode.bi_dir,
+ };
+
+ if (!parent_inum.inum)
+ return NULL;
+
+ return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum));
+}
+
+static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child)
+{
+ struct bch_inode_info *inode = to_bch_ei(child->d_inode);
+ struct bch_inode_info *dir = to_bch_ei(parent->d_inode);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct btree_trans trans;
+ struct btree_iter iter1;
+ struct btree_iter iter2;
+ struct bkey_s_c k;
+ struct bkey_s_c_dirent d;
+ struct bch_inode_unpacked inode_u;
+ subvol_inum target;
+ u32 snapshot;
+ unsigned name_len;
+ int ret;
+
+ if (!S_ISDIR(dir->v.i_mode))
+ return -EINVAL;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ bch2_trans_iter_init(&trans, &iter1, BTREE_ID_dirents,
+ POS(dir->ei_inode.bi_inum, 0), 0);
+ bch2_trans_iter_init(&trans, &iter2, BTREE_ID_dirents,
+ POS(dir->ei_inode.bi_inum, 0), 0);
+retry:
+ bch2_trans_begin(&trans);
+
+ ret = bch2_subvolume_get_snapshot(&trans, dir->ei_subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ bch2_btree_iter_set_snapshot(&iter1, snapshot);
+ bch2_btree_iter_set_snapshot(&iter2, snapshot);
+
+ ret = bch2_inode_find_by_inum_trans(&trans, inode_inum(inode), &inode_u);
+ if (ret)
+ goto err;
+
+ if (inode_u.bi_dir == dir->ei_inode.bi_inum) {
+ bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));
+
+ k = bch2_btree_iter_peek_slot(&iter1);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_dirent) {
+ ret = -ENOENT;
+ goto err;
+ }
+
+ d = bkey_s_c_to_dirent(k);
+ ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target);
+ if (ret > 0)
+ ret = -ENOENT;
+ if (ret)
+ goto err;
+
+ if (target.subvol == inode->ei_subvol &&
+ target.inum == inode->ei_inode.bi_inum)
+ goto found;
+ } else {
+ /*
+ * File with multiple hardlinks and our backref is to the wrong
+ * directory - linear search:
+ */
+ for_each_btree_key_continue_norestart(iter2, 0, k, ret) {
+ if (k.k->p.inode > dir->ei_inode.bi_inum)
+ break;
+
+ if (k.k->type != KEY_TYPE_dirent)
+ continue;
+
+ d = bkey_s_c_to_dirent(k);
+ ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target);
+ if (ret < 0)
+ break;
+ if (ret)
+ continue;
+
+ if (target.subvol == inode->ei_subvol &&
+ target.inum == inode->ei_inode.bi_inum)
+ goto found;
+ }
+ }
+
+ ret = -ENOENT;
+ goto err;
+found:
+ name_len = min_t(unsigned, bch2_dirent_name_bytes(d), NAME_MAX);
+
+ memcpy(name, d.v->d_name, name_len);
+ name[name_len] = '\0';
+err:
+ if (ret == -EINTR)
+ goto retry;
+
+ bch2_trans_iter_exit(&trans, &iter1);
+ bch2_trans_iter_exit(&trans, &iter2);
+ bch2_trans_exit(&trans);
+
+ return ret;
}
static const struct export_operations bch_export_ops = {
+ .encode_fh = bch2_encode_fh,
.fh_to_dentry = bch2_fh_to_dentry,
.fh_to_parent = bch2_fh_to_parent,
- //.get_parent = bch2_get_parent,
+ .get_parent = bch2_get_parent,
+ .get_name = bch2_get_name,
};
-static void bch2_vfs_inode_init(struct bch_fs *c,
+static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi)
+ struct bch_inode_unpacked *bi,
+ struct bch_subvolume *subvol)
{
- bch2_inode_update_after_write(c, inode, bi, ~0);
+ bch2_inode_update_after_write(trans, inode, bi, ~0);
+
+ if (BCH_SUBVOLUME_SNAP(subvol))
+ set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
+ else
+ clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
inode->v.i_blocks = bi->bi_sectors;
inode->v.i_ino = bi->bi_inum;
inode->v.i_size = bi->bi_size;
inode->ei_flags = 0;
- inode->ei_journal_seq = 0;
inode->ei_quota_reserved = 0;
inode->ei_qid = bch_qid(bi);
+ inode->ei_subvol = inum.subvol;
inode->v.i_mapping->a_ops = &bch_address_space_operations;
mutex_init(&inode->ei_update_lock);
pagecache_lock_init(&inode->ei_pagecache_lock);
mutex_init(&inode->ei_quota_lock);
- inode->ei_journal_seq = 0;
return &inode->v;
}
KEY_TYPE_QUOTA_WARN);
bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
KEY_TYPE_QUOTA_WARN);
- bch2_inode_rm(c, inode->v.i_ino, true);
+ bch2_inode_rm(c, inode_inum(inode));
}
}
+void bch2_evict_subvolume_inodes(struct bch_fs *c,
+ struct snapshot_id_list *s)
+{
+ struct super_block *sb = c->vfs_sb;
+ struct inode *inode;
+
+ spin_lock(&sb->s_inode_list_lock);
+ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+ if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) ||
+ (inode->i_state & I_FREEING))
+ continue;
+
+ d_mark_dontcache(inode);
+ d_prune_aliases(inode);
+ }
+ spin_unlock(&sb->s_inode_list_lock);
+again:
+ cond_resched();
+ spin_lock(&sb->s_inode_list_lock);
+ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+ if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) ||
+ (inode->i_state & I_FREEING))
+ continue;
+
+ if (!(inode->i_state & I_DONTCACHE)) {
+ d_mark_dontcache(inode);
+ d_prune_aliases(inode);
+ }
+
+ spin_lock(&inode->i_lock);
+ if (snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) &&
+ !(inode->i_state & I_FREEING)) {
+ wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_NEW);
+ DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&inode->i_lock);
+ spin_unlock(&sb->s_inode_list_lock);
+ schedule();
+ finish_wait(wq, &wait.wq_entry);
+ goto again;
+ }
+
+ spin_unlock(&inode->i_lock);
+ }
+ spin_unlock(&sb->s_inode_list_lock);
+}
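The eviction here is two-phase: the first sweep marks every inode belonging to a dying subvolume I_DONTCACHE and prunes its dentries, and the second sweep re-checks each candidate under i_lock and, if it is still live, sleeps on the inode's __I_NEW bit waitqueue (which evict() wakes once an inode is finally freed) before restarting the scan from the top.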
+
static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
const struct bch_option *opt = &bch2_opt_table[i];
u64 v = bch2_opt_get_by_id(&c->opts, i);
- if (!(opt->mode & OPT_MOUNT))
+ if (!(opt->flags & OPT_MOUNT))
continue;
if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
sb->s_flags |= SB_POSIXACL;
#endif
- vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_INO);
+ sb->s_shrink.seeks = 0;
+
+ vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
if (IS_ERR(vinode)) {
bch_err(c, "error mounting: error getting root inode %i",
(int) PTR_ERR(vinode));
unsigned long ei_flags;
struct mutex ei_update_lock;
- u64 ei_journal_seq;
u64 ei_quota_reserved;
unsigned long ei_last_dirtied;
struct mutex ei_quota_lock;
struct bch_qid ei_qid;
+ u32 ei_subvol;
+
/* copy of inode in btree: */
struct bch_inode_unpacked ei_inode;
};
+static inline subvol_inum inode_inum(struct bch_inode_info *inode)
+{
+ return (subvol_inum) {
+ .subvol = inode->ei_subvol,
+ .inum = inode->ei_inode.bi_inum,
+ };
+}
+
/*
* Set if we've gotten a btree error for this inode, and thus the vfs inode and
* btree inode may be inconsistent:
*/
#define EI_INODE_ERROR 0
+/*
+ * Set if the inode is in a snapshot subvolume - we don't do quota accounting in
+ * those:
+ */
+#define EI_INODE_SNAPSHOT 1
+
#define to_bch_ei(_inode) \
container_of_or_null(_inode, struct bch_inode_info, v)
#ifndef NO_BCACHEFS_FS
+struct bch_inode_info *
+__bch2_create(struct user_namespace *, struct bch_inode_info *,
+ struct dentry *, umode_t, dev_t, subvol_inum, unsigned);
+
int bch2_fs_quota_transfer(struct bch_fs *,
struct bch_inode_info *,
struct bch_qid,
KEY_TYPE_QUOTA_PREALLOC);
}
-struct inode *bch2_vfs_inode_get(struct bch_fs *, u64);
+struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum);
/* returns 0 if we want to do the update, or error is passed up */
typedef int (*inode_set_fn)(struct bch_inode_info *,
struct bch_inode_unpacked *, void *);
-void bch2_inode_update_after_write(struct bch_fs *,
+void bch2_inode_update_after_write(struct btree_trans *,
struct bch_inode_info *,
struct bch_inode_unpacked *,
unsigned);
int bch2_setattr_nonsize(struct user_namespace *,
struct bch_inode_info *,
struct iattr *);
+int __bch2_unlink(struct inode *, struct dentry *, bool);
+
+void bch2_evict_subvolume_inodes(struct bch_fs *, struct snapshot_id_list *);
void bch2_vfs_exit(void);
int bch2_vfs_init(void);
#else
+static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
+ struct snapshot_id_list *s) {}
static inline void bch2_vfs_exit(void) {}
static inline int bch2_vfs_init(void) { return 0; }
#include "fsck.h"
#include "inode.h"
#include "keylist.h"
+#include "subvolume.h"
#include "super.h"
#include "xattr.h"
#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
-static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum)
+static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
+ u32 snapshot)
{
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
u64 sectors = 0;
int ret;
for_each_btree_key(trans, iter, BTREE_ID_extents,
- POS(inum, 0), 0, k, ret) {
+ SPOS(inum, 0, snapshot), 0, k, ret) {
if (k.k->p.inode != inum)
break;
sectors += k.k->size;
}
- bch2_trans_iter_free(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret ?: sectors;
}
+static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
+ u32 snapshot)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_dirent d;
+ u64 subdirs = 0;
+ int ret;
+
+ for_each_btree_key(trans, iter, BTREE_ID_dirents,
+ SPOS(inum, 0, snapshot), 0, k, ret) {
+ if (k.k->p.inode != inum)
+ break;
+
+ if (k.k->type != KEY_TYPE_dirent)
+ continue;
+
+ d = bkey_s_c_to_dirent(k);
+ if (d.v->d_type == DT_DIR)
+ subdirs++;
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret ?: subdirs;
+}
+
+static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot,
+ u32 *subvol)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
+ POS(0, snapshot), 0);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_snapshot) {
+ bch_err(trans->c, "snapshot %u not fonud", snapshot);
+ ret = -ENOENT;
+ goto err;
+ }
+
+ *subvol = le32_to_cpu(bkey_s_c_to_snapshot(k).v->subvol);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int __subvol_lookup(struct btree_trans *trans, u32 subvol,
+ u32 *snapshot, u64 *inum)
+{
+ struct bch_subvolume s;
+ int ret;
+
+ ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
+
+ *snapshot = le32_to_cpu(s.snapshot);
+ *inum = le64_to_cpu(s.inode);
+ return ret;
+}
+
+static int subvol_lookup(struct btree_trans *trans, u32 subvol,
+ u32 *snapshot, u64 *inum)
+{
+ return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum));
+}
+
+static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
+ struct bch_inode_unpacked *inode)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
+ POS(0, inode_nr),
+ BTREE_ITER_ALL_SNAPSHOTS);
+ k = bch2_btree_iter_peek(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (!k.k || bkey_cmp(k.k->p, POS(0, inode_nr))) {
+ ret = -ENOENT;
+ goto err;
+ }
+
+ ret = bch2_inode_unpack(k, inode);
+err:
+ if (ret && ret != -EINTR)
+ bch_err(trans->c, "error %i fetching inode %llu",
+ ret, inode_nr);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
struct bch_inode_unpacked *inode,
u32 *snapshot)
{
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
int ret;
- iter = bch2_trans_get_iter(trans, BTREE_ID_inodes,
- POS(0, inode_nr), 0);
- k = bch2_btree_iter_peek_slot(iter);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, inode_nr, *snapshot), 0);
+ k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
goto err;
- if (snapshot)
- *snapshot = iter->pos.snapshot;
- ret = k.k->type == KEY_TYPE_inode
- ? bch2_inode_unpack(bkey_s_c_to_inode(k), inode)
+ ret = bkey_is_inode(k.k)
+ ? bch2_inode_unpack(k, inode)
: -ENOENT;
+ if (!ret)
+ *snapshot = iter.pos.snapshot;
err:
- bch2_trans_iter_free(trans, iter);
+ if (ret && ret != -EINTR)
+ bch_err(trans->c, "error %i fetching inode %llu:%u",
+ ret, inode_nr, *snapshot);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot));
}
+static int __lookup_dirent(struct btree_trans *trans,
+ struct bch_hash_info hash_info,
+ subvol_inum dir, struct qstr *name,
+ u64 *target, unsigned *type)
+{
+ struct btree_iter iter;
+ struct bkey_s_c_dirent d;
+ int ret;
+
+ ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc,
+ &hash_info, dir, name, 0);
+ if (ret)
+ return ret;
+
+ d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
+ *target = le64_to_cpu(d.v->d_inum);
+ *type = d.v->d_type;
+ bch2_trans_iter_exit(trans, &iter);
+ return 0;
+}
+
static int __write_inode(struct btree_trans *trans,
struct bch_inode_unpacked *inode,
u32 snapshot)
{
- struct btree_iter *inode_iter =
- bch2_trans_get_iter(trans, BTREE_ID_inodes,
- SPOS(0, inode->bi_inum, snapshot),
- BTREE_ITER_INTENT);
- int ret = bch2_btree_iter_traverse(inode_iter) ?:
- bch2_inode_write(trans, inode_iter, inode);
- bch2_trans_iter_put(trans, inode_iter);
+ struct btree_iter iter;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, inode->bi_inum, snapshot),
+ BTREE_ITER_INTENT);
+
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_inode_write(trans, &iter, inode);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
return ret;
}
+static int fsck_inode_rm(struct btree_trans *trans, u64 inum, u32 snapshot)
+{
+ struct btree_iter iter = { NULL };
+ struct bkey_i_inode_generation delete;
+ struct bch_inode_unpacked inode_u;
+ struct bkey_s_c k;
+ int ret;
+
+ ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
+ SPOS(inum, 0, snapshot),
+ SPOS(inum, U64_MAX, snapshot),
+ 0, NULL) ?:
+ bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
+ SPOS(inum, 0, snapshot),
+ SPOS(inum, U64_MAX, snapshot),
+ 0, NULL) ?:
+ bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
+ SPOS(inum, 0, snapshot),
+ SPOS(inum, U64_MAX, snapshot),
+ 0, NULL);
+ if (ret)
+ goto err;
+retry:
+ bch2_trans_begin(trans);
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, inum, snapshot), BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
+
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (!bkey_is_inode(k.k)) {
+ bch2_fs_inconsistent(trans->c,
+ "inode %llu:%u not found when deleting",
+ inum, snapshot);
+ ret = -EIO;
+ goto err;
+ }
+
+ bch2_inode_unpack(k, &inode_u);
+
+ /* Subvolume root? */
+ if (inode_u.bi_subvol) {
+ ret = bch2_subvolume_delete(trans, inode_u.bi_subvol);
+ if (ret)
+ goto err;
+ }
+
+ bkey_inode_generation_init(&delete.k_i);
+ delete.k.p = iter.pos;
+ delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
+
+ ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ if (ret == -EINTR)
+ goto retry;
+
+ return ret;
+}
+
static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
{
struct bch_fs *c = trans->c;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bch_inode_unpacked dir_inode;
struct bch_hash_info dir_hash_info;
int ret;
- ret = lookup_inode(trans, pos.inode, &dir_inode, NULL);
+ ret = lookup_first_inode(trans, pos.inode, &dir_inode);
if (ret)
return ret;
dir_hash_info = bch2_hash_info_init(c, &dir_inode);
- iter = bch2_trans_get_iter(trans, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
- &dir_hash_info, iter);
- bch2_trans_iter_put(trans, iter);
- return ret;
-}
-
-static int remove_dirent(struct btree_trans *trans, struct bpos pos)
-{
- int ret = __bch2_trans_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
- __remove_dirent(trans, pos));
- if (ret)
- bch_err(trans->c, "remove_dirent: err %i deleting dirent", ret);
+ &dir_hash_info, &iter, 0);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
/* Get lost+found, create if it doesn't exist: */
-static int lookup_lostfound(struct btree_trans *trans,
+static int lookup_lostfound(struct btree_trans *trans, u32 subvol,
struct bch_inode_unpacked *lostfound)
{
struct bch_fs *c = trans->c;
struct bch_inode_unpacked root;
struct bch_hash_info root_hash_info;
struct qstr lostfound_str = QSTR("lost+found");
- u64 inum;
+ subvol_inum root_inum = { .subvol = subvol };
+ u64 inum = 0;
+ unsigned d_type = 0;
u32 snapshot;
int ret;
- ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root, &snapshot);
- if (ret && ret != -ENOENT)
+ ret = __subvol_lookup(trans, subvol, &snapshot, &root_inum.inum);
+ if (ret)
+ return ret;
+
+ ret = __lookup_inode(trans, root_inum.inum, &root, &snapshot);
+ if (ret)
return ret;
root_hash_info = bch2_hash_info_init(c, &root);
- inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info,
- &lostfound_str);
- if (!inum) {
+
+ ret = __lookup_dirent(trans, root_hash_info, root_inum,
+ &lostfound_str, &inum, &d_type);
+ if (ret == -ENOENT) {
bch_notice(c, "creating lost+found");
goto create_lostfound;
}
- ret = lookup_inode(trans, inum, lostfound, &snapshot);
- if (ret && ret != -ENOENT) {
- /*
- * The check_dirents pass has already run, dangling dirents
- * shouldn't exist here:
- */
+ if (ret && ret != -EINTR)
bch_err(c, "error looking up lost+found: %i", ret);
+ if (ret)
return ret;
- }
- if (ret == -ENOENT) {
-create_lostfound:
- bch2_inode_init_early(c, lostfound);
-
- ret = __bch2_trans_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
- bch2_create_trans(trans,
- BCACHEFS_ROOT_INO, &root,
- lostfound,
- &lostfound_str,
- 0, 0, S_IFDIR|0700, 0, NULL, NULL));
- if (ret)
- bch_err(c, "error creating lost+found: %i", ret);
+ if (d_type != DT_DIR) {
+ bch_err(c, "error looking up lost+found: not a directory");
+ return -ENOENT;
}
- return 0;
+ /*
+ * The check_dirents pass has already run, dangling dirents
+ * shouldn't exist here:
+ */
+ return __lookup_inode(trans, inum, lostfound, &snapshot);
+
+create_lostfound:
+ bch2_inode_init_early(c, lostfound);
+
+ ret = bch2_create_trans(trans, root_inum, &root,
+ lostfound, &lostfound_str,
+ 0, 0, S_IFDIR|0700, 0, NULL, NULL,
+ (subvol_inum) { }, 0);
+ if (ret && ret != -EINTR)
+ bch_err(c, "error creating lost+found: %i", ret);
+ return ret;
}
-static int reattach_inode(struct btree_trans *trans,
- struct bch_inode_unpacked *inode)
+static int __reattach_inode(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode,
+ u32 inode_snapshot)
{
struct bch_hash_info dir_hash;
struct bch_inode_unpacked lostfound;
char name_buf[20];
struct qstr name;
u64 dir_offset = 0;
+ u32 subvol;
int ret;
- ret = lookup_lostfound(trans, &lostfound);
+ ret = __snapshot_lookup_subvol(trans, inode_snapshot, &subvol);
+ if (ret)
+ return ret;
+
+ ret = lookup_lostfound(trans, subvol, &lostfound);
if (ret)
return ret;
if (S_ISDIR(inode->bi_mode)) {
lostfound.bi_nlink++;
- ret = write_inode(trans, &lostfound, U32_MAX);
+ ret = __write_inode(trans, &lostfound, U32_MAX);
if (ret)
return ret;
}
snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
name = (struct qstr) QSTR(name_buf);
- ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
- bch2_dirent_create(trans, lostfound.bi_inum, &dir_hash,
- mode_to_type(inode->bi_mode),
- &name, inode->bi_inum, &dir_offset,
- BCH_HASH_SET_MUST_CREATE));
+ ret = bch2_dirent_create(trans,
+ (subvol_inum) {
+ .subvol = subvol,
+ .inum = lostfound.bi_inum,
+ },
+ &dir_hash,
+ inode_d_type(inode),
+ &name, inode->bi_inum, &dir_offset,
+ BCH_HASH_SET_MUST_CREATE);
+ if (ret)
+ return ret;
+
+ inode->bi_dir = lostfound.bi_inum;
+ inode->bi_dir_offset = dir_offset;
+
+ return __write_inode(trans, inode, inode_snapshot);
+}
+
+static int reattach_inode(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode,
+ u32 inode_snapshot)
+{
+ int ret = __bch2_trans_do(trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL,
+ __reattach_inode(trans, inode, inode_snapshot));
if (ret) {
bch_err(trans->c, "error %i reattaching inode %llu",
ret, inode->bi_inum);
return ret;
}
- inode->bi_dir = lostfound.bi_inum;
- inode->bi_dir_offset = dir_offset;
-
- return write_inode(trans, inode, U32_MAX);
+ return ret;
}
static int remove_backpointer(struct btree_trans *trans,
struct bch_inode_unpacked *inode)
{
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
int ret;
- iter = bch2_trans_get_iter(trans, BTREE_ID_dirents,
- POS(inode->bi_dir, inode->bi_dir_offset), 0);
- k = bch2_btree_iter_peek_slot(iter);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents,
+ POS(inode->bi_dir, inode->bi_dir_offset), 0);
+ k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
goto out;
goto out;
}
- ret = remove_dirent(trans, k.k->p);
+ ret = __remove_dirent(trans, k.k->p);
out:
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
+static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, struct bpos pos)
+{
+ pos.snapshot = snapshot_t(c, pos.snapshot)->equiv;
+
+ if (bkey_cmp(s->pos, pos))
+ s->nr = 0;
+ s->pos = pos;
+
+ /* Might get called multiple times due to lock restarts */
+ if (s->nr && s->d[s->nr - 1] == pos.snapshot)
+ return 0;
+
+ return snapshots_seen_add(c, s, pos.snapshot);
+}
+
+/**
+ * key_visible_in_snapshot - returns true if @id is a descendant of @ancestor,
+ * and @ancestor hasn't been overwritten in @seen
+ *
+ * That is, returns whether a key in the @ancestor snapshot is visible in the @id snapshot
+ */
+static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
+ u32 id, u32 ancestor)
+{
+ ssize_t i;
+
+ BUG_ON(id > ancestor);
+
+ id = snapshot_t(c, id)->equiv;
+ ancestor = snapshot_t(c, ancestor)->equiv;
+
+ /* @ancestor should be the snapshot most recently added to @seen */
+ BUG_ON(!seen->nr || seen->d[seen->nr - 1] != ancestor);
+ BUG_ON(seen->pos.snapshot != ancestor);
+
+ if (id == ancestor)
+ return true;
+
+ if (!bch2_snapshot_is_ancestor(c, id, ancestor))
+ return false;
+
+ for (i = seen->nr - 2;
+ i >= 0 && seen->d[i] >= id;
+ --i)
+ if (bch2_snapshot_is_ancestor(c, id, seen->d[i]) &&
+ bch2_snapshot_is_ancestor(c, seen->d[i], ancestor))
+ return false;
+
+ return true;
+}
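The visibility rule is subtle, so here is a minimal standalone sketch of it (a simplified model, not the kernel's snapshot table: the tree shape, IDs, and parent array are illustrative, and "seen" stands in for the sorted list of snapshot IDs at which the key has been overwritten, ancestor last):

#include <stdbool.h>
#include <stddef.h>

/*
 * Toy snapshot tree; as in bcachefs, every child has a strictly
 * smaller ID than its parent (which is what the BUG_ON(id > ancestor)
 * above relies on):
 *
 *           5
 *          / \
 *         3   4
 *        / \
 *       1   2
 */
static const unsigned toy_parent[6] = {
	[1] = 3, [2] = 3, [3] = 5, [4] = 5, [5] = 0,
};

static bool toy_is_ancestor(unsigned id, unsigned ancestor)
{
	while (id && id < ancestor)
		id = toy_parent[id];
	return id == ancestor;
}

/*
 * A key written at @ancestor is visible at @id unless some snapshot
 * in @seen strictly between the two overwrote it:
 */
static bool toy_key_visible(unsigned id, unsigned ancestor,
			    const unsigned *seen, size_t nr)
{
	size_t i;

	if (id == ancestor)
		return true;
	if (!toy_is_ancestor(id, ancestor))
		return false;

	for (i = 0; i < nr; i++)
		if (seen[i] != ancestor &&
		    toy_is_ancestor(id, seen[i]) &&
		    toy_is_ancestor(seen[i], ancestor))
			return false;
	return true;
}

With seen = {3, 5}, a key written at 5 is no longer visible at 1 (snapshot 3 overwrote it on that path) but remains visible at 4.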
+
+/**
+ * ref_visible - given a key with snapshot id @src that points to a key with
+ * snapshot id @dst, test whether there is some snapshot in which @dst is
+ * visible.
+ *
+ * This assumes we're visiting @src keys in natural key order.
+ *
+ * @s - list of snapshot IDs already seen at @src
+ * @src - snapshot ID of src key
+ * @dst - snapshot ID of dst key
+ */
+static int ref_visible(struct bch_fs *c, struct snapshots_seen *s,
+ u32 src, u32 dst)
+{
+ return dst <= src
+ ? key_visible_in_snapshot(c, s, dst, src)
+ : bch2_snapshot_is_ancestor(c, src, dst);
+}
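In terms of the toy tree sketched above: an extent key at src = 3 may refer to an inode at dst = 5, an ancestor, so only bch2_snapshot_is_ancestor() needs checking; a reference down to dst = 1, a descendant, is live only if no overwrite recorded in @s sits between 1 and 3.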
+
+#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \
+ for (_i = (_w)->d; _i < (_w)->d + (_w)->nr && (_i)->snapshot <= (_snapshot); _i++)\
+ if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
+
struct inode_walker {
- bool first_this_inode;
- bool have_inode;
- u64 cur_inum;
- u32 snapshot;
- struct bch_inode_unpacked inode;
+ bool first_this_inode;
+ u64 cur_inum;
+
+ size_t nr;
+ size_t size;
+ struct inode_walker_entry {
+ struct bch_inode_unpacked inode;
+ u32 snapshot;
+ u64 count;
+ } *d;
};
+static void inode_walker_exit(struct inode_walker *w)
+{
+ kfree(w->d);
+ w->d = NULL;
+}
+
static struct inode_walker inode_walker_init(void)
{
- return (struct inode_walker) {
- .cur_inum = -1,
- .have_inode = false,
+ return (struct inode_walker) { 0, };
+}
+
+static int inode_walker_realloc(struct bch_fs *c, struct inode_walker *w)
+{
+ if (w->nr == w->size) {
+ size_t new_size = max_t(size_t, 8UL, w->size * 2);
+ void *d = krealloc(w->d, new_size * sizeof(w->d[0]),
+ GFP_KERNEL);
+ if (!d) {
+ bch_err(c, "fsck: error allocating memory for inode_walker, size %zu",
+ new_size);
+ return -ENOMEM;
+ }
+
+ w->d = d;
+ w->size = new_size;
+ }
+
+ return 0;
+}
+
+static int add_inode(struct bch_fs *c, struct inode_walker *w,
+ struct bkey_s_c inode)
+{
+ struct bch_inode_unpacked u;
+ int ret;
+
+ ret = inode_walker_realloc(c, w);
+ if (ret)
+ return ret;
+
+ BUG_ON(bch2_inode_unpack(inode, &u));
+
+ w->d[w->nr++] = (struct inode_walker_entry) {
+ .inode = u,
+ .snapshot = snapshot_t(c, inode.k->p.snapshot)->equiv,
};
+
+ return 0;
}
static int __walk_inode(struct btree_trans *trans,
- struct inode_walker *w, u64 inum)
+ struct inode_walker *w, struct bpos pos)
{
- if (inum != w->cur_inum) {
- int ret = __lookup_inode(trans, inum, &w->inode, &w->snapshot);
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ unsigned i, ancestor_pos;
+ int ret;
- if (ret && ret != -ENOENT)
- return ret;
+ pos.snapshot = snapshot_t(c, pos.snapshot)->equiv;
- w->have_inode = !ret;
- w->cur_inum = inum;
- w->first_this_inode = true;
- } else {
+ if (pos.inode == w->cur_inum) {
w->first_this_inode = false;
+ goto lookup_snapshot;
}
- return 0;
+ w->nr = 0;
+
+ for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, pos.inode),
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ if (k.k->p.offset != pos.inode)
+ break;
+
+ if (bkey_is_inode(k.k))
+ add_inode(c, w, k);
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret)
+ return ret;
+
+ w->cur_inum = pos.inode;
+ w->first_this_inode = true;
+lookup_snapshot:
+ for (i = 0; i < w->nr; i++)
+ if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->d[i].snapshot))
+ goto found;
+ return INT_MAX;
+found:
+ BUG_ON(pos.snapshot > w->d[i].snapshot);
+
+ if (pos.snapshot != w->d[i].snapshot) {
+ ancestor_pos = i;
+
+ while (i && w->d[i - 1].snapshot > pos.snapshot)
+ --i;
+
+ ret = inode_walker_realloc(c, w);
+ if (ret)
+ return ret;
+
+ array_insert_item(w->d, w->nr, i, w->d[ancestor_pos]);
+ w->d[i].snapshot = pos.snapshot;
+ w->d[i].count = 0;
+ }
+
+ return i;
}
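Note the overlay behaviour: when a key's snapshot has no exact match in the walker's list, __walk_inode() duplicates the nearest ancestor's entry at the key's snapshot ID, keeping the array sorted and zeroing the new entry's count, so that per-snapshot sector and subdirectory counts accumulate separately instead of being lumped into the ancestor's.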
-static int walk_inode(struct btree_trans *trans,
- struct inode_walker *w, u64 inum)
+static int __get_visible_inodes(struct btree_trans *trans,
+ struct inode_walker *w,
+ struct snapshots_seen *s,
+ u64 inum)
{
- return lockrestart_do(trans, __walk_inode(trans, w, inum));
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ w->nr = 0;
+
+ for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum),
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ if (k.k->p.offset != inum)
+ break;
+
+ if (!bkey_is_inode(k.k))
+ continue;
+
+ if (ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) {
+ add_inode(c, w, k);
+ if (k.k->p.snapshot >= s->pos.snapshot)
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}
+
+static int check_key_has_snapshot(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ char buf[200];
+ int ret = 0;
+
+ if (mustfix_fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c,
+ "key in missing snapshot: %s",
+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
+ return bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1;
+fsck_err:
+ return ret;
}
static int hash_redo_key(struct btree_trans *trans,
struct bch_hash_info *hash_info,
struct btree_iter *k_iter, struct bkey_s_c k)
{
+ bch_err(trans->c, "hash_redo_key() not implemented yet");
+ return -EINVAL;
+#if 0
struct bkey_i *delete;
struct bkey_i *tmp;
return bch2_btree_iter_traverse(k_iter) ?:
bch2_trans_update(trans, k_iter, delete, 0) ?:
bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0);
-}
-
-static int fsck_hash_delete_at(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- struct bch_hash_info *info,
- struct btree_iter *iter)
-{
- int ret;
-retry:
- ret = bch2_hash_delete_at(trans, desc, info, iter) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW);
- if (ret == -EINTR) {
- ret = bch2_btree_iter_traverse(iter);
- if (!ret)
- goto retry;
- }
-
- return ret;
+#endif
}
static int hash_check_key(struct btree_trans *trans,
struct btree_iter *k_iter, struct bkey_s_c hash_k)
{
struct bch_fs *c = trans->c;
- struct btree_iter *iter = NULL;
+ struct btree_iter iter = { NULL };
char buf[200];
struct bkey_s_c k;
u64 hash;
"duplicate hash table keys:\n%s",
(bch2_bkey_val_to_text(&PBUF(buf), c,
hash_k), buf))) {
- ret = fsck_hash_delete_at(trans, desc, hash_info, k_iter);
- if (ret)
- return ret;
- ret = 1;
+ ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
break;
}
if (bkey_deleted(k.k)) {
- bch2_trans_iter_free(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
goto bad_hash;
}
}
- bch2_trans_iter_free(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
bad_hash:
if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, "
(bch2_bkey_val_to_text(&PBUF(buf), c, hash_k), buf)) == FSCK_ERR_IGNORE)
return 0;
- ret = __bch2_trans_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
- hash_redo_key(trans, desc, hash_info, k_iter, hash_k));
+ ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
if (ret) {
bch_err(c, "hash_redo_key err %i", ret);
return ret;
static int check_inode(struct btree_trans *trans,
struct btree_iter *iter,
- struct bkey_s_c_inode inode)
+ struct bch_inode_unpacked *prev,
+ bool full)
{
struct bch_fs *c = trans->c;
+ struct bkey_s_c k;
struct bch_inode_unpacked u;
bool do_update = false;
- int ret = 0;
+ int ret;
- ret = bch2_inode_unpack(inode, &u);
+ k = bch2_btree_iter_peek(iter);
+ if (!k.k)
+ return 0;
- if (bch2_fs_inconsistent_on(ret, c,
- "error unpacking inode %llu in fsck",
- inode.k->p.inode))
+ ret = bkey_err(k);
+ if (ret)
return ret;
+ ret = check_key_has_snapshot(trans, iter, k);
+ if (ret)
+ return ret < 0 ? ret : 0;
+
+ /*
+ * if snapshot id isn't a leaf node, skip it - deletion in
+ * particular is not atomic, so on the internal snapshot nodes
+ * we can see inodes marked for deletion after a clean shutdown
+ */
+ if (bch2_snapshot_internal_node(c, k.k->p.snapshot))
+ return 0;
+
+ if (!bkey_is_inode(k.k))
+ return 0;
+
+ BUG_ON(bch2_inode_unpack(k, &u));
+
+ if (!full &&
+ !(u.bi_flags & (BCH_INODE_I_SIZE_DIRTY|
+ BCH_INODE_I_SECTORS_DIRTY|
+ BCH_INODE_UNLINKED)))
+ return 0;
+
+ if (prev->bi_inum != u.bi_inum)
+ *prev = u;
+
+ if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed ||
+ inode_d_type(prev) != inode_d_type(&u), c,
+ "inodes in different snapshots don't match")) {
+ bch_err(c, "repair not implemented yet");
+ return -EINVAL;
+ }
+
if (u.bi_flags & BCH_INODE_UNLINKED &&
(!c->sb.clean ||
fsck_err(c, "filesystem marked clean, but inode %llu unlinked",
u.bi_inum))) {
- bch_verbose(c, "deleting inode %llu", u.bi_inum);
-
bch2_trans_unlock(trans);
bch2_fs_lazy_rw(c);
- ret = bch2_inode_rm(c, u.bi_inum, false);
+ ret = fsck_inode_rm(trans, u.bi_inum, iter->pos.snapshot);
if (ret)
bch_err(c, "error in fsck: error %i while deleting inode", ret);
return ret;
* just switch units to bytes and that issue goes away
*/
ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
- POS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9),
+ SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
+ iter->pos.snapshot),
POS(u.bi_inum, U64_MAX),
- NULL);
+ 0, NULL);
if (ret) {
bch_err(c, "error in fsck: error %i truncating inode", ret);
return ret;
bch_verbose(c, "recounting sectors for inode %llu",
u.bi_inum);
- sectors = bch2_count_inode_sectors(trans, u.bi_inum);
+ sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
if (sectors < 0) {
bch_err(c, "error in fsck: error %i recounting inode sectors",
(int) sectors);
}
if (do_update) {
- ret = __bch2_trans_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
- bch2_btree_iter_traverse(iter) ?:
- bch2_inode_write(trans, iter, &u));
+ ret = write_inode(trans, &u, iter->pos.snapshot);
if (ret)
bch_err(c, "error in fsck: error %i "
"updating inode", ret);
static int check_inodes(struct bch_fs *c, bool full)
{
struct btree_trans trans;
- struct btree_iter *iter;
- struct bkey_s_c k;
- struct bkey_s_c_inode inode;
+ struct btree_iter iter;
+ struct bch_inode_unpacked prev = { 0 };
int ret;
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH, k, ret) {
- if (k.k->type != KEY_TYPE_inode)
- continue;
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS);
- inode = bkey_s_c_to_inode(k);
+ do {
+ ret = __bch2_trans_do(&trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL,
+ check_inode(&trans, &iter, &prev, full));
+ if (ret)
+ break;
+ } while (bch2_btree_iter_advance(&iter));
+ bch2_trans_iter_exit(&trans, &iter);
- if (full ||
- (inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY|
- BCH_INODE_I_SECTORS_DIRTY|
- BCH_INODE_UNLINKED))) {
- ret = check_inode(&trans, iter, inode);
- if (ret)
- break;
- }
+ bch2_trans_exit(&trans);
+ return ret;
+}
+
+static int check_subvol(struct btree_trans *trans,
+ struct btree_iter *iter)
+{
+ struct bkey_s_c k;
+ struct bkey_s_c_subvolume subvol;
+ int ret;
+
+ k = bch2_btree_iter_peek(iter);
+ if (!k.k)
+ return 0;
+
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ if (k.k->type != KEY_TYPE_subvolume)
+ return 0;
+
+ subvol = bkey_s_c_to_subvolume(k);
+
+ if (BCH_SUBVOLUME_UNLINKED(subvol.v)) {
+ ret = bch2_subvolume_delete(trans, iter->pos.offset);
+ if (ret && ret != -EINTR)
+ bch_err(trans->c, "error deleting subvolume %llu: %i",
+ iter->pos.offset, ret);
+ if (ret)
+ return ret;
}
- bch2_trans_iter_put(&trans, iter);
- BUG_ON(ret == -EINTR);
+ return 0;
+}
- return bch2_trans_exit(&trans) ?: ret;
+noinline_for_stack
+static int check_subvols(struct bch_fs *c)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ int ret;
+
+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
+
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_subvolumes,
+ POS_MIN,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_PREFETCH);
+
+ do {
+ ret = __bch2_trans_do(&trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL,
+ check_subvol(&trans, &iter));
+ if (ret)
+ break;
+ } while (bch2_btree_iter_advance(&iter));
+ bch2_trans_iter_exit(&trans, &iter);
+
+ bch2_trans_exit(&trans);
+ return ret;
}
+/*
+ * Checking for overlapping extents needs to be reimplemented
+ */
+#if 0
static int fix_overlapping_extent(struct btree_trans *trans,
struct bkey_s_c k, struct bpos cut_at)
{
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_i *u;
int ret;
* assume things about extent overwrites - we should be running the
* triggers manually here
*/
- iter = bch2_trans_get_iter(trans, BTREE_ID_extents, u->k.p,
- BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, u->k.p,
+ BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS);
- BUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
- ret = bch2_btree_iter_traverse(iter) ?:
- bch2_trans_update(trans, iter, u, BTREE_TRIGGER_NORUN) ?:
+ BUG_ON(iter.flags & BTREE_ITER_IS_EXTENTS);
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(trans, &iter, u, BTREE_TRIGGER_NORUN) ?:
bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW);
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
+#endif
-static int inode_backpointer_exists(struct btree_trans *trans,
- struct bch_inode_unpacked *inode)
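+/*
+ * Look up the dirent at @pos. On success the caller owns @iter and must call
+ * bch2_trans_iter_exit(); on error the iterator has already been exited and
+ * the error is carried in the returned key via ERR_PTR:
+ */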
+static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos pos)
{
- struct btree_iter *iter;
struct bkey_s_c k;
int ret;
- iter = bch2_trans_get_iter(trans, BTREE_ID_dirents,
- POS(inode->bi_dir, inode->bi_dir_offset), 0);
+ bch2_trans_iter_init(trans, iter, BTREE_ID_dirents, pos, 0);
k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
+ if (!ret && k.k->type != KEY_TYPE_dirent)
+ ret = -ENOENT;
+ if (ret) {
+ bch2_trans_iter_exit(trans, iter);
+ return (struct bkey_s_c_dirent) { .k = ERR_PTR(ret) };
+ }
+
+ return bkey_s_c_to_dirent(k);
+}
+
+static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
+ struct bkey_s_c_dirent d)
+{
+ return inode->bi_dir == d.k->p.inode &&
+ inode->bi_dir_offset == d.k->p.offset;
+}
+
+static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *inode)
+{
+ return d.v->d_type == DT_SUBVOL
+ ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol
+ : le64_to_cpu(d.v->d_inum) == inode->bi_inum;
+}
+
+static int inode_backpointer_exists(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode,
+ u32 snapshot)
+{
+ struct btree_iter iter;
+ struct bkey_s_c_dirent d;
+ int ret;
+
+ d = dirent_get_by_pos(trans, &iter,
+ SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot));
+ ret = bkey_err(d.s_c);
if (ret)
- goto out;
- if (k.k->type != KEY_TYPE_dirent)
- goto out;
+ return ret;
- ret = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum) == inode->bi_inum;
-out:
- bch2_trans_iter_free(trans, iter);
+ ret = dirent_points_to_inode(d, inode);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
-static bool inode_backpointer_matches(struct bkey_s_c_dirent d,
- struct bch_inode_unpacked *inode)
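+/*
+ * Compare the per-snapshot i_sectors counts accumulated while walking extents
+ * against each inode in the walker; double check with a fresh recount before
+ * repairing, and return -EINTR after a repair so the caller restarts:
+ */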
+static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
+{
+ struct bch_fs *c = trans->c;
+ struct inode_walker_entry *i;
+ int ret = 0, ret2 = 0;
+ s64 count2;
+
+ for (i = w->d; i < w->d + w->nr; i++) {
+ if (i->inode.bi_sectors == i->count)
+ continue;
+
+ count2 = lockrestart_do(trans,
+ bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot));
+
+ if (i->count != count2) {
+			bch_err(c, "fsck counted i_sectors wrong: got %llu, should be %llu",
+ i->count, count2);
+ i->count = count2;
+ if (i->inode.bi_sectors == i->count)
+ continue;
+ }
+
+ if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c,
+ "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
+ w->cur_inum, i->snapshot,
+ i->inode.bi_sectors, i->count) == FSCK_ERR_IGNORE)
+ continue;
+
+ i->inode.bi_sectors = i->count;
+ ret = write_inode(trans, &i->inode, i->snapshot);
+ if (ret)
+ break;
+ ret2 = -EINTR;
+ }
+fsck_err:
+ return ret ?: ret2;
+}
+
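+/*
+ * Check one extent key: verify the snapshot field, that the extent belongs to
+ * an existing regular file or symlink, that it doesn't extend past i_size
+ * (unless I_SIZE_DIRTY is set), and accumulate its size into the i_sectors
+ * count of every inode it's visible in:
+ */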
+static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
+ struct inode_walker *inode,
+ struct snapshots_seen *s)
{
- return d.k->p.inode == inode->bi_dir &&
- d.k->p.offset == inode->bi_dir_offset;
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k;
+ struct inode_walker_entry *i;
+ char buf[200];
+ int ret = 0;
+
+ k = bch2_btree_iter_peek(iter);
+ if (!k.k)
+ return 0;
+
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ ret = check_key_has_snapshot(trans, iter, k);
+ if (ret)
+ return ret < 0 ? ret : 0;
+
+ ret = snapshots_seen_update(c, s, k.k->p);
+ if (ret)
+ return ret;
+
+ if (k.k->type == KEY_TYPE_whiteout)
+ return 0;
+
+ if (inode->cur_inum != k.k->p.inode) {
+ ret = check_i_sectors(trans, inode);
+ if (ret)
+ return ret;
+ }
+#if 0
+ if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) {
+ char buf1[200];
+ char buf2[200];
+
+ bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k));
+ bch2_bkey_val_to_text(&PBUF(buf2), c, k);
+
+ if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2))
+ return fix_overlapping_extent(trans, k, prev.k->k.p) ?: -EINTR;
+ }
+#endif
+ ret = __walk_inode(trans, inode, k.k->p);
+ if (ret < 0)
+ return ret;
+
+ if (fsck_err_on(ret == INT_MAX, c,
+ "extent in missing inode:\n %s",
+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
+ return bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+
+ if (ret == INT_MAX)
+ return 0;
+
+ i = inode->d + ret;
+ ret = 0;
+
+ if (fsck_err_on(!S_ISREG(i->inode.bi_mode) &&
+ !S_ISLNK(i->inode.bi_mode), c,
+			"extent in non-regular inode mode %o:\n  %s",
+ i->inode.bi_mode,
+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
+ return bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+
+ if (!bch2_snapshot_internal_node(c, k.k->p.snapshot)) {
+ for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) {
+ if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
+ k.k->type != KEY_TYPE_reservation &&
+ k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9, c,
+ "extent type %u offset %llu past end of inode %llu, i_size %llu",
+ k.k->type, k.k->p.offset, k.k->p.inode, i->inode.bi_size)) {
+ bch2_fs_lazy_rw(c);
+ return bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
+ SPOS(k.k->p.inode, round_up(i->inode.bi_size, block_bytes(c)) >> 9,
+ k.k->p.snapshot),
+ POS(k.k->p.inode, U64_MAX),
+ 0, NULL) ?: -EINTR;
+ }
+ }
+ }
+
+ if (bkey_extent_is_allocation(k.k))
+ for_each_visible_inode(c, s, inode, k.k->p.snapshot, i)
+ i->count += k.k->size;
+#if 0
+ bch2_bkey_buf_reassemble(&prev, c, k);
+#endif
+
+fsck_err:
+ return ret;
}
/*
static int check_extents(struct bch_fs *c)
{
struct inode_walker w = inode_walker_init();
+ struct snapshots_seen s;
struct btree_trans trans;
- struct btree_iter *iter;
- struct bkey_s_c k;
- struct bkey_buf prev;
- u64 i_sectors = 0;
+ struct btree_iter iter;
int ret = 0;
+#if 0
+ struct bkey_buf prev;
bch2_bkey_buf_init(&prev);
prev.k->k = KEY(0, 0, 0);
+#endif
+ snapshots_seen_init(&s);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
bch_verbose(c, "checking extents");
- iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
- POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH);
-retry:
- while ((k = bch2_btree_iter_peek(iter)).k &&
- !(ret = bkey_err(k))) {
- if (w.have_inode &&
- w.cur_inum != k.k->p.inode &&
- !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) &&
- fsck_err_on(w.inode.bi_sectors != i_sectors, c,
- "inode %llu has incorrect i_sectors: got %llu, should be %llu",
- w.inode.bi_inum,
- w.inode.bi_sectors, i_sectors)) {
- w.inode.bi_sectors = i_sectors;
-
- ret = write_inode(&trans, &w.inode, w.snapshot);
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+ POS(BCACHEFS_ROOT_INO, 0),
+ BTREE_ITER_INTENT|
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS);
+
+ do {
+ ret = __bch2_trans_do(&trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL,
+ check_extent(&trans, &iter, &w, &s));
+ if (ret)
+ break;
+ } while (bch2_btree_iter_advance(&iter));
+ bch2_trans_iter_exit(&trans, &iter);
+#if 0
+ bch2_bkey_buf_exit(&prev, c);
+#endif
+ inode_walker_exit(&w);
+ bch2_trans_exit(&trans);
+ snapshots_seen_exit(&s);
+
+ return ret;
+}
+
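+/*
+ * As with check_i_sectors(): compare each directory's accumulated
+ * subdirectory count against bi_nlink, recounting on mismatch before
+ * repairing:
+ */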
+static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
+{
+ struct bch_fs *c = trans->c;
+ struct inode_walker_entry *i;
+ int ret = 0, ret2 = 0;
+ s64 count2;
+
+ for (i = w->d; i < w->d + w->nr; i++) {
+ if (i->inode.bi_nlink == i->count)
+ continue;
+
+ count2 = bch2_count_subdirs(trans, w->cur_inum, i->snapshot);
+ if (count2 < 0)
+ return count2;
+
+ if (i->count != count2) {
+			bch_err(c, "fsck counted subdirectories wrong: got %llu, should be %llu",
+ i->count, count2);
+ i->count = count2;
+ if (i->inode.bi_nlink == i->count)
+ continue;
+ }
+
+ if (fsck_err_on(i->inode.bi_nlink != i->count, c,
+ "directory %llu:%u with wrong i_nlink: got %u, should be %llu",
+ w->cur_inum, i->snapshot, i->inode.bi_nlink, i->count)) {
+ i->inode.bi_nlink = i->count;
+ ret = write_inode(trans, &i->inode, i->snapshot);
if (ret)
break;
+ ret2 = -EINTR;
}
+ }
+fsck_err:
+ return ret ?: ret2;
+}
- if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) {
- char buf1[200];
- char buf2[200];
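+/*
+ * Check that a dirent and its target inode agree: install the inode's
+ * backpointer if it was never set, and repair mismatched backpointers,
+ * d_type, and, for subvolumes, d_parent_subvol:
+ */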
+static int check_dirent_target(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *target,
+ u32 target_snapshot)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_i_dirent *n;
+ bool backpointer_exists = true;
+ char buf[200];
+ int ret = 0;
+
+ if (!target->bi_dir &&
+ !target->bi_dir_offset) {
+ target->bi_dir = d.k->p.inode;
+ target->bi_dir_offset = d.k->p.offset;
+
+ ret = __write_inode(trans, target, target_snapshot);
+ if (ret)
+ goto err;
+ }
- bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k));
- bch2_bkey_val_to_text(&PBUF(buf2), c, k);
+ if (!inode_points_to_dirent(target, d)) {
+ ret = inode_backpointer_exists(trans, target, d.k->p.snapshot);
+ if (ret < 0)
+ goto err;
- if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2))
- return fix_overlapping_extent(&trans, k, prev.k->k.p) ?: -EINTR;
+ backpointer_exists = ret;
+ ret = 0;
+
+ if (fsck_err_on(S_ISDIR(target->bi_mode) &&
+ backpointer_exists, c,
+ "directory %llu with multiple links",
+ target->bi_inum)) {
+ ret = __remove_dirent(trans, d.k->p);
+ if (ret)
+ goto err;
+ return 0;
}
- ret = walk_inode(&trans, &w, k.k->p.inode);
- if (ret)
- break;
+ if (fsck_err_on(backpointer_exists &&
+ !target->bi_nlink, c,
+ "inode %llu has multiple links but i_nlink 0",
+ target->bi_inum)) {
+ target->bi_nlink++;
+ target->bi_flags &= ~BCH_INODE_UNLINKED;
- if (w.first_this_inode)
- i_sectors = 0;
-
- if (fsck_err_on(!w.have_inode, c,
- "extent type %u for missing inode %llu",
- k.k->type, k.k->p.inode) ||
- fsck_err_on(w.have_inode &&
- !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c,
- "extent type %u for non regular file, inode %llu mode %o",
- k.k->type, k.k->p.inode, w.inode.bi_mode)) {
- bch2_fs_lazy_rw(c);
- return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents,
- POS(k.k->p.inode, 0),
- POS(k.k->p.inode, U64_MAX),
- NULL) ?: -EINTR;
+ ret = __write_inode(trans, target, target_snapshot);
+ if (ret)
+ goto err;
}
- if (fsck_err_on(w.have_inode &&
- !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
- k.k->type != KEY_TYPE_reservation &&
- k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c,
- "extent type %u offset %llu past end of inode %llu, i_size %llu",
- k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) {
- bch2_fs_lazy_rw(c);
- return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents,
- POS(k.k->p.inode, round_up(w.inode.bi_size, block_bytes(c)) >> 9),
- POS(k.k->p.inode, U64_MAX),
- NULL) ?: -EINTR;
+ if (fsck_err_on(!backpointer_exists, c,
+ "inode %llu:%u has wrong backpointer:\n"
+ "got %llu:%llu\n"
+ "should be %llu:%llu",
+ target->bi_inum, target_snapshot,
+ target->bi_dir,
+ target->bi_dir_offset,
+ d.k->p.inode,
+ d.k->p.offset)) {
+ target->bi_dir = d.k->p.inode;
+ target->bi_dir_offset = d.k->p.offset;
+
+ ret = __write_inode(trans, target, target_snapshot);
+ if (ret)
+ goto err;
}
+ }
- if (bkey_extent_is_allocation(k.k))
- i_sectors += k.k->size;
- bch2_bkey_buf_reassemble(&prev, c, k);
+ if (fsck_err_on(d.v->d_type != inode_d_type(target), c,
+ "incorrect d_type: got %s, should be %s:\n%s",
+ bch2_d_type_str(d.v->d_type),
+ bch2_d_type_str(inode_d_type(target)),
+ (bch2_bkey_val_to_text(&PBUF(buf), c, d.s_c), buf))) {
+ n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ bkey_reassemble(&n->k_i, d.s_c);
+ n->v.d_type = inode_d_type(target);
+
+ ret = bch2_trans_update(trans, iter, &n->k_i, 0);
+ if (ret)
+ return ret;
- bch2_btree_iter_advance(iter);
+ d = dirent_i_to_s_c(n);
}
+
+ if (d.v->d_type == DT_SUBVOL &&
+ target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) &&
+ (c->sb.version < bcachefs_metadata_version_subvol_dirent ||
+ fsck_err(c, "dirent has wrong d_parent_subvol field: got %u, should be %u",
+ le32_to_cpu(d.v->d_parent_subvol),
+ target->bi_parent_subvol))) {
+ n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
+
+ bkey_reassemble(&n->k_i, d.s_c);
+ n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
+
+ ret = bch2_trans_update(trans, iter, &n->k_i, 0);
+ if (ret)
+ return ret;
+
+ d = dirent_i_to_s_c(n);
+ }
+err:
fsck_err:
- if (ret == -EINTR)
- goto retry;
- bch2_trans_iter_put(&trans, iter);
- bch2_bkey_buf_exit(&prev, c);
- return bch2_trans_exit(&trans) ?: ret;
+ return ret;
}
static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
struct bch_hash_info *hash_info,
- struct inode_walker *w, unsigned *nr_subdirs)
+ struct inode_walker *dir,
+ struct inode_walker *target,
+ struct snapshots_seen *s)
{
struct bch_fs *c = trans->c;
struct bkey_s_c k;
struct bkey_s_c_dirent d;
- struct bch_inode_unpacked target;
- u32 target_snapshot;
- bool have_target;
- bool backpointer_exists = true;
- u64 d_inum;
+ struct inode_walker_entry *i;
char buf[200];
int ret;
if (ret)
return ret;
- if (w->have_inode &&
- w->cur_inum != k.k->p.inode &&
- fsck_err_on(w->inode.bi_nlink != *nr_subdirs, c,
- "directory %llu with wrong i_nlink: got %u, should be %u",
- w->inode.bi_inum, w->inode.bi_nlink, *nr_subdirs)) {
- w->inode.bi_nlink = *nr_subdirs;
- ret = write_inode(trans, &w->inode, w->snapshot);
- return ret ?: -EINTR;
- }
+ ret = check_key_has_snapshot(trans, iter, k);
+ if (ret)
+ return ret < 0 ? ret : 0;
- ret = __walk_inode(trans, w, k.k->p.inode);
+ ret = snapshots_seen_update(c, s, k.k->p);
if (ret)
return ret;
- if (w->first_this_inode)
- *nr_subdirs = 0;
+ if (k.k->type == KEY_TYPE_whiteout)
+ return 0;
+
+ if (dir->cur_inum != k.k->p.inode) {
+ ret = check_subdir_count(trans, dir);
+ if (ret)
+ return ret;
+ }
+
+ ret = __walk_inode(trans, dir, k.k->p);
+ if (ret < 0)
+ return ret;
- if (fsck_err_on(!w->have_inode, c,
+ if (fsck_err_on(ret == INT_MAX, c,
"dirent in nonexisting directory:\n%s",
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)) ||
- fsck_err_on(!S_ISDIR(w->inode.bi_mode), c,
- "dirent in non directory inode type %u:\n%s",
- mode_to_type(w->inode.bi_mode),
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
- return __bch2_trans_do(trans, NULL, NULL, 0,
- bch2_btree_delete_at(trans, iter, 0));
+ return bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
- if (!w->have_inode)
+ if (ret == INT_MAX)
return 0;
- if (w->first_this_inode)
- *hash_info = bch2_hash_info_init(c, &w->inode);
+ i = dir->d + ret;
+ ret = 0;
+
+ if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c,
+			"dirent in non-directory inode type %s:\n%s",
+ bch2_d_type_str(inode_d_type(&i->inode)),
+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
+ return bch2_btree_delete_at(trans, iter, 0);
+
+ if (dir->first_this_inode)
+ *hash_info = bch2_hash_info_init(c, &dir->d[0].inode);
ret = hash_check_key(trans, bch2_dirent_hash_desc,
hash_info, iter, k);
return 0;
d = bkey_s_c_to_dirent(k);
- d_inum = le64_to_cpu(d.v->d_inum);
- ret = __lookup_inode(trans, d_inum, &target, &target_snapshot);
- if (ret && ret != -ENOENT)
- return ret;
+ if (d.v->d_type == DT_SUBVOL) {
+ struct bch_inode_unpacked subvol_root;
+ u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
+ u32 target_snapshot;
+ u64 target_inum;
- have_target = !ret;
- ret = 0;
+ ret = __subvol_lookup(trans, target_subvol,
+ &target_snapshot, &target_inum);
+ if (ret && ret != -ENOENT)
+ return ret;
- if (fsck_err_on(!have_target, c,
- "dirent points to missing inode:\n%s",
- (bch2_bkey_val_to_text(&PBUF(buf), c,
- k), buf)))
- return remove_dirent(trans, d.k->p);
+ if (fsck_err_on(ret, c,
+				"dirent points to missing subvolume %u",
+				target_subvol))
+ return __remove_dirent(trans, d.k->p);
- if (!have_target)
- return 0;
+ ret = __lookup_inode(trans, target_inum,
+ &subvol_root, &target_snapshot);
+ if (ret && ret != -ENOENT)
+ return ret;
+
+ if (fsck_err_on(ret, c,
+ "subvolume %u points to missing subvolume root %llu",
+ target_subvol,
+ target_inum)) {
+ bch_err(c, "repair not implemented yet");
+ return -EINVAL;
+ }
- if (!target.bi_dir &&
- !target.bi_dir_offset) {
- target.bi_dir = k.k->p.inode;
- target.bi_dir_offset = k.k->p.offset;
+ if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c,
+ "subvol root %llu has wrong bi_subvol field: got %u, should be %u",
+ target_inum,
+ subvol_root.bi_subvol, target_subvol)) {
+ subvol_root.bi_subvol = target_subvol;
+ ret = __write_inode(trans, &subvol_root, target_snapshot);
+ if (ret)
+ return ret;
+ }
- ret = __write_inode(trans, &target, target_snapshot) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW);
+ ret = check_dirent_target(trans, iter, d, &subvol_root,
+ target_snapshot);
if (ret)
return ret;
- return -EINTR;
- }
-
- if (!inode_backpointer_matches(d, &target)) {
- ret = inode_backpointer_exists(trans, &target);
- if (ret < 0)
+ } else {
+ ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
+ if (ret)
return ret;
- backpointer_exists = ret;
- ret = 0;
-
- if (fsck_err_on(S_ISDIR(target.bi_mode) &&
- backpointer_exists, c,
- "directory %llu with multiple links",
- target.bi_inum))
- return remove_dirent(trans, d.k->p);
-
- if (fsck_err_on(backpointer_exists &&
- !target.bi_nlink, c,
- "inode %llu has multiple links but i_nlink 0",
- d_inum)) {
- target.bi_nlink++;
- target.bi_flags &= ~BCH_INODE_UNLINKED;
-
- ret = write_inode(trans, &target, target_snapshot);
- return ret ?: -EINTR;
+ if (fsck_err_on(!target->nr, c,
+ "dirent points to missing inode:\n%s",
+ (bch2_bkey_val_to_text(&PBUF(buf), c,
+ k), buf))) {
+ ret = __remove_dirent(trans, d.k->p);
+ if (ret)
+ return ret;
}
- if (fsck_err_on(!backpointer_exists, c,
- "inode %llu has wrong backpointer:\n"
- "got %llu:%llu\n"
- "should be %llu:%llu",
- d_inum,
- target.bi_dir,
- target.bi_dir_offset,
- k.k->p.inode,
- k.k->p.offset)) {
- target.bi_dir = k.k->p.inode;
- target.bi_dir_offset = k.k->p.offset;
-
- ret = write_inode(trans, &target, target_snapshot);
- return ret ?: -EINTR;
+ for (i = target->d; i < target->d + target->nr; i++) {
+ ret = check_dirent_target(trans, iter, d,
+ &i->inode, i->snapshot);
+ if (ret)
+ return ret;
}
}
- if (fsck_err_on(d.v->d_type != mode_to_type(target.bi_mode), c,
- "incorrect d_type: should be %u:\n%s",
- mode_to_type(target.bi_mode),
- (bch2_bkey_val_to_text(&PBUF(buf), c,
- k), buf))) {
- struct bkey_i_dirent *n;
-
- n = kmalloc(bkey_bytes(d.k), GFP_KERNEL);
- if (!n)
- return -ENOMEM;
-
- bkey_reassemble(&n->k_i, d.s_c);
- n->v.d_type = mode_to_type(target.bi_mode);
-
- ret = __bch2_trans_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
- bch2_btree_iter_traverse(iter) ?:
- bch2_trans_update(trans, iter, &n->k_i, 0));
- kfree(n);
- return ret ?: -EINTR;
- }
+ if (d.v->d_type == DT_DIR)
+ for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
+ i->count++;
- *nr_subdirs += d.v->d_type == DT_DIR;
- return 0;
fsck_err:
return ret;
}
noinline_for_stack
static int check_dirents(struct bch_fs *c)
{
- struct inode_walker w = inode_walker_init();
+ struct inode_walker dir = inode_walker_init();
+ struct inode_walker target = inode_walker_init();
+ struct snapshots_seen s;
struct bch_hash_info hash_info;
struct btree_trans trans;
- struct btree_iter *iter;
- unsigned nr_subdirs = 0;
+ struct btree_iter iter;
int ret = 0;
bch_verbose(c, "checking dirents");
+ snapshots_seen_init(&s);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_dirents,
- POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH);
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_dirents,
+ POS(BCACHEFS_ROOT_INO, 0),
+ BTREE_ITER_INTENT|
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS);
do {
- ret = lockrestart_do(&trans,
- check_dirent(&trans, iter, &hash_info, &w, &nr_subdirs));
+ ret = __bch2_trans_do(&trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL,
+ check_dirent(&trans, &iter, &hash_info,
+ &dir, &target, &s));
if (ret)
break;
- } while (bch2_btree_iter_advance(iter));
- bch2_trans_iter_put(&trans, iter);
+ } while (bch2_btree_iter_advance(&iter));
+ bch2_trans_iter_exit(&trans, &iter);
+
+ bch2_trans_exit(&trans);
+ snapshots_seen_exit(&s);
+ inode_walker_exit(&dir);
+ inode_walker_exit(&target);
+ return ret;
+}
+
+static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
+ struct bch_hash_info *hash_info,
+ struct inode_walker *inode)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k;
+ int ret;
+
+ k = bch2_btree_iter_peek(iter);
+ if (!k.k)
+ return 0;
+
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ ret = check_key_has_snapshot(trans, iter, k);
+ if (ret)
+		return ret < 0 ? ret : 0;
+
+ ret = __walk_inode(trans, inode, k.k->p);
+ if (ret < 0)
+ return ret;
+
+ if (fsck_err_on(ret == INT_MAX, c,
+ "xattr for missing inode %llu",
+ k.k->p.inode))
+ return bch2_btree_delete_at(trans, iter, 0);
+
+ if (ret == INT_MAX)
+ return 0;
+
+ ret = 0;
+
+ if (inode->first_this_inode)
+ *hash_info = bch2_hash_info_init(c, &inode->d[0].inode);
- return bch2_trans_exit(&trans) ?: ret;
+ ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
+fsck_err:
+ return ret;
}
/*
noinline_for_stack
static int check_xattrs(struct bch_fs *c)
{
- struct inode_walker w = inode_walker_init();
+ struct inode_walker inode = inode_walker_init();
struct bch_hash_info hash_info;
struct btree_trans trans;
- struct btree_iter *iter;
- struct bkey_s_c k;
+ struct btree_iter iter;
int ret = 0;
bch_verbose(c, "checking xattrs");
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs,
- POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH);
-retry:
- while ((k = bch2_btree_iter_peek(iter)).k &&
- !(ret = bkey_err(k))) {
- ret = walk_inode(&trans, &w, k.k->p.inode);
- if (ret)
- break;
-
- if (fsck_err_on(!w.have_inode, c,
- "xattr for missing inode %llu",
- k.k->p.inode)) {
- ret = bch2_btree_delete_at(&trans, iter, 0);
- if (ret)
- break;
- continue;
- }
-
- if (w.first_this_inode && w.have_inode)
- hash_info = bch2_hash_info_init(c, &w.inode);
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
+ POS(BCACHEFS_ROOT_INO, 0),
+ BTREE_ITER_INTENT|
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS);
- ret = hash_check_key(&trans, bch2_xattr_hash_desc,
- &hash_info, iter, k);
+ do {
+ ret = __bch2_trans_do(&trans, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL,
+ check_xattr(&trans, &iter, &hash_info,
+ &inode));
if (ret)
break;
+ } while (bch2_btree_iter_advance(&iter));
+ bch2_trans_iter_exit(&trans, &iter);
- bch2_btree_iter_advance(iter);
- }
-fsck_err:
- if (ret == -EINTR)
- goto retry;
-
- bch2_trans_iter_put(&trans, iter);
- return bch2_trans_exit(&trans) ?: ret;
+ bch2_trans_exit(&trans);
+ return ret;
}
-/* Get root directory, create if it doesn't exist: */
-static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode)
+static int check_root_trans(struct btree_trans *trans)
{
- struct bkey_inode_buf packed;
+ struct bch_fs *c = trans->c;
+ struct bch_inode_unpacked root_inode;
u32 snapshot;
+ u64 inum;
int ret;
- bch_verbose(c, "checking root directory");
-
- ret = bch2_trans_do(c, NULL, NULL, 0,
- lookup_inode(&trans, BCACHEFS_ROOT_INO, root_inode, &snapshot));
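+	/*
+	 * Two repairs, in order: recreate the root subvolume key if it's
+	 * missing, then recreate the root inode if it's missing or isn't a
+	 * directory:
+	 */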
+ ret = __subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
if (ret && ret != -ENOENT)
return ret;
- if (fsck_err_on(ret, c, "root directory missing"))
- goto create_root;
+ if (mustfix_fsck_err_on(ret, c, "root subvol missing")) {
+ struct bkey_i_subvolume root_subvol;
- if (fsck_err_on(!S_ISDIR(root_inode->bi_mode), c,
- "root inode not a directory"))
- goto create_root;
+ snapshot = U32_MAX;
+ inum = BCACHEFS_ROOT_INO;
- return 0;
+ bkey_subvolume_init(&root_subvol.k_i);
+ root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL;
+ root_subvol.v.flags = 0;
+ root_subvol.v.snapshot = cpu_to_le32(snapshot);
+ root_subvol.v.inode = cpu_to_le64(inum);
+ ret = __bch2_trans_do(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ __bch2_btree_insert(trans, BTREE_ID_subvolumes, &root_subvol.k_i));
+ if (ret) {
+ bch_err(c, "error writing root subvol: %i", ret);
+ goto err;
+ }
+ }
+
+ ret = __lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
+ if (ret && ret != -ENOENT)
+ return ret;
+
+ if (mustfix_fsck_err_on(ret, c, "root directory missing") ||
+ mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), c,
+ "root inode not a directory")) {
+ bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
+ 0, NULL);
+ root_inode.bi_inum = inum;
+
+ ret = __write_inode(trans, &root_inode, snapshot);
+ if (ret)
+ bch_err(c, "error writing root inode: %i", ret);
+ }
+err:
fsck_err:
return ret;
-create_root:
- bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|0755,
- 0, NULL);
- root_inode->bi_inum = BCACHEFS_ROOT_INO;
+}
- bch2_inode_pack(c, &packed, root_inode);
+/* Get root directory, create if it doesn't exist: */
+noinline_for_stack
+static int check_root(struct bch_fs *c)
+{
+ bch_verbose(c, "checking root directory");
- return bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i,
- NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW);
+ return bch2_trans_do(c, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ check_root_trans(&trans));
}
struct pathbuf {
struct pathbuf_entry {
u64 inum;
+ u32 snapshot;
} *entries;
};
-static int path_down(struct pathbuf *p, u64 inum)
+static bool path_is_dup(struct pathbuf *p, u64 inum, u32 snapshot)
+{
+ struct pathbuf_entry *i;
+
+ for (i = p->entries; i < p->entries + p->nr; i++)
+ if (i->inum == inum &&
+ i->snapshot == snapshot)
+ return true;
+
+ return false;
+}
+
+static int path_down(struct bch_fs *c, struct pathbuf *p,
+ u64 inum, u32 snapshot)
{
if (p->nr == p->size) {
size_t new_size = max_t(size_t, 256UL, p->size * 2);
new_size * sizeof(p->entries[0]),
GFP_KERNEL);
if (!n) {
+ bch_err(c, "fsck: error allocating memory for pathbuf, size %zu",
+ new_size);
return -ENOMEM;
}
};
p->entries[p->nr++] = (struct pathbuf_entry) {
- .inum = inum,
+ .inum = inum,
+ .snapshot = snapshot,
};
return 0;
}
+/*
+ * Check that a given inode is reachable from the root:
+ *
+ * XXX: we should also be verifying that inodes are in the right subvolumes
+ */
static int check_path(struct btree_trans *trans,
struct pathbuf *p,
- struct bch_inode_unpacked *inode)
+ struct bch_inode_unpacked *inode,
+ u32 snapshot)
{
struct bch_fs *c = trans->c;
- u32 snapshot;
- size_t i;
int ret = 0;
+ snapshot = snapshot_t(c, snapshot)->equiv;
p->nr = 0;
- while (inode->bi_inum != BCACHEFS_ROOT_INO) {
+ while (!(inode->bi_inum == BCACHEFS_ROOT_INO &&
+ inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) {
+ struct btree_iter dirent_iter;
+ struct bkey_s_c_dirent d;
+ u32 parent_snapshot = snapshot;
+
+ if (inode->bi_subvol) {
+ u64 inum;
+
+ ret = subvol_lookup(trans, inode->bi_parent_subvol,
+ &parent_snapshot, &inum);
+ if (ret)
+ break;
+ }
+
ret = lockrestart_do(trans,
- inode_backpointer_exists(trans, inode));
- if (ret < 0)
+ PTR_ERR_OR_ZERO((d = dirent_get_by_pos(trans, &dirent_iter,
+ SPOS(inode->bi_dir, inode->bi_dir_offset,
+ parent_snapshot))).k));
+ if (ret && ret != -ENOENT)
break;
- if (!ret) {
- if (fsck_err(c, "unreachable inode %llu, type %u nlink %u backptr %llu:%llu",
- inode->bi_inum,
- mode_to_type(inode->bi_mode),
+ if (!ret && !dirent_points_to_inode(d, inode)) {
+ bch2_trans_iter_exit(trans, &dirent_iter);
+ ret = -ENOENT;
+ }
+
+ if (ret == -ENOENT) {
+ if (fsck_err(c, "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu",
+ inode->bi_inum, snapshot,
+ bch2_d_type_str(inode_d_type(inode)),
inode->bi_nlink,
inode->bi_dir,
inode->bi_dir_offset))
- ret = reattach_inode(trans, inode);
+ ret = reattach_inode(trans, inode, snapshot);
break;
}
- ret = 0;
+
+ bch2_trans_iter_exit(trans, &dirent_iter);
if (!S_ISDIR(inode->bi_mode))
break;
- ret = path_down(p, inode->bi_inum);
+ ret = path_down(c, p, inode->bi_inum, snapshot);
if (ret) {
bch_err(c, "memory allocation failure");
return ret;
}
- for (i = 0; i < p->nr; i++) {
- if (inode->bi_dir != p->entries[i].inum)
- continue;
+ snapshot = parent_snapshot;
+
+ ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot);
+ if (ret) {
+ /* Should have been caught in dirents pass */
+ bch_err(c, "error looking up parent directory: %i", ret);
+ break;
+ }
+
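+		/*
+		 * A repeated (inum, snapshot) pair on the path we've walked
+		 * means the directory structure contains a loop; report the
+		 * cycle, then break it by deleting this inode's backpointer
+		 * and reattaching it:
+		 */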
+ if (path_is_dup(p, inode->bi_inum, snapshot)) {
+ struct pathbuf_entry *i;
/* XXX print path */
+ bch_err(c, "directory structure loop");
+
+ for (i = p->entries; i < p->entries + p->nr; i++)
+ pr_err("%llu:%u", i->inum, i->snapshot);
+ pr_err("%llu:%u", inode->bi_inum, snapshot);
+
if (!fsck_err(c, "directory structure loop"))
return 0;
- ret = lockrestart_do(trans,
- remove_backpointer(trans, inode));
+ ret = __bch2_trans_do(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ remove_backpointer(trans, inode));
if (ret) {
bch_err(c, "error removing dirent: %i", ret);
break;
}
- ret = reattach_inode(trans, inode);
- break;
- }
-
- ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot);
- if (ret) {
- /* Should have been caught in dirents pass */
- bch_err(c, "error looking up parent directory: %i", ret);
- break;
+ ret = reattach_inode(trans, inode, snapshot);
}
}
fsck_err:
* After check_dirents(), if an inode backpointer doesn't exist that means it's
* unreachable:
*/
+noinline_for_stack
static int check_directory_structure(struct bch_fs *c)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
struct bch_inode_unpacked u;
struct pathbuf path = { 0, 0, NULL };
for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH, k, ret) {
- if (k.k->type != KEY_TYPE_inode)
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ if (!bkey_is_inode(k.k))
continue;
- ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u);
+ ret = bch2_inode_unpack(k, &u);
if (ret) {
/* Should have been caught earlier in fsck: */
bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret);
break;
}
- ret = check_path(&trans, &path, &u);
+ if (u.bi_flags & BCH_INODE_UNLINKED)
+ continue;
+
+ ret = check_path(&trans, &path, &u, iter.pos.snapshot);
if (ret)
break;
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
BUG_ON(ret == -EINTR);
kfree(path.entries);
- return bch2_trans_exit(&trans) ?: ret;
+ bch2_trans_exit(&trans);
+ return ret;
}
struct nlink_table {
} *d;
};
-static int add_nlink(struct nlink_table *t, u64 inum, u32 snapshot)
+static int add_nlink(struct bch_fs *c, struct nlink_table *t,
+ u64 inum, u32 snapshot)
{
if (t->nr == t->size) {
size_t new_size = max_t(size_t, 128UL, t->size * 2);
void *d = kvmalloc(new_size * sizeof(t->d[0]), GFP_KERNEL);
if (!d) {
+ bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
+ new_size);
return -ENOMEM;
}
return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot);
}
-static void inc_link(struct bch_fs *c, struct nlink_table *links,
- u64 range_start, u64 range_end, u64 inum)
+static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
+ struct nlink_table *links,
+ u64 range_start, u64 range_end, u64 inum, u32 snapshot)
{
struct nlink *link, key = {
.inum = inum, .snapshot = U32_MAX,
link = __inline_bsearch(&key, links->d, links->nr,
sizeof(links->d[0]), nlink_cmp);
- if (link)
- link->count++;
+ if (!link)
+ return;
+
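+	/*
+	 * The bsearch may land anywhere within a run of entries sharing this
+	 * inum; rewind to the first, then count the link against every entry
+	 * whose snapshot can see this dirent:
+	 */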
+ while (link > links->d && link[0].inum == link[-1].inum)
+ --link;
+
+ for (; link < links->d + links->nr && link->inum == inum; link++)
+ if (ref_visible(c, s, snapshot, link->snapshot)) {
+ link->count++;
+ if (link->snapshot >= snapshot)
+ break;
+ }
}
noinline_for_stack
u64 start, u64 *end)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
- struct bkey_s_c_inode inode;
struct bch_inode_unpacked u;
int ret = 0;
for_each_btree_key(&trans, iter, BTREE_ID_inodes,
POS(0, start),
BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH, k, ret) {
- if (k.k->type != KEY_TYPE_inode)
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ if (!bkey_is_inode(k.k))
continue;
- inode = bkey_s_c_to_inode(k);
+ /* Should never fail, checked by bch2_inode_invalid: */
+ BUG_ON(bch2_inode_unpack(k, &u));
/*
* Backpointer and directory structure checks are sufficient for
* directories, since they can't have hardlinks:
*/
- if (S_ISDIR(le16_to_cpu(inode.v->bi_mode)))
+ if (S_ISDIR(le16_to_cpu(u.bi_mode)))
continue;
- /* Should never fail, checked by bch2_inode_invalid: */
- BUG_ON(bch2_inode_unpack(inode, &u));
-
if (!u.bi_nlink)
continue;
- ret = add_nlink(t, k.k->p.offset, k.k->p.snapshot);
+ ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
if (ret) {
*end = k.k->p.offset;
ret = 0;
}
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
if (ret)
u64 range_start, u64 range_end)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct snapshots_seen s;
+ struct btree_iter iter;
struct bkey_s_c k;
struct bkey_s_c_dirent d;
int ret;
+ snapshots_seen_init(&s);
+
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN,
BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH, k, ret) {
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ ret = snapshots_seen_update(c, &s, k.k->p);
+ if (ret)
+ break;
+
switch (k.k->type) {
case KEY_TYPE_dirent:
d = bkey_s_c_to_dirent(k);
- if (d.v->d_type != DT_DIR)
- inc_link(c, links, range_start, range_end,
- le64_to_cpu(d.v->d_inum));
+ if (d.v->d_type != DT_DIR &&
+ d.v->d_type != DT_SUBVOL)
+ inc_link(c, &s, links, range_start, range_end,
+ le64_to_cpu(d.v->d_inum),
+ d.k->p.snapshot);
break;
}
-
- bch2_trans_cond_resched(&trans);
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
- ret = bch2_trans_exit(&trans) ?: ret;
if (ret)
bch_err(c, "error in fsck: btree error %i while walking dirents", ret);
+ bch2_trans_exit(&trans);
+ snapshots_seen_exit(&s);
return ret;
}
u64 range_start, u64 range_end)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
- struct bkey_s_c_inode inode;
struct bch_inode_unpacked u;
struct nlink *link = links->d;
int ret = 0;
for_each_btree_key(&trans, iter, BTREE_ID_inodes,
POS(0, range_start),
BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH, k, ret) {
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
if (k.k->p.offset >= range_end)
break;
- if (k.k->type != KEY_TYPE_inode)
+ if (!bkey_is_inode(k.k))
continue;
- inode = bkey_s_c_to_inode(k);
- if (S_ISDIR(le16_to_cpu(inode.v->bi_mode)))
- continue;
+ BUG_ON(bch2_inode_unpack(k, &u));
- BUG_ON(bch2_inode_unpack(inode, &u));
+ if (S_ISDIR(le16_to_cpu(u.bi_mode)))
+ continue;
if (!u.bi_nlink)
continue;
- while (link->inum < k.k->p.offset) {
+ while ((cmp_int(link->inum, k.k->p.offset) ?:
+ cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
link++;
BUG_ON(link >= links->d + links->nr);
}
bch2_inode_nlink_get(&u), link->count)) {
bch2_inode_nlink_set(&u, link->count);
- ret = __bch2_trans_do(&trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
- bch2_btree_iter_traverse(iter) ?:
- bch2_inode_write(&trans, iter, &u));
+ ret = write_inode(&trans, &u, k.k->p.snapshot);
if (ret)
bch_err(c, "error in fsck: error %i updating inode", ret);
}
}
fsck_err:
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
if (ret)
return ret;
}
+static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter)
+{
+ struct bkey_s_c k;
+ struct bkey_s_c_reflink_p p;
+ struct bkey_i_reflink_p *u;
+ int ret;
+
+ k = bch2_btree_iter_peek(iter);
+ if (!k.k)
+ return 0;
+
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ if (k.k->type != KEY_TYPE_reflink_p)
+ return 0;
+
+ p = bkey_s_c_to_reflink_p(k);
+
+ if (!p.v->front_pad && !p.v->back_pad)
+ return 0;
+
+ u = bch2_trans_kmalloc(trans, sizeof(*u));
+ ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ return ret;
+
+ bkey_reassemble(&u->k_i, k);
+ u->v.front_pad = 0;
+ u->v.back_pad = 0;
+
+ return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN);
+}
+
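+/*
+ * Clear the front_pad/back_pad fields of any reflink_p keys; skipped entirely
+ * once the superblock version says the fix has already been applied:
+ */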
+noinline_for_stack
+static int fix_reflink_p(struct bch_fs *c)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
+ return 0;
+
+ bch_verbose(c, "fixing reflink_p keys");
+
+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
+
+ for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ if (k.k->type == KEY_TYPE_reflink_p) {
+ ret = __bch2_trans_do(&trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_LAZY_RW,
+ fix_reflink_p_key(&trans, &iter));
+ if (ret)
+ break;
+ }
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+
+ bch2_trans_exit(&trans);
+ return ret;
+}
+
/*
* Checks for inconsistencies that shouldn't happen, unless we have a bug.
* Doesn't fix them yet, mainly because they haven't yet been observed:
*/
int bch2_fsck_full(struct bch_fs *c)
{
- struct bch_inode_unpacked root_inode;
-
- return check_inodes(c, true) ?:
+ return bch2_fs_snapshots_check(c) ?:
+ check_inodes(c, true) ?:
+ check_subvols(c) ?:
check_extents(c) ?:
check_dirents(c) ?:
check_xattrs(c) ?:
- check_root(c, &root_inode) ?:
+ check_root(c) ?:
check_directory_structure(c) ?:
- check_nlinks(c);
+ check_nlinks(c) ?:
+ fix_reflink_p(c);
}
int bch2_fsck_walk_inodes_only(struct bch_fs *c)
#include "btree_key_cache.h"
#include "bkey_methods.h"
#include "btree_update.h"
+#include "buckets.h"
#include "error.h"
#include "extents.h"
+#include "extent_update.h"
#include "inode.h"
#include "str_hash.h"
+#include "subvolume.h"
#include "varint.h"
#include <linux/random.h>
};
static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
-static const u8 bits_table[8] = {
- 1 * 8 - 1,
- 2 * 8 - 2,
- 3 * 8 - 3,
- 4 * 8 - 4,
- 6 * 8 - 5,
- 8 * 8 - 6,
- 10 * 8 - 7,
- 13 * 8 - 8,
-};
-
-static int inode_encode_field(u8 *out, u8 *end, u64 hi, u64 lo)
-{
- __be64 in[2] = { cpu_to_be64(hi), cpu_to_be64(lo), };
- unsigned shift, bytes, bits = likely(!hi)
- ? fls64(lo)
- : fls64(hi) + 64;
-
- for (shift = 1; shift <= 8; shift++)
- if (bits < bits_table[shift - 1])
- goto got_shift;
-
- BUG();
-got_shift:
- bytes = byte_table[shift - 1];
-
- BUG_ON(out + bytes > end);
-
- memcpy(out, (u8 *) in + 16 - bytes, bytes);
- *out |= (1 << 8) >> shift;
-
- return bytes;
-}
static int inode_decode_field(const u8 *in, const u8 *end,
u64 out[2], unsigned *out_bits)
return bytes;
}
-static noinline void bch2_inode_pack_v1(struct bkey_inode_buf *packed,
- const struct bch_inode_unpacked *inode)
-{
- struct bkey_i_inode *k = &packed->inode;
- u8 *out = k->v.fields;
- u8 *end = (void *) &packed[1];
- u8 *last_nonzero_field = out;
- unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
- unsigned bytes;
-
-#define x(_name, _bits) \
- out += inode_encode_field(out, end, 0, inode->_name); \
- nr_fields++; \
- \
- if (inode->_name) { \
- last_nonzero_field = out; \
- last_nonzero_fieldnr = nr_fields; \
- }
-
- BCH_INODE_FIELDS()
-#undef x
-
- out = last_nonzero_field;
- nr_fields = last_nonzero_fieldnr;
-
- bytes = out - (u8 *) &packed->inode.v;
- set_bkey_val_bytes(&packed->inode.k, bytes);
- memset_u64s_tail(&packed->inode.v, 0, bytes);
-
- SET_INODE_NR_FIELDS(&k->v, nr_fields);
-}
-
-static void bch2_inode_pack_v2(struct bkey_inode_buf *packed,
- const struct bch_inode_unpacked *inode)
+void bch2_inode_pack(struct bch_fs *c,
+ struct bkey_inode_buf *packed,
+ const struct bch_inode_unpacked *inode)
{
- struct bkey_i_inode *k = &packed->inode;
+ struct bkey_i_inode_v2 *k = &packed->inode;
u8 *out = k->v.fields;
u8 *end = (void *) &packed[1];
u8 *last_nonzero_field = out;
unsigned bytes;
int ret;
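+
+	/*
+	 * Inodes are now always packed as v2 keys; v1 inodes are still
+	 * accepted on read (see bch2_inode_unpack()) but no longer written:
+	 */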
+ bkey_inode_v2_init(&packed->inode.k_i);
+ packed->inode.k.p.offset = inode->bi_inum;
+ packed->inode.v.bi_journal_seq = cpu_to_le64(inode->bi_journal_seq);
+ packed->inode.v.bi_hash_seed = inode->bi_hash_seed;
+	packed->inode.v.bi_flags	= cpu_to_le64(inode->bi_flags);
+ packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode);
+
#define x(_name, _bits) \
nr_fields++; \
\
set_bkey_val_bytes(&packed->inode.k, bytes);
memset_u64s_tail(&packed->inode.v, 0, bytes);
- SET_INODE_NR_FIELDS(&k->v, nr_fields);
-}
-
-void bch2_inode_pack(struct bch_fs *c,
- struct bkey_inode_buf *packed,
- const struct bch_inode_unpacked *inode)
-{
- bkey_inode_init(&packed->inode.k_i);
- packed->inode.k.p.offset = inode->bi_inum;
- packed->inode.v.bi_hash_seed = inode->bi_hash_seed;
- packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags);
- packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode);
-
- if (c->sb.features & (1ULL << BCH_FEATURE_new_varint)) {
- SET_INODE_NEW_VARINT(&packed->inode.v, true);
- bch2_inode_pack_v2(packed, inode);
- } else {
- bch2_inode_pack_v1(packed, inode);
- }
+ SET_INODEv2_NR_FIELDS(&k->v, nr_fields);
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
struct bch_inode_unpacked unpacked;
- int ret = bch2_inode_unpack(inode_i_to_s_c(&packed->inode),
+ int ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i),
&unpacked);
BUG_ON(ret);
BUG_ON(unpacked.bi_inum != inode->bi_inum);
return 0;
}
-static int bch2_inode_unpack_v2(struct bkey_s_c_inode inode,
- struct bch_inode_unpacked *unpacked)
+static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked,
+ const u8 *in, const u8 *end,
+ unsigned nr_fields)
{
- const u8 *in = inode.v->fields;
- const u8 *end = bkey_val_end(inode);
unsigned fieldnr = 0;
int ret;
u64 v[2];
#define x(_name, _bits) \
- if (fieldnr < INODE_NR_FIELDS(inode.v)) { \
+ if (fieldnr < nr_fields) { \
ret = bch2_varint_decode_fast(in, end, &v[0]); \
if (ret < 0) \
return ret; \
return 0;
}
-int bch2_inode_unpack(struct bkey_s_c_inode inode,
+int bch2_inode_unpack(struct bkey_s_c k,
struct bch_inode_unpacked *unpacked)
{
- unpacked->bi_inum = inode.k->p.offset;
- unpacked->bi_hash_seed = inode.v->bi_hash_seed;
- unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags);
- unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode);
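+	/*
+	 * v1 inodes have 32 bit flags and no journal_seq field; v2 inodes
+	 * carry 64 bit flags and bi_journal_seq:
+	 */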
+ switch (k.k->type) {
+ case KEY_TYPE_inode: {
+ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
- if (INODE_NEW_VARINT(inode.v)) {
- return bch2_inode_unpack_v2(inode, unpacked);
- } else {
- return bch2_inode_unpack_v1(inode, unpacked);
+ unpacked->bi_inum = inode.k->p.offset;
+		unpacked->bi_journal_seq = 0;
+ unpacked->bi_hash_seed = inode.v->bi_hash_seed;
+ unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags);
+ unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode);
+
+ if (INODE_NEW_VARINT(inode.v)) {
+ return bch2_inode_unpack_v2(unpacked, inode.v->fields,
+ bkey_val_end(inode),
+ INODE_NR_FIELDS(inode.v));
+ } else {
+ return bch2_inode_unpack_v1(inode, unpacked);
+ }
+ break;
+ }
+ case KEY_TYPE_inode_v2: {
+ struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
+
+ unpacked->bi_inum = inode.k->p.offset;
+		unpacked->bi_journal_seq = le64_to_cpu(inode.v->bi_journal_seq);
+ unpacked->bi_hash_seed = inode.v->bi_hash_seed;
+ unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags);
+ unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode);
+
+ return bch2_inode_unpack_v2(unpacked, inode.v->fields,
+ bkey_val_end(inode),
+ INODEv2_NR_FIELDS(inode.v));
+ }
+ default:
+ BUG();
}
-
- return 0;
}
-struct btree_iter *bch2_inode_peek(struct btree_trans *trans,
- struct bch_inode_unpacked *inode,
- u64 inum, unsigned flags)
+int bch2_inode_peek(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode,
+ subvol_inum inum, unsigned flags)
{
- struct btree_iter *iter;
struct bkey_s_c k;
+ u32 snapshot;
int ret;
- if (trans->c->opts.inodes_use_key_cache)
- flags |= BTREE_ITER_CACHED;
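+	/*
+	 * Inodes are now addressed by (subvolume, inode number): resolve the
+	 * subvolume to a snapshot ID, then look the inode up at that
+	 * snapshot:
+	 */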
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ return ret;
- iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, inum), flags);
+ bch2_trans_iter_init(trans, iter, BTREE_ID_inodes,
+ SPOS(0, inum.inum, snapshot),
+ flags|BTREE_ITER_CACHED);
k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
if (ret)
goto err;
- ret = k.k->type == KEY_TYPE_inode ? 0 : -ENOENT;
+ ret = bkey_is_inode(k.k) ? 0 : -ENOENT;
if (ret)
goto err;
- ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode);
+ ret = bch2_inode_unpack(k, inode);
if (ret)
goto err;
- return iter;
+ return 0;
err:
- bch2_trans_iter_put(trans, iter);
- return ERR_PTR(ret);
+ bch2_trans_iter_exit(trans, iter);
+ return ret;
}
int bch2_inode_write(struct btree_trans *trans,
const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
- struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
- struct bch_inode_unpacked unpacked;
+ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
+ struct bch_inode_unpacked unpacked;
if (k.k->p.inode)
return "nonzero k.p.inode";
if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR)
return "invalid str hash type";
- if (bch2_inode_unpack(inode, &unpacked))
+ if (bch2_inode_unpack(k, &unpacked))
return "invalid variable length fields";
if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1)
unpacked.bi_nlink != 0)
return "flagged as unlinked but bi_nlink != 0";
+ if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode))
+ return "subvolume root but not a directory";
+
+ return NULL;
+}
+
+const char *bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
+ struct bch_inode_unpacked unpacked;
+
+ if (k.k->p.inode)
+ return "nonzero k.p.inode";
+
+ if (bkey_val_bytes(k.k) < sizeof(struct bch_inode))
+ return "incorrect value size";
+
+ if (k.k->p.offset < BLOCKDEV_INODE_MAX)
+ return "fs inode in blockdev range";
+
+ if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR)
+ return "invalid str hash type";
+
+ if (bch2_inode_unpack(k, &unpacked))
+ return "invalid variable length fields";
+
+ if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1)
+ return "invalid data checksum type";
+
+ if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1)
+		return "invalid data compression type";
+
+ if ((unpacked.bi_flags & BCH_INODE_UNLINKED) &&
+ unpacked.bi_nlink != 0)
+ return "flagged as unlinked but bi_nlink != 0";
+
+ if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode))
+ return "subvolume root but not a directory";
+
return NULL;
}
static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
{
- pr_buf(out, "mode %o flags %x ", inode->bi_mode, inode->bi_flags);
+ pr_buf(out, "mode %o flags %x journal_seq %llu",
+ inode->bi_mode, inode->bi_flags,
+ inode->bi_journal_seq);
#define x(_name, _bits) \
- pr_buf(out, #_name " %llu ", (u64) inode->_name);
+ pr_buf(out, " "#_name " %llu", (u64) inode->_name);
BCH_INODE_FIELDS()
#undef x
}
void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
- struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
- struct bch_inode_unpacked unpacked;
+ struct bch_inode_unpacked inode;
- if (bch2_inode_unpack(inode, &unpacked)) {
+ if (bch2_inode_unpack(k, &inode)) {
pr_buf(out, "(unpack error)");
return;
}
- __bch2_inode_unpacked_to_text(out, &unpacked);
+ __bch2_inode_unpacked_to_text(out, &inode);
}
const char *bch2_inode_generation_invalid(const struct bch_fs *c,
{
switch (k.k->type) {
case KEY_TYPE_inode:
+ case KEY_TYPE_inode_v2:
BUG();
case KEY_TYPE_inode_generation:
return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation);
}
}
-struct btree_iter *bch2_inode_create(struct btree_trans *trans,
- struct bch_inode_unpacked *inode_u,
- u32 snapshot, u64 cpu)
+/*
+ * This just finds an empty slot:
+ */
+int bch2_inode_create(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode_u,
+ u32 snapshot, u64 cpu)
{
struct bch_fs *c = trans->c;
- struct btree_iter *iter = NULL;
struct bkey_s_c k;
u64 min, max, start, pos, *hint;
int ret = 0;
start = min;
pos = start;
- iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, pos),
- BTREE_ITER_ALL_SNAPSHOTS|
- BTREE_ITER_INTENT);
+ bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos),
+ BTREE_ITER_ALL_SNAPSHOTS|
+ BTREE_ITER_INTENT);
again:
while ((k = bch2_btree_iter_peek(iter)).k &&
!(ret = bkey_err(k)) &&
}
if (k.k->p.snapshot == snapshot &&
- k.k->type != KEY_TYPE_inode &&
+ !bkey_is_inode(k.k) &&
!bch2_btree_key_cache_find(c, BTREE_ID_inodes, SPOS(0, pos, snapshot))) {
bch2_btree_iter_advance(iter);
continue;
ret = -ENOSPC;
if (ret) {
- bch2_trans_iter_put(trans, iter);
- return ERR_PTR(ret);
+ bch2_trans_iter_exit(trans, iter);
+ return ret;
}
/* Retry from start */
k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
if (ret) {
- bch2_trans_iter_put(trans, iter);
- return ERR_PTR(ret);
+ bch2_trans_iter_exit(trans, iter);
+ return ret;
}
/* We may have raced while the iterator wasn't pointing at pos: */
- if (k.k->type == KEY_TYPE_inode ||
+ if (bkey_is_inode(k.k) ||
bch2_btree_key_cache_find(c, BTREE_ID_inodes, k.k->p))
goto again;
*hint = k.k->p.offset;
inode_u->bi_inum = k.k->p.offset;
inode_u->bi_generation = bkey_generation(k);
- return iter;
+ return 0;
}
-int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached)
+static int bch2_inode_delete_keys(struct btree_trans *trans,
+ subvol_inum inum, enum btree_id id)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i delete;
+ u32 snapshot;
+ int ret = 0;
+
+ /*
+ * We're never going to be deleting extents, no need to use an extent
+ * iterator:
+ */
+ bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0),
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_INTENT);
+
+ while (1) {
+ bch2_trans_begin(trans);
+
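+		/*
+		 * Re-look up the snapshot ID each iteration: after
+		 * bch2_trans_begin() any state from the previous transaction
+		 * is stale:
+		 */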
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ bch2_btree_iter_set_snapshot(&iter, snapshot);
+
+ k = bch2_btree_iter_peek(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (!k.k || iter.pos.inode != inum.inum)
+ break;
+
+ bkey_init(&delete.k);
+ delete.k.p = iter.pos;
+
+ ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL);
+err:
+ if (ret && ret != -EINTR)
+ break;
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
{
struct btree_trans trans;
- struct btree_iter *iter = NULL;
+ struct btree_iter iter = { NULL };
struct bkey_i_inode_generation delete;
- struct bpos start = POS(inode_nr, 0);
- struct bpos end = POS(inode_nr + 1, 0);
struct bch_inode_unpacked inode_u;
struct bkey_s_c k;
- unsigned iter_flags = BTREE_ITER_INTENT;
+ u32 snapshot;
int ret;
- if (cached && c->opts.inodes_use_key_cache)
- iter_flags |= BTREE_ITER_CACHED;
-
bch2_trans_init(&trans, c, 0, 1024);
/*
	 * XXX: the dirent code would ideally delete whiteouts when they're no
* longer needed
*/
- ret = bch2_btree_delete_range_trans(&trans, BTREE_ID_extents,
- start, end, NULL) ?:
- bch2_btree_delete_range_trans(&trans, BTREE_ID_xattrs,
- start, end, NULL) ?:
- bch2_btree_delete_range_trans(&trans, BTREE_ID_dirents,
- start, end, NULL);
+ ret = bch2_inode_delete_keys(&trans, inum, BTREE_ID_extents) ?:
+ bch2_inode_delete_keys(&trans, inum, BTREE_ID_xattrs) ?:
+ bch2_inode_delete_keys(&trans, inum, BTREE_ID_dirents);
if (ret)
goto err;
retry:
bch2_trans_begin(&trans);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes,
- POS(0, inode_nr), iter_flags);
- k = bch2_btree_iter_peek_slot(iter);
+ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes,
+ SPOS(0, inum.inum, snapshot),
+ BTREE_ITER_INTENT|BTREE_ITER_CACHED);
+ k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
goto err;
- if (k.k->type != KEY_TYPE_inode) {
+ if (!bkey_is_inode(k.k)) {
bch2_fs_inconsistent(trans.c,
"inode %llu not found when deleting",
- inode_nr);
+ inum.inum);
ret = -EIO;
goto err;
}
- bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u);
+ bch2_inode_unpack(k, &inode_u);
+
+ /* Subvolume root? */
+ BUG_ON(inode_u.bi_subvol);
bkey_inode_generation_init(&delete.k_i);
- delete.k.p = iter->pos;
+ delete.k.p = iter.pos;
delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
- ret = bch2_trans_update(&trans, iter, &delete.k_i, 0) ?:
+ ret = bch2_trans_update(&trans, &iter, &delete.k_i, 0) ?:
bch2_trans_commit(&trans, NULL, NULL,
BTREE_INSERT_NOFAIL);
err:
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
if (ret == -EINTR)
goto retry;
return ret;
}
-static int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr,
- struct bch_inode_unpacked *inode)
+int bch2_inode_find_by_inum_trans(struct btree_trans *trans,
+ subvol_inum inum,
+ struct bch_inode_unpacked *inode)
{
- struct btree_iter *iter;
+ struct btree_iter iter;
int ret;
- iter = bch2_inode_peek(trans, inode, inode_nr, 0);
- ret = PTR_ERR_OR_ZERO(iter);
- bch2_trans_iter_put(trans, iter);
+ ret = bch2_inode_peek(trans, &iter, inode, inum, 0);
+ if (!ret)
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
-int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr,
+int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
struct bch_inode_unpacked *inode)
{
return bch2_trans_do(c, NULL, NULL, 0,
- bch2_inode_find_by_inum_trans(&trans, inode_nr, inode));
+ bch2_inode_find_by_inum_trans(&trans, inum, inode));
}
extern const char * const bch2_inode_opts[];
const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c);
+const char *bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_inode (struct bkey_ops) { \
.val_to_text = bch2_inode_to_text, \
}
+#define bch2_bkey_ops_inode_v2 (struct bkey_ops) { \
+ .key_invalid = bch2_inode_v2_invalid, \
+ .val_to_text = bch2_inode_to_text, \
+}
+
+static inline bool bkey_is_inode(const struct bkey *k)
+{
+ return k->type == KEY_TYPE_inode ||
+ k->type == KEY_TYPE_inode_v2;
+}
+
const char *bch2_inode_generation_invalid(const struct bch_fs *,
struct bkey_s_c);
void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *,
struct bch_inode_unpacked {
u64 bi_inum;
+ u64 bi_journal_seq;
__le64 bi_hash_seed;
u32 bi_flags;
u16 bi_mode;
};
struct bkey_inode_buf {
- struct bkey_i_inode inode;
+ struct bkey_i_inode_v2 inode;
#define x(_name, _bits) + 8 + _bits / 8
u8 _pad[0 + BCH_INODE_FIELDS()];
void bch2_inode_pack(struct bch_fs *, struct bkey_inode_buf *,
const struct bch_inode_unpacked *);
-int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *);
+int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *);
void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *);
-struct btree_iter *bch2_inode_peek(struct btree_trans *,
- struct bch_inode_unpacked *, u64, unsigned);
+int bch2_inode_peek(struct btree_trans *, struct btree_iter *,
+ struct bch_inode_unpacked *, subvol_inum, unsigned);
int bch2_inode_write(struct btree_trans *, struct btree_iter *,
struct bch_inode_unpacked *);
uid_t, gid_t, umode_t, dev_t,
struct bch_inode_unpacked *);
-struct btree_iter *bch2_inode_create(struct btree_trans *,
- struct bch_inode_unpacked *, u32, u64);
+int bch2_inode_create(struct btree_trans *, struct btree_iter *,
+ struct bch_inode_unpacked *, u32, u64);
-int bch2_inode_rm(struct bch_fs *, u64, bool);
+int bch2_inode_rm(struct bch_fs *, subvol_inum);
-int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *);
+int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum,
+ struct bch_inode_unpacked *);
+int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum,
+ struct bch_inode_unpacked *);
static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode)
{
static inline u8 mode_to_type(umode_t mode)
{
	return (mode >> 12) & 15;
}
+static inline u8 inode_d_type(struct bch_inode_unpacked *inode)
+{
+ return inode->bi_subvol ? DT_SUBVOL : mode_to_type(inode->bi_mode);
+}
+
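
inode_d_type() above works because, on Linux, the dirent d_type constants equal the S_IFMT file-type bits of st_mode shifted down by 12, so mode_to_type() is a pure bit shift. A standalone sketch, not part of the patch; the DT_SUBVOL value and the toy_inode struct are illustrative assumptions:

#include <stdio.h>
#include <sys/stat.h>
#include <dirent.h>

#define DT_SUBVOL 16	/* assumption: bcachefs' private d_type value */

struct toy_inode { unsigned bi_subvol, bi_mode; };

static unsigned mode_to_type(unsigned mode)
{
	return (mode >> 12) & 15;	/* same shift as the helper above */
}

static unsigned toy_inode_d_type(const struct toy_inode *inode)
{
	return inode->bi_subvol ? DT_SUBVOL : mode_to_type(inode->bi_mode);
}

int main(void)
{
	struct toy_inode dir = { 0, S_IFDIR | 0755 };
	struct toy_inode sub = { 7, S_IFDIR | 0755 };

	printf("dir: %u (DT_DIR == %u)\n", toy_inode_d_type(&dir), (unsigned) DT_DIR);
	printf("subvol root: %u\n", toy_inode_d_type(&sub));
	return 0;
}
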
/* i_nlink: */
static inline unsigned nlink_bias(umode_t mode)
#include "keylist.h"
#include "move.h"
#include "rebalance.h"
+#include "subvolume.h"
#include "super.h"
#include "super-io.h"
int bch2_sum_sector_overwrites(struct btree_trans *trans,
struct btree_iter *extent_iter,
struct bkey_i *new,
- bool *maybe_extending,
bool *usage_increasing,
s64 *i_sectors_delta,
s64 *disk_sectors_delta)
{
struct bch_fs *c = trans->c;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c old;
unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new));
bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new));
int ret = 0;
- *maybe_extending = true;
*usage_increasing = false;
*i_sectors_delta = 0;
*disk_sectors_delta = 0;
- iter = bch2_trans_copy_iter(trans, extent_iter);
+ bch2_trans_copy_iter(&iter, extent_iter);
- for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) {
+ for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, old, ret) {
s64 sectors = min(new->k.p.offset, old.k->p.offset) -
max(bkey_start_offset(&new->k),
bkey_start_offset(old.k));
: 0;
if (!*usage_increasing &&
- (new_replicas > bch2_bkey_replicas(c, old) ||
+ (new->k.p.snapshot != old.k->p.snapshot ||
+ new_replicas > bch2_bkey_replicas(c, old) ||
(!new_compressed && bch2_bkey_sectors_compressed(old))))
*usage_increasing = true;
- if (bkey_cmp(old.k->p, new->k.p) >= 0) {
- /*
- * Check if there's already data above where we're
- * going to be writing to - this means we're definitely
- * not extending the file:
- *
- * Note that it's not sufficient to check if there's
- * data up to the sector offset we're going to be
- * writing to, because i_size could be up to one block
- * less:
- */
- if (!bkey_cmp(old.k->p, new->k.p)) {
- old = bch2_btree_iter_next(iter);
- ret = bkey_err(old);
- if (ret)
- break;
- }
-
- if (old.k && !bkey_err(old) &&
- old.k->p.inode == extent_iter->pos.inode &&
- bkey_extent_is_data(old.k))
- *maybe_extending = false;
-
+ if (bkey_cmp(old.k->p, new->k.p) >= 0)
break;
- }
}
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
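
The per-key `sectors` arithmetic in bch2_sum_sector_overwrites() is the standard interval-intersection formula: the overlap of two half-open ranges is min(ends) - max(starts), clamped at zero. A minimal self-check with hypothetical names:

#include <assert.h>

static long long overlap_sectors(long long a_start, long long a_end,
				 long long b_start, long long b_end)
{
	long long n = (a_end < b_end ? a_end : b_end) -
		      (a_start > b_start ? a_start : b_start);

	return n > 0 ? n : 0;	/* disjoint ranges yield a negative n */
}

int main(void)
{
	assert(overlap_sectors(0, 8, 4, 16) == 4);	/* partial overlap */
	assert(overlap_sectors(0, 8, 8, 16) == 0);	/* merely adjacent */
	assert(overlap_sectors(4, 6, 0, 16) == 2);	/* fully contained */
	return 0;
}
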
int bch2_extent_update(struct btree_trans *trans,
+ subvol_inum inum,
struct btree_iter *iter,
struct bkey_i *k,
			struct disk_reservation *disk_res,
			u64 *journal_seq,
			u64 new_i_size,
s64 *i_sectors_delta_total,
bool check_enospc)
{
- /* this must live until after bch2_trans_commit(): */
- struct bkey_inode_buf inode_p;
- bool extending = false, usage_increasing;
+ struct btree_iter inode_iter;
+ struct bch_inode_unpacked inode_u;
+ struct bpos next_pos;
+ bool usage_increasing;
s64 i_sectors_delta = 0, disk_sectors_delta = 0;
int ret;
- ret = bch2_extent_trim_atomic(k, iter);
+ /*
+	 * This traverses the iterator for us without changing iter->path->pos to
+ * search_key() (which is pos + 1 for extents): we want there to be a
+ * path already traversed at iter->pos because
+ * bch2_trans_extent_update() will use it to attempt extent merging
+ */
+ ret = __bch2_btree_iter_traverse(iter);
if (ret)
return ret;
+ ret = bch2_extent_trim_atomic(trans, iter, k);
+ if (ret)
+ return ret;
+
+ new_i_size = min(k->k.p.offset << 9, new_i_size);
+ next_pos = k->k.p;
+
ret = bch2_sum_sector_overwrites(trans, iter, k,
- &extending,
&usage_increasing,
&i_sectors_delta,
&disk_sectors_delta);
if (ret)
return ret;
- if (!usage_increasing)
- check_enospc = false;
-
if (disk_res &&
disk_sectors_delta > (s64) disk_res->sectors) {
ret = bch2_disk_reservation_add(trans->c, disk_res,
disk_sectors_delta - disk_res->sectors,
- !check_enospc
+ !check_enospc || !usage_increasing
? BCH_DISK_RESERVATION_NOFAIL : 0);
if (ret)
return ret;
}
- new_i_size = extending
- ? min(k->k.p.offset << 9, new_i_size)
- : 0;
-
- if (i_sectors_delta || new_i_size) {
- struct btree_iter *inode_iter;
- struct bch_inode_unpacked inode_u;
-
- inode_iter = bch2_inode_peek(trans, &inode_u,
- k->k.p.inode, BTREE_ITER_INTENT);
- ret = PTR_ERR_OR_ZERO(inode_iter);
- if (ret)
- return ret;
-
- /*
- * XXX:
- * writeback can race a bit with truncate, because truncate
- * first updates the inode then truncates the pagecache. This is
- * ugly, but lets us preserve the invariant that the in memory
- * i_size is always >= the on disk i_size.
- *
- BUG_ON(new_i_size > inode_u.bi_size &&
- (inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY));
- */
- BUG_ON(new_i_size > inode_u.bi_size && !extending);
-
- if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
- new_i_size > inode_u.bi_size)
- inode_u.bi_size = new_i_size;
- else
- new_i_size = 0;
-
- inode_u.bi_sectors += i_sectors_delta;
-
- if (i_sectors_delta || new_i_size) {
- bch2_inode_pack(trans->c, &inode_p, &inode_u);
-
- inode_p.inode.k.p.snapshot = iter->snapshot;
-
- ret = bch2_trans_update(trans, inode_iter,
- &inode_p.inode.k_i, 0);
- }
+ ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum,
+ BTREE_ITER_INTENT);
+ if (ret)
+ return ret;
- bch2_trans_iter_put(trans, inode_iter);
+ if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
+ new_i_size > inode_u.bi_size)
+ inode_u.bi_size = new_i_size;
- if (ret)
- return ret;
- }
+ inode_u.bi_sectors += i_sectors_delta;
ret = bch2_trans_update(trans, iter, k, 0) ?:
+ bch2_inode_write(trans, &inode_iter, &inode_u) ?:
bch2_trans_commit(trans, disk_res, journal_seq,
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL);
- BUG_ON(ret == -ENOSPC);
+ bch2_trans_iter_exit(trans, &inode_iter);
+
if (ret)
return ret;
if (i_sectors_delta_total)
*i_sectors_delta_total += i_sectors_delta;
+ bch2_btree_iter_set_pos(iter, next_pos);
+
return 0;
}
+/*
+ * Returns -EINTR if we had to drop locks:
+ */
int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
- struct bpos end, u64 *journal_seq,
+ subvol_inum inum, u64 end,
s64 *i_sectors_delta)
{
struct bch_fs *c = trans->c;
unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
+ struct bpos end_pos = POS(inum.inum, end);
struct bkey_s_c k;
int ret = 0, ret2 = 0;
+ u32 snapshot;
- while ((bch2_trans_begin(trans),
- (k = bch2_btree_iter_peek(iter)).k) &&
- bkey_cmp(iter->pos, end) < 0) {
+ while (!ret || ret == -EINTR) {
struct disk_reservation disk_res =
bch2_disk_reservation_init(c, 0);
struct bkey_i delete;
+ if (ret)
+ ret2 = ret;
+
+ bch2_trans_begin(trans);
+
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ continue;
+
+ bch2_btree_iter_set_snapshot(iter, snapshot);
+
+ k = bch2_btree_iter_peek(iter);
+ if (bkey_cmp(iter->pos, end_pos) >= 0) {
+ bch2_btree_iter_set_pos(iter, end_pos);
+ break;
+ }
+
ret = bkey_err(k);
if (ret)
- goto btree_err;
+ continue;
bkey_init(&delete.k);
delete.k.p = iter->pos;
/* create the biggest key we can */
bch2_key_resize(&delete.k, max_sectors);
- bch2_cut_back(end, &delete);
+ bch2_cut_back(end_pos, &delete);
- ret = bch2_extent_update(trans, iter, &delete,
- &disk_res, journal_seq,
+ ret = bch2_extent_update(trans, inum, iter, &delete,
+ &disk_res, NULL,
0, i_sectors_delta, false);
bch2_disk_reservation_put(c, &disk_res);
-btree_err:
- if (ret == -EINTR) {
- ret2 = ret;
- ret = 0;
- }
- if (ret)
- break;
- }
-
- if (bkey_cmp(iter->pos, end) > 0) {
- bch2_btree_iter_set_pos(iter, end);
- ret = bch2_btree_iter_traverse(iter);
}
return ret ?: ret2;
}
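
Note how bch2_fpunch_at() keeps a second error variable so that a transaction restart (-EINTR) seen on any pass is still reported once the whole range has been punched, matching the comment above the function. A toy sketch of that loop shape; punch_one() is hypothetical and never fails here:

#include <errno.h>

/* hypothetical single step: advances *pos, may return -EINTR after
 * dropping locks; this toy version always succeeds */
static int punch_one(unsigned *pos, unsigned end)
{
	if (*pos < end)
		(*pos)++;
	return 0;
}

static int punch_range(unsigned pos, unsigned end)
{
	int ret = 0, ret2 = 0;

	while (pos < end && (!ret || ret == -EINTR)) {
		if (ret)
			ret2 = ret;	/* remember that a pass restarted */
		ret = punch_one(&pos, end);
	}
	return ret ?: ret2;	/* report -EINTR even after finishing */
}

int main(void)
{
	return punch_range(0, 8);
}
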
-int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end,
- u64 *journal_seq, s64 *i_sectors_delta)
+int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,
+ s64 *i_sectors_delta)
{
struct btree_trans trans;
- struct btree_iter *iter;
- int ret = 0;
+ struct btree_iter iter;
+ int ret;
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
- POS(inum, start),
- BTREE_ITER_INTENT);
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+ POS(inum.inum, start),
+ BTREE_ITER_INTENT);
- ret = bch2_fpunch_at(&trans, iter, POS(inum, end),
- journal_seq, i_sectors_delta);
+ ret = bch2_fpunch_at(&trans, &iter, inum, end, i_sectors_delta);
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
- if (ret == -EINTR)
- ret = 0;
-
- return ret;
+ return ret == -EINTR ? 0 : ret;
}
int bch2_write_index_default(struct bch_write_op *op)
{
struct bch_fs *c = op->c;
struct bkey_buf sk;
+ struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets);
struct keylist *keys = &op->insert_keys;
struct bkey_i *k = bch2_keylist_front(keys);
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
+ subvol_inum inum = {
+ .subvol = op->subvol,
+ .inum = k->k.p.inode,
+ };
int ret;
+ BUG_ON(!inum.subvol);
+
bch2_bkey_buf_init(&sk);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
- bkey_start_pos(&k->k),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-
do {
bch2_trans_begin(&trans);
k = bch2_keylist_front(keys);
+ bch2_bkey_buf_copy(&sk, c, k);
- k->k.p.snapshot = iter->snapshot;
+ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol,
+ &sk.k->k.p.snapshot);
+ if (ret == -EINTR)
+ continue;
+ if (ret)
+ break;
- bch2_bkey_buf_realloc(&sk, c, k->k.u64s);
- bkey_copy(sk.k, k);
- bch2_cut_front(iter->pos, sk.k);
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+ bkey_start_pos(&sk.k->k),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- ret = bch2_extent_update(&trans, iter, sk.k,
+ ret = bch2_extent_update(&trans, inum, &iter, sk.k,
&op->res, op_journal_seq(op),
op->new_i_size, &op->i_sectors_delta,
op->flags & BCH_WRITE_CHECK_ENOSPC);
+ bch2_trans_iter_exit(&trans, &iter);
+
if (ret == -EINTR)
continue;
if (ret)
break;
- if (bkey_cmp(iter->pos, k->k.p) >= 0)
- bch2_keylist_pop_front(keys);
+ if (ec_ob)
+ bch2_ob_add_backpointer(c, ec_ob, &sk.k->k);
+
+ if (bkey_cmp(iter.pos, k->k.p) >= 0)
+ bch2_keylist_pop_front(&op->insert_keys);
+ else
+ bch2_cut_front(iter.pos, k);
} while (!bch2_keylist_empty(keys));
- bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&sk, c);
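
The keylist loop in bch2_write_index_default() may consume only part of the front key in one pass: it pops the key once the iterator position passes its end, otherwise it cuts the consumed prefix off and retries. The same consume-or-cut shape in miniature, with made-up names:

#include <stdio.h>

struct span { unsigned start, end; };

/* hypothetical: handle at most 4 sectors of the front span per pass,
 * returning the new position (the analogue of iter.pos) */
static unsigned process_front(const struct span *s)
{
	unsigned n = s->end - s->start;

	return s->start + (n > 4 ? 4 : n);
}

int main(void)
{
	struct span queue[] = { { 0, 10 }, { 10, 12 } };
	unsigned i = 0, nr = 2;

	while (i < nr) {
		unsigned pos = process_front(&queue[i]);

		if (pos >= queue[i].end)
			i++;			/* pop front */
		else
			queue[i].start = pos;	/* cut front, retry it */
	}
	printf("all spans written\n");
	return 0;
}
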
{
struct bch_fs *c = op->c;
struct bkey_i_extent *e;
- struct open_bucket *ob;
- unsigned i;
- BUG_ON(crc.compressed_size > wp->sectors_free);
- wp->sectors_free -= crc.compressed_size;
op->pos.offset += crc.uncompressed_size;
e = bkey_extent_init(op->insert_keys.top);
crc.nonce)
bch2_extent_crc_append(&e->k_i, crc);
- open_bucket_for_each(c, &wp->ptrs, ob, i) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
- union bch_extent_entry *end =
- bkey_val_end(bkey_i_to_s(&e->k_i));
-
- end->ptr = ob->ptr;
- end->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
- end->ptr.cached = !ca->mi.durability ||
- (op->flags & BCH_WRITE_CACHED) != 0;
- end->ptr.offset += ca->mi.bucket_size - ob->sectors_free;
-
- e->k.u64s++;
-
- BUG_ON(crc.compressed_size > ob->sectors_free);
- ob->sectors_free -= crc.compressed_size;
- }
+ bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, crc.compressed_size,
+ op->flags & BCH_WRITE_CACHED);
bch2_keylist_push(&op->insert_keys);
}
? ((unsigned long) buf & (PAGE_SIZE - 1))
: 0), PAGE_SIZE);
+ pages = min(pages, BIO_MAX_VECS);
+
bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write);
wbio = wbio_init(bio);
wbio->put_bio = true;
*/
bch2_bio_alloc_pages_pool(c, bio,
min_t(unsigned, output_available,
- c->sb.encoded_extent_max << 9));
+ c->opts.encoded_extent_max));
if (bio->bi_iter.bi_size < output_available)
*page_alloc_failed =
struct bio *src = &op->wbio.bio, *dst = src;
struct bvec_iter saved_iter;
void *ec_buf;
- struct bpos ec_pos = op->pos;
unsigned total_output = 0, total_input = 0;
bool bounce = false;
bool page_alloc_failed = false;
ret = -EIO;
goto err;
case PREP_ENCODED_CHECKSUM_ERR:
- BUG();
goto csum_err;
case PREP_ENCODED_DO_WRITE:
/* XXX look for bug here */
size_t dst_len, src_len;
if (page_alloc_failed &&
- bio_sectors(dst) < wp->sectors_free &&
- bio_sectors(dst) < c->sb.encoded_extent_max)
+ dst->bi_iter.bi_size < (wp->sectors_free << 9) &&
+ dst->bi_iter.bi_size < c->opts.encoded_extent_max)
break;
BUG_ON(op->compression_type &&
if (op->csum_type)
dst_len = min_t(unsigned, dst_len,
- c->sb.encoded_extent_max << 9);
+ c->opts.encoded_extent_max);
if (bounce) {
swap(dst->bi_iter.bi_size, dst_len);
dst->bi_iter.bi_size = total_output;
do_write:
- /* might have done a realloc... */
- bch2_ec_add_backpointer(c, wp, ec_pos, total_input >> 9);
-
*_dst = dst;
return more;
csum_err:
*/
wp = bch2_alloc_sectors_start(c,
op->target,
- op->opts.erasure_code,
+ op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
op->write_point,
&op->devs_have,
op->nr_replicas,
bch2_keylist_init(&op->insert_keys, op->inline_keys);
wbio_init(bio)->put_bio = false;
- if (bio_sectors(bio) & (c->opts.block_size - 1)) {
+ if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) {
bch_err_inum_ratelimited(c, op->pos.inode,
"misaligned write");
op->error = -EIO;
}
static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
- struct bvec_iter bvec_iter, u64 inode,
+ struct bvec_iter bvec_iter,
struct bch_io_failures *failed,
unsigned flags)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_buf sk;
struct bkey_s_c k;
int ret;
bch2_bkey_buf_init(&sk);
bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, rbio->data_btree,
- rbio->read_pos, BTREE_ITER_SLOTS);
+ bch2_trans_iter_init(&trans, &iter, rbio->data_btree,
+ rbio->read_pos, BTREE_ITER_SLOTS);
retry:
rbio->bio.bi_status = 0;
- k = bch2_btree_iter_peek_slot(iter);
+ k = bch2_btree_iter_peek_slot(&iter);
if (bkey_err(k))
goto err;
goto err;
out:
bch2_rbio_done(rbio);
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&sk, c);
return;
struct bch_fs *c = rbio->c;
struct bvec_iter iter = rbio->bvec_iter;
unsigned flags = rbio->flags;
- u64 inode = rbio->read_pos.inode;
+ subvol_inum inum = {
+ .subvol = rbio->subvol,
+ .inum = rbio->read_pos.inode,
+ };
struct bch_io_failures failed = { .nr = 0 };
trace_read_retry(&rbio->bio);
flags &= ~BCH_READ_MAY_PROMOTE;
if (flags & BCH_READ_NODECODE) {
- bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags);
+ bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
} else {
flags &= ~BCH_READ_LAST_FRAGMENT;
flags |= BCH_READ_MUST_CLONE;
- __bch2_read(c, rbio, iter, inode, &failed, flags);
+ __bch2_read(c, rbio, iter, inum, &failed, flags);
}
}
struct bch_fs *c = rbio->c;
u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
struct bch_extent_crc_unpacked new_crc;
- struct btree_iter *iter = NULL;
+ struct btree_iter iter;
struct bkey_i *new;
struct bkey_s_c k;
int ret = 0;
if (crc_is_compressed(rbio->pick.crc))
return 0;
- iter = bch2_trans_get_iter(trans, rbio->data_btree, rbio->data_pos,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek_slot(iter);
+ bch2_trans_iter_init(trans, &iter, rbio->data_btree, rbio->data_pos,
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
if ((ret = bkey_err(k)))
goto out;
if (!bch2_bkey_narrow_crcs(new, new_crc))
goto out;
- ret = bch2_trans_update(trans, iter, new, 0);
+ ret = bch2_trans_update(trans, &iter, new,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
out:
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
return;
}
- if (rbio->pick.ptr.cached &&
- (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
- ptr_stale(ca, &rbio->pick.ptr))) {
+ if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
+ ptr_stale(ca, &rbio->pick.ptr)) {
atomic_long_inc(&c->read_realloc_races);
if (rbio->flags & BCH_READ_RETRY_IF_STALE)
unsigned *offset_into_extent,
struct bkey_buf *orig_k)
{
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
u64 reflink_offset;
int ret;
reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
*offset_into_extent;
- iter = bch2_trans_get_iter(trans, BTREE_ID_reflink,
- POS(0, reflink_offset),
- BTREE_ITER_SLOTS);
- k = bch2_btree_iter_peek_slot(iter);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink,
+ POS(0, reflink_offset),
+ BTREE_ITER_SLOTS);
+ k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
goto err;
goto err;
}
- *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k);
+ *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
bch2_bkey_buf_reassemble(orig_k, trans->c, k);
err:
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
+static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
+ struct bkey_s_c k,
+ struct bch_extent_ptr ptr)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev);
+ struct btree_iter iter;
+ char buf[200];
+ int ret;
+
+ bch2_bkey_val_to_text(&PBUF(buf), c, k);
+ bch2_fs_inconsistent(c, "Attempting to read from stale dirty pointer: %s", buf);
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+ POS(ptr.dev, PTR_BUCKET_NR(ca, &ptr)),
+ BTREE_ITER_CACHED);
+
+ ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+ if (ret)
+ return;
+
+ bch2_bkey_val_to_text(&PBUF(buf), c, k);
+ bch_err(c, "%s", buf);
+ bch_err(c, "memory gen: %u", *bucket_gen(ca, iter.pos.offset));
+ bch2_trans_iter_exit(trans, &iter);
+}
+
int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
struct bvec_iter iter, struct bpos read_pos,
enum btree_id data_btree, struct bkey_s_c k,
struct bch_fs *c = trans->c;
struct extent_ptr_decoded pick;
struct bch_read_bio *rbio = NULL;
- struct bch_dev *ca;
+ struct bch_dev *ca = NULL;
struct promote_op *promote = NULL;
bool bounce = false, read_full = false, narrow_crcs = false;
struct bpos data_pos = bkey_start_pos(k.k);
zero_fill_bio_iter(&orig->bio, iter);
goto out_read_done;
}
-
+retry_pick:
pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
/* hole or reservation - just zero fill: */
goto err;
}
- if (pick_ret > 0)
- ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+ ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+
+ if (!pick.ptr.cached &&
+ unlikely(ptr_stale(ca, &pick.ptr))) {
+ read_from_stale_dirty_pointer(trans, k, pick.ptr);
+ bch2_mark_io_failure(failed, &pick);
+ goto retry_pick;
+ }
+
+ /*
+ * Unlock the iterator while the btree node's lock is still in
+ * cache, before doing the IO:
+ */
+ bch2_trans_unlock(trans);
if (flags & BCH_READ_NODECODE) {
/*
EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
if (crc_is_compressed(pick.crc) ||
- (pick.crc.csum_type != BCH_CSUM_NONE &&
+ (pick.crc.csum_type != BCH_CSUM_none &&
(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
(bch2_csum_type_is_encryption(pick.crc.csum_type) &&
(flags & BCH_READ_USER_MAPPED)) ||
/* XXX: only initialize this if needed */
rbio->devs_have = bch2_bkey_devs(k);
rbio->pick = pick;
+ rbio->subvol = orig->subvol;
rbio->read_pos = read_pos;
rbio->data_btree = data_btree;
rbio->data_pos = data_pos;
}
void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
- struct bvec_iter bvec_iter, u64 inode,
+ struct bvec_iter bvec_iter, subvol_inum inum,
struct bch_io_failures *failed, unsigned flags)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_buf sk;
struct bkey_s_c k;
+ u32 snapshot;
int ret;
BUG_ON(flags & BCH_READ_NODECODE);
bch2_bkey_buf_init(&sk);
bch2_trans_init(&trans, c, 0, 0);
-
- iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
- POS(inode, bvec_iter.bi_sector),
- BTREE_ITER_SLOTS);
retry:
bch2_trans_begin(&trans);
+ iter = (struct btree_iter) { NULL };
+ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+ SPOS(inum.inum, bvec_iter.bi_sector, snapshot),
+ BTREE_ITER_SLOTS);
while (1) {
unsigned bytes, sectors, offset_into_extent;
enum btree_id data_btree = BTREE_ID_extents;
break;
}
- bch2_btree_iter_set_pos(iter,
- POS(inode, bvec_iter.bi_sector));
+ bch2_btree_iter_set_pos(&iter,
+ POS(inum.inum, bvec_iter.bi_sector));
- k = bch2_btree_iter_peek_slot(iter);
+ k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
break;
- offset_into_extent = iter->pos.offset -
+ offset_into_extent = iter.pos.offset -
bkey_start_offset(k.k);
sectors = k.k->size - offset_into_extent;
*/
sectors = min(sectors, k.k->size - offset_into_extent);
- /*
- * Unlock the iterator while the btree node's lock is still in
- * cache, before doing the IO:
- */
- bch2_trans_unlock(&trans);
-
bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
swap(bvec_iter.bi_size, bytes);
if (bvec_iter.bi_size == bytes)
flags |= BCH_READ_LAST_FRAGMENT;
- ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter->pos,
+ ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter.pos,
data_btree, k,
offset_into_extent, failed, flags);
if (ret)
swap(bvec_iter.bi_size, bytes);
bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
+
+ ret = btree_trans_too_many_iters(&trans);
+ if (ret)
+ break;
}
+err:
+ bch2_trans_iter_exit(&trans, &iter);
if (ret == -EINTR || ret == READ_RETRY || ret == READ_RETRY_AVOID)
goto retry;
- bch2_trans_iter_put(&trans, iter);
bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&sk, c);
if (ret) {
- bch_err_inum_ratelimited(c, inode,
+ bch_err_inum_ratelimited(c, inum.inum,
"read error %i from btree lookup", ret);
rbio->bio.bi_status = BLK_STS_IOERR;
bch2_rbio_done(rbio);
mempool_init_page_pool(&c->bio_bounce_pages,
max_t(unsigned,
c->opts.btree_node_size,
- c->sb.encoded_extent_max) /
- PAGE_SECTORS, 0) ||
+ c->opts.encoded_extent_max) /
+ PAGE_SIZE, 0) ||
rhashtable_init(&c->promote_table, &bch_promote_params))
return -ENOMEM;
? op->journal_seq_p : &op->journal_seq;
}
-static inline void op_journal_seq_set(struct bch_write_op *op, u64 *journal_seq)
-{
- op->journal_seq_p = journal_seq;
- op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR;
-}
-
static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
{
return op->alloc_reserve == RESERVE_MOVINGGC
}
int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *,
- struct bkey_i *, bool *, bool *, s64 *, s64 *);
-int bch2_extent_update(struct btree_trans *, struct btree_iter *,
- struct bkey_i *, struct disk_reservation *,
- u64 *, u64, s64 *, bool);
+ struct bkey_i *, bool *, s64 *, s64 *);
+int bch2_extent_update(struct btree_trans *, subvol_inum,
+ struct btree_iter *, struct bkey_i *,
+ struct disk_reservation *, u64 *, u64, s64 *, bool);
+
int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
- struct bpos, u64 *, s64 *);
-int bch2_fpunch(struct bch_fs *c, u64, u64, u64, u64 *, s64 *);
+ subvol_inum, u64, s64 *);
+int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *);
int bch2_write_index_default(struct bch_write_op *);
op->devs_have.nr = 0;
op->target = 0;
op->opts = opts;
+ op->subvol = 0;
op->pos = POS_MAX;
op->version = ZERO_VERSION;
op->write_point = (struct write_point_specifier) { 0 };
}
void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
- u64, struct bch_io_failures *, unsigned flags);
+ subvol_inum, struct bch_io_failures *, unsigned flags);
static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
- u64 inode)
+ subvol_inum inum)
{
struct bch_io_failures failed = { .nr = 0 };
rbio->c = c;
rbio->start_time = local_clock();
+ rbio->subvol = inum.subvol;
- __bch2_read(c, rbio, rbio->bio.bi_iter, inode, &failed,
+ __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed,
BCH_READ_RETRY_IF_STALE|
BCH_READ_MAY_PROMOTE|
BCH_READ_USER_MAPPED);
/*
* pos we read from - different from data_pos for indirect extents:
*/
+ u32 subvol;
struct bpos read_pos;
/*
u16 nonce;
struct bch_io_opts opts;
+ u32 subvol;
struct bpos pos;
struct bversion version;
buf->must_flush = false;
buf->separate_flush = false;
- memset(buf->has_inode, 0, sizeof(buf->has_inode));
-
memset(buf->data, 0, sizeof(*buf->data));
buf->data->seq = cpu_to_le64(journal_cur_seq(j));
buf->data->u64s = 0;
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
- j->err_seq = journal_cur_seq(j);
+ /*
+ * XXX: we're not using j->lock here because this can be called from
+	 * interrupt context; this can race with journal_write_done()
+ */
+ if (!j->err_seq)
+ j->err_seq = journal_cur_seq(j);
journal_wake(j);
closure_wake_up(&journal_cur_buf(j)->wait);
}
mod_delayed_work(c->io_complete_wq,
&j->write_work,
- msecs_to_jiffies(j->write_delay_ms));
+ msecs_to_jiffies(c->opts.journal_flush_delay));
journal_wake(j);
return 0;
}
journal_entry_close(j);
}
-/*
- * Given an inode number, if that inode number has data in the journal that
- * hasn't yet been flushed, return the journal sequence number that needs to be
- * flushed:
- */
-u64 bch2_inode_journal_seq(struct journal *j, u64 inode)
-{
- size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8));
- union journal_res_state s;
- unsigned i;
- u64 seq;
-
-
- spin_lock(&j->lock);
- seq = journal_cur_seq(j);
- s = READ_ONCE(j->reservations);
- i = s.idx;
-
- while (1) {
- if (test_bit(h, j->buf[i].has_inode))
- goto out;
-
- if (i == s.unwritten_idx)
- break;
-
- i = (i - 1) & JOURNAL_BUF_MASK;
- seq--;
- }
-
- seq = 0;
-out:
- spin_unlock(&j->lock);
-
- return seq;
-}
-
-void bch2_journal_set_has_inum(struct journal *j, u64 inode, u64 seq)
-{
- size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8));
- struct journal_buf *buf;
-
- spin_lock(&j->lock);
-
- if ((buf = journal_seq_to_buf(j, seq)))
- set_bit(h, buf->has_inode);
-
- spin_unlock(&j->lock);
-}
-
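
The has_inode machinery removed above was a single-hash Bloom filter: one bit per hashed inode number, so false positives were possible (costing at most an unneeded journal flush) but false negatives were not. A standalone sketch of that filter shape; the table size and hash constant here are illustrative, not the originals:

#include <stdint.h>
#include <limits.h>

#define FILTER_BITS	1024
#define LONG_BITS	(sizeof(unsigned long) * CHAR_BIT)

static unsigned long filter[FILTER_BITS / LONG_BITS];

/* multiplicative hash to one of FILTER_BITS bits (2^10 == 1024) */
static unsigned hash_bit(uint64_t inum)
{
	return (inum * 0x61C8864680B583EBULL) >> (64 - 10);
}

static void filter_add(uint64_t inum)
{
	unsigned h = hash_bit(inum);

	filter[h / LONG_BITS] |= 1UL << (h % LONG_BITS);
}

/* may report true for inodes never added: a false positive */
static int filter_maybe_contains(uint64_t inum)
{
	unsigned h = hash_bit(inum);

	return !!(filter[h / LONG_BITS] & (1UL << (h % LONG_BITS)));
}

int main(void)
{
	filter_add(42);
	return filter_maybe_contains(42) ? 0 : 1;
}
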
static int __journal_res_get(struct journal *j, struct journal_res *res,
unsigned flags)
{
spin_lock(&j->lock);
- BUG_ON(seq > journal_cur_seq(j));
+ if (WARN_ONCE(seq > journal_cur_seq(j),
+ "requested to flush journal seq %llu, but currently at %llu",
+ seq, journal_cur_seq(j)))
+ goto out;
/* Recheck under lock: */
if (j->err_seq && seq >= j->err_seq) {
u64 start_time = local_clock();
int ret, ret2;
+ /*
+ * Don't update time_stats when @seq is already flushed:
+ */
+ if (seq <= j->flushed_seq_ondisk)
+ return 0;
+
ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)));
if (!ret)
int bch2_journal_meta(struct journal *j)
{
+ struct journal_buf *buf;
struct journal_res res;
int ret;
if (ret)
return ret;
+ buf = j->buf + (res.seq & JOURNAL_BUF_MASK);
+ buf->must_flush = true;
+ set_bit(JOURNAL_NEED_WRITE, &j->flags);
+
bch2_journal_res_put(j, &res);
return bch2_journal_flush_seq(j, res.seq);
return bch2_journal_flush_seq(j, seq);
}
+/*
+ * bch2_journal_noflush_seq - tell the journal not to issue any flushes before
+ * @seq
+ */
+bool bch2_journal_noflush_seq(struct journal *j, u64 seq)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ u64 unwritten_seq;
+ bool ret = false;
+
+ if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush)))
+ return false;
+
+ if (seq <= c->journal.flushed_seq_ondisk)
+ return false;
+
+ spin_lock(&j->lock);
+ if (seq <= c->journal.flushed_seq_ondisk)
+ goto out;
+
+ for (unwritten_seq = last_unwritten_seq(j);
+ unwritten_seq < seq;
+ unwritten_seq++) {
+ struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq);
+
+ /* journal write is already in flight, and was a flush write: */
+ if (unwritten_seq == last_unwritten_seq(j) && !buf->noflush)
+ goto out;
+
+ buf->noflush = true;
+ }
+
+ ret = true;
+out:
+ spin_unlock(&j->lock);
+ return ret;
+}
+
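
bch2_journal_noflush_seq() uses the familiar check/lock/recheck pattern: the unlocked comparison against flushed_seq_ondisk is a cheap fast path, then the same test is repeated under j->lock before anything is mutated. The pattern in isolation, as a userspace sketch with a pthread mutex:

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t flushed_seq_ondisk;

static bool mark_noflush_upto(uint64_t seq)
{
	bool ret = false;

	if (seq <= flushed_seq_ondisk)	/* unlocked fast path */
		return false;

	pthread_mutex_lock(&lock);
	if (seq <= flushed_seq_ondisk)	/* recheck under the lock */
		goto out;

	/* ... mark unwritten entries noflush, as in the function above ... */
	ret = true;
out:
	pthread_mutex_unlock(&lock);
	return ret;
}

int main(void)
{
	flushed_seq_ondisk = 10;
	return mark_noflush_upto(5) ? 1 : 0;	/* expect the fast path */
}
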
/* block/unlock the journal: */
void bch2_journal_unblock(struct journal *j)
long b;
if (new_fs) {
- if (c)
- percpu_down_read(&c->mark_lock);
b = bch2_bucket_alloc_new_fs(ca);
if (b < 0) {
- percpu_up_read(&c->mark_lock);
ret = -ENOSPC;
goto err;
}
goto err;
}
- b = sector_to_bucket(ca, ob->ptr.offset);
+ b = ob->bucket;
}
if (c)
if (c)
spin_unlock(&c->journal.lock);
- if (new_fs) {
- bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal,
- ca->mi.bucket_size,
- gc_phase(GC_PHASE_SB),
- 0);
- if (c)
- percpu_up_read(&c->mark_lock);
- } else {
+ if (!new_fs) {
ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
bch2_trans_mark_metadata_bucket(&trans, ca,
b, BCH_DATA_journal,
j->replay_journal_seq = last_seq;
j->replay_journal_seq_end = cur_seq;
j->last_seq_ondisk = last_seq;
+ j->flushed_seq_ondisk = cur_seq - 1;
j->pin.front = last_seq;
j->pin.back = cur_seq;
atomic64_set(&j->seq, cur_seq - 1);
+ if (list_empty(journal_entries))
+ j->last_empty_seq = cur_seq - 1;
+
fifo_for_each_entry_ptr(p, &j->pin, seq)
journal_pin_list_init(p, 1);
if (seq < last_seq)
continue;
+ if (journal_entry_empty(&i->j))
+ j->last_empty_seq = le64_to_cpu(i->j.seq);
+
p = journal_seq_pin(j, seq);
p->devs.nr = 0;
bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev);
}
+ if (list_empty(journal_entries))
+ j->last_empty_seq = cur_seq;
+
spin_lock(&j->lock);
set_bit(JOURNAL_STARTED, &j->flags);
lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
- j->write_delay_ms = 1000;
- j->reclaim_delay_ms = 100;
-
atomic64_set(&j->reservations.counter,
((union journal_res_state)
{ .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
union journal_res_state s;
struct bch_dev *ca;
+ unsigned long now = jiffies;
unsigned i;
rcu_read_lock();
s = READ_ONCE(j->reservations);
- pr_buf(out,
- "active journal entries:\t%llu\n"
- "seq:\t\t\t%llu\n"
- "last_seq:\t\t%llu\n"
- "last_seq_ondisk:\t%llu\n"
- "flushed_seq_ondisk:\t%llu\n"
- "prereserved:\t\t%u/%u\n"
- "each entry reserved:\t%u\n"
- "nr flush writes:\t%llu\n"
- "nr noflush writes:\t%llu\n"
- "nr direct reclaim:\t%llu\n"
- "nr background reclaim:\t%llu\n"
- "reclaim kicked:\t\t%u\n"
- "reclaim runs in:\t%u ms\n"
- "current entry sectors:\t%u\n"
- "current entry error:\t%u\n"
- "current entry:\t\t",
- fifo_used(&j->pin),
- journal_cur_seq(j),
- journal_last_seq(j),
- j->last_seq_ondisk,
- j->flushed_seq_ondisk,
- j->prereserved.reserved,
- j->prereserved.remaining,
- j->entry_u64s_reserved,
- j->nr_flush_writes,
- j->nr_noflush_writes,
- j->nr_direct_reclaim,
- j->nr_background_reclaim,
- j->reclaim_kicked,
- jiffies_to_msecs(j->next_reclaim - jiffies),
- j->cur_entry_sectors,
- j->cur_entry_error);
+ pr_buf(out, "active journal entries:\t%llu\n", fifo_used(&j->pin));
+ pr_buf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j));
+ pr_buf(out, "last_seq:\t\t%llu\n", journal_last_seq(j));
+ pr_buf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk);
+ pr_buf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk);
+ pr_buf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining);
+ pr_buf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved);
+ pr_buf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes);
+ pr_buf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes);
+ pr_buf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim);
+ pr_buf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim);
+ pr_buf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked);
+ pr_buf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now)
+ ? jiffies_to_msecs(j->next_reclaim - jiffies) : 0);
+ pr_buf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
+ pr_buf(out, "current entry error:\t%u\n", j->cur_entry_error);
+ pr_buf(out, "current entry:\t\t");
switch (s.cur_entry_offset) {
case JOURNAL_ENTRY_ERROR_VAL:
pr_buf(out, "closed\n");
break;
default:
- pr_buf(out, "%u/%u\n",
- s.cur_entry_offset,
- j->cur_entry_u64s);
+ pr_buf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s);
break;
}
- pr_buf(out,
- "current entry:\t\tidx %u refcount %u\n",
- s.idx, journal_state_count(s, s.idx));
+ pr_buf(out, "current entry:\t\tidx %u refcount %u\n", s.idx, journal_state_count(s, s.idx));
i = s.idx;
while (i != s.unwritten_idx) {
if (!ja->nr)
continue;
- pr_buf(out,
- "dev %u:\n"
- "\tnr\t\t%u\n"
- "\tbucket size\t%u\n"
- "\tavailable\t%u:%u\n"
- "\tdiscard_idx\t%u\n"
- "\tdirty_ondisk\t%u (seq %llu)\n"
- "\tdirty_idx\t%u (seq %llu)\n"
- "\tcur_idx\t\t%u (seq %llu)\n",
- i, ja->nr, ca->mi.bucket_size,
- bch2_journal_dev_buckets_available(j, ja, journal_space_discarded),
- ja->sectors_free,
- ja->discard_idx,
- ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk],
- ja->dirty_idx, ja->bucket_seq[ja->dirty_idx],
- ja->cur_idx, ja->bucket_seq[ja->cur_idx]);
+ pr_buf(out, "dev %u:\n", i);
+ pr_buf(out, "\tnr\t\t%u\n", ja->nr);
+ pr_buf(out, "\tbucket size\t%u\n", ca->mi.bucket_size);
+ pr_buf(out, "\tavailable\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free);
+ pr_buf(out, "\tdiscard_idx\t%u\n", ja->discard_idx);
+ pr_buf(out, "\tdirty_ondisk\t%u (seq %llu)\n", ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]);
+ pr_buf(out, "\tdirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]);
+ pr_buf(out, "\tcur_idx\t\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]);
}
rcu_read_unlock();
return j->pin.back - 1;
}
-u64 bch2_inode_journal_seq(struct journal *, u64);
-void bch2_journal_set_has_inum(struct journal *, u64, u64);
static inline int journal_state_count(union journal_res_state s, int idx)
s->buf3_count += s->idx == 3;
}
-static inline void bch2_journal_set_has_inode(struct journal *j,
- struct journal_res *res,
- u64 inum)
-{
- struct journal_buf *buf = &j->buf[res->idx];
- unsigned long bit = hash_64(inum, ilog2(sizeof(buf->has_inode) * 8));
-
- /* avoid atomic op if possible */
- if (unlikely(!test_bit(bit, buf->has_inode)))
- set_bit(bit, buf->has_inode);
-}
-
/*
* Amount of space that will be taken up by some keys in the journal (i.e.
* including the jset header)
ret = 0;
if ((flags & JOURNAL_RES_GET_RESERVED) ||
+ test_bit(JOURNAL_NOCHANGES, &j->flags) ||
new.reserved + d < new.remaining) {
new.reserved += d;
ret = 1;
int bch2_journal_flush_seq(struct journal *, u64);
int bch2_journal_flush(struct journal *);
+bool bch2_journal_noflush_seq(struct journal *, u64);
int bch2_journal_meta(struct journal *);
void bch2_journal_halt(struct journal *);
return ret;
}
-static int journal_entry_validate_btree_keys(struct bch_fs *c,
+static int journal_entry_btree_keys_validate(struct bch_fs *c,
const char *where,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
return 0;
}
-static int journal_entry_validate_btree_root(struct bch_fs *c,
+static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct bkey_i *k;
+ bool first = true;
+
+ vstruct_for_each(entry, k) {
+ if (!first) {
+ printbuf_newline(out);
+ pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]);
+ }
+ pr_buf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level);
+ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
+ first = false;
+ }
+}
+
+static int journal_entry_btree_root_validate(struct bch_fs *c,
const char *where,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
return ret;
}
-static int journal_entry_validate_prio_ptrs(struct bch_fs *c,
+static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ journal_entry_btree_keys_to_text(out, c, entry);
+}
+
+static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
const char *where,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
return 0;
}
-static int journal_entry_validate_blacklist(struct bch_fs *c,
+static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+}
+
+static int journal_entry_blacklist_validate(struct bch_fs *c,
const char *where,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
return ret;
}
-static int journal_entry_validate_blacklist_v2(struct bch_fs *c,
+static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_blacklist *bl =
+ container_of(entry, struct jset_entry_blacklist, entry);
+
+ pr_buf(out, "seq=%llu", le64_to_cpu(bl->seq));
+}
+
+static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
const char *where,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
return ret;
}
-static int journal_entry_validate_usage(struct bch_fs *c,
+static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_blacklist_v2 *bl =
+ container_of(entry, struct jset_entry_blacklist_v2, entry);
+
+ pr_buf(out, "start=%llu end=%llu",
+ le64_to_cpu(bl->start),
+ le64_to_cpu(bl->end));
+}
+
+static int journal_entry_usage_validate(struct bch_fs *c,
const char *where,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
return ret;
}
-static int journal_entry_validate_data_usage(struct bch_fs *c,
+static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_usage *u =
+ container_of(entry, struct jset_entry_usage, entry);
+
+ pr_buf(out, "type=%s v=%llu",
+ bch2_fs_usage_types[u->entry.btree_id],
+ le64_to_cpu(u->v));
+}
+
+static int journal_entry_data_usage_validate(struct bch_fs *c,
const char *where,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
return ret;
}
-static int journal_entry_validate_clock(struct bch_fs *c,
+static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_data_usage *u =
+ container_of(entry, struct jset_entry_data_usage, entry);
+
+ bch2_replicas_entry_to_text(out, &u->r);
+ pr_buf(out, "=%llu", le64_to_cpu(u->v));
+}
+
+static int journal_entry_clock_validate(struct bch_fs *c,
const char *where,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
return ret;
}
-static int journal_entry_validate_dev_usage(struct bch_fs *c,
+static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_clock *clock =
+ container_of(entry, struct jset_entry_clock, entry);
+
+ pr_buf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time));
+}
+
+static int journal_entry_dev_usage_validate(struct bch_fs *c,
const char *where,
struct jset_entry *entry,
unsigned version, int big_endian, int write)
return ret;
}
+static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_dev_usage *u =
+ container_of(entry, struct jset_entry_dev_usage, entry);
+ unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
+
+ pr_buf(out, "dev=%u", le32_to_cpu(u->dev));
+
+ for (i = 0; i < nr_types; i++) {
+ if (i < BCH_DATA_NR)
+ pr_buf(out, " %s", bch2_data_types[i]);
+ else
+ pr_buf(out, " (unknown data type %u)", i);
+ pr_buf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
+ le64_to_cpu(u->d[i].buckets),
+ le64_to_cpu(u->d[i].sectors),
+ le64_to_cpu(u->d[i].fragmented));
+ }
+
+ pr_buf(out, " buckets_ec: %llu buckets_unavailable: %llu",
+ le64_to_cpu(u->buckets_ec),
+ le64_to_cpu(u->buckets_unavailable));
+}
+
+static int journal_entry_log_validate(struct bch_fs *c,
+ const char *where,
+ struct jset_entry *entry,
+ unsigned version, int big_endian, int write)
+{
+ return 0;
+}
+
+static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);
+ unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d);
+
+ bch_scnmemcpy(out, l->d, strnlen(l->d, bytes));
+}
+
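
journal_entry_log_to_text() must print a payload that may not be NUL-terminated within its size, so strnlen() bounds the scan. The same idiom in plain C, using printf's precision specifier in place of bch_scnmemcpy():

#include <stdio.h>
#include <string.h>

/* print at most `bytes` of d, stopping early at a NUL if there is one */
static void print_log_payload(const char *d, size_t bytes)
{
	printf("%.*s\n", (int) strnlen(d, bytes), d);
}

int main(void)
{
	char raw[4] = { 'l', 'o', 'g', '!' };	/* deliberately unterminated */

	print_log_payload(raw, sizeof(raw));
	return 0;
}
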
struct jset_entry_ops {
int (*validate)(struct bch_fs *, const char *,
struct jset_entry *, unsigned, int, int);
+ void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *);
};
static const struct jset_entry_ops bch2_jset_entry_ops[] = {
#define x(f, nr) \
[BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \
- .validate = journal_entry_validate_##f, \
+ .validate = journal_entry_##f##_validate, \
+ .to_text = journal_entry_##f##_to_text, \
},
BCH_JSET_ENTRY_TYPES()
#undef x
: 0;
}
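
The jset_entry_ops table above is generated with the X-macro pattern: BCH_JSET_ENTRY_TYPES() expands a caller-supplied x() once per entry type, so the handler names and the array slots cannot drift apart. A self-contained miniature with a made-up entry list:

#include <stdio.h>

#define ENTRY_TYPES()		\
	x(btree_keys, 0)	\
	x(blacklist, 1)

/* one handler per type, names derived by token pasting: */
#define x(f, nr) static void f##_to_text(void) { puts(#f); }
ENTRY_TYPES()
#undef x

/* the table indexes the handlers by the same numbering: */
static void (*to_text_fns[])(void) = {
#define x(f, nr) [nr] = f##_to_text,
	ENTRY_TYPES()
#undef x
};

int main(void)
{
	to_text_fns[0]();
	to_text_fns[1]();
	return 0;
}
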
+void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ if (entry->type < BCH_JSET_ENTRY_NR) {
+ pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]);
+ bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
+ } else {
+ pr_buf(out, "(unknown type %u)", entry->type);
+ }
+}
+
static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
int write)
{
case JOURNAL_ENTRY_NONE:
if (!saw_bad)
return 0;
- sectors = c->opts.block_size;
+ sectors = block_sectors(c);
goto next_block;
case JOURNAL_ENTRY_BAD:
saw_bad = true;
* field of the journal entry we read, so try reading
* again at next block boundary:
*/
- sectors = c->opts.block_size;
+ sectors = block_sectors(c);
break;
default:
return ret;
struct journal_device *ja =
container_of(cl, struct journal_device, read);
struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
+ struct bch_fs *c = ca->fs;
struct journal_list *jlist =
container_of(cl->parent, struct journal_list, cl);
struct journal_read_buf buf = { NULL, 0 };
u64 min_seq = U64_MAX;
unsigned i;
- int ret;
+ int ret = 0;
if (!ja->nr)
goto out;
ja->discard_idx = ja->dirty_idx_ondisk =
ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
out:
+ bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
kvpfree(buf.data, buf.size);
percpu_ref_put(&ca->io_ref);
closure_return(cl);
u64 v, seq;
int err = 0;
- bch2_time_stats_update(j->write_time, j->write_start_time);
+ bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
+ ? j->flush_write_time
+ : j->noflush_write_time, j->write_start_time);
if (!w->devs_written.nr) {
bch_err(c, "unable to write journal to sufficient devices");
if (seq >= j->pin.front)
journal_seq_pin(j, seq)->devs = w->devs_written;
- j->seq_ondisk = seq;
- if (err && (!j->err_seq || seq < j->err_seq))
- j->err_seq = seq;
+ if (!err) {
+ j->seq_ondisk = seq;
- if (!JSET_NO_FLUSH(w->data)) {
- j->flushed_seq_ondisk = seq;
- j->last_seq_ondisk = w->last_seq;
- }
+ if (!JSET_NO_FLUSH(w->data)) {
+ j->flushed_seq_ondisk = seq;
+ j->last_seq_ondisk = w->last_seq;
+ }
+ } else if (!j->err_seq || seq < j->err_seq)
+ j->err_seq = seq;
/*
* Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
spin_lock(&j->lock);
if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) &&
- !w->must_flush &&
- (jiffies - j->last_flush_write) < msecs_to_jiffies(j->write_delay_ms) &&
- test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) {
+ (w->noflush ||
+ (!w->must_flush &&
+ (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
+ test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) {
w->noflush = true;
SET_JSET_NO_FLUSH(jset, true);
jset->last_seq = 0;
SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
- if (journal_entry_empty(jset))
+ if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset))
j->last_empty_seq = le64_to_cpu(jset->seq);
if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
- if (c->opts.nochanges)
+ if (test_bit(JOURNAL_NOCHANGES, &j->flags))
goto no_io;
for_each_rw_member(ca, c, i)
}
}
- bch2_bucket_seq_cleanup(c);
-
continue_at(cl, do_journal_write, c->io_complete_wq);
return;
no_io:
- bch2_bucket_seq_cleanup(c);
-
continue_at(cl, journal_write_done, c->io_complete_wq);
return;
err:
- bch2_inconsistent_error(c);
+ bch2_fatal_error(c);
continue_at(cl, journal_write_done, c->io_complete_wq);
}
for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \
vstruct_for_each_safe(entry, k, _n)
-int bch2_journal_entry_validate(struct bch_fs *, const char *, struct jset_entry *,
- unsigned, int, int);
+int bch2_journal_entry_validate(struct bch_fs *, const char *,
+ struct jset_entry *, unsigned, int, int);
+void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *,
+ struct jset_entry *);
int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *);
struct journal_device *ja,
enum journal_space_from from)
{
- unsigned available = (journal_space_from(ja, from) -
- ja->cur_idx - 1 + ja->nr) % ja->nr;
+ unsigned available = !test_bit(JOURNAL_NOCHANGES, &j->flags)
+ ? ((journal_space_from(ja, from) -
+ ja->cur_idx - 1 + ja->nr) % ja->nr)
+ : ja->nr;
/*
* Don't use the last bucket unless writing the new last_seq
u64 seq;
int err;
- if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags))
- return 0;
-
lockdep_assert_held(&j->reclaim_lock);
while (1) {
* make sure to flush at least one journal pin:
*/
if (time_after(jiffies, j->last_flushed +
- msecs_to_jiffies(j->reclaim_delay_ms)))
+ msecs_to_jiffies(c->opts.journal_reclaim_delay)))
min_nr = 1;
if (j->prereserved.reserved * 4 > j->prereserved.remaining)
if (fifo_free(&j->pin) <= 32)
min_nr = 1;
+ if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used)
+ min_nr = 1;
+
trace_journal_reclaim_start(c,
min_nr,
j->prereserved.reserved,
atomic_long_read(&c->btree_key_cache.nr_dirty),
atomic_long_read(&c->btree_key_cache.nr_keys));
- min_key_cache = min(bch2_nr_btree_keys_need_flush(c), 128UL);
+ min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128);
nr_flushed = journal_flush_pins(j, seq_to_flush,
min_nr, min_key_cache);
static int bch2_journal_reclaim_thread(void *arg)
{
struct journal *j = arg;
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
unsigned long delay, now;
int ret = 0;
set_freezable();
- kthread_wait_freezable(test_bit(JOURNAL_RECLAIM_STARTED, &j->flags));
-
j->last_flushed = jiffies;
while (!ret && !kthread_should_stop()) {
mutex_unlock(&j->reclaim_lock);
now = jiffies;
- delay = msecs_to_jiffies(j->reclaim_delay_ms);
+ delay = msecs_to_jiffies(c->opts.journal_reclaim_delay);
j->next_reclaim = j->last_flushed + delay;
if (!time_in_range(j->next_reclaim, now, now + delay))
return bl;
}
+static bool bl_entry_contig_or_overlaps(struct journal_seq_blacklist_entry *e,
+ u64 start, u64 end)
+{
+ return !(end < le64_to_cpu(e->start) || le64_to_cpu(e->end) < start);
+}
+
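
bl_entry_contig_or_overlaps() is the usual closed-interval test: two ranges overlap or touch unless one ends strictly before the other begins, and the min/max assignments below it then coalesce the ranges. A standalone check with a hypothetical range struct:

#include <assert.h>

struct range { unsigned long long start, end; };

/* closed intervals: touching counts as mergeable */
static int contig_or_overlaps(const struct range *e,
			      unsigned long long start, unsigned long long end)
{
	return !(end < e->start || e->end < start);
}

static void merge(struct range *e,
		  unsigned long long start, unsigned long long end)
{
	if (start < e->start)
		e->start = start;
	if (end > e->end)
		e->end = end;
}

int main(void)
{
	struct range r = { 10, 20 };

	assert(contig_or_overlaps(&r, 20, 30));		/* touching */
	assert(!contig_or_overlaps(&r, 22, 30));	/* gap of one */
	merge(&r, 20, 30);
	assert(r.start == 10 && r.end == 30);
	return 0;
}
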
int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
{
struct bch_sb_field_journal_seq_blacklist *bl;
bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb);
nr = blacklist_nr_entries(bl);
- if (bl) {
- for (i = 0; i < nr; i++) {
- struct journal_seq_blacklist_entry *e =
- bl->start + i;
-
- if (start == le64_to_cpu(e->start) &&
- end == le64_to_cpu(e->end))
- goto out;
-
- if (start <= le64_to_cpu(e->start) &&
- end >= le64_to_cpu(e->end)) {
- e->start = cpu_to_le64(start);
- e->end = cpu_to_le64(end);
-
- if (i + 1 < nr)
- bl = blacklist_entry_try_merge(c,
- bl, i);
- if (i)
- bl = blacklist_entry_try_merge(c,
- bl, i - 1);
- goto out_write_sb;
- }
+ for (i = 0; i < nr; i++) {
+ struct journal_seq_blacklist_entry *e =
+ bl->start + i;
+
+ if (bl_entry_contig_or_overlaps(e, start, end)) {
+ e->start = cpu_to_le64(min(start, le64_to_cpu(e->start)));
+ e->end = cpu_to_le64(max(end, le64_to_cpu(e->end)));
+
+ if (i + 1 < nr)
+ bl = blacklist_entry_try_merge(c,
+ bl, i);
+ if (i)
+ bl = blacklist_entry_try_merge(c,
+ bl, i - 1);
+ goto out_write_sb;
}
}
return 0;
}
-static const char *
-bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb,
- struct bch_sb_field *f)
+static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
{
struct bch_sb_field_journal_seq_blacklist *bl =
field_to_type(f, journal_seq_blacklist);
- struct journal_seq_blacklist_entry *i;
- unsigned nr = blacklist_nr_entries(bl);
+ unsigned i, nr = blacklist_nr_entries(bl);
- for (i = bl->start; i < bl->start + nr; i++) {
- if (le64_to_cpu(i->start) >=
- le64_to_cpu(i->end))
- return "entry start >= end";
-
- if (i + 1 < bl->start + nr &&
- le64_to_cpu(i[0].end) >
- le64_to_cpu(i[1].start))
- return "entries out of order";
+ for (i = 0; i < nr; i++) {
+ struct journal_seq_blacklist_entry *e = bl->start + i;
+
+ if (le64_to_cpu(e->start) >=
+ le64_to_cpu(e->end)) {
+ pr_buf(err, "entry %u start >= end (%llu >= %llu)",
+ i, le64_to_cpu(e->start), le64_to_cpu(e->end));
+ return -EINVAL;
+ }
+
+ if (i + 1 < nr &&
+ le64_to_cpu(e[0].end) >
+ le64_to_cpu(e[1].start)) {
+ pr_buf(err, "entry %u out of order with next entry (%llu > %llu)",
+ i + 1, le64_to_cpu(e[0].end), le64_to_cpu(e[1].start));
+ return -EINVAL;
+ }
}
- return NULL;
+ return 0;
}
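
The reworked validate hook reports which entry failed and why through a caller-supplied printbuf and returns -EINVAL, instead of the old bare-string return. The same API shape reduced to standard C, with snprintf standing in for pr_buf:

#include <errno.h>
#include <stdio.h>

struct entry { unsigned long long start, end; };

static int validate(const struct entry *e, unsigned nr,
		    char *err, size_t errlen)
{
	unsigned i;

	for (i = 0; i < nr; i++) {
		if (e[i].start >= e[i].end) {
			snprintf(err, errlen, "entry %u start >= end", i);
			return -EINVAL;
		}
		if (i + 1 < nr && e[i].end > e[i + 1].start) {
			snprintf(err, errlen, "entry %u out of order", i + 1);
			return -EINVAL;
		}
	}
	return 0;
}

int main(void)
{
	struct entry e[] = { { 1, 5 }, { 4, 9 } };	/* overlapping */
	char err[80];

	return validate(e, 2, err, sizeof(err)) == -EINVAL ? 0 : 1;
}
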
static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out,
bch2_trans_init(&trans, c, 0, 0);
for (i = 0; i < BTREE_ID_NR; i++) {
- struct btree_iter *iter;
+ struct btree_iter iter;
struct btree *b;
- for_each_btree_node(&trans, iter, i, POS_MIN,
- BTREE_ITER_PREFETCH, b)
- if (test_bit(BCH_FS_STOPPING, &c->flags)) {
- bch2_trans_exit(&trans);
- return;
- }
- bch2_trans_iter_free(&trans, iter);
+ bch2_trans_node_iter_init(&trans, &iter, i, POS_MIN,
+ 0, 0, BTREE_ITER_PREFETCH);
+retry:
+ bch2_trans_begin(&trans);
+
+ b = bch2_btree_iter_peek_node(&iter);
+
+ while (!(ret = PTR_ERR_OR_ZERO(b)) &&
+ b &&
+ !test_bit(BCH_FS_STOPPING, &c->flags))
+ b = bch2_btree_iter_next_node(&iter);
+
+ if (ret == -EINTR)
+ goto retry;
+
+ bch2_trans_iter_exit(&trans, &iter);
}
- ret = bch2_trans_exit(&trans);
+ bch2_trans_exit(&trans);
if (ret)
return;
bool noflush; /* write has already been kicked off, and was noflush */
bool must_flush; /* something wants a flush */
bool separate_flush;
- /* bloom filter: */
- unsigned long has_inode[1024 / sizeof(unsigned long)];
};
/*
enum {
JOURNAL_REPLAY_DONE,
JOURNAL_STARTED,
- JOURNAL_RECLAIM_STARTED,
JOURNAL_NEED_WRITE,
JOURNAL_MAY_GET_UNRESERVED,
JOURNAL_MAY_SKIP_FLUSH,
+ JOURNAL_NOCHANGES,
};
/* Embedded in struct bch_fs */
struct mutex discard_lock;
bool can_discard;
- unsigned write_delay_ms;
- unsigned reclaim_delay_ms;
unsigned long last_flush_write;
u64 res_get_blocked_start;
u64 nr_flush_writes;
u64 nr_noflush_writes;
- struct time_stats *write_time;
- struct time_stats *delay_time;
+ struct time_stats *flush_write_time;
+ struct time_stats *noflush_write_time;
struct time_stats *blocked_time;
struct time_stats *flush_seq_time;
enum btree_id btree_id)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
struct bkey_buf sk;
int ret = 0;
bch2_bkey_buf_init(&sk);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN,
- BTREE_ITER_PREFETCH);
+ bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN,
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS);
- while ((k = bch2_btree_iter_peek(iter)).k &&
+ while ((bch2_trans_begin(&trans),
+ (k = bch2_btree_iter_peek(&iter)).k) &&
!(ret = bkey_err(k))) {
if (!bch2_bkey_has_device(k, dev_idx)) {
- bch2_btree_iter_advance(iter);
+ bch2_btree_iter_advance(&iter);
continue;
}
*/
bch2_extent_normalize(c, bkey_i_to_s(sk.k));
- bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k));
+ /*
+ * Since we're not inserting through an extent iterator
+ * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
+ * we aren't using the extent overwrite path to delete, we're
+ * just using the normal key deletion path:
+ */
+ if (bkey_deleted(&sk.k->k))
+ sk.k->k.size = 0;
- ret = bch2_btree_iter_traverse(iter) ?:
- bch2_trans_update(&trans, iter, sk.k, 0) ?:
+ ret = bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(&trans, &iter, sk.k,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(&trans, NULL, NULL,
BTREE_INSERT_NOFAIL);
if (ret)
break;
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
- ret = bch2_trans_exit(&trans) ?: ret;
+ bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&sk, c);
BUG_ON(ret == -EINTR);
static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct closure cl;
struct btree *b;
struct bkey_buf k;
closure_init_stack(&cl);
for (id = 0; id < BTREE_ID_NR; id++) {
- for_each_btree_node(&trans, iter, id, POS_MIN,
- BTREE_ITER_PREFETCH, b) {
+ bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0,
+ BTREE_ITER_PREFETCH);
retry:
+ ret = 0;
+ while (bch2_trans_begin(&trans),
+ (b = bch2_btree_iter_peek_node(&iter)) &&
+ !(ret = PTR_ERR_OR_ZERO(b))) {
if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key),
dev_idx))
- continue;
+ goto next;
bch2_bkey_buf_copy(&k, c, &b->key);
break;
}
- ret = bch2_btree_node_update_key(&trans, iter, b, k.k, false);
+ ret = bch2_btree_node_update_key(&trans, &iter, b, k.k, false);
if (ret == -EINTR) {
- b = bch2_btree_iter_peek_node(iter);
ret = 0;
- goto retry;
+ continue;
}
+
if (ret) {
bch_err(c, "Error updating btree node key: %i", ret);
break;
}
+next:
+ bch2_btree_iter_next_node(&iter);
}
- bch2_trans_iter_free(&trans, iter);
+ if (ret == -EINTR)
+ goto retry;
+
+ bch2_trans_iter_exit(&trans, &iter);
if (ret)
goto err;
ret = 0;
err:
- ret = bch2_trans_exit(&trans) ?: ret;
+ bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&k, c);
BUG_ON(ret == -EINTR);
#include "btree_update_interior.h"
#include "buckets.h"
#include "disk_groups.h"
+#include "ec.h"
#include "inode.h"
#include "io.h"
#include "journal_reclaim.h"
#include "move.h"
#include "replicas.h"
+#include "subvolume.h"
#include "super-io.h"
#include "keylist.h"
wait_queue_head_t wait;
};
+static int insert_snapshot_whiteouts(struct btree_trans *trans,
+ enum btree_id id,
+ struct bpos old_pos,
+ struct bpos new_pos)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter, update_iter;
+ struct bkey_s_c k;
+ struct snapshots_seen s;
+ int ret;
+
+ if (!btree_type_has_snapshots(id))
+ return 0;
+
+ snapshots_seen_init(&s);
+
+ if (!bkey_cmp(old_pos, new_pos))
+ return 0;
+
+ if (!snapshot_t(c, old_pos.snapshot)->children[0])
+ return 0;
+
+ bch2_trans_iter_init(trans, &iter, id, old_pos,
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_ALL_SNAPSHOTS);
+ while (1) {
+next:
+ k = bch2_btree_iter_prev(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ break;
+
+ if (bkey_cmp(old_pos, k.k->p))
+ break;
+
+ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) {
+ struct bkey_i *update;
+ size_t i;
+
+ for (i = 0; i < s.nr; i++)
+ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, s.d[i]))
+ goto next;
+
+ update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
+
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ break;
+
+ bkey_init(&update->k);
+ update->k.p = new_pos;
+ update->k.p.snapshot = k.k->p.snapshot;
+
+ bch2_trans_iter_init(trans, &update_iter, id, update->k.p,
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_ALL_SNAPSHOTS|
+ BTREE_ITER_INTENT);
+ ret = bch2_btree_iter_traverse(&update_iter) ?:
+ bch2_trans_update(trans, &update_iter, update,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ bch2_trans_iter_exit(trans, &update_iter);
+ if (ret)
+ break;
+
+ ret = snapshots_seen_add(c, &s, k.k->p.snapshot);
+ if (ret)
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+ kfree(s.d);
+
+ return ret;
+}
+
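
insert_snapshot_whiteouts() leans on bch2_snapshot_is_ancestor(), conceptually a walk from a key's snapshot ID up the snapshot tree. A toy model of that walk; the parent table is invented and the real implementation is considerably more involved:

#include <assert.h>

/* toy snapshot tree: snapshot_parent[id] == 0 means id is the root */
static const unsigned snapshot_parent[] = { 0, 0, 1, 1, 2, 2 };

static int is_ancestor(unsigned id, unsigned ancestor)
{
	while (id && id != ancestor)
		id = snapshot_parent[id];
	return id == ancestor;
}

int main(void)
{
	assert(is_ancestor(4, 2));	/* 4 -> 2 */
	assert(is_ancestor(5, 1));	/* 5 -> 2 -> 1 */
	assert(!is_ancestor(3, 2));	/* 3 -> 1 -> root */
	return 0;
}
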
static int bch2_migrate_index_update(struct bch_write_op *op)
{
struct bch_fs *c = op->c;
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct migrate_write *m =
container_of(op, struct migrate_write, op);
+ struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets);
struct keylist *keys = &op->insert_keys;
struct bkey_buf _new, _insert;
int ret = 0;
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
- iter = bch2_trans_get_iter(&trans, m->btree_id,
- bkey_start_pos(&bch2_keylist_front(keys)->k),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ bch2_trans_iter_init(&trans, &iter, m->btree_id,
+ bkey_start_pos(&bch2_keylist_front(keys)->k),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
while (1) {
struct bkey_s_c k;
struct bkey_i_extent *new;
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
+ struct bpos next_pos;
bool did_work = false;
- bool extending = false, should_check_enospc;
+ bool should_check_enospc;
s64 i_sectors_delta = 0, disk_sectors_delta = 0;
bch2_trans_begin(&trans);
- k = bch2_btree_iter_peek_slot(iter);
+ k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
goto err;
bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys));
new = bkey_i_to_extent(_new.k);
- bch2_cut_front(iter->pos, &new->k_i);
+ bch2_cut_front(iter.pos, &new->k_i);
- bch2_cut_front(iter->pos, insert);
+ bch2_cut_front(iter.pos, insert);
bch2_cut_back(new->k.p, insert);
bch2_cut_back(insert->k.p, &new->k_i);
extent_for_each_ptr(extent_i_to_s(new), new_ptr)
new_ptr->cached = true;
- bch2_bkey_drop_ptr(bkey_i_to_s(insert), old_ptr);
+ __bch2_bkey_drop_ptr(bkey_i_to_s(insert), old_ptr);
}
extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) {
op->opts.background_target,
op->opts.data_replicas);
- ret = bch2_sum_sector_overwrites(&trans, iter, insert,
- &extending,
+ ret = bch2_sum_sector_overwrites(&trans, &iter, insert,
&should_check_enospc,
&i_sectors_delta,
&disk_sectors_delta);
goto out;
}
- ret = bch2_trans_update(&trans, iter, insert, 0) ?:
+ next_pos = insert->k.p;
+
+ ret = insert_snapshot_whiteouts(&trans, m->btree_id,
+ k.k->p, insert->k.p) ?:
+ bch2_trans_update(&trans, &iter, insert,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(&trans, &op->res,
op_journal_seq(op),
BTREE_INSERT_NOFAIL|
m->data_opts.btree_insert_flags);
-err:
- if (!ret)
+ if (!ret) {
+ bch2_btree_iter_set_pos(&iter, next_pos);
atomic_long_inc(&c->extent_migrate_done);
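+ /* write went to an erasure coded bucket: record the key for stripe backpointers */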
+ if (ec_ob)
+ bch2_ob_add_backpointer(c, ec_ob, &insert->k);
+ }
+err:
if (ret == -EINTR)
ret = 0;
if (ret)
break;
next:
- while (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) >= 0) {
+ while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) {
bch2_keylist_pop_front(keys);
if (bch2_keylist_empty(keys))
goto out;
continue;
nomatch:
if (m->ctxt) {
- BUG_ON(k.k->p.offset <= iter->pos.offset);
+ BUG_ON(k.k->p.offset <= iter.pos.offset);
atomic64_inc(&m->ctxt->stats->keys_raced);
- atomic64_add(k.k->p.offset - iter->pos.offset,
+ atomic64_add(k.k->p.offset - iter.pos.offset,
&m->ctxt->stats->sectors_raced);
}
atomic_long_inc(&c->extent_migrate_raced);
trace_move_race(&new->k);
- bch2_btree_iter_advance(iter);
+ bch2_btree_iter_advance(&iter);
goto next;
}
out:
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&_insert, c);
bch2_bkey_buf_exit(&_new, c);
m->op.crc = rbio->pick.crc;
m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
- if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) {
- m->op.nonce = m->op.crc.nonce + m->op.crc.offset;
- m->op.csum_type = m->op.crc.csum_type;
- }
-
if (m->data_cmd == DATA_REWRITE)
bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev);
}
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
+ struct bch_extent_crc_unpacked crc;
struct extent_ptr_decoded p;
int ret;
m->op.target = data_opts.target,
m->op.write_point = wp;
+ /*
+ * op->csum_type is normally initialized from the fs/file's current
+ * options - but if an extent is encrypted, we require that it stays
+ * encrypted:
+ */
+ bkey_for_each_crc(k.k, ptrs, crc, entry)
+ if (bch2_csum_type_is_encryption(crc.csum_type)) {
+ m->op.nonce = crc.nonce + crc.offset;
+ m->op.csum_type = crc.csum_type;
+ break;
+ }
+
if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) {
m->op.alloc_reserve = RESERVE_MOVINGGC;
m->op.flags |= BCH_WRITE_ALLOC_NOWAIT;
unsigned compressed_sectors = 0;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (p.ptr.dev == data_opts.rewrite_dev &&
- !p.ptr.cached &&
- crc_is_compressed(p.crc))
- compressed_sectors += p.crc.compressed_size;
+ if (p.ptr.dev == data_opts.rewrite_dev) {
+ if (p.ptr.cached)
+ m->op.flags |= BCH_WRITE_CACHED;
+
+ if (!p.ptr.cached &&
+ crc_is_compressed(p.crc))
+ compressed_sectors += p.crc.compressed_size;
+ }
if (compressed_sectors) {
ret = bch2_disk_reservation_add(c, &m->op.res,
closure_put(&ctxt->cl);
}
-static void do_pending_writes(struct moving_context *ctxt)
+static void do_pending_writes(struct moving_context *ctxt, struct btree_trans *trans)
{
struct moving_io *io;
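+ /*
+ * The writes issued below do their own btree updates and take their
+ * own btree locks: drop ours first so we can't deadlock against them:
+ */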
+ if (trans)
+ bch2_trans_unlock(trans);
+
while ((io = next_pending_write(ctxt))) {
list_del(&io->list);
closure_call(&io->cl, move_write, NULL, &ctxt->cl);
}
}
-#define move_ctxt_wait_event(_ctxt, _cond) \
+#define move_ctxt_wait_event(_ctxt, _trans, _cond) \
do { \
- do_pending_writes(_ctxt); \
+ do_pending_writes(_ctxt, _trans); \
\
if (_cond) \
break; \
next_pending_write(_ctxt) || (_cond)); \
} while (1)
-static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
+static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt,
+ struct btree_trans *trans)
{
unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
- move_ctxt_wait_event(ctxt,
+ move_ctxt_wait_event(ctxt, trans,
!atomic_read(&ctxt->write_sectors) ||
atomic_read(&ctxt->write_sectors) != sectors_pending);
}
unsigned sectors = k.k->size, pages;
int ret = -ENOMEM;
- move_ctxt_wait_event(ctxt,
- atomic_read(&ctxt->write_sectors) <
- SECTORS_IN_FLIGHT_PER_DEVICE);
-
- move_ctxt_wait_event(ctxt,
- atomic_read(&ctxt->read_sectors) <
- SECTORS_IN_FLIGHT_PER_DEVICE);
-
/* write path might have to decompress data: */
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);
static int lookup_inode(struct btree_trans *trans, struct bpos pos,
struct bch_inode_unpacked *inode)
{
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
int ret;
- iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, pos,
- BTREE_ITER_ALL_SNAPSHOTS);
- k = bch2_btree_iter_peek(iter);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos,
+ BTREE_ITER_ALL_SNAPSHOTS);
+ k = bch2_btree_iter_peek(&iter);
ret = bkey_err(k);
if (ret)
goto err;
goto err;
}
- ret = k.k->type == KEY_TYPE_inode ? 0 : -EIO;
+ ret = bkey_is_inode(k.k) ? 0 : -EIO;
if (ret)
goto err;
- ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode);
+ ret = bch2_inode_unpack(k, inode);
if (ret)
goto err;
err:
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
struct bkey_buf sk;
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
struct data_opts data_opts;
enum data_cmd data_cmd;
stats->btree_id = btree_id;
stats->pos = start;
- iter = bch2_trans_get_iter(&trans, btree_id, start,
- BTREE_ITER_PREFETCH);
+ bch2_trans_iter_init(&trans, &iter, btree_id, start,
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS);
if (rate)
bch2_ratelimit_reset(rate);
schedule_timeout(delay);
if (unlikely(freezing(current))) {
- bch2_trans_unlock(&trans);
- move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
+ move_ctxt_wait_event(ctxt, &trans, list_empty(&ctxt->reads));
try_to_freeze();
}
} while (delay);
- bch2_trans_begin(&trans);
+ move_ctxt_wait_event(ctxt, &trans,
+ atomic_read(&ctxt->write_sectors) <
+ SECTORS_IN_FLIGHT_PER_DEVICE);
- k = bch2_btree_iter_peek(iter);
+ move_ctxt_wait_event(ctxt, &trans,
+ atomic_read(&ctxt->read_sectors) <
+ SECTORS_IN_FLIGHT_PER_DEVICE);
- stats->pos = iter->pos;
+ bch2_trans_begin(&trans);
+ k = bch2_btree_iter_peek(&iter);
if (!k.k)
break;
+
ret = bkey_err(k);
+ if (ret == -EINTR)
+ continue;
if (ret)
break;
+
if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
break;
+ stats->pos = iter.pos;
+
if (!bkey_extent_is_direct_data(k.k))
goto next_nondata;
BUG();
}
- /* unlock before doing IO: */
+ /*
+ * The iterator gets unlocked by __bch2_read_extent - need to
+ * save a copy of @k elsewhere:
+ */
bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
- bch2_trans_unlock(&trans);
ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k,
data_cmd, data_opts);
if (ret2) {
- if (ret2 == -EINTR) {
- bch2_trans_begin(&trans);
+ if (ret2 == -EINTR)
continue;
- }
if (ret2 == -ENOMEM) {
/* memory allocation failure, wait for some IO to finish */
- bch2_move_ctxt_wait_for_io(ctxt);
+ bch2_move_ctxt_wait_for_io(ctxt, &trans);
continue;
}
if (rate)
bch2_ratelimit_increment(rate, k.k->size);
next:
- atomic64_add(k.k->size * bch2_bkey_nr_ptrs_allocated(k),
- &stats->sectors_seen);
+ atomic64_add(k.k->size, &stats->sectors_seen);
next_nondata:
- bch2_btree_iter_advance(iter);
- bch2_trans_cond_resched(&trans);
+ bch2_btree_iter_advance(&iter);
}
out:
- bch2_trans_iter_put(&trans, iter);
- ret = bch2_trans_exit(&trans) ?: ret;
+ bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&sk, c);
return ret;
}
+inline void bch_move_stats_init(struct bch_move_stats *stats, char *name)
+{
+ memset(stats, 0, sizeof(*stats));
+
+ scnprintf(stats->name, sizeof(stats->name),
+ "%s", name);
+}
+
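+/*
+ * In-flight data moves are kept on c->data_progress_list so that their name
+ * and current position can be reported while they run:
+ */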
+static inline void progress_list_add(struct bch_fs *c,
+ struct bch_move_stats *stats)
+{
+ mutex_lock(&c->data_progress_lock);
+ list_add(&stats->list, &c->data_progress_list);
+ mutex_unlock(&c->data_progress_lock);
+}
+
+static inline void progress_list_del(struct bch_fs *c,
+ struct bch_move_stats *stats)
+{
+ mutex_lock(&c->data_progress_lock);
+ list_del(&stats->list);
+ mutex_unlock(&c->data_progress_lock);
+}
+
int bch2_move_data(struct bch_fs *c,
enum btree_id start_btree_id, struct bpos start_pos,
enum btree_id end_btree_id, struct bpos end_pos,
enum btree_id id;
int ret;
+ progress_list_add(c, stats);
closure_init_stack(&ctxt.cl);
INIT_LIST_HEAD(&ctxt.reads);
init_waitqueue_head(&ctxt.wait);
}
- move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads));
+ move_ctxt_wait_event(&ctxt, NULL, list_empty(&ctxt.reads));
closure_sync(&ctxt.cl);
EBUG_ON(atomic_read(&ctxt.write_sectors));
atomic64_read(&stats->sectors_moved),
atomic64_read(&stats->keys_moved));
+ progress_list_del(c, stats);
return ret;
}
bool kthread = (current->flags & PF_KTHREAD) != 0;
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct btree *b;
enum btree_id id;
struct data_opts data_opts;
int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
+ progress_list_add(c, stats);
stats->data_type = BCH_DATA_btree;
id++) {
stats->btree_id = id;
- for_each_btree_node(&trans, iter, id,
- id == start_btree_id ? start_pos : POS_MIN,
- BTREE_ITER_PREFETCH, b) {
+ bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0,
+ BTREE_ITER_PREFETCH);
+retry:
+ ret = 0;
+ while (bch2_trans_begin(&trans),
+ (b = bch2_btree_iter_peek_node(&iter)) &&
+ !(ret = PTR_ERR_OR_ZERO(b))) {
if (kthread && kthread_should_stop())
break;
bpos_cmp(b->key.k.p, end_pos)) > 0)
break;
- stats->pos = iter->pos;
+ stats->pos = iter.pos;
switch ((cmd = pred(c, arg, b, &io_opts, &data_opts))) {
case DATA_SKIP:
BUG();
}
- ret = bch2_btree_node_rewrite(&trans, iter,
- b->data->keys.seq, 0) ?: ret;
+ ret = bch2_btree_node_rewrite(&trans, &iter, b, 0) ?: ret;
+ if (ret == -EINTR)
+ continue;
+ if (ret)
+ break;
next:
- bch2_trans_cond_resched(&trans);
+ bch2_btree_iter_next_node(&iter);
}
+ if (ret == -EINTR)
+ goto retry;
+
+ bch2_trans_iter_exit(&trans, &iter);
- ret = bch2_trans_iter_free(&trans, iter) ?: ret;
if (kthread && kthread_should_stop())
break;
}
if (ret)
bch_err(c, "error %i in bch2_move_btree", ret);
+ /* flush relevant btree updates */
+ closure_wait_event(&c->btree_interior_update_wait,
+ !bch2_btree_interior_updates_nr_pending(c));
+
+ progress_list_del(c, stats);
return ret;
}
struct data_opts *data_opts)
{
unsigned nr_good = bch2_bkey_durability(c, k);
- unsigned replicas = 0;
-
- switch (k.k->type) {
- case KEY_TYPE_btree_ptr:
- replicas = c->opts.metadata_replicas;
- break;
- case KEY_TYPE_extent:
- replicas = io_opts->data_replicas;
- break;
- }
+ unsigned replicas = bkey_is_btree_ptr(k.k)
+ ? c->opts.metadata_replicas
+ : io_opts->data_replicas;
if (!nr_good || nr_good >= replicas)
return DATA_SKIP;
switch (op.op) {
case BCH_DATA_OP_REREPLICATE:
+ bch_move_stats_init(stats, "rereplicate");
stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, -1);
op.start_btree, op.start_pos,
op.end_btree, op.end_pos,
rereplicate_btree_pred, c, stats) ?: ret;
-
- closure_wait_event(&c->btree_interior_update_wait,
- !bch2_btree_interior_updates_nr_pending(c));
-
ret = bch2_replicas_gc2(c) ?: ret;
ret = bch2_move_data(c,
if (op.migrate.dev >= c->sb.nr_devices)
return -EINVAL;
+ bch_move_stats_init(stats, "migrate");
stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
ret = bch2_replicas_gc2(c) ?: ret;
break;
case BCH_DATA_OP_REWRITE_OLD_NODES:
+ bch_move_stats_init(stats, "rewrite_old_nodes");
ret = bch2_scan_old_btree_nodes(c, stats);
break;
default:
struct bch_move_stats *,
struct bch_ioctl_data);
+inline void bch_move_stats_init(struct bch_move_stats *stats,
+ char *name);
+
#endif /* _BCACHEFS_MOVE_H */
enum bch_data_type data_type;
enum btree_id btree_id;
struct bpos pos;
+ struct list_head list;
+ char name[32];
atomic64_t keys_moved;
atomic64_t keys_raced;
*/
#include "bcachefs.h"
+#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_iter.h"
#include "btree_update.h"
.dev = p.ptr.dev,
.offset = p.ptr.offset,
};
+ ssize_t i;
- ssize_t i = eytzinger0_find_le(h->data, h->used,
- sizeof(h->data[0]),
- bucket_offset_cmp, &search);
+ if (p.ptr.cached)
+ continue;
+
+ i = eytzinger0_find_le(h->data, h->used,
+ sizeof(h->data[0]),
+ bucket_offset_cmp, &search);
#if 0
/* eytzinger search verify code: */
ssize_t j = -1, k;
BUG_ON(i != j);
#endif
if (i >= 0 &&
+ p.ptr.dev == h->data[i].dev &&
p.ptr.offset < h->data[i].offset + ca->mi.bucket_size &&
p.ptr.gen == h->data[i].gen) {
/*
return cmp_int(l.fragmentation, r.fragmentation);
}
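+/*
+ * Walk the alloc btree (instead of the in-memory bucket array): collect
+ * partially full user data buckets that aren't currently open for writes into
+ * the copygc heap, keyed by fragmentation:
+ */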
+static int walk_buckets_to_copygc(struct bch_fs *c)
+{
+ copygc_heap *h = &c->copygc_heap;
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_alloc_unpacked u;
+ int ret;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, iter.pos.inode);
+ struct copygc_heap_entry e;
+
+ u = bch2_alloc_unpack(k);
+
+ if (u.data_type != BCH_DATA_user ||
+ u.dirty_sectors >= ca->mi.bucket_size ||
+ bch2_bucket_is_open(c, iter.pos.inode, iter.pos.offset))
+ continue;
+
+ e = (struct copygc_heap_entry) {
+ .dev = iter.pos.inode,
+ .gen = u.gen,
+ .replicas = 1 + u.stripe_redundancy,
+ .fragmentation = u.dirty_sectors * (1U << 15)
+ / ca->mi.bucket_size,
+ .sectors = u.dirty_sectors,
+ .offset = bucket_to_sector(ca, iter.pos.offset),
+ };
+ heap_add_or_replace(h, e, -fragmentation_cmp, NULL);
+
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+
+ bch2_trans_exit(&trans);
+ return ret;
+}
+
+static int bucket_inorder_cmp(const void *_l, const void *_r)
+{
+ const struct copygc_heap_entry *l = _l;
+ const struct copygc_heap_entry *r = _r;
+
+ return cmp_int(l->dev, r->dev) ?: cmp_int(l->offset, r->offset);
+}
+
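+/*
+ * Re-read the alloc key for every bucket we tried to evacuate: if the
+ * generation is unchanged and dirty sectors remain, copygc failed to fully
+ * empty that bucket:
+ */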
+static int check_copygc_was_done(struct bch_fs *c,
+ u64 *sectors_not_moved,
+ u64 *buckets_not_moved)
+{
+ copygc_heap *h = &c->copygc_heap;
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_alloc_unpacked u;
+ struct copygc_heap_entry *i;
+ int ret = 0;
+
+ sort(h->data, h->used, sizeof(h->data[0]), bucket_inorder_cmp, NULL);
+
+ bch2_trans_init(&trans, c, 0, 0);
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, 0);
+
+ for (i = h->data; i < h->data + h->used; i++) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev);
+
+ bch2_btree_iter_set_pos(&iter, POS(i->dev, sector_to_bucket(ca, i->offset)));
+
+ ret = lockrestart_do(&trans,
+ bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+ if (ret)
+ break;
+
+ u = bch2_alloc_unpack(k);
+
+ if (u.gen == i->gen && u.dirty_sectors) {
+ *sectors_not_moved += u.dirty_sectors;
+ *buckets_not_moved += 1;
+ }
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+
+ bch2_trans_exit(&trans);
+ return ret;
+}
+
static int bch2_copygc(struct bch_fs *c)
{
copygc_heap *h = &c->copygc_heap;
struct copygc_heap_entry e, *i;
- struct bucket_array *buckets;
struct bch_move_stats move_stats;
- u64 sectors_to_move = 0, sectors_not_moved = 0;
+ u64 sectors_to_move = 0, sectors_to_write = 0, sectors_not_moved = 0;
u64 sectors_reserved = 0;
u64 buckets_to_move, buckets_not_moved = 0;
struct bch_dev *ca;
unsigned dev_idx;
- size_t b, heap_size = 0;
+ size_t heap_size = 0;
int ret;
- memset(&move_stats, 0, sizeof(move_stats));
+ bch_move_stats_init(&move_stats, "copygc");
+
/*
* Find buckets with lowest sector counts, skipping completely
* empty buckets, by building a maxheap sorted by sector count,
spin_lock(&ca->fs->freelist_lock);
sectors_reserved += fifo_used(&ca->free[RESERVE_MOVINGGC]) * ca->mi.bucket_size;
spin_unlock(&ca->fs->freelist_lock);
+ }
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
-
- for (b = buckets->first_bucket; b < buckets->nbuckets; b++) {
- struct bucket *g = buckets->b + b;
- struct bucket_mark m = READ_ONCE(g->mark);
- struct copygc_heap_entry e;
-
- if (m.owned_by_allocator ||
- m.data_type != BCH_DATA_user ||
- !bucket_sectors_used(m) ||
- bucket_sectors_used(m) >= ca->mi.bucket_size)
- continue;
-
- WARN_ON(m.stripe && !g->stripe_redundancy);
-
- e = (struct copygc_heap_entry) {
- .dev = dev_idx,
- .gen = m.gen,
- .replicas = 1 + g->stripe_redundancy,
- .fragmentation = bucket_sectors_used(m) * (1U << 15)
- / ca->mi.bucket_size,
- .sectors = bucket_sectors_used(m),
- .offset = bucket_to_sector(ca, b),
- };
- heap_add_or_replace(h, e, -fragmentation_cmp, NULL);
- }
- up_read(&ca->bucket_lock);
+ ret = walk_buckets_to_copygc(c);
+ if (ret) {
+ bch2_fs_fatal_error(c, "error walking buckets to copygc!");
+ return ret;
}
- if (!sectors_reserved) {
- bch2_fs_fatal_error(c, "stuck, ran out of copygc reserve!");
- return -1;
+ if (!h->used) {
+ bch_err_ratelimited(c, "copygc requested to run but found no buckets to move!");
+ return 0;
}
/*
* Our btree node allocations also come out of RESERVE_MOVINGGC:
*/
- sectors_to_move = (sectors_to_move * 3) / 4;
+ sectors_reserved = (sectors_reserved * 3) / 4;
+ if (!sectors_reserved) {
+ bch2_fs_fatal_error(c, "stuck, ran out of copygc reserve!");
+ return -1;
+ }
- for (i = h->data; i < h->data + h->used; i++)
- sectors_to_move += i->sectors * i->replicas;
+ for (i = h->data; i < h->data + h->used; i++) {
+ sectors_to_move += i->sectors;
+ sectors_to_write += i->sectors * i->replicas;
+ }
- while (sectors_to_move > sectors_reserved) {
+ while (sectors_to_write > sectors_reserved) {
BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL));
- sectors_to_move -= e.sectors * e.replicas;
+ sectors_to_write -= e.sectors * e.replicas;
}
buckets_to_move = h->used;
- if (!buckets_to_move)
+ if (!buckets_to_move) {
+ bch_err_ratelimited(c, "copygc cannot run - sectors_reserved %llu!",
+ sectors_reserved);
return 0;
+ }
eytzinger0_sort(h->data, h->used,
sizeof(h->data[0]),
writepoint_ptr(&c->copygc_write_point),
copygc_pred, NULL,
&move_stats);
+ if (ret) {
+ bch_err(c, "error %i from bch2_move_data() in copygc", ret);
+ return ret;
+ }
- for_each_rw_member(ca, c, dev_idx) {
- down_read(&ca->bucket_lock);
- buckets = bucket_array(ca);
- for (i = h->data; i < h->data + h->used; i++) {
- struct bucket_mark m;
- size_t b;
-
- if (i->dev != dev_idx)
- continue;
-
- b = sector_to_bucket(ca, i->offset);
- m = READ_ONCE(buckets->b[b].mark);
-
- if (i->gen == m.gen &&
- bucket_sectors_used(m)) {
- sectors_not_moved += bucket_sectors_used(m);
- buckets_not_moved++;
- }
- }
- up_read(&ca->bucket_lock);
+ ret = check_copygc_was_done(c, &sectors_not_moved, &buckets_not_moved);
+ if (ret) {
+ bch_err(c, "error %i from check_copygc_was_done()", ret);
+ return ret;
}
- if (sectors_not_moved && !ret)
+ if (sectors_not_moved)
bch_warn_ratelimited(c,
"copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved (move stats: moved %llu sectors, raced %llu keys, %llu sectors)",
sectors_not_moved, sectors_to_move,
NULL
};
+const char * const bch2_csum_types[] = {
+ BCH_CSUM_TYPES()
+ NULL
+};
+
const char * const bch2_csum_opts[] = {
BCH_CSUM_OPTS()
NULL
};
+const char * const bch2_compression_types[] = {
+ BCH_COMPRESSION_TYPES()
+ NULL
+};
+
const char * const bch2_compression_opts[] = {
BCH_COMPRESSION_OPTS()
NULL
};
const char * const bch2_str_hash_types[] = {
+ BCH_STR_HASH_TYPES()
+ NULL
+};
+
+const char * const bch2_str_hash_opts[] = {
BCH_STR_HASH_OPTS()
NULL
};
NULL
};
-const char * const bch2_cache_replacement_policies[] = {
- BCH_CACHE_REPLACEMENT_POLICIES()
+const char * const bch2_member_states[] = {
+ BCH_MEMBER_STATES()
NULL
};
-const char * const bch2_member_states[] = {
- BCH_MEMBER_STATES()
+const char * const bch2_jset_entry_types[] = {
+ BCH_JSET_ENTRY_TYPES()
+ NULL
+};
+
+const char * const bch2_fs_usage_types[] = {
+ BCH_FS_USAGE_TYPES()
NULL
};
#undef x
-const char * const bch2_d_types[DT_MAX] = {
+const char * const bch2_d_types[BCH_DT_MAX] = {
[DT_UNKNOWN] = "unknown",
[DT_FIFO] = "fifo",
[DT_CHR] = "chr",
[DT_LNK] = "lnk",
[DT_SOCK] = "sock",
[DT_WHT] = "whiteout",
+ [DT_SUBVOL] = "subvol",
};
void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
}
}
-/*
- * Initial options from superblock - here we don't want any options undefined,
- * any options the superblock doesn't specify are set to 0:
- */
-struct bch_opts bch2_opts_from_sb(struct bch_sb *sb)
-{
- struct bch_opts opts = bch2_opts_empty();
-
-#define x(_name, _bits, _mode, _type, _sb_opt, ...) \
- if (_sb_opt != NO_SB_OPT) \
- opt_set(opts, _name, _sb_opt(sb));
- BCH_OPTS()
-#undef x
-
- return opts;
-}
-
const struct bch_option bch2_opt_table[] = {
-#define OPT_BOOL() .type = BCH_OPT_BOOL
-#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, .min = _min, .max = _max
-#define OPT_SECTORS(_min, _max) .type = BCH_OPT_SECTORS, .min = _min, .max = _max
-#define OPT_STR(_choices) .type = BCH_OPT_STR, .choices = _choices
+#define OPT_BOOL() .type = BCH_OPT_BOOL, .min = 0, .max = 2
+#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \
+ .min = _min, .max = _max
+#define OPT_STR(_choices) .type = BCH_OPT_STR, \
+ .min = 0, .max = ARRAY_SIZE(_choices),\
+ .choices = _choices
#define OPT_FN(_fn) .type = BCH_OPT_FN, \
.parse = _fn##_parse, \
.to_text = _fn##_to_text
-#define x(_name, _bits, _mode, _type, _sb_opt, _default, _hint, _help) \
+#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \
[Opt_##_name] = { \
.attr = { \
.name = #_name, \
- .mode = (_mode) & OPT_RUNTIME ? 0644 : 0444, \
+ .mode = (_flags) & OPT_RUNTIME ? 0644 : 0444, \
}, \
- .mode = _mode, \
+ .flags = _flags, \
.hint = _hint, \
.help = _help, \
+ .get_sb = _sb_opt, \
.set_sb = SET_##_sb_opt, \
_type \
},
return bch2_opt_lookup(name);
}
-int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt,
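+/*
+ * Range and alignment checks, shared by the parse and superblock paths; @msg
+ * prefixes error messages, or suppresses them when NULL:
+ */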
+static int bch2_opt_validate(const struct bch_option *opt, const char *msg, u64 v)
+{
+ if (v < opt->min) {
+ if (msg)
+ pr_err("invalid %s%s: too small (min %llu)",
+ msg, opt->attr.name, opt->min);
+ return -ERANGE;
+ }
+
+ if (opt->max && v >= opt->max) {
+ if (msg)
+ pr_err("invalid %s%s: too big (max %llu)",
+ msg, opt->attr.name, opt->max);
+ return -ERANGE;
+ }
+
+ if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) {
+ if (msg)
+ pr_err("invalid %s %s: not a multiple of 512",
+ msg, opt->attr.name);
+ return -EINVAL;
+ }
+
+ if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) {
+ if (msg)
+ pr_err("invalid %s%s: must be a power of two",
+ msg, opt->attr.name);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int bch2_opt_parse(struct bch_fs *c, const char *msg,
+ const struct bch_option *opt,
const char *val, u64 *res)
{
ssize_t ret;
ret = kstrtou64(val, 10, res);
if (ret < 0)
return ret;
-
- if (*res > 1)
- return -ERANGE;
break;
case BCH_OPT_UINT:
- ret = kstrtou64(val, 10, res);
- if (ret < 0)
- return ret;
-
- if (*res < opt->min || *res >= opt->max)
- return -ERANGE;
- break;
- case BCH_OPT_SECTORS:
- ret = bch2_strtou64_h(val, res);
+ ret = opt->flags & OPT_HUMAN_READABLE
+ ? bch2_strtou64_h(val, res)
+ : kstrtou64(val, 10, res);
if (ret < 0)
return ret;
-
- if (*res & 511)
- return -EINVAL;
-
- *res >>= 9;
-
- if (*res < opt->min || *res >= opt->max)
- return -ERANGE;
break;
case BCH_OPT_STR:
ret = match_string(opt->choices, -1, val);
if (!c)
return 0;
- return opt->parse(c, val, res);
+ ret = opt->parse(c, val, res);
+ if (ret < 0)
+ return ret;
}
- return 0;
+ return bch2_opt_validate(opt, msg, *res);
}
void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c,
switch (opt->type) {
case BCH_OPT_BOOL:
case BCH_OPT_UINT:
- pr_buf(out, "%lli", v);
- break;
- case BCH_OPT_SECTORS:
- bch2_hprint(out, v);
+ if (opt->flags & OPT_HUMAN_READABLE)
+ bch2_hprint(out, v);
+ else
+ pr_buf(out, "%lli", v);
break;
case BCH_OPT_STR:
if (flags & OPT_SHOW_FULL_LIST)
if (id < 0)
goto bad_opt;
- ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v);
+ ret = bch2_opt_parse(c, "mount option ",
+ &bch2_opt_table[id], val, &v);
if (ret < 0)
goto bad_val;
} else {
goto no_val;
}
- if (!(bch2_opt_table[id].mode & OPT_MOUNT))
+ if (!(bch2_opt_table[id].flags & OPT_MOUNT))
goto bad_opt;
if (id == Opt_acl &&
return ret;
}
+/*
+ * Initial options from superblock - here we don't want any options undefined,
+ * any options the superblock doesn't specify are set to 0:
+ */
+int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb)
+{
+ unsigned id;
+ int ret;
+
+ for (id = 0; id < bch2_opts_nr; id++) {
+ const struct bch_option *opt = bch2_opt_table + id;
+ u64 v;
+
+ if (opt->get_sb == NO_SB_OPT)
+ continue;
+
+ v = opt->get_sb(sb);
+
+ if (opt->flags & OPT_SB_FIELD_ILOG2)
+ v = 1ULL << v;
+
+ if (opt->flags & OPT_SB_FIELD_SECTORS)
+ v <<= 9;
+
+ ret = bch2_opt_validate(opt, "superblock option ", v);
+ if (ret)
+ return ret;
+
+ bch2_opt_set_by_id(opts, id, v);
+ }
+
+ return 0;
+}
+
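+/*
+ * Storing an option in the superblock applies the inverse of the transforms
+ * in bch2_opts_from_sb(): scale back down to sectors, then take ilog2 if the
+ * field is stored as a shift:
+ */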
+void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v)
+{
+ if (opt->set_sb == SET_NO_SB_OPT)
+ return;
+
+ if (opt->flags & OPT_SB_FIELD_SECTORS)
+ v >>= 9;
+
+ if (opt->flags & OPT_SB_FIELD_ILOG2)
+ v = ilog2(v);
+
+ opt->set_sb(sb, v);
+}
+
+void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v)
+{
+ if (opt->set_sb == SET_NO_SB_OPT)
+ return;
+
+ mutex_lock(&c->sb_lock);
+ __bch2_opt_set_sb(c->disk_sb.sb, opt, v);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+}
+
/* io opts: */
struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src)
extern const char * const bch2_sb_features[];
extern const char * const bch2_sb_compat[];
extern const char * const bch2_btree_ids[];
+extern const char * const bch2_csum_types[];
extern const char * const bch2_csum_opts[];
+extern const char * const bch2_compression_types[];
extern const char * const bch2_compression_opts[];
extern const char * const bch2_str_hash_types[];
+extern const char * const bch2_str_hash_opts[];
extern const char * const bch2_data_types[];
-extern const char * const bch2_cache_replacement_policies[];
extern const char * const bch2_member_states[];
+extern const char * const bch2_jset_entry_types[];
+extern const char * const bch2_fs_usage_types[];
extern const char * const bch2_d_types[];
+static inline const char *bch2_d_type_str(unsigned d_type)
+{
+ return (d_type < BCH_DT_MAX ? bch2_d_types[d_type] : NULL) ?: "(bad d_type)";
+}
+
/*
* Mount options; we also store defaults in the superblock.
*
LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0);
/* When can be set: */
-enum opt_mode {
- OPT_FORMAT = (1 << 0),
- OPT_MOUNT = (1 << 1),
- OPT_RUNTIME = (1 << 2),
- OPT_INODE = (1 << 3),
- OPT_DEVICE = (1 << 4),
+enum opt_flags {
+ OPT_FS = (1 << 0), /* Filesystem option */
+ OPT_DEVICE = (1 << 1), /* Device option */
+ OPT_INODE = (1 << 2), /* Inode option */
+ OPT_FORMAT = (1 << 3), /* May be specified at format time */
+ OPT_MOUNT = (1 << 4), /* May be specified at mount time */
+ OPT_RUNTIME = (1 << 5), /* May be specified at runtime */
+ OPT_HUMAN_READABLE = (1 << 6),
+ OPT_MUST_BE_POW_2 = (1 << 7), /* Must be power of 2 */
+ OPT_SB_FIELD_SECTORS = (1 << 8),/* Superblock field is >> 9 of actual value */
+ OPT_SB_FIELD_ILOG2 = (1 << 9), /* Superblock field is ilog2 of actual value */
};
enum opt_type {
BCH_OPT_BOOL,
BCH_OPT_UINT,
- BCH_OPT_SECTORS,
BCH_OPT_STR,
BCH_OPT_FN,
};
*/
#ifdef __KERNEL__
-#define RATELIMIT_ERRORS true
+#define RATELIMIT_ERRORS_DEFAULT true
#else
-#define RATELIMIT_ERRORS false
+#define RATELIMIT_ERRORS_DEFAULT false
#endif
#define BCH_OPTS() \
x(block_size, u16, \
- OPT_FORMAT, \
- OPT_SECTORS(1, 128), \
+ OPT_FS|OPT_FORMAT| \
+ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \
+ OPT_UINT(512, 1U << 16), \
BCH_SB_BLOCK_SIZE, 8, \
"size", NULL) \
- x(btree_node_size, u16, \
- OPT_FORMAT, \
- OPT_SECTORS(1, 512), \
+ x(btree_node_size, u32, \
+ OPT_FS|OPT_FORMAT| \
+ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \
+ OPT_UINT(512, 1U << 20), \
BCH_SB_BTREE_NODE_SIZE, 512, \
"size", "Btree node size, default 256k") \
x(errors, u8, \
- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_STR(bch2_error_actions), \
BCH_SB_ERROR_ACTION, BCH_ON_ERROR_ro, \
NULL, "Action to take on filesystem error") \
x(metadata_replicas, u8, \
- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_UINT(1, BCH_REPLICAS_MAX), \
BCH_SB_META_REPLICAS_WANT, 1, \
"#", "Number of metadata replicas") \
x(data_replicas, u8, \
- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_UINT(1, BCH_REPLICAS_MAX), \
BCH_SB_DATA_REPLICAS_WANT, 1, \
"#", "Number of data replicas") \
x(metadata_replicas_required, u8, \
- OPT_FORMAT|OPT_MOUNT, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \
OPT_UINT(1, BCH_REPLICAS_MAX), \
BCH_SB_META_REPLICAS_REQ, 1, \
"#", NULL) \
x(data_replicas_required, u8, \
- OPT_FORMAT|OPT_MOUNT, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \
OPT_UINT(1, BCH_REPLICAS_MAX), \
BCH_SB_DATA_REPLICAS_REQ, 1, \
"#", NULL) \
+ x(encoded_extent_max, u32, \
+ OPT_FS|OPT_FORMAT| \
+ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS|OPT_SB_FIELD_ILOG2,\
+ OPT_UINT(4096, 2U << 20), \
+ BCH_SB_ENCODED_EXTENT_MAX_BITS, 64 << 10, \
+ "size", "Maximum size of checksummed/compressed extents")\
x(metadata_checksum, u8, \
- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_STR(bch2_csum_opts), \
BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
NULL, NULL) \
x(data_checksum, u8, \
- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_STR(bch2_csum_opts), \
BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
NULL, NULL) \
x(compression, u8, \
- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_STR(bch2_compression_opts), \
BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \
NULL, NULL) \
x(background_compression, u8, \
- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_STR(bch2_compression_opts), \
BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \
NULL, NULL) \
x(str_hash, u8, \
- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_STR(bch2_str_hash_types), \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_STR(bch2_str_hash_opts), \
BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \
NULL, "Hash function for directory entries and xattrs")\
x(metadata_target, u16, \
- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_FN(bch2_opt_target), \
BCH_SB_METADATA_TARGET, 0, \
"(target)", "Device or disk group for metadata writes") \
x(foreground_target, u16, \
- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_FN(bch2_opt_target), \
BCH_SB_FOREGROUND_TARGET, 0, \
"(target)", "Device or disk group for foreground writes") \
x(background_target, u16, \
- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_FN(bch2_opt_target), \
BCH_SB_BACKGROUND_TARGET, 0, \
"(target)", "Device or disk group to move data to in the background")\
x(promote_target, u16, \
- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_FN(bch2_opt_target), \
BCH_SB_PROMOTE_TARGET, 0, \
"(target)", "Device or disk group to promote data to on read")\
x(erasure_code, u16, \
- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
BCH_SB_ERASURE_CODE, false, \
NULL, "Enable erasure coding (DO NOT USE YET)") \
x(inodes_32bit, u8, \
- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
BCH_SB_INODE_32BIT, true, \
NULL, "Constrain inode numbers to 32 bits") \
x(shard_inode_numbers, u8, \
- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
- BCH_SB_SHARD_INUMS, false, \
+ BCH_SB_SHARD_INUMS, true, \
NULL, "Shard new inode numbers by CPU id") \
x(inodes_use_key_cache, u8, \
- OPT_FORMAT|OPT_MOUNT, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \
OPT_BOOL(), \
BCH_SB_INODES_USE_KEY_CACHE, true, \
NULL, "Use the btree key cache for the inodes btree") \
x(btree_node_mem_ptr_optimization, u8, \
- OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
NO_SB_OPT, true, \
NULL, "Stash pointer to in memory btree node in btree ptr")\
x(gc_reserve_percent, u8, \
- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_UINT(5, 21), \
BCH_SB_GC_RESERVE, 8, \
"%", "Percentage of disk space to reserve for copygc")\
x(gc_reserve_bytes, u64, \
- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_SECTORS(0, U64_MAX), \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME| \
+ OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \
+ OPT_UINT(0, U64_MAX), \
BCH_SB_GC_RESERVE_BYTES, 0, \
"%", "Amount of disk space to reserve for copygc\n" \
"Takes precedence over gc_reserve_percent if set")\
x(root_reserve_percent, u8, \
- OPT_FORMAT|OPT_MOUNT, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \
OPT_UINT(0, 100), \
BCH_SB_ROOT_RESERVE, 0, \
"%", "Percentage of disk space to reserve for superuser")\
x(wide_macs, u8, \
- OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
BCH_SB_128_BIT_MACS, false, \
NULL, "Store full 128 bits of cryptographic MACs, instead of 80")\
x(inline_data, u8, \
- OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
NO_SB_OPT, true, \
NULL, "Enable inline data extents") \
x(acl, u8, \
- OPT_FORMAT|OPT_MOUNT, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \
OPT_BOOL(), \
BCH_SB_POSIX_ACL, true, \
NULL, "Enable POSIX acls") \
x(usrquota, u8, \
- OPT_FORMAT|OPT_MOUNT, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \
OPT_BOOL(), \
BCH_SB_USRQUOTA, false, \
NULL, "Enable user quotas") \
x(grpquota, u8, \
- OPT_FORMAT|OPT_MOUNT, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \
OPT_BOOL(), \
BCH_SB_GRPQUOTA, false, \
NULL, "Enable group quotas") \
x(prjquota, u8, \
- OPT_FORMAT|OPT_MOUNT, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \
OPT_BOOL(), \
BCH_SB_PRJQUOTA, false, \
NULL, "Enable project quotas") \
x(degraded, u8, \
- OPT_MOUNT, \
+ OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
NO_SB_OPT, false, \
NULL, "Allow mounting in degraded mode") \
x(very_degraded, u8, \
- OPT_MOUNT, \
+ OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
NO_SB_OPT, false, \
NULL, "Allow mounting in when data will be missing") \
x(discard, u8, \
- OPT_MOUNT|OPT_DEVICE, \
+ OPT_FS|OPT_MOUNT|OPT_DEVICE, \
OPT_BOOL(), \
NO_SB_OPT, false, \
NULL, "Enable discard/TRIM support") \
x(verbose, u8, \
- OPT_MOUNT, \
+ OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
NO_SB_OPT, false, \
NULL, "Extra debugging information during mount/recovery")\
+ x(journal_flush_delay, u32, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(0, U32_MAX), \
+ BCH_SB_JOURNAL_FLUSH_DELAY, 1000, \
+ NULL, "Delay in milliseconds before automatic journal commits")\
x(journal_flush_disabled, u8, \
- OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
- NO_SB_OPT, false, \
+ BCH_SB_JOURNAL_FLUSH_DISABLED,false, \
NULL, "Disable journal flush on sync/fsync\n" \
"If enabled, writes can be lost, but only since the\n"\
"last journal write (default 1 second)") \
+ x(journal_reclaim_delay, u32, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(0, U32_MAX), \
+ BCH_SB_JOURNAL_RECLAIM_DELAY, 100, \
+ NULL, "Delay in milliseconds before automatic journal reclaim")\
x(fsck, u8, \
- OPT_MOUNT, \
+ OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
NO_SB_OPT, false, \
NULL, "Run fsck on mount") \
x(fix_errors, u8, \
- OPT_MOUNT, \
+ OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
NO_SB_OPT, false, \
NULL, "Fix errors during fsck without asking") \
x(ratelimit_errors, u8, \
- OPT_MOUNT, \
+ OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
- NO_SB_OPT, RATELIMIT_ERRORS, \
+ NO_SB_OPT, RATELIMIT_ERRORS_DEFAULT, \
NULL, "Ratelimit error messages during fsck") \
x(nochanges, u8, \
- OPT_MOUNT, \
+ OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
NO_SB_OPT, false, \
NULL, "Super read only mode - no writes at all will be issued,\n"\
"even if we have to replay the journal") \
x(norecovery, u8, \
- OPT_MOUNT, \
+ OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
NO_SB_OPT, false, \
NULL, "Don't replay the journal") \
x(rebuild_replicas, u8, \
- OPT_MOUNT, \
+ OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
NO_SB_OPT, false, \
NULL, "Rebuild the superblock replicas section") \
x(keep_journal, u8, \
- OPT_MOUNT, \
+ 0, \
OPT_BOOL(), \
NO_SB_OPT, false, \
NULL, "Don't free journal entries/keys after startup")\
OPT_BOOL(), \
NO_SB_OPT, false, \
NULL, "Read all journal entries, not just dirty ones")\
+ x(journal_transaction_names, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH_SB_JOURNAL_TRANSACTION_NAMES, true, \
+ NULL, "Log transaction function names in journal") \
x(noexcl, u8, \
- OPT_MOUNT, \
+ OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
NO_SB_OPT, false, \
NULL, "Don't open device in exclusive mode") \
NO_SB_OPT, BCH_SB_SECTOR, \
"offset", "Sector offset of superblock") \
x(read_only, u8, \
- 0, \
+ OPT_FS, \
OPT_BOOL(), \
NO_SB_OPT, false, \
NULL, NULL) \
NO_SB_OPT, false, \
NULL, "Don\'t start filesystem, only open devices") \
x(reconstruct_alloc, u8, \
- OPT_MOUNT, \
+ OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
NO_SB_OPT, false, \
NULL, "Reconstruct alloc btree") \
x(version_upgrade, u8, \
- OPT_MOUNT, \
+ OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
NO_SB_OPT, false, \
NULL, "Set superblock to latest version,\n" \
"allowing any new features to be used") \
+ x(buckets_nouse, u8, \
+ 0, \
+ OPT_BOOL(), \
+ NO_SB_OPT, false, \
+ NULL, "Allocate the buckets_nouse bitmap") \
x(project, u8, \
OPT_INODE, \
OPT_BOOL(), \
NULL, NULL) \
x(fs_size, u64, \
OPT_DEVICE, \
- OPT_SECTORS(0, S64_MAX), \
+ OPT_UINT(0, S64_MAX), \
NO_SB_OPT, 0, \
"size", "Size of filesystem on device") \
x(bucket, u32, \
OPT_DEVICE, \
- OPT_SECTORS(0, S64_MAX), \
+ OPT_UINT(0, S64_MAX), \
NO_SB_OPT, 0, \
"size", "Size of filesystem on device") \
x(durability, u8, \
struct bch_option {
struct attribute attr;
+ u64 (*get_sb)(const struct bch_sb *);
void (*set_sb)(struct bch_sb *, u64);
- enum opt_mode mode;
enum opt_type type;
+ enum opt_flags flags;
+ u64 min, max;
union {
struct {
- u64 min, max;
};
struct {
const char * const *choices;
u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id);
void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
-struct bch_opts bch2_opts_from_sb(struct bch_sb *);
+int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *);
+void __bch2_opt_set_sb(struct bch_sb *, const struct bch_option *, u64);
+void bch2_opt_set_sb(struct bch_fs *, const struct bch_option *, u64);
int bch2_opt_lookup(const char *);
-int bch2_opt_parse(struct bch_fs *, const struct bch_option *, const char *, u64 *);
+int bch2_opt_parse(struct bch_fs *, const char *, const struct bch_option *,
+ const char *, u64 *);
#define OPT_SHOW_FULL_LIST (1 << 0)
#define OPT_SHOW_MOUNT_STYLE (1 << 1)
#include "btree_update.h"
#include "inode.h"
#include "quota.h"
+#include "subvolume.h"
#include "super-io.h"
-static const char *bch2_sb_validate_quota(struct bch_sb *sb,
- struct bch_sb_field *f)
+static int bch2_sb_validate_quota(struct bch_sb *sb, struct bch_sb_field *f,
+ struct printbuf *err)
{
struct bch_sb_field_quota *q = field_to_type(f, quota);
- if (vstruct_bytes(&q->field) != sizeof(*q))
- return "invalid field quota: wrong size";
+ if (vstruct_bytes(&q->field) < sizeof(*q)) {
+ pr_buf(err, "wrong size (got %llu should be %zu)",
+ vstruct_bytes(&q->field), sizeof(*q));
+ return -EINVAL;
+ }
- return NULL;
+ return 0;
}
const struct bch_sb_field_ops bch_sb_field_ops_quota = {
static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
int ret = 0;
if (ret)
break;
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
- return bch2_trans_exit(&trans) ?: ret;
+ bch2_trans_exit(&trans);
+ return ret;
}
void bch2_fs_quota_exit(struct bch_fs *c)
}
}
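+/*
+ * Account one inode's space and inode-count usage; returns 1 at the end of
+ * the btree, which terminates the lockrestart_do() loop in
+ * bch2_fs_quota_read():
+ */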
+static int bch2_fs_quota_read_inode(struct btree_trans *trans,
+ struct btree_iter *iter)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_inode_unpacked u;
+ struct bch_subvolume subvolume;
+ struct bkey_s_c k;
+ int ret;
+
+ k = bch2_btree_iter_peek(iter);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ if (!k.k)
+ return 1;
+
+ ret = bch2_snapshot_get_subvol(trans, k.k->p.snapshot, &subvolume);
+ if (ret)
+ return ret;
+
+ /*
+ * We don't do quota accounting in snapshots:
+ */
+ if (BCH_SUBVOLUME_SNAP(&subvolume))
+ goto advance;
+
+ if (!bkey_is_inode(k.k))
+ goto advance;
+
+ ret = bch2_inode_unpack(k, &u);
+ if (ret)
+ return ret;
+
+ bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors,
+ KEY_TYPE_QUOTA_NOCHECK);
+ bch2_quota_acct(c, bch_qid(&u), Q_INO, 1,
+ KEY_TYPE_QUOTA_NOCHECK);
+advance:
+ bch2_btree_iter_set_pos(iter, POS(iter->pos.inode, iter->pos.offset + 1));
+ return 0;
+}
+
int bch2_fs_quota_read(struct bch_fs *c)
{
unsigned i, qtypes = enabled_qtypes(c);
struct bch_memquota_type *q;
struct btree_trans trans;
- struct btree_iter *iter;
- struct bch_inode_unpacked u;
- struct bkey_s_c k;
+ struct btree_iter iter;
int ret;
mutex_lock(&c->sb_lock);
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
- BTREE_ITER_PREFETCH, k, ret) {
- switch (k.k->type) {
- case KEY_TYPE_inode:
- ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u);
- if (ret)
- return ret;
-
- bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors,
- KEY_TYPE_QUOTA_NOCHECK);
- bch2_quota_acct(c, bch_qid(&u), Q_INO, 1,
- KEY_TYPE_QUOTA_NOCHECK);
- }
- }
- bch2_trans_iter_put(&trans, iter);
-
- return bch2_trans_exit(&trans) ?: ret;
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS);
+ do {
+ ret = lockrestart_do(&trans,
+ bch2_fs_quota_read_inode(&trans, &iter));
+ } while (!ret);
+ bch2_trans_iter_exit(&trans, &iter);
+
+ bch2_trans_exit(&trans);
+ return ret < 0 ? ret : 0;
}
/* Enable/disable/delete quotas for an entire filesystem: */
ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
POS(QTYP_USR, 0),
POS(QTYP_USR + 1, 0),
- NULL);
+ 0, NULL);
if (ret)
return ret;
}
ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
POS(QTYP_GRP, 0),
POS(QTYP_GRP + 1, 0),
- NULL);
+ 0, NULL);
if (ret)
return ret;
}
ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
POS(QTYP_PRJ, 0),
POS(QTYP_PRJ + 1, 0),
- NULL);
+ 0, NULL);
if (ret)
return ret;
}
struct bkey_i_quota *new_quota,
struct qc_dqblk *qdq)
{
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
int ret;
- iter = bch2_trans_get_iter(trans, BTREE_ID_quotas, new_quota->k.p,
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek_slot(iter);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_quotas, new_quota->k.p,
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (unlikely(ret))
if (qdq->d_fieldmask & QC_INO_HARD)
new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit);
- ret = bch2_trans_update(trans, iter, &new_quota->k_i, 0);
- bch2_trans_iter_put(trans, iter);
+ ret = bch2_trans_update(trans, &iter, &new_quota->k_i, 0);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
struct bch_fs_rebalance *r = &c->rebalance;
struct io_clock *clock = &c->io_clock[WRITE];
struct rebalance_work w, p;
+ struct bch_move_stats move_stats;
unsigned long start, prev_start;
unsigned long prev_run_time, prev_run_cputime;
unsigned long cputime, prev_cputime;
prev_start = jiffies;
prev_cputime = curr_cputime();
+ bch_move_stats_init(&move_stats, "rebalance");
while (!kthread_wait_freezable(r->enabled)) {
cond_resched();
prev_cputime = cputime;
r->state = REBALANCE_RUNNING;
- memset(&r->move_stats, 0, sizeof(r->move_stats));
+ bch_move_stats_init(&move_stats, "rebalance");
rebalance_work_reset(c);
bch2_move_data(c,
NULL, /* &r->pd.rate, */
writepoint_ptr(&c->rebalance_write_point),
rebalance_pred, NULL,
- &r->move_stats);
+ &move_stats);
}
return 0;
h1);
break;
case REBALANCE_RUNNING:
- pr_buf(out, "running\n"
- "pos ");
- bch2_bpos_to_text(out, r->move_stats.pos);
- pr_buf(out, "\n");
+ pr_buf(out, "running\n");
break;
}
}
enum rebalance_state state;
u64 throttled_until_iotime;
unsigned long throttled_until_cputime;
- struct bch_move_stats move_stats;
unsigned enabled:1;
};
#include "quota.h"
#include "recovery.h"
#include "replicas.h"
+#include "subvolume.h"
#include "super-io.h"
#include <linux/sort.h>
static int __journal_key_cmp(enum btree_id l_btree_id,
unsigned l_level,
struct bpos l_pos,
- struct journal_key *r)
+ const struct journal_key *r)
{
return (cmp_int(l_btree_id, r->btree_id) ?:
cmp_int(l_level, r->level) ?:
bpos_cmp(l_pos, r->k->k.p));
}
-static int journal_key_cmp(struct journal_key *l, struct journal_key *r)
+static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
{
- return (cmp_int(l->btree_id, r->btree_id) ?:
- cmp_int(l->level, r->level) ?:
- bpos_cmp(l->k->k.p, r->k->k.p));
+ return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
}
-static size_t journal_key_search(struct journal_keys *journal_keys,
- enum btree_id id, unsigned level,
- struct bpos pos)
+size_t bch2_journal_key_search(struct journal_keys *journal_keys,
+ enum btree_id id, unsigned level,
+ struct bpos pos)
{
size_t l = 0, r = journal_keys->nr, m;
iter->idx++;
}
-int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
- unsigned level, struct bkey_i *k)
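+/*
+ * On success, ownership of @k passes to journal_keys; the caller must not
+ * free it:
+ */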
+int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
+ unsigned level, struct bkey_i *k)
{
struct journal_key n = {
.btree_id = id,
.level = level,
.k = k,
- .allocated = true
+ .allocated = true,
+ /*
+ * Ensure these keys are done last by journal replay, to unblock
+ * journal reclaim:
+ */
+ .journal_seq = U32_MAX,
};
struct journal_keys *keys = &c->journal_keys;
struct journal_iter *iter;
- unsigned idx = journal_key_search(keys, id, level, k->k.p);
+ size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
+
+ BUG_ON(test_bit(BCH_FS_RW, &c->flags));
if (idx < keys->nr &&
journal_key_cmp(&n, &keys->d[idx]) == 0) {
return 0;
}
-int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
- unsigned level, struct bpos pos)
+/*
+ * Can only be used from the recovery thread while we're still RO - can't be
+ * used once we've got RW, as journal_keys is at that point used by multiple
+ * threads:
+ */
+int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
+ unsigned level, struct bkey_i *k)
{
- struct bkey_i *whiteout =
- kmalloc(sizeof(struct bkey), GFP_KERNEL);
+ struct bkey_i *n;
int ret;
- if (!whiteout) {
- bch_err(c, "%s: error allocating new key", __func__);
+ n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
+ if (!n)
return -ENOMEM;
- }
-
- bkey_init(&whiteout->k);
- whiteout->k.p = pos;
- ret = bch2_journal_key_insert(c, id, level, whiteout);
+ bkey_copy(n, k);
+ ret = bch2_journal_key_insert_take(c, id, level, n);
if (ret)
- kfree(whiteout);
+ kfree(n);
return ret;
}
+int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
+ unsigned level, struct bpos pos)
+{
+ struct bkey_i whiteout;
+
+ bkey_init(&whiteout.k);
+ whiteout.k.p = pos;
+
+ return bch2_journal_key_insert(c, id, level, &whiteout);
+}
+
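+/*
+ * Mark the matching journal key as overwritten in the btree, so that journal
+ * iterators and journal replay skip it:
+ */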
+void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
+ unsigned level, struct bpos pos)
+{
+ struct journal_keys *keys = &c->journal_keys;
+ size_t idx = bch2_journal_key_search(keys, btree, level, pos);
+
+ if (idx < keys->nr &&
+ keys->d[idx].btree_id == btree &&
+ keys->d[idx].level == level &&
+ !bpos_cmp(keys->d[idx].k->k.p, pos))
+ keys->d[idx].overwritten = true;
+}
+
static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter)
{
- struct journal_key *k = iter->idx - iter->keys->nr
- ? iter->keys->d + iter->idx : NULL;
+ struct journal_key *k = iter->keys->d + iter->idx;
+
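+ /* skip keys that have since been overwritten in the btree: */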
+ while (k < iter->keys->d + iter->keys->nr &&
+ k->btree_id == iter->btree_id &&
+ k->level == iter->level) {
+ if (!k->overwritten)
+ return k->k;
- if (k &&
- k->btree_id == iter->btree_id &&
- k->level == iter->level)
- return k->k;
+ iter->idx++;
+ k = iter->keys->d + iter->idx;
+ }
- iter->idx = iter->keys->nr;
return NULL;
}
iter->btree_id = id;
iter->level = level;
iter->keys = &c->journal_keys;
- iter->idx = journal_key_search(&c->journal_keys, id, level, pos);
- list_add(&iter->list, &c->journal_iters);
+ iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos);
}
static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
bch2_journal_iter_exit(&iter->journal);
}
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
- struct bch_fs *c,
- struct btree *b)
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
+ struct bch_fs *c,
+ struct btree *b,
+ struct btree_node_iter node_iter,
+ struct bpos pos)
{
memset(iter, 0, sizeof(*iter));
iter->b = b;
- bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b);
- bch2_journal_iter_init(c, &iter->journal,
- b->c.btree_id, b->c.level, b->data->min_key);
-}
-
-/* Walk btree, overlaying keys from the journal: */
-
-static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b,
- struct btree_and_journal_iter iter)
-{
- unsigned i = 0, nr = b->c.level > 1 ? 2 : 16;
- struct bkey_s_c k;
- struct bkey_buf tmp;
-
- BUG_ON(!b->c.level);
-
- bch2_bkey_buf_init(&tmp);
-
- while (i < nr &&
- (k = bch2_btree_and_journal_iter_peek(&iter)).k) {
- bch2_bkey_buf_reassemble(&tmp, c, k);
-
- bch2_btree_node_prefetch(c, NULL, tmp.k,
- b->c.btree_id, b->c.level - 1);
-
- bch2_btree_and_journal_iter_advance(&iter);
- i++;
- }
-
- bch2_bkey_buf_exit(&tmp, c);
-}
-
-static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b,
- enum btree_id btree_id,
- btree_walk_key_fn key_fn)
-{
- struct btree_and_journal_iter iter;
- struct bkey_s_c k;
- struct bkey_buf tmp;
- struct btree *child;
- int ret = 0;
-
- bch2_bkey_buf_init(&tmp);
- bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
-
- while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
- if (b->c.level) {
- bch2_bkey_buf_reassemble(&tmp, c, k);
-
- child = bch2_btree_node_get_noiter(c, tmp.k,
- b->c.btree_id, b->c.level - 1,
- false);
-
- ret = PTR_ERR_OR_ZERO(child);
- if (ret)
- break;
-
- btree_and_journal_iter_prefetch(c, b, iter);
-
- ret = bch2_btree_and_journal_walk_recurse(c, child,
- btree_id, key_fn);
- six_unlock_read(&child->c.lock);
- } else {
- ret = key_fn(c, k);
- }
-
- if (ret)
- break;
-
- bch2_btree_and_journal_iter_advance(&iter);
- }
-
- bch2_btree_and_journal_iter_exit(&iter);
- bch2_bkey_buf_exit(&tmp, c);
- return ret;
+ iter->node_iter = node_iter;
+ bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos);
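+ /* not on c->journal_iters; self-linked list head keeps bch2_journal_iter_exit()'s list_del() safe */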
+ INIT_LIST_HEAD(&iter->journal.list);
}
-int bch2_btree_and_journal_walk(struct bch_fs *c, enum btree_id btree_id,
- btree_walk_key_fn key_fn)
+/*
+ * this version is used by btree_gc before filesystem has gone RW and
+ * multithreaded, so uses the journal_iters list:
+ */
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
+ struct bch_fs *c,
+ struct btree *b)
{
- struct btree *b = c->btree_roots[btree_id].b;
- int ret = 0;
-
- if (btree_node_fake(b))
- return 0;
-
- six_lock_read(&b->c.lock, NULL, NULL);
- ret = bch2_btree_and_journal_walk_recurse(c, b, btree_id, key_fn);
- six_unlock_read(&b->c.lock);
+ struct btree_node_iter node_iter;
- return ret;
+ bch2_btree_node_iter_init_from_start(&node_iter, b);
+ __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key);
+ list_add(&iter->journal.list, &c->journal_iters);
}
/* sort and dedup all keys in the journal: */
const struct journal_key *l = _l;
const struct journal_key *r = _r;
- return cmp_int(l->btree_id, r->btree_id) ?:
- cmp_int(l->level, r->level) ?:
- bpos_cmp(l->k->k.p, r->k->k.p) ?:
+ return journal_key_cmp(l, r) ?:
cmp_int(l->journal_seq, r->journal_seq) ?:
cmp_int(l->journal_offset, r->journal_offset);
}
bch2_journal_pin_put(j, j->replay_journal_seq++);
}
-static int __bch2_journal_replay_key(struct btree_trans *trans,
- enum btree_id id, unsigned level,
- struct bkey_i *k)
+static int bch2_journal_replay_key(struct btree_trans *trans,
+ struct journal_key *k)
{
- struct btree_iter *iter;
+ struct btree_iter iter;
+ unsigned iter_flags =
+ BTREE_ITER_INTENT|
+ BTREE_ITER_NOT_EXTENTS;
int ret;
- iter = bch2_trans_get_node_iter(trans, id, k->k.p,
- BTREE_MAX_DEPTH, level,
- BTREE_ITER_INTENT|
- BTREE_ITER_NOT_EXTENTS);
- ret = bch2_btree_iter_traverse(iter) ?:
- bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN);
- bch2_trans_iter_put(trans, iter);
- return ret;
-}
-
-static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k)
-{
- unsigned commit_flags = BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW;
-
- if (!k->allocated)
- commit_flags |= BTREE_INSERT_JOURNAL_REPLAY;
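+ /* leaf-level alloc keys are replayed through the btree key cache: */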
+ if (!k->level && k->btree_id == BTREE_ID_alloc)
+ iter_flags |= BTREE_ITER_CACHED;
- return bch2_trans_do(c, NULL, NULL, commit_flags,
- __bch2_journal_replay_key(&trans, k->btree_id, k->level, k->k));
-}
+ bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
+ BTREE_MAX_DEPTH, k->level,
+ iter_flags);
+ ret = bch2_btree_iter_traverse(&iter);
+ if (ret)
+ goto out;
-static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k)
-{
- struct btree_iter *iter;
- int ret;
+ /* Must be checked with btree locked: */
+ if (k->overwritten)
+ goto out;
- iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, k->k.p,
- BTREE_ITER_CACHED|
- BTREE_ITER_CACHED_NOFILL|
- BTREE_ITER_INTENT);
- ret = bch2_btree_iter_traverse(iter) ?:
- bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN);
- bch2_trans_iter_put(trans, iter);
+ ret = bch2_trans_update(trans, &iter, k->k, BTREE_TRIGGER_NORUN);
+out:
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
-static int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k)
-{
- return bch2_trans_do(c, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE|
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_JOURNAL_REPLAY,
- __bch2_alloc_replay_key(&trans, k));
-}
-
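+/*
+ * keys_sorted is an array of pointers into journal_keys, sorted by journal
+ * sequence number: journal_keys itself stays sorted in btree order, as
+ * journal iterators still reference it:
+ */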
static int journal_sort_seq_cmp(const void *_l, const void *_r)
{
- const struct journal_key *l = _l;
- const struct journal_key *r = _r;
+ const struct journal_key *l = *((const struct journal_key **)_l);
+ const struct journal_key *r = *((const struct journal_key **)_r);
- return cmp_int(r->level, l->level) ?:
- cmp_int(l->journal_seq, r->journal_seq) ?:
- cmp_int(l->btree_id, r->btree_id) ?:
- bpos_cmp(l->k->k.p, r->k->k.p);
+ return cmp_int(l->journal_seq, r->journal_seq);
}
-static int bch2_journal_replay(struct bch_fs *c,
- struct journal_keys keys)
+static int bch2_journal_replay(struct bch_fs *c)
{
+ struct journal_keys *keys = &c->journal_keys;
+ struct journal_key **keys_sorted, *k;
struct journal *j = &c->journal;
- struct journal_key *i;
- u64 seq;
+ size_t i;
int ret;
- sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL);
+ keys_sorted = kvmalloc_array(keys->nr, sizeof(*keys_sorted), GFP_KERNEL);
+ if (!keys_sorted)
+ return -ENOMEM;
- if (keys.nr)
- replay_now_at(j, keys.journal_seq_base);
+ for (i = 0; i < keys->nr; i++)
+ keys_sorted[i] = &keys->d[i];
- seq = j->replay_journal_seq;
+ sort(keys_sorted, keys->nr,
+ sizeof(keys_sorted[0]),
+ journal_sort_seq_cmp, NULL);
- /*
- * First replay updates to the alloc btree - these will only update the
- * btree key cache:
- */
- for_each_journal_key(keys, i) {
- cond_resched();
+ if (keys->nr)
+ replay_now_at(j, keys->journal_seq_base);
- if (!i->level && i->btree_id == BTREE_ID_alloc) {
- j->replay_journal_seq = keys.journal_seq_base + i->journal_seq;
- ret = bch2_alloc_replay_key(c, i->k);
- if (ret)
- goto err;
- }
- }
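+ /*
+ * Replay is a single pass in journal order; keys added by recovery
+ * itself (k->allocated) were assigned journal_seq U32_MAX, so they
+ * sort last and are committed as normal journalled updates:
+ */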
+ for (i = 0; i < keys->nr; i++) {
+ k = keys_sorted[i];
- /*
- * Next replay updates to interior btree nodes:
- */
- for_each_journal_key(keys, i) {
cond_resched();
- if (i->level) {
- j->replay_journal_seq = keys.journal_seq_base + i->journal_seq;
- ret = bch2_journal_replay_key(c, i);
- if (ret)
- goto err;
- }
- }
-
- /*
- * Now that the btree is in a consistent state, we can start journal
- * reclaim (which will be flushing entries from the btree key cache back
- * to the btree:
- */
- set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags);
- set_bit(JOURNAL_RECLAIM_STARTED, &j->flags);
- journal_reclaim_kick(j);
-
- j->replay_journal_seq = seq;
-
- /*
- * Now replay leaf node updates:
- */
- for_each_journal_key(keys, i) {
- cond_resched();
+ if (!k->allocated)
+ replay_now_at(j, keys->journal_seq_base + k->journal_seq);
- if (i->level || i->btree_id == BTREE_ID_alloc)
- continue;
-
- replay_now_at(j, keys.journal_seq_base + i->journal_seq);
-
- ret = bch2_journal_replay_key(c, i);
- if (ret)
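+		/*
+		 * k->allocated keys were added at runtime (e.g. by fsck), not
+		 * read from the journal: they have no sequence number, and
+		 * their updates are journalled normally rather than with
+		 * BTREE_INSERT_JOURNAL_REPLAY:
+		 */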
+ ret = bch2_trans_do(c, NULL, NULL,
+ BTREE_INSERT_LAZY_RW|
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_JOURNAL_RESERVED|
+ (!k->allocated ? BTREE_INSERT_JOURNAL_REPLAY : 0),
+ bch2_journal_replay_key(&trans, k));
+ if (ret) {
+ bch_err(c, "journal replay: error %d while replaying key at btree %s level %u",
+ ret, bch2_btree_ids[k->btree_id], k->level);
goto err;
+ }
}
replay_now_at(j, j->replay_journal_seq_end);
bch2_journal_set_replay_done(j);
bch2_journal_flush_all_pins(j);
- return bch2_journal_error(j);
+ ret = bch2_journal_error(j);
err:
- bch_err(c, "journal replay: error %d while replaying key at btree %s level %u",
- ret, bch2_btree_ids[i->btree_id], i->level);
+ kvfree(keys_sorted);
return ret;
}
container_of(entry, struct jset_entry_usage, entry);
switch (entry->btree_id) {
- case FS_USAGE_RESERVED:
+ case BCH_FS_USAGE_reserved:
if (entry->level < BCH_REPLICAS_MAX)
c->usage_base->persistent_reserved[entry->level] =
le64_to_cpu(u->v);
break;
- case FS_USAGE_INODES:
+ case BCH_FS_USAGE_inodes:
c->usage_base->nr_inodes = le64_to_cpu(u->v);
break;
- case FS_USAGE_KEY_VERSION:
+ case BCH_FS_USAGE_key_version:
atomic64_set(&c->key_version,
le64_to_cpu(u->v));
break;
struct jset_entry_dev_usage *u =
container_of(entry, struct jset_entry_dev_usage, entry);
struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev));
- unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
- unsigned nr_types = (bytes - sizeof(struct jset_entry_dev_usage)) /
- sizeof(struct jset_entry_dev_usage_type);
- unsigned i;
+ unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec);
ca->usage_base->buckets_unavailable = le64_to_cpu(u->buckets_unavailable);
return ret;
}
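+/*
+ * Create the root snapshot node (at the highest ID, U32_MAX) and the root
+ * subvolume pointing at it:
+ */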
+static int bch2_fs_initialize_subvolumes(struct bch_fs *c)
+{
+ struct bkey_i_snapshot root_snapshot;
+ struct bkey_i_subvolume root_volume;
+ int ret;
+
+ bkey_snapshot_init(&root_snapshot.k_i);
+ root_snapshot.k.p.offset = U32_MAX;
+ root_snapshot.v.flags = 0;
+ root_snapshot.v.parent = 0;
+ root_snapshot.v.subvol = BCACHEFS_ROOT_SUBVOL;
+ root_snapshot.v.pad = 0;
+ SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true);
+
+ ret = bch2_btree_insert(c, BTREE_ID_snapshots,
+ &root_snapshot.k_i,
+ NULL, NULL, 0);
+ if (ret)
+ return ret;
+
+ bkey_subvolume_init(&root_volume.k_i);
+ root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL;
+ root_volume.v.flags = 0;
+ root_volume.v.snapshot = cpu_to_le32(U32_MAX);
+ root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO);
+
+ ret = bch2_btree_insert(c, BTREE_ID_subvolumes,
+ &root_volume.k_i,
+ NULL, NULL, 0);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
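+/* Old filesystems won't have bi_subvol set on the root inode: look it up and set it */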
+static int bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_inode_unpacked inode;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (!bkey_is_inode(k.k)) {
+ bch_err(trans->c, "root inode not found");
+ ret = -ENOENT;
+ goto err;
+ }
+
+ ret = bch2_inode_unpack(k, &inode);
+ BUG_ON(ret);
+
+ inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
+
+ ret = bch2_inode_write(trans, &iter, &inode);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
int bch2_fs_recovery(struct bch_fs *c)
{
const char *err = "cannot allocate memory";
if (c->sb.clean)
bch_info(c, "recovering from clean shutdown, journal seq %llu",
le64_to_cpu(clean->journal_seq));
+ else
+ bch_info(c, "recovering from unclean shutdown");
if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) {
bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported");
bch_err(c, "filesystem may have incompatible bkey formats; run fsck from the compat branch to fix");
ret = -EINVAL;
goto err;
-
}
if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) {
set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
}
- if (c->sb.version < bcachefs_metadata_version_inode_backpointers) {
- bch_info(c, "version prior to inode backpointers, upgrade and fsck required");
- c->opts.version_upgrade = true;
- c->opts.fsck = true;
- c->opts.fix_errors = FSCK_OPT_YES;
- }
-
- if (c->sb.version < bcachefs_metadata_version_btree_ptr_sectors_written) {
- bch_info(c, "version prior to btree_ptr_sectors_written, upgrade required");
- c->opts.version_upgrade = true;
+ if (!c->opts.nochanges) {
+ if (c->sb.version < bcachefs_metadata_version_inode_backpointers) {
+ bch_info(c, "version prior to inode backpointers, upgrade and fsck required");
+ c->opts.version_upgrade = true;
+ c->opts.fsck = true;
+ c->opts.fix_errors = FSCK_OPT_YES;
+ } else if (c->sb.version < bcachefs_metadata_version_subvol_dirent) {
+ bch_info(c, "filesystem version is prior to subvol_dirent - upgrading");
+ c->opts.version_upgrade = true;
+ c->opts.fsck = true;
+ } else if (c->sb.version < bcachefs_metadata_version_inode_v2) {
+ bch_info(c, "filesystem version is prior to inode_v2 - upgrading");
+ c->opts.version_upgrade = true;
+ }
}
ret = bch2_blacklist_table_initialize(c);
if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
struct journal_replay *i;
+ bch_verbose(c, "starting journal read");
ret = bch2_journal_read(c, &c->journal_entries,
&blacklist_seq, &journal_seq);
if (ret)
bch_verbose(c, "starting alloc read");
err = "error reading allocation information";
- ret = bch2_alloc_read(c);
+
+ down_read(&c->gc_lock);
+ ret = bch2_alloc_read(c, false, false);
+ up_read(&c->gc_lock);
+
if (ret)
goto err;
bch_verbose(c, "alloc read done");
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
+ /*
+ * If we're not running fsck, this ensures bch2_fsck_err() calls are
+ * instead interpreted as bch2_inconsistent_err() calls:
+ */
+ if (!c->opts.fsck)
+ set_bit(BCH_FS_FSCK_DONE, &c->flags);
+
if (c->opts.fsck ||
!(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)) ||
!(c->sb.compat & (1ULL << BCH_COMPAT_alloc_metadata)) ||
test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) {
bool metadata_only = c->opts.norecovery;
- bch_info(c, "starting mark and sweep");
+ bch_info(c, "checking allocations");
err = "error in mark and sweep";
ret = bch2_gc(c, true, metadata_only);
if (ret)
goto err;
- bch_verbose(c, "mark and sweep done");
+ bch_verbose(c, "done checking allocations");
}
bch2_stripes_heap_start(c);
if (c->opts.norecovery)
goto out;
- bch_verbose(c, "starting journal replay");
+ bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr);
err = "journal replay failed";
- ret = bch2_journal_replay(c, c->journal_keys);
+ ret = bch2_journal_replay(c);
if (ret)
goto err;
- bch_verbose(c, "journal replay done");
+ if (c->opts.verbose || !c->sb.clean)
+ bch_info(c, "journal replay done");
- if (test_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags) &&
- !c->opts.nochanges) {
- /*
- * note that even when filesystem was clean there might be work
- * to do here, if we ran gc (because of fsck) which recalculated
- * oldest_gen:
- */
- bch_verbose(c, "writing allocation info");
- err = "error writing out alloc info";
- ret = bch2_stripes_write(c, BTREE_INSERT_LAZY_RW) ?:
- bch2_alloc_write(c, BTREE_INSERT_LAZY_RW);
- if (ret) {
- bch_err(c, "error writing alloc info");
+ if (c->sb.version < bcachefs_metadata_version_snapshot_2) {
+ bch2_fs_lazy_rw(c);
+
+ err = "error creating root snapshot node";
+ ret = bch2_fs_initialize_subvolumes(c);
+ if (ret)
+ goto err;
+ }
+
+ bch_verbose(c, "reading snapshots table");
+ err = "error reading snapshots table";
+ ret = bch2_fs_snapshots_start(c);
+ if (ret)
+ goto err;
+ bch_verbose(c, "reading snapshots done");
+
+ if (c->sb.version < bcachefs_metadata_version_snapshot_2) {
+ /* set bi_subvol on root inode */
+		err = "error upgrading root inode for subvolumes";
+ ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW,
+ bch2_fs_upgrade_for_subvolumes(&trans));
+ if (ret)
goto err;
- }
- bch_verbose(c, "alloc write done");
}
if (c->opts.fsck) {
bch_verbose(c, "quotas done");
}
- if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) ||
- !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) {
- struct bch_move_stats stats = { 0 };
-
- bch_info(c, "scanning for old btree nodes");
- ret = bch2_fs_read_write(c);
- if (ret)
- goto err;
-
- ret = bch2_scan_old_btree_nodes(c, &stats);
- if (ret)
- goto err;
- bch_info(c, "scanning for old btree nodes done");
- }
-
mutex_lock(&c->sb_lock);
if (c->opts.version_upgrade) {
c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
+ if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) ||
+ !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done)) ||
+ le16_to_cpu(c->sb.version_min) < bcachefs_metadata_version_btree_ptr_sectors_written) {
+ struct bch_move_stats stats;
+
+ bch_move_stats_init(&stats, "recovery");
+
+ bch_info(c, "scanning for old btree nodes");
+ ret = bch2_fs_read_write(c);
+ if (ret)
+ goto err;
+
+ ret = bch2_scan_old_btree_nodes(c, &stats);
+ if (ret)
+ goto err;
+ bch_info(c, "scanning for old btree nodes done");
+ }
+
if (c->journal_seq_blacklist_table &&
c->journal_seq_blacklist_table->nr > 128)
queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work);
c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
bch2_write_super(c);
}
-
- for_each_online_member(ca, c, i)
- bch2_mark_dev_superblock(c, ca, 0);
mutex_unlock(&c->sb_lock);
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
+ set_bit(BCH_FS_FSCK_DONE, &c->flags);
for (i = 0; i < BTREE_ID_NR; i++)
bch2_btree_root_alloc(c, i);
- set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags);
- set_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags);
-
err = "unable to allocate journal buckets";
for_each_online_member(ca, c, i) {
ret = bch2_dev_journal_alloc(ca);
percpu_ref_put(&ca->ref);
goto err;
}
+
+ ca->new_fs_bucket_idx = 0;
}
+ err = "error creating root snapshot node";
+ ret = bch2_fs_initialize_subvolumes(c);
+ if (ret)
+ goto err;
+
+ bch_verbose(c, "reading snapshots table");
+ err = "error reading snapshots table";
+ ret = bch2_fs_snapshots_start(c);
+ if (ret)
+ goto err;
+ bch_verbose(c, "reading snapshots done");
+
bch2_inode_init(c, &root_inode, 0, 0,
S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
- root_inode.bi_inum = BCACHEFS_ROOT_INO;
+ root_inode.bi_inum = BCACHEFS_ROOT_INO;
+ root_inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
bch2_inode_pack(c, &packed_inode, &root_inode);
packed_inode.inode.k.p.snapshot = U32_MAX;
err = "error creating lost+found";
ret = bch2_trans_do(c, NULL, NULL, 0,
- bch2_create_trans(&trans, BCACHEFS_ROOT_INO,
+ bch2_create_trans(&trans,
+ BCACHEFS_ROOT_SUBVOL_INUM,
&root_inode, &lostfound_inode,
&lostfound,
0, 0, S_IFDIR|0700, 0,
- NULL, NULL));
+ NULL, NULL, (subvol_inum) { 0 }, 0));
if (ret) {
bch_err(c, "error creating lost+found");
goto err;
}
err = "error writing first journal entry";
- ret = bch2_journal_meta(&c->journal);
+ ret = bch2_journal_flush(&c->journal);
if (ret)
goto err;
} last;
};
+size_t bch2_journal_key_search(struct journal_keys *, enum btree_id,
+ unsigned, struct bpos);
+
+int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
+ unsigned, struct bkey_i *);
int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
unsigned, struct bkey_i *);
int bch2_journal_key_delete(struct bch_fs *, enum btree_id,
unsigned, struct bpos);
+void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id,
+ unsigned, struct bpos);
void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *);
void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
+ struct bch_fs *, struct btree *,
+ struct btree_node_iter, struct bpos);
void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
struct bch_fs *,
struct btree *);
-typedef int (*btree_walk_key_fn)(struct bch_fs *c, struct bkey_s_c k);
-
-int bch2_btree_and_journal_walk(struct bch_fs *, enum btree_id, btree_walk_key_fn);
-
void bch2_journal_keys_free(struct journal_keys *);
void bch2_journal_entries_free(struct list_head *);
#include "inode.h"
#include "io.h"
#include "reflink.h"
+#include "subvolume.h"
#include <linux/sched/signal.h>
if (bkey_val_bytes(p.k) != sizeof(*p.v))
return "incorrect value size";
+ if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix &&
+ le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad))
+ return "idx < front_pad";
+
return NULL;
}
{
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
- pr_buf(out, "idx %llu", le64_to_cpu(p.v->idx));
+ pr_buf(out, "idx %llu front_pad %u back_pad %u",
+ le64_to_cpu(p.v->idx),
+ le32_to_cpu(p.v->front_pad),
+ le32_to_cpu(p.v->back_pad));
}
bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
struct bkey_i *orig)
{
struct bch_fs *c = trans->c;
- struct btree_iter *reflink_iter;
+ struct btree_iter reflink_iter = { NULL };
struct bkey_s_c k;
struct bkey_i *r_v;
struct bkey_i_reflink_p *r_p;
if (orig->k.type == KEY_TYPE_inline_data)
bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data);
- for_each_btree_key(trans, reflink_iter, BTREE_ID_reflink,
+ for_each_btree_key_norestart(trans, reflink_iter, BTREE_ID_reflink,
POS(0, c->reflink_hint),
BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) {
- if (reflink_iter->pos.inode) {
- bch2_btree_iter_set_pos(reflink_iter, POS_MIN);
+ if (reflink_iter.pos.inode) {
+ bch2_btree_iter_set_pos(&reflink_iter, POS_MIN);
continue;
}
goto err;
/* rewind iter to start of hole, if necessary: */
- bch2_btree_iter_set_pos_to_extent_start(reflink_iter);
+ bch2_btree_iter_set_pos_to_extent_start(&reflink_iter);
r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k));
ret = PTR_ERR_OR_ZERO(r_v);
bkey_init(&r_v->k);
r_v->k.type = bkey_type_to_indirect(&orig->k);
- r_v->k.p = reflink_iter->pos;
+ r_v->k.p = reflink_iter.pos;
bch2_key_resize(&r_v->k, orig->k.size);
r_v->k.version = orig->k.version;
*refcount = 0;
memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k));
- ret = bch2_trans_update(trans, reflink_iter, r_v, 0);
+ ret = bch2_trans_update(trans, &reflink_iter, r_v, 0);
if (ret)
goto err;
+ /*
+ * orig is in a bkey_buf which statically allocates 5 64s for the val,
+ * so we know it will be big enough:
+ */
orig->k.type = KEY_TYPE_reflink_p;
r_p = bkey_i_to_reflink_p(orig);
set_bkey_val_bytes(&r_p->k, sizeof(r_p->v));
+ memset(&r_p->v, 0, sizeof(r_p->v));
+
r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k));
- ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, 0);
+ ret = bch2_trans_update(trans, extent_iter, &r_p->k_i,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
err:
- if (!IS_ERR(reflink_iter))
- c->reflink_hint = reflink_iter->pos.offset;
- bch2_trans_iter_put(trans, reflink_iter);
+ c->reflink_hint = reflink_iter.pos.offset;
+ bch2_trans_iter_exit(trans, &reflink_iter);
return ret;
}
struct bkey_s_c k;
int ret;
- for_each_btree_key_continue(iter, 0, k, ret) {
+ for_each_btree_key_continue_norestart(*iter, 0, k, ret) {
if (bkey_cmp(iter->pos, end) >= 0)
break;
}
s64 bch2_remap_range(struct bch_fs *c,
- struct bpos dst_start, struct bpos src_start,
- u64 remap_sectors, u64 *journal_seq,
+ subvol_inum dst_inum, u64 dst_offset,
+ subvol_inum src_inum, u64 src_offset,
+ u64 remap_sectors,
u64 new_i_size, s64 *i_sectors_delta)
{
struct btree_trans trans;
- struct btree_iter *dst_iter, *src_iter;
+ struct btree_iter dst_iter, src_iter;
struct bkey_s_c src_k;
struct bkey_buf new_dst, new_src;
+ struct bpos dst_start = POS(dst_inum.inum, dst_offset);
+ struct bpos src_start = POS(src_inum.inum, src_offset);
struct bpos dst_end = dst_start, src_end = src_start;
struct bpos src_want;
u64 dst_done;
+ u32 dst_snapshot, src_snapshot;
int ret = 0, ret2 = 0;
if (!percpu_ref_tryget(&c->writes))
bch2_bkey_buf_init(&new_src);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096);
- src_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, src_start,
- BTREE_ITER_INTENT);
- dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, dst_start,
- BTREE_ITER_INTENT);
+ bch2_trans_iter_init(&trans, &src_iter, BTREE_ID_extents, src_start,
+ BTREE_ITER_INTENT);
+ bch2_trans_iter_init(&trans, &dst_iter, BTREE_ID_extents, dst_start,
+ BTREE_ITER_INTENT);
while ((ret == 0 || ret == -EINTR) &&
- bkey_cmp(dst_iter->pos, dst_end) < 0) {
+ bkey_cmp(dst_iter.pos, dst_end) < 0) {
struct disk_reservation disk_res = { 0 };
bch2_trans_begin(&trans);
break;
}
- dst_done = dst_iter->pos.offset - dst_start.offset;
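+		/*
+		 * Re-resolve the snapshot IDs for the source and destination
+		 * subvolumes on every iteration, so that transaction restarts
+		 * pick up current values:
+		 */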
+ ret = bch2_subvolume_get_snapshot(&trans, src_inum.subvol,
+ &src_snapshot);
+ if (ret)
+ continue;
+
+ bch2_btree_iter_set_snapshot(&src_iter, src_snapshot);
+
+ ret = bch2_subvolume_get_snapshot(&trans, dst_inum.subvol,
+ &dst_snapshot);
+ if (ret)
+ continue;
+
+ bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot);
+
+ dst_done = dst_iter.pos.offset - dst_start.offset;
src_want = POS(src_start.inode, src_start.offset + dst_done);
- bch2_btree_iter_set_pos(src_iter, src_want);
+ bch2_btree_iter_set_pos(&src_iter, src_want);
- src_k = get_next_src(src_iter, src_end);
+ src_k = get_next_src(&src_iter, src_end);
ret = bkey_err(src_k);
if (ret)
continue;
- if (bkey_cmp(src_want, src_iter->pos) < 0) {
- ret = bch2_fpunch_at(&trans, dst_iter,
- bpos_min(dst_end,
- POS(dst_iter->pos.inode, dst_iter->pos.offset +
- src_iter->pos.offset - src_want.offset)),
- journal_seq, i_sectors_delta);
+ if (bkey_cmp(src_want, src_iter.pos) < 0) {
+ ret = bch2_fpunch_at(&trans, &dst_iter, dst_inum,
+ min(dst_end.offset,
+ dst_iter.pos.offset +
+ src_iter.pos.offset - src_want.offset),
+ i_sectors_delta);
continue;
}
if (src_k.k->type != KEY_TYPE_reflink_p) {
- bch2_btree_iter_set_pos_to_extent_start(src_iter);
+ bch2_btree_iter_set_pos_to_extent_start(&src_iter);
bch2_bkey_buf_reassemble(&new_src, c, src_k);
src_k = bkey_i_to_s_c(new_src.k);
- ret = bch2_make_extent_indirect(&trans, src_iter,
+ ret = bch2_make_extent_indirect(&trans, &src_iter,
new_src.k);
if (ret)
continue;
BUG();
}
- new_dst.k->k.p = dst_iter->pos;
+ new_dst.k->k.p = dst_iter.pos;
bch2_key_resize(&new_dst.k->k,
min(src_k.k->p.offset - src_want.offset,
- dst_end.offset - dst_iter->pos.offset));
- ret = bch2_extent_update(&trans, dst_iter, new_dst.k,
- &disk_res, journal_seq,
+ dst_end.offset - dst_iter.pos.offset));
+
+ ret = bch2_extent_update(&trans, dst_inum, &dst_iter,
+ new_dst.k, &disk_res, NULL,
new_i_size, i_sectors_delta,
true);
bch2_disk_reservation_put(c, &disk_res);
}
- bch2_trans_iter_put(&trans, dst_iter);
- bch2_trans_iter_put(&trans, src_iter);
+ bch2_trans_iter_exit(&trans, &dst_iter);
+ bch2_trans_iter_exit(&trans, &src_iter);
- BUG_ON(!ret && bkey_cmp(dst_iter->pos, dst_end));
- BUG_ON(bkey_cmp(dst_iter->pos, dst_end) > 0);
+ BUG_ON(!ret && bkey_cmp(dst_iter.pos, dst_end));
+ BUG_ON(bkey_cmp(dst_iter.pos, dst_end) > 0);
- dst_done = dst_iter->pos.offset - dst_start.offset;
- new_i_size = min(dst_iter->pos.offset << 9, new_i_size);
+ dst_done = dst_iter.pos.offset - dst_start.offset;
+ new_i_size = min(dst_iter.pos.offset << 9, new_i_size);
do {
struct bch_inode_unpacked inode_u;
- struct btree_iter *inode_iter;
+ struct btree_iter inode_iter = { NULL };
bch2_trans_begin(&trans);
- inode_iter = bch2_inode_peek(&trans, &inode_u,
- dst_start.inode, BTREE_ITER_INTENT);
- ret2 = PTR_ERR_OR_ZERO(inode_iter);
+ ret2 = bch2_inode_peek(&trans, &inode_iter, &inode_u,
+ dst_inum, BTREE_ITER_INTENT);
if (!ret2 &&
inode_u.bi_size < new_i_size) {
inode_u.bi_size = new_i_size;
- ret2 = bch2_inode_write(&trans, inode_iter, &inode_u) ?:
- bch2_trans_commit(&trans, NULL, journal_seq, 0);
+ ret2 = bch2_inode_write(&trans, &inode_iter, &inode_u) ?:
+ bch2_trans_commit(&trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL);
}
- bch2_trans_iter_put(&trans, inode_iter);
+ bch2_trans_iter_exit(&trans, &inode_iter);
} while (ret2 == -EINTR);
- ret = bch2_trans_exit(&trans) ?: ret;
+ bch2_trans_exit(&trans);
bch2_bkey_buf_exit(&new_src, c);
bch2_bkey_buf_exit(&new_dst, c);
}
}
-s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos,
- u64, u64 *, u64, s64 *);
+s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64,
+ subvol_inum, u64, u64, u64, s64 *);
#endif /* _BCACHEFS_REFLINK_H */
{
unsigned i;
- pr_buf(out, "%s: %u/%u [",
- bch2_data_types[e->data_type],
- e->nr_required,
- e->nr_devs);
+ if (e->data_type < BCH_DATA_NR)
+ pr_buf(out, "%s", bch2_data_types[e->data_type]);
+ else
+ pr_buf(out, "(invalid data type %u)", e->data_type);
+ pr_buf(out, ": %u/%u [", e->nr_required, e->nr_devs);
for (i = 0; i < e->nr_devs; i++)
pr_buf(out, i ? " %u" : "%u", e->devs[i]);
pr_buf(out, "]");
}
void bch2_cpu_replicas_to_text(struct printbuf *out,
- struct bch_replicas_cpu *r)
+ struct bch_replicas_cpu *r)
{
struct bch_replicas_entry *e;
bool first = true;
goto out;
}
-static int __bch2_mark_replicas(struct bch_fs *c,
- struct bch_replicas_entry *r,
- bool check)
-{
- return likely(bch2_replicas_marked(c, r)) ? 0
- : check ? -1
- : bch2_mark_replicas_slowpath(c, r);
-}
-
int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r)
{
- return __bch2_mark_replicas(c, r, false);
-}
-
-static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k,
- bool check)
-{
- struct bch_replicas_padded search;
- struct bch_devs_list cached = bch2_bkey_cached_devs(k);
- unsigned i;
- int ret;
-
- memset(&search, 0, sizeof(search));
-
- for (i = 0; i < cached.nr; i++) {
- bch2_replicas_entry_cached(&search.e, cached.devs[i]);
-
- ret = __bch2_mark_replicas(c, &search.e, check);
- if (ret)
- return ret;
- }
-
- bch2_bkey_to_replicas(&search.e, k);
-
- ret = __bch2_mark_replicas(c, &search.e, check);
- if (ret)
- return ret;
-
- if (search.e.data_type == BCH_DATA_parity) {
- search.e.data_type = BCH_DATA_cached;
- ret = __bch2_mark_replicas(c, &search.e, check);
- if (ret)
- return ret;
-
- search.e.data_type = BCH_DATA_user;
- ret = __bch2_mark_replicas(c, &search.e, check);
- if (ret)
- return ret;
- }
-
- return 0;
+ return likely(bch2_replicas_marked(c, r))
+ ? 0 : bch2_mark_replicas_slowpath(c, r);
}
/* replicas delta list: */
-bool bch2_replicas_delta_list_marked(struct bch_fs *c,
- struct replicas_delta_list *r)
-{
- struct replicas_delta *d = r->d;
- struct replicas_delta *top = (void *) r->d + r->used;
-
- percpu_rwsem_assert_held(&c->mark_lock);
-
- for (d = r->d; d != top; d = replicas_delta_next(d))
- if (bch2_replicas_entry_idx(c, &d->r) < 0)
- return false;
- return true;
-}
-
int bch2_replicas_delta_list_mark(struct bch_fs *c,
struct replicas_delta_list *r)
{
return ret;
}
-/* bkey replicas: */
-
-bool bch2_bkey_replicas_marked(struct bch_fs *c,
- struct bkey_s_c k)
-{
- return __bch2_mark_bkey_replicas(c, k, true) == 0;
-}
-
-int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
-{
- return __bch2_mark_bkey_replicas(c, k, false);
-}
-
/*
* Old replicas_gc mechanism: only used for journal replicas entries now, should
* die at some point:
return 0;
}
-static const char *check_dup_replicas_entries(struct bch_replicas_cpu *cpu_r)
+static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
+ struct bch_sb *sb,
+ struct printbuf *err)
{
- unsigned i;
+ struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
+ unsigned i, j;
sort_cmp_size(cpu_r->entries,
cpu_r->nr,
cpu_r->entry_size,
memcmp, NULL);
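+
+	/* the entries are now sorted, so any duplicates must be adjacent: */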
- for (i = 0; i + 1 < cpu_r->nr; i++) {
- struct bch_replicas_entry *l =
+ for (i = 0; i < cpu_r->nr; i++) {
+ struct bch_replicas_entry *e =
cpu_replicas_entry(cpu_r, i);
- struct bch_replicas_entry *r =
- cpu_replicas_entry(cpu_r, i + 1);
-
- BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
- if (!memcmp(l, r, cpu_r->entry_size))
- return "duplicate replicas entry";
- }
+ if (e->data_type >= BCH_DATA_NR) {
+ pr_buf(err, "invalid data type in entry ");
+ bch2_replicas_entry_to_text(err, e);
+ return -EINVAL;
+ }
- return NULL;
-}
+ if (!e->nr_devs) {
+ pr_buf(err, "no devices in entry ");
+ bch2_replicas_entry_to_text(err, e);
+ return -EINVAL;
+ }
-static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f)
-{
- struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
- struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
- struct bch_replicas_cpu cpu_r = { .entries = NULL };
- struct bch_replicas_entry *e;
- const char *err;
- unsigned i;
+ if (e->nr_required > 1 &&
+ e->nr_required >= e->nr_devs) {
+ pr_buf(err, "bad nr_required in entry ");
+ bch2_replicas_entry_to_text(err, e);
+ return -EINVAL;
+ }
- for_each_replicas_entry(sb_r, e) {
- err = "invalid replicas entry: invalid data type";
- if (e->data_type >= BCH_DATA_NR)
- goto err;
+ for (j = 0; j < e->nr_devs; j++)
+ if (!bch2_dev_exists(sb, mi, e->devs[j])) {
+ pr_buf(err, "invalid device %u in entry ", e->devs[j]);
+ bch2_replicas_entry_to_text(err, e);
+ return -EINVAL;
+ }
- err = "invalid replicas entry: no devices";
- if (!e->nr_devs)
- goto err;
+ if (i + 1 < cpu_r->nr) {
+ struct bch_replicas_entry *n =
+ cpu_replicas_entry(cpu_r, i + 1);
- err = "invalid replicas entry: bad nr_required";
- if (e->nr_required > 1 &&
- e->nr_required >= e->nr_devs)
- goto err;
+ BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0);
- err = "invalid replicas entry: invalid device";
- for (i = 0; i < e->nr_devs; i++)
- if (!bch2_dev_exists(sb, mi, e->devs[i]))
- goto err;
+ if (!memcmp(e, n, cpu_r->entry_size)) {
+ pr_buf(err, "duplicate replicas entry ");
+ bch2_replicas_entry_to_text(err, e);
+ return -EINVAL;
+ }
+ }
}
- err = "cannot allocate memory";
+ return 0;
+}
+
+static int bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f,
+ struct printbuf *err)
+{
+ struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
+ struct bch_replicas_cpu cpu_r;
+ int ret;
+
if (__bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r))
- goto err;
+ return -ENOMEM;
- err = check_dup_replicas_entries(&cpu_r);
-err:
+ ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
kfree(cpu_r.entries);
- return err;
+ return ret;
}
static void bch2_sb_replicas_to_text(struct printbuf *out,
.to_text = bch2_sb_replicas_to_text,
};
-static const char *bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f)
+static int bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f,
+ struct printbuf *err)
{
struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
- struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
- struct bch_replicas_cpu cpu_r = { .entries = NULL };
- struct bch_replicas_entry_v0 *e;
- const char *err;
- unsigned i;
-
- for_each_replicas_entry_v0(sb_r, e) {
- err = "invalid replicas entry: invalid data type";
- if (e->data_type >= BCH_DATA_NR)
- goto err;
-
- err = "invalid replicas entry: no devices";
- if (!e->nr_devs)
- goto err;
-
- err = "invalid replicas entry: invalid device";
- for (i = 0; i < e->nr_devs; i++)
- if (!bch2_dev_exists(sb, mi, e->devs[i]))
- goto err;
- }
+ struct bch_replicas_cpu cpu_r;
+ int ret;
- err = "cannot allocate memory";
if (__bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r))
- goto err;
+ return -ENOMEM;
- err = check_dup_replicas_entries(&cpu_r);
-err:
+ ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
kfree(cpu_r.entries);
- return err;
+ return ret;
}
const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
unsigned i, nr_online = 0, nr_failed = 0, dflags = 0;
bool metadata = e->data_type < BCH_DATA_user;
+ if (e->data_type == BCH_DATA_cached)
+ continue;
+
for (i = 0; i < e->nr_devs; i++) {
struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]);
return (void *) d + replicas_entry_bytes(&d->r) + 8;
}
-bool bch2_replicas_delta_list_marked(struct bch_fs *, struct replicas_delta_list *);
int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *);
void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c);
-bool bch2_bkey_replicas_marked(struct bch_fs *, struct bkey_s_c);
-int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c);
static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e,
unsigned dev)
#include "error.h"
#include "inode.h"
#include "siphash.h"
+#include "subvolume.h"
#include "super.h"
#include <linux/crc32c.h>
{
switch (opt) {
case BCH_STR_HASH_OPT_crc32c:
- return BCH_STR_HASH_CRC32C;
+ return BCH_STR_HASH_crc32c;
case BCH_STR_HASH_OPT_crc64:
- return BCH_STR_HASH_CRC64;
+ return BCH_STR_HASH_crc64;
case BCH_STR_HASH_OPT_siphash:
return c->sb.features & (1ULL << BCH_FEATURE_new_siphash)
- ? BCH_STR_HASH_SIPHASH
- : BCH_STR_HASH_SIPHASH_OLD;
+ ? BCH_STR_HASH_siphash
+ : BCH_STR_HASH_siphash_old;
default:
BUG();
}
.siphash_key = { .k0 = bi->bi_hash_seed }
};
- if (unlikely(info.type == BCH_STR_HASH_SIPHASH_OLD)) {
+ if (unlikely(info.type == BCH_STR_HASH_siphash_old)) {
SHASH_DESC_ON_STACK(desc, c->sha256);
u8 digest[SHA256_DIGEST_SIZE];
const struct bch_hash_info *info)
{
switch (info->type) {
- case BCH_STR_HASH_CRC32C:
+ case BCH_STR_HASH_crc32c:
ctx->crc32c = crc32c(~0, &info->siphash_key.k0,
sizeof(info->siphash_key.k0));
break;
- case BCH_STR_HASH_CRC64:
+ case BCH_STR_HASH_crc64:
ctx->crc64 = crc64_be(~0, &info->siphash_key.k0,
sizeof(info->siphash_key.k0));
break;
- case BCH_STR_HASH_SIPHASH_OLD:
- case BCH_STR_HASH_SIPHASH:
+ case BCH_STR_HASH_siphash_old:
+ case BCH_STR_HASH_siphash:
SipHash24_Init(&ctx->siphash, &info->siphash_key);
break;
default:
const void *data, size_t len)
{
switch (info->type) {
- case BCH_STR_HASH_CRC32C:
+ case BCH_STR_HASH_crc32c:
ctx->crc32c = crc32c(ctx->crc32c, data, len);
break;
- case BCH_STR_HASH_CRC64:
+ case BCH_STR_HASH_crc64:
ctx->crc64 = crc64_be(ctx->crc64, data, len);
break;
- case BCH_STR_HASH_SIPHASH_OLD:
- case BCH_STR_HASH_SIPHASH:
+ case BCH_STR_HASH_siphash_old:
+ case BCH_STR_HASH_siphash:
SipHash24_Update(&ctx->siphash, data, len);
break;
default:
const struct bch_hash_info *info)
{
switch (info->type) {
- case BCH_STR_HASH_CRC32C:
+ case BCH_STR_HASH_crc32c:
return ctx->crc32c;
- case BCH_STR_HASH_CRC64:
+ case BCH_STR_HASH_crc64:
return ctx->crc64 >> 1;
- case BCH_STR_HASH_SIPHASH_OLD:
- case BCH_STR_HASH_SIPHASH:
+ case BCH_STR_HASH_siphash_old:
+ case BCH_STR_HASH_siphash:
return SipHash24_End(&ctx->siphash) >> 1;
default:
BUG();
u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c);
bool (*cmp_key)(struct bkey_s_c, const void *);
bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c);
+ bool (*is_visible)(subvol_inum inum, struct bkey_s_c);
};
-static __always_inline struct btree_iter *
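+/*
+ * Some key types are only visible from certain subvolumes; a hash desc may
+ * provide an is_visible hook to filter them out:
+ */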
+static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, struct bkey_s_c k)
+{
+ return k.k->type == desc.key_type &&
+ (!desc.is_visible || desc.is_visible(inum, k));
+}
+
+static __always_inline int
bch2_hash_lookup(struct btree_trans *trans,
+ struct btree_iter *iter,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
- u64 inode, const void *key,
+ subvol_inum inum, const void *key,
unsigned flags)
{
- struct btree_iter *iter;
struct bkey_s_c k;
+ u32 snapshot;
int ret;
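+	/*
+	 * On success, the iterator is left pointing at the key for the caller
+	 * to use and then exit; on error the caller need not exit it:
+	 */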
- for_each_btree_key(trans, iter, desc.btree_id,
- POS(inode, desc.hash_key(info, key)),
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ return ret;
+
+ for_each_btree_key_norestart(trans, *iter, desc.btree_id,
+ SPOS(inum.inum, desc.hash_key(info, key), snapshot),
BTREE_ITER_SLOTS|flags, k, ret) {
- if (iter->pos.inode != inode)
+ if (iter->pos.inode != inum.inum)
break;
- if (k.k->type == desc.key_type) {
+ if (is_visible_key(desc, inum, k)) {
if (!desc.cmp_key(k, key))
- return iter;
+ return 0;
} else if (k.k->type == KEY_TYPE_hash_whiteout) {
;
} else {
break;
}
}
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, iter);
- return ERR_PTR(ret ?: -ENOENT);
+ return ret ?: -ENOENT;
}
-static __always_inline struct btree_iter *
+static __always_inline int
bch2_hash_hole(struct btree_trans *trans,
+ struct btree_iter *iter,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
- u64 inode, const void *key)
+ subvol_inum inum, const void *key)
{
- struct btree_iter *iter;
struct bkey_s_c k;
+ u32 snapshot;
int ret;
- for_each_btree_key(trans, iter, desc.btree_id,
- POS(inode, desc.hash_key(info, key)),
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ return ret;
+
+ for_each_btree_key_norestart(trans, *iter, desc.btree_id,
+ SPOS(inum.inum, desc.hash_key(info, key), snapshot),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
- if (iter->pos.inode != inode)
+ if (iter->pos.inode != inum.inum)
break;
- if (k.k->type != desc.key_type)
- return iter;
+ if (!is_visible_key(desc, inum, k))
+ return 0;
}
+ bch2_trans_iter_exit(trans, iter);
- iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
- bch2_trans_iter_put(trans, iter);
-
- return ERR_PTR(ret ?: -ENOSPC);
+ return ret ?: -ENOSPC;
}
static __always_inline
const struct bch_hash_info *info,
struct btree_iter *start)
{
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
int ret;
- iter = bch2_trans_copy_iter(trans, start);
+ bch2_trans_copy_iter(&iter, start);
- bch2_btree_iter_advance(iter);
+ bch2_btree_iter_advance(&iter);
- for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) {
+ for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, k, ret) {
if (k.k->type != desc.key_type &&
k.k->type != KEY_TYPE_hash_whiteout)
break;
if (k.k->type == desc.key_type &&
desc.hash_bkey(info, k) <= start->pos.offset) {
- iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
ret = 1;
break;
}
}
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
int bch2_hash_set(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
- u64 inode, struct bkey_i *insert, int flags)
+ subvol_inum inum,
+ struct bkey_i *insert, int flags)
{
- struct btree_iter *iter, *slot = NULL;
+ struct btree_iter iter, slot = { NULL };
struct bkey_s_c k;
bool found = false;
+ u32 snapshot;
int ret;
- for_each_btree_key(trans, iter, desc.btree_id,
- POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))),
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ if (ret)
+ return ret;
+
+ for_each_btree_key_norestart(trans, iter, desc.btree_id,
+ SPOS(inum.inum,
+ desc.hash_bkey(info, bkey_i_to_s_c(insert)),
+ snapshot),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
- if (iter->pos.inode != inode)
+ if (iter.pos.inode != inum.inum)
break;
- if (k.k->type == desc.key_type) {
+ if (is_visible_key(desc, inum, k)) {
if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert)))
goto found;
continue;
}
- if (!slot &&
+ if (!slot.path &&
!(flags & BCH_HASH_SET_MUST_REPLACE))
- slot = bch2_trans_copy_iter(trans, iter);
+ bch2_trans_copy_iter(&slot, &iter);
if (k.k->type != KEY_TYPE_hash_whiteout)
goto not_found;
if (!ret)
ret = -ENOSPC;
out:
- bch2_trans_iter_put(trans, slot);
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &slot);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
found:
} else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) {
ret = -EEXIST;
} else {
- if (!found && slot)
+ if (!found && slot.path)
swap(iter, slot);
- insert->k.p = iter->pos;
- ret = bch2_trans_update(trans, iter, insert, 0);
+ insert->k.p = iter.pos;
+ ret = bch2_trans_update(trans, &iter, insert, 0);
}
goto out;
int bch2_hash_delete_at(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
- struct btree_iter *iter)
+ struct btree_iter *iter,
+ unsigned update_flags)
{
struct bkey_i *delete;
int ret;
delete->k.p = iter->pos;
delete->k.type = ret ? KEY_TYPE_hash_whiteout : KEY_TYPE_deleted;
- return bch2_trans_update(trans, iter, delete, 0);
+ return bch2_trans_update(trans, iter, delete, update_flags);
}
static __always_inline
int bch2_hash_delete(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
- u64 inode, const void *key)
+ subvol_inum inum, const void *key)
{
- struct btree_iter *iter;
+ struct btree_iter iter;
int ret;
- iter = bch2_hash_lookup(trans, desc, info, inode, key,
+ ret = bch2_hash_lookup(trans, &iter, desc, info, inum, key,
BTREE_ITER_INTENT);
- if (IS_ERR(iter))
- return PTR_ERR(iter);
+ if (ret)
+ return ret;
- ret = bch2_hash_delete_at(trans, desc, info, iter);
- bch2_trans_iter_put(trans, iter);
+ ret = bch2_hash_delete_at(trans, desc, info, &iter, 0);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_key_cache.h"
+#include "btree_update.h"
+#include "error.h"
+#include "fs.h"
+#include "subvolume.h"
+
+/* Snapshot tree: */
+
+static void bch2_delete_dead_snapshots_work(struct work_struct *);
+static void bch2_delete_dead_snapshots(struct bch_fs *);
+
+void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
+
+ pr_buf(out, "is_subvol %llu deleted %llu parent %u children %u %u subvol %u",
+ BCH_SNAPSHOT_SUBVOL(s.v),
+ BCH_SNAPSHOT_DELETED(s.v),
+ le32_to_cpu(s.v->parent),
+ le32_to_cpu(s.v->children[0]),
+ le32_to_cpu(s.v->children[1]),
+ le32_to_cpu(s.v->subvol));
+}
+
+const char *bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_s_c_snapshot s;
+ u32 i, id;
+
+ if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0 ||
+ bkey_cmp(k.k->p, POS(0, 1)) < 0)
+ return "bad pos";
+
+ if (bkey_val_bytes(k.k) != sizeof(struct bch_snapshot))
+ return "bad val size";
+
+ s = bkey_s_c_to_snapshot(k);
+
+ id = le32_to_cpu(s.v->parent);
+ if (id && id <= k.k->p.offset)
+ return "bad parent node";
+
+ if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]))
+ return "children not normalized";
+
+ if (s.v->children[0] &&
+ s.v->children[0] == s.v->children[1])
+ return "duplicate child nodes";
+
+ for (i = 0; i < 2; i++) {
+ id = le32_to_cpu(s.v->children[i]);
+
+ if (id >= k.k->p.offset)
+ return "bad child node";
+ }
+
+ return NULL;
+}
+
+int bch2_mark_snapshot(struct btree_trans *trans,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct snapshot_t *t;
+
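+	/*
+	 * Keep the in-memory snapshot table in sync; snapshot IDs are
+	 * allocated downward from U32_MAX, so index the radix tree by
+	 * U32_MAX - id to keep it densely packed:
+	 */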
+ t = genradix_ptr_alloc(&c->snapshots,
+ U32_MAX - new.k->p.offset,
+ GFP_KERNEL);
+ if (!t)
+ return -ENOMEM;
+
+ if (new.k->type == KEY_TYPE_snapshot) {
+ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
+
+ t->parent = le32_to_cpu(s.v->parent);
+ t->children[0] = le32_to_cpu(s.v->children[0]);
+ t->children[1] = le32_to_cpu(s.v->children[1]);
+ t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
+ } else {
+ t->parent = 0;
+ t->children[0] = 0;
+ t->children[1] = 0;
+ t->subvol = 0;
+ }
+
+ return 0;
+}
+
+static int snapshot_lookup(struct btree_trans *trans, u32 id,
+ struct bch_snapshot *s)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id),
+ BTREE_ITER_WITH_UPDATES);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k) ?: k.k->type == KEY_TYPE_snapshot ? 0 : -ENOENT;
+
+ if (!ret)
+ *s = *bkey_s_c_to_snapshot(k).v;
+
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int snapshot_live(struct btree_trans *trans, u32 id)
+{
+ struct bch_snapshot v;
+ int ret;
+
+ if (!id)
+ return 0;
+
+ ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v));
+ if (ret == -ENOENT)
+ bch_err(trans->c, "snapshot node %u not found", id);
+ if (ret)
+ return ret;
+
+ return !BCH_SNAPSHOT_DELETED(&v);
+}
+
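+/*
+ * Compute equivalence classes: a snapshot node with exactly one live child is
+ * equivalent to that child; otherwise it is its own class:
+ */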
+static int bch2_snapshots_set_equiv(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_snapshot snap;
+ unsigned i;
+ int ret;
+
+ for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k, ret) {
+ u32 id = k.k->p.offset, child[2];
+ unsigned nr_live = 0, live_idx;
+
+ if (k.k->type != KEY_TYPE_snapshot)
+ continue;
+
+ snap = bkey_s_c_to_snapshot(k);
+ child[0] = le32_to_cpu(snap.v->children[0]);
+ child[1] = le32_to_cpu(snap.v->children[1]);
+
+ for (i = 0; i < 2; i++) {
+ ret = snapshot_live(trans, child[i]);
+ if (ret < 0)
+ break;
+
+ if (ret)
+ live_idx = i;
+ nr_live += ret;
+ }
+
+ snapshot_t(c, id)->equiv = nr_live == 1
+ ? snapshot_t(c, child[live_idx])->equiv
+ : id;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret)
+ bch_err(c, "error walking snapshots: %i", ret);
+
+ return ret;
+}
+
+/* fsck: */
+static int bch2_snapshot_check(struct btree_trans *trans,
+ struct bkey_s_c_snapshot s)
+{
+ struct bch_subvolume subvol;
+ struct bch_snapshot v;
+ u32 i, id;
+ int ret;
+
+ id = le32_to_cpu(s.v->subvol);
+	ret = lockrestart_do(trans, bch2_subvolume_get(trans, id, false, 0, &subvol));
+ if (ret == -ENOENT)
+ bch_err(trans->c, "snapshot node %llu has nonexistent subvolume %u",
+ s.k->p.offset, id);
+ if (ret)
+ return ret;
+
+ if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) {
+ bch_err(trans->c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
+ s.k->p.offset);
+ return -EINVAL;
+ }
+
+ id = le32_to_cpu(s.v->parent);
+ if (id) {
+ ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v));
+ if (ret == -ENOENT)
+ bch_err(trans->c, "snapshot node %llu has nonexistent parent %u",
+ s.k->p.offset, id);
+ if (ret)
+ return ret;
+
+ if (le32_to_cpu(v.children[0]) != s.k->p.offset &&
+ le32_to_cpu(v.children[1]) != s.k->p.offset) {
+ bch_err(trans->c, "snapshot parent %u missing pointer to child %llu",
+ id, s.k->p.offset);
+ return -EINVAL;
+ }
+ }
+
+ for (i = 0; i < 2 && s.v->children[i]; i++) {
+ id = le32_to_cpu(s.v->children[i]);
+
+ ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v));
+ if (ret == -ENOENT)
+ bch_err(trans->c, "snapshot node %llu has nonexistent child %u",
+ s.k->p.offset, id);
+ if (ret)
+ return ret;
+
+ if (le32_to_cpu(v.parent) != s.k->p.offset) {
+ bch_err(trans->c, "snapshot child %u has wrong parent (got %u should be %llu)",
+ id, le32_to_cpu(v.parent), s.k->p.offset);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+int bch2_fs_snapshots_check(struct bch_fs *c)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bch_snapshot s;
+ unsigned id;
+ int ret;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k, ret) {
+ if (k.k->type != KEY_TYPE_snapshot)
+ continue;
+
+ ret = bch2_snapshot_check(&trans, bkey_s_c_to_snapshot(k));
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+
+ if (ret) {
+ bch_err(c, "error %i checking snapshots", ret);
+ goto err;
+ }
+
+ for_each_btree_key(&trans, iter, BTREE_ID_subvolumes,
+ POS_MIN, 0, k, ret) {
+ if (k.k->type != KEY_TYPE_subvolume)
+ continue;
+again_2:
+ id = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot);
+ ret = snapshot_lookup(&trans, id, &s);
+
+ if (ret == -EINTR) {
+ k = bch2_btree_iter_peek(&iter);
+ goto again_2;
+ } else if (ret == -ENOENT)
+ bch_err(c, "subvolume %llu points to nonexistent snapshot %u",
+ k.k->p.offset, id);
+ else if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+err:
+ bch2_trans_exit(&trans);
+ return ret;
+}
+
+void bch2_fs_snapshots_exit(struct bch_fs *c)
+{
+ genradix_free(&c->snapshots);
+}
+
+int bch2_fs_snapshots_start(struct bch_fs *c)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ bool have_deleted = false;
+ int ret = 0;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k, ret) {
+ if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0)
+ break;
+
+ if (k.k->type != KEY_TYPE_snapshot) {
+ bch_err(c, "found wrong key type %u in snapshot node table",
+ k.k->type);
+ continue;
+ }
+
+ if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v))
+ have_deleted = true;
+
+ ret = bch2_mark_snapshot(&trans, bkey_s_c_null, k, 0);
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+
+ if (ret)
+ goto err;
+
+ ret = bch2_snapshots_set_equiv(&trans);
+ if (ret)
+ goto err;
+err:
+ bch2_trans_exit(&trans);
+
+ if (!ret && have_deleted) {
+ bch_info(c, "restarting deletion of dead snapshots");
+ if (c->opts.fsck) {
+ bch2_delete_dead_snapshots_work(&c->snapshot_delete_work);
+ } else {
+ bch2_delete_dead_snapshots(c);
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * Mark a snapshot as deleted, for future cleanup:
+ */
+static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i_snapshot *s;
+ int ret = 0;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id),
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_snapshot) {
+ bch2_fs_inconsistent(trans->c, "missing snapshot %u", id);
+ ret = -ENOENT;
+ goto err;
+ }
+
+ /* already deleted? */
+ if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v))
+ goto err;
+
+ s = bch2_trans_kmalloc(trans, sizeof(*s));
+ ret = PTR_ERR_OR_ZERO(s);
+ if (ret)
+ goto err;
+
+ bkey_reassemble(&s->k_i, k);
+
+ SET_BCH_SNAPSHOT_DELETED(&s->v, true);
+ ret = bch2_trans_update(trans, &iter, &s->k_i, 0);
+ if (ret)
+ goto err;
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
+{
+ struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
+ struct bkey_s_c k;
+ struct bkey_s_c_snapshot s;
+ struct bkey_i_snapshot *parent;
+ u32 parent_id;
+ unsigned i;
+ int ret = 0;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id),
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_snapshot) {
+ bch2_fs_inconsistent(trans->c, "missing snapshot %u", id);
+ ret = -ENOENT;
+ goto err;
+ }
+
+ s = bkey_s_c_to_snapshot(k);
+
+ BUG_ON(!BCH_SNAPSHOT_DELETED(s.v));
+ parent_id = le32_to_cpu(s.v->parent);
+
+ if (parent_id) {
+ bch2_trans_iter_init(trans, &p_iter, BTREE_ID_snapshots,
+ POS(0, parent_id),
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&p_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_snapshot) {
+ bch2_fs_inconsistent(trans->c, "missing snapshot %u", parent_id);
+ ret = -ENOENT;
+ goto err;
+ }
+
+ parent = bch2_trans_kmalloc(trans, sizeof(*parent));
+ ret = PTR_ERR_OR_ZERO(parent);
+ if (ret)
+ goto err;
+
+ bkey_reassemble(&parent->k_i, k);
+
+ for (i = 0; i < 2; i++)
+ if (le32_to_cpu(parent->v.children[i]) == id)
+ break;
+
+ if (i == 2)
+ bch_err(trans->c, "snapshot %u missing child pointer to %u",
+ parent_id, id);
+ else
+ parent->v.children[i] = 0;
+
+ if (le32_to_cpu(parent->v.children[0]) <
+ le32_to_cpu(parent->v.children[1]))
+ swap(parent->v.children[0],
+ parent->v.children[1]);
+
+ ret = bch2_trans_update(trans, &p_iter, &parent->k_i, 0);
+ if (ret)
+ goto err;
+ }
+
+ ret = bch2_btree_delete_at(trans, &iter, 0);
+err:
+ bch2_trans_iter_exit(trans, &p_iter);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
+ u32 *new_snapids,
+ u32 *snapshot_subvols,
+ unsigned nr_snapids)
+{
+ struct btree_iter iter;
+ struct bkey_i_snapshot *n;
+ struct bkey_s_c k;
+ unsigned i;
+ int ret = 0;
+
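+	/*
+	 * Allocate new snapshot IDs by scanning backwards from the lowest
+	 * existing node for empty slots:
+	 */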
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
+ POS_MIN, BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ for (i = 0; i < nr_snapids; i++) {
+ k = bch2_btree_iter_prev_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (!k.k || !k.k->p.offset) {
+ ret = -ENOSPC;
+ goto err;
+ }
+
+ n = bch2_trans_kmalloc(trans, sizeof(*n));
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ goto err;
+
+ bkey_snapshot_init(&n->k_i);
+ n->k.p = iter.pos;
+ n->v.flags = 0;
+ n->v.parent = cpu_to_le32(parent);
+ n->v.subvol = cpu_to_le32(snapshot_subvols[i]);
+ n->v.pad = 0;
+ SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
+
+ ret = bch2_trans_update(trans, &iter, &n->k_i, 0) ?:
+ bch2_mark_snapshot(trans, bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
+ if (ret)
+ goto err;
+
+ new_snapids[i] = iter.pos.offset;
+ }
+
+ if (parent) {
+ bch2_btree_iter_set_pos(&iter, POS(0, parent));
+ k = bch2_btree_iter_peek(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_snapshot) {
+ bch_err(trans->c, "snapshot %u not found", parent);
+ ret = -ENOENT;
+ goto err;
+ }
+
+ n = bch2_trans_kmalloc(trans, sizeof(*n));
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ goto err;
+
+ bkey_reassemble(&n->k_i, k);
+
+ if (n->v.children[0] || n->v.children[1]) {
+ bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ n->v.children[0] = cpu_to_le32(new_snapids[0]);
+ n->v.children[1] = cpu_to_le32(new_snapids[1]);
+ SET_BCH_SNAPSHOT_SUBVOL(&n->v, false);
+ ret = bch2_trans_update(trans, &iter, &n->k_i, 0);
+ if (ret)
+ goto err;
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int snapshot_id_add(struct snapshot_id_list *s, u32 id)
+{
+ BUG_ON(snapshot_list_has_id(s, id));
+
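+	/* grow the array geometrically when full: */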
+ if (s->nr == s->size) {
+ size_t new_size = max(8U, s->size * 2);
+ void *n = krealloc(s->d,
+ new_size * sizeof(s->d[0]),
+ GFP_KERNEL);
+ if (!n) {
+ pr_err("error allocating snapshot ID list");
+ return -ENOMEM;
+ }
+
+ s->d = n;
+ s->size = new_size;
+	}
+
+ s->d[s->nr++] = id;
+ return 0;
+}
+
+static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans,
+ struct snapshot_id_list *deleted,
+ enum btree_id btree_id)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct snapshot_id_list equiv_seen = { 0 };
+ struct bpos last_pos = POS_MIN;
+ int ret = 0;
+
+ /*
+ * XXX: We should also delete whiteouts that no longer overwrite
+ * anything
+ */
+
+ bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_NOT_EXTENTS|
+ BTREE_ITER_ALL_SNAPSHOTS);
+
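+	/*
+	 * A key is redundant - and can be deleted - if its snapshot ID was
+	 * deleted, or if another key at the same position was already seen in
+	 * the same equivalence class:
+	 */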
+ while ((bch2_trans_begin(trans),
+ (k = bch2_btree_iter_peek(&iter)).k) &&
+ !(ret = bkey_err(k))) {
+ u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv;
+
+ if (bkey_cmp(k.k->p, last_pos))
+ equiv_seen.nr = 0;
+ last_pos = k.k->p;
+
+ if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
+ snapshot_list_has_id(&equiv_seen, equiv)) {
+ if (btree_id == BTREE_ID_inodes &&
+ bch2_btree_key_cache_flush(trans, btree_id, iter.pos))
+ continue;
+
+ ret = __bch2_trans_do(trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL,
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_delete_at(trans, &iter,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
+ if (ret)
+ break;
+ } else {
+ ret = snapshot_id_add(&equiv_seen, equiv);
+ if (ret)
+ break;
+ }
+
+ bch2_btree_iter_advance(&iter);
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ kfree(equiv_seen.d);
+
+ return ret;
+}
+
+static void bch2_delete_dead_snapshots_work(struct work_struct *work)
+{
+ struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_snapshot snap;
+ struct snapshot_id_list deleted = { 0 };
+ u32 i, id, children[2];
+ int ret = 0;
+
+ bch2_trans_init(&trans, c, 0, 0);
+
+ /*
+	 * For every snapshot node: if it has no live children and it's not
+ * pointed to by a subvolume, delete it:
+ */
+ for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k, ret) {
+ if (k.k->type != KEY_TYPE_snapshot)
+ continue;
+
+ snap = bkey_s_c_to_snapshot(k);
+ if (BCH_SNAPSHOT_DELETED(snap.v) ||
+ BCH_SNAPSHOT_SUBVOL(snap.v))
+ continue;
+
+ children[0] = le32_to_cpu(snap.v->children[0]);
+ children[1] = le32_to_cpu(snap.v->children[1]);
+
+ ret = snapshot_live(&trans, children[0]) ?:
+ snapshot_live(&trans, children[1]);
+ if (ret < 0)
+ break;
+ if (ret)
+ continue;
+
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ bch2_snapshot_node_set_deleted(&trans, iter.pos.offset));
+ if (ret) {
+ bch_err(c, "error deleting snapshot %llu: %i", iter.pos.offset, ret);
+ break;
+ }
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+
+ if (ret) {
+ bch_err(c, "error walking snapshots: %i", ret);
+ goto err;
+ }
+
+ ret = bch2_snapshots_set_equiv(&trans);
+ if (ret)
+ goto err;
+
+ for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k, ret) {
+ if (k.k->type != KEY_TYPE_snapshot)
+ continue;
+
+ snap = bkey_s_c_to_snapshot(k);
+ if (BCH_SNAPSHOT_DELETED(snap.v)) {
+ ret = snapshot_id_add(&deleted, k.k->p.offset);
+ if (ret)
+ break;
+ }
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+
+ if (ret) {
+ bch_err(c, "error walking snapshots: %i", ret);
+ goto err;
+ }
+
+ for (id = 0; id < BTREE_ID_NR; id++) {
+ if (!btree_type_has_snapshots(id))
+ continue;
+
+ ret = bch2_snapshot_delete_keys_btree(&trans, &deleted, id);
+ if (ret) {
+ bch_err(c, "error deleting snapshot keys: %i", ret);
+ goto err;
+ }
+ }
+
+ for (i = 0; i < deleted.nr; i++) {
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ bch2_snapshot_node_delete(&trans, deleted.d[i]));
+ if (ret) {
+ bch_err(c, "error deleting snapshot %u: %i",
+ deleted.d[i], ret);
+ goto err;
+ }
+ }
+err:
+ kfree(deleted.d);
+ bch2_trans_exit(&trans);
+ percpu_ref_put(&c->writes);
+}
+
+static void bch2_delete_dead_snapshots(struct bch_fs *c)
+{
+ if (unlikely(!percpu_ref_tryget(&c->writes)))
+ return;
+
+ if (!queue_work(system_long_wq, &c->snapshot_delete_work))
+ percpu_ref_put(&c->writes);
+}
+
+static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans,
+ struct btree_trans_commit_hook *h)
+{
+ bch2_delete_dead_snapshots(trans->c);
+ return 0;
+}
+
+/* Subvolumes: */
+
+const char *bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+ if (bkey_cmp(k.k->p, SUBVOL_POS_MIN) < 0)
+ return "invalid pos";
+
+ if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0)
+ return "invalid pos";
+
+ if (bkey_val_bytes(k.k) != sizeof(struct bch_subvolume))
+ return "bad val size";
+
+ return NULL;
+}
+
+void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
+
+ pr_buf(out, "root %llu snapshot id %u",
+ le64_to_cpu(s.v->inode),
+ le32_to_cpu(s.v->snapshot));
+}
+
+int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol,
+ bool inconsistent_if_not_found,
+ int iter_flags,
+ struct bch_subvolume *s)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, POS(0, subvol),
+ iter_flags);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k) ?: k.k->type == KEY_TYPE_subvolume ? 0 : -ENOENT;
+
+ if (ret == -ENOENT && inconsistent_if_not_found)
+ bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvol);
+ if (!ret)
+ *s = *bkey_s_c_to_subvolume(k).v;
+
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot,
+ struct bch_subvolume *subvol)
+{
+ struct bch_snapshot snap;
+
+ return snapshot_lookup(trans, snapshot, &snap) ?:
+ bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol);
+}
+
+int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol,
+ u32 *snapid)
+{
+ struct bch_subvolume s;
+ int ret;
+
+ ret = bch2_subvolume_get(trans, subvol, true,
+ BTREE_ITER_CACHED|
+ BTREE_ITER_WITH_UPDATES,
+ &s);
+
+ *snapid = le32_to_cpu(s.snapshot);
+ return ret;
+}
+
+/*
+ * Delete subvolume, mark snapshot ID as deleted, queue up snapshot
+ * deletion/cleanup:
+ */
+int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_s_c_subvolume subvol;
+ struct btree_trans_commit_hook *h;
+ struct bkey_i *delete;
+ u32 snapid;
+ int ret = 0;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes,
+ POS(0, subvolid),
+ BTREE_ITER_CACHED|
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_subvolume) {
+ bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid);
+ ret = -EIO;
+ goto err;
+ }
+
+ subvol = bkey_s_c_to_subvolume(k);
+ snapid = le32_to_cpu(subvol.v->snapshot);
+
+ delete = bch2_trans_kmalloc(trans, sizeof(*delete));
+ ret = PTR_ERR_OR_ZERO(delete);
+ if (ret)
+ goto err;
+
+ bkey_init(&delete->k);
+ delete->k.p = iter.pos;
+ ret = bch2_trans_update(trans, &iter, delete, 0);
+ if (ret)
+ goto err;
+
+ ret = bch2_snapshot_node_set_deleted(trans, snapid);
+
+ h = bch2_trans_kmalloc(trans, sizeof(*h));
+ ret = PTR_ERR_OR_ZERO(h);
+ if (ret)
+ goto err;
+
+ h->fn = bch2_delete_dead_snapshots_hook;
+ bch2_trans_commit_hook(trans, h);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
+{
+ struct bch_fs *c = container_of(work, struct bch_fs,
+ snapshot_wait_for_pagecache_and_delete_work);
+ struct snapshot_id_list s;
+ u32 *id;
+ int ret = 0;
+
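+	/*
+	 * Grab the current list of unlinked subvolumes under the lock, then
+	 * process it unlocked; new entries may be added concurrently, so loop
+	 * until the list comes back empty:
+	 */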
+ while (!ret) {
+ mutex_lock(&c->snapshots_unlinked_lock);
+ s = c->snapshots_unlinked;
+ memset(&c->snapshots_unlinked, 0, sizeof(c->snapshots_unlinked));
+ mutex_unlock(&c->snapshots_unlinked_lock);
+
+ if (!s.nr)
+ break;
+
+ bch2_evict_subvolume_inodes(c, &s);
+
+ for (id = s.d; id < s.d + s.nr; id++) {
+ ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_subvolume_delete(&trans, *id));
+ if (ret) {
+ bch_err(c, "error %i deleting subvolume %u", ret, *id);
+ break;
+ }
+ }
+
+ kfree(s.d);
+ }
+
+ percpu_ref_put(&c->writes);
+}
+
+struct subvolume_unlink_hook {
+ struct btree_trans_commit_hook h;
+ u32 subvol;
+};
+
+int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans,
+ struct btree_trans_commit_hook *_h)
+{
+ struct subvolume_unlink_hook *h = container_of(_h, struct subvolume_unlink_hook, h);
+ struct bch_fs *c = trans->c;
+ int ret = 0;
+
+ mutex_lock(&c->snapshots_unlinked_lock);
+ if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol))
+ ret = snapshot_id_add(&c->snapshots_unlinked, h->subvol);
+ mutex_unlock(&c->snapshots_unlinked_lock);
+
+ if (ret)
+ return ret;
+
+ if (unlikely(!percpu_ref_tryget(&c->writes)))
+ return -EROFS;
+
+ if (!queue_work(system_long_wq, &c->snapshot_wait_for_pagecache_and_delete_work))
+ percpu_ref_put(&c->writes);
+ return 0;
+}
+
+int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i_subvolume *n;
+ struct subvolume_unlink_hook *h;
+ int ret = 0;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes,
+ POS(0, subvolid),
+ BTREE_ITER_CACHED|
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_subvolume) {
+ bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid);
+ ret = -EIO;
+ goto err;
+ }
+
+ n = bch2_trans_kmalloc(trans, sizeof(*n));
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ goto err;
+
+ bkey_reassemble(&n->k_i, k);
+ SET_BCH_SUBVOLUME_UNLINKED(&n->v, true);
+
+ ret = bch2_trans_update(trans, &iter, &n->k_i, 0);
+ if (ret)
+ goto err;
+
+ h = bch2_trans_kmalloc(trans, sizeof(*h));
+ ret = PTR_ERR_OR_ZERO(h);
+ if (ret)
+ goto err;
+
+ h->h.fn = bch2_subvolume_wait_for_pagecache_and_delete_hook;
+ h->subvol = subvolid;
+ bch2_trans_commit_hook(trans, &h->h);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
+ u32 src_subvolid,
+ u32 *new_subvolid,
+ u32 *new_snapshotid,
+ bool ro)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL };
+ struct bkey_i_subvolume *new_subvol = NULL;
+ struct bkey_i_subvolume *src_subvol = NULL;
+ struct bkey_s_c k;
+ u32 parent = 0, new_nodes[2], snapshot_subvols[2];
+ int ret = 0;
+
+ for_each_btree_key(trans, dst_iter, BTREE_ID_subvolumes, SUBVOL_POS_MIN,
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
+ if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0)
+ break;
+
+		/*
+		 * bch2_subvolume_delete() doesn't flush the btree key cache -
+		 * ideally it would, but that's tricky - so check that a slot
+		 * that looks deleted isn't shadowed by a key cache entry:
+		 */
+ if (bkey_deleted(k.k) &&
+ !bch2_btree_key_cache_find(c, BTREE_ID_subvolumes, dst_iter.pos))
+ goto found_slot;
+ }
+
+ if (!ret)
+ ret = -ENOSPC;
+ goto err;
+found_slot:
+ snapshot_subvols[0] = dst_iter.pos.offset;
+ snapshot_subvols[1] = src_subvolid;
+
+ if (src_subvolid) {
+ /* Creating a snapshot: */
+ src_subvol = bch2_trans_kmalloc(trans, sizeof(*src_subvol));
+ ret = PTR_ERR_OR_ZERO(src_subvol);
+ if (ret)
+ goto err;
+
+ bch2_trans_iter_init(trans, &src_iter, BTREE_ID_subvolumes,
+ POS(0, src_subvolid),
+ BTREE_ITER_CACHED|
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&src_iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_subvolume) {
+ bch_err(c, "subvolume %u not found", src_subvolid);
+ ret = -ENOENT;
+ goto err;
+ }
+
+ bkey_reassemble(&src_subvol->k_i, k);
+ parent = le32_to_cpu(src_subvol->v.snapshot);
+ }
+
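+	/*
+	 * When snapshotting, create two new snapshot nodes: new_nodes[0] for
+	 * the new subvolume and new_nodes[1] for the source subvolume to move
+	 * to, so both end up as fresh leaves under the old node:
+	 */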
+ ret = bch2_snapshot_node_create(trans, parent, new_nodes,
+ snapshot_subvols,
+ src_subvolid ? 2 : 1);
+ if (ret)
+ goto err;
+
+ if (src_subvolid) {
+ src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]);
+ ret = bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0);
+ if (ret)
+ goto err;
+ }
+
+ new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
+ ret = PTR_ERR_OR_ZERO(new_subvol);
+ if (ret)
+ goto err;
+
+ bkey_subvolume_init(&new_subvol->k_i);
+ new_subvol->v.flags = 0;
+ new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]);
+ new_subvol->v.inode = cpu_to_le64(inode);
+ SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro);
+ SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0);
+ new_subvol->k.p = dst_iter.pos;
+ ret = bch2_trans_update(trans, &dst_iter, &new_subvol->k_i, 0);
+ if (ret)
+ goto err;
+
+ *new_subvolid = new_subvol->k.p.offset;
+ *new_snapshotid = new_nodes[0];
+err:
+ bch2_trans_iter_exit(trans, &src_iter);
+ bch2_trans_iter_exit(trans, &dst_iter);
+ return ret;
+}
+
+int bch2_fs_subvolumes_init(struct bch_fs *c)
+{
+ INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work);
+ INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work,
+ bch2_subvolume_wait_for_pagecache_and_delete);
+ mutex_init(&c->snapshots_unlinked_lock);
+ return 0;
+}
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUBVOLUME_H
+#define _BCACHEFS_SUBVOLUME_H
+
+#include "subvolume_types.h"
+
+void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+const char *bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_snapshot (struct bkey_ops) { \
+ .key_invalid = bch2_snapshot_invalid, \
+ .val_to_text = bch2_snapshot_to_text, \
+}
+
+int bch2_mark_snapshot(struct btree_trans *, struct bkey_s_c,
+ struct bkey_s_c, unsigned);
+
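+/*
+ * Snapshot IDs are allocated descending from U32_MAX, so the genradix is
+ * indexed with U32_MAX - id to keep it dense:
+ */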
+static inline struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
+{
+ return genradix_ptr(&c->snapshots, U32_MAX - id);
+}
+
+static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id)
+{
+ return snapshot_t(c, id)->parent;
+}
+
+static inline u32 bch2_snapshot_internal_node(struct bch_fs *c, u32 id)
+{
+ struct snapshot_t *s = snapshot_t(c, id);
+
+ return s->children[0] || s->children[1];
+}
+
+static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id)
+{
+ struct snapshot_t *s;
+ u32 parent = bch2_snapshot_parent(c, id);
+
+ if (!parent)
+ return 0;
+
+ s = snapshot_t(c, bch2_snapshot_parent(c, id));
+ if (id == s->children[0])
+ return s->children[1];
+ if (id == s->children[1])
+ return s->children[0];
+ return 0;
+}
+
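+/*
+ * Children always have lower IDs than their ancestors, so we can walk up the
+ * parent pointers until id is no longer below ancestor:
+ */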
+static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
+{
+ while (id && id < ancestor)
+ id = bch2_snapshot_parent(c, id);
+
+ return id == ancestor;
+}
+
+struct snapshots_seen {
+ struct bpos pos;
+ size_t nr;
+ size_t size;
+ u32 *d;
+};
+
+static inline void snapshots_seen_exit(struct snapshots_seen *s)
+{
+ kfree(s->d);
+ s->d = NULL;
+}
+
+static inline void snapshots_seen_init(struct snapshots_seen *s)
+{
+ memset(s, 0, sizeof(*s));
+}
+
+static inline int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id)
+{
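+	/* grow geometrically, with a minimum allocation of 256 entries: */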
+ if (s->nr == s->size) {
+ size_t new_size = max(s->size, (size_t) 128) * 2;
+ u32 *d = krealloc(s->d, new_size * sizeof(s->d[0]), GFP_KERNEL);
+
+ if (!d) {
+ bch_err(c, "error reallocating snapshots_seen table (new size %zu)",
+ new_size);
+ return -ENOMEM;
+ }
+
+ s->size = new_size;
+ s->d = d;
+ }
+
+ s->d[s->nr++] = id;
+ return 0;
+}
+
+static inline bool snapshot_list_has_id(struct snapshot_id_list *s, u32 id)
+{
+ unsigned i;
+
+ for (i = 0; i < s->nr; i++)
+ if (id == s->d[i])
+ return true;
+ return false;
+}
+
+int bch2_fs_snapshots_check(struct bch_fs *);
+void bch2_fs_snapshots_exit(struct bch_fs *);
+int bch2_fs_snapshots_start(struct bch_fs *);
+
+const char *bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_subvolume (struct bkey_ops) { \
+ .key_invalid = bch2_subvolume_invalid, \
+ .val_to_text = bch2_subvolume_to_text, \
+}
+
+int bch2_subvolume_get(struct btree_trans *, unsigned,
+ bool, int, struct bch_subvolume *);
+int bch2_snapshot_get_subvol(struct btree_trans *, u32,
+ struct bch_subvolume *);
+int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
+
+/* only exported for tests: */
+int bch2_snapshot_node_create(struct btree_trans *, u32,
+ u32 *, u32 *, unsigned);
+
+int bch2_subvolume_delete(struct btree_trans *, u32);
+int bch2_subvolume_unlink(struct btree_trans *, u32);
+int bch2_subvolume_create(struct btree_trans *, u64, u32,
+ u32 *, u32 *, bool);
+
+int bch2_fs_subvolumes_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_SUBVOLUME_H */
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUBVOLUME_TYPES_H
+#define _BCACHEFS_SUBVOLUME_TYPES_H
+
+struct snapshot_id_list {
+ u32 nr;
+ u32 size;
+ u32 *d;
+};
+
+#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */
NULL
};
-static const char *bch2_sb_field_validate(struct bch_sb *,
- struct bch_sb_field *);
+static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *,
+ struct printbuf *);
struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb,
enum bch_sb_field_type type)
BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512);
}
-static const char *validate_sb_layout(struct bch_sb_layout *layout)
+static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out)
{
u64 offset, prev_offset, max_sectors;
unsigned i;
- if (uuid_le_cmp(layout->magic, BCACHE_MAGIC))
- return "Not a bcachefs superblock layout";
+ if (uuid_le_cmp(layout->magic, BCACHE_MAGIC)) {
+ pr_buf(out, "Not a bcachefs superblock layout");
+ return -EINVAL;
+ }
- if (layout->layout_type != 0)
- return "Invalid superblock layout type";
+ if (layout->layout_type != 0) {
+ pr_buf(out, "Invalid superblock layout type %u",
+ layout->layout_type);
+ return -EINVAL;
+ }
- if (!layout->nr_superblocks)
- return "Invalid superblock layout: no superblocks";
+ if (!layout->nr_superblocks) {
+ pr_buf(out, "Invalid superblock layout: no superblocks");
+ return -EINVAL;
+ }
- if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset))
- return "Invalid superblock layout: too many superblocks";
+ if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) {
+ pr_buf(out, "Invalid superblock layout: too many superblocks");
+ return -EINVAL;
+ }
max_sectors = 1 << layout->sb_max_size_bits;
for (i = 1; i < layout->nr_superblocks; i++) {
offset = le64_to_cpu(layout->sb_offset[i]);
- if (offset < prev_offset + max_sectors)
- return "Invalid superblock layout: superblocks overlap";
+ if (offset < prev_offset + max_sectors) {
+ pr_buf(out, "Invalid superblock layout: superblocks overlap\n"
+ " (sb %u ends at %llu next starts at %llu",
+ i - 1, prev_offset + max_sectors, offset);
+ return -EINVAL;
+ }
prev_offset = offset;
}
- return NULL;
+ return 0;
}
-const char *bch2_sb_validate(struct bch_sb_handle *disk_sb)
+static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out)
{
struct bch_sb *sb = disk_sb->sb;
struct bch_sb_field *f;
struct bch_sb_field_members *mi;
- const char *err;
u32 version, version_min;
u16 block_size;
+ int ret;
version = le16_to_cpu(sb->version);
version_min = version >= bcachefs_metadata_version_new_versioning
? le16_to_cpu(sb->version_min)
: version;
- if (version >= bcachefs_metadata_version_max ||
- version_min < bcachefs_metadata_version_min)
- return "Unsupported superblock version";
+ if (version >= bcachefs_metadata_version_max) {
+ pr_buf(out, "Unsupported superblock version %u (min %u, max %u)",
+ version, bcachefs_metadata_version_min, bcachefs_metadata_version_max);
+ return -EINVAL;
+ }
+
+ if (version_min < bcachefs_metadata_version_min) {
+ pr_buf(out, "Unsupported superblock version %u (min %u, max %u)",
+ version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max);
+ return -EINVAL;
+ }
- if (version_min > version)
- return "Bad minimum version";
+ if (version_min > version) {
+ pr_buf(out, "Bad minimum version %u, greater than version field %u",
+ version_min, version);
+ return -EINVAL;
+ }
if (sb->features[1] ||
- (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR)))
- return "Filesystem has incompatible features";
+ (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) {
+ pr_buf(out, "Filesystem has incompatible features");
+ return -EINVAL;
+ }
block_size = le16_to_cpu(sb->block_size);
- if (!is_power_of_2(block_size) ||
- block_size > PAGE_SECTORS)
- return "Bad block size";
+ if (block_size > PAGE_SECTORS) {
+ pr_buf(out, "Block size too big (got %u, max %u)",
+ block_size, PAGE_SECTORS);
+ return -EINVAL;
+ }
- if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le)))
- return "Bad user UUID";
+ if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le))) {
+ pr_buf(out, "Bad user UUID (got zeroes)");
+ return -EINVAL;
+ }
- if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le)))
- return "Bad internal UUID";
+ if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le))) {
+ pr_buf(out, "Bad intenal UUID (got zeroes)");
+ return -EINVAL;
+ }
if (!sb->nr_devices ||
- sb->nr_devices <= sb->dev_idx ||
- sb->nr_devices > BCH_SB_MEMBERS_MAX)
- return "Bad number of member devices";
-
- if (!BCH_SB_META_REPLICAS_WANT(sb) ||
- BCH_SB_META_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX)
- return "Invalid number of metadata replicas";
-
- if (!BCH_SB_META_REPLICAS_REQ(sb) ||
- BCH_SB_META_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX)
- return "Invalid number of metadata replicas";
-
- if (!BCH_SB_DATA_REPLICAS_WANT(sb) ||
- BCH_SB_DATA_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX)
- return "Invalid number of data replicas";
-
- if (!BCH_SB_DATA_REPLICAS_REQ(sb) ||
- BCH_SB_DATA_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX)
- return "Invalid number of data replicas";
-
- if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
- return "Invalid metadata checksum type";
-
- if (BCH_SB_DATA_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
- return "Invalid metadata checksum type";
-
- if (BCH_SB_COMPRESSION_TYPE(sb) >= BCH_COMPRESSION_OPT_NR)
- return "Invalid compression type";
-
- if (!BCH_SB_BTREE_NODE_SIZE(sb))
- return "Btree node size not set";
-
- if (!is_power_of_2(BCH_SB_BTREE_NODE_SIZE(sb)))
- return "Btree node size not a power of two";
+ sb->nr_devices > BCH_SB_MEMBERS_MAX) {
+ pr_buf(out, "Bad number of member devices %u (max %u)",
+ sb->nr_devices, BCH_SB_MEMBERS_MAX);
+ return -EINVAL;
+ }
- if (BCH_SB_GC_RESERVE(sb) < 5)
- return "gc reserve percentage too small";
+ if (sb->dev_idx >= sb->nr_devices) {
+ pr_buf(out, "Bad dev_idx (got %u, nr_devices %u)",
+ sb->dev_idx, sb->nr_devices);
+ return -EINVAL;
+ }
if (!sb->time_precision ||
- le32_to_cpu(sb->time_precision) > NSEC_PER_SEC)
- return "invalid time precision";
+ le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) {
+ pr_buf(out, "Invalid time precision: %u (min 1, max %lu)",
+ le32_to_cpu(sb->time_precision), NSEC_PER_SEC);
+ return -EINVAL;
+ }
/* validate layout */
- err = validate_sb_layout(&sb->layout);
- if (err)
- return err;
+ ret = validate_sb_layout(&sb->layout, out);
+ if (ret)
+ return ret;
vstruct_for_each(sb, f) {
- if (!f->u64s)
- return "Invalid superblock: invalid optional field";
+ if (!f->u64s) {
+ pr_buf(out, "Invalid superblock: optional with size 0 (type %u)",
+ le32_to_cpu(f->type));
+ return -EINVAL;
+ }
- if (vstruct_next(f) > vstruct_last(sb))
- return "Invalid superblock: invalid optional field";
+ if (vstruct_next(f) > vstruct_last(sb)) {
+ pr_buf(out, "Invalid superblock: optional field extends past end of superblock (type %u)",
+ le32_to_cpu(f->type));
+ return -EINVAL;
+ }
}
/* members must be validated first: */
mi = bch2_sb_get_members(sb);
- if (!mi)
- return "Invalid superblock: member info area missing";
+ if (!mi) {
+ pr_buf(out, "Invalid superblock: member info area missing");
+ return -EINVAL;
+ }
- err = bch2_sb_field_validate(sb, &mi->field);
- if (err)
- return err;
+ ret = bch2_sb_field_validate(sb, &mi->field, out);
+ if (ret)
+ return ret;
vstruct_for_each(sb, f) {
if (le32_to_cpu(f->type) == BCH_SB_FIELD_members)
continue;
- err = bch2_sb_field_validate(sb, f);
- if (err)
- return err;
+ ret = bch2_sb_field_validate(sb, f, out);
+ if (ret)
+ return ret;
}
- return NULL;
+ return 0;
}
/* device open: */
c->sb.nr_devices = src->nr_devices;
c->sb.clean = BCH_SB_CLEAN(src);
c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src);
- c->sb.encoded_extent_max= 1 << BCH_SB_ENCODED_EXTENT_MAX_BITS(src);
c->sb.nsec_per_time_unit = le32_to_cpu(src->time_precision);
c->sb.time_units_per_sec = NSEC_PER_SEC / c->sb.nsec_per_time_unit;
__copy_super(&c->disk_sb, src);
- if (BCH_SB_HAS_ERRORS(c->disk_sb.sb))
- set_bit(BCH_FS_ERROR, &c->flags);
- if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb))
- set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags);
+ if (BCH_SB_INITIALIZED(c->disk_sb.sb))
+ set_bit(BCH_FS_INITIALIZED, &c->flags);
ret = bch2_sb_replicas_to_cpu_replicas(c);
if (ret)
/* read superblock: */
-static const char *read_one_super(struct bch_sb_handle *sb, u64 offset)
+static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err)
{
struct bch_csum csum;
+ u32 version, version_min;
size_t bytes;
+ int ret;
reread:
bio_reset(sb->bio);
bio_set_dev(sb->bio, sb->bdev);
bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
bch2_bio_map(sb->bio, sb->sb, sb->buffer_size);
- if (submit_bio_wait(sb->bio))
- return "IO error";
+ ret = submit_bio_wait(sb->bio);
+ if (ret) {
+ pr_buf(err, "IO error: %i", ret);
+ return ret;
+ }
- if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC))
- return "Not a bcachefs superblock";
+ if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) {
+ pr_buf(err, "Not a bcachefs superblock");
+ return -EINVAL;
+ }
- if (le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_min ||
- le16_to_cpu(sb->sb->version) >= bcachefs_metadata_version_max)
- return "Unsupported superblock version";
+ version = le16_to_cpu(sb->sb->version);
+ version_min = version >= bcachefs_metadata_version_new_versioning
+ ? le16_to_cpu(sb->sb->version_min)
+ : version;
+
+ if (version >= bcachefs_metadata_version_max) {
+ pr_buf(err, "Unsupported superblock version %u (min %u, max %u)",
+ version, bcachefs_metadata_version_min, bcachefs_metadata_version_max);
+ return -EINVAL;
+ }
+
+ if (version_min < bcachefs_metadata_version_min) {
+ pr_buf(err, "Unsupported superblock version %u (min %u, max %u)",
+ version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max);
+ return -EINVAL;
+ }
bytes = vstruct_bytes(sb->sb);
- if (bytes > 512 << sb->sb->layout.sb_max_size_bits)
- return "Bad superblock: too big";
+ if (bytes > 512 << sb->sb->layout.sb_max_size_bits) {
+ pr_buf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)",
+ bytes, 512UL << sb->sb->layout.sb_max_size_bits);
+ return -EINVAL;
+ }
if (bytes > sb->buffer_size) {
if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s)))
- return "cannot allocate memory";
+ return -ENOMEM;
goto reread;
}
- if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR)
- return "unknown csum type";
+ if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) {
+ pr_buf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb));
+ return -EINVAL;
+ }
/* XXX: verify MACs */
csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb),
null_nonce(), sb->sb);
- if (bch2_crc_cmp(csum, sb->sb->csum))
- return "bad checksum reading superblock";
+ if (bch2_crc_cmp(csum, sb->sb->csum)) {
+ pr_buf(err, "bad checksum");
+ return -EINVAL;
+ }
sb->seq = le64_to_cpu(sb->sb->seq);
- return NULL;
+ return 0;
}
int bch2_read_super(const char *path, struct bch_opts *opts,
{
u64 offset = opt_get(*opts, sb);
struct bch_sb_layout layout;
- const char *err;
+ char *_err;
+ struct printbuf err;
__le64 *i;
int ret;
+ _err = kmalloc(4096, GFP_KERNEL);
+ if (!_err)
+ return -ENOMEM;
+ err = _PBUF(_err, 4096);
+
pr_verbose_init(*opts, "");
memset(sb, 0, sizeof(*sb));
goto out;
}
- err = "cannot allocate memory";
ret = bch2_sb_realloc(sb, 0);
- if (ret)
+ if (ret) {
+ pr_buf(&err, "error allocating memory for superblock");
goto err;
+ }
- ret = -EFAULT;
- err = "dynamic fault";
- if (bch2_fs_init_fault("read_super"))
+ if (bch2_fs_init_fault("read_super")) {
+ pr_buf(&err, "dynamic fault");
+ ret = -EFAULT;
goto err;
+ }
- ret = -EINVAL;
- err = read_one_super(sb, offset);
- if (!err)
+ ret = read_one_super(sb, offset, &err);
+ if (!ret)
goto got_super;
if (opt_defined(*opts, sb))
goto err;
- pr_err("error reading default superblock: %s", err);
+ printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s",
+ path, _err);
+ err = _PBUF(_err, 4096);
/*
* Error reading primary superblock - read location of backup
*/
bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout));
- err = "IO error";
- if (submit_bio_wait(sb->bio))
+ ret = submit_bio_wait(sb->bio);
+ if (ret) {
+ pr_buf(&err, "IO error: %i", ret);
goto err;
+ }
memcpy(&layout, sb->sb, sizeof(layout));
- err = validate_sb_layout(&layout);
- if (err)
+ ret = validate_sb_layout(&layout, &err);
+ if (ret)
goto err;
for (i = layout.sb_offset;
if (offset == opt_get(*opts, sb))
continue;
- err = read_one_super(sb, offset);
- if (!err)
+ ret = read_one_super(sb, offset, &err);
+ if (!ret)
goto got_super;
}
- ret = -EINVAL;
goto err;
got_super:
- err = "Superblock block size smaller than device block size";
- ret = -EINVAL;
if (le16_to_cpu(sb->sb->block_size) << 9 <
- bdev_logical_block_size(sb->bdev))
+ bdev_logical_block_size(sb->bdev)) {
+ pr_buf(&err, "block size (%u) smaller than device block size (%u)",
+ le16_to_cpu(sb->sb->block_size) << 9,
+ bdev_logical_block_size(sb->bdev));
+ ret = -EINVAL;
goto err;
+ }
ret = 0;
sb->have_layout = true;
+
+ ret = bch2_sb_validate(sb, &err);
+ if (ret) {
+ printk(KERN_ERR "bcachefs (%s): error validating superblock: %s",
+ path, _err);
+ goto err_no_print;
+ }
out:
pr_verbose_init(*opts, "ret %i", ret);
+ kfree(_err);
return ret;
err:
+ printk(KERN_ERR "bcachefs (%s): error reading superblock: %s",
+ path, _err);
+err_no_print:
bch2_free_super(sb);
- pr_err("error reading superblock: %s", err);
goto out;
}
struct closure *cl = &c->sb_write;
struct bch_dev *ca;
unsigned i, sb = 0, nr_wrote;
- const char *err;
struct bch_devs_mask sb_written;
bool wrote, can_mount_without_written, can_mount_with_written;
unsigned degraded_flags = BCH_FORCE_IF_DEGRADED;
bch2_sb_from_fs(c, ca);
for_each_online_member(ca, c, i) {
- err = bch2_sb_validate(&ca->disk_sb);
- if (err) {
- bch2_fs_inconsistent(c, "sb invalid before write: %s", err);
- ret = -1;
+ struct printbuf buf = { NULL, NULL };
+
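+		/*
+		 * Validate into a zero-length printbuf first; only allocate a
+		 * real buffer and re-run validation to format the message if
+		 * it actually fails:
+		 */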
+ ret = bch2_sb_validate(&ca->disk_sb, &buf);
+ if (ret) {
+ char *_buf = kmalloc(4096, GFP_NOFS);
+ if (_buf) {
+ buf = _PBUF(_buf, 4096);
+ bch2_sb_validate(&ca->disk_sb, &buf);
+ }
+
+ bch2_fs_inconsistent(c, "sb invalid before write: %s", _buf);
+ kfree(_buf);
+ percpu_ref_put(&ca->io_ref);
goto out;
}
}
closure_sync(cl);
for_each_online_member(ca, c, i) {
- if (!ca->sb_write_error &&
- ca->disk_sb.seq !=
- le64_to_cpu(ca->sb_read_scratch->seq)) {
+ if (ca->sb_write_error)
+ continue;
+
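+		/*
+		 * The sequence number read back should match what we just
+		 * wrote: lower means our write was silently dropped, higher
+		 * means another process wrote the superblock:
+		 */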
+ if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) {
+ bch2_fs_fatal_error(c,
+ "Superblock write was silently dropped! (seq %llu expected %llu)",
+ le64_to_cpu(ca->sb_read_scratch->seq),
+ ca->disk_sb.seq);
+ percpu_ref_put(&ca->io_ref);
+ ret = -EROFS;
+ goto out;
+ }
+
+ if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) {
bch2_fs_fatal_error(c,
- "Superblock modified by another process");
+ "Superblock modified by another process (seq %llu expected %llu)",
+ le64_to_cpu(ca->sb_read_scratch->seq),
+ ca->disk_sb.seq);
percpu_ref_put(&ca->io_ref);
ret = -EROFS;
goto out;
!can_mount_with_written ||
(can_mount_without_written &&
!can_mount_with_written), c,
- "Unable to write superblock to sufficient devices"))
+ "Unable to write superblock to sufficient devices (from %ps)",
+ (void *) _RET_IP_))
ret = -1;
out:
/* Make new options visible after they're persistent: */
return l < r ? -1 : l > r ? 1 : 0;
}
-static const char *bch2_sb_validate_journal(struct bch_sb *sb,
- struct bch_sb_field *f)
+static int bch2_sb_validate_journal(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
{
struct bch_sb_field_journal *journal = field_to_type(f, journal);
struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx;
- const char *err;
+ int ret = -EINVAL;
unsigned nr;
unsigned i;
u64 *b;
- journal = bch2_sb_get_journal(sb);
- if (!journal)
- return NULL;
-
nr = bch2_nr_journal_buckets(journal);
if (!nr)
- return NULL;
+ return 0;
b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL);
if (!b)
- return "cannot allocate memory";
+ return -ENOMEM;
for (i = 0; i < nr; i++)
b[i] = le64_to_cpu(journal->buckets[i]);
sort(b, nr, sizeof(u64), u64_cmp, NULL);
- err = "journal bucket at sector 0";
- if (!b[0])
+ if (!b[0]) {
+ pr_buf(err, "journal bucket at sector 0");
goto err;
+ }
- err = "journal bucket before first bucket";
- if (m && b[0] < le16_to_cpu(m->first_bucket))
+ if (b[0] < le16_to_cpu(m->first_bucket)) {
+ pr_buf(err, "journal bucket %llu before first bucket %u",
+ b[0], le16_to_cpu(m->first_bucket));
goto err;
+ }
- err = "journal bucket past end of device";
- if (m && b[nr - 1] >= le64_to_cpu(m->nbuckets))
+ if (b[nr - 1] >= le64_to_cpu(m->nbuckets)) {
+ pr_buf(err, "journal bucket %llu past end of device (nbuckets %llu)",
+ b[nr - 1], le64_to_cpu(m->nbuckets));
goto err;
+ }
- err = "duplicate journal buckets";
for (i = 0; i + 1 < nr; i++)
- if (b[i] == b[i + 1])
+ if (b[i] == b[i + 1]) {
+ pr_buf(err, "duplicate journal buckets %llu", b[i]);
goto err;
+ }
- err = NULL;
+ ret = 0;
err:
kfree(b);
- return err;
+ return ret;
}
static const struct bch_sb_field_ops bch_sb_field_ops_journal = {
/* BCH_SB_FIELD_members: */
-static const char *bch2_sb_validate_members(struct bch_sb *sb,
- struct bch_sb_field *f)
+static int bch2_sb_validate_members(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
{
struct bch_sb_field_members *mi = field_to_type(f, members);
- struct bch_member *m;
+ unsigned i;
if ((void *) (mi->members + sb->nr_devices) >
- vstruct_end(&mi->field))
- return "Invalid superblock: bad member info";
+ vstruct_end(&mi->field)) {
+ pr_buf(err, "too many devices for section size");
+ return -EINVAL;
+ }
+
+ for (i = 0; i < sb->nr_devices; i++) {
+ struct bch_member *m = mi->members + i;
- for (m = mi->members;
- m < mi->members + sb->nr_devices;
- m++) {
if (!bch2_member_exists(m))
continue;
- if (le64_to_cpu(m->nbuckets) > LONG_MAX)
- return "Too many buckets";
+ if (le64_to_cpu(m->nbuckets) > LONG_MAX) {
+ pr_buf(err, "device %u: too many buckets (got %llu, max %lu)",
+ i, le64_to_cpu(m->nbuckets), LONG_MAX);
+ return -EINVAL;
+ }
if (le64_to_cpu(m->nbuckets) -
- le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS)
- return "Not enough buckets";
+ le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) {
+ pr_buf(err, "device %u: not enough buckets (got %llu, max %u)",
+ i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS);
+ return -EINVAL;
+ }
if (le16_to_cpu(m->bucket_size) <
- le16_to_cpu(sb->block_size))
- return "bucket size smaller than block size";
+ le16_to_cpu(sb->block_size)) {
+ pr_buf(err, "device %u: bucket size %u smaller than block size %u",
+ i, le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size));
+ return -EINVAL;
+ }
if (le16_to_cpu(m->bucket_size) <
- BCH_SB_BTREE_NODE_SIZE(sb))
- return "bucket size smaller than btree node size";
+ BCH_SB_BTREE_NODE_SIZE(sb)) {
+ pr_buf(err, "device %u: bucket size %u smaller than btree node size %llu",
+ i, le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb));
+ return -EINVAL;
+ }
}
- return NULL;
+ return 0;
}
static const struct bch_sb_field_ops bch_sb_field_ops_members = {
/* BCH_SB_FIELD_crypt: */
-static const char *bch2_sb_validate_crypt(struct bch_sb *sb,
- struct bch_sb_field *f)
+static int bch2_sb_validate_crypt(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
{
struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
- if (vstruct_bytes(&crypt->field) != sizeof(*crypt))
- return "invalid field crypt: wrong size";
+ if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) {
+ pr_buf(err, "wrong size (got %llu should be %zu)",
+ vstruct_bytes(&crypt->field), sizeof(*crypt));
+ return -EINVAL;
+ }
- if (BCH_CRYPT_KDF_TYPE(crypt))
- return "invalid field crypt: bad kdf type";
+ if (BCH_CRYPT_KDF_TYPE(crypt)) {
+ pr_buf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt));
+ return -EINVAL;
+ }
- return NULL;
+ return 0;
}
static const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
struct jset_entry_usage, entry);
u->entry.type = BCH_JSET_ENTRY_usage;
- u->entry.btree_id = FS_USAGE_INODES;
+ u->entry.btree_id = BCH_FS_USAGE_inodes;
u->v = cpu_to_le64(c->usage_base->nr_inodes);
}
struct jset_entry_usage, entry);
u->entry.type = BCH_JSET_ENTRY_usage;
- u->entry.btree_id = FS_USAGE_KEY_VERSION;
+ u->entry.btree_id = BCH_FS_USAGE_key_version;
u->v = cpu_to_le64(atomic64_read(&c->key_version));
}
struct jset_entry_usage, entry);
u->entry.type = BCH_JSET_ENTRY_usage;
- u->entry.btree_id = FS_USAGE_RESERVED;
+ u->entry.btree_id = BCH_FS_USAGE_reserved;
u->entry.level = i;
u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]);
}
mutex_unlock(&c->sb_lock);
}
-static const char *bch2_sb_validate_clean(struct bch_sb *sb,
- struct bch_sb_field *f)
+static int bch2_sb_validate_clean(struct bch_sb *sb,
+ struct bch_sb_field *f,
+ struct printbuf *err)
{
struct bch_sb_field_clean *clean = field_to_type(f, clean);
- if (vstruct_bytes(&clean->field) < sizeof(*clean))
- return "invalid field crypt: wrong size";
+ if (vstruct_bytes(&clean->field) < sizeof(*clean)) {
+ pr_buf(err, "wrong size (got %llu should be %zu)",
+ vstruct_bytes(&clean->field), sizeof(*clean));
+ return -EINVAL;
+ }
- return NULL;
+ return 0;
}
static const struct bch_sb_field_ops bch_sb_field_ops_clean = {
#undef x
};
-static const char *bch2_sb_field_validate(struct bch_sb *sb,
- struct bch_sb_field *f)
+static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ struct printbuf *orig_err)
{
unsigned type = le32_to_cpu(f->type);
+ struct printbuf err = *orig_err;
+ int ret;
- return type < BCH_SB_FIELD_NR
- ? bch2_sb_field_ops[type]->validate(sb, f)
- : NULL;
+ if (type >= BCH_SB_FIELD_NR)
+ return 0;
+
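+	/*
+	 * Print into a local copy of the caller's printbuf, so the section
+	 * name prefix is only committed back if validation actually fails:
+	 */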
+ pr_buf(&err, "Invalid superblock section %s: ", bch2_sb_fields[type]);
+
+ ret = bch2_sb_field_ops[type]->validate(sb, f, &err);
+ if (ret) {
+ pr_buf(&err, "\n");
+ bch2_sb_field_to_text(&err, sb, f);
+ *orig_err = err;
+ }
+
+ return ret;
}
void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
extern const char * const bch2_sb_fields[];
struct bch_sb_field_ops {
- const char * (*validate)(struct bch_sb *, struct bch_sb_field *);
- void (*to_text)(struct printbuf *, struct bch_sb *,
- struct bch_sb_field *);
+ int (*validate)(struct bch_sb *, struct bch_sb_field *, struct printbuf *);
+ void (*to_text)(struct printbuf *, struct bch_sb *, struct bch_sb_field *);
};
static inline __le64 bch2_sb_magic(struct bch_fs *c)
void bch2_free_super(struct bch_sb_handle *);
int bch2_sb_realloc(struct bch_sb_handle *, unsigned);
-const char *bch2_sb_validate(struct bch_sb_handle *);
-
int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *);
int bch2_write_super(struct bch_fs *);
void __bch2_check_set_feature(struct bch_fs *, unsigned);
.bucket_size = le16_to_cpu(mi->bucket_size),
.group = BCH_MEMBER_GROUP(mi),
.state = BCH_MEMBER_STATE(mi),
- .replacement = BCH_MEMBER_REPLACEMENT(mi),
.discard = BCH_MEMBER_DISCARD(mi),
.data_allowed = BCH_MEMBER_DATA_ALLOWED(mi),
.durability = BCH_MEMBER_DURABILITY(mi)
#include "btree_key_cache.h"
#include "btree_update_interior.h"
#include "btree_io.h"
+#include "buckets_waiting_for_journal.h"
#include "chardev.h"
#include "checksum.h"
#include "clock.h"
#include "rebalance.h"
#include "recovery.h"
#include "replicas.h"
+#include "subvolume.h"
#include "super.h"
#include "super-io.h"
#include "sysfs.h"
if (ret)
goto err;
- /*
- * We need to write out a journal entry before we start doing btree
- * updates, to ensure that on unclean shutdown new journal blacklist
- * entries are created:
- */
- bch2_journal_meta(&c->journal);
-
clear_bit(BCH_FS_ALLOC_CLEAN, &c->flags);
for_each_rw_member(ca, c, i)
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_exit(&c->times[i]);
+ bch2_fs_snapshots_exit(c);
bch2_fs_quota_exit(c);
bch2_fs_fsio_exit(c);
bch2_fs_ec_exit(c);
bch2_fs_encryption_exit(c);
bch2_fs_io_exit(c);
+ bch2_fs_buckets_waiting_for_journal_exit(c);
bch2_fs_btree_interior_update_exit(c);
bch2_fs_btree_iter_exit(c);
bch2_fs_btree_key_cache_exit(&c->btree_key_cache);
bch2_journal_entries_free(&c->journal_entries);
percpu_free_rwsem(&c->mark_lock);
- if (c->btree_iters_bufs)
+ if (c->btree_paths_bufs)
for_each_possible_cpu(cpu)
- kfree(per_cpu_ptr(c->btree_iters_bufs, cpu)->iter);
+ kfree(per_cpu_ptr(c->btree_paths_bufs, cpu)->path);
free_percpu(c->online_reserved);
- free_percpu(c->btree_iters_bufs);
+ free_percpu(c->btree_paths_bufs);
free_percpu(c->pcpu);
mempool_exit(&c->large_bkey_pool);
mempool_exit(&c->btree_bounce_pool);
bch2_fs_free(c);
}
-static const char *bch2_fs_online(struct bch_fs *c)
+static int bch2_fs_online(struct bch_fs *c)
{
struct bch_dev *ca;
- const char *err = NULL;
unsigned i;
- int ret;
+ int ret = 0;
lockdep_assert_held(&bch_fs_list_lock);
- if (!list_empty(&c->list))
- return NULL;
-
- if (__bch2_uuid_to_fs(c->sb.uuid))
- return "filesystem UUID already open";
+ if (__bch2_uuid_to_fs(c->sb.uuid)) {
+ bch_err(c, "filesystem UUID already open");
+ return -EINVAL;
+ }
ret = bch2_fs_chardev_init(c);
- if (ret)
- return "error creating character device";
+ if (ret) {
+ bch_err(c, "error creating character device");
+ return ret;
+ }
bch2_fs_debug_init(c);
- if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ||
- kobject_add(&c->internal, &c->kobj, "internal") ||
- kobject_add(&c->opts_dir, &c->kobj, "options") ||
- kobject_add(&c->time_stats, &c->kobj, "time_stats") ||
- bch2_opts_create_sysfs_files(&c->opts_dir))
- return "error creating sysfs objects";
+ ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?:
+ kobject_add(&c->internal, &c->kobj, "internal") ?:
+ kobject_add(&c->opts_dir, &c->kobj, "options") ?:
+ kobject_add(&c->time_stats, &c->kobj, "time_stats") ?:
+ bch2_opts_create_sysfs_files(&c->opts_dir);
+ if (ret) {
+ bch_err(c, "error creating sysfs objects");
+ return ret;
+ }
down_write(&c->state_lock);
- err = "error creating sysfs objects";
- for_each_member_device(ca, c, i)
- if (bch2_dev_sysfs_online(c, ca)) {
+ for_each_member_device(ca, c, i) {
+ ret = bch2_dev_sysfs_online(c, ca);
+ if (ret) {
+ bch_err(c, "error creating sysfs objects");
percpu_ref_put(&ca->ref);
goto err;
}
+ }
+ BUG_ON(!list_empty(&c->list));
list_add(&c->list, &bch_fs_list);
- err = NULL;
err:
up_write(&c->state_lock);
- return err;
+ return ret;
}
static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
struct bch_sb_field_members *mi;
struct bch_fs *c;
unsigned i, iter_size;
- const char *err;
+ int ret = 0;
pr_verbose_init(opts, "");
c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
- if (!c)
+ if (!c) {
+ c = ERR_PTR(-ENOMEM);
goto out;
+ }
__module_get(THIS_MODULE);
INIT_WORK(&c->read_only_work, bch2_fs_read_only_work);
init_rwsem(&c->gc_lock);
+ mutex_init(&c->gc_gens_lock);
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_init(&c->times[i]);
mutex_init(&c->usage_scratch_lock);
mutex_init(&c->bio_bounce_pages_lock);
+ mutex_init(&c->snapshot_table_lock);
spin_lock_init(&c->btree_write_error_lock);
INIT_LIST_HEAD(&c->ec_stripe_new_list);
mutex_init(&c->ec_stripe_new_lock);
+ INIT_LIST_HEAD(&c->data_progress_list);
+ mutex_init(&c->data_progress_lock);
+
spin_lock_init(&c->ec_stripes_heap_lock);
seqcount_init(&c->gc_pos_lock);
c->rebalance.enabled = 1;
c->promote_whole_extents = true;
- c->journal.write_time = &c->times[BCH_TIME_journal_write];
- c->journal.delay_time = &c->times[BCH_TIME_journal_delay];
- c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal];
- c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq];
+ c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write];
+ c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write];
+ c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal];
+ c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq];
bch2_fs_btree_cache_init_early(&c->btree_cache);
mutex_init(&c->sectors_available_lock);
- if (percpu_init_rwsem(&c->mark_lock))
+ ret = percpu_init_rwsem(&c->mark_lock);
+ if (ret)
goto err;
mutex_lock(&c->sb_lock);
+ ret = bch2_sb_to_fs(c, sb);
+ mutex_unlock(&c->sb_lock);
- if (bch2_sb_to_fs(c, sb)) {
- mutex_unlock(&c->sb_lock);
+ if (ret)
goto err;
- }
- mutex_unlock(&c->sb_lock);
+ uuid_unparse_lower(c->sb.user_uuid.b, c->name);
+
+ /* Compat: */
+	if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
+ !BCH_SB_JOURNAL_FLUSH_DELAY(sb))
+ SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000);
- scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid);
+	if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
+ !BCH_SB_JOURNAL_RECLAIM_DELAY(sb))
+ SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100);
c->opts = bch2_opts_default;
- bch2_opts_apply(&c->opts, bch2_opts_from_sb(sb));
+ ret = bch2_opts_from_sb(&c->opts, sb);
+ if (ret)
+ goto err;
+
bch2_opts_apply(&c->opts, opts);
- c->block_bits = ilog2(c->opts.block_size);
+ /* key cache currently disabled for inodes, because of snapshots: */
+ c->opts.inodes_use_key_cache = 0;
+
+ c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc;
+ if (c->opts.inodes_use_key_cache)
+ c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes;
+
+ c->block_bits = ilog2(block_sectors(c));
c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c);
- if (bch2_fs_init_fault("fs_alloc"))
+ if (bch2_fs_init_fault("fs_alloc")) {
+ bch_err(c, "fs_alloc fault injected");
+ ret = -EFAULT;
goto err;
+ }
iter_size = sizeof(struct sort_iter) +
(btree_blocks(c) + 1) * 2 *
offsetof(struct btree_write_bio, wbio.bio)),
BIOSET_NEED_BVECS) ||
!(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
- !(c->btree_iters_bufs = alloc_percpu(struct btree_iter_buf)) ||
+ !(c->btree_paths_bufs = alloc_percpu(struct btree_path_buf)) ||
!(c->online_reserved = alloc_percpu(u64)) ||
mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
btree_bytes(c)) ||
mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
!(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
- sizeof(u64), GFP_KERNEL)) ||
- bch2_io_clock_init(&c->io_clock[READ]) ||
- bch2_io_clock_init(&c->io_clock[WRITE]) ||
- bch2_fs_journal_init(&c->journal) ||
- bch2_fs_replicas_init(c) ||
- bch2_fs_btree_cache_init(c) ||
- bch2_fs_btree_key_cache_init(&c->btree_key_cache) ||
- bch2_fs_btree_iter_init(c) ||
- bch2_fs_btree_interior_update_init(c) ||
- bch2_fs_io_init(c) ||
- bch2_fs_encryption_init(c) ||
- bch2_fs_compress_init(c) ||
- bch2_fs_ec_init(c) ||
- bch2_fs_fsio_init(c))
+ sizeof(u64), GFP_KERNEL))) {
+ ret = -ENOMEM;
goto err;
+ }
+
+ ret = bch2_io_clock_init(&c->io_clock[READ]) ?:
+ bch2_io_clock_init(&c->io_clock[WRITE]) ?:
+ bch2_fs_journal_init(&c->journal) ?:
+ bch2_fs_replicas_init(c) ?:
+ bch2_fs_btree_cache_init(c) ?:
+ bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
+ bch2_fs_btree_iter_init(c) ?:
+ bch2_fs_btree_interior_update_init(c) ?:
+		bch2_fs_buckets_waiting_for_journal_init(c) ?:
+ bch2_fs_subvolumes_init(c) ?:
+ bch2_fs_io_init(c) ?:
+ bch2_fs_encryption_init(c) ?:
+ bch2_fs_compress_init(c) ?:
+ bch2_fs_ec_init(c) ?:
+ bch2_fs_fsio_init(c);
+ if (ret)
+ goto err;
+
+ if (c->opts.nochanges)
+ set_bit(JOURNAL_NOCHANGES, &c->journal.flags);
mi = bch2_sb_get_members(c->disk_sb.sb);
for (i = 0; i < c->sb.nr_devices; i++)
if (bch2_dev_exists(c->disk_sb.sb, mi, i) &&
- bch2_dev_alloc(c, i))
+ bch2_dev_alloc(c, i)) {
+ ret = -EEXIST;
goto err;
+ }
bch2_journal_entry_res_resize(&c->journal,
&c->btree_root_journal_res,
(sizeof(struct jset_entry_clock) / sizeof(u64)) * 2);
mutex_lock(&bch_fs_list_lock);
- err = bch2_fs_online(c);
+ ret = bch2_fs_online(c);
mutex_unlock(&bch_fs_list_lock);
- if (err) {
- bch_err(c, "bch2_fs_online() error: %s", err);
+
+ if (ret)
goto err;
- }
out:
- pr_verbose_init(opts, "ret %i", c ? 0 : -ENOMEM);
+ pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c));
return c;
err:
bch2_fs_free(c);
- c = NULL;
+ c = ERR_PTR(ret);
goto out;
}
const struct bch_option *opt = &bch2_opt_table[i];
u64 v = bch2_opt_get_by_id(&c->opts, i);
- if (!(opt->mode & OPT_MOUNT))
+ if (!(opt->flags & OPT_MOUNT))
continue;
if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
int bch2_fs_start(struct bch_fs *c)
{
- const char *err = "cannot allocate memory";
struct bch_sb_field_members *mi;
struct bch_dev *ca;
time64_t now = ktime_get_real_seconds();
if (ret)
goto err;
- err = "dynamic fault";
ret = -EINVAL;
- if (bch2_fs_init_fault("fs_start"))
+ if (bch2_fs_init_fault("fs_start")) {
+ bch_err(c, "fs_start fault injected");
goto err;
+ }
set_bit(BCH_FS_STARTED, &c->flags);
if (c->opts.read_only || c->opts.nochanges) {
bch2_fs_read_only(c);
} else {
- err = "error going read write";
ret = !test_bit(BCH_FS_RW, &c->flags)
? bch2_fs_read_write(c)
: bch2_fs_read_write_late(c);
case BCH_FSCK_ERRORS_NOT_FIXED:
bch_err(c, "filesystem contains errors: please report this to the developers");
pr_cont("mount with -o fix_errors to repair\n");
- err = "fsck error";
break;
case BCH_FSCK_REPAIR_UNIMPLEMENTED:
bch_err(c, "filesystem contains errors: please report this to the developers");
pr_cont("repair unimplemented: inform the developers so that it can be added\n");
- err = "fsck error";
break;
case BCH_FSCK_REPAIR_IMPOSSIBLE:
bch_err(c, "filesystem contains errors, but repair impossible");
- err = "fsck error";
break;
case BCH_FSCK_UNKNOWN_VERSION:
- err = "unknown metadata version";;
+ bch_err(c, "unknown metadata version");
break;
case -ENOMEM:
- err = "cannot allocate memory";
+ bch_err(c, "cannot allocate memory");
break;
case -EIO:
- err = "IO error";
+ bch_err(c, "IO error");
break;
}
if (!sb_mi)
return "Invalid superblock: member info area missing";
- if (le16_to_cpu(sb->block_size) != c->opts.block_size)
+ if (le16_to_cpu(sb->block_size) != block_sectors(c))
return "mismatched block size";
if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) <
ca->disk_sb.bdev->bd_holder = ca;
memset(sb, 0, sizeof(*sb));
+ ca->dev = ca->disk_sb.bdev->bd_dev;
+
percpu_ref_reinit(&ca->io_ref);
return 0;
bch2_copygc_start(c);
}
-static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
+static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
{
lockdep_assert_held(&c->state_lock);
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
- if (bch2_dev_allocator_start(ca))
- return "error starting allocator thread";
-
- return NULL;
+ return bch2_dev_allocator_start(ca);
}
int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- if (new_state == BCH_MEMBER_STATE_rw &&
- __bch2_dev_read_write(c, ca))
- ret = -ENOMEM;
+ if (new_state == BCH_MEMBER_STATE_rw)
+ ret = __bch2_dev_read_write(c, ca);
rebalance_wakeup(c);
bch2_trans_init(&trans, c, 0, 0);
for (i = 0; i < ca->mi.nbuckets; i++) {
- ret = bch2_btree_key_cache_flush(&trans,
- BTREE_ID_alloc, POS(ca->dev_idx, i));
+ ret = lockrestart_do(&trans,
+ bch2_btree_key_cache_flush(&trans,
+ BTREE_ID_alloc, POS(ca->dev_idx, i)));
if (ret)
break;
}
bch2_trans_exit(&trans);
- if (ret)
+ if (ret) {
+ bch_err(c, "error %i removing dev alloc info", ret);
return ret;
+ }
return bch2_btree_delete_range(c, BTREE_ID_alloc,
POS(ca->dev_idx, 0),
POS(ca->dev_idx + 1, 0),
- NULL);
+ 0, NULL);
}
int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
struct bch_sb_field_members *mi;
struct bch_member dev_mi;
unsigned dev_idx, nr_devices, u64s;
+ char *_errbuf;
+ struct printbuf errbuf;
int ret;
- ret = bch2_read_super(path, &opts, &sb);
- if (ret)
- return ret;
+ _errbuf = kmalloc(4096, GFP_KERNEL);
+ if (!_errbuf)
+ return -ENOMEM;
- err = bch2_sb_validate(&sb);
- if (err)
- return -EINVAL;
+ errbuf = _PBUF(_errbuf, 4096);
+
+ ret = bch2_read_super(path, &opts, &sb);
+ if (ret) {
+ bch_err(c, "device add error: error reading super: %i", ret);
+ goto err;
+ }
dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx];
err = bch2_dev_may_add(sb.sb, c);
- if (err)
- return -EINVAL;
+ if (err) {
+ bch_err(c, "device add error: %s", err);
+ ret = -EINVAL;
+ goto err;
+ }
ca = __bch2_dev_alloc(c, &dev_mi);
if (!ca) {
bch2_free_super(&sb);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto err;
}
ret = __bch2_dev_attach_bdev(ca, &sb);
if (ret) {
bch2_dev_free(ca);
- return ret;
+ goto err;
}
- /*
- * We want to allocate journal on the new device before adding the new
- * device to the filesystem because allocating after we attach requires
- * spinning up the allocator thread, and the allocator thread requires
- * doing btree writes, which if the existing devices are RO isn't going
- * to work
- *
- * So we have to mark where the superblocks are, but marking allocated
- * data normally updates the filesystem usage too, so we have to mark,
- * allocate the journal, reset all the marks, then remark after we
- * attach...
- */
- bch2_mark_dev_superblock(NULL, ca, 0);
-
- err = "journal alloc failed";
ret = bch2_dev_journal_alloc(ca);
- if (ret)
+ if (ret) {
+ bch_err(c, "device add error: journal alloc failed");
goto err;
+ }
down_write(&c->state_lock);
mutex_lock(&c->sb_lock);
- err = "insufficient space in new superblock";
ret = bch2_sb_from_fs(c, ca);
- if (ret)
+ if (ret) {
+ bch_err(c, "device add error: new device superblock too small");
goto err_unlock;
+ }
mi = bch2_sb_get_members(ca->disk_sb.sb);
if (!bch2_sb_resize_members(&ca->disk_sb,
le32_to_cpu(mi->field.u64s) +
sizeof(dev_mi) / sizeof(u64))) {
+ bch_err(c, "device add error: new device superblock too small");
ret = -ENOSPC;
goto err_unlock;
}
if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx))
goto have_slot;
no_slot:
- err = "no slots available in superblock";
+ bch_err(c, "device add error: already have maximum number of devices");
ret = -ENOSPC;
goto err_unlock;
u64s = (sizeof(struct bch_sb_field_members) +
sizeof(struct bch_member) * nr_devices) / sizeof(u64);
- err = "no space in superblock for member info";
- ret = -ENOSPC;
-
mi = bch2_sb_resize_members(&c->disk_sb, u64s);
- if (!mi)
+ if (!mi) {
+ bch_err(c, "device add error: no room in superblock for member info");
+ ret = -ENOSPC;
goto err_unlock;
+ }
/* success: */
bch2_dev_usage_journal_reserve(c);
- err = "error marking superblock";
ret = bch2_trans_mark_dev_sb(c, ca);
- if (ret)
+ if (ret) {
+ bch_err(c, "device add error: error marking new superblock: %i", ret);
goto err_late;
+ }
+
+ ca->new_fs_bucket_idx = 0;
if (ca->mi.state == BCH_MEMBER_STATE_rw) {
- err = __bch2_dev_read_write(c, ca);
- if (err)
+ ret = __bch2_dev_read_write(c, ca);
+ if (ret) {
+ bch_err(c, "device add error: error going RW on new device: %i", ret);
goto err_late;
+ }
}
up_write(&c->state_lock);
if (ca)
bch2_dev_free(ca);
bch2_free_super(&sb);
- bch_err(c, "Unable to add device: %s", err);
+ kfree(_errbuf);
return ret;
err_late:
up_write(&c->state_lock);
- bch_err(c, "Error going rw after adding device: %s", err);
- return -EINVAL;
+ ca = NULL;
+ goto err;
}
/* Hot add existing device to running filesystem: */
dev_idx = sb.sb->dev_idx;
err = bch2_dev_in_fs(c->disk_sb.sb, sb.sb);
- if (err)
+ if (err) {
+ bch_err(c, "error bringing %s online: %s", path, err);
goto err;
+ }
- if (bch2_dev_attach_bdev(c, &sb)) {
- err = "bch2_dev_attach_bdev() error";
+ ret = bch2_dev_attach_bdev(c, &sb);
+ if (ret)
goto err;
- }
ca = bch_dev_locked(c, dev_idx);
- if (bch2_trans_mark_dev_sb(c, ca)) {
- err = "bch2_trans_mark_dev_sb() error";
+ ret = bch2_trans_mark_dev_sb(c, ca);
+ if (ret) {
+ bch_err(c, "error bringing %s online: error %i from bch2_trans_mark_dev_sb",
+ path, ret);
goto err;
}
if (ca->mi.state == BCH_MEMBER_STATE_rw) {
- err = __bch2_dev_read_write(c, ca);
- if (err)
+ ret = __bch2_dev_read_write(c, ca);
+ if (ret)
goto err;
}
err:
up_write(&c->state_lock);
bch2_free_super(&sb);
- bch_err(c, "error bringing %s online: %s", path, err);
return -EINVAL;
}
}
/* return with ref on ca->ref: */
-struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path)
+struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
{
struct bch_dev *ca;
- dev_t dev;
unsigned i;
- int ret;
-
- ret = lookup_bdev(path, &dev);
- if (ret)
- return ERR_PTR(ret);
rcu_read_lock();
for_each_member_device_rcu(ca, c, i, NULL)
- if (ca->disk_sb.bdev->bd_dev == dev)
+ if (!strcmp(name, ca->name))
goto found;
ca = ERR_PTR(-ENOENT);
found:
struct bch_sb_field_members *mi;
unsigned i, best_sb = 0;
const char *err;
- int ret = -ENOMEM;
+ char *_errbuf = NULL;
+ struct printbuf errbuf;
+ int ret = 0;
+
+ if (!try_module_get(THIS_MODULE))
+ return ERR_PTR(-ENODEV);
pr_verbose_init(opts, "");
if (!nr_devices) {
- c = ERR_PTR(-EINVAL);
- goto out2;
+ ret = -EINVAL;
+ goto err;
}
- if (!try_module_get(THIS_MODULE)) {
- c = ERR_PTR(-ENODEV);
- goto out2;
+ _errbuf = kmalloc(4096, GFP_KERNEL);
+ if (!_errbuf) {
+ ret = -ENOMEM;
+ goto err;
}
+ errbuf = _PBUF(_errbuf, 4096);
+
sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL);
- if (!sb)
+ if (!sb) {
+ ret = -ENOMEM;
goto err;
+ }
for (i = 0; i < nr_devices; i++) {
ret = bch2_read_super(devices[i], &opts, &sb[i]);
if (ret)
goto err;
- err = bch2_sb_validate(&sb[i]);
- if (err)
- goto err_print;
}
for (i = 1; i < nr_devices; i++)
i++;
}
- ret = -ENOMEM;
c = bch2_fs_alloc(sb[best_sb].sb, opts);
- if (!c)
+ if (IS_ERR(c)) {
+ ret = PTR_ERR(c);
goto err;
+ }
- err = "bch2_dev_online() error";
down_write(&c->state_lock);
- for (i = 0; i < nr_devices; i++)
- if (bch2_dev_attach_bdev(c, &sb[i])) {
+ for (i = 0; i < nr_devices; i++) {
+ ret = bch2_dev_attach_bdev(c, &sb[i]);
+ if (ret) {
up_write(&c->state_lock);
- goto err_print;
+ goto err;
}
+ }
up_write(&c->state_lock);
err = "insufficient devices";
}
out:
kfree(sb);
+ kfree(_errbuf);
module_put(THIS_MODULE);
-out2:
pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c));
return c;
err_print:
devices[0], err);
ret = -EINVAL;
err:
- if (c)
+ if (!IS_ERR_OR_NULL(c))
bch2_fs_stop(c);
- for (i = 0; i < nr_devices; i++)
- bch2_free_super(&sb[i]);
+ if (sb)
+ for (i = 0; i < nr_devices; i++)
+ bch2_free_super(&sb[i]);
c = ERR_PTR(ret);
goto out;
}
-static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb,
- struct bch_opts opts)
-{
- const char *err;
- struct bch_fs *c;
- bool allocated_fs = false;
- int ret;
-
- err = bch2_sb_validate(sb);
- if (err)
- return err;
-
- mutex_lock(&bch_fs_list_lock);
- c = __bch2_uuid_to_fs(sb->sb->uuid);
- if (c) {
- closure_get(&c->cl);
-
- err = bch2_dev_in_fs(c->disk_sb.sb, sb->sb);
- if (err)
- goto err;
- } else {
- c = bch2_fs_alloc(sb->sb, opts);
- err = "cannot allocate memory";
- if (!c)
- goto err;
-
- allocated_fs = true;
- }
-
- err = "bch2_dev_online() error";
-
- mutex_lock(&c->sb_lock);
- if (bch2_dev_attach_bdev(c, sb)) {
- mutex_unlock(&c->sb_lock);
- goto err;
- }
- mutex_unlock(&c->sb_lock);
-
- if (!c->opts.nostart && bch2_fs_may_start(c)) {
- err = "error starting filesystem";
- ret = bch2_fs_start(c);
- if (ret)
- goto err;
- }
-
- closure_put(&c->cl);
- mutex_unlock(&bch_fs_list_lock);
-
- return NULL;
-err:
- mutex_unlock(&bch_fs_list_lock);
-
- if (allocated_fs)
- bch2_fs_stop(c);
- else if (c)
- closure_put(&c->cl);
-
- return err;
-}
-
-const char *bch2_fs_open_incremental(const char *path)
-{
- struct bch_sb_handle sb;
- struct bch_opts opts = bch2_opts_empty();
- const char *err;
-
- if (bch2_read_super(path, &opts, &sb))
- return "error reading superblock";
-
- err = __bch2_fs_open_incremental(&sb, opts);
- bch2_free_super(&sb);
-
- return err;
-}
-
/* Global interfaces/init */
static void bcachefs_exit(void)
return devs;
}
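+/*
+ * A bucket is a superblock bucket if it overlaps any superblock copy in the
+ * layout; bucket 0 is always reserved:
+ */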
+static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b)
+{
+ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
+ u64 b_offset = bucket_to_sector(ca, b);
+ u64 b_end = bucket_to_sector(ca, b + 1);
+ unsigned i;
+
+ if (!b)
+ return true;
+
+ for (i = 0; i < layout->nr_superblocks; i++) {
+ u64 offset = le64_to_cpu(layout->sb_offset[i]);
+ u64 end = offset + (1 << layout->sb_max_size_bits);
+
+ if (!(offset >= b_end || end <= b_offset))
+ return true;
+ }
+
+ return false;
+}
+
struct bch_fs *bch2_dev_to_fs(dev_t);
struct bch_fs *bch2_uuid_to_fs(uuid_le);
int bch2_fs_start(struct bch_fs *);
struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts);
-const char *bch2_fs_open_incremental(const char *path);
#endif /* _BCACHEFS_SUPER_H */
u16 bucket_size; /* sectors */
u16 group;
u8 state;
- u8 replacement;
u8 discard;
u8 data_allowed;
u8 durability;
#include "bcachefs.h"
#include "alloc_background.h"
+#include "alloc_foreground.h"
#include "sysfs.h"
#include "btree_cache.h"
#include "btree_io.h"
return strtoi_h(buf, &var) ?: (ssize_t) size; \
} while (0)
-write_attribute(trigger_journal_flush);
write_attribute(trigger_gc);
write_attribute(prune_cache);
rw_attribute(btree_gc_periodic);
read_attribute(uuid);
read_attribute(minor);
read_attribute(bucket_size);
-read_attribute(block_size);
-read_attribute(btree_node_size);
read_attribute(first_bucket);
read_attribute(nbuckets);
read_attribute(durability);
read_attribute(btree_avg_write_size);
-read_attribute(bucket_quantiles_last_read);
-read_attribute(bucket_quantiles_last_write);
-read_attribute(bucket_quantiles_fragmentation);
-read_attribute(bucket_quantiles_oldest_gen);
-
read_attribute(reserve_stats);
read_attribute(btree_cache_size);
read_attribute(compression_stats);
read_attribute(extent_migrate_done);
read_attribute(extent_migrate_raced);
-rw_attribute(journal_write_delay_ms);
-rw_attribute(journal_reclaim_delay_ms);
-
rw_attribute(discard);
-rw_attribute(cache_replacement_policy);
rw_attribute(label);
rw_attribute(copy_gc_enabled);
read_attribute(io_timers_read);
read_attribute(io_timers_write);
+read_attribute(data_jobs);
+
#ifdef CONFIG_BCACHEFS_TESTS
write_attribute(perf_test);
#endif /* CONFIG_BCACHEFS_TESTS */
return nr ? div64_u64(sectors, nr) : 0;
}
-static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
+static long data_progress_to_text(struct printbuf *out, struct bch_fs *c)
{
- struct bch_fs_usage_online *fs_usage = bch2_fs_usage_read(c);
+ long ret = 0;
+ struct bch_move_stats *stats;
- if (!fs_usage)
- return -ENOMEM;
-
- bch2_fs_usage_to_text(out, c, fs_usage);
-
- percpu_up_read(&c->mark_lock);
+ mutex_lock(&c->data_progress_lock);
+ list_for_each_entry(stats, &c->data_progress_list, list) {
+ pr_buf(out, "%s: data type %s btree_id %s position: ",
+ stats->name,
+ bch2_data_types[stats->data_type],
+ bch2_btree_ids[stats->btree_id]);
+ bch2_bpos_to_text(out, stats->pos);
+ pr_buf(out, "%s", "\n");
+ }
- kfree(fs_usage);
- return 0;
+ mutex_unlock(&c->data_progress_lock);
+ return ret;
}
static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
- u64 nr_uncompressed_extents = 0, uncompressed_sectors = 0,
+ enum btree_id id;
+ u64 nr_uncompressed_extents = 0,
nr_compressed_extents = 0,
+ nr_incompressible_extents = 0,
+ uncompressed_sectors = 0,
+ incompressible_sectors = 0,
compressed_sectors_compressed = 0,
compressed_sectors_uncompressed = 0;
int ret;
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, 0, k, ret)
- if (k.k->type == KEY_TYPE_extent) {
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ for (id = 0; id < BTREE_ID_NR; id++) {
+ if (!((1U << id) & BTREE_ID_HAS_PTRS))
+ continue;
+
+ for_each_btree_key(&trans, iter, id, POS_MIN,
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
-
- extent_for_each_ptr_decode(e, p, entry) {
- if (!crc_is_compressed(p.crc)) {
- nr_uncompressed_extents++;
- uncompressed_sectors += e.k->size;
- } else {
- nr_compressed_extents++;
+ bool compressed = false, uncompressed = false, incompressible = false;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ switch (p.crc.compression_type) {
+ case BCH_COMPRESSION_TYPE_none:
+ uncompressed = true;
+ uncompressed_sectors += k.k->size;
+ break;
+ case BCH_COMPRESSION_TYPE_incompressible:
+ incompressible = true;
+ incompressible_sectors += k.k->size;
+ break;
+ default:
compressed_sectors_compressed +=
p.crc.compressed_size;
compressed_sectors_uncompressed +=
p.crc.uncompressed_size;
+ compressed = true;
+ break;
}
-
- /* only looking at the first ptr */
- break;
}
+
+ if (incompressible)
+ nr_incompressible_extents++;
+ else if (uncompressed)
+ nr_uncompressed_extents++;
+ else if (compressed)
+ nr_compressed_extents++;
}
+ bch2_trans_iter_exit(&trans, &iter);
+ }
+
-	ret = bch2_trans_exit(&trans) ?: ret;
+	bch2_trans_exit(&trans);
if (ret)
return ret;
- pr_buf(out,
- "uncompressed data:\n"
- " nr extents: %llu\n"
- " size (bytes): %llu\n"
- "compressed data:\n"
- " nr extents: %llu\n"
- " compressed size (bytes): %llu\n"
- " uncompressed size (bytes): %llu\n",
- nr_uncompressed_extents,
- uncompressed_sectors << 9,
- nr_compressed_extents,
- compressed_sectors_compressed << 9,
- compressed_sectors_uncompressed << 9);
+ pr_buf(out, "uncompressed:\n");
+ pr_buf(out, " nr extents: %llu\n", nr_uncompressed_extents);
+ pr_buf(out, " size: ");
+ bch2_hprint(out, uncompressed_sectors << 9);
+ pr_buf(out, "\n");
+
+ pr_buf(out, "compressed:\n");
+ pr_buf(out, " nr extents: %llu\n", nr_compressed_extents);
+ pr_buf(out, " compressed size: ");
+ bch2_hprint(out, compressed_sectors_compressed << 9);
+ pr_buf(out, "\n");
+ pr_buf(out, " uncompressed size: ");
+ bch2_hprint(out, compressed_sectors_uncompressed << 9);
+ pr_buf(out, "\n");
+
+ pr_buf(out, "incompressible:\n");
+ pr_buf(out, " nr extents: %llu\n", nr_incompressible_extents);
+ pr_buf(out, " size: ");
+ bch2_hprint(out, incompressible_sectors << 9);
+ pr_buf(out, "\n");
return 0;
}
sysfs_print(minor, c->minor);
sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b);
- sysfs_print(journal_write_delay_ms, c->journal.write_delay_ms);
- sysfs_print(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms);
-
- sysfs_print(block_size, block_bytes(c));
- sysfs_print(btree_node_size, btree_bytes(c));
sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c));
sysfs_hprint(btree_avg_write_size, bch2_btree_avg_write_size(c));
/* Debugging: */
- if (attr == &sysfs_alloc_debug)
- return fs_alloc_debug_to_text(&out, c) ?: out.pos - buf;
-
if (attr == &sysfs_journal_debug) {
bch2_journal_debug_to_text(&out, &c->journal);
return out.pos - buf;
return out.pos - buf;
}
+ if (attr == &sysfs_data_jobs) {
+ data_progress_to_text(&out, c);
+ return out.pos - buf;
+ }
+
return 0;
}
{
struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
- sysfs_strtoul(journal_write_delay_ms, c->journal.write_delay_ms);
- sysfs_strtoul(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms);
-
if (attr == &sysfs_btree_gc_periodic) {
ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic)
?: (ssize_t) size;
/* Debugging: */
- if (attr == &sysfs_trigger_journal_flush)
- bch2_journal_meta(&c->journal);
+ if (!test_bit(BCH_FS_RW, &c->flags))
+ return -EROFS;
+
+ if (attr == &sysfs_prune_cache) {
+ struct shrink_control sc;
+
+ sc.gfp_mask = GFP_KERNEL;
+ sc.nr_to_scan = strtoul_or_return(buf);
+ c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc);
+ }
if (attr == &sysfs_trigger_gc) {
/*
#endif
}
- if (attr == &sysfs_prune_cache) {
- struct shrink_control sc;
-
- sc.gfp_mask = GFP_KERNEL;
- sc.nr_to_scan = strtoul_or_return(buf);
- c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc);
- }
-
#ifdef CONFIG_BCACHEFS_TESTS
if (attr == &sysfs_perf_test) {
char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
struct attribute *bch2_fs_files[] = {
&sysfs_minor,
- &sysfs_block_size,
- &sysfs_btree_node_size,
&sysfs_btree_cache_size,
&sysfs_btree_avg_write_size,
- &sysfs_journal_write_delay_ms,
- &sysfs_journal_reclaim_delay_ms,
-
&sysfs_promote_whole_extents,
&sysfs_compression_stats,
SYSFS_OPS(bch2_fs_internal);
struct attribute *bch2_fs_internal_files[] = {
- &sysfs_alloc_debug,
&sysfs_journal_debug,
&sysfs_journal_pins,
&sysfs_btree_updates,
&sysfs_btree_cache,
&sysfs_btree_key_cache,
&sysfs_btree_transactions,
+ &sysfs_new_stripes,
&sysfs_stripes_heap,
&sysfs_open_buckets,
+ &sysfs_io_timers_read,
+ &sysfs_io_timers_write,
+
+ &sysfs_trigger_gc,
+ &sysfs_prune_cache,
&sysfs_read_realloc_races,
&sysfs_extent_migrate_done,
&sysfs_extent_migrate_raced,
- &sysfs_trigger_journal_flush,
- &sysfs_trigger_gc,
&sysfs_gc_gens_pos,
- &sysfs_prune_cache,
&sysfs_copy_gc_enabled,
&sysfs_copy_gc_wait,
&sysfs_rebalance_work,
sysfs_pd_controller_files(rebalance),
- &sysfs_new_stripes,
-
- &sysfs_io_timers_read,
- &sysfs_io_timers_write,
+ &sysfs_data_jobs,
&sysfs_internal_uuid,
NULL
if (!tmp)
return -ENOMEM;
- ret = bch2_opt_parse(c, opt, strim(tmp), &v);
+ ret = bch2_opt_parse(c, NULL, opt, strim(tmp), &v);
kfree(tmp);
if (ret < 0)
if (ret < 0)
return ret;
- if (opt->set_sb != SET_NO_SB_OPT) {
- mutex_lock(&c->sb_lock);
- opt->set_sb(c->disk_sb.sb, v);
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
- }
-
+ bch2_opt_set_sb(c, opt, v);
bch2_opt_set_by_id(&c->opts, id, v);
if ((id == Opt_background_target ||
for (i = bch2_opt_table;
i < bch2_opt_table + bch2_opts_nr;
i++) {
- if (!(i->mode & (OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME)))
+ if (!(i->flags & OPT_FS))
continue;
ret = sysfs_create_file(kobj, &i->attr);
NULL
};
-typedef unsigned (bucket_map_fn)(struct bch_fs *, struct bch_dev *,
- size_t, void *);
-
-static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca,
- size_t b, void *private)
-{
- int rw = (private ? 1 : 0);
-
- return atomic64_read(&c->io_clock[rw].now) - bucket(ca, b)->io_time[rw];
-}
-
-static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
- size_t b, void *private)
-{
- struct bucket *g = bucket(ca, b);
- return bucket_sectors_used(g->mark);
-}
-
-static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca,
- size_t b, void *private)
-{
- return bucket_gc_gen(bucket(ca, b));
-}
-
-static int unsigned_cmp(const void *_l, const void *_r)
-{
- const unsigned *l = _l;
- const unsigned *r = _r;
-
- return cmp_int(*l, *r);
-}
-
-static int quantiles_to_text(struct printbuf *out,
- struct bch_fs *c, struct bch_dev *ca,
- bucket_map_fn *fn, void *private)
-{
- size_t i, n;
- /* Compute 31 quantiles */
- unsigned q[31], *p;
-
- down_read(&ca->bucket_lock);
- n = ca->mi.nbuckets;
-
- p = vzalloc(n * sizeof(unsigned));
- if (!p) {
- up_read(&ca->bucket_lock);
- return -ENOMEM;
- }
-
- for (i = ca->mi.first_bucket; i < n; i++)
- p[i] = fn(c, ca, i, private);
-
- sort(p, n, sizeof(unsigned), unsigned_cmp, NULL);
- up_read(&ca->bucket_lock);
-
- while (n &&
- !p[n - 1])
- --n;
-
- for (i = 0; i < ARRAY_SIZE(q); i++)
- q[i] = p[n * (i + 1) / (ARRAY_SIZE(q) + 1)];
-
- vfree(p);
-
- for (i = 0; i < ARRAY_SIZE(q); i++)
- pr_buf(out, "%u ", q[i]);
- pr_buf(out, "\n");
- return 0;
-}
-
static void reserve_stats_to_text(struct printbuf *out, struct bch_dev *ca)
{
enum alloc_reserve i;
memset(nr, 0, sizeof(nr));
for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
- nr[c->open_buckets[i].type]++;
+ nr[c->open_buckets[i].data_type]++;
pr_buf(out,
"\t\t buckets\t sectors fragmented\n"
sysfs_printf(uuid, "%pU\n", ca->uuid.b);
sysfs_print(bucket_size, bucket_bytes(ca));
- sysfs_print(block_size, block_bytes(c));
sysfs_print(first_bucket, ca->mi.first_bucket);
sysfs_print(nbuckets, ca->mi.nbuckets);
sysfs_print(durability, ca->mi.durability);
return out.pos - buf;
}
- if (attr == &sysfs_cache_replacement_policy) {
- bch2_string_opt_to_text(&out,
- bch2_cache_replacement_policies,
- ca->mi.replacement);
- pr_buf(&out, "\n");
- return out.pos - buf;
- }
-
if (attr == &sysfs_state_rw) {
bch2_string_opt_to_text(&out, bch2_member_states,
ca->mi.state);
clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
* 100 / CONGESTED_MAX);
- if (attr == &sysfs_bucket_quantiles_last_read)
- return quantiles_to_text(&out, c, ca, bucket_last_io_fn, (void *) 0) ?: out.pos - buf;
- if (attr == &sysfs_bucket_quantiles_last_write)
- return quantiles_to_text(&out, c, ca, bucket_last_io_fn, (void *) 1) ?: out.pos - buf;
- if (attr == &sysfs_bucket_quantiles_fragmentation)
- return quantiles_to_text(&out, c, ca, bucket_sectors_used_fn, NULL) ?: out.pos - buf;
- if (attr == &sysfs_bucket_quantiles_oldest_gen)
- return quantiles_to_text(&out, c, ca, bucket_oldest_gen_fn, NULL) ?: out.pos - buf;
-
if (attr == &sysfs_reserve_stats) {
reserve_stats_to_text(&out, ca);
return out.pos - buf;
mutex_unlock(&c->sb_lock);
}
- if (attr == &sysfs_cache_replacement_policy) {
- ssize_t v = __sysfs_match_string(bch2_cache_replacement_policies, -1, buf);
-
- if (v < 0)
- return v;
-
- mutex_lock(&c->sb_lock);
- mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
-
- if ((unsigned) v != BCH_MEMBER_REPLACEMENT(mi)) {
- SET_BCH_MEMBER_REPLACEMENT(mi, v);
- bch2_write_super(c);
- }
- mutex_unlock(&c->sb_lock);
- }
-
if (attr == &sysfs_label) {
char *tmp;
int ret;
struct attribute *bch2_dev_files[] = {
&sysfs_uuid,
&sysfs_bucket_size,
- &sysfs_block_size,
&sysfs_first_bucket,
&sysfs_nbuckets,
&sysfs_durability,
/* settings: */
&sysfs_discard,
- &sysfs_cache_replacement_policy,
&sysfs_state_rw,
&sysfs_label,
&sysfs_io_latency_stats_write,
&sysfs_congested,
- /* alloc info - other stats: */
- &sysfs_bucket_quantiles_last_read,
- &sysfs_bucket_quantiles_last_write,
- &sysfs_bucket_quantiles_fragmentation,
- &sysfs_bucket_quantiles_oldest_gen,
-
&sysfs_reserve_stats,
/* debug: */
#include "bcachefs.h"
#include "btree_update.h"
#include "journal_reclaim.h"
+#include "subvolume.h"
#include "tests.h"
#include "linux/kthread.h"
int ret;
ret = bch2_btree_delete_range(c, BTREE_ID_extents,
- POS(0, 0), POS(0, U64_MAX),
+ POS_MIN, SPOS_MAX,
+ BTREE_ITER_ALL_SNAPSHOTS,
NULL);
BUG_ON(ret);
ret = bch2_btree_delete_range(c, BTREE_ID_xattrs,
- POS(0, 0), POS(0, U64_MAX),
+ POS_MIN, SPOS_MAX,
+ BTREE_ITER_ALL_SNAPSHOTS,
NULL);
BUG_ON(ret);
}
static int test_delete(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_i_cookie k;
int ret;
k.k.p.snapshot = U32_MAX;
bch2_trans_init(&trans, c, 0, 0);
-
- iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, k.k.p,
- BTREE_ITER_INTENT);
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p,
+ BTREE_ITER_INTENT);
ret = __bch2_trans_do(&trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(iter) ?:
- bch2_trans_update(&trans, iter, &k.k_i, 0));
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(&trans, &iter, &k.k_i, 0));
if (ret) {
bch_err(c, "update error in test_delete: %i", ret);
goto err;
pr_info("deleting once");
ret = __bch2_trans_do(&trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(iter) ?:
- bch2_btree_delete_at(&trans, iter, 0));
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_delete_at(&trans, &iter, 0));
if (ret) {
bch_err(c, "delete error (first) in test_delete: %i", ret);
goto err;
pr_info("deleting twice");
ret = __bch2_trans_do(&trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(iter) ?:
- bch2_btree_delete_at(&trans, iter, 0));
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_delete_at(&trans, &iter, 0));
if (ret) {
bch_err(c, "delete error (second) in test_delete: %i", ret);
goto err;
}
err:
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
return ret;
}
static int test_delete_written(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_i_cookie k;
int ret;
bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, k.k.p,
- BTREE_ITER_INTENT);
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p,
+ BTREE_ITER_INTENT);
ret = __bch2_trans_do(&trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(iter) ?:
- bch2_trans_update(&trans, iter, &k.k_i, 0));
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(&trans, &iter, &k.k_i, 0));
if (ret) {
bch_err(c, "update error in test_delete_written: %i", ret);
goto err;
bch2_journal_flush_all_pins(&c->journal);
ret = __bch2_trans_do(&trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(iter) ?:
- bch2_btree_delete_at(&trans, iter, 0));
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_delete_at(&trans, &iter, 0));
if (ret) {
bch_err(c, "delete error in test_delete_written: %i", ret);
goto err;
}
err:
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
return ret;
}
static int test_iterate(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
- struct btree_iter *iter = NULL;
+ struct btree_iter iter = { NULL };
struct bkey_s_c k;
u64 i;
int ret = 0;
i = 0;
for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
- POS_MIN, 0, k, ret) {
+ SPOS(0, 0, U32_MAX), 0, k, ret) {
if (k.k->p.inode)
break;
pr_info("iterating backwards");
- while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k))
+ while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k))
BUG_ON(k.k->p.offset != --i);
BUG_ON(i);
err:
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
return ret;
}
static int test_iterate_extents(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
- struct btree_iter *iter = NULL;
+ struct btree_iter iter = { NULL };
struct bkey_s_c k;
u64 i;
int ret = 0;
i = 0;
for_each_btree_key(&trans, iter, BTREE_ID_extents,
- POS_MIN, 0, k, ret) {
+ SPOS(0, 0, U32_MAX), 0, k, ret) {
BUG_ON(bkey_start_offset(k.k) != i);
i = k.k->p.offset;
}
pr_info("iterating backwards");
- while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k)) {
+ while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k)) {
BUG_ON(k.k->p.offset != i);
i = bkey_start_offset(k.k);
}
BUG_ON(i);
err:
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
return ret;
}
static int test_iterate_slots(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter = { NULL };
struct bkey_s_c k;
u64 i;
int ret = 0;
i = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN,
- 0, k, ret) {
+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), 0, k, ret) {
if (k.k->p.inode)
break;
BUG_ON(k.k->p.offset != i);
i += 2;
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
BUG_ON(i != nr * 2);
i = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN,
+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX),
BTREE_ITER_SLOTS, k, ret) {
BUG_ON(k.k->p.offset != i);
BUG_ON(bkey_deleted(k.k) != (i & 1));
if (i == nr * 2)
break;
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
err:
bch2_trans_exit(&trans);
return ret;
static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter = { NULL };
struct bkey_s_c k;
u64 i;
int ret = 0;
i = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN,
- 0, k, ret) {
+ for_each_btree_key(&trans, iter, BTREE_ID_extents,
+ SPOS(0, 0, U32_MAX), 0, k, ret) {
BUG_ON(bkey_start_offset(k.k) != i + 8);
BUG_ON(k.k->size != 8);
i += 16;
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
BUG_ON(i != nr);
i = 0;
- for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN,
+ for_each_btree_key(&trans, iter, BTREE_ID_extents,
+ SPOS(0, 0, U32_MAX),
BTREE_ITER_SLOTS, k, ret) {
BUG_ON(bkey_deleted(k.k) != !(i % 16));
if (i == nr)
break;
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
err:
bch2_trans_exit(&trans);
return 0;
static int test_peek_end(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
bch2_trans_init(&trans, c, 0, 0);
-	iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0);
-
-	k = bch2_btree_iter_peek(iter);
+	bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
+			     SPOS(0, 0, U32_MAX), 0);
+	k = bch2_btree_iter_peek(&iter);
BUG_ON(k.k);
- k = bch2_btree_iter_peek(iter);
+ k = bch2_btree_iter_peek(&iter);
BUG_ON(k.k);
- bch2_trans_iter_put(&trans, iter);
-
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
return 0;
}
static int test_peek_end_extents(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
bch2_trans_init(&trans, c, 0, 0);
-	iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN, 0);
-
-	k = bch2_btree_iter_peek(iter);
+	bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+			     SPOS(0, 0, U32_MAX), 0);
+	k = bch2_btree_iter_peek(&iter);
BUG_ON(k.k);
- k = bch2_btree_iter_peek(iter);
+ k = bch2_btree_iter_peek(&iter);
BUG_ON(k.k);
- bch2_trans_iter_put(&trans, iter);
-
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
return 0;
}
struct bkey_i_cookie k;
int ret;
- //pr_info("inserting %llu-%llu v %llu", start, end, test_version);
-
bkey_cookie_init(&k.k_i);
k.k_i.k.p.offset = end;
k.k_i.k.p.snapshot = U32_MAX;
__test_extent_overwrite(c, 32, 64, 32, 128);
}
+/* snapshot unit tests */
+
+/* Test skipping over keys in unrelated snapshots: */
+static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi)
+{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i_cookie cookie;
+ int ret;
+
+ bkey_cookie_init(&cookie.k_i);
+ cookie.k.p.snapshot = snapid_hi;
+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i,
+ NULL, NULL, 0);
+ if (ret)
+ return ret;
+
+ bch2_trans_init(&trans, c, 0, 0);
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
+ SPOS(0, 0, snapid_lo), 0);
+ k = bch2_btree_iter_peek(&iter);
+
+ BUG_ON(k.k->p.snapshot != U32_MAX);
+
+ bch2_trans_iter_exit(&trans, &iter);
+ bch2_trans_exit(&trans);
+ return ret;
+}
+
+static int test_snapshots(struct bch_fs *c, u64 nr)
+{
+ struct bkey_i_cookie cookie;
+ u32 snapids[2];
+ u32 snapid_subvols[2] = { 1, 1 };
+ int ret;
+
+ bkey_cookie_init(&cookie.k_i);
+ cookie.k.p.snapshot = U32_MAX;
+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i,
+ NULL, NULL, 0);
+ if (ret)
+ return ret;
+
+ ret = bch2_trans_do(c, NULL, NULL, 0,
+ bch2_snapshot_node_create(&trans, U32_MAX,
+ snapids,
+ snapid_subvols,
+ 2));
+ if (ret)
+ return ret;
+
+ if (snapids[0] > snapids[1])
+ swap(snapids[0], snapids[1]);
+
+ ret = test_snapshot_filter(c, snapids[0], snapids[1]);
+ if (ret) {
+ bch_err(c, "err %i from test_snapshot_filter", ret);
+ return ret;
+ }
+
+ return 0;
+}
+
/* perf tests */
static u64 test_rand(void)
static int rand_lookup(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
int ret = 0;
u64 i;
bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0);
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), 0);
for (i = 0; i < nr; i++) {
- bch2_btree_iter_set_pos(iter, POS(0, test_rand()));
+ bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX));
- k = bch2_btree_iter_peek(iter);
+ k = bch2_btree_iter_peek(&iter);
ret = bkey_err(k);
if (ret) {
bch_err(c, "error in rand_lookup: %i", ret);
}
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
return ret;
}
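+/*
+ * The lookup and the (occasional) update run as one transaction unit, so
+ * an -EINTR restart from __bch2_trans_do() re-runs the peek and the update
+ * together:
+ */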
+static int rand_mixed_trans(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_i_cookie *cookie,
+ u64 i, u64 pos)
+{
+ struct bkey_s_c k;
+ int ret;
+
+ bch2_btree_iter_set_pos(iter, SPOS(0, pos, U32_MAX));
+
+ k = bch2_btree_iter_peek(iter);
+ ret = bkey_err(k);
+ if (ret && ret != -EINTR)
+ bch_err(trans->c, "lookup error in rand_mixed: %i", ret);
+ if (ret)
+ return ret;
+
+ if (!(i & 3) && k.k) {
+ bkey_cookie_init(&cookie->k_i);
+ cookie->k.p = iter->pos;
+ ret = bch2_trans_update(trans, iter, &cookie->k_i, 0);
+ }
+
+ return ret;
+}
+
static int rand_mixed(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
- struct btree_iter *iter;
- struct bkey_s_c k;
+ struct btree_iter iter;
+ struct bkey_i_cookie cookie;
int ret = 0;
- u64 i;
+ u64 i, rand;
bch2_trans_init(&trans, c, 0, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0);
+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), 0);
for (i = 0; i < nr; i++) {
- bch2_btree_iter_set_pos(iter, POS(0, test_rand()));
-
- k = bch2_btree_iter_peek(iter);
- ret = bkey_err(k);
+ rand = test_rand();
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ rand_mixed_trans(&trans, &iter, &cookie, i, rand));
if (ret) {
- bch_err(c, "lookup error in rand_mixed: %i", ret);
+ bch_err(c, "update error in rand_mixed: %i", ret);
break;
}
-
- if (!(i & 3) && k.k) {
- struct bkey_i_cookie k;
-
- bkey_cookie_init(&k.k_i);
- k.k.p = iter->pos;
-
- ret = __bch2_trans_do(&trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(iter) ?:
- bch2_trans_update(&trans, iter, &k.k_i, 0));
- if (ret) {
- bch_err(c, "update error in rand_mixed: %i", ret);
- break;
- }
- }
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
return ret;
}
static int __do_delete(struct btree_trans *trans, struct bpos pos)
{
- struct btree_iter *iter;
- struct bkey_i delete;
+ struct btree_iter iter;
struct bkey_s_c k;
int ret = 0;
- iter = bch2_trans_get_iter(trans, BTREE_ID_xattrs, pos,
- BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek(iter);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos,
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek(&iter);
ret = bkey_err(k);
if (ret)
goto err;
if (!k.k)
goto err;
- bkey_init(&delete.k);
- delete.k.p = k.k->p;
-
- ret = bch2_trans_update(trans, iter, &delete, 0);
+ ret = bch2_btree_delete_at(trans, &iter, 0);
err:
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
bch2_trans_init(&trans, c, 0, 0);
for (i = 0; i < nr; i++) {
- struct bpos pos = POS(0, test_rand());
+ struct bpos pos = SPOS(0, test_rand(), U32_MAX);
ret = __bch2_trans_do(&trans, NULL, NULL, 0,
__do_delete(&trans, pos));
static int seq_insert(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
struct bkey_i_cookie insert;
int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN,
+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
- insert.k.p = iter->pos;
+ insert.k.p = iter.pos;
ret = __bch2_trans_do(&trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(iter) ?:
- bch2_trans_update(&trans, iter, &insert.k_i, 0));
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(&trans, &iter, &insert.k_i, 0));
if (ret) {
bch_err(c, "error in seq_insert: %i", ret);
break;
if (++i == nr)
break;
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
return ret;
static int seq_lookup(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, 0, k, ret)
+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), 0, k, ret)
;
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
return ret;
static int seq_overwrite(struct bch_fs *c, u64 nr)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN,
+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX),
BTREE_ITER_INTENT, k, ret) {
struct bkey_i_cookie u;
bkey_reassemble(&u.k_i, k);
ret = __bch2_trans_do(&trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(iter) ?:
- bch2_trans_update(&trans, iter, &u.k_i, 0));
+ bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(&trans, &iter, &u.k_i, 0));
if (ret) {
bch_err(c, "error in seq_overwrite: %i", ret);
break;
}
}
- bch2_trans_iter_put(&trans, iter);
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
return ret;
int ret;
ret = bch2_btree_delete_range(c, BTREE_ID_xattrs,
- POS(0, 0), POS(0, U64_MAX),
+ POS_MIN, SPOS_MAX,
+ BTREE_ITER_ALL_SNAPSHOTS,
NULL);
if (ret)
bch_err(c, "error in seq_delete: %i", ret);
wait_event(j->ready_wait, !atomic_read(&j->ready));
}
- ret = j->fn(j->c, j->nr / j->nr_threads);
- if (ret)
+ ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads));
+ if (ret) {
+ bch_err(j->c, "%ps: error %i", j->fn, ret);
j->ret = ret;
+ }
if (atomic_dec_and_test(&j->done)) {
j->finish = sched_clock();
perf_test(test_extent_overwrite_middle);
perf_test(test_extent_overwrite_all);
+ perf_test(test_snapshots);
+
if (!j.fn) {
pr_err("unknown test %s", testname);
return -EINVAL;
scnprintf(name_buf, sizeof(name_buf), "%s:", testname);
bch2_hprint(&PBUF(nr_buf), nr);
- bch2_hprint(&PBUF(per_sec_buf), nr * NSEC_PER_SEC / time);
+ bch2_hprint(&PBUF(per_sec_buf), div64_u64(nr * NSEC_PER_SEC, time));
printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n",
name_buf, nr_buf, nr_threads,
- time / NSEC_PER_SEC,
- time * nr_threads / nr,
+ div_u64(time, NSEC_PER_SEC),
+ div_u64(time * nr_threads, nr),
per_sec_buf);
return j.ret;
}
* 103 is magic: t is in the range [-1023, 1023] and we want
* to turn it into [-9, 9]
*/
- if (u && v < 100 && v > -100)
+ if (u && t && v < 100 && v > -100)
pr_buf(buf, ".%i", t / 103);
if (u)
pr_buf(buf, "%c", si_units[u]);
if (!page)
return -ENOMEM;
- BUG_ON(!bio_add_page(bio, page, len, 0));
+ if (unlikely(!bio_add_page(bio, page, len, 0))) {
+ __free_page(page);
+ break;
+ }
+
size -= len;
}
*/
u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr)
{
- u64 *ret = this_cpu_ptr(p);
+ u64 *ret;
int cpu;
+	/*
+	 * this_cpu_ptr() wants preemption disabled; actual access to the
+	 * percpu vars must be serialized by the caller's locking:
+	 */
+ preempt_disable();
+ ret = this_cpu_ptr(p);
+ preempt_enable();
+
for_each_possible_cpu(cpu) {
u64 *i = per_cpu_ptr(p, cpu);
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
-#define PAGE_SECTOR_SHIFT (PAGE_SHIFT - 9)
-#define PAGE_SECTORS (1UL << PAGE_SECTOR_SHIFT)
-
struct closure;
#ifdef CONFIG_BCACHEFS_DEBUG
struct printbuf {
char *pos;
char *end;
+ unsigned indent;
};
static inline size_t printbuf_remaining(struct printbuf *buf)
__VA_ARGS__); \
} while (0)
+static inline void printbuf_indent_push(struct printbuf *buf, unsigned spaces)
+{
+ buf->indent += spaces;
+ while (spaces--)
+ pr_buf(buf, " ");
+}
+
+static inline void printbuf_indent_pop(struct printbuf *buf, unsigned spaces)
+{
+ buf->indent -= spaces;
+}
+
+static inline void printbuf_newline(struct printbuf *buf)
+{
+ unsigned i;
+
+ pr_buf(buf, "\n");
+ for (i = 0; i < buf->indent; i++)
+ pr_buf(buf, " ");
+}
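+/*
+ * Illustrative sketch (not part of the patch; "out" and "nbuckets" are
+ * hypothetical):
+ *
+ *	pr_buf(&out, "device:");
+ *	printbuf_indent_push(&out, 2);	// indents now, and future newlines
+ *	printbuf_newline(&out);		// "\n" plus out.indent spaces
+ *	pr_buf(&out, "nbuckets %llu", nbuckets);
+ *	printbuf_indent_pop(&out, 2);	// narrows subsequent newlines only
+ */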
+
void bch_scnmemcpy(struct printbuf *, const char *, size_t);
int bch2_strtoint_h(const char *, int *);
return cmp_int(l, r);
}
+#ifdef __KERNEL__
+static inline void uuid_unparse_lower(u8 *uuid, char *out)
+{
+	sprintf(out, "%pUb", uuid);
+}
+#else
+#include <uuid/uuid.h>
+#endif
+
#endif /* _BCACHEFS_UTIL_H */
#include <linux/string.h>
#include <asm/unaligned.h>
+#ifdef CONFIG_VALGRIND
+#include <valgrind/memcheck.h>
+#endif
+
#include "varint.h"
/**
*/
int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out)
{
+#ifdef CONFIG_VALGRIND
+ VALGRIND_MAKE_MEM_DEFINED(in, 8);
+#endif
u64 v = get_unaligned_le64(in);
- unsigned bytes = ffz(v & 255) + 1;
+ unsigned bytes = ffz(*in) + 1;
if (unlikely(in + bytes > end))
return -1;
const char *name, void *buffer, size_t size, int type)
{
struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode);
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c_xattr xattr;
struct bkey_s_c k;
int ret;
- iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc, &hash,
- inode->v.i_ino,
- &X_SEARCH(type, name, strlen(name)),
- 0);
- ret = PTR_ERR_OR_ZERO(iter);
+ ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash,
+ inode_inum(inode),
+ &X_SEARCH(type, name, strlen(name)),
+ 0);
if (ret)
- goto err;
+ goto err1;
- k = bch2_btree_iter_peek_slot(iter);
+ k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
- goto err;
+ goto err2;
xattr = bkey_s_c_to_xattr(k);
ret = le16_to_cpu(xattr.v->x_val_len);
else
memcpy(buffer, xattr_val(xattr.v), ret);
}
- bch2_trans_iter_put(trans, iter);
-err:
+err2:
+ bch2_trans_iter_exit(trans, &iter);
+err1:
return ret == -ENOENT ? -ENODATA : ret;
}
bch2_xattr_get_trans(&trans, inode, name, buffer, size, type));
}
-int bch2_xattr_set(struct btree_trans *trans, u64 inum,
+int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
const struct bch_hash_info *hash_info,
const char *name, const void *value, size_t size,
int type, int flags)
{
+ struct btree_iter inode_iter = { NULL };
+ struct bch_inode_unpacked inode_u;
int ret;
+ /*
+	 * We need to do an inode update so that bi_journal_seq gets updated
+ * and fsync works:
+ *
+ * Perhaps we should be updating bi_mtime too?
+ */
+
+ ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum, BTREE_ITER_INTENT) ?:
+ bch2_inode_write(trans, &inode_iter, &inode_u);
+ bch2_trans_iter_exit(trans, &inode_iter);
+
+ if (ret)
+ return ret;
+
if (value) {
struct bkey_i_xattr *xattr;
unsigned namelen = strlen(name);
struct bch_fs *c = dentry->d_sb->s_fs_info;
struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_iter iter;
struct bkey_s_c k;
struct xattr_buf buf = { .buf = buffer, .len = buffer_size };
- u64 inum = dentry->d_inode->i_ino;
+ u64 offset = 0, inum = inode->ei_inode.bi_inum;
+ u32 snapshot;
int ret;
bch2_trans_init(&trans, c, 0, 0);
+retry:
+ bch2_trans_begin(&trans);
+ iter = (struct btree_iter) { NULL };
+
+ ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot);
+ if (ret)
+ goto err;
- for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
- POS(inum, 0), 0, k, ret) {
+ for_each_btree_key_norestart(&trans, iter, BTREE_ID_xattrs,
+ SPOS(inum, offset, snapshot), 0, k, ret) {
BUG_ON(k.k->p.inode < inum);
if (k.k->p.inode > inum)
if (ret)
break;
}
- bch2_trans_iter_put(&trans, iter);
- ret = bch2_trans_exit(&trans) ?: ret;
+ offset = iter.pos.offset;
+ bch2_trans_iter_exit(&trans, &iter);
+err:
+ if (ret == -EINTR)
+ goto retry;
+
+ bch2_trans_exit(&trans);
if (ret)
return ret;
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
- return bch2_trans_do(c, NULL, &inode->ei_journal_seq, 0,
- bch2_xattr_set(&trans, inode->v.i_ino, &hash,
+ return bch2_trans_do(c, NULL, NULL, 0,
+ bch2_xattr_set(&trans, inode_inum(inode), &hash,
name, value, size,
handler->flags, flags));
}
memcpy(buf, value, size);
buf[size] = '\0';
- ret = bch2_opt_parse(c, opt, buf, &v);
+ ret = bch2_opt_parse(c, NULL, opt, buf, &v);
kfree(buf);
if (ret < 0)
int bch2_xattr_get(struct bch_fs *, struct bch_inode_info *,
const char *, void *, size_t, int);
-int bch2_xattr_set(struct btree_trans *, u64, const struct bch_hash_info *,
+int bch2_xattr_set(struct btree_trans *, subvol_inum,
+ const struct bch_hash_info *,
const char *, const void *, size_t, int, int);
ssize_t bch2_xattr_list(struct dentry *, char *, size_t);
#include "tools-util.h"
+struct fops {
+	void (*init)(void);
+	void (*cleanup)(void);
+	void (*read)(struct bio *bio, struct iovec *iov, unsigned i);
+	void (*write)(struct bio *bio, struct iovec *iov, unsigned i);
+};
+
+static struct fops *fops;
static io_context_t aio_ctx;
static atomic_t running_requests;
#endif
}
- struct iocb iocb = {
- .data = bio,
- .aio_fildes = bio->bi_opf & REQ_FUA
- ? bio->bi_bdev->bd_sync_fd
- : bio->bi_bdev->bd_fd,
- }, *iocbp = &iocb;
-
switch (bio_op(bio)) {
case REQ_OP_READ:
- iocb.aio_lio_opcode = IO_CMD_PREADV;
- iocb.u.v.vec = iov;
- iocb.u.v.nr = i;
- iocb.u.v.offset = bio->bi_iter.bi_sector << 9;
-
- atomic_inc(&running_requests);
- ret = io_submit(aio_ctx, 1, &iocbp);
- if (ret != 1)
- die("io_submit err: %s", strerror(-ret));
+ fops->read(bio, iov, i);
break;
case REQ_OP_WRITE:
- iocb.aio_lio_opcode = IO_CMD_PWRITEV;
- iocb.u.v.vec = iov;
- iocb.u.v.nr = i;
- iocb.u.v.offset = bio->bi_iter.bi_sector << 9;
-
- atomic_inc(&running_requests);
- ret = io_submit(aio_ctx, 1, &iocbp);
- if (ret != 1)
- die("io_submit err: %s", strerror(-ret));
+ fops->write(bio, iov, i);
break;
case REQ_OP_FLUSH:
ret = fsync(bio->bi_bdev->bd_fd);
bdev->bd_sync_fd = sync_fd;
bdev->bd_holder = holder;
bdev->bd_disk = &bdev->__bd_disk;
- bdev->bd_bdi = &bdev->__bd_bdi;
- bdev->queue.backing_dev_info = bdev->bd_bdi;
+ bdev->bd_disk->bdi = &bdev->bd_disk->__bdi;
+ bdev->queue.backing_dev_info = bdev->bd_disk->bdi;
return bdev;
}
return -EINVAL;
}
+static void io_fallback(void)
+{
+ fops++;
+ if (fops->init == NULL)
+ die("no fallback possible, something is very wrong");
+ fops->init();
+}
+
+static void sync_check(struct bio *bio, int ret)
+{
+ if (ret != bio->bi_iter.bi_size) {
+ die("IO error: %s\n", strerror(-ret));
+ }
+
+ if (bio->bi_opf & REQ_FUA) {
+ ret = fdatasync(bio->bi_bdev->bd_fd);
+ if (ret)
+ die("fsync error: %s\n", strerror(-ret));
+ }
+ bio_endio(bio);
+}
+
+static void sync_init(void) {}
+
+static void sync_cleanup(void)
+{
+ /* not necessary? */
+ sync();
+}
+
+static void sync_read(struct bio *bio, struct iovec *iov, unsigned i)
+{
+	int fd = bio->bi_opf & REQ_FUA
+		? bio->bi_bdev->bd_sync_fd
+		: bio->bi_bdev->bd_fd;
+	ssize_t ret = preadv(fd, iov, i, bio->bi_iter.bi_sector << 9);
+	sync_check(bio, ret);
+}
+
+static void sync_write(struct bio *bio, struct iovec *iov, unsigned i)
+{
+ int fd = bio->bi_opf & REQ_FUA
+ ? bio->bi_bdev->bd_sync_fd
+ : bio->bi_bdev->bd_fd;
+ ssize_t ret = pwritev(fd, iov, i, bio->bi_iter.bi_sector << 9);
+ sync_check(bio, ret);
+}
+
static int aio_completion_thread(void *arg)
{
struct io_event events[8], *ev;
static struct task_struct *aio_task = NULL;
-__attribute__((constructor(102)))
-static void blkdev_init(void)
+static void aio_init(void)
{
struct task_struct *p;
-	if (io_setup(256, &aio_ctx))
-		die("io_setup() error: %m");
-
-	p = kthread_run(aio_completion_thread, NULL, "aio_completion");
-	BUG_ON(IS_ERR(p));
-	aio_task = p;
+	long err = io_setup(256, &aio_ctx);
+	if (!err) {
+		p = kthread_run(aio_completion_thread, NULL, "aio_completion");
+		BUG_ON(IS_ERR(p));
+		aio_task = p;
+	} else if (err == -ENOSYS) {
+		io_fallback();
+	} else {
+		die("io_setup() error: %s", strerror(err));
+	}
}
-__attribute__((destructor(102)))
-static void blkdev_cleanup(void)
+static void aio_cleanup(void)
{
struct task_struct *p = NULL;
swap(aio_task, p);
close(fds[0]);
close(fds[1]);
}
+
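+/*
+ * For IO_CMD_PREADV/PWRITEV the iovec array and count are passed via
+ * u.c.buf/u.c.nbytes, which overlay u.v.vec/u.v.nr in the iocb union;
+ * this matches what libaio's io_prep_preadv()/io_prep_pwritev() do.
+ */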
+static void aio_op(struct bio *bio, struct iovec *iov, unsigned i, int opcode)
+{
+ ssize_t ret;
+ struct iocb iocb = {
+ .data = bio,
+ .aio_fildes = bio->bi_opf & REQ_FUA
+ ? bio->bi_bdev->bd_sync_fd
+ : bio->bi_bdev->bd_fd,
+ .aio_lio_opcode = opcode,
+ .u.c.buf = iov,
+ .u.c.nbytes = i,
+ .u.c.offset = bio->bi_iter.bi_sector << 9,
+
+ }, *iocbp = &iocb;
+
+ atomic_inc(&running_requests);
+ ret = io_submit(aio_ctx, 1, &iocbp);
+ if (ret != 1)
+ die("io_submit err: %s", strerror(-ret));
+}
+
+static void aio_read(struct bio *bio, struct iovec *iov, unsigned i)
+{
+ aio_op(bio, iov, i, IO_CMD_PREADV);
+}
+
+static void aio_write(struct bio *bio, struct iovec *iov, unsigned i)
+{
+ aio_op(bio, iov, i, IO_CMD_PWRITEV);
+}
+
+/* not implemented */
+static void uring_init(void)
+{
+	io_fallback();
+}
+
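+/*
+ * Backends in order of preference; io_fallback() advances to the next entry
+ * until an init succeeds, and the empty terminator makes it die() rather
+ * than run off the end of the list.
+ */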
+struct fops fops_list[] = {
+ {
+ .init = uring_init,
+ }, {
+ .init = aio_init,
+ .cleanup = aio_cleanup,
+ .read = aio_read,
+ .write = aio_write,
+ }, {
+ .init = sync_init,
+ .cleanup = sync_cleanup,
+ .read = sync_read,
+ .write = sync_write,
+ }, {
+ /* NULL */
+ }
+};
+
+__attribute__((constructor(102)))
+static void blkdev_init(void)
+{
+ fops = fops_list;
+ fops->init();
+}
+
+__attribute__((destructor(102)))
+static void blkdev_cleanup(void)
+{
+ fops->cleanup();
+}
--- /dev/null
+/* Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.
+ *
+ * SipHash: a fast short-input PRF
+ * https://131002.net/siphash/
+ *
+ * This implementation is specifically for SipHash2-4 for a secure PRF
+ * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for
+ * hashtables.
+ */
+
+#include <linux/siphash.h>
+#include <linux/bitops.h>
+#include <asm/unaligned.h>
+
+#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
+#include <linux/dcache.h>
+#include <asm/word-at-a-time.h>
+#endif
+
+#define SIPROUND \
+ do { \
+ v0 += v1; v1 = rol64(v1, 13); v1 ^= v0; v0 = rol64(v0, 32); \
+ v2 += v3; v3 = rol64(v3, 16); v3 ^= v2; \
+ v0 += v3; v3 = rol64(v3, 21); v3 ^= v0; \
+ v2 += v1; v1 = rol64(v1, 17); v1 ^= v2; v2 = rol64(v2, 32); \
+ } while (0)
+
+#define PREAMBLE(len) \
+ u64 v0 = 0x736f6d6570736575ULL; \
+ u64 v1 = 0x646f72616e646f6dULL; \
+ u64 v2 = 0x6c7967656e657261ULL; \
+ u64 v3 = 0x7465646279746573ULL; \
+ u64 b = ((u64)(len)) << 56; \
+ v3 ^= key->key[1]; \
+ v2 ^= key->key[0]; \
+ v1 ^= key->key[1]; \
+ v0 ^= key->key[0];
+
+#define POSTAMBLE \
+ v3 ^= b; \
+ SIPROUND; \
+ SIPROUND; \
+ v0 ^= b; \
+ v2 ^= 0xff; \
+ SIPROUND; \
+ SIPROUND; \
+ SIPROUND; \
+ SIPROUND; \
+ return (v0 ^ v1) ^ (v2 ^ v3);
+
+u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key)
+{
+ const u8 *end = data + len - (len % sizeof(u64));
+ const u8 left = len & (sizeof(u64) - 1);
+ u64 m;
+ PREAMBLE(len)
+ for (; data != end; data += sizeof(u64)) {
+ m = le64_to_cpup(data);
+ v3 ^= m;
+ SIPROUND;
+ SIPROUND;
+ v0 ^= m;
+ }
+#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
+ if (left)
+ b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
+ bytemask_from_count(left)));
+#else
+ switch (left) {
+ case 7: b |= ((u64)end[6]) << 48; fallthrough;
+ case 6: b |= ((u64)end[5]) << 40; fallthrough;
+ case 5: b |= ((u64)end[4]) << 32; fallthrough;
+ case 4: b |= le32_to_cpup(data); break;
+ case 3: b |= ((u64)end[2]) << 16; fallthrough;
+ case 2: b |= le16_to_cpup(data); break;
+ case 1: b |= end[0];
+ }
+#endif
+ POSTAMBLE
+}
+EXPORT_SYMBOL(__siphash_aligned);
+
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key)
+{
+ const u8 *end = data + len - (len % sizeof(u64));
+ const u8 left = len & (sizeof(u64) - 1);
+ u64 m;
+ PREAMBLE(len)
+ for (; data != end; data += sizeof(u64)) {
+ m = get_unaligned_le64(data);
+ v3 ^= m;
+ SIPROUND;
+ SIPROUND;
+ v0 ^= m;
+ }
+#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
+ if (left)
+ b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
+ bytemask_from_count(left)));
+#else
+ switch (left) {
+ case 7: b |= ((u64)end[6]) << 48; fallthrough;
+ case 6: b |= ((u64)end[5]) << 40; fallthrough;
+ case 5: b |= ((u64)end[4]) << 32; fallthrough;
+ case 4: b |= get_unaligned_le32(end); break;
+ case 3: b |= ((u64)end[2]) << 16; fallthrough;
+ case 2: b |= get_unaligned_le16(end); break;
+ case 1: b |= end[0];
+ }
+#endif
+ POSTAMBLE
+}
+EXPORT_SYMBOL(__siphash_unaligned);
+#endif
+
+/**
+ * siphash_1u64 - compute 64-bit siphash PRF value of a u64
+ * @first: first u64
+ * @key: the siphash key
+ */
+u64 siphash_1u64(const u64 first, const siphash_key_t *key)
+{
+ PREAMBLE(8)
+ v3 ^= first;
+ SIPROUND;
+ SIPROUND;
+ v0 ^= first;
+ POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_1u64);
+
+/**
+ * siphash_2u64 - compute 64-bit siphash PRF value of 2 u64
+ * @first: first u64
+ * @second: second u64
+ * @key: the siphash key
+ */
+u64 siphash_2u64(const u64 first, const u64 second, const siphash_key_t *key)
+{
+ PREAMBLE(16)
+ v3 ^= first;
+ SIPROUND;
+ SIPROUND;
+ v0 ^= first;
+ v3 ^= second;
+ SIPROUND;
+ SIPROUND;
+ v0 ^= second;
+ POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_2u64);
+
+/**
+ * siphash_3u64 - compute 64-bit siphash PRF value of 3 u64
+ * @first: first u64
+ * @second: second u64
+ * @third: third u64
+ * @key: the siphash key
+ */
+u64 siphash_3u64(const u64 first, const u64 second, const u64 third,
+ const siphash_key_t *key)
+{
+ PREAMBLE(24)
+ v3 ^= first;
+ SIPROUND;
+ SIPROUND;
+ v0 ^= first;
+ v3 ^= second;
+ SIPROUND;
+ SIPROUND;
+ v0 ^= second;
+ v3 ^= third;
+ SIPROUND;
+ SIPROUND;
+ v0 ^= third;
+ POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_3u64);
+
+/**
+ * siphash_4u64 - compute 64-bit siphash PRF value of 4 u64
+ * @first: first u64
+ * @second: second u64
+ * @third: third u64
+ * @forth: fourth u64
+ * @key: the siphash key
+ */
+u64 siphash_4u64(const u64 first, const u64 second, const u64 third,
+ const u64 forth, const siphash_key_t *key)
+{
+ PREAMBLE(32)
+ v3 ^= first;
+ SIPROUND;
+ SIPROUND;
+ v0 ^= first;
+ v3 ^= second;
+ SIPROUND;
+ SIPROUND;
+ v0 ^= second;
+ v3 ^= third;
+ SIPROUND;
+ SIPROUND;
+ v0 ^= third;
+ v3 ^= forth;
+ SIPROUND;
+ SIPROUND;
+ v0 ^= forth;
+ POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_4u64);
+
+u64 siphash_1u32(const u32 first, const siphash_key_t *key)
+{
+ PREAMBLE(4)
+ b |= first;
+ POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_1u32);
+
+u64 siphash_3u32(const u32 first, const u32 second, const u32 third,
+ const siphash_key_t *key)
+{
+ u64 combined = (u64)second << 32 | first;
+ PREAMBLE(12)
+ v3 ^= combined;
+ SIPROUND;
+ SIPROUND;
+ v0 ^= combined;
+ b |= third;
+ POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_3u32);
+
+#if BITS_PER_LONG == 64
+/* Note that on 64-bit, we make HalfSipHash1-3 actually be SipHash1-3, for
+ * performance reasons. On 32-bit, below, we actually implement HalfSipHash1-3.
+ */
+
+#define HSIPROUND SIPROUND
+#define HPREAMBLE(len) PREAMBLE(len)
+#define HPOSTAMBLE \
+ v3 ^= b; \
+ HSIPROUND; \
+ v0 ^= b; \
+ v2 ^= 0xff; \
+ HSIPROUND; \
+ HSIPROUND; \
+ HSIPROUND; \
+ return (v0 ^ v1) ^ (v2 ^ v3);
+
+u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key)
+{
+ const u8 *end = data + len - (len % sizeof(u64));
+ const u8 left = len & (sizeof(u64) - 1);
+ u64 m;
+ HPREAMBLE(len)
+ for (; data != end; data += sizeof(u64)) {
+ m = le64_to_cpup(data);
+ v3 ^= m;
+ HSIPROUND;
+ v0 ^= m;
+ }
+#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
+ if (left)
+ b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
+ bytemask_from_count(left)));
+#else
+ switch (left) {
+ case 7: b |= ((u64)end[6]) << 48; fallthrough;
+ case 6: b |= ((u64)end[5]) << 40; fallthrough;
+ case 5: b |= ((u64)end[4]) << 32; fallthrough;
+ case 4: b |= le32_to_cpup(data); break;
+ case 3: b |= ((u64)end[2]) << 16; fallthrough;
+ case 2: b |= le16_to_cpup(data); break;
+ case 1: b |= end[0];
+ }
+#endif
+ HPOSTAMBLE
+}
+EXPORT_SYMBOL(__hsiphash_aligned);
+
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+u32 __hsiphash_unaligned(const void *data, size_t len,
+ const hsiphash_key_t *key)
+{
+ const u8 *end = data + len - (len % sizeof(u64));
+ const u8 left = len & (sizeof(u64) - 1);
+ u64 m;
+ HPREAMBLE(len)
+ for (; data != end; data += sizeof(u64)) {
+ m = get_unaligned_le64(data);
+ v3 ^= m;
+ HSIPROUND;
+ v0 ^= m;
+ }
+#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
+ if (left)
+ b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
+ bytemask_from_count(left)));
+#else
+ switch (left) {
+ case 7: b |= ((u64)end[6]) << 48; fallthrough;
+ case 6: b |= ((u64)end[5]) << 40; fallthrough;
+ case 5: b |= ((u64)end[4]) << 32; fallthrough;
+ case 4: b |= get_unaligned_le32(end); break;
+ case 3: b |= ((u64)end[2]) << 16; fallthrough;
+ case 2: b |= get_unaligned_le16(end); break;
+ case 1: b |= end[0];
+ }
+#endif
+ HPOSTAMBLE
+}
+EXPORT_SYMBOL(__hsiphash_unaligned);
+#endif
+
+/**
+ * hsiphash_1u32 - compute 32-bit hsiphash PRF value of a u32
+ * @first: first u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key)
+{
+ HPREAMBLE(4)
+ b |= first;
+ HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_1u32);
+
+/**
+ * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32
+ * @first: first u32
+ * @second: second u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key)
+{
+ u64 combined = (u64)second << 32 | first;
+ HPREAMBLE(8)
+ v3 ^= combined;
+ HSIPROUND;
+ v0 ^= combined;
+ HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_2u32);
+
+/**
+ * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32
+ * @first: first u32
+ * @second: second u32
+ * @third: third u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third,
+ const hsiphash_key_t *key)
+{
+ u64 combined = (u64)second << 32 | first;
+ HPREAMBLE(12)
+ v3 ^= combined;
+ HSIPROUND;
+ v0 ^= combined;
+ b |= third;
+ HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_3u32);
+
+/**
+ * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32
+ * @first: first u32
+ * @second: second u32
+ * @third: third u32
+ * @forth: fourth u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third,
+ const u32 forth, const hsiphash_key_t *key)
+{
+ u64 combined = (u64)second << 32 | first;
+ HPREAMBLE(16)
+ v3 ^= combined;
+ HSIPROUND;
+ v0 ^= combined;
+ combined = (u64)forth << 32 | third;
+ v3 ^= combined;
+ HSIPROUND;
+ v0 ^= combined;
+ HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_4u32);
+#else
+#define HSIPROUND \
+ do { \
+ v0 += v1; v1 = rol32(v1, 5); v1 ^= v0; v0 = rol32(v0, 16); \
+ v2 += v3; v3 = rol32(v3, 8); v3 ^= v2; \
+ v0 += v3; v3 = rol32(v3, 7); v3 ^= v0; \
+ v2 += v1; v1 = rol32(v1, 13); v1 ^= v2; v2 = rol32(v2, 16); \
+ } while (0)
+
+#define HPREAMBLE(len) \
+ u32 v0 = 0; \
+ u32 v1 = 0; \
+ u32 v2 = 0x6c796765U; \
+ u32 v3 = 0x74656462U; \
+ u32 b = ((u32)(len)) << 24; \
+ v3 ^= key->key[1]; \
+ v2 ^= key->key[0]; \
+ v1 ^= key->key[1]; \
+ v0 ^= key->key[0];
+
+#define HPOSTAMBLE \
+ v3 ^= b; \
+ HSIPROUND; \
+ v0 ^= b; \
+ v2 ^= 0xff; \
+ HSIPROUND; \
+ HSIPROUND; \
+ HSIPROUND; \
+ return v1 ^ v3;
+
+u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key)
+{
+ const u8 *end = data + len - (len % sizeof(u32));
+ const u8 left = len & (sizeof(u32) - 1);
+ u32 m;
+ HPREAMBLE(len)
+ for (; data != end; data += sizeof(u32)) {
+ m = le32_to_cpup(data);
+ v3 ^= m;
+ HSIPROUND;
+ v0 ^= m;
+ }
+ switch (left) {
+ case 3: b |= ((u32)end[2]) << 16; fallthrough;
+ case 2: b |= le16_to_cpup(data); break;
+ case 1: b |= end[0];
+ }
+ HPOSTAMBLE
+}
+EXPORT_SYMBOL(__hsiphash_aligned);
+
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+u32 __hsiphash_unaligned(const void *data, size_t len,
+ const hsiphash_key_t *key)
+{
+ const u8 *end = data + len - (len % sizeof(u32));
+ const u8 left = len & (sizeof(u32) - 1);
+ u32 m;
+ HPREAMBLE(len)
+ for (; data != end; data += sizeof(u32)) {
+ m = get_unaligned_le32(data);
+ v3 ^= m;
+ HSIPROUND;
+ v0 ^= m;
+ }
+ switch (left) {
+ case 3: b |= ((u32)end[2]) << 16; fallthrough;
+ case 2: b |= get_unaligned_le16(end); break;
+ case 1: b |= end[0];
+ }
+ HPOSTAMBLE
+}
+EXPORT_SYMBOL(__hsiphash_unaligned);
+#endif
+
+/**
+ * hsiphash_1u32 - compute 32-bit hsiphash PRF value of a u32
+ * @first: first u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key)
+{
+ HPREAMBLE(4)
+ v3 ^= first;
+ HSIPROUND;
+ v0 ^= first;
+ HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_1u32);
+
+/**
+ * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32
+ * @first: first u32
+ * @second: second u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key)
+{
+ HPREAMBLE(8)
+ v3 ^= first;
+ HSIPROUND;
+ v0 ^= first;
+ v3 ^= second;
+ HSIPROUND;
+ v0 ^= second;
+ HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_2u32);
+
+/**
+ * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32
+ * @first: first u32
+ * @second: second u32
+ * @third: third u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third,
+ const hsiphash_key_t *key)
+{
+ HPREAMBLE(12)
+ v3 ^= first;
+ HSIPROUND;
+ v0 ^= first;
+ v3 ^= second;
+ HSIPROUND;
+ v0 ^= second;
+ v3 ^= third;
+ HSIPROUND;
+ v0 ^= third;
+ HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_3u32);
+
+/**
+ * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32
+ * @first: first u32
+ * @second: second u32
+ * @third: third u32
+ * @forth: fourth u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third,
+ const u32 forth, const hsiphash_key_t *key)
+{
+ HPREAMBLE(16)
+ v3 ^= first;
+ HSIPROUND;
+ v0 ^= first;
+ v3 ^= second;
+ HSIPROUND;
+ v0 ^= second;
+ v3 ^= third;
+ HSIPROUND;
+ v0 ^= third;
+ v3 ^= forth;
+ HSIPROUND;
+ v0 ^= forth;
+ HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_4u32);
+#endif
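+
+/*
+ * Illustrative sketch (not part of this file; "id" and "hash" are
+ * hypothetical): the key must be secret and random for the PRF guarantees
+ * to hold:
+ *
+ *	siphash_key_t key;
+ *	u64 hash;
+ *
+ *	get_random_bytes(&key, sizeof(key));
+ *	hash = siphash_1u64(id, &key);
+ */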
+++ /dev/null
-fn main() {
- use std::path::PathBuf;
- use std::process::Command;
-
- let out_dir: PathBuf = std::env::var_os("OUT_DIR").unwrap().into();
- let top_dir: PathBuf = std::env::var_os("CARGO_MANIFEST_DIR").unwrap().into();
- let libbcachefs_inc_dir = std::env::var("LIBBCACHEFS_INCLUDE")
- .unwrap_or_else(|_| top_dir.join("libbcachefs").display().to_string());
- let libbcachefs_inc_dir = std::path::Path::new(&libbcachefs_inc_dir);
- println!("{}", libbcachefs_inc_dir.display());
-
- let libbcachefs_dir = top_dir.join("libbcachefs").join("libbcachefs");
- let bindings = bindgen::builder()
- .header(top_dir
- .join("src")
- .join("libbcachefs_wrapper.h")
- .display()
- .to_string())
- .clang_arg(format!(
- "-I{}",
- libbcachefs_inc_dir.join("include").display()
- ))
- .clang_arg(format!("-I{}", libbcachefs_inc_dir.display()))
- .clang_arg("-DZSTD_STATIC_LINKING_ONLY")
- .clang_arg("-DNO_BCACHEFS_FS")
- .clang_arg("-D_GNU_SOURCE")
- .derive_debug(false)
- .derive_default(true)
- .default_enum_style(bindgen::EnumVariation::Rust {
- non_exhaustive: true,
- })
- .whitelist_function("bch2_read_super")
- .whitelist_function("bch2_sb_field_.*")
- .whitelist_function("bch2_chacha_encrypt_key")
- .whitelist_function("derive_passphrase")
- .whitelist_function("request_key")
- .whitelist_function("add_key")
- .whitelist_function("keyctl_search")
- .whitelist_var("BCH_.*")
- .whitelist_var("KEY_SPEC_.*")
- .whitelist_type("bch_kdf_types")
- .whitelist_type("bch_sb_field_.*")
- .whitelist_type("bch_encrypted_key")
- .whitelist_type("nonce")
- .rustified_enum("bch_kdf_types")
- .opaque_type("gendisk")
- .opaque_type("bkey")
- .generate()
- .unwrap();
- bindings.write_to_file(out_dir.join("bcachefs.rs")).unwrap();
-
- let keyutils = pkg_config::probe_library("libkeyutils").unwrap();
- let bindings = bindgen::builder()
- .header(top_dir
- .join("src")
- .join("keyutils_wrapper.h")
- .display()
- .to_string())
- .clang_args(
- keyutils.include_paths
- .iter()
- .map(|p| format!("-I{}", p.display())),
- )
- .generate()
- .unwrap();
- bindings.write_to_file(out_dir.join("keyutils.rs")).unwrap();
-}
+++ /dev/null
-extern "C" {
- pub static stdout: *mut libc::FILE;
-}
-
-use getset::{CopyGetters, Getters};
-use std::path::PathBuf;
-#[derive(Getters, CopyGetters)]
-pub struct FileSystem {
- /// External UUID of the bcachefs
- #[getset(get = "pub")]
- uuid: uuid::Uuid,
- /// Whether filesystem is encrypted
- #[getset(get_copy = "pub")]
- encrypted: bool,
- /// Super block
- #[getset(get = "pub")]
- sb: bcachefs::bch_sb_handle,
- /// Member devices for this filesystem
- #[getset(get = "pub")]
- devices: Vec<PathBuf>,
-}
-
-/// Parse a comma-separated mount options and split out mountflags and filesystem
-/// specific options.
-fn parse_mount_options(options: impl AsRef<str>) -> (Option<String>, u64) {
- use either::Either::*;
- let (opts, flags) = options
- .as_ref()
- .split(",")
- .map(|o| match o {
- "dirsync" => Left(libc::MS_DIRSYNC),
- "lazytime" => Left(1 << 25), // MS_LAZYTIME
- "mand" => Left(libc::MS_MANDLOCK),
- "noatime" => Left(libc::MS_NOATIME),
- "nodev" => Left(libc::MS_NODEV),
- "nodiratime" => Left(libc::MS_NODIRATIME),
- "noexec" => Left(libc::MS_NOEXEC),
- "nosuid" => Left(libc::MS_NOSUID),
- "ro" => Left(libc::MS_RDONLY),
- "rw" => Left(0),
- "relatime" => Left(libc::MS_RELATIME),
- "strictatime" => Left(libc::MS_STRICTATIME),
- "sync" => Left(libc::MS_SYNCHRONOUS),
- "" => Left(0),
- o @ _ => Right(o),
- })
- .fold((Vec::new(), 0), |(mut opts, flags), next| match next {
- Left(f) => (opts, flags | f),
- Right(o) => {
- opts.push(o);
- (opts, flags)
- }
- });
-
- use itertools::Itertools;
- (
- if opts.len() == 0 {
- None
- } else {
- Some(opts.iter().join(","))
- },
- flags,
- )
-}
-
-impl FileSystem {
- pub(crate) fn new(sb: bcachefs::bch_sb_handle) -> Self {
- Self {
- uuid: sb.sb().uuid(),
- encrypted: sb.sb().crypt().is_some(),
- sb: sb,
- devices: vec![],
- }
- }
-
- pub fn mount(
- &self,
- target: impl AsRef<std::path::Path>,
- options: impl AsRef<str>,
- ) -> anyhow::Result<()> {
- use itertools::Itertools;
- use std::ffi::c_void;
- use std::os::raw::c_char;
- use std::os::unix::ffi::OsStrExt;
- let src = self.devices.iter().map(|d| d.display()).join(":");
- let (data, mountflags) = parse_mount_options(options);
- let fstype = c_str!("bcachefs");
-
- let src = std::ffi::CString::new(src)?; // bind the CString to keep it alive
- let target = std::ffi::CString::new(target.as_ref().as_os_str().as_bytes())?; // ditto
- let data = data.map(|data| std::ffi::CString::new(data)).transpose()?; // ditto
-
- let src = src.as_c_str().to_bytes_with_nul().as_ptr() as *const c_char;
- let target = target.as_c_str().to_bytes_with_nul().as_ptr() as *const c_char;
- let data = data.as_ref().map_or(std::ptr::null(), |data| {
- data.as_c_str().to_bytes_with_nul().as_ptr() as *const c_void
- });
-
- let ret = unsafe { libc::mount(src, target, fstype, mountflags, data) };
- if ret == 0 {
- Ok(())
- } else {
- Err(crate::ErrnoError(errno::errno()).into())
- }
- }
-}
-
-use crate::bcachefs;
-use std::collections::HashMap;
-use uuid::Uuid;
-pub fn probe_filesystems() -> anyhow::Result<HashMap<Uuid, FileSystem>> {
- use std::os::unix::ffi::OsStrExt;
- let mut udev = udev::Enumerator::new()?;
- let mut fss = HashMap::new();
- udev.match_subsystem("block")?;
-
- {
- // Stop libbcachefs from spamming the output
- let _gag = gag::Gag::stdout().unwrap();
- for dev in udev.scan_devices()? {
- if let Some(p) = dev.devnode() {
- let path =
- std::ffi::CString::new(p.as_os_str().as_bytes()).unwrap();
- let result = unsafe {
- let mut opts = std::mem::MaybeUninit::zeroed();
- let mut sb = std::mem::MaybeUninit::zeroed();
- let ret = bcachefs::bch2_read_super(
- path.as_ptr(),
- opts.as_mut_ptr(),
- sb.as_mut_ptr(),
- );
- if ret == -libc::EACCES {
- Err(std::io::Error::new(
- std::io::ErrorKind::PermissionDenied,
- "no permission",
- ))
- } else if ret != 0 {
- Err(std::io::Error::new(
- std::io::ErrorKind::Other,
- "failed to read super",
- ))
- } else {
- Ok((opts.assume_init(), sb.assume_init()))
- }
- };
- match result {
- Ok((_, sb)) => match fss.get_mut(&sb.sb().uuid()) {
- None => {
- let mut fs = FileSystem::new(sb);
- fs.devices.push(p.to_owned());
- fss.insert(fs.uuid, fs);
- }
- Some(fs) => {
- fs.devices.push(p.to_owned());
- }
- },
- Err(e) if e.kind()
- != std::io::ErrorKind::PermissionDenied =>
- {
- ()
- }
- e @ Err(_) => {
- e?;
- }
- }
- }
- }
- // Flush stdout so buffered output don't get printed after we remove the gag
- unsafe {
- libc::fflush(stdout);
- }
- }
- Ok(fss)
-}
+++ /dev/null
-use structopt::StructOpt;
-use anyhow::anyhow;
-
-#[macro_export]
-macro_rules! c_str {
- ($lit:expr) => {
- unsafe { std::ffi::CStr::from_ptr(concat!($lit, "\0").as_ptr() as *const std::os::raw::c_char)
- .to_bytes_with_nul()
- .as_ptr() as *const std::os::raw::c_char }
- };
-}
-
-#[derive(Debug)]
-struct ErrnoError(errno::Errno);
-impl std::fmt::Display for ErrnoError {
- fn fmt(&self, f: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> {
- self.0.fmt(f)
- }
-}
-impl std::error::Error for ErrnoError {}
-
-#[derive(Debug)]
-pub(crate) enum KeyLocation {
- Fail,
- Wait,
- Ask,
-}
-
-impl std::str::FromStr for KeyLocation {
- type Err = anyhow::Error;
- fn from_str(s: &str) -> anyhow::Result<Self> {
- use anyhow::anyhow;
- match s {
- "fail" => Ok(Self::Fail),
- "wait" => Ok(Self::Wait),
- "ask" => Ok(Self::Ask),
- _ => Err(anyhow!("invalid password option"))
- }
- }
-}
-
-#[derive(StructOpt, Debug)]
-/// Mount a bcachefs filesystem by its UUID.
-struct Options {
- /// Where the password would be loaded from.
- ///
- /// Possible values are:
- /// "fail" - don't ask for password, fail if filesystem is encrypted;
- /// "wait" - wait for password to become available before mounting;
- /// "ask" - prompt the user for password;
- #[structopt(short, long, default_value = "fail")]
- key_location: KeyLocation,
-
- /// External UUID of the bcachefs filesystem
- uuid: uuid::Uuid,
-
- /// Where the filesystem should be mounted. If not set, then the filesystem
- /// won't actually be mounted. But all steps preceeding mounting the
- /// filesystem (e.g. asking for passphrase) will still be performed.
- mountpoint: Option<std::path::PathBuf>,
-
- /// Mount options
- #[structopt(short, default_value = "")]
- options: String,
-}
-
-mod filesystem;
-mod key;
-mod keyutils {
- #![allow(non_upper_case_globals)]
- #![allow(non_camel_case_types)]
- #![allow(non_snake_case)]
- #![allow(unused)]
-
- include!(concat!(env!("OUT_DIR"), "/keyutils.rs"));
-}
-
-mod bcachefs {
- #![allow(non_upper_case_globals)]
- #![allow(non_camel_case_types)]
- #![allow(non_snake_case)]
- #![allow(unused)]
-
- include!(concat!(env!("OUT_DIR"), "/bcachefs.rs"));
-
- use bitfield::bitfield;
- bitfield! {
- pub struct bch_scrypt_flags(u64);
- pub N, _: 15, 0;
- pub R, _: 31, 16;
- pub P, _: 47, 32;
- }
- bitfield! {
- pub struct bch_crypt_flags(u64);
- TYPE, _: 4, 0;
- }
- use memoffset::offset_of;
- impl bch_sb_field_crypt {
- pub fn scrypt_flags(&self) -> Option<bch_scrypt_flags> {
- let t = bch_crypt_flags(self.flags);
- if t.TYPE() != bch_kdf_types::BCH_KDF_SCRYPT as u64 {
- None
- } else {
- Some(bch_scrypt_flags(self.kdf_flags))
- }
- }
- pub fn key(&self) -> &bch_encrypted_key {
- &self.key
- }
- }
- impl bch_sb {
- pub fn crypt(&self) -> Option<&bch_sb_field_crypt> {
- unsafe {
- let ptr = bch2_sb_field_get(
- self as *const _ as *mut _,
- bch_sb_field_type::BCH_SB_FIELD_crypt,
- ) as *const u8;
- if ptr.is_null() {
- None
- } else {
- let offset = offset_of!(bch_sb_field_crypt, field);
- Some(&*((ptr.sub(offset)) as *const _))
- }
- }
- }
- pub fn uuid(&self) -> uuid::Uuid {
- uuid::Uuid::from_bytes(self.user_uuid.b)
- }
-
- /// Get the nonce used to encrypt the superblock
- pub fn nonce(&self) -> nonce {
- use byteorder::{ReadBytesExt, LittleEndian};
- let mut internal_uuid = &self.uuid.b[..];
- let dword1 = internal_uuid.read_u32::<LittleEndian>().unwrap();
- let dword2 = internal_uuid.read_u32::<LittleEndian>().unwrap();
- nonce { d: [0, 0, dword1, dword2] }
- }
- }
- impl bch_sb_handle {
- pub fn sb(&self) -> &bch_sb {
- unsafe { &*self.sb }
- }
- }
-}
-
-fn main_inner() -> anyhow::Result<()> {
- use itertools::Itertools;
- use log::{info, trace};
-
- env_logger::init();
- let opt = Options::from_args();
- trace!("{:?}", opt);
-
- let fss = filesystem::probe_filesystems()?;
- info!("Found {} bcachefs filesystems: ", fss.len());
- for fs in fss.values() {
- info!(
- "{} ({}): {}",
- fs.uuid(),
- if fs.encrypted() {
- "encrypted"
- } else {
- "unencrypted"
- },
- fs.devices().iter().map(|d| d.display()).join(" ")
- );
- }
-
- if let Some(fs) = fss.get(&opt.uuid) {
- if fs.encrypted() {
- info!("Making sure key is loaded for this filesystem");
- key::prepare_key(&fs, opt.key_location)?;
- }
-
- if let Some(p) = opt.mountpoint {
- fs.mount(&p, &opt.options)
- } else {
- Ok(())
- }
- } else {
- Err(anyhow!("Filesystem {} is not found", opt.uuid))
- }
-}
-
-#[no_mangle]
-pub extern "C" fn main() {
- if let Err(e) = main_inner() {
- println!("Error: {:?}", e);
- }
-}
--- /dev/null
+{ lib
+, fetchpatch
+, fetchgit
+, fetchFromGitHub
+, buildLinux
+, commit
+, sha256 ? lib.fakeSha256
+, kernelVersion ? "5.13.0"
+, kernelPatches ? [] # must always be defined in bcachefs' all-packages.nix entry because it's also a top-level attribute supplied by callPackage
+, argsOverride ? {}
+, versionString ? (builtins.substring 0 8 commit)
+, ...
+} @ args:
+
+buildLinux {
+ inherit kernelPatches;
+
+ # pname = "linux";
+ version = "${kernelVersion}-bcachefs-${versionString}";
+
+ modDirVersion = kernelVersion;
+
+
+ src = fetchFromGitHub {
+ name = "bcachefs-kernel-src";
+ owner = "koverstreet";
+ repo = "bcachefs";
+ rev = commit;
+ inherit sha256;
+ };
+
+ extraConfig = "BCACHEFS_FS m";
+ # NIX_DEBUG=5;
+}
\ No newline at end of file
--- /dev/null
+sha256-JsWrbuxrs047YKGES+r7mMfPdDWIMAGrg1fWi8qU4+A=
\ No newline at end of file
--- /dev/null
+{ filter, self, ... }:
+final: prev: {
+ bcachefs = {
+ tools = final.callPackage ../default.nix {
+ testWithValgrind = false;
+ filter = filter.lib;
+ lastModified = builtins.substring 0 8 self.lastModifiedDate;
+ versionString = self.version;
+ };
+ toolsValgrind = final.bcachefs.tools.override {
+ testWithValgrind = true;
+ };
+ toolsDebug = final.bcachefs.toolsValgrind.override {
+ debugMode = true;
+ };
+
+ bch_bindgen = final.callPackage ../rust-src/bch_bindgen {};
+
+ mount = final.callPackage ../rust-src/mount {};
+
+ kernelPackages = final.recurseIntoAttrs (final.linuxPackagesFor final.bcachefs.kernel);
+ kernel = final.callPackage ./bcachefs-kernel.nix {
+ commit = final.bcachefs.tools.bcachefs_revision;
+ # This needs to be recalculated for every revision change
+ sha256 = builtins.readFile ./bcachefs.rev.sha256;
+ kernelPatches = [];
+ };
+ };
+}
BuildRequires: libaio-devel
BuildRequires: libattr-devel
BuildRequires: libblkid-devel
-BuildRequires: libscrypt-devel
BuildRequires: libsodium-devel
BuildRequires: libtool-ltdl-devel
BuildRequires: libuuid-devel
Requires: libaio
Requires: libattr
Requires: libblkid
-Requires: libscrypt
Requires: libsodium
Requires: libtool-ltdl
Requires: libuuid
if (img->l1_index != -1) {
img->l1_table[img->l1_index] =
cpu_to_be64(img->offset|QCOW_OFLAG_COPIED);
- xpwrite(img->fd, img->l2_table, img->block_size, img->offset);
+ xpwrite(img->fd, img->l2_table, img->block_size, img->offset,
+ "qcow2 l2 table");
img->offset += img->block_size;
memset(img->l2_table, 0, img->block_size);
img.offset += img.block_size;
xpread(infd, buf, block_size, src_offset);
- xpwrite(outfd, buf, block_size, dst_offset);
+ xpwrite(outfd, buf, block_size, dst_offset,
+ "qcow2 data");
add_l2(&img, src_offset / block_size, dst_offset);
}
/* Write L1 table: */
dst_offset = img.offset;
img.offset += round_up(l1_size * sizeof(u64), block_size);
- xpwrite(img.fd, img.l1_table, l1_size * sizeof(u64), dst_offset);
+ xpwrite(img.fd, img.l1_table, l1_size * sizeof(u64), dst_offset,
+ "qcow2 l1 table");
/* Write header: */
hdr.magic = cpu_to_be32(QCOW_MAGIC);
memset(buf, 0, block_size);
memcpy(buf, &hdr, sizeof(hdr));
- xpwrite(img.fd, buf, block_size, 0);
+ xpwrite(img.fd, buf, block_size, 0,
+ "qcow2 header");
free(img.l2_table);
free(img.l1_table);
--- /dev/null
+# Generated by Cargo
+# will have compiled files and executables
+debug/
+target/
+
+# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
+# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
+# Required By Nix
+# Cargo.lock
+
+# These are backup files generated by rustfmt
+**/*.rs.bk
+
+# MSVC Windows builds of rustc generate these, which store debugging information
+*.pdb
--- /dev/null
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "anyhow"
+version = "1.0.44"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "61604a8f862e1d5c3229fdd78f8b02c68dcf73a4c4b05fd636d12240aaa242c1"
+
+[[package]]
+name = "autocfg"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a"
+
+[[package]]
+name = "bch_bindgen"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "bindgen",
+ "bitfield",
+ "byteorder",
+ "gag",
+ "libc",
+ "memoffset",
+ "pkg-config",
+ "tracing",
+ "tracing-attributes",
+ "udev",
+ "uuid",
+]
+
+[[package]]
+name = "bindgen"
+version = "0.59.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "453c49e5950bb0eb63bb3df640e31618846c89d5b7faa54040d76e98e0134375"
+dependencies = [
+ "bitflags",
+ "cexpr",
+ "clang-sys",
+ "lazy_static",
+ "lazycell",
+ "peeking_take_while",
+ "proc-macro2",
+ "quote",
+ "regex",
+ "rustc-hash",
+ "shlex",
+]
+
+[[package]]
+name = "bitfield"
+version = "0.13.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "46afbd2983a5d5a7bd740ccb198caf5b82f45c40c09c0eed36052d91cb92e719"
+
+[[package]]
+name = "bitflags"
+version = "1.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
+
+[[package]]
+name = "bitvec"
+version = "0.19.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8942c8d352ae1838c9dda0b0ca2ab657696ef2232a20147cf1b30ae1a9cb4321"
+dependencies = [
+ "funty",
+ "radium",
+ "tap",
+ "wyz",
+]
+
+[[package]]
+name = "byteorder"
+version = "1.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
+
+[[package]]
+name = "cexpr"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "db507a7679252d2276ed0dd8113c6875ec56d3089f9225b2b42c30cc1f8e5c89"
+dependencies = [
+ "nom",
+]
+
+[[package]]
+name = "cfg-if"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+
+[[package]]
+name = "clang-sys"
+version = "1.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "10612c0ec0e0a1ff0e97980647cb058a6e7aedb913d01d009c406b8b7d0b26ee"
+dependencies = [
+ "glob",
+ "libc",
+]
+
+[[package]]
+name = "filedescriptor"
+version = "0.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9ed3d8a5e20435ff00469e51a0d82049bae66504b5c429920dadf9bb54d47b3f"
+dependencies = [
+ "libc",
+ "thiserror",
+ "winapi",
+]
+
+[[package]]
+name = "funty"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fed34cd105917e91daa4da6b3728c47b068749d6a62c59811f06ed2ac71d9da7"
+
+[[package]]
+name = "gag"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a713bee13966e9fbffdf7193af71d54a6b35a0bb34997cd6c9519ebeb5005972"
+dependencies = [
+ "filedescriptor",
+ "tempfile",
+]
+
+[[package]]
+name = "getrandom"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7fcd999463524c52659517fe2cea98493cfe485d10565e7b0fb07dbba7ad2753"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "wasi",
+]
+
+[[package]]
+name = "glob"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574"
+
+[[package]]
+name = "lazy_static"
+version = "1.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
+
+[[package]]
+name = "lazycell"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
+
+[[package]]
+name = "libc"
+version = "0.2.103"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dd8f7255a17a627354f321ef0055d63b898c6fb27eff628af4d1b66b7331edf6"
+
+[[package]]
+name = "libudev-sys"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3c8469b4a23b962c1396b9b451dda50ef5b283e8dd309d69033475fa9b334324"
+dependencies = [
+ "libc",
+ "pkg-config",
+]
+
+[[package]]
+name = "memchr"
+version = "2.3.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525"
+
+[[package]]
+name = "memoffset"
+version = "0.5.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "043175f069eda7b85febe4a74abbaeff828d9f8b448515d3151a14a3542811aa"
+dependencies = [
+ "autocfg",
+]
+
+[[package]]
+name = "nom"
+version = "6.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9c5c51b9083a3c620fa67a2a635d1ce7d95b897e957d6b28ff9a5da960a103a6"
+dependencies = [
+ "bitvec",
+ "funty",
+ "memchr",
+ "version_check",
+]
+
+[[package]]
+name = "peeking_take_while"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
+
+[[package]]
+name = "pin-project-lite"
+version = "0.2.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8d31d11c69a6b52a174b42bdc0c30e5e11670f90788b2c471c31c1d17d449443"
+
+[[package]]
+name = "pkg-config"
+version = "0.3.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7c9b1041b4387893b91ee6746cddfc28516aff326a3519fb2adf820932c5e6cb"
+
+[[package]]
+name = "ppv-lite86"
+version = "0.2.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857"
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.29"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b9f5105d4fdaab20335ca9565e106a5d9b82b6219b5ba735731124ac6711d23d"
+dependencies = [
+ "unicode-xid",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "radium"
+version = "0.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "941ba9d78d8e2f7ce474c015eea4d9c6d25b6a3327f9832ee29a4de27f91bbb8"
+
+[[package]]
+name = "rand"
+version = "0.8.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2e7573632e6454cf6b99d7aac4ccca54be06da05aca2ef7423d22d27d4d4bcd8"
+dependencies = [
+ "libc",
+ "rand_chacha",
+ "rand_core",
+ "rand_hc",
+]
+
+[[package]]
+name = "rand_chacha"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
+dependencies = [
+ "ppv-lite86",
+ "rand_core",
+]
+
+[[package]]
+name = "rand_core"
+version = "0.6.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7"
+dependencies = [
+ "getrandom",
+]
+
+[[package]]
+name = "rand_hc"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d51e9f596de227fda2ea6c84607f5558e196eeaf43c986b724ba4fb8fdf497e7"
+dependencies = [
+ "rand_core",
+]
+
+[[package]]
+name = "redox_syscall"
+version = "0.2.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8383f39639269cde97d255a32bdb68c047337295414940c68bdd30c2e13203ff"
+dependencies = [
+ "bitflags",
+]
+
+[[package]]
+name = "regex"
+version = "1.5.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461"
+dependencies = [
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-syntax"
+version = "0.6.25"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b"
+
+[[package]]
+name = "remove_dir_all"
+version = "0.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7"
+dependencies = [
+ "winapi",
+]
+
+[[package]]
+name = "rustc-hash"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
+
+[[package]]
+name = "shlex"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3"
+
+[[package]]
+name = "syn"
+version = "1.0.77"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5239bc68e0fef57495900cfea4e8dc75596d9a319d7e16b1e0a440d24e6fe0a0"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-xid",
+]
+
+[[package]]
+name = "tap"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
+
+[[package]]
+name = "tempfile"
+version = "3.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dac1c663cfc93810f88aed9b8941d48cabf856a1b111c29a40439018d870eb22"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "rand",
+ "redox_syscall",
+ "remove_dir_all",
+ "winapi",
+]
+
+[[package]]
+name = "thiserror"
+version = "1.0.29"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "602eca064b2d83369e2b2f34b09c70b605402801927c65c11071ac911d299b88"
+dependencies = [
+ "thiserror-impl",
+]
+
+[[package]]
+name = "thiserror-impl"
+version = "1.0.29"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bad553cc2c78e8de258400763a647e80e6d1b31ee237275d756f6836d204494c"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "tracing"
+version = "0.1.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "84f96e095c0c82419687c20ddf5cb3eadb61f4e1405923c9dc8e53a1adacbda8"
+dependencies = [
+ "cfg-if",
+ "pin-project-lite",
+ "tracing-attributes",
+ "tracing-core",
+]
+
+[[package]]
+name = "tracing-attributes"
+version = "0.1.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "98863d0dd09fa59a1b79c6750ad80dbda6b75f4e71c437a6a1a8cb91a8bcbd77"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "tracing-core"
+version = "0.1.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "46125608c26121c81b0c6d693eab5a420e416da7e43c426d2e8f7df8da8a3acf"
+dependencies = [
+ "lazy_static",
+]
+
+[[package]]
+name = "udev"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "24953d50a3bce0f5f5a9a2766567072dc9af8096f8c40ea81815da651066bc9f"
+dependencies = [
+ "libc",
+ "libudev-sys",
+]
+
+[[package]]
+name = "unicode-xid"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3"
+
+[[package]]
+name = "uuid"
+version = "0.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7"
+
+[[package]]
+name = "version_check"
+version = "0.9.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe"
+
+[[package]]
+name = "wasi"
+version = "0.10.2+wasi-snapshot-preview1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6"
+
+[[package]]
+name = "winapi"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
+dependencies = [
+ "winapi-i686-pc-windows-gnu",
+ "winapi-x86_64-pc-windows-gnu",
+]
+
+[[package]]
+name = "winapi-i686-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
+
+[[package]]
+name = "winapi-x86_64-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
+
+[[package]]
+name = "wyz"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85e60b0d1b5f99db2556934e21937020776a5d31520bf169e851ac44e6420214"
--- /dev/null
+[package]
+name = "bch_bindgen"
+version = "0.1.0"
+authors = [ "Kayla Firestack <dev@kaylafire.me>", "Yuxuan Shui <yshuiv7@gmail.com>" ]
+edition = "2018"
+
+[lib]
+crate-type = ["lib"]
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+tracing = "0.1.26"
+anyhow = "1.0"
+udev = "0.4"
+uuid = "0.8"
+bitfield = "0.13"
+memoffset = "0.5"
+byteorder = "1.3"
+tracing-attributes = "0.1.15"
+libc = "0.2.69"
+gag = "1.0.0"
+
+
+[build-dependencies]
+pkg-config = "0.3"
+bindgen = { version = "0.59.1", default-features = false }
--- /dev/null
+fn main() {
+ use std::path::PathBuf;
+ // use std::process::Command;
+
+ let out_dir: PathBuf = std::env::var_os("OUT_DIR").expect("ENV Var 'OUT_DIR' Expected").into();
+ let top_dir: PathBuf = std::env::var_os("CARGO_MANIFEST_DIR")
+ .expect("ENV Var 'CARGO_MANIFEST_DIR' Expected")
+ .into();
+ let libbcachefs_inc_dir =
+ std::env::var("LIBBCACHEFS_INCLUDE").unwrap_or_else(|_| top_dir.join("libbcachefs").display().to_string());
+ let libbcachefs_inc_dir = std::path::Path::new(&libbcachefs_inc_dir);
+ println!("{}", libbcachefs_inc_dir.display());
+
+ println!("cargo:rustc-link-lib=dylib=bcachefs");
+ println!("cargo:rustc-link-search={}", env!("LIBBCACHEFS_LIB"));
+
+ let _libbcachefs_dir = top_dir.join("libbcachefs").join("libbcachefs");
+ let bindings = bindgen::builder()
+ .header(top_dir.join("src").join("libbcachefs_wrapper.h").display().to_string())
+ .clang_arg(format!("-I{}", libbcachefs_inc_dir.join("include").display()))
+ .clang_arg(format!("-I{}", libbcachefs_inc_dir.display()))
+ .clang_arg("-DZSTD_STATIC_LINKING_ONLY")
+ .clang_arg("-DNO_BCACHEFS_FS")
+ .clang_arg("-D_GNU_SOURCE")
+ .derive_debug(true)
+ .derive_default(true)
+ .derive_eq(true)
+ .layout_tests(true)
+ .default_enum_style(bindgen::EnumVariation::Rust { non_exhaustive: true })
+ .allowlist_function(".*bch2_.*")
+ // .allowlist_function("bch2_read_super")
+ // .allowlist_function("bch2_sb_field_.*")
+ // .allowlist_function("bch2_super_write")
+ // .allowlist_function("bch2_chacha_encrypt_key")
+ // .allowlist_function("__bch2_super_read")
+ .allowlist_function("bio_.*")
+ .allowlist_function("bch2_super_write_fd")
+ .allowlist_function("derive_passphrase")
+ .allowlist_function("request_key")
+ .allowlist_function("add_key")
+ .allowlist_function("keyctl_search")
+ .blocklist_type("bch_extent_ptr")
+ .blocklist_type("btree_node")
+ .blocklist_type("bch_extent_crc32")
+ .blocklist_type("rhash_lock_head")
+ .blocklist_type("srcu_struct")
+ .allowlist_var("BCH_.*")
+ .allowlist_var("KEY_SPEC_.*")
+ .allowlist_type("bch_kdf_types")
+ .allowlist_type("bch_sb_field_.*")
+ .allowlist_type("bch_encrypted_key")
+ .allowlist_type("nonce")
+ .newtype_enum("bch_kdf_types")
+ .opaque_type("gendisk")
+ .opaque_type("bkey")
+ // .opaque_type("bch_extent_ptr")
+ // .opaque_type("bch_extent_crc32")
+ .opaque_type("open_bucket.*")
+ .generate()
+ .expect("BindGen Generation Failiure: [libbcachefs_wrapper]");
+ bindings
+ .write_to_file(out_dir.join("bcachefs.rs"))
+ .expect("Writing to output file failed for: `bcachefs.rs`");
+
+ let keyutils = pkg_config::probe_library("libkeyutils").expect("Failed to find keyutils lib");
+ let bindings = bindgen::builder()
+ .header(top_dir.join("src").join("keyutils_wrapper.h").display().to_string())
+ .clang_args(keyutils.include_paths.iter().map(|p| format!("-I{}", p.display())))
+ .generate()
+ .expect("BindGen Generation Failiure: [Keyutils]");
+ bindings
+ .write_to_file(out_dir.join("keyutils.rs"))
+ .expect("Writing to output file failed for: `keyutils.rs`");
+}
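+
+// A note on how these outputs are consumed (per the sources elsewhere in this
+// series): both generated files land in $OUT_DIR and are pulled in with
+// include!(concat!(env!("OUT_DIR"), "/bcachefs.rs")) and the matching
+// keyutils include, so no generated bindings are checked in.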
--- /dev/null
+{ lib
+, stdenv
+, rustPlatform
+, llvmPackages
+, bcachefs
+, pkg-config
+
+, udev
+, liburcu
+, zstd
+, keyutils
+, libaio
+
+, lz4 # liblz4
+, libsodium
+, libuuid
+, zlib # zlib1g
+, libscrypt
+
+, rustfmt
+
+, glibc
+, ...
+}: let
+ include = {
+ glibc = "${glibc.dev}/include";
+ clang = let libc = llvmPackages.libclang; in
+ "${libc.lib}/lib/clang/${libc.version}/include";
+ urcu = "${liburcu}/include";
+ zstd = "${zstd.dev}/include";
+ };
+ cargo = lib.trivial.importTOML ./Cargo.toml;
+in rustPlatform.buildRustPackage {
+ pname = cargo.package.name;
+ version = cargo.package.version;
+
+ src = builtins.path { path = ./.; name = "bch_bindgen"; };
+
+ cargoLock = { lockFile = ./Cargo.lock; };
+
+ nativeBuildInputs = [ rustfmt pkg-config ];
+ buildInputs = [
+
+ # libaio
+ keyutils # libkeyutils
+ lz4 # liblz4
+ libsodium
+ liburcu
+ libuuid
+ zstd # libzstd
+ zlib # zlib1g
+ udev
+ libscrypt
+ libaio
+ ];
+
+ LIBBCACHEFS_LIB = "${bcachefs.tools}/lib";
+ LIBBCACHEFS_INCLUDE = bcachefs.tools.src;
+ LIBCLANG_PATH = "${llvmPackages.libclang.lib}/lib";
+ BINDGEN_EXTRA_CLANG_ARGS = lib.replaceStrings ["\n" "\t"] [" " ""] ''
+ -std=gnu99
+ -I${include.glibc}
+ -I${include.clang}
+ -I${include.urcu}
+ -I${include.zstd}
+ '';
+
+ postPatch = ''
+ cp ${./Cargo.lock} Cargo.lock
+ '';
+
+ doCheck = true;
+
+ # NIX_DEBUG = 4;
+}
\ No newline at end of file
--- /dev/null
+max_width=120
+hard_tabs = true
--- /dev/null
+#![allow(non_upper_case_globals)]
+#![allow(non_camel_case_types)]
+#![allow(non_snake_case)]
+#![allow(unused)]
+
+include!(concat!(env!("OUT_DIR"), "/bcachefs.rs"));
+
+use bitfield::bitfield;
+bitfield! {
+ pub struct bch_scrypt_flags(u64);
+ pub N, _: 15, 0;
+ pub R, _: 31, 16;
+ pub P, _: 47, 32;
+}
+bitfield! {
+ pub struct bch_crypt_flags(u64);
+ pub TYPE, _: 4, 0;
+}
+use memoffset::offset_of;
+impl bch_sb_field_crypt {
+ pub fn scrypt_flags(&self) -> Option<bch_scrypt_flags> {
+ use std::convert::TryInto;
+ match bch_kdf_types(bch_crypt_flags(self.flags).TYPE().try_into().ok()?) {
+ bch_kdf_types::BCH_KDF_SCRYPT => Some(bch_scrypt_flags(self.kdf_flags)),
+ _ => None,
+ }
+ }
+ pub fn key(&self) -> &bch_encrypted_key {
+ &self.key
+ }
+}
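+
+// Note: per the bch_scrypt_flags bitfield above, `kdf_flags` packs the scrypt
+// parameters as 16-bit fields (N in bits 0..16, R in 16..32, P in 32..48);
+// they are only meaningful when the crypt TYPE selects BCH_KDF_SCRYPT.
+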
+impl PartialEq for bch_sb {
+ fn eq(&self, other: &Self) -> bool {
+ self.magic.b == other.magic.b
+ && self.user_uuid.b == other.user_uuid.b
+ && self.block_size == other.block_size
+ && self.version == other.version
+ && self.uuid.b == other.uuid.b
+ && self.seq == other.seq
+ }
+}
+
+impl std::fmt::Debug for bch_sb {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ f.debug_struct("bch_sb")
+ .field("uuid", &self.uuid())
+ .field("version", &(self.version, self.version_min))
+ .field("block_size", &self.block_size)
+ .field("device_idx", &self.dev_idx)
+ .field("seq", &self.seq)
+ .field("csum", &(self.csum.lo, self.csum.hi))
+ .field("offset", &self.offset)
+ .finish_non_exhaustive()
+ }
+}
+
+impl bch_sb {
+ pub fn crypt(&self) -> Option<&bch_sb_field_crypt> {
+ unsafe {
+ let ptr = bch2_sb_field_get(self as *const _ as *mut _, bch_sb_field_type::BCH_SB_FIELD_crypt) as *const u8;
+ if ptr.is_null() {
+ None
+ } else {
+ let offset = offset_of!(bch_sb_field_crypt, field);
+ Some(&*((ptr.sub(offset)) as *const _))
+ }
+ }
+ }
+ pub fn uuid(&self) -> uuid::Uuid {
+ uuid::Uuid::from_bytes(self.user_uuid.b)
+ }
+
+ /// Get the nonce used to encrypt the superblock
+ pub fn nonce(&self) -> nonce {
+ use byteorder::{LittleEndian, ReadBytesExt};
+ let mut internal_uuid = &self.uuid.b[..];
+ let dword1 = internal_uuid.read_u32::<LittleEndian>().unwrap();
+ let dword2 = internal_uuid.read_u32::<LittleEndian>().unwrap();
+ nonce {
+ d: [0, 0, dword1, dword2],
+ }
+ }
+}
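+
+// Layout note (a sketch of the method above): the 128-bit superblock nonce
+// zeroes its low two dwords and fills the high two from the first eight bytes
+// of the *internal* uuid (not the user-visible one), read little-endian.
+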
+impl bch_sb_handle {
+ pub fn sb(&self) -> &bch_sb {
+ unsafe { &*self.sb }
+ }
+
+ pub fn bdev(&self) -> &block_device {
+ unsafe { &*self.bdev }
+ }
+}
+
+#[repr(C)]
+// #[repr(align(8))]
+#[derive(Debug, Default, Copy, Clone)]
+pub struct bch_extent_ptr {
+ pub _bitfield_1: __BindgenBitfieldUnit<[u8; 8usize]>,
+}
+
+#[repr(C, packed(8))]
+pub struct btree_node {
+ pub csum: bch_csum,
+ pub magic: __le64,
+ pub flags: __le64,
+ pub min_key: bpos,
+ pub max_key: bpos,
+ pub _ptr: bch_extent_ptr,
+ pub format: bkey_format,
+ pub __bindgen_anon_1: btree_node__bindgen_ty_1,
+}
+
+#[repr(C, packed(8))]
+// #[repr(align(8))]
+#[derive(Debug, Default, Copy, Clone)]
+pub struct bch_extent_crc32 {
+ pub _bitfield_1: __BindgenBitfieldUnit<[u8; 4usize]>,
+ pub csum: __u32,
+}
+
+// #[repr(u8)]
+pub enum rhash_lock_head {}
+pub enum srcu_struct {}
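+
+// Note: the hand-written definitions above stand in for the types blocklisted
+// in build.rs (blocklist_type), presumably so their packing and alignment can
+// be pinned down explicitly rather than left to bindgen's defaults.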
--- /dev/null
+#![allow(non_upper_case_globals)]
+#![allow(non_camel_case_types)]
+#![allow(non_snake_case)]
+#![allow(unused)]
+
+include!(concat!(env!("OUT_DIR"), "/keyutils.rs"));
--- /dev/null
+pub mod bcachefs;
+pub mod keyutils;
+pub mod rs;
+
+pub mod c {
+ pub use crate::bcachefs::*;
+}
#include "../libbcachefs/super-io.h"
#include "../libbcachefs/checksum.h"
#include "../libbcachefs/bcachefs_format.h"
+#include "../libbcachefs/opts.h"
+#include "../libbcachefs.h"
#include "../crypto.h"
+#include "../include/linux/bio.h"
+
--- /dev/null
+use crate::bcachefs;
+
+pub const SUPERBLOCK_MAGIC: uuid::Uuid = uuid::Uuid::from_u128(
+ 0x_c68573f6_4e1a_45ca_8265_f57f48ba6d81
+);
+
+extern "C" {
+ pub static stdout: *mut libc::FILE;
+}
+
+pub enum ReadSuperErr {
+ Io(std::io::Error),
+}
+
+type RResult<T> = std::io::Result<std::io::Result<T>>;
+
+#[tracing_attributes::instrument(skip(opts))]
+pub fn read_super_opts(path: &std::path::Path, mut opts: bcachefs::bch_opts) -> RResult<bcachefs::bch_sb_handle> {
+ // let devp = camino::Utf8Path::from_path(devp).unwrap();
+
+ use std::os::unix::ffi::OsStrExt;
+ let path = std::ffi::CString::new(path.as_os_str().as_bytes())?;
+
+ let mut sb = std::mem::MaybeUninit::zeroed();
+
+ // use gag::{BufferRedirect};
+ // // Stop libbcachefs from spamming the output
+ // let gag = BufferRedirect::stderr().unwrap();
+ // tracing::trace!("entering libbcachefs");
+
+ let ret = unsafe { crate::bcachefs::bch2_read_super(path.as_ptr(), &mut opts, sb.as_mut_ptr()) };
+ tracing::trace!(%ret);
+
+ match -ret {
+ libc::EACCES => Err(std::io::Error::new(
+ std::io::ErrorKind::PermissionDenied,
+ "Access Permission Denied",
+ )),
+ 0 => Ok(Ok(unsafe { sb.assume_init() })),
+ 22 => Ok(Err(std::io::Error::new(
+ std::io::ErrorKind::InvalidData,
+ "Not a BCacheFS SuperBlock",
+ ))),
+ code => {
+ tracing::debug!(msg = "BCacheFS return error code", ?code);
+ Ok(Err(std::io::Error::new(
+ std::io::ErrorKind::Other,
+ "Failed to Read SuperBlock",
+ )))
+ }
+ }
+}
+
+#[tracing_attributes::instrument]
+pub fn read_super(path: &std::path::Path) -> RResult<bcachefs::bch_sb_handle> {
+ let opts = bcachefs::bch_opts::default(); //unsafe {std::mem::MaybeUninit::zeroed().assume_init()};
+ read_super_opts(path, opts)
+}
--- /dev/null
+# Generated by Cargo
+# will have compiled files and executables
+debug/
+target/
+
+# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
+# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
+# Needed by nix
+# Cargo.lock
+
+# These are backup files generated by rustfmt
+**/*.rs.bk
+
+# MSVC Windows builds of rustc generate these, which store debugging information
+*.pdb
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
+version = 3
+
[[package]]
name = "aho-corasick"
version = "0.7.10"
"winapi",
]
+[[package]]
+name = "ansi_term"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2"
+dependencies = [
+ "winapi",
+]
+
[[package]]
name = "anyhow"
version = "1.0.28"
[[package]]
name = "bcachefs-mount"
-version = "0.1.0"
+version = "0.3.1"
dependencies = [
"anyhow",
- "bindgen",
- "bitfield",
+ "bch_bindgen",
"byteorder",
+ "camino",
"clap",
"either",
- "env_logger",
"errno",
"gag",
"getset",
"itertools",
"libc",
- "log",
- "memoffset",
"parse-display",
- "pkg-config",
"rpassword",
"structopt",
+ "tracing",
+ "tracing-attributes",
+ "tracing-log",
+ "tracing-subscriber",
+ "udev",
+ "uuid",
+]
+
+[[package]]
+name = "bch_bindgen"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "bindgen",
+ "bitfield",
+ "byteorder",
+ "gag",
+ "libc",
+ "memoffset",
+ "pkg-config",
+ "tracing",
+ "tracing-attributes",
"udev",
"uuid",
]
[[package]]
name = "bindgen"
-version = "0.53.2"
+version = "0.59.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6bb26d6a69a335b8cb0e7c7e9775cd5666611dc50a37177c3f2cedcfc040e8c8"
+checksum = "453c49e5950bb0eb63bb3df640e31618846c89d5b7faa54040d76e98e0134375"
dependencies = [
"bitflags",
"cexpr",
- "cfg-if",
"clang-sys",
"lazy_static",
"lazycell",
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693"
+[[package]]
+name = "bitvec"
+version = "0.19.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8942c8d352ae1838c9dda0b0ca2ab657696ef2232a20147cf1b30ae1a9cb4321"
+dependencies = [
+ "funty",
+ "radium",
+ "tap",
+ "wyz",
+]
+
[[package]]
name = "byteorder"
version = "1.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08c48aae112d48ed9f069b33538ea9e3e90aa263cfa3d1c24309612b1f7472de"
+[[package]]
+name = "camino"
+version = "1.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "52d74260d9bf6944e2208aa46841b4b8f0d7ffc0849a06837b2f510337f86b2b"
+
[[package]]
name = "cexpr"
-version = "0.4.0"
+version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f4aedb84272dbe89af497cf81375129abda4fc0a9e7c5d317498c15cc30c0d27"
+checksum = "db507a7679252d2276ed0dd8113c6875ec56d3089f9225b2b42c30cc1f8e5c89"
dependencies = [
"nom",
]
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822"
+[[package]]
+name = "cfg-if"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+
+[[package]]
+name = "chrono"
+version = "0.4.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73"
+dependencies = [
+ "libc",
+ "num-integer",
+ "num-traits",
+ "winapi",
+]
+
[[package]]
name = "clang-sys"
-version = "0.29.3"
+version = "1.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fe6837df1d5cba2397b835c8530f51723267e16abbf83892e9e5af4f0e5dd10a"
+checksum = "10612c0ec0e0a1ff0e97980647cb058a6e7aedb913d01d009c406b8b7d0b26ee"
dependencies = [
"glob",
"libc",
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9"
dependencies = [
- "ansi_term",
+ "ansi_term 0.11.0",
"atty",
"bitflags",
"strsim",
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bb1f6b1ce1c140482ea30ddd3335fc0024ac7ee112895426e0a629a6c20adfe3"
-[[package]]
-name = "env_logger"
-version = "0.7.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "44533bbbb3bb3c1fa17d9f2e4e38bbbaf8396ba82193c4cb1b6445d711445d36"
-dependencies = [
- "log",
-]
-
[[package]]
name = "errno"
version = "0.2.5"
]
[[package]]
-name = "gag"
-version = "0.1.10"
+name = "filedescriptor"
+version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8cc0b9f53275dc5fada808f1d2f82e3688a6c14d735633d1590b7be8eb2307b5"
+checksum = "9ed3d8a5e20435ff00469e51a0d82049bae66504b5c429920dadf9bb54d47b3f"
dependencies = [
"libc",
+ "thiserror",
+ "winapi",
+]
+
+[[package]]
+name = "funty"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fed34cd105917e91daa4da6b3728c47b068749d6a62c59811f06ed2ac71d9da7"
+
+[[package]]
+name = "gag"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a713bee13966e9fbffdf7193af71d54a6b35a0bb34997cd6c9519ebeb5005972"
+dependencies = [
+ "filedescriptor",
"tempfile",
]
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7abc8dd8451921606d809ba32e95b6111925cd2906060d2dcc29c070220503eb"
dependencies = [
- "cfg-if",
+ "cfg-if 0.1.10",
"libc",
"wasi",
]
"either",
]
+[[package]]
+name = "itoa"
+version = "0.4.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4"
+
[[package]]
name = "lazy_static"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14b6052be84e6b71ab17edffc2eeabf5c2c3ae1fdb464aae35ac50c67a44e1f7"
dependencies = [
- "cfg-if",
+ "cfg-if 0.1.10",
+]
+
+[[package]]
+name = "matchers"
+version = "0.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f099785f7595cc4b4553a174ce30dd7589ef93391ff414dbb67f62392b9e0ce1"
+dependencies = [
+ "regex-automata",
]
[[package]]
[[package]]
name = "nom"
-version = "5.1.1"
+version = "6.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0b471253da97532da4b61552249c521e01e736071f71c1a4f7ebbfbf0a06aad6"
+checksum = "9c5c51b9083a3c620fa67a2a635d1ce7d95b897e957d6b28ff9a5da960a103a6"
dependencies = [
+ "bitvec",
+ "funty",
"memchr",
"version_check",
]
+[[package]]
+name = "num-integer"
+version = "0.1.44"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db"
+dependencies = [
+ "autocfg",
+ "num-traits",
+]
+
+[[package]]
+name = "num-traits"
+version = "0.2.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290"
+dependencies = [
+ "autocfg",
+]
+
[[package]]
name = "parse-display"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
+[[package]]
+name = "pin-project-lite"
+version = "0.2.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8d31d11c69a6b52a174b42bdc0c30e5e11670f90788b2c471c31c1d17d449443"
+
[[package]]
name = "pkg-config"
version = "0.3.17"
"proc-macro2",
]
+[[package]]
+name = "radium"
+version = "0.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "941ba9d78d8e2f7ce474c015eea4d9c6d25b6a3327f9832ee29a4de27f91bbb8"
+
[[package]]
name = "rand"
version = "0.7.3"
"thread_local",
]
+[[package]]
+name = "regex-automata"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
+dependencies = [
+ "regex-syntax",
+]
+
[[package]]
name = "regex-syntax"
version = "0.6.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
+[[package]]
+name = "ryu"
+version = "1.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e"
+
+[[package]]
+name = "serde"
+version = "1.0.130"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f12d06de37cf59146fbdecab66aa99f9fe4f78722e3607577a5375d66bd0c913"
+
+[[package]]
+name = "serde_json"
+version = "1.0.67"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a7f9e390c27c3c0ce8bc5d725f6e4d30a29d26659494aa4b17535f7522c5c950"
+dependencies = [
+ "itoa",
+ "ryu",
+ "serde",
+]
+
+[[package]]
+name = "sharded-slab"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "740223c51853f3145fe7c90360d2d4232f2b62e3449489c207eccde818979982"
+dependencies = [
+ "lazy_static",
+]
+
[[package]]
name = "shlex"
-version = "0.1.1"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3"
+
+[[package]]
+name = "smallvec"
+version = "1.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7fdf1b9db47230893d76faad238fd6097fd6d6a9245cd7a4d90dbd639536bbd2"
+checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e"
[[package]]
name = "strsim"
[[package]]
name = "structopt"
-version = "0.3.14"
+version = "0.3.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "863246aaf5ddd0d6928dfeb1a9ca65f505599e4e1b399935ef7e75107516b4ef"
+checksum = "bf9d950ef167e25e0bdb073cf1d68e9ad2795ac826f2f3f59647817cf23c0bfa"
dependencies = [
"clap",
"lazy_static",
[[package]]
name = "structopt-derive"
-version = "0.4.7"
+version = "0.4.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d239ca4b13aee7a2142e6795cbd69e457665ff8037aed33b3effdc430d2f927a"
+checksum = "134d838a2c9943ac3125cf6df165eda53493451b719f3255b2a26b85f772d0ba"
dependencies = [
"heck",
"proc-macro-error 1.0.2",
"syn",
]
+[[package]]
+name = "tap"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
+
[[package]]
name = "tempfile"
version = "3.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a6e24d9338a0a5be79593e2fa15a648add6138caa803e2d5bc782c371732ca9"
dependencies = [
- "cfg-if",
+ "cfg-if 0.1.10",
"libc",
"rand",
"redox_syscall",
"unicode-width",
]
+[[package]]
+name = "thiserror"
+version = "1.0.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "318234ffa22e0920fe9a40d7b8369b5f649d490980cf7aadcf1eb91594869b42"
+dependencies = [
+ "thiserror-impl",
+]
+
+[[package]]
+name = "thiserror-impl"
+version = "1.0.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cae2447b6282786c3493999f40a9be2a6ad20cb8bd268b0a0dbf5a065535c0ab"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
[[package]]
name = "thread_local"
version = "1.0.1"
"lazy_static",
]
+[[package]]
+name = "tracing"
+version = "0.1.26"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09adeb8c97449311ccd28a427f96fb563e7fd31aabf994189879d9da2394b89d"
+dependencies = [
+ "cfg-if 1.0.0",
+ "pin-project-lite",
+ "tracing-attributes",
+ "tracing-core",
+]
+
+[[package]]
+name = "tracing-attributes"
+version = "0.1.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c42e6fa53307c8a17e4ccd4dc81cf5ec38db9209f59b222210375b54ee40d1e2"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "tracing-core"
+version = "0.1.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2ca517f43f0fb96e0c3072ed5c275fe5eece87e8cb52f4a77b69226d3b1c9df8"
+dependencies = [
+ "lazy_static",
+]
+
+[[package]]
+name = "tracing-log"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a6923477a48e41c1951f1999ef8bb5a3023eb723ceadafe78ffb65dc366761e3"
+dependencies = [
+ "lazy_static",
+ "log",
+ "tracing-core",
+]
+
+[[package]]
+name = "tracing-serde"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fb65ea441fbb84f9f6748fd496cf7f63ec9af5bca94dd86456978d055e8eb28b"
+dependencies = [
+ "serde",
+ "tracing-core",
+]
+
+[[package]]
+name = "tracing-subscriber"
+version = "0.2.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b9cbe87a2fa7e35900ce5de20220a582a9483a7063811defce79d7cbd59d4cfe"
+dependencies = [
+ "ansi_term 0.12.1",
+ "chrono",
+ "lazy_static",
+ "matchers",
+ "regex",
+ "serde",
+ "serde_json",
+ "sharded-slab",
+ "smallvec",
+ "thread_local",
+ "tracing",
+ "tracing-core",
+ "tracing-log",
+ "tracing-serde",
+]
+
[[package]]
name = "udev"
version = "0.4.0"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
+
+[[package]]
+name = "wyz"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85e60b0d1b5f99db2556934e21937020776a5d31520bf169e851ac44e6420214"
[package]
name = "bcachefs-mount"
-version = "0.1.0"
-authors = ["Yuxuan Shui <yshuiv7@gmail.com>"]
+version = "0.3.1"
+authors = ["Yuxuan Shui <yshuiv7@gmail.com>", "Kayla Firestack <dev@kaylafire.me>"]
edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
-log = "0.4"
+tracing = "0.1.26"
+tracing-log = "0.1.2"
+tracing-subscriber = "0.2.20"
+tracing-attributes = "0.1.15"
clap = { version = "2.33", features = [ "wrap_help" ] }
-env_logger = { version = "0.7", default-features = false }
anyhow = "1.0"
-udev = "0.4"
-uuid = "0.8"
libc = "0.2.69"
-gag = "0.1"
-bitfield = "0.13"
-memoffset = "0.5"
+uuid = "0.8"
+udev = "0.4"
+gag = "1.0.0"
getset = "0.1"
itertools = "0.9"
-structopt = "0.3"
+structopt = "0.3.23"
parse-display = "0.1"
errno = "0.2"
either = "1.5"
rpassword = "4"
+camino = "1.0.5"
+bch_bindgen = { path = "../bch_bindgen" }
byteorder = "1.3"
-[lib]
-crate-type = ["staticlib"]
-
-[build-dependencies]
-pkg-config = "0.3"
-bindgen = { version = "0.53", default-features = false }
--- /dev/null
+Usage
+=====
+
+```
+bcachefs-mount 0.1.0
+Mount a bcachefs filesystem by its UUID
+
+USAGE:
+ bcachefs-mount [OPTIONS] <uuid> <mountpoint>
+
+FLAGS:
+ -h, --help
+ Prints help information
+
+ -V, --version
+ Prints version information
+
+
+OPTIONS:
+ -o <options>
+ Mount options [default: ]
+
+ -p, --password <password>
+ Where the password would be loaded from.
+
+ Possible values are: "fail" - don't ask for password, fail if filesystem is encrypted; "wait" - wait for
+ password to become available before mounting; "ask" - prompt the user for password; [default: fail]
+
+ARGS:
+ <uuid>
+ External UUID of the bcachefs filesystem
+
+ <mountpoint>
+ Where the filesystem should be mounted
+```
+
+Caveats
+=======
+
+* `--password ask` is not yet implemented, but you can use `--password wait` and load the key with `bcachefs unlock`, as in the example below.
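+
+For example, a typical unlock-then-mount sequence (a sketch; the device path
+is a placeholder) would be:
+
+```sh
+$ bcachefs unlock /dev/sda1
+$ bcachefs-mount --password wait <uuid> /mnt
+```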
+
+Build
+=====
+
+```sh
+$ git submodule update --init --recursive
+$ cargo build --release
+```
+
+The binary will be built in `target/release/bcachefs-mount`
+
+Dependencies:
+
+* rust
+* blkid
+* uuid
+* liburcu
+* libsodium
+* zlib
+* liblz4
+* libzstd
+* libkeyutils
--- /dev/null
+{ lib
+
+, stdenv
+, glibc
+, llvmPackages
+, rustPlatform
+
+, bcachefs
+
+, ...
+}: rustPlatform.buildRustPackage ( let
+ cargo = lib.trivial.importTOML ./Cargo.toml;
+in {
+ pname = "mount.bcachefs";
+ version = cargo.package.version;
+
+ src = builtins.path { path = ../.; name = "rust-src"; };
+ sourceRoot = "rust-src/mount";
+
+ cargoLock = { lockFile = ./Cargo.lock; };
+
+ nativeBuildInputs = bcachefs.bch_bindgen.nativeBuildInputs;
+ buildInputs = bcachefs.bch_bindgen.buildInputs;
+ inherit (bcachefs.bch_bindgen)
+ LIBBCACHEFS_INCLUDE
+ LIBBCACHEFS_LIB
+ LIBCLANG_PATH
+ BINDGEN_EXTRA_CLANG_ARGS;
+
+ postInstall = ''
+ ln $out/bin/${cargo.package.name} $out/bin/mount.bcachefs
+ ln -s $out/bin $out/sbin
+ '';
+ # -isystem ${llvmPackages.libclang.lib}/lib/clang/${lib.getVersion llvmPackages.libclang}/include";
+ # CFLAGS = "-I${llvmPackages.libclang.lib}/include";
+ # LDFLAGS = "-L${libcdev}";
+
+ doCheck = false;
+
+ # NIX_DEBUG = 4;
+})
\ No newline at end of file
--- /dev/null
+## Mirrors: https://github.com/NixOS/nixpkgs/blob/nixos-unstable/nixos/modules/tasks/filesystems/bcachefs.nix
+## with changes to use flakes and import mount.bcachefs
+{ config, lib, pkgs, utils, ... }:
+
+with lib;
+
+let
+
+ bootFs = filterAttrs (n: fs: (fs.fsType == "bcachefs") && (utils.fsNeededForBoot fs)) config.fileSystems;
+ cfg = config.filesystems.bcachefs;
+in
+
+{
+ options.filesystems.bcachefs.packages.tools = lib.mkOption {
+ description = "Which package to use to link in the bcachefs tools package";
+ default = pkgs.bcachefs.tools;
+ type = lib.types.package;
+ };
+ options.filesystems.bcachefs.packages.mount = lib.mkOption {
+ description = "Which package to use to link in the bcachefs mount package";
+ default = pkgs.bcachefs.mount;
+ type = lib.types.package;
+ };
+ options.filesystems.bcachefs.packages.kernelPackages = lib.mkOption {
+ description = "Which package to use to link in the kernel package to use";
+ default = pkgs.bcachefs.kernelPackages;
+ type = lib.types.attrs;
+
+ };
+
+ config = mkIf (elem "bcachefs" config.boot.supportedFilesystems) (mkMerge [
+ {
+ system.fsPackages = [ cfg.packages.tools cfg.packages.mount ];
+
+ # use kernel package with bcachefs support until it's in mainline
+ boot.kernelPackages = cfg.packages.kernelPackages;
+ }
+
+ (mkIf ((elem "bcachefs" config.boot.initrd.supportedFilesystems) || (bootFs != {})) {
+ # chacha20 and poly1305 are required only for decryption attempts
+ boot.initrd.availableKernelModules = [ "sha256" "chacha20" "poly1305" ];
+ boot.initrd.kernelModules = [ "bcachefs" ];
+
+ boot.initrd.extraUtilsCommands = ''
+ copy_bin_and_libs ${cfg.packages.tools}/bin/bcachefs
+ copy_bin_and_libs ${cfg.packages.mount}/bin/mount.bcachefs
+ '';
+ boot.initrd.extraUtilsCommandsTest = ''
+ $out/bin/bcachefs version
+ $out/bin/mount.bcachefs --version
+ '';
+ })
+ ]);
+}
--- /dev/null
+max_width=120
+hard_tabs = true
--- /dev/null
+extern "C" {
+ pub static stdout: *mut libc::FILE;
+}
+
+use getset::{CopyGetters, Getters};
+use std::path::PathBuf;
+#[derive(Getters, CopyGetters)]
+pub struct FileSystem {
+ /// External UUID of the bcachefs
+ #[getset(get = "pub")]
+ uuid: uuid::Uuid,
+ /// Whether filesystem is encrypted
+ #[getset(get_copy = "pub")]
+ encrypted: bool,
+ /// Super block
+ #[getset(get = "pub")]
+ sb: bcachefs::bch_sb_handle,
+ /// Member devices for this filesystem
+ #[getset(get = "pub")]
+ devices: Vec<PathBuf>,
+}
+impl std::fmt::Debug for FileSystem {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ f.debug_struct("FileSystem")
+ .field("uuid", &self.uuid)
+ .field("encrypted", &self.encrypted)
+ .field("devices", &self.device_string())
+ .finish()
+ }
+}
+use std::fmt;
+impl std::fmt::Display for FileSystem {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ let devs = self.device_string();
+ write!(
+ f,
+ "{:?}: locked?={lock} ({}) ",
+ self.uuid,
+ devs,
+ lock = self.encrypted
+ )
+ }
+}
+
+impl FileSystem {
+ pub(crate) fn new(sb: bcachefs::bch_sb_handle) -> Self {
+ Self {
+ uuid: sb.sb().uuid(),
+ encrypted: sb.sb().crypt().is_some(),
+ sb,
+ devices: Vec::new(),
+ }
+ }
+
+ pub fn device_string(&self) -> String {
+ use itertools::Itertools;
+ self.devices.iter().map(|d| d.display()).join(":")
+ }
+
+ pub fn mount(
+ &self,
+ target: impl AsRef<std::path::Path>,
+ options: impl AsRef<str>,
+ ) -> anyhow::Result<()> {
+ tracing::info_span!("mount").in_scope(|| {
+ let src = self.device_string();
+ let (data, mountflags) = parse_mount_options(options);
+ // let fstype = c_str!("bcachefs");
+
+ tracing::info!(msg="mounting bcachefs filesystem", target=%target.as_ref().display());
+ mount_inner(src, target, "bcachefs", mountflags, data)
+ })
+ }
+}
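+
+// Hypothetical usage sketch: fs.mount("/mnt", "noatime,ro") joins the member
+// devices with ':' as the mount source (see device_string) and hands the
+// parsed mountflags and leftover option string to mount(2) via mount_inner.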
+
+fn mount_inner(
+ src: String,
+ target: impl AsRef<std::path::Path>,
+ fstype: &str,
+ mountflags: u64,
+ data: Option<String>,
+) -> anyhow::Result<()> {
+ use std::{
+ ffi::{c_void, CString},
+ os::{raw::c_char, unix::ffi::OsStrExt},
+ };
+
+ // bind the CStrings to keep them alive
+ let src = CString::new(src)?;
+ let target = CString::new(target.as_ref().as_os_str().as_bytes())?;
+ let data = data.map(CString::new).transpose()?;
+ let fstype = CString::new(fstype)?;
+
+ // convert to pointers for ffi
+ let src = src.as_c_str().to_bytes_with_nul().as_ptr() as *const c_char;
+ let target = target.as_c_str().to_bytes_with_nul().as_ptr() as *const c_char;
+ let data = data.as_ref().map_or(std::ptr::null(), |data| {
+ data.as_c_str().to_bytes_with_nul().as_ptr() as *const c_void
+ });
+ let fstype = fstype.as_c_str().to_bytes_with_nul().as_ptr() as *const c_char;
+
+ let ret = {
+ let _entered = tracing::info_span!("libc::mount").entered();
+ tracing::info!("mounting filesystem");
+ // REQUIRES: CAP_SYS_ADMIN
+ unsafe { libc::mount(src, target, fstype, mountflags, data) }
+ };
+ match ret {
+ 0 => Ok(()),
+ _ => Err(crate::ErrnoError(errno::errno()).into()),
+ }
+}
+
+/// Parse comma-separated mount options, splitting out the mountflags from the
+/// filesystem-specific options.
+#[tracing_attributes::instrument(skip(options))]
+fn parse_mount_options(options: impl AsRef<str>) -> (Option<String>, u64) {
+ use either::Either::*;
+ tracing::debug!(msg="parsing mount options", options=?options.as_ref());
+ let (opts, flags) = options
+ .as_ref()
+ .split(",")
+ .map(|o| match o {
+ "dirsync" => Left(libc::MS_DIRSYNC),
+ "lazytime" => Left(1 << 25), // MS_LAZYTIME
+ "mand" => Left(libc::MS_MANDLOCK),
+ "noatime" => Left(libc::MS_NOATIME),
+ "nodev" => Left(libc::MS_NODEV),
+ "nodiratime" => Left(libc::MS_NODIRATIME),
+ "noexec" => Left(libc::MS_NOEXEC),
+ "nosuid" => Left(libc::MS_NOSUID),
+ "ro" => Left(libc::MS_RDONLY),
+ "rw" => Left(0),
+ "relatime" => Left(libc::MS_RELATIME),
+ "strictatime" => Left(libc::MS_STRICTATIME),
+ "sync" => Left(libc::MS_SYNCHRONOUS),
+ "" => Left(0),
+ o @ _ => Right(o),
+ })
+ .fold((Vec::new(), 0), |(mut opts, flags), next| match next {
+ Left(f) => (opts, flags | f),
+ Right(o) => {
+ opts.push(o);
+ (opts, flags)
+ }
+ });
+
+ use itertools::Itertools;
+ (
+ if opts.is_empty() {
+ None
+ } else {
+ Some(opts.iter().join(","))
+ },
+ flags,
+ )
+}
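+
+// A minimal sketch of the split performed above ("compression=zstd" is a
+// hypothetical filesystem-specific option, used only for illustration):
+//
+// let (data, flags) = parse_mount_options("noatime,ro,compression=zstd");
+// assert_eq!(flags, libc::MS_NOATIME | libc::MS_RDONLY);
+// assert_eq!(data, Some("compression=zstd".to_string()));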
+
+use bch_bindgen::bcachefs;
+use std::collections::HashMap;
+use uuid::Uuid;
+
+#[tracing_attributes::instrument]
+pub fn probe_filesystems() -> anyhow::Result<HashMap<Uuid, FileSystem>> {
+ tracing::trace!("enumerating udev devices");
+ let mut udev = udev::Enumerator::new()?;
+
+ udev.match_subsystem("block")?; // find kernel block devices
+
+ let mut fs_map = HashMap::new();
+ let devresults =
+ udev.scan_devices()?
+ .into_iter()
+ .filter_map(|dev| dev.devnode().map(ToOwned::to_owned));
+
+ for pathbuf in devresults {
+ match get_super_block_uuid(&pathbuf)? {
+ Ok((uuid_key, superblock)) => {
+ let fs = fs_map.entry(uuid_key).or_insert_with(|| {
+ tracing::info!(msg="found bcachefs pool", uuid=?uuid_key);
+ FileSystem::new(superblock)
+ });
+
+ fs.devices.push(pathbuf);
+ },
+ Err(e) => { tracing::debug!(inner2_error = ?e); }
+ }
+ }
+
+ tracing::info!(msg = "found filesystems", count = fs_map.len());
+ Ok(fs_map)
+}
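+
+// Sketch of the accumulation above: devices sharing a superblock uuid collapse
+// into a single FileSystem entry, so a multi-device pool is reported once with
+// all of its member device paths attached.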
+
+// #[tracing_attributes::instrument(skip(dev, fs_map))]
+fn get_super_block_uuid(path: &std::path::Path) -> std::io::Result<std::io::Result<(Uuid, bcachefs::bch_sb_handle)>> {
+ let sb = bch_bindgen::rs::read_super(&path)?;
+ let super_block = match sb {
+ Err(e) => { return Ok(Err(e)); }
+ Ok(sb) => sb,
+ };
+
+ let uuid = (&super_block).sb().uuid();
+ tracing::debug!(found="bcachefs superblock", devnode=?path, ?uuid);
+
+ Ok(Ok((uuid, super_block)))
+}
-use log::info;
+use tracing::info;
fn check_for_key(key_name: &std::ffi::CStr) -> anyhow::Result<bool> {
- use crate::keyutils::{self, keyctl_search};
+ use bch_bindgen::keyutils::{self, keyctl_search};
let key_name = key_name.to_bytes_with_nul().as_ptr() as *const _;
let key_type = c_str!("logon");
- let key_id =
- unsafe { keyctl_search(keyutils::KEY_SPEC_USER_KEYRING, key_type, key_name, 0) };
+ let key_id = unsafe { keyctl_search(keyutils::KEY_SPEC_USER_KEYRING, key_type, key_name, 0) };
if key_id > 0 {
info!("Key has became avaiable");
Ok(true)
const BCH_KEY_MAGIC: &str = "bch**key";
use crate::filesystem::FileSystem;
fn ask_for_key(fs: &FileSystem) -> anyhow::Result<()> {
- use crate::bcachefs::{self, bch2_chacha_encrypt_key, bch_encrypted_key, bch_key};
use anyhow::anyhow;
use byteorder::{LittleEndian, ReadBytesExt};
+ use bch_bindgen::bcachefs::{self, bch2_chacha_encrypt_key, bch_encrypted_key, bch_key};
use std::os::raw::c_char;
let key_name = std::ffi::CString::new(format!("bcachefs:{}", fs.uuid())).unwrap();
)
};
if ret != 0 {
- Err(anyhow!("chache decryption failure"))
+ Err(anyhow!("chacha decryption failure"))
} else if key.magic != bch_key_magic {
Err(anyhow!("failed to verify the password"))
} else {
let key_type = c_str!("logon");
let ret = unsafe {
- crate::keyutils::add_key(
+ bch_bindgen::keyutils::add_key(
key_type,
- key_name.as_c_str().to_bytes_with_nul() as *const _
- as *const c_char,
+ key_name.as_c_str().to_bytes_with_nul() as *const _ as *const c_char,
&output as *const _ as *const _,
std::mem::size_of::<bch_key>() as u64,
- crate::keyutils::KEY_SPEC_USER_KEYRING,
+ bch_bindgen::keyutils::KEY_SPEC_USER_KEYRING,
)
};
if ret == -1 {
}
}
-pub(crate) fn prepare_key(fs: &FileSystem, password: crate::KeyLocation) -> anyhow::Result<()> {
+#[tracing_attributes::instrument]
+pub fn prepare_key(fs: &FileSystem, password: crate::KeyLocation) -> anyhow::Result<()> {
use crate::KeyLocation::*;
use anyhow::anyhow;
+
+ tracing::info!(msg = "checking if key exists for filesystem");
match password {
Fail => Err(anyhow!("no key available")),
Wait => Ok(wait_for_key(fs.uuid())?),
--- /dev/null
+use anyhow::anyhow;
+use structopt::StructOpt;
+
+pub mod err {
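+ // Nested-result alias: OE is the outer (hard) error, E the inner
+ // (per-item) error, matching the pattern used in filesystem probing.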
+ pub enum GError {
+ Unknown{
+ message: std::borrow::Cow<'static, str>
+ }
+ }
+ pub type GResult<T, E, OE> = ::core::result::Result<::core::result::Result<T, E>, OE>;
+ pub type Result<T, E> = GResult<T, E, GError>;
+}
+
+#[macro_export]
+macro_rules! c_str {
+ ($lit:expr) => {
+ unsafe {
+ std::ffi::CStr::from_ptr(concat!($lit, "\0").as_ptr() as *const std::os::raw::c_char)
+ .to_bytes_with_nul()
+ .as_ptr() as *const std::os::raw::c_char
+ }
+ };
+}
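Usage sketch: the macro yields a *const c_char pointing into static memory
(the literal gets a trailing NUL at compile time), so the result can be
passed straight to C without keeping a CString alive:

    fn demo() {
        let key_type: *const std::os::raw::c_char = c_str!("logon");
        let _ = key_type; // e.g. feed this to add_key() / keyctl_search()
    }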
+
+#[derive(Debug)]
+struct ErrnoError(errno::Errno);
+impl std::fmt::Display for ErrnoError {
+ fn fmt(&self, f: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> {
+ self.0.fmt(f)
+ }
+}
+impl std::error::Error for ErrnoError {}
+
+#[derive(Debug)]
+pub enum KeyLocation {
+ Fail,
+ Wait,
+ Ask,
+}
+
+#[derive(Debug)]
+pub struct KeyLoc(pub Option<KeyLocation>);
+impl std::ops::Deref for KeyLoc {
+ type Target = Option<KeyLocation>;
+ fn deref(&self) -> &Self::Target {
+ &self.0
+ }
+}
+impl std::str::FromStr for KeyLoc {
+ type Err = anyhow::Error;
+ fn from_str(s: &str) -> anyhow::Result<Self> {
+ match s {
+ "" => Ok(KeyLoc(None)),
+ "fail" => Ok(KeyLoc(Some(KeyLocation::Fail))),
+ "wait" => Ok(KeyLoc(Some(KeyLocation::Wait))),
+ "ask" => Ok(KeyLoc(Some(KeyLocation::Ask))),
+ _ => Err(anyhow!("invalid password option")),
+ }
+ }
+}
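So "-k wait" reaches structopt as plain text and parses through this FromStr
impl; a small sketch of the round trip:

    fn demo() -> anyhow::Result<()> {
        let loc: KeyLoc = "wait".parse()?;
        assert!(matches!(loc.0, Some(KeyLocation::Wait)));
        // The empty string (structopt's default_value below) means "unset".
        let unset: KeyLoc = "".parse()?;
        assert!(unset.0.is_none());
        Ok(())
    }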
+
+#[derive(StructOpt, Debug)]
+/// Mount a bcachefs filesystem by its UUID.
+pub struct Options {
+ /// Where the password will be loaded from.
+ ///
+ /// Possible values are:
+ /// "fail" - don't ask for the password, fail if the filesystem is encrypted;
+ /// "wait" - wait for the password to become available before mounting;
+ /// "ask" - prompt the user for the password.
+ #[structopt(short, long, default_value = "")]
+ pub key_location: KeyLoc,
+
+ /// External UUID of the bcachefs filesystem
+ pub uuid: uuid::Uuid,
+
+ /// Where the filesystem should be mounted. If not set, the filesystem
+ /// won't actually be mounted, but all steps preceding mounting
+ /// (e.g. asking for the passphrase) will still be performed.
+ pub mountpoint: Option<std::path::PathBuf>,
+
+ /// Mount options
+ #[structopt(short, default_value = "")]
+ pub options: String,
+}
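A hedged parsing example (the UUID and mountpoint are placeholders, and
argv[0] is arbitrary; StructOpt::from_iter exits on malformed input):

    fn demo() {
        use structopt::StructOpt;
        let opt = Options::from_iter(vec![
            "mount.bcachefs",
            "-k", "ask",
            "00000000-0000-0000-0000-000000000000",
            "/mnt",
        ]);
        assert!(opt.mountpoint.is_some());
    }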
+
+pub mod filesystem;
+pub mod key;
+
--- /dev/null
+fn main() {
+ // convert existing log statements to tracing events
+ // tracing_log::LogTracer::init().expect("logtracer init failed!");
+ // format tracing log data to env_logger like stdout
+ tracing_subscriber::fmt::init();
+
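+ // Log fatal errors through tracing instead of a raw Debug print.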
+ if let Err(e) = crate::main_inner() {
+ tracing::error!(fatal_error = ?e);
+ }
+}
+
+#[tracing_attributes::instrument("main")]
+pub fn main_inner() -> anyhow::Result<()> {
+ use structopt::StructOpt;
+ use bcachefs_mount::{Options, filesystem, key};
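+ // Make C stdout unbuffered so output from the C library appears immediately.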
+ unsafe {
+ libc::setvbuf(
+ filesystem::stdout,
+ std::ptr::null_mut(),
+ libc::_IONBF,
+ 0,
+ );
+ // libc::fflush(filesystem::stdout);
+ }
+ let opt = Options::from_args();
+
+ tracing::trace!(?opt);
+
+ let fss = filesystem::probe_filesystems()?;
+ let fs = fss
+ .get(&opt.uuid)
+ .ok_or_else(|| anyhow::anyhow!("filesystem was not found"))?;
+
+ tracing::info!(msg="found filesystem", %fs);
+ if fs.encrypted() {
+ let key = opt
+ .key_location
+ .0
+ .ok_or_else(|| anyhow::anyhow!("no key option specified for locked filesystem"))?;
+
+ key::prepare_key(fs, key)?;
+ }
+
+ let mountpoint = opt
+ .mountpoint
+ .ok_or_else(|| anyhow::anyhow!("mountpoint option was not specified"))?;
+
+ fs.mount(&mountpoint, &opt.options)?;
+
+ Ok(())
+}
+
+#[cfg(test)]
+mod test {
+ // use insta::assert_debug_snapshot;
+ // #[test]
+ // fn snapshot_testing() {
+ // insta::assert_debug_snapshot!();
+ // }
+}
set -e
PYTEST="${PYTEST:-pytest-3}"
-spam=$(tempfile)
+spam=$(mktemp)
unset BCACHEFS_FUSE BCACHEFS_TEST_USE_VALGRIND BCACHEFS_DEBUG
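+# If anything below fails, dump the captured test output before cleaning up.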
trap "set +x; cat ${spam}; rm -f ${spam} ; echo; echo FAILED." EXIT
function test() {
echo Running tests.
(
- cd tests
${PYTEST} -n${JOBS}
) > ${spam} 2>&1
}
echo Running tests with valgrind.
(
export BCACHEFS_TEST_USE_VALGRIND=yes
- cd tests
${PYTEST} -n${JOBS}
) > ${spam} 2>&1
}
echo -- Test: debug with valgrind --
test_vg
-echo -- Test: fuse debug --
-export BCACHEFS_FUSE=1
-build
-test
+#echo -- Test: fuse debug --
+#export BCACHEFS_FUSE=1
+#build
+#test
-echo -- Test: fuse debug with valgrind --
-test_vg
+#echo -- Test: fuse debug with valgrind --
+#test_vg
rm -f ${spam}
trap "set +x; echo; echo SUCCESS." EXIT
# pytest fixture definitions.
import pytest
-import util
+from tests import util
@pytest.fixture
def bfuse(tmpdir):
# Basic bcachefs functionality tests.
import re
-import util
+from tests import util
def test_help():
ret = util.run_bch(valgrind=True)
assert len(ret.stderr) == 0
assert "recovering from clean shutdown" in ret.stdout
- # Totally arbitrary, feel free to update or remove after inspecting.
- assert len(ret.stdout.splitlines()) == 97
-
def test_list_inodes(tmpdir):
dev = util.format_1g(tmpdir)
#
# Tests of the functions in util.py
-import pytest
import signal
import subprocess
import time
+import os
+import pytest
-import util
-from pathlib import Path
+from tests import util
-#helper = Path('.') / 'test_helper'
-helper = './test_helper'
+helper = os.path.abspath(os.path.join(util.BASE_PATH, 'test_helper'))
def test_sparse_file(tmpdir):
dev = util.sparse_file(tmpdir / '1k', 1024)
@pytest.mark.skipif(not util.ENABLE_VALGRIND, reason="no valgrind")
def test_check():
with pytest.raises(subprocess.CalledProcessError):
- ret = util.run(helper, 'abort', check=True)
+ util.run(helper, 'abort', check=True)
@pytest.mark.skipif(not util.ENABLE_VALGRIND, reason="no valgrind")
def test_leak():
with pytest.raises(util.ValgrindFailedError):
- ret = util.run(helper, 'leak', valgrind=True)
+ util.run(helper, 'leak', valgrind=True)
@pytest.mark.skipif(not util.ENABLE_VALGRIND, reason="no valgrind")
def test_undefined():
with pytest.raises(util.ValgrindFailedError):
- ret = util.run(helper, 'undefined', valgrind=True)
+ util.run(helper, 'undefined', valgrind=True)
@pytest.mark.skipif(not util.ENABLE_VALGRIND, reason="no valgrind")
def test_undefined_branch():
with pytest.raises(util.ValgrindFailedError):
- ret = util.run(helper, 'undefined_branch', valgrind=True)
+ util.run(helper, 'undefined_branch', valgrind=True)
@pytest.mark.skipif(not util.ENABLE_VALGRIND, reason="no valgrind")
def test_read_after_free():
with pytest.raises(util.ValgrindFailedError):
- ret = util.run(helper, 'read_after_free', valgrind=True)
+ util.run(helper, 'read_after_free', valgrind=True)
@pytest.mark.skipif(not util.ENABLE_VALGRIND, reason="no valgrind")
def test_write_after_free():
with pytest.raises(util.ValgrindFailedError):
- ret = util.run(helper, 'write_after_free', valgrind=True)
+ util.run(helper, 'write_after_free', valgrind=True)
def test_mountpoint(tmpdir):
path = util.mountpoint(tmpdir)
import pytest
import os
-import util
+from tests import util
pytestmark = pytest.mark.skipif(
not util.have_fuse(), reason="bcachefs not built with fuse support.")
import errno
import os
-import pytest
import re
import subprocess
-import sys
import tempfile
import threading
import time
from pathlib import Path
-DIR = Path('..')
-BCH_PATH = DIR / 'bcachefs'
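+# Resolve paths relative to this file so the suite can run from any directory.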
+BASE_PATH = os.path.dirname(__file__)
+BCH_PATH = os.path.abspath(os.path.join(BASE_PATH, '..', 'bcachefs'))
+VALGRIND_PATH = os.path.abspath(os.path.join(BASE_PATH,
+ 'valgrind-suppressions.txt'))
VPAT = re.compile(r'ERROR SUMMARY: (\d+) errors from (\d+) contexts')
cmds = [cmd] + list(args)
valgrind = valgrind and ENABLE_VALGRIND
+ print("Running '{}'".format(cmds))
if valgrind:
vout = tempfile.NamedTemporaryFile()
vcmd = ['valgrind',
'--leak-check=full',
'--gen-suppressions=all',
- '--suppressions=valgrind-suppressions.txt',
+ '--suppressions={}'.format(VALGRIND_PATH),
'--log-file={}'.format(vout.name)]
cmds = vcmd + cmds
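+ # valgrind writes its report to vout; check_valgrind() parses it after the run.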
- print("Running '{}'".format(cmds))
- res = subprocess.run(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
- encoding='utf-8', check=check)
-
- if valgrind:
+ res = subprocess.run(cmds, stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE, encoding='utf-8', check=check)
check_valgrind(vout.read().decode('utf-8'))
+ else:
+ res = subprocess.run(cmds, stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE, encoding='utf-8', check=check)
return res
This is typically used to create device files for bcachefs.
"""
path = Path(lpath)
- f = path.touch(mode = 0o600, exist_ok = False)
+ path.touch(mode = 0o600, exist_ok = False)
os.truncate(path, size)
return path
self.stdout = out1 + out2
self.stderr = err.read()
- self.vout = vlog.read().decode('utf-8')
+ if vlog:
+ self.vout = vlog.read().decode('utf-8')
def expect(self, pipe, regex):
"""Wait for the child process to mount."""
print("Waiting for thread to exit.")
self.thread.join(timeout)
if self.thread.is_alive():
- self.proc.kill()
+ if self.proc:
+ self.proc.kill()
self.thread.join()
else:
print("Thread was already done.")
check_valgrind(self.vout)
def verify(self):
+ # Check the output was captured before asserting on its length.
+ assert self.stdout is not None
+ assert self.stderr is not None
assert self.returncode == 0
assert len(self.stdout) > 0
assert len(self.stderr) == 0
{
- <insert_a_suppression_name_here>
+ call_rcu_memb
Memcheck:Leak
match-leak-kinds: possible,definite
...
fun:get_default_call_rcu_data_memb
fun:call_rcu_memb
}
+{
+ call_rcu_data_init
+ Memcheck:Leak
+ match-leak-kinds: possible
+ fun:calloc
+ fun:_dl_allocate_tls
+ ...
+ fun:call_rcu_data_init
+}
}
}
-void xpwrite(int fd, const void *buf, size_t count, off_t offset)
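+/* The msg argument names what is being written, for clearer error messages: */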
+void xpwrite(int fd, const void *buf, size_t count, off_t offset, const char *msg)
{
ssize_t r = pwrite(fd, buf, count, offset);
if (r != count)
- die("write error (ret %zi err %m)", r);
+ die("error writing %s (ret %zi err %m)", msg, r);
}
struct stat xfstatat(int dirfd, const char *path, int flags)
{
char *buf = read_file_str(dirfd, path);
u64 v;
- if (kstrtou64(buf, 10, &v))
+ if (bch2_strtou64_h(buf, &v))
die("read_file_u64: error parsing %s (got %s)", path, buf);
free(buf);
return v;
return ret;
}
-/* Returns blocksize in units of 512 byte sectors: */
+/* Returns blocksize, in bytes: */
unsigned get_blocksize(const char *path, int fd)
{
struct stat statbuf = xfstat(fd);
if (!S_ISBLK(statbuf.st_mode))
- return statbuf.st_blksize >> 9;
+ return statbuf.st_blksize;
unsigned ret;
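+ /* BLKPBSZGET reports the physical block size in bytes. */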
xioctl(fd, BLKPBSZGET, &ret);
- return ret >> 9;
+ return ret;
}
/* Open a block device, do magic blkid stuff to probe for existing filesystems: */
const char *fs_type = NULL, *fs_label = NULL;
size_t fs_type_len, fs_label_len;
- int fd = xopen(dev, O_RDWR|O_EXCL);
+ int fd = open(dev, O_RDWR|O_EXCL);
+ if (fd < 0)
+ die("Error opening device to format %s: %m", dev);
if (force)
return fd;
return *a_prefix ? NULL : a;
}
-unsigned hatoi_validate(const char *s, const char *msg)
-{
- u64 v;
-
- if (bch2_strtoull_h(s, &v))
- die("bad %s %s", msg, s);
-
- v /= 512;
-
- if (v > USHRT_MAX)
- die("%s too large\n", msg);
-
- if (!v)
- die("%s too small\n", msg);
-
- return v;
-}
-
/* crc32c */
static u32 crc32c_default(u32 crc, const void *buf, size_t size)
#define noreturn __attribute__((noreturn))
-void die(const char *, ...) noreturn;
+void die(const char *, ...)
+ __attribute__ ((format (printf, 1, 2))) noreturn;
char *mprintf(const char *, ...)
__attribute__ ((format (printf, 1, 2)));
void *xcalloc(size_t, size_t);
void *xmalloc(size_t);
void *xrealloc(void *, size_t);
void xpread(int, void *, size_t, off_t);
-void xpwrite(int, const void *, size_t, off_t);
+void xpwrite(int, const void *, size_t, off_t, const char *);
struct stat xfstatat(int, const char *, int);
struct stat xfstat(int);
struct stat xstat(const char *);
char *strcmp_prefix(char *, const char *);
-unsigned hatoi_validate(const char *, const char *);
-
u32 crc32c(u32, const void *, size_t);
char *dev_to_name(dev_t);